In [1]:
import pandas as pd
df = pd.read_csv('Loan_Dataset.csv')
df.head()

Unnamed: 0,Gender,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Credit_History,Property_Area,Loan_Status
0,Male,0,Graduate,No,5849,,1.0,Urban,Y
1,Male,1,Graduate,No,4583,128.0,1.0,Rural,N
2,Male,0,Graduate,Yes,3000,66.0,1.0,Urban,Y
3,Male,0,Not Graduate,No,2583,120.0,1.0,Urban,Y
4,Male,0,Graduate,No,6000,141.0,1.0,Urban,Y


In [2]:
df.isna().mean()*100

Gender             2.117264
Dependents         2.442997
Education          0.000000
Self_Employed      5.211726
ApplicantIncome    0.000000
LoanAmount         3.583062
Credit_History     8.143322
Property_Area      0.000000
Loan_Status        0.000000
dtype: float64

In [3]:
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
df['Loan_Status'] = label_encoder.fit_transform(df['Loan_Status'])

In [4]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
imp1 = SimpleImputer(strategy = "median")
imp2 = SimpleImputer(strategy = "most_frequent")

tr = ColumnTransformer([
    ('Median',imp1,[5]),
    ('Mode1',imp2,[0,1,3,6]),],
    remainder='passthrough')


In [5]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder

order = ['Urban','Semiurban','Rural']
trf1 = ColumnTransformer([
    ('ohe',OneHotEncoder(),[1,2,3,5]),
    ('oe',OrdinalEncoder(categories = [order], dtype=int),[7]),
    
], remainder ='passthrough')


In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MaxAbsScaler
trf2 = ColumnTransformer([
    ('mms',MaxAbsScaler(),[0,1,2,3,4,5,6,7,8,9,10,12]),
    ('ss',StandardScaler(),[11,13])
    
], remainder ='passthrough')

In [7]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()

In [8]:
from sklearn.pipeline import Pipeline,make_pipeline
pipe = Pipeline([
    ('tr',tr),
    ('trf1',trf1),
    ('trf2',trf2),
    ('clf',clf)
])

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test,y_train, y_test = train_test_split(df.drop(columns=['Loan_Status'],axis=1),df['Loan_Status'],test_size=0.2,random_state=42)

In [10]:
X_train

Unnamed: 0,Gender,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Credit_History,Property_Area
83,Male,0,Graduate,No,6000,265.0,,Semiurban
90,Male,0,Graduate,No,2958,131.0,1.0,Semiurban
227,Male,2,Graduate,No,6250,210.0,1.0,Semiurban
482,Male,0,Graduate,No,2083,128.0,1.0,Semiurban
464,Male,0,Graduate,No,4166,98.0,0.0,Semiurban
...,...,...,...,...,...,...,...,...
71,Male,2,Not Graduate,Yes,1875,97.0,1.0,Semiurban
106,Male,2,Graduate,No,11417,225.0,1.0,Urban
270,Female,0,Graduate,No,3237,30.0,1.0,Urban
435,Female,,Graduate,No,10047,,1.0,Semiurban


In [11]:
y_train

83     0
90     1
227    1
482    1
464    0
      ..
71     1
106    1
270    1
435    1
102    1
Name: Loan_Status, Length: 491, dtype: int32

In [12]:
pipe.fit(X_train,y_train)

In [13]:
from sklearn.metrics import accuracy_score

# Prediction
y_pred = pipe.predict(X_test)

acc = accuracy_score(y_test, y_pred)
print("Logistic Regression model accuracy (in %):", acc*100)


Logistic Regression model accuracy (in %): 78.86178861788618


In [14]:
from sklearn.model_selection import cross_val_score
cross_val_score(pipe, X_train, y_train, cv=5, scoring='accuracy').mean()

0.810513296227582

In [15]:
y_pred1 = pipe.predict(X_train)

acc = accuracy_score(y_train, y_pred1)
print("Logistic Regression model accuracy (in %):", acc*100)

Logistic Regression model accuracy (in %): 81.4663951120163


In [16]:
param_grid = {
    "clf__penalty": ['l1', 'l2'],
    "clf__C": [0.001, 0.01, 0.1, 1, 10, 100]
}

In [17]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipe,param_grid, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)

30 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\DRITI SINGHANIA\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\DRITI SINGHANIA\anaconda3\lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\DRITI SINGHANIA\anaconda3\lib\site-packages\sklearn\pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "C:\Users\DRITI SINGHANIA\anaconda3\lib\site-packages\sklearn\base.py", line 1474, i

In [18]:
grid.best_score_

0.810513296227582

In [19]:
grid.best_params_

{'clf__C': 0.1, 'clf__penalty': 'l2'}

In [20]:
import pickle
pickle.dump(clf,open('lr_pipe5.pkl','wb'))

In [21]:
import pickle
import numpy as np

In [22]:
lr_pipe = pickle.load(open('lr_pipe5.pkl','rb'))

In [30]:
# Define preprocessing function
def preprocess_input(data):
    # Apply the same preprocessing steps as in the pipeline
    data = pd.DataFrame(data, columns=X_train.columns)
    data = pipe.named_steps['tr'].transform(data)
    data = pipe.named_steps['trf1'].transform(data)
    data = pipe.named_steps['trf2'].transform(data)
    return data

# Preprocess the input data
preprocessed_test_data = preprocess_input(test_data)

# Make predictions using the logistic regression pipeline
prediction = lr_pipe.predict(preprocessed_test_data)
print("Prediction:", prediction)



ValueError: Found unknown categories [1] in column 1 during transform

In [26]:
lr_pipe.predict(test_data)

ValueError: could not convert string to float: 'Female'

In [28]:
df['Gender'].unique()

array(['Male', 'Female', nan], dtype=object)