In [141]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.compose import make_column_transformer 
from sklearn.compose import make_column_selector
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import KNNImputer, SimpleImputer
import warnings
warnings.filterwarnings('ignore')

In [145]:
loan = pd.read_csv("loan.csv")
loan

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [147]:
loan.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [149]:
loan.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [151]:
X = loan.drop('Loan_Status', axis=1)
y = loan['Loan_Status']

In [153]:
imp_cat = SimpleImputer(strategy='constant', fill_value="unknown")
imp_num = SimpleImputer(strategy="median")
trans_imp = make_column_transformer((imp_cat, make_column_selector(dtype_include=object)), 
                                    (imp_num, make_column_selector(dtype_exclude=object)),
                                   verbose_feature_names_out=False)
trans_imp = trans_imp.set_output(transform="pandas")
X_imp = trans_imp.fit_transform(X)
print(X_imp.isnull().sum().sum())
print(X_imp.columns)

0
Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'Property_Area', 'ApplicantIncome',
       'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term',
       'Credit_History'],
      dtype='object')


In [155]:
trans_imp = trans_imp.set_output(transform="pandas")
loan_imp = trans_imp.fit_transform(loan)
print(loan_imp.isnull().sum())
print(loan_imp.columns)

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
Property_Area        0
Loan_Status          0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
dtype: int64
Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'Property_Area', 'Loan_Status', 'ApplicantIncome',
       'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term',
       'Credit_History'],
      dtype='object')


In [157]:
ohe = OneHotEncoder(
    handle_unknown="ignore", 
    sparse_output=False, 
    drop='first'
).set_output(transform='pandas')

trans_ohe = make_column_transformer(
    ('passthrough', make_column_selector(dtype_exclude=object)),
    (ohe, make_column_selector(dtype_include=object)),
    verbose_feature_names_out=False
).set_output(transform='pandas')
trans_ohe = trans_ohe.set_output(transform='pandas')
X_imp_ohe = trans_ohe.fit_transform(X_imp)
X_imp_ohe.dtypes

ApplicantIncome            float64
CoapplicantIncome          float64
LoanAmount                 float64
Loan_Amount_Term           float64
Credit_History             float64
                            ...   
Education_Not Graduate     float64
Self_Employed_Yes          float64
Self_Employed_unknown      float64
Property_Area_Semiurban    float64
Property_Area_Urban        float64
Length: 631, dtype: object

In [117]:
lr = LogisticRegression(random_state = 24)

kfold = StratifiedKFold(n_splits=5, 
                        random_state=24, 
                        shuffle=True)

scaler_mm = MinMaxScaler()
scaler_std = StandardScaler()

pipe = Pipeline([('IMP',trans_imp), ('OHE',trans_ohe),("SCL", scaler_std), ('MODEL', lr)])

params = {'MODEL__solver':['lbfgs','liblinear','newton-cg','newton-cholesky','sag','saga'],
         'MODEL__C':np.linspace(0.001, 10, 20)}

gcv = GridSearchCV(
    pipe, 
    param_grid = params, 
    scoring='neg_log_loss',     
    cv=kfold, 
    verbose=3)

In [215]:
# For KNN

knn = KNeighborsClassifier()

kfold = StratifiedKFold(n_splits=5, 
                        random_state=24, 
                        shuffle=True)

scaler_mm = MinMaxScaler()
scaler_std = StandardScaler()

pipe = Pipeline([('IMP',trans_imp), ('OHE',trans_ohe), ("SCL", None), ('MODEL', knn)])

params = {'MODEL__n_neighbors':list(range(1, 21)),
          "SCL" : [scaler_mm, scaler_std, None]}

gcv = GridSearchCV(
    pipe, 
    param_grid = params, 
    scoring='roc_auc',     
    # scoring='neg_log_loss',  
    cv=kfold, 
    verbose=3)

In [217]:
gcv.fit(X,y)

Fitting 5 folds for each of 60 candidates, totalling 300 fits
[CV 1/5] END MODEL__n_neighbors=1, SCL=MinMaxScaler();, score=0.641 total time=   0.0s
[CV 2/5] END MODEL__n_neighbors=1, SCL=MinMaxScaler();, score=0.657 total time=   0.0s
[CV 3/5] END MODEL__n_neighbors=1, SCL=MinMaxScaler();, score=0.654 total time=   0.0s
[CV 4/5] END MODEL__n_neighbors=1, SCL=MinMaxScaler();, score=0.689 total time=   0.0s
[CV 5/5] END MODEL__n_neighbors=1, SCL=MinMaxScaler();, score=0.685 total time=   0.0s
[CV 1/5] END MODEL__n_neighbors=1, SCL=StandardScaler();, score=0.500 total time=   0.0s
[CV 2/5] END MODEL__n_neighbors=1, SCL=StandardScaler();, score=0.500 total time=   0.0s
[CV 3/5] END MODEL__n_neighbors=1, SCL=StandardScaler();, score=0.500 total time=   0.0s
[CV 4/5] END MODEL__n_neighbors=1, SCL=StandardScaler();, score=0.500 total time=   0.0s
[CV 5/5] END MODEL__n_neighbors=1, SCL=StandardScaler();, score=0.500 total time=   0.0s
[CV 1/5] END ....MODEL__n_neighbors=1, SCL=None;, score=0.

In [219]:
gcv.best_score_

0.7302016447604682

In [221]:
gcv.best_params_

{'MODEL__n_neighbors': 3, 'SCL': MinMaxScaler()}

In [223]:
bm = gcv.best_estimator_          # This will retrun the best model and note that it also contains the pipeline

In [225]:
test = pd.read_csv("test.csv")
test


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban
...,...,...,...,...,...,...,...,...,...,...,...,...
362,LP002971,Male,Yes,3+,Not Graduate,Yes,4009,1777,113.0,360.0,1.0,Urban
363,LP002975,Male,Yes,0,Graduate,No,4158,709,115.0,360.0,1.0,Urban
364,LP002980,Male,No,0,Graduate,No,3250,1993,126.0,360.0,,Semiurban
365,LP002986,Male,Yes,0,Graduate,No,5000,2393,158.0,360.0,1.0,Rural


In [227]:
submit = pd.read_csv

In [229]:
y_pred = bm.predict(test)

In [231]:
sample = pd.read_csv("sample_submission.csv")
sample.Loan_Status = y_pred

In [233]:
sample.to_csv("submission.csv", index=False)

In [None]:
# 