In [28]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.compose import make_column_transformer 
from sklearn.compose import make_column_selector
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score, r2_score
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, GridSearchCV, KFold
from sklearn.linear_model import LogisticRegression, ElasticNet
from sklearn.pipeline import Pipeline
import os 
import warnings
warnings.filterwarnings('ignore')

In [29]:
# Load the training set; the first CSV column is used as the row index.
loan = pd.read_csv(filepath_or_buffer="loan_train.csv", index_col=0)
loan

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [30]:
# Print each column's dtype and non-null count to see which columns have gaps.
loan.info()

<class 'pandas.core.frame.DataFrame'>
Index: 614 entries, LP001002 to LP002990
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             601 non-null    object 
 1   Married            611 non-null    object 
 2   Dependents         599 non-null    object 
 3   Education          614 non-null    object 
 4   Self_Employed      582 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         592 non-null    float64
 8   Loan_Amount_Term   600 non-null    float64
 9   Credit_History     564 non-null    float64
 10  Property_Area      614 non-null    object 
 11  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 62.4+ KB


In [31]:
# Missing values per column; the per-column counts add up to 149.
loan.isnull().sum(axis=0)

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [32]:
# Separate the target column (Loan_Status) from the predictor columns.
y = loan["Loan_Status"]
X = loan.drop(columns=["Loan_Status"])

In [33]:
# Imputers for missing values; both are configured to return pandas output.
# Numeric columns: fill gaps with the column median.
siNum = SimpleImputer(strategy="median").set_output(transform="pandas")
# Categorical columns: fill gaps with the constant placeholder 'unknown'.
siCate = SimpleImputer(
    strategy="constant",
    fill_value="unknown",
).set_output(transform="pandas")

In [34]:
# Imputation stage: route each column to an imputer based on its dtype.
#   non-object columns -> siNum, object columns -> siCate.
# verbose_feature_names_out=False keeps the original column names (no prefixes).
mct = make_column_transformer(
    (siNum, make_column_selector(dtype_exclude=object)),
    (siCate, make_column_selector(dtype_include=object)),
    verbose_feature_names_out=False,
).set_output(transform="pandas")
# To inspect this stage on its own:
#   x_imp = mct.fit_transform(X); x_imp.isnull().sum()

In [35]:
# Encoding stage: one-hot encode the object columns and pass every other
# column through unchanged.
ohe = OneHotEncoder(
    handle_unknown="ignore",
    drop="first",
    sparse_output=False,
)

ohe_mct = make_column_transformer(
    ("passthrough", make_column_selector(dtype_exclude=object)),
    (ohe, make_column_selector(dtype_include=object)),
    verbose_feature_names_out=False,
).set_output(transform="pandas")
# To inspect this stage on its own (needs the imputed frame x_imp):
#   x_ohe = ohe_mct.fit_transform(x_imp); x_ohe.isnull().sum()

In [36]:

# Logistic-regression baseline: impute -> one-hot encode -> classify.
# random_state pins the data shuffling performed by the 'liblinear', 'sag'
# and 'saga' solvers, so their cross-validation scores repeat between runs.
log = LogisticRegression(random_state=24)
pipe = Pipeline([('MCT', mct), ('OHE_MCT', ohe_mct), ('LOG', log)])
kfold = StratifiedKFold(n_splits=5, random_state=24, shuffle=True)

# 6 solvers x 5 values of C = 30 candidates, each evaluated on 5 folds.
params = {
    'LOG__solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
    'LOG__C': np.linspace(0.001, 10, 5),
}
# Candidates are ranked by ROC AUC. scoring='neg_log_loss' is an alternative;
# it reports the negated log loss, so multiply the score by -1 to read it as a loss.
gcv = GridSearchCV(pipe, param_grid=params, cv=kfold, scoring='roc_auc', verbose=3)
gcv.fit(X, y)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 1/5] END ...LOG__C=0.001, LOG__solver=lbfgs;, score=0.431 total time=   0.0s
[CV 2/5] END ...LOG__C=0.001, LOG__solver=lbfgs;, score=0.494 total time=   0.0s
[CV 3/5] END ...LOG__C=0.001, LOG__solver=lbfgs;, score=0.440 total time=   0.0s
[CV 4/5] END ...LOG__C=0.001, LOG__solver=lbfgs;, score=0.604 total time=   0.0s
[CV 5/5] END ...LOG__C=0.001, LOG__solver=lbfgs;, score=0.561 total time=   0.0s
[CV 1/5] END LOG__C=0.001, LOG__solver=liblinear;, score=0.437 total time=   0.0s
[CV 2/5] END LOG__C=0.001, LOG__solver=liblinear;, score=0.636 total time=   0.0s
[CV 3/5] END LOG__C=0.001, LOG__solver=liblinear;, score=0.458 total time=   0.0s
[CV 4/5] END LOG__C=0.001, LOG__solver=liblinear;, score=0.629 total time=   0.0s
[CV 5/5] END LOG__C=0.001, LOG__solver=liblinear;, score=0.554 total time=   0.0s
[CV 1/5] END LOG__C=0.001, LOG__solver=newton-cg;, score=0.485 total time=   0.0s
[CV 2/5] END LOG__C=0.001, LOG__solver=ne

In [37]:
# Best hyper-parameter combination, then its cross-validated score.
print(gcv.best_params_, gcv.best_score_, sep='\n')

{'LOG__C': np.float64(2.50075), 'LOG__solver': 'lbfgs'}
0.7591899053818559


In [38]:
# Show the pipeline that was passed to GridSearchCV (the search template).
# NOTE(review): the refit winner lives in `gcv.best_estimator_` — confirm which one was meant here.
gcv.estimator

In [39]:
# Inference

In [40]:
# Load the test set the same way as the training set: the first CSV column
# becomes the index, so the model receives only the feature columns.
# NOTE(review): assumes the first column of loan_test.csv is Loan_ID, as in loan_train.csv — confirm.
tst = pd.read_csv('loan_test.csv', index_col=0)

In [41]:
# Take the best pipeline found by the grid search and predict labels for the test set.
bm_loan = gcv.best_estimator_
y_pred = bm_loan.predict(tst)

In [42]:
# Write the predictions into the sample-submission template and save it.
# Attribute assignment (`submit.price = ...`) only sets a Python attribute
# unless a 'price' column already exists, so the predictions would never
# reach the CSV; item assignment always writes a real column.
# NOTE(review): assumes the template's prediction column is 'Loan_Status'
# (the target of this notebook) — confirm against loan_sample_submission.csv.
submit = pd.read_csv('loan_sample_submission.csv')
submit['Loan_Status'] = y_pred
submit.to_csv('Submission_Loan_Predictiion.csv', index=False)

# KNN

In [43]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [47]:
# KNN variant of the pipeline, with an optional scaling step before the classifier.
knn = KNeighborsClassifier()
sc = StandardScaler()
mm = MinMaxScaler()
pipe = Pipeline([
    ('MCT', mct),
    ('OHE_MCT', ohe_mct),
    ('SC', None),        # placeholder; the grid decides which scaler (if any) goes here
    ('KNN', knn),
])
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)
params = {
    'KNN__n_neighbors': np.arange(1, 8),   # k = 1..7
    'SC': [None, sc, mm],                  # no scaling, standardisation, min-max
}
# Candidates are ranked by ROC AUC. scoring='neg_log_loss' is an alternative;
# it reports the negated log loss, so multiply the score by -1 to read it as a loss.
gcv = GridSearchCV(pipe, param_grid=params, cv=kfold, scoring='roc_auc')
gcv.fit(X, y)

In [46]:
# Best hyper-parameter combination, then its cross-validated score.
print(gcv.best_params_, gcv.best_score_, sep='\n')

{'KNN__n_neighbors': np.int64(3), 'SC': MinMaxScaler()}
0.7246499733496637


In [48]:
# Predict on the test set with the best KNN pipeline and write the submission file.
# The test CSV is read like the training CSV (first column as index) so the
# model receives only the feature columns.
# NOTE(review): assumes the first column of loan_test.csv is Loan_ID — confirm.
tst = pd.read_csv('loan_test.csv', index_col=0)
bm_loan = gcv.best_estimator_
y_pred = bm_loan.predict(tst)
submit = pd.read_csv('loan_sample_submission.csv')
# Attribute assignment (`submit.price = ...`) only sets a Python attribute
# unless a 'price' column already exists, so the predictions would never
# reach the CSV; item assignment always writes a real column.
# NOTE(review): assumes the template's prediction column is 'Loan_Status' — confirm.
submit['Loan_Status'] = y_pred
submit.to_csv('Submission_Loan_Predictiion_KNN.csv', index=False)