In [57]:
## Basic imports 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

%matplotlib inline

warnings.filterwarnings('ignore')

In [58]:
## import models

from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [59]:
df = pd.read_csv('data/cleaned_loan_approval_dataset.csv')
df.head()

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [60]:
df.columns = df.columns.str.strip()
df.columns

Index(['no_of_dependents', 'education', 'self_employed', 'income_annum',
       'loan_amount', 'loan_term', 'cibil_score', 'residential_assets_value',
       'commercial_assets_value', 'luxury_assets_value', 'bank_asset_value',
       'loan_status'],
      dtype='object')

In [61]:
df.head()

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [62]:
loan_status_map = {'Rejected': 0, 'Approved': 1}
df['loan_status'] = df['loan_status'].str.strip()
df['loan_status'] = df['loan_status'].map(loan_status_map)

In [63]:
X = df.drop('loan_status', axis=1)
y = df['loan_status']

In [64]:
cat_fea = X.select_dtypes(include='object').columns
num_fea = X.select_dtypes(exclude='object').columns

In [65]:
print(f"Categorical features are {cat_fea}")
print(f"Numerical features are {num_fea}")

Categorical features are Index(['education', 'self_employed'], dtype='object')
Numerical features are Index(['no_of_dependents', 'income_annum', 'loan_amount', 'loan_term',
       'cibil_score', 'residential_assets_value', 'commercial_assets_value',
       'luxury_assets_value', 'bank_asset_value'],
      dtype='object')


In [66]:
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer

In [67]:
std_scaler = StandardScaler()
oh_encoder = OneHotEncoder(drop='first')

preprocessor = ColumnTransformer([
    ('OneHotEncoder',oh_encoder,cat_fea),
    ('StandardScaler',std_scaler,num_fea),
])

In [68]:
X = preprocessor.fit_transform(X)

In [70]:
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.0,0.0,-0.294102,1.617979,1.633052,0.192617,1.032792,-0.780058,2.877289,0.832028,0.930304
1,1.0,1.0,-1.473548,-0.341750,-0.324414,-0.508091,-1.061051,-0.733924,-0.631921,-0.694993,-0.515936
2,0.0,0.0,0.295621,1.439822,1.610933,1.594031,-0.544840,-0.057300,-0.107818,1.996520,2.407316
3,0.0,0.0,0.295621,1.119139,1.721525,-0.508091,-0.771045,1.649637,-0.381263,0.897943,0.899533
4,1.0,1.0,1.475067,1.689242,1.002681,1.594031,-1.264055,0.757724,0.735304,1.568075,0.007172
...,...,...,...,...,...,...,...,...,...,...,...
4264,0.0,1.0,1.475067,-1.446324,-1.419268,0.192617,-1.641063,-0.718546,-1.019301,-1.299210,-1.285213
4265,1.0,1.0,-1.473548,-0.626801,-0.423946,1.594031,-0.237434,-0.503257,-0.472412,-0.453306,-0.946732
4266,1.0,0.0,-0.294102,0.513405,0.969504,1.243677,-0.829046,-0.964591,1.692361,0.326683,0.714907
4267,1.0,0.0,-0.883825,-0.341750,-0.258059,-0.508091,1.044393,0.111856,-0.973727,-0.112748,0.253341


In [71]:
y

0       1
1       0
2       0
3       0
4       0
       ..
4264    0
4265    1
4266    0
4267    1
4268    1
Name: loan_status, Length: 4269, dtype: int64

In [72]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=42)

In [73]:
def model_evalution(true,predicted):
    acc = accuracy_score(true,predicted)
    pre = precision_score(true,predicted)
    recall = recall_score(true,predicted)
    f1 = f1_score(true,predicted)

    return acc,pre,recall,f1

In [74]:
models = {
    'Logistic Regression' :LogisticRegression(),
    'KNeighbors Classifier' :KNeighborsClassifier(),
    'DecisionTree Classifier' : DecisionTreeClassifier(),
    'SVC' : SVC(),
    'RandomForest Classifier' : RandomForestClassifier(),
    'AdaBoost Classifier' :AdaBoostClassifier(),
    'XGB Classifier' : XGBClassifier(),
    'CatBoost Classifier' : CatBoostClassifier(),
}

model_list = []
accuracy_score_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train)

    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    accuracy_score_train, precision_score_train, recall_score_train, f1_score_train = model_evalution(y_train,y_pred_train)
    accuracy_score_test, precision_score_test, recall_score_test, f1_score_test = model_evalution(y_test,y_pred_test)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print("Model Performance for Training set")
    print("-Accuracy score {:.4f}".format(accuracy_score_train))
    print("-Precision score {:.4f}".format(precision_score_train))
    print("-Recall score {:.4f}".format(recall_score_train))
    print("-F1 score {:.4f}".format(f1_score_train))

    print("-"*35)

    print("Model Performance for Testing set")
    print("-Accuracy score {:.4f}".format(accuracy_score_test))
    print("-Precision score {:.4f}".format(precision_score_test))
    print("-Recall score {:.4f}".format(recall_score_test))
    print("-F1 score {:.4f}".format(f1_score_test))
    accuracy_score_list.append(accuracy_score_test)

    print("="*35)
    print("\n")
    



Logistic Regression
Model Performance for Training set
-Accuracy score 0.9219
-Precision score 0.9381
-Recall score 0.9353
-F1 score 0.9367
-----------------------------------
Model Performance for Testing set
-Accuracy score 0.9054
-Precision score 0.9212
-Recall score 0.9307
-F1 score 0.9259


KNeighbors Classifier
Model Performance for Training set
-Accuracy score 0.9513
-Precision score 0.9601
-Recall score 0.9611
-F1 score 0.9606
-----------------------------------
Model Performance for Testing set
-Accuracy score 0.8961
-Precision score 0.9289
-Recall score 0.9056
-F1 score 0.9171


DecisionTree Classifier
Model Performance for Training set
-Accuracy score 1.0000
-Precision score 1.0000
-Recall score 1.0000
-F1 score 1.0000
-----------------------------------
Model Performance for Testing set
-Accuracy score 0.9841
-Precision score 0.9853
-Recall score 0.9897
-F1 score 0.9875


SVC
Model Performance for Training set
-Accuracy score 0.9569
-Precision score 0.9733
-Recall score 0.9

In [77]:
pd.DataFrame(list(zip(model_list,accuracy_score_list)),columns=['Model_name','Accuracy_score']).sort_values(by=['Accuracy_score'],ascending=False)

Unnamed: 0,Model_name,Accuracy_score
2,DecisionTree Classifier,0.984082
7,CatBoost Classifier,0.984082
4,RandomForest Classifier,0.98221
6,XGB Classifier,0.98221
5,AdaBoost Classifier,0.965356
3,SVC,0.93633
0,Logistic Regression,0.905431
1,KNeighbors Classifier,0.896067
