# Loan Eligibility Prediction

### import require packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Load the Dataset

In [2]:
df = pd.read_csv("Loan_default.csv")

## Perform Exploratory Data Analysis

In [6]:
df.head()

Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,I38PQUQS96,56,85994,50587,520,80,4,15.23,36,0.44,Bachelor's,Full-time,Divorced,Yes,Yes,Other,Yes,0
1,HPSK72WA7R,69,50432,124440,458,15,1,4.81,60,0.68,Master's,Full-time,Married,No,No,Other,Yes,0
2,C1OZ6DPJ8Y,46,84208,129188,451,26,3,21.17,24,0.31,Master's,Unemployed,Divorced,Yes,Yes,Auto,No,1
3,V2KKSFM3UN,32,31713,44799,743,0,3,7.07,24,0.23,High School,Full-time,Married,No,No,Business,No,0
4,EY08JDHTZP,60,20437,9139,633,8,4,6.51,48,0.73,Bachelor's,Unemployed,Divorced,No,Yes,Auto,No,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 255347 entries, 0 to 255346
Data columns (total 18 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   LoanID          255347 non-null  object 
 1   Age             255347 non-null  int64  
 2   Income          255347 non-null  int64  
 3   LoanAmount      255347 non-null  int64  
 4   CreditScore     255347 non-null  int64  
 5   MonthsEmployed  255347 non-null  int64  
 6   NumCreditLines  255347 non-null  int64  
 7   InterestRate    255347 non-null  float64
 8   LoanTerm        255347 non-null  int64  
 9   DTIRatio        255347 non-null  float64
 10  Education       255347 non-null  object 
 11  EmploymentType  255347 non-null  object 
 12  MaritalStatus   255347 non-null  object 
 13  HasMortgage     255347 non-null  object 
 14  HasDependents   255347 non-null  object 
 15  LoanPurpose     255347 non-null  object 
 16  HasCoSigner     255347 non-null  object 
 17  Default   

In [5]:
df.describe()

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Default
count,255347.0,255347.0,255347.0,255347.0,255347.0,255347.0,255347.0,255347.0,255347.0,255347.0
mean,43.498306,82499.304597,127578.865512,574.264346,59.541976,2.501036,13.492773,36.025894,0.500212,0.116128
std,14.990258,38963.013729,70840.706142,158.903867,34.643376,1.117018,6.636443,16.96933,0.230917,0.320379
min,18.0,15000.0,5000.0,300.0,0.0,1.0,2.0,12.0,0.1,0.0
25%,31.0,48825.5,66156.0,437.0,30.0,2.0,7.77,24.0,0.3,0.0
50%,43.0,82466.0,127556.0,574.0,60.0,2.0,13.46,36.0,0.5,0.0
75%,56.0,116219.0,188985.0,712.0,90.0,3.0,19.25,48.0,0.7,0.0
max,69.0,149999.0,249999.0,849.0,119.0,4.0,25.0,60.0,0.9,1.0


In [7]:
df.isnull().sum()

LoanID            0
Age               0
Income            0
LoanAmount        0
CreditScore       0
MonthsEmployed    0
NumCreditLines    0
InterestRate      0
LoanTerm          0
DTIRatio          0
Education         0
EmploymentType    0
MaritalStatus     0
HasMortgage       0
HasDependents     0
LoanPurpose       0
HasCoSigner       0
Default           0
dtype: int64

## Data preprocessing / Data cleansing

In [8]:
df.drop('LoanID',axis= 1, inplace= True)
df.head()

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,56,85994,50587,520,80,4,15.23,36,0.44,Bachelor's,Full-time,Divorced,Yes,Yes,Other,Yes,0
1,69,50432,124440,458,15,1,4.81,60,0.68,Master's,Full-time,Married,No,No,Other,Yes,0
2,46,84208,129188,451,26,3,21.17,24,0.31,Master's,Unemployed,Divorced,Yes,Yes,Auto,No,1
3,32,31713,44799,743,0,3,7.07,24,0.23,High School,Full-time,Married,No,No,Business,No,0
4,60,20437,9139,633,8,4,6.51,48,0.73,Bachelor's,Unemployed,Divorced,No,Yes,Auto,No,0


In [9]:
# convort the categorical into numerical 
from sklearn.preprocessing import LabelEncoder

df['Education'] = LabelEncoder().fit_transform(df['Education'])
df['EmploymentType'] = LabelEncoder().fit_transform(df['EmploymentType'])
df['MaritalStatus'] = LabelEncoder().fit_transform(df['MaritalStatus'])
df['HasMortgage'] = LabelEncoder().fit_transform(df['HasMortgage'])
df['HasDependents'] = LabelEncoder().fit_transform(df['HasDependents'])
df['LoanPurpose'] = LabelEncoder().fit_transform(df['LoanPurpose'])
df['HasCoSigner'] = LabelEncoder().fit_transform(df['HasCoSigner'])

# models function

In [25]:
class Models:
    def __init__(self ,X_train, Y_train):
        self.X_train = X_train
        self.Y_train = Y_train
        

    # Built the Logistic Regression model
    def model_LG(self):
        from sklearn.linear_model import LogisticRegressionCV
        model_lg = LogisticRegressionCV().fit(self.X_train, self.Y_train)
        return model_lg

    # Built the Naive Bayes 
    def model_NB(self):
        from sklearn.naive_bayes import GaussianNB
        model_nb = GaussianNB().fit(self.X_train, self.Y_train)
        return model_nb

    # Built the KNN model
    def model_KNN(self):
        from sklearn.neighbors import KNeighborsClassifier
        model_knn = KNeighborsClassifier().fit(self.X_train, self.Y_train)
        return model_knn

    # Built the Support Vector Machine
    def model_SVM(self):
        from sklearn.svm import SVC
        model_svm = SVC(C=3.0).fit(self.X_train, self.Y_train)
        return model_svm
        
    # Built the Decison tree model 
    def model_DT(self):
        from sklearn.tree import DecisionTreeClassifier
        model_dt = DecisionTreeClassifier().fit(self.X_train, self.Y_train)
        return model_dt

    # Built the model for Random forest
    def model_Rand_forest(self):
        from sklearn.ensemble import RandomForestClassifier
        model_rand_forest = RandomForestClassifier().fit(self.X_train, self.Y_train)
        return model_rand_forest

    # Built the catboost model
    def model_Cat_Boost(self):
        from catboost import CatBoostClassifier
        model_catboost = CatBoostClassifier().fit(self.X_train, self.Y_train)
        return model_catboost

    # Built the adaboost model
    def model_Ada_Boost(self):
        from sklearn.ensemble import AdaBoostClassifier
        model_adaboost = AdaBoostClassifier().fit(self.X_train, self.Y_train)
        return model_adaboost

    def model_XG_Boost(self):
        from xgboost import XGBClassifier
        model_xgboost = XGBClassifier().fit(self.X_train, self.Y_train)
        return model_xgboost
        
    

### classification report and Model Evaluation function

In [11]:
class Model_Evaluation_class:
    def __init__(self ,x_test, y_test):
        self.x_test = x_test
        self.y_test = y_test

    def Classification_Report(self, model):
        from sklearn.metrics import classification_report
        return classification_report(self.y_test, model.predict(self.x_test))

    def model_evaluate(self, model, model_name):
        from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score,f1_score
        
        Accuracy = accuracy_score(self.y_test, model.predict(self.x_test))
        Precision = precision_score(self.y_test, model.predict(self.x_test))
        Recall = recall_score(self.y_test, model.predict(self.x_test))
        f1_Score = f1_score(self.y_test, model.predict(self.x_test))
        return model_name,Accuracy,Precision,Recall,f1_Score

# Deal with imbalanced Data

### Balancing data by using under sampling majority class

In [12]:
# separate the class 1, 0 count 
class_0_count, class_1_count = df['Default'].value_counts()

# separate  the class 1, 0 into 2 df
df_class_0 = df[df['Default']==0]
df_class_1 = df[df['Default']==1]

In [13]:
print(df_class_0['Default'].value_counts())
print(df_class_1['Default'].value_counts())

Default
0    225694
Name: count, dtype: int64
Default
1    29653
Name: count, dtype: int64


In [14]:
df_class_0_under = df_class_0.sample(30347)

In [15]:
balance_df_by_under = pd.concat([df_class_1, df_class_0_under],axis= 0)
balance_df_by_under.value_counts()

Age  Income  LoanAmount  CreditScore  MonthsEmployed  NumCreditLines  InterestRate  LoanTerm  DTIRatio  Education  EmploymentType  MaritalStatus  HasMortgage  HasDependents  LoanPurpose  HasCoSigner  Default
69   143755  189520      454          71              1               16.02         24        0.29      1          3               0              0            1              3            1            0          1
     144248  176622      845          40              2               2.16          24        0.21      1          0               1              0            0              0            0            0          1
     144312  75335       605          111             2               18.92         24        0.22      3          1               2              0            0              2            1            0          1
     144324  79090       526          77              3               16.27         48        0.56      0          2               1              0      

In [16]:
balance_df_by_under.corr()

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
Age,1.0,0.014264,-0.018789,0.006625,0.017161,-0.006494,-0.035967,-0.00221,-0.002128,0.00271,-0.007049,3.9e-05,0.000211,0.013923,0.002448,0.006731,-0.258713
Income,0.014264,1.0,-0.068121,-0.000992,0.005234,-0.005266,-0.014405,0.003667,0.004011,0.004636,-0.014085,-0.003193,0.004053,0.006455,-0.002661,-0.00596,-0.148174
LoanAmount,-0.018789,-0.068121,1.0,-0.001873,-0.007295,0.008775,0.009716,0.001636,1e-05,0.005712,0.007013,-0.009302,0.000876,0.000829,-0.001549,-0.004488,0.137822
CreditScore,0.006625,-0.000992,-0.001873,1.0,0.004649,-8e-06,-0.005211,-0.003203,-0.006875,-0.000257,-5.1e-05,-0.001676,0.001377,-0.000917,0.005229,0.000327,-0.051123
MonthsEmployed,0.017161,0.005234,-0.007295,0.004649,1.0,-0.008551,-0.016298,-0.005259,0.001846,0.006958,-0.004642,0.00914,0.002567,0.001301,-0.002066,0.004865,-0.149697
NumCreditLines,-0.006494,-0.005266,0.008775,-8e-06,-0.008551,1.0,0.007078,-0.000589,-0.001421,0.005621,0.001206,0.003329,-0.001167,0.00168,0.001249,-0.004901,0.044411
InterestRate,-0.035967,-0.014405,0.009716,-0.005211,-0.016298,0.007078,1.0,-0.001626,0.007634,-0.00194,0.005642,-0.015595,-0.006957,-0.00514,0.00015,-0.010318,0.205486
LoanTerm,-0.00221,0.003667,0.001636,-0.003203,-0.005259,-0.000589,-0.001626,1.0,0.00691,-0.004594,-0.000993,0.003132,0.000734,0.005396,0.000992,-0.005604,-0.000641
DTIRatio,-0.002128,0.004011,1e-05,-0.006875,0.001846,-0.001421,0.007634,0.00691,1.0,0.003516,0.001208,0.005554,-0.006554,0.00317,-0.000749,-0.000612,0.027705
Education,0.00271,0.004636,0.005712,-0.000257,0.006958,0.005621,-0.00194,-0.004594,0.003516,1.0,0.004407,-0.008158,0.001817,-0.002503,-0.009302,0.006498,-0.035461


## Split the data set into x and y

In [17]:
x = balance_df_by_under[['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed',
       'NumCreditLines', 'InterestRate', 'DTIRatio', 'Education',
       'EmploymentType', 'HasMortgage', 'HasDependents',
       'LoanPurpose', 'HasCoSigner']]
print(x.head())
y = balance_df_by_under['Default']
y.head()
balance_df_by_under.value_counts()

    Age  Income  LoanAmount  CreditScore  MonthsEmployed  NumCreditLines  \
2    46   84208      129188          451              26               3   
5    25   90298       90448          720              18               2   
8    36   42053       92357          827              83               1   
11   28  149227      139759          375              56               3   
18   19   40718       78515          319             119               2   

    InterestRate  DTIRatio  Education  EmploymentType  HasMortgage  \
2          21.17      0.31          2               3            1   
5          22.72      0.10          1               3            1   
8          23.94      0.20          0               2            1   
11          5.84      0.80          3               0            0   
18         14.00      0.17          0               2            1   

    HasDependents  LoanPurpose  HasCoSigner  
2               1            0            0  
5               0            1

Age  Income  LoanAmount  CreditScore  MonthsEmployed  NumCreditLines  InterestRate  LoanTerm  DTIRatio  Education  EmploymentType  MaritalStatus  HasMortgage  HasDependents  LoanPurpose  HasCoSigner  Default
69   143755  189520      454          71              1               16.02         24        0.29      1          3               0              0            1              3            1            0          1
     144248  176622      845          40              2               2.16          24        0.21      1          0               1              0            0              0            0            0          1
     144312  75335       605          111             2               18.92         24        0.22      3          1               2              0            0              2            1            0          1
     144324  79090       526          77              3               16.27         48        0.56      0          2               1              0      

## Scaling the data

In [18]:
from sklearn.preprocessing import MinMaxScaler

x = pd.DataFrame(MinMaxScaler().fit_transform(x),columns=['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed',
       'NumCreditLines', 'InterestRate', 'DTIRatio', 'Education',
       'EmploymentType', 'HasMortgage', 'HasDependents',
       'LoanPurpose', 'HasCoSigner'])
x.head()

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,DTIRatio,Education,EmploymentType,HasMortgage,HasDependents,LoanPurpose,HasCoSigner
0,0.54902,0.512663,0.506898,0.275046,0.218487,0.666667,0.833478,0.2625,0.666667,1.0,1.0,1.0,0.0,0.0
1,0.137255,0.557775,0.348773,0.765027,0.151261,0.333333,0.90087,0.0,0.333333,1.0,1.0,0.0,0.25,1.0
2,0.352941,0.200397,0.356565,0.959927,0.697479,0.0,0.953913,0.125,0.0,0.666667,1.0,0.0,0.5,0.0
3,0.196078,0.994296,0.550046,0.136612,0.470588,0.666667,0.166957,0.875,1.0,0.0,0.0,0.0,0.5,1.0
4,0.019608,0.190508,0.300066,0.034608,1.0,0.333333,0.521739,0.0875,0.0,0.666667,1.0,0.0,0.5,0.0


## Split the data into training and testing dataset

In [19]:
from sklearn.model_selection import train_test_split

X_train, x_test, Y_train, y_test = train_test_split(x, y, train_size= 0.7, random_state=12350, stratify=y)

### Build multiple model on above dataset

In [28]:
m2 = Models(X_train,Y_train)
models = [(m2.model_LG(),"LogisticRegression",(0,0)),(m2.model_KNN(), "KNeighborsClassifier",(0,1)),
          (m2.model_NB(),"Naive Bayes",(1,0)),(m2.model_SVM(),"Support Vector Machine",(1,1)),
          (m2.model_DT(),"DecisionTreeClassifier",(2,0)),(m2.model_Rand_forest(),"RandomForest",(2,1)),
         (m2.model_Ada_Boost(), "AdaBoost",(3,0)), (m2.model_Cat_Boost(),"Cat_Boost",(3,1)),
         (m2.model_XG_Boost(), "XGBoost",(4,0))]



Learning rate set to 0.050823
0:	learn: 0.6851859	total: 9.94ms	remaining: 9.93s
1:	learn: 0.6782005	total: 18.2ms	remaining: 9.09s
2:	learn: 0.6716452	total: 25.6ms	remaining: 8.49s
3:	learn: 0.6656861	total: 33.1ms	remaining: 8.24s
4:	learn: 0.6601697	total: 40.2ms	remaining: 8s
5:	learn: 0.6554616	total: 47.3ms	remaining: 7.83s
6:	learn: 0.6514670	total: 54.8ms	remaining: 7.77s
7:	learn: 0.6474832	total: 61.9ms	remaining: 7.67s
8:	learn: 0.6439395	total: 68.9ms	remaining: 7.58s
9:	learn: 0.6404470	total: 76.5ms	remaining: 7.57s
10:	learn: 0.6370550	total: 84.3ms	remaining: 7.58s
11:	learn: 0.6342924	total: 90.8ms	remaining: 7.47s
12:	learn: 0.6316778	total: 97.6ms	remaining: 7.41s
13:	learn: 0.6293104	total: 105ms	remaining: 7.36s
14:	learn: 0.6270228	total: 112ms	remaining: 7.33s
15:	learn: 0.6251338	total: 118ms	remaining: 7.28s
16:	learn: 0.6230241	total: 127ms	remaining: 7.33s
17:	learn: 0.6211291	total: 133ms	remaining: 7.27s
18:	learn: 0.6193964	total: 141ms	remaining: 7.26s
1

## Evaluate all models


In [29]:
model_evaluation_obj = Model_Evaluation_class(x_test, y_test)
Results2 = []
for model,model_name,_ in models:
    Results2.append(model_evaluation_obj.model_evaluate(model, model_name))

df2 = pd.DataFrame(Results2,columns=['Algorithm','Accuracy', 'Precision', 'Recall', 'F1_Score'])
df2

Unnamed: 0,Algorithm,Accuracy,Precision,Recall,F1_Score
0,LogisticRegression,0.678944,0.675329,0.674798,0.675063
1,KNeighborsClassifier,0.6185,0.613188,0.617806,0.615488
2,Naive Bayes,0.681111,0.675217,0.683566,0.679365
3,Support Vector Machine,0.680222,0.678409,0.671088,0.674729
4,DecisionTreeClassifier,0.583778,0.58021,0.570818,0.575476
5,RandomForest,0.675111,0.678454,0.651304,0.664602
6,AdaBoost,0.684833,0.681618,0.679856,0.680736
7,Cat_Boost,0.686444,0.687155,0.671088,0.679026
8,XGBoost,0.671333,0.669859,0.660522,0.665157


## Classification report for all models

In [30]:
for model,model_name,_ in models:
    print(f"Classification report for {model_name} is :")
    print(model_evaluation_obj.Classification_Report(model))
    print("-"*80)

Classification report for LogisticRegression is :
              precision    recall  f1-score   support

           0       0.68      0.68      0.68      9104
           1       0.68      0.67      0.68      8896

    accuracy                           0.68     18000
   macro avg       0.68      0.68      0.68     18000
weighted avg       0.68      0.68      0.68     18000

--------------------------------------------------------------------------------
Classification report for KNeighborsClassifier is :
              precision    recall  f1-score   support

           0       0.62      0.62      0.62      9104
           1       0.61      0.62      0.62      8896

    accuracy                           0.62     18000
   macro avg       0.62      0.62      0.62     18000
weighted avg       0.62      0.62      0.62     18000

--------------------------------------------------------------------------------
Classification report for Naive Bayes is :
              precision    recall  f1-s