# Loan Eligibility Prediction  (final and fixed)

### import require packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
pd.pandas.set_option('display.max_columns',None)

# 1. Load the Dataset

In [2]:
df = pd.read_csv("Loan_default.csv")

# 2. Perform Exploratory Data Analysis or Data preprocessing

In [3]:
df.shape

(255347, 18)

In [4]:
df.head()

Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,I38PQUQS96,56.0,85994.0,50587.0,520,80,4,15.23,36,0.44,Bachelor's,Full-time,Divorced,Yes,Yes,Other,Yes,0
1,HPSK72WA7R,69.0,50432.0,124440.0,458,15,1,4.81,60,0.68,Master's,Full-time,Married,No,No,Other,Yes,0
2,C1OZ6DPJ8Y,46.0,84208.0,129188.0,451,26,3,21.17,24,0.31,Master's,Unemployed,Divorced,Yes,Yes,Auto,No,1
3,V2KKSFM3UN,32.0,31713.0,44799.0,743,0,3,7.07,24,0.23,High School,Full-time,Married,No,No,Business,No,0
4,EY08JDHTZP,60.0,20437.0,9139.0,633,8,4,6.51,48,0.73,Bachelor's,Unemployed,Divorced,No,Yes,Auto,,0


## 1. Missing Values

In [5]:
df.isnull().sum()

LoanID             0
Age               21
Income            51
LoanAmount         1
CreditScore        0
MonthsEmployed     0
NumCreditLines     0
InterestRate       0
LoanTerm           0
DTIRatio           1
Education         14
EmploymentType     0
MaritalStatus     26
HasMortgage       15
HasDependents     25
LoanPurpose       34
HasCoSigner       12
Default            0
dtype: int64

In [6]:
# Fill value by mean mode 

df['Income'].fillna(df['Income'].mean(), inplace=True)

df['Age'].fillna(df['Age'].mode()[0], inplace=True)
df['Education'].fillna(df['Education'].mode()[0],inplace=True)
df['MaritalStatus'].fillna(df['MaritalStatus'].mode()[0],inplace=True)
df['HasMortgage'].fillna(df['HasMortgage'].mode()[0],inplace=True)
df['HasDependents'].fillna(df['HasDependents'].mode()[0],inplace=True) 
df['HasDependents'].fillna(df['HasDependents'].mode()[0],inplace=True)                        

df.dropna(axis=0, inplace=True)

In [7]:
df.isnull().sum()

LoanID            0
Age               0
Income            0
LoanAmount        0
CreditScore       0
MonthsEmployed    0
NumCreditLines    0
InterestRate      0
LoanTerm          0
DTIRatio          0
Education         0
EmploymentType    0
MaritalStatus     0
HasMortgage       0
HasDependents     0
LoanPurpose       0
HasCoSigner       0
Default           0
dtype: int64

In [8]:
df = df.drop(['LoanID'], axis=1)
df.columns

Index(['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed',
       'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio', 'Education',
       'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents',
       'LoanPurpose', 'HasCoSigner', 'Default'],
      dtype='object')

## 2. Numerical Variables

### Discrete variable

### Continuous Variable

## 3.Outliers

### Categorical Variables


### Find out the relationship between categorical variable and dependent feature Default


## 4. Label encoding

In [9]:
# convort the categorical into numerical 
from sklearn.preprocessing import LabelEncoder

df['Education'] = LabelEncoder().fit_transform(df['Education'])
df['EmploymentType'] = LabelEncoder().fit_transform(df['EmploymentType'])
df['MaritalStatus'] = LabelEncoder().fit_transform(df['MaritalStatus'])
df['HasMortgage'] = LabelEncoder().fit_transform(df['HasMortgage'])
df['HasDependents'] = LabelEncoder().fit_transform(df['HasDependents'])
df['LoanPurpose'] = LabelEncoder().fit_transform(df['LoanPurpose'])
df['HasCoSigner'] = LabelEncoder().fit_transform(df['HasCoSigner'])

## 5. Deal with imbalanced Data (SMOTE)

In [10]:
# pip install imbalanced-Learn

class_0_count, class_1_count = df['Default'].value_counts()

# separate  the class 1, 0 into 2 df
df_class_0 = df[df['Default']==0]
df_class_1 = df[df['Default']==1]

print(df_class_0['Default'].value_counts())
print(df_class_1['Default'].value_counts())

0    225650
Name: Default, dtype: int64
1    29649
Name: Default, dtype: int64


In [11]:
## class 0 ==> 60000
#  class 1 ==> 29649
# merge together

df_class_0 = df_class_0.sample(60000)

df2 = pd.concat([df_class_1, df_class_0],axis= 0)
df2.value_counts()


Age   Income    LoanAmount  CreditScore  MonthsEmployed  NumCreditLines  InterestRate  LoanTerm  DTIRatio  Education  EmploymentType  MaritalStatus  HasMortgage  HasDependents  LoanPurpose  HasCoSigner  Default
18.0  15100.0   230976.0    330          77              1               20.76         36        0.85      1          3               0              1            1              3            0            1          1
50.0  55265.0   81644.0     821          73              2               7.35          36        0.35      0          0               0              0            0              4            0            0          1
      55947.0   168039.0    763          1               2               17.43         36        0.67      1          3               2              1            1              0            0            1          1
      55762.0   232530.0    561          46              2               4.91          60        0.23      1          2               2      

In [12]:
x = df2.drop(['Default','NumCreditLines','EmploymentType','LoanTerm','MaritalStatus','LoanPurpose'],axis = 'columns')
y = df2['Default']
print(y.value_counts())
x.value_counts

0    60000
1    29649
Name: Default, dtype: int64


<bound method DataFrame.value_counts of          Age    Income  LoanAmount  CreditScore  MonthsEmployed  InterestRate  \
2       46.0   84208.0    129188.0          451              26         21.17   
5       25.0   90298.0     90448.0          720              18         22.72   
8       36.0   42053.0     92357.0          827              83         23.94   
11      28.0  149227.0    139759.0          375              56          5.84   
18      19.0   40718.0     78515.0          319             119         14.00   
...      ...       ...         ...          ...             ...           ...   
223947  22.0   63646.0    204101.0          672              98         22.16   
45928   52.0   71232.0    102128.0          783             116          5.04   
252727  53.0   91251.0    153999.0          724               9         23.04   
185148  33.0   81368.0     25493.0          622              12          3.32   
14445   61.0  111333.0     14005.0          778             118      

In [13]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy= 'minority',random_state=12345)
x_sm, y_sm = smote.fit_resample(x,y)
y_sm.value_counts()


1    60000
0    60000
Name: Default, dtype: int64

In [14]:
# sns.pairplot(df2)

# 2. Feature Selection

### Scaling the data

In [15]:
from sklearn.preprocessing import MinMaxScaler

x = pd.DataFrame(MinMaxScaler().fit_transform(x_sm),columns=['Age', 'Income', 'LoanAmount', 'CreditScore',
    'MonthsEmployed','InterestRate', 'DTIRatio', 'Education','HasMortgage', 'HasDependents','HasCoSigner'])
x.head()
# Age	Income	LoanAmount	CreditScore	MonthsEmployed	InterestRate DTIRatio Education	HasMortgage	HasDependents HasCoSigner	

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,InterestRate,DTIRatio,Education,HasMortgage,HasDependents,HasCoSigner
0,0.54902,0.512671,0.506904,0.275046,0.218487,0.833478,0.2625,0.666667,1.0,1.0,0.0
1,0.137255,0.557784,0.348777,0.765027,0.151261,0.90087,0.0,0.333333,1.0,0.0,1.0
2,0.352941,0.2004,0.356569,0.959927,0.697479,0.953913,0.125,0.0,1.0,0.0,0.0
3,0.196078,0.994311,0.550052,0.136612,0.470588,0.166957,0.875,1.0,0.0,0.0,1.0
4,0.019608,0.190511,0.30007,0.034608,1.0,0.521739,0.0875,0.0,1.0,0.0,0.0


## Split the data into training and testing dataset

In [16]:
from sklearn.model_selection import train_test_split

X_train, x_test, Y_train, y_test = train_test_split(x_sm, y_sm, train_size= 0.8, random_state=783437, stratify=y_sm)

# 3. Model Building

In [17]:
# Built the catboost model
from catboost import CatBoostClassifier
model_catboost = CatBoostClassifier().fit(X_train,Y_train)
model_catboost


Learning rate set to 0.072338
0:	learn: 0.6768788	total: 58.4ms	remaining: 58.4s
1:	learn: 0.6625370	total: 70ms	remaining: 34.9s
2:	learn: 0.6505444	total: 79.6ms	remaining: 26.4s
3:	learn: 0.6401671	total: 89.9ms	remaining: 22.4s
4:	learn: 0.6304428	total: 96.1ms	remaining: 19.1s
5:	learn: 0.6221036	total: 102ms	remaining: 16.8s
6:	learn: 0.6148411	total: 107ms	remaining: 15.2s
7:	learn: 0.6083106	total: 113ms	remaining: 14s
8:	learn: 0.6024115	total: 118ms	remaining: 13s
9:	learn: 0.5970800	total: 123ms	remaining: 12.2s
10:	learn: 0.5922643	total: 129ms	remaining: 11.6s
11:	learn: 0.5876903	total: 134ms	remaining: 11s
12:	learn: 0.5836093	total: 140ms	remaining: 10.6s
13:	learn: 0.5799249	total: 145ms	remaining: 10.2s
14:	learn: 0.5766185	total: 151ms	remaining: 9.89s
15:	learn: 0.5735492	total: 156ms	remaining: 9.57s
16:	learn: 0.5708252	total: 161ms	remaining: 9.32s
17:	learn: 0.5682608	total: 167ms	remaining: 9.11s
18:	learn: 0.5658951	total: 172ms	remaining: 8.9s
19:	learn: 0.56

<catboost.core.CatBoostClassifier at 0x755ec7ff44f0>

## Model Evaluation code

In [18]:
from sklearn.metrics import classification_report
        
print(classification_report(y_test, model_catboost.predict(x_test)))


              precision    recall  f1-score   support

           0       0.75      0.86      0.80     12000
           1       0.83      0.72      0.77     12000

    accuracy                           0.79     24000
   macro avg       0.79      0.79      0.79     24000
weighted avg       0.79      0.79      0.79     24000



## save the model

In [19]:
import pickle

with open("model.pkl", "wb") as file:
    pickle.dump(model_catboost, file)