## Predicting If Loan Will Be Approved Or Not Based On Customer Profile

In [9]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings ('ignore')

In [10]:
df = pd.read_csv ('loan_train.csv')
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [11]:
df.shape

(614, 13)

In [12]:
df.describe ()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [13]:
df.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [14]:
# Checking percentage of missing values
df.isnull().sum() * 100 / len (df)

Loan_ID              0.000000
Gender               2.117264
Married              0.488599
Dependents           2.442997
Education            0.000000
Self_Employed        5.211726
ApplicantIncome      0.000000
CoapplicantIncome    0.000000
LoanAmount           3.583062
Loan_Amount_Term     2.280130
Credit_History       8.143322
Property_Area        0.000000
Loan_Status          0.000000
dtype: float64

In [15]:
# We will drop missing values for columns less than 5% missing values
# We will fill columns with missing values up to 5%

columns = ['Gender', 'Dependents', 'LoanAmount' ,'Loan_Amount_Term']
df = df.dropna (subset = columns)
df.isnull().sum() * 100 / len (df)

Loan_ID              0.000000
Gender               0.000000
Married              0.000000
Dependents           0.000000
Education            0.000000
Self_Employed        5.424955
ApplicantIncome      0.000000
CoapplicantIncome    0.000000
LoanAmount           0.000000
Loan_Amount_Term     0.000000
Credit_History       8.679928
Property_Area        0.000000
Loan_Status          0.000000
dtype: float64

In [16]:
# Filling Missing Values For Columns with fillna over 5%

df ['Self_Employed'] = df ['Self_Employed'].fillna(df['Self_Employed'].mode()[0])
df ['Credit_History'] = df ['Credit_History'].fillna(df['Credit_History'].mode()[0])

In [17]:
df.isnull().sum() * 100 / len (df)

Loan_ID              0.0
Gender               0.0
Married              0.0
Dependents           0.0
Education            0.0
Self_Employed        0.0
ApplicantIncome      0.0
CoapplicantIncome    0.0
LoanAmount           0.0
Loan_Amount_Term     0.0
Credit_History       0.0
Property_Area        0.0
Loan_Status          0.0
dtype: float64

In [18]:
# There are some columns with symbols attached to their data eg in dependents we have 3+ dependents
# we will substitute that with 4
df ['Dependents']

1       1
2       0
3       0
4       0
5       2
       ..
609     0
610    3+
611     1
612     2
613     0
Name: Dependents, Length: 553, dtype: object

In [19]:
df ['Dependents'] = df ['Dependents'].replace (to_replace = '3+', value = 4)
df ['Dependents'].unique()

array(['1', '0', '2', 4], dtype=object)

### HANDLING CATEGORICAL COLUMNS

In [20]:
# We will be using the mapping function to change our categorical columns to numerical
df ['Gender'] = df ['Gender'].map({'Male': 1, 'Female':0}).astype('int')
df ['Married'] = df ['Married'].map({'Yes': 1, 'No':0}).astype('int')
df ['Education'] = df ['Education'].map({'Graduate': 1, 'Not Graduate':0}).astype('int')
df ['Self_Employed'] = df ['Self_Employed'].map({'Yes': 1, 'No':0}).astype('int')
df ['Property_Area'] = df ['Property_Area'].map({'Rural': 0, 'Semiurban':2, 'Urban': 1}).astype('int')
df ['Loan_Status'] = df ['Loan_Status'].map({'Y': 1, 'N':0}).astype('int')

In [21]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,LP001003,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,0,0
2,LP001005,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,1,1
3,LP001006,1,1,0,0,0,2583,2358.0,120.0,360.0,1.0,1,1
4,LP001008,1,0,0,1,0,6000,0.0,141.0,360.0,1.0,1,1
5,LP001011,1,1,2,1,1,5417,4196.0,267.0,360.0,1.0,1,1


In [22]:
x = df.drop (columns = ['Loan_ID', 'Loan_Status'])
y = df ['Loan_Status']

In [23]:
x.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
1,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,0
2,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,1
3,1,1,0,0,0,2583,2358.0,120.0,360.0,1.0,1
4,1,0,0,1,0,6000,0.0,141.0,360.0,1.0,1
5,1,1,2,1,1,5417,4196.0,267.0,360.0,1.0,1


In [24]:
# Perform Feature Scaling For Our Features That Are Not In Thesame Range.
cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term' ]

In [25]:

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler ()
x [cols] = scaler.fit_transform (x [cols])
x

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
1,1,1,1,1,0,-0.128694,-0.049699,-0.214368,0.279961,1.0,0
2,1,1,0,1,1,-0.394296,-0.545638,-0.952675,0.279961,1.0,1
3,1,1,0,0,0,-0.464262,0.229842,-0.309634,0.279961,1.0,1
4,1,0,0,1,0,0.109057,-0.545638,-0.059562,0.279961,1.0,1
5,1,1,2,1,1,0.011239,0.834309,1.440866,0.279961,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,1,0,-0.411075,-0.545638,-0.893134,0.279961,1.0,0
610,1,1,4,1,0,-0.208727,-0.545638,-1.262287,-2.468292,1.0,0
611,1,1,1,1,0,0.456706,-0.466709,1.274152,0.279961,1.0,1
612,1,1,2,1,0,0.374659,-0.545638,0.488213,0.279961,1.0,1


In [26]:
# Split Data Into Training And Testing and then cross validating data
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

In [27]:
model_df = {}
def model_val (model, x, y):
    x_train, x_test, y_train, y_test = train_test_split (x, y, 
                                                         test_size = 0.20,
                                                         stratify = y,
                                                        random_state = 42)
    model.fit (x_train, y_train)
    y_pred = model.predict (x_test)
    print (f'{model} accuracy is {accuracy_score (y_test, y_pred)}')
    
    score = cross_val_score (model, x,y, cv = 5)
    print (f'{model} Average Cross Val Score is {np.mean (score)}')
    model_df [model] = round (np.mean (score) * 100, 2)

## LOGISTIC REGRESSION

In [28]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression ()
model_val (model, x, y)

LogisticRegression() accuracy is 0.8018018018018018
LogisticRegression() Average Cross Val Score is 0.8047829647829647


## SUPPORT VECTOR CLASSIFIER (SVC)

In [29]:
from sklearn import svm
model = svm.SVC ()
model_val (model, x, y)

SVC() accuracy is 0.7927927927927928
SVC() Average Cross Val Score is 0.7938902538902539


## DECISION TREE CLASSIFIER

In [30]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier ()
model_val (model, x, y)

DecisionTreeClassifier() accuracy is 0.7477477477477478
DecisionTreeClassifier() Average Cross Val Score is 0.7053071253071252


## RANDOM FOREST CLASSIFIER

In [31]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier ()
model_val (model, x, y)

RandomForestClassifier() accuracy is 0.8288288288288288
RandomForestClassifier() Average Cross Val Score is 0.7884684684684685


## GRADIENT BOOSTING CLASSIFIER

In [32]:
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier ()
model_val (model, x, y)

GradientBoostingClassifier() accuracy is 0.8018018018018018
GradientBoostingClassifier() Average Cross Val Score is 0.7685503685503685


In [33]:
model_df

{LogisticRegression(): 80.48,
 SVC(): 79.39,
 DecisionTreeClassifier(): 70.53,
 RandomForestClassifier(): 78.85,
 GradientBoostingClassifier(): 76.86}

## HYPERPARAMETER TUNING

In [34]:
from sklearn.model_selection import RandomizedSearchCV

### LOGISTIC REGRESSION

In [35]:
log_reg_grid={"C":np.logspace(-4,4,20),
             "solver":['liblinear']}

In [36]:
rs_log_reg=RandomizedSearchCV(LogisticRegression(),
                   param_distributions=log_reg_grid,
                  n_iter=20,cv=5,verbose=True)

In [37]:
rs_log_reg.fit(x,y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [38]:
rs_log_reg.best_score_

0.8047829647829647

In [39]:
rs_log_reg.best_params_

{'solver': 'liblinear', 'C': 0.23357214690901212}

### SVC

In [40]:
svc_grid = {'C':[0.25,0.50,0.75,1],"kernel":["linear"]}

In [41]:
rs_svc=RandomizedSearchCV(svm.SVC(),
                  param_distributions=svc_grid,
                   cv=5,
                   n_iter=20,
                  verbose=True)

In [42]:
rs_svc.fit(x,y)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


In [43]:
rs_svc.best_score_

0.8066011466011467

In [44]:
rs_svc.best_params_

{'kernel': 'linear', 'C': 0.25}

### RANDOM FOREST CLASSIFIER

In [45]:
rf_grid={'n_estimators':np.arange(10,1000,10),
  'max_features':['auto','sqrt'],
 'max_depth':[None,3,5,10,20,30],
 'min_samples_split':[2,5,20,50,100],
 'min_samples_leaf':[1,2,5,10]
 }

In [46]:
rs_rf=RandomizedSearchCV(RandomForestClassifier(),
                  param_distributions=rf_grid,
                   cv=5,
                   n_iter=20,
                  verbose=True)

In [47]:
rs_rf.fit(x,y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [48]:
rs_rf.best_score_

0.8084193284193285

In [49]:
rs_rf.best_params_

{'n_estimators': 320,
 'min_samples_split': 20,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 3}

## SAVE THE MODEL

In [50]:
model_df

{LogisticRegression(): 80.48,
 SVC(): 79.39,
 DecisionTreeClassifier(): 70.53,
 RandomForestClassifier(): 78.85,
 GradientBoostingClassifier(): 76.86}

In [51]:
model = svm.SVC ()

In [52]:
model.fit (x_train, y_train)

NameError: name 'x_train' is not defined

In [None]:
model.score (x_train, y_train)

In [None]:
import joblib

In [None]:
joblib.dump(model,'loan_status_prediction')

In [None]:
model = joblib.load ('loan_status_prediction')

## TESTING PREDICTION

In [None]:
df = pd.DataFrame({
    'Gender':1,
    'Married':1,
    'Dependents':2,
    'Education':0,
    'Self_Employed':0,
    'ApplicantIncome':2889,
    'CoapplicantIncome':0.0,
    'LoanAmount':45,
    'Loan_Amount_Term':180,
    'Credit_History':0,
    'Property_Area':1
},index=[0])

In [None]:
df

In [None]:
result = model.predict (df)
print (result)

if result==1:
    print("Loan Approved")
else:
    print("Loan Not Approved")

## GUI

In [None]:
from tkinter import *
import joblib
import pandas as pd

In [None]:
def show_entry():
    
    p1 = float(e1.get())
    p2 = float(e2.get())
    p3 = float(e3.get())
    p4 = float(e4.get())
    p5 = float(e5.get())
    p6 = float(e6.get())
    p7 = float(e7.get())
    p8 = float(e8.get())
    p9 = float(e9.get())
    p10 = float(e10.get())
    p11 = float(e11.get())
    
    model = joblib.load('loan_status_predict')
    df = pd.DataFrame({
    'Gender':p1,
    'Married':p2,
    'Dependents':p3,
    'Education':p4,
    'Self_Employed':p5,
    'ApplicantIncome':p6,
    'CoapplicantIncome':p7,
    'LoanAmount':p8,
    'Loan_Amount_Term':p9,
    'Credit_History':p10,
    'Property_Area':p11
},index=[0])
    result = model.predict(df)
    
    if result == 1:
        Label(master, text="Loan approved").grid(row=31)
    else:
        Label(master, text="Loan Not Approved").grid(row=31)
        
    
master =Tk()
master.title("Loan Status Prediction Using Machine Learning")
label = Label(master,text = "Loan Status Prediction",bg = "black",
               fg = "white").grid(row=0,columnspan=2)

Label(master,text = "Gender [1:Male ,0:Female]").grid(row=1)
Label(master,text = "Married [1:Yes,0:No]").grid(row=2)
Label(master,text = "Dependents [1,2,3,4]").grid(row=3)
Label(master,text = "Education").grid(row=4)
Label(master,text = "Self_Employed").grid(row=5)
Label(master,text = "ApplicantIncome").grid(row=6)
Label(master,text = "CoapplicantIncome").grid(row=7)
Label(master,text = "LoanAmount").grid(row=8)
Label(master,text = "Loan_Amount_Term").grid(row=9)
Label(master,text = "Credit_History").grid(row=10)
Label(master,text = "Property_Area").grid(row=11)


e1 = Entry(master)
e2 = Entry(master)
e3 = Entry(master)
e4 = Entry(master)
e5 = Entry(master)
e6 = Entry(master)
e7 = Entry(master)
e8 = Entry(master)
e9 = Entry(master)
e10 = Entry(master)
e11 = Entry(master)


e1.grid(row=1,column=1)
e2.grid(row=2,column=1)
e3.grid(row=3,column=1)
e4.grid(row=4,column=1)
e5.grid(row=5,column=1)
e6.grid(row=6,column=1)
e7.grid(row=7,column=1)
e8.grid(row=8,column=1)
e9.grid(row=9,column=1)
e10.grid(row=10,column=1)
e11.grid(row=11,column=1)

Button(master,text="Predict",command=show_entry).grid()

mainloop()