# Loan Approval Prediction System

## Import required packages and dataset 

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_json("loan_data.json")

In [3]:
df

Unnamed: 0,Application_ID,Gender,Married,Dependents,Education,Self_Employed,Credit_History,Property_Area,Income,Application_Status
0,LP001002,Male,No,0,Graduate,No,1,Urban,medium,Y
1,LP001003,Male,Yes,1,Graduate,No,1,Rural,medium,N
2,LP001005,Male,Yes,0,Graduate,Yes,1,Urban,low,Y
3,LP001006,Male,Yes,0,Not Graduate,No,1,Urban,low,Y
4,LP001008,Male,No,0,Graduate,No,1,Urban,medium,Y
...,...,...,...,...,...,...,...,...,...,...
506,LP002978,Female,No,0,Graduate,No,1,Rural,low,Y
507,LP002979,Male,Yes,3+,Graduate,No,1,Rural,medium,Y
508,LP002983,Male,Yes,1,Graduate,No,1,Urban,medium,Y
509,LP002984,Male,Yes,2,Graduate,No,1,Urban,medium,Y


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 511 entries, 0 to 510
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Application_ID      511 non-null    object
 1   Gender              511 non-null    object
 2   Married             511 non-null    object
 3   Dependents          511 non-null    object
 4   Education           511 non-null    object
 5   Self_Employed       511 non-null    object
 6   Credit_History      511 non-null    int64 
 7   Property_Area       511 non-null    object
 8   Income              511 non-null    object
 9   Application_Status  511 non-null    object
dtypes: int64(1), object(9)
memory usage: 40.0+ KB


In [5]:
df['Dependents'].unique()

array(['0', '1', '2', '3+'], dtype=object)

## Percentage of total applicants for each unique value of dependents

In [6]:
df['Dependents'].value_counts(normalize=True)*100

0     57.534247
2     17.221135
1     16.634051
3+     8.610568
Name: Dependents, dtype: float64

## Average number of dependents per applicant 

In [7]:
df.groupby('Dependents')['Application_ID'].count()

Dependents
0     294
1      85
2      88
3+     44
Name: Application_ID, dtype: int64

In [8]:
df_dependents_group = df.groupby('Dependents')['Application_ID'].count() / 4

In [9]:
df_dependents_group

Dependents
0     73.50
1     21.25
2     22.00
3+    11.00
Name: Application_ID, dtype: float64

## Percentage of applications approved for self-employed applicants 

In [10]:
df.Self_Employed.value_counts()

No     441
Yes     70
Name: Self_Employed, dtype: int64

In [11]:
df.groupby(['Self_Employed', 'Application_Status']).size()

Self_Employed  Application_Status
No             N                     140
               Y                     301
Yes            N                      24
               Y                      46
dtype: int64

In [12]:
df.groupby(['Self_Employed', 'Application_Status']).size()/sum(df.groupby(['Self_Employed', 'Application_Status']).size()) * 100

Self_Employed  Application_Status
No             N                     27.397260
               Y                     58.904110
Yes            N                      4.696673
               Y                      9.001957
dtype: float64

In [13]:
df.groupby(['Self_Employed', 'Application_Status']).size()/sum(df.groupby(['Self_Employed', 'Application_Status']).size()) * 100

Self_Employed  Application_Status
No             N                     27.397260
               Y                     58.904110
Yes            N                      4.696673
               Y                      9.001957
dtype: float64

In [14]:
df_self_employed_approved = df.groupby(['Self_Employed', 'Application_Status']).size()[3]/sum(df.groupby(['Self_Employed', 'Application_Status']).size()) * 100

In [15]:
df_self_employed_approved

9.001956947162427

## Percentage of rejections for married male applicants 

In [16]:
df.groupby(['Gender', 'Married', 'Application_Status']).size()

Gender  Married  Application_Status
Female  No       N                      26
                 Y                      40
        Yes      N                       8
                 Y                      17
Male    No       N                      43
                 Y                      71
        Yes      N                      87
                 Y                     219
dtype: int64

In [17]:
df.groupby(['Gender', 'Married', 'Application_Status']).size()/sum(df.groupby(['Gender', 'Married', 'Application_Status']).size()) * 100

Gender  Married  Application_Status
Female  No       N                      5.088063
                 Y                      7.827789
        Yes      N                      1.565558
                 Y                      3.326810
Male    No       N                      8.414873
                 Y                     13.894325
        Yes      N                     17.025440
                 Y                     42.857143
dtype: float64

In [18]:
df_male_married_rejected = df.groupby(['Gender', 'Married', 'Application_Status']).size()[6]/sum(df.groupby(['Gender', 'Married']).size()) * 100

In [19]:
df_male_married_rejected

17.025440313111545

## Property area has the maximum approval ratio 

In [20]:
df_approval_area_count = df.groupby(df.loc[df['Application_Status']=='Y']['Property_Area'])
df_approval_area_count['Application_Status'].count()

Property_Area
Rural         90
Semiurban    153
Urban        104
Name: Application_Status, dtype: int64

In [21]:
df_rejection_area_count = df.groupby(df.loc[df['Application_Status']=='N']['Property_Area'])
df_rejection_area_count['Application_Status'].count()

Property_Area
Rural        59
Semiurban    44
Urban        61
Name: Application_Status, dtype: int64

In [22]:
df_approval_ratio = df_approval_area_count['Application_Status'].count() / (df_approval_area_count['Application_Status'].count() + df_rejection_area_count['Application_Status'].count())

In [23]:
df_max_approval = df_approval_ratio.max()
df_max_approval_ratio = df_approval_ratio.idxmax()
print("Maximum Area '{}' Ratio '{}'".format(df_max_approval_ratio,df_max_approval))

Maximum Area 'Semiurban' Ratio '0.7766497461928934'


## Average number of dependents per income group 

In [24]:
df.groupby('Dependents')['Income'].count()

Dependents
0     294
1      85
2      88
3+     44
Name: Income, dtype: int64

In [25]:
df_dependents_income_group = df.groupby('Dependents')['Income'].count() / 4

In [26]:
df_dependents_income_group

Dependents
0     73.50
1     21.25
2     22.00
3+    11.00
Name: Income, dtype: float64

## Simple predictive model to assess whether a loan application will be approved or rejected 

In [27]:
df.head()

Unnamed: 0,Application_ID,Gender,Married,Dependents,Education,Self_Employed,Credit_History,Property_Area,Income,Application_Status
0,LP001002,Male,No,0,Graduate,No,1,Urban,medium,Y
1,LP001003,Male,Yes,1,Graduate,No,1,Rural,medium,N
2,LP001005,Male,Yes,0,Graduate,Yes,1,Urban,low,Y
3,LP001006,Male,Yes,0,Not Graduate,No,1,Urban,low,Y
4,LP001008,Male,No,0,Graduate,No,1,Urban,medium,Y


In [28]:
df.drop('Application_ID', axis=1, inplace=True)

In [29]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Credit_History,Property_Area,Income,Application_Status
0,Male,No,0,Graduate,No,1,Urban,medium,Y
1,Male,Yes,1,Graduate,No,1,Rural,medium,N
2,Male,Yes,0,Graduate,Yes,1,Urban,low,Y
3,Male,Yes,0,Not Graduate,No,1,Urban,low,Y
4,Male,No,0,Graduate,No,1,Urban,medium,Y


In [30]:
label_encoder = preprocessing.LabelEncoder()
columns = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Credit_History', 'Property_Area', 'Income', 'Application_Status']
for col in columns:
    df[col] = label_encoder.fit_transform(df[col])

In [31]:
y = df['Application_Status']
y

0      1
1      0
2      1
3      1
4      1
      ..
506    1
507    1
508    1
509    1
510    0
Name: Application_Status, Length: 511, dtype: int32

In [32]:
X = df.drop('Application_Status', axis=1)
X

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Credit_History,Property_Area,Income
0,1,0,0,0,0,1,2,2
1,1,1,1,0,0,1,0,2
2,1,1,0,0,1,1,2,1
3,1,1,0,1,0,1,2,1
4,1,0,0,0,0,1,2,2
...,...,...,...,...,...,...,...,...
506,0,0,0,0,0,1,0,1
507,1,1,3,0,0,1,0,2
508,1,1,1,0,0,1,2,2
509,1,1,2,0,0,1,2,2


In [33]:
df.corr()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Credit_History,Property_Area,Income,Application_Status
Gender,1.0,0.363499,0.200692,0.044667,-0.007948,0.024682,-0.020576,0.06089,0.052533
Married,0.363499,1.0,0.375597,-0.002516,0.01975,0.020519,0.029479,-0.019007,0.09856
Dependents,0.200692,0.375597,1.0,0.03925,0.05118,-0.039492,-0.000748,0.075123,0.016984
Education,0.044667,-0.002516,0.03925,1.0,-0.014796,-0.07572,-0.033095,-0.022029,-0.088699
Self_Employed,-0.007948,0.01975,0.05118,-0.014796,1.0,-0.016306,-0.052259,-0.011734,-0.018705
Credit_History,0.024682,0.020519,-0.039492,-0.07572,-0.016306,1.0,-0.003404,0.018913,0.545934
Property_Area,-0.020576,0.029479,-0.000748,-0.033095,-0.052259,-0.003404,1.0,-0.079382,0.016778
Income,0.06089,-0.019007,0.075123,-0.022029,-0.011734,0.018913,-0.079382,1.0,0.044086
Application_Status,0.052533,0.09856,0.016984,-0.088699,-0.018705,0.545934,0.016778,0.044086,1.0


In [34]:
df.drop(['Education','Self_Employed'],axis=1,inplace=True)

In [35]:
X_train, X_val_test, y_train, y_val_test = train_test_split(X, y, test_size=0.4, random_state=69)

In [36]:
print(X_train.shape)
print(X_val_test.shape)
print(y_train.shape)
print(y_val_test.shape)

(306, 8)
(205, 8)
(306,)
(205,)


In [37]:
X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size=0.5, random_state=69)

In [38]:
print(X_val.shape)
print(X_test.shape)
print(y_val.shape)
print(y_test.shape)

(102, 8)
(103, 8)
(102,)
(103,)


In [39]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score

### Logistic Regression 

In [41]:
from sklearn.linear_model import LogisticRegression
regressor = LogisticRegression()
regressor.fit(X_train, y_train)
predicted= regressor.predict(X_test)

In [42]:
print('MSE_Score:  ', mean_squared_error(y_test, predicted))
print('R2_Score:  ', r2_score(y_test, predicted))
print('Accuracy_Score:  ', accuracy_score(y_test, predicted))

MSE_Score:   0.1941747572815534
R2_Score:   0.13445378151260512
Accuracy_Score:   0.8058252427184466


### XGB Regressor 

In [44]:
import xgboost as xgb
param = {
    'max_depth': 7,
    'eta': 0.05, 
    'objective': 'binary:hinge',
    'eval_metric':"mae"}  
    #'num_class': 2}  
num_round = 100
dtrain = xgb.DMatrix(X,y)
bst = xgb.train(param,dtrain,num_round)
preds = bst.predict(dtrain)
from sklearn.metrics import precision_score
print(precision_score(y, preds, average='macro')*100)

89.73994048389183


In [46]:
from sklearn.metrics import mean_absolute_error
X = X_train
y = y_train
reg = xgb.XGBRegressor(
    tree_method="hist",
    eval_metric=mean_absolute_error,
    
)
reg.fit(X, y, eval_set=[(X, y)])
predicted = regressor.predict(X_test)
accuracy_score(y_test, predicted)

[0]	validation_0-rmse:0.43106	validation_0-mean_absolute_error:0.42530
[1]	validation_0-rmse:0.38926	validation_0-mean_absolute_error:0.37081
[2]	validation_0-rmse:0.35824	validation_0-mean_absolute_error:0.32666
[3]	validation_0-rmse:0.33981	validation_0-mean_absolute_error:0.29497
[4]	validation_0-rmse:0.32781	validation_0-mean_absolute_error:0.27096
[5]	validation_0-rmse:0.31948	validation_0-mean_absolute_error:0.25342
[6]	validation_0-rmse:0.31269	validation_0-mean_absolute_error:0.23988
[7]	validation_0-rmse:0.30882	validation_0-mean_absolute_error:0.22926
[8]	validation_0-rmse:0.30345	validation_0-mean_absolute_error:0.21957
[9]	validation_0-rmse:0.29997	validation_0-mean_absolute_error:0.21309
[10]	validation_0-rmse:0.29795	validation_0-mean_absolute_error:0.20781
[11]	validation_0-rmse:0.29581	validation_0-mean_absolute_error:0.20334
[12]	validation_0-rmse:0.29453	validation_0-mean_absolute_error:0.19989
[13]	validation_0-rmse:0.29306	validation_0-mean_absolute_error:0.19746
[1

0.8058252427184466

In [48]:
params = {'objective':'binary:hinge'}
regressor = xgb.XGBRegressor(objective='binary:hinge',max_depth=5,n_estimators=50,eval_metric="rmse",
                             learning_rate=0.03,
                            sampling_method='uniform')

regressor.fit(X_train, y_train)
predicted = regressor.predict(X_test)

In [49]:
print('MSE_Score:  ', mean_squared_error(y_test, predicted))
print('R2_Score:  ', r2_score(y_test, predicted))
print('Accuracy_Score:  ', accuracy_score(y_test, predicted))

MSE_Score:   0.24271844660194175
R2_Score:   -0.08193277310924363
Accuracy_Score:   0.7572815533980582
