In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [7]:
loan = pd.read_csv('Data/Loan_Default.csv')
print(loan.head())

      ID  year loan_limit             Gender approv_in_adv loan_type  \
0  24890  2019         cf  Sex Not Available         nopre     type1   
1  24891  2019         cf               Male         nopre     type2   
2  24892  2019         cf               Male           pre     type1   
3  24893  2019         cf               Male         nopre     type1   
4  24894  2019         cf              Joint           pre     type1   

  loan_purpose Credit_Worthiness open_credit business_or_commercial  ...  \
0           p1                l1        nopc                  nob/c  ...   
1           p1                l1        nopc                    b/c  ...   
2           p1                l1        nopc                  nob/c  ...   
3           p4                l1        nopc                  nob/c  ...   
4           p1                l1        nopc                  nob/c  ...   

   credit_type  Credit_Score  co-applicant_credit_type    age  \
0          EXP           758                 

In [8]:
loan.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148670 entries, 0 to 148669
Data columns (total 34 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   ID                         148670 non-null  int64  
 1   year                       148670 non-null  int64  
 2   loan_limit                 145326 non-null  object 
 3   Gender                     148670 non-null  object 
 4   approv_in_adv              147762 non-null  object 
 5   loan_type                  148670 non-null  object 
 6   loan_purpose               148536 non-null  object 
 7   Credit_Worthiness          148670 non-null  object 
 8   open_credit                148670 non-null  object 
 9   business_or_commercial     148670 non-null  object 
 10  loan_amount                148670 non-null  int64  
 11  rate_of_interest           112231 non-null  float64
 12  Interest_rate_spread       112031 non-null  float64
 13  Upfront_charges            10

In [9]:
loan.year.unique()

array([2019])

Since we have only one year,2019, i guess we can drop year column

In [10]:
loan.drop(columns=['year'], inplace=True)

In [11]:
#converting all column names into lower case as it will be difficult

# Convert all column names to lowercase
loan.columns = loan.columns.str.lower()


In [12]:
loan.isnull().sum()

id                               0
loan_limit                    3344
gender                           0
approv_in_adv                  908
loan_type                        0
loan_purpose                   134
credit_worthiness                0
open_credit                      0
business_or_commercial           0
loan_amount                      0
rate_of_interest             36439
interest_rate_spread         36639
upfront_charges              39642
term                            41
neg_ammortization              121
interest_only                    0
lump_sum_payment                 0
property_value               15098
construction_type                0
occupancy_type                   0
secured_by                       0
total_units                      0
income                        9150
credit_type                      0
credit_score                     0
co-applicant_credit_type         0
age                            200
submission_of_application      200
ltv                 

Apparently there are too many null variables in columns, let us separete them to numrical and categorical columns and then work on Null values.

In [13]:
# Getting dataframe columns by datatype
dtypes = pd.DataFrame(loan.dtypes).reset_index()

cat_vars = []
num_vars = []
for i, l in zip(dtypes['index'], dtypes[0]):
    if l == 'object':
        cat_vars.append(i)
    else:
        num_vars.append(i)

In [14]:
num_vars

['id',
 'loan_amount',
 'rate_of_interest',
 'interest_rate_spread',
 'upfront_charges',
 'term',
 'property_value',
 'income',
 'credit_score',
 'ltv',
 'status',
 'dtir1']

In [15]:
#let us try to replace the null values of Numerical columns by their mean, from above num_vars, pick those columns
#which have null values. We are filling using fillna.

loan['rate_of_interest']= loan['rate_of_interest'].fillna(loan['rate_of_interest'].mean())
loan['interest_rate_spread']=loan['interest_rate_spread'].fillna(loan['interest_rate_spread'].mean())
loan['upfront_charges']=loan['upfront_charges'].fillna(loan['upfront_charges'].mean())
loan['term']=loan['term'].fillna(loan['term'].mean())
loan['property_value']=loan['property_value'].fillna(loan['property_value'].mean())
loan['income']=loan['income'].fillna(loan['income'].mean())
loan['ltv']=loan['ltv'].fillna(loan['ltv'].mean())
loan['dtir1']=loan['dtir1'].fillna(loan['dtir1'].mean())

In [16]:
loan.isna().sum()

id                              0
loan_limit                   3344
gender                          0
approv_in_adv                 908
loan_type                       0
loan_purpose                  134
credit_worthiness               0
open_credit                     0
business_or_commercial          0
loan_amount                     0
rate_of_interest                0
interest_rate_spread            0
upfront_charges                 0
term                            0
neg_ammortization             121
interest_only                   0
lump_sum_payment                0
property_value                  0
construction_type               0
occupancy_type                  0
secured_by                      0
total_units                     0
income                          0
credit_type                     0
credit_score                    0
co-applicant_credit_type        0
age                           200
submission_of_application     200
ltv                             0
region        

So only Categorical columns are left. And we can observe that we have only 3300 maximum Null values, so we will drop them as the dataset is too large.

In [17]:
loan = loan.dropna()

In [18]:
loan.isnull().sum()

id                           0
loan_limit                   0
gender                       0
approv_in_adv                0
loan_type                    0
loan_purpose                 0
credit_worthiness            0
open_credit                  0
business_or_commercial       0
loan_amount                  0
rate_of_interest             0
interest_rate_spread         0
upfront_charges              0
term                         0
neg_ammortization            0
interest_only                0
lump_sum_payment             0
property_value               0
construction_type            0
occupancy_type               0
secured_by                   0
total_units                  0
income                       0
credit_type                  0
credit_score                 0
co-applicant_credit_type     0
age                          0
submission_of_application    0
ltv                          0
region                       0
security_type                0
status                       0
dtir1   

There are no Null values. Let us work on unique values in categorical columns, so we have better understanding of data. Classifying and storing all categorical columns in a list by checking for unique values < 10. We need to encode these columns.

In [19]:
n_unique_values = 10
all_columns = loan.columns.to_list()
categorical_columns = []

for i, column in enumerate(all_columns):
    temp = loan.loc[:, column].unique()
    if len(temp) < n_unique_values:
        print (f"{i+1}. {column} has {len(temp)} unique values, which are {temp}") 
        categorical_columns.append(column)
del temp

2. loan_limit has 2 unique values, which are ['cf' 'ncf']
3. gender has 4 unique values, which are ['Sex Not Available' 'Male' 'Joint' 'Female']
4. approv_in_adv has 2 unique values, which are ['nopre' 'pre']
5. loan_type has 3 unique values, which are ['type1' 'type2' 'type3']
6. loan_purpose has 4 unique values, which are ['p1' 'p4' 'p3' 'p2']
7. credit_worthiness has 2 unique values, which are ['l1' 'l2']
8. open_credit has 2 unique values, which are ['nopc' 'opc']
9. business_or_commercial has 2 unique values, which are ['nob/c' 'b/c']
15. neg_ammortization has 2 unique values, which are ['not_neg' 'neg_amm']
16. interest_only has 2 unique values, which are ['not_int' 'int_only']
17. lump_sum_payment has 2 unique values, which are ['not_lpsm' 'lpsm']
19. construction_type has 2 unique values, which are ['sb' 'mh']
20. occupancy_type has 3 unique values, which are ['pr' 'sr' 'ir']
21. secured_by has 2 unique values, which are ['home' 'land']
22. total_units has 4 unique values, whic

let us remove columns which we are not going to be useful and then work on encoding

In [20]:
loan = loan.drop('id',axis=1)

In [21]:
#let us separate categorical columns to do encoding

cat_col = loan[categorical_columns]

loan = loan.drop(categorical_columns, axis=1)

### Encoding all the categorical columns

In [22]:
cat_col=pd.get_dummies(cat_col)

In [23]:
cat_col.head()

Unnamed: 0,status,loan_limit_cf,loan_limit_ncf,gender_Female,gender_Joint,gender_Male,gender_Sex Not Available,approv_in_adv_nopre,approv_in_adv_pre,loan_type_type1,...,age_<25,age_>74,submission_of_application_not_inst,submission_of_application_to_inst,region_North,region_North-East,region_central,region_south,security_type_Indriect,security_type_direct
0,1,1,0,0,0,0,1,1,0,1,...,0,0,0,1,0,0,0,1,0,1
1,1,1,0,0,0,1,0,1,0,0,...,0,0,0,1,1,0,0,0,0,1
2,0,1,0,0,0,1,0,0,1,1,...,0,0,0,1,0,0,0,1,0,1
3,0,1,0,0,0,1,0,1,0,1,...,0,0,1,0,1,0,0,0,0,1
4,0,1,0,0,1,0,0,0,1,1,...,0,0,1,0,1,0,0,0,0,1


In [24]:
# Combing the columns having continous values with new encoded dataframe

loan_df = pd.concat([loan, cat_col], axis=1)

In [25]:
loan_df.head()

Unnamed: 0,loan_amount,rate_of_interest,interest_rate_spread,upfront_charges,term,property_value,income,credit_score,ltv,dtir1,...,age_<25,age_>74,submission_of_application_not_inst,submission_of_application_to_inst,region_North,region_North-East,region_central,region_south,security_type_Indriect,security_type_direct
0,116500,4.045476,0.441656,3224.996127,360.0,118000.0,1740.0,758,98.728814,45.0,...,0,0,0,1,0,0,0,1,0,1
1,206500,4.045476,0.441656,3224.996127,360.0,497893.465696,4980.0,552,72.746457,37.732932,...,0,0,0,1,1,0,0,0,0,1
2,406500,4.56,0.2,595.0,360.0,508000.0,9480.0,834,80.019685,46.0,...,0,0,0,1,0,0,0,1,0,1
3,456500,4.25,0.681,3224.996127,360.0,658000.0,11880.0,587,69.3769,42.0,...,0,0,1,0,1,0,0,0,0,1
4,696500,4.0,0.3042,0.0,360.0,758000.0,10440.0,602,91.886544,39.0,...,0,0,1,0,1,0,0,0,0,1


In [26]:
#let us keep the track of top correlated features

corr = loan_df.corr().abs()
features = []
correlations = []
for idx, correlation in corr['status'].T.iteritems():
    if correlation >= .10 and idx != 'status':
        features.append(idx)
        correlations.append(correlation)
corr_status_df = pd.DataFrame(
    {'Correlations':correlations, 'Features': features}).sort_values(by=['Correlations'], ascending=False)

In [27]:
print('Correlations with status')
display(corr_status_df)

Correlations with status


Unnamed: 0,Correlations,Features
6,0.589909,credit_type_EQUI
3,0.191781,lump_sum_payment_not_lpsm
2,0.191781,lump_sum_payment_lpsm
9,0.141979,co-applicant_credit_type_EXP
8,0.141979,co-applicant_credit_type_CIB
4,0.140994,credit_type_CIB
1,0.133661,neg_ammortization_not_neg
0,0.133661,neg_ammortization_neg_amm
5,0.12463,credit_type_CRIF
7,0.123233,credit_type_EXP


### Splitting the data

In [28]:
target = loan_df['status']
feature = loan_df.drop('status',axis=1)

In [29]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(feature,target,test_size=0.25)

### feature scaling

In [30]:
#scaling using standard scaler

from sklearn.preprocessing import StandardScaler

stdscaler=StandardScaler()
x_train=stdscaler.fit_transform(x_train)
x_test=stdscaler.transform(x_test)

# KNN

In [31]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [32]:
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(x_train,y_train)

KNeighborsClassifier(n_neighbors=7)

In [33]:
y_pred = knn.predict(x_test)

In [34]:
print('Accuracy Score of Train Data is : ',knn.score(x_train,y_train))

Accuracy Score of Train Data is :  0.9019696815357404


In [35]:
score = accuracy_score(y_test,y_pred)
print('Accuracy Score of Test Data is : ', score)

Accuracy Score of Test Data is :  0.8756528503167018


In [36]:
print(confusion_matrix(y_test,y_pred))

[[26230   999]
 [ 3477  5290]]


In [37]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.88      0.96      0.92     27229
           1       0.84      0.60      0.70      8767

    accuracy                           0.88     35996
   macro avg       0.86      0.78      0.81     35996
weighted avg       0.87      0.88      0.87     35996



From Classification report we can observe, Accuracy Score of Test Data by KNN is 87.7% == 88%

# Logistic Regression

In [38]:
from sklearn.linear_model import LogisticRegression

In [39]:
lr = LogisticRegression()
lr.fit(x_train,y_train)

LogisticRegression()

In [40]:
y_pred = lr.predict(x_test)

In [41]:
print('The Accuracy Score of Train Data by Logistic Regression is : ',lr.score(x_train,y_train))

The Accuracy Score of Train Data by Logistic Regression is :  0.8671506755442785


In [42]:
score1 = accuracy_score(y_test,y_pred)
print('The Accuracy Score of Test Data by Logistic Regression is : ',score1)

The Accuracy Score of Test Data by Logistic Regression is :  0.8663184798310923


In [43]:
print(confusion_matrix(y_test,y_pred))

[[26975   254]
 [ 4558  4209]]


In [44]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.86      0.99      0.92     27229
           1       0.94      0.48      0.64      8767

    accuracy                           0.87     35996
   macro avg       0.90      0.74      0.78     35996
weighted avg       0.88      0.87      0.85     35996



From Above, we can see the accuracy score as 86.9% == 87%

# Decision Tree

In [45]:
from sklearn.tree import DecisionTreeClassifier

In [53]:
dt = DecisionTreeClassifier(max_depth=2)
dt.fit(x_train,y_train)
y_pred = dt.predict(x_test)

In [54]:
print('Accuracy Score of Train Data by Decision Tree is : ', dt.score(x_train,y_train))
print('Accuracy Score of Test Data by Decision Tree is : ', accuracy_score(y_test,y_pred))

Accuracy Score of Train Data by Decision Tree is :  0.8565568077638975
Accuracy Score of Test Data by Decision Tree is :  0.8562895877319702


In [55]:
print(confusion_matrix(y_test,y_pred))

[[27228     1]
 [ 5172  3595]]


In [56]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.84      1.00      0.91     27229
           1       1.00      0.41      0.58      8767

    accuracy                           0.86     35996
   macro avg       0.92      0.71      0.75     35996
weighted avg       0.88      0.86      0.83     35996



So we have 86% Accuracy Score with Decision Tree. 