# Will a person default his/her car loan payment?

![](https://outline-prod.imgix.net/20191003-RgIQuFEohniqlf16PO6r?auto=format&q=60&w=1280&s=1ded7c95447fe7399c5fe80e4e6f9ad9)

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, recall_score,roc_auc_score, roc_curve, auc
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


In [101]:
dftrain = pd.read_csv('../input/lt-vehicle-loan-default-prediction/train.csv', parse_dates = ["Date.of.Birth","DisbursalDate"])
dftest = pd.read_csv('../input/lt-vehicle-loan-default-prediction/test.csv', parse_dates = ["Date.of.Birth","DisbursalDate"])


In [102]:
#Check train test 
print(dftrain.shape)
print(dftest.shape)


(233154, 41)
(112392, 40)


In [103]:
dfinfo = pd.read_csv('../input/lt-vehicle-loan-default-prediction/data_dictionary.csv')

dfinfo

Unnamed: 0.1,Unnamed: 0,Variable Name,Description,Unnamed: 2
0,0,UniqueID,Identifier for customers,
1,1,loan_default,Payment default in the first EMI on due date,
2,2,disbursed_amount,Amount of Loan disbursed,
3,3,asset_cost,Cost of the Asset,
4,4,ltv,Loan to Value of the asset,
5,5,branch_id,Branch where the loan was disbursed,
6,6,supplier_id,Vehicle Dealer where the loan was disbursed,
7,7,manufacturer_id,"Vehicle manufacturer(Hero, Honda, TVS etc.)",
8,8,Current_pincode,Current pincode of the customer,
9,9,Date.of.Birth,Date of birth of the customer,


In [104]:
dftrain.head(5)

Unnamed: 0,UniqueID,disbursed_amount,asset_cost,ltv,branch_id,supplier_id,manufacturer_id,Current_pincode_ID,Date.of.Birth,Employment.Type,...,SEC.SANCTIONED.AMOUNT,SEC.DISBURSED.AMOUNT,PRIMARY.INSTAL.AMT,SEC.INSTAL.AMT,NEW.ACCTS.IN.LAST.SIX.MONTHS,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,AVERAGE.ACCT.AGE,CREDIT.HISTORY.LENGTH,NO.OF_INQUIRIES,loan_default
0,420825,50578,58400,89.55,67,22807,45,1441,1984-01-01,Salaried,...,0,0,0,0,0,0,0yrs 0mon,0yrs 0mon,0,0
1,537409,47145,65550,73.23,67,22807,45,1502,1985-07-31,Self employed,...,0,0,1991,0,0,1,1yrs 11mon,1yrs 11mon,0,1
2,417566,53278,61360,89.63,67,22807,45,1497,1985-08-24,Self employed,...,0,0,0,0,0,0,0yrs 0mon,0yrs 0mon,0,0
3,624493,57513,66113,88.48,67,22807,45,1501,1993-12-30,Self employed,...,0,0,31,0,0,0,0yrs 8mon,1yrs 3mon,1,1
4,539055,52378,60300,88.39,67,22807,45,1495,1977-09-12,Self employed,...,0,0,0,0,0,0,0yrs 0mon,0yrs 0mon,1,1


# EDA

In [105]:
dftrain.dtypes

UniqueID                                        int64
disbursed_amount                                int64
asset_cost                                      int64
ltv                                           float64
branch_id                                       int64
supplier_id                                     int64
manufacturer_id                                 int64
Current_pincode_ID                              int64
Date.of.Birth                          datetime64[ns]
Employment.Type                                object
DisbursalDate                          datetime64[ns]
State_ID                                        int64
Employee_code_ID                                int64
MobileNo_Avl_Flag                               int64
Aadhar_flag                                     int64
PAN_flag                                        int64
VoterID_flag                                    int64
Driving_flag                                    int64
Passport_flag               

In [106]:
dftrain.describe()

Unnamed: 0,UniqueID,disbursed_amount,asset_cost,ltv,branch_id,supplier_id,manufacturer_id,Current_pincode_ID,State_ID,Employee_code_ID,...,SEC.OVERDUE.ACCTS,SEC.CURRENT.BALANCE,SEC.SANCTIONED.AMOUNT,SEC.DISBURSED.AMOUNT,PRIMARY.INSTAL.AMT,SEC.INSTAL.AMT,NEW.ACCTS.IN.LAST.SIX.MONTHS,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,NO.OF_INQUIRIES,loan_default
count,233154.0,233154.0,233154.0,233154.0,233154.0,233154.0,233154.0,233154.0,233154.0,233154.0,...,233154.0,233154.0,233154.0,233154.0,233154.0,233154.0,233154.0,233154.0,233154.0,233154.0
mean,535917.573376,54356.993528,75865.07,74.74653,72.936094,19638.635035,69.028054,3396.880247,7.262243,1549.477148,...,0.007244,5427.793,7295.923,7179.998,13105.48,323.2684,0.381833,0.097481,0.206615,0.217071
std,68315.693711,12971.314171,18944.78,11.456636,69.834995,3491.949566,22.141304,2238.147502,4.48223,975.261278,...,0.111079,170237.0,183156.0,182592.5,151367.9,15553.69,0.955107,0.384439,0.706498,0.412252
min,417428.0,13320.0,37000.0,10.03,1.0,10524.0,45.0,1.0,1.0,1.0,...,0.0,-574647.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,476786.25,47145.0,65717.0,68.88,14.0,16535.0,48.0,1511.0,4.0,713.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,535978.5,53803.0,70946.0,76.8,61.0,20333.0,86.0,2970.0,6.0,1451.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,595039.75,60413.0,79201.75,83.67,130.0,23000.0,86.0,5677.0,10.0,2362.0,...,0.0,0.0,0.0,0.0,1999.0,0.0,0.0,0.0,0.0,0.0
max,671084.0,990572.0,1628992.0,95.0,261.0,24803.0,156.0,7345.0,22.0,3795.0,...,8.0,36032850.0,30000000.0,30000000.0,25642810.0,4170901.0,35.0,20.0,36.0,1.0


In [None]:
import pandas_profiling

In [None]:
pf1=pandas_profiling.ProfileReport(dftrain)


In [107]:
pf1

Tab(children=(HTML(value='<div id="overview-content" class="row variable spacing">\n    <div class="row">\n   …



In [108]:
dftrain["PERFORM_CNS.SCORE.DESCRIPTION"].value_counts()

No Bureau History Available                                116950
C-Very Low Risk                                             16045
A-Very Low Risk                                             14124
D-Very Low Risk                                             11358
B-Very Low Risk                                              9201
M-Very High Risk                                             8776
F-Low Risk                                                   8485
K-High Risk                                                  8277
H-Medium Risk                                                6855
E-Low Risk                                                   5821
I-Medium Risk                                                5557
G-Low Risk                                                   3988
Not Scored: Sufficient History Not Available                 3765
J-High Risk                                                  3748
Not Scored: Not Enough Info available on the customer        3672
Not Scored

In [110]:
def risk_level(risk):
    list1 = risk.split("-")
    if len(list1)==1:
        return "Risk_Unavaliable"
    else:
        return list1[1]


In [111]:
dftrain["Risk_level"] = dftrain["PERFORM_CNS.SCORE.DESCRIPTION"].apply(lambda x: risk_level(x))
dftest["Risk_level"] = dftest["PERFORM_CNS.SCORE.DESCRIPTION"].apply(lambda x: risk_level(x))

In [112]:
dftrain["Risk_level"][:5]

0    Risk_Unavaliable
1         Medium Risk
2    Risk_Unavaliable
3      Very High Risk
4    Risk_Unavaliable
Name: Risk_level, dtype: object

In [113]:
from datetime import date

def AgeinYears(date,df):
    disbDate = df["DisbursalDate"][i]
    return disbDate.year - date.year - ((disbDate.month, disbDate.day) < (date.month, date.day))

In [114]:
AgeinYrsTrain=[]
for i in range(len(dftrain)):
    AgeinYrsTrain.append(AgeinYears(dftrain["Date.of.Birth"][i],dftrain))
AgeinYrsTest=[]
for i in range(len(dftest)):
    AgeinYrsTest.append(AgeinYears(dftest["Date.of.Birth"][i],dftest))
    

In [115]:
dftrain["AgeinYrs"]=AgeinYrsTrain
dftest["AgeInYears"]=AgeinYrsTest

In [116]:
dftrain.drop(["UniqueID","Employee_code_ID"], axis=1, inplace=True)
dftest.drop(["UniqueID","Employee_code_ID"], axis=1, inplace=True)

In [117]:
def duration(duration):
    list1=duration.split(" ")
    sumyrs = float(list1[0][:-3]) + float(list1[1][:-3])/12
    return round(sumyrs,2)

In [118]:
dftrain["AVERAGE.ACCT.AGE_Years"] = dftrain["AVERAGE.ACCT.AGE"].apply(lambda x: duration(x))
dftest["AVERAGE.ACCT.AGE_Years"] = dftest["AVERAGE.ACCT.AGE"].apply(lambda x: duration(x))
dftrain["CREDIT.HISTORY.LENGTH_Years"] = dftrain["CREDIT.HISTORY.LENGTH"].apply(lambda x: duration(x))
dftest["CREDIT.HISTORY.LENGTH_Years"] = dftest["CREDIT.HISTORY.LENGTH"].apply(lambda x: duration(x))

In [119]:
dftrain["CREDIT.HISTORY.LENGTH_Years"][:5]

0    0.00
1    1.92
2    0.00
3    1.25
4    0.00
Name: CREDIT.HISTORY.LENGTH_Years, dtype: float64

In [120]:
dftrain.drop(["AVERAGE.ACCT.AGE","Aadhar_flag","Current_pincode_ID","Date.of.Birth","DisbursalDate","Driving_flag","MobileNo_Avl_Flag","PAN_flag","PRI.DISBURSED.AMOUNT","SEC.ACTIVE.ACCTS","Passport_flag","supplier_id","branch_id","VoterID_flag","State_ID","SEC.SANCTIONED.AMOUNT","SEC.OVERDUE.ACCTS","SEC.NO.OF.ACCTS","SEC.INSTAL.AMT","SEC.DISBURSED.AMOUNT","SEC.CURRENT.BALANCE"], axis=1, inplace=True)
dftest.drop(["AVERAGE.ACCT.AGE","Aadhar_flag","Current_pincode_ID","Date.of.Birth","DisbursalDate","Driving_flag","MobileNo_Avl_Flag","PAN_flag","PRI.DISBURSED.AMOUNT","SEC.ACTIVE.ACCTS","Passport_flag","supplier_id","branch_id","VoterID_flag","State_ID","SEC.SANCTIONED.AMOUNT","SEC.OVERDUE.ACCTS","SEC.NO.OF.ACCTS","SEC.INSTAL.AMT","SEC.DISBURSED.AMOUNT","SEC.CURRENT.BALANCE"], axis=1, inplace=True)

In [121]:
dftrain.drop(["PERFORM_CNS.SCORE.DESCRIPTION","CREDIT.HISTORY.LENGTH"], axis=1, inplace=True)
dftest.drop(["PERFORM_CNS.SCORE.DESCRIPTION","CREDIT.HISTORY.LENGTH"], axis=1, inplace=True)

In [122]:
dftrain.head(10)

Unnamed: 0,disbursed_amount,asset_cost,ltv,manufacturer_id,Employment.Type,PERFORM_CNS.SCORE,PRI.NO.OF.ACCTS,PRI.ACTIVE.ACCTS,PRI.OVERDUE.ACCTS,PRI.CURRENT.BALANCE,PRI.SANCTIONED.AMOUNT,PRIMARY.INSTAL.AMT,NEW.ACCTS.IN.LAST.SIX.MONTHS,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,NO.OF_INQUIRIES,loan_default,Risk_level,AgeinYrs,AVERAGE.ACCT.AGE_Years,CREDIT.HISTORY.LENGTH_Years
0,50578,58400,89.55,45,Salaried,0,0,0,0,0,0,0,0,0,0,0,Risk_Unavaliable,34,0.0,0.0
1,47145,65550,73.23,45,Self employed,598,1,1,1,27600,50200,1991,0,1,0,1,Medium Risk,33,1.92,1.92
2,53278,61360,89.63,45,Self employed,0,0,0,0,0,0,0,0,0,0,0,Risk_Unavaliable,32,0.0,0.0
3,57513,66113,88.48,45,Self employed,305,3,0,0,0,0,31,0,0,1,1,Very High Risk,24,0.67,1.25
4,52378,60300,88.39,45,Self employed,0,0,0,0,0,0,0,0,0,1,1,Risk_Unavaliable,41,0.0,0.0
5,54513,61900,89.66,45,Self employed,825,2,0,0,0,0,1347,0,0,0,0,Very Low Risk,28,1.75,2.0
6,46349,61500,76.42,45,Salaried,0,0,0,0,0,0,0,0,0,0,0,Risk_Unavaliable,30,0.0,0.0
7,43894,61900,71.89,45,Salaried,17,1,1,0,72879,74500,0,0,0,0,0,Risk_Unavaliable,29,0.17,0.17
8,53713,61973,89.56,45,Self employed,718,1,1,0,-41,365384,0,0,0,1,0,Very Low Risk,26,4.67,4.67
9,52603,61300,86.95,45,Salaried,818,1,0,0,0,0,2608,0,0,0,0,Very Low Risk,-50,1.58,1.58


In [123]:
dftrain.isnull().sum()

disbursed_amount                          0
asset_cost                                0
ltv                                       0
manufacturer_id                           0
Employment.Type                        7661
PERFORM_CNS.SCORE                         0
PRI.NO.OF.ACCTS                           0
PRI.ACTIVE.ACCTS                          0
PRI.OVERDUE.ACCTS                         0
PRI.CURRENT.BALANCE                       0
PRI.SANCTIONED.AMOUNT                     0
PRIMARY.INSTAL.AMT                        0
NEW.ACCTS.IN.LAST.SIX.MONTHS              0
DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS       0
NO.OF_INQUIRIES                           0
loan_default                              0
Risk_level                                0
AgeinYrs                                  0
AVERAGE.ACCT.AGE_Years                    0
CREDIT.HISTORY.LENGTH_Years               0
dtype: int64

In [124]:
#Replacing negative age with positive age. Assume typing error
dftrain["AgeinYrs"][dftrain["AgeinYrs"]<0]=-dftrain["AgeinYrs"]

In [125]:
dftrain["AgeinYrs"].describe()

count    233154.000000
mean         33.272022
std           8.710797
min          17.000000
25%          26.000000
50%          32.000000
75%          41.000000
max          52.000000
Name: AgeinYrs, dtype: float64

In [126]:
dftrain.dropna(subset = ["Employment.Type"], inplace=True)
dftest.dropna(subset = ["Employment.Type"], inplace=True)

In [127]:
dftrain.isnull().sum()

disbursed_amount                       0
asset_cost                             0
ltv                                    0
manufacturer_id                        0
Employment.Type                        0
PERFORM_CNS.SCORE                      0
PRI.NO.OF.ACCTS                        0
PRI.ACTIVE.ACCTS                       0
PRI.OVERDUE.ACCTS                      0
PRI.CURRENT.BALANCE                    0
PRI.SANCTIONED.AMOUNT                  0
PRIMARY.INSTAL.AMT                     0
NEW.ACCTS.IN.LAST.SIX.MONTHS           0
DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS    0
NO.OF_INQUIRIES                        0
loan_default                           0
Risk_level                             0
AgeinYrs                               0
AVERAGE.ACCT.AGE_Years                 0
CREDIT.HISTORY.LENGTH_Years            0
dtype: int64

In [None]:
pf2=pandas_profiling.ProfileReport(dftrain)


In [128]:
pf2

Tab(children=(HTML(value='<div id="overview-content" class="row variable spacing">\n    <div class="row">\n   …



In [129]:
dftrain["manufacturer_id"].value_counts()

86     106062
45      55207
51      26243
48      15721
49       9700
120      9417
67       2366
145       760
153        11
152         5
156         1
Name: manufacturer_id, dtype: int64

In [130]:
dftrain.drop(dftrain[dftrain.manufacturer_id==156].index, inplace=True)

In [131]:
dftest["manufacturer_id"].value_counts()

86     50138
45     29657
51     13324
48      5902
49      4356
120     4267
67       940
145      348
153       13
152        3
155        1
Name: manufacturer_id, dtype: int64

In [132]:
dftest.drop(dftest[dftest.manufacturer_id==155].index, inplace=True)

In [133]:
dftest["manufacturer_id"].value_counts()

86     50138
45     29657
51     13324
48      5902
49      4356
120     4267
67       940
145      348
153       13
152        3
Name: manufacturer_id, dtype: int64

In [134]:
dftrain_onehot1 = pd.get_dummies(dftrain, columns=['manufacturer_id',"Employment.Type","Risk_level"], prefix = ['MID_',"ET_","RL_"],drop_first=True)
dftest_onehot1 = pd.get_dummies(dftest, columns=['manufacturer_id',"Employment.Type","Risk_level"], prefix = ['MID_',"ET_","RL_"],drop_first=True)

In [135]:
dftrain_onehot1.head()
#Reference Dummy: ManufacturerID-45,EmployeeType: Salaried, Risk_Type, Risk_level: High Risk 

Unnamed: 0,disbursed_amount,asset_cost,ltv,PERFORM_CNS.SCORE,PRI.NO.OF.ACCTS,PRI.ACTIVE.ACCTS,PRI.OVERDUE.ACCTS,PRI.CURRENT.BALANCE,PRI.SANCTIONED.AMOUNT,PRIMARY.INSTAL.AMT,...,MID__120,MID__145,MID__152,MID__153,ET__Self employed,RL__Low Risk,RL__Medium Risk,RL__Risk_Unavaliable,RL__Very High Risk,RL__Very Low Risk
0,50578,58400,89.55,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,47145,65550,73.23,598,1,1,1,27600,50200,1991,...,0,0,0,0,1,0,1,0,0,0
2,53278,61360,89.63,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
3,57513,66113,88.48,305,3,0,0,0,0,31,...,0,0,0,0,1,0,0,0,1,0
4,52378,60300,88.39,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0


In [136]:
dftest_onehot1.head()

Unnamed: 0,disbursed_amount,asset_cost,ltv,PERFORM_CNS.SCORE,PRI.NO.OF.ACCTS,PRI.ACTIVE.ACCTS,PRI.OVERDUE.ACCTS,PRI.CURRENT.BALANCE,PRI.SANCTIONED.AMOUNT,PRIMARY.INSTAL.AMT,...,MID__120,MID__145,MID__152,MID__153,ET__Self employed,RL__Low Risk,RL__Medium Risk,RL__Risk_Unavaliable,RL__Very High Risk,RL__Very Low Risk
0,53478,63558,86.54,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,55513,63163,89.45,749,2,1,0,43898,48780,5605,...,0,0,0,0,1,0,0,0,0,1
2,65282,84320,79.93,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,46905,63896,76.58,14,1,1,1,132480,255000,0,...,0,0,0,0,1,0,0,1,0,0
4,51428,63896,86.08,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [137]:
X= dftrain_onehot1.loc[:, dftrain_onehot1.columns != 'loan_default']
Y= dftrain_onehot1["loan_default"]

In [138]:
X.columns

Index(['disbursed_amount', 'asset_cost', 'ltv', 'PERFORM_CNS.SCORE',
       'PRI.NO.OF.ACCTS', 'PRI.ACTIVE.ACCTS', 'PRI.OVERDUE.ACCTS',
       'PRI.CURRENT.BALANCE', 'PRI.SANCTIONED.AMOUNT', 'PRIMARY.INSTAL.AMT',
       'NEW.ACCTS.IN.LAST.SIX.MONTHS', 'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS',
       'NO.OF_INQUIRIES', 'AgeinYrs', 'AVERAGE.ACCT.AGE_Years',
       'CREDIT.HISTORY.LENGTH_Years', 'MID__48', 'MID__49', 'MID__51',
       'MID__67', 'MID__86', 'MID__120', 'MID__145', 'MID__152', 'MID__153',
       'ET__Self employed', 'RL__Low Risk', 'RL__Medium Risk',
       'RL__Risk_Unavaliable', 'RL__Very High Risk', 'RL__Very Low Risk'],
      dtype='object')

In [139]:
variables = ['disbursed_amount', 'asset_cost', 'ltv', 'PERFORM_CNS.SCORE',
       'PRI.NO.OF.ACCTS', 'PRI.ACTIVE.ACCTS', 'PRI.OVERDUE.ACCTS',
       'PRI.CURRENT.BALANCE', 'PRI.SANCTIONED.AMOUNT', 'PRIMARY.INSTAL.AMT',
       'NEW.ACCTS.IN.LAST.SIX.MONTHS', 'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS',
       'NO.OF_INQUIRIES', 'AgeinYrs', 'AVERAGE.ACCT.AGE_Years',
       'CREDIT.HISTORY.LENGTH_Years']

In [140]:
X.head()

Unnamed: 0,disbursed_amount,asset_cost,ltv,PERFORM_CNS.SCORE,PRI.NO.OF.ACCTS,PRI.ACTIVE.ACCTS,PRI.OVERDUE.ACCTS,PRI.CURRENT.BALANCE,PRI.SANCTIONED.AMOUNT,PRIMARY.INSTAL.AMT,...,MID__120,MID__145,MID__152,MID__153,ET__Self employed,RL__Low Risk,RL__Medium Risk,RL__Risk_Unavaliable,RL__Very High Risk,RL__Very Low Risk
0,50578,58400,89.55,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,47145,65550,73.23,598,1,1,1,27600,50200,1991,...,0,0,0,0,1,0,1,0,0,0
2,53278,61360,89.63,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
3,57513,66113,88.48,305,3,0,0,0,0,31,...,0,0,0,0,1,0,0,0,1,0
4,52378,60300,88.39,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0


In [141]:
Y.head()

0    0
1    1
2    0
3    1
4    1
Name: loan_default, dtype: int64

# Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0,1))

In [None]:
scaled_training= X.copy()

In [142]:
X.head()

Unnamed: 0,disbursed_amount,asset_cost,ltv,PERFORM_CNS.SCORE,PRI.NO.OF.ACCTS,PRI.ACTIVE.ACCTS,PRI.OVERDUE.ACCTS,PRI.CURRENT.BALANCE,PRI.SANCTIONED.AMOUNT,PRIMARY.INSTAL.AMT,...,MID__120,MID__145,MID__152,MID__153,ET__Self employed,RL__Low Risk,RL__Medium Risk,RL__Risk_Unavaliable,RL__Very High Risk,RL__Very Low Risk
0,50578,58400,89.55,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,47145,65550,73.23,598,1,1,1,27600,50200,1991,...,0,0,0,0,1,0,1,0,0,0
2,53278,61360,89.63,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
3,57513,66113,88.48,305,3,0,0,0,0,31,...,0,0,0,0,1,0,0,0,1,0
4,52378,60300,88.39,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0


In [None]:
scaled_training[variables] = scaler.fit_transform(scaled_training[variables])

In [143]:
scaled_training.head(5)

Unnamed: 0,disbursed_amount,asset_cost,ltv,PERFORM_CNS.SCORE,PRI.NO.OF.ACCTS,PRI.ACTIVE.ACCTS,PRI.OVERDUE.ACCTS,PRI.CURRENT.BALANCE,PRI.SANCTIONED.AMOUNT,PRIMARY.INSTAL.AMT,...,MID__120,MID__145,MID__152,MID__153,ET__Self employed,RL__Low Risk,RL__Medium Risk,RL__Risk_Unavaliable,RL__Very High Risk,RL__Very Low Risk
0,0.064333,0.031555,0.933129,0.0,0.0,0.0,0.0,0.06471,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
1,0.058406,0.042098,0.732883,0.67191,0.002208,0.006944,0.04,0.064978,5e-05,7.8e-05,...,0,0,0,0,1,0,1,0,0,0
2,0.068995,0.035919,0.93411,0.0,0.0,0.0,0.0,0.06471,0.0,0.0,...,0,0,0,0,1,0,0,1,0,0
3,0.076308,0.042928,0.92,0.342697,0.006623,0.0,0.0,0.06471,0.0,1e-06,...,0,0,0,0,1,0,0,0,1,0
4,0.067441,0.034356,0.918896,0.0,0.0,0.0,0.0,0.06471,0.0,0.0,...,0,0,0,0,1,0,0,1,0,0


# Oversampling

In [None]:
from imblearn.over_sampling import SMOTE, ADASYN
from collections import Counter
X_resampled, y_resampled = SMOTE().fit_resample(scaled_training, Y)
print(sorted(Counter(y_resampled).items()))

In [None]:
X_resampled_ADS, y_resampled_ADS = ADASYN().fit_resample(scaled_training, Y)
print(sorted(Counter(y_resampled).items()))

In [144]:
X_resampled_ADS

Unnamed: 0,disbursed_amount,asset_cost,ltv,PERFORM_CNS.SCORE,PRI.NO.OF.ACCTS,PRI.ACTIVE.ACCTS,PRI.OVERDUE.ACCTS,PRI.CURRENT.BALANCE,PRI.SANCTIONED.AMOUNT,PRIMARY.INSTAL.AMT,...,MID__120,MID__145,MID__152,MID__153,ET__Self employed,RL__Low Risk,RL__Medium Risk,RL__Risk_Unavaliable,RL__Very High Risk,RL__Very Low Risk
0,0.064333,0.031555,0.933129,0.000000,0.000000,0.000000,0.00,0.064710,0.000000,0.000000,...,0,0,0,0,0,0,0,1,0,0
1,0.058406,0.042098,0.732883,0.671910,0.002208,0.006944,0.04,0.064978,0.000050,0.000078,...,0,0,0,0,1,0,1,0,0,0
2,0.068995,0.035919,0.934110,0.000000,0.000000,0.000000,0.00,0.064710,0.000000,0.000000,...,0,0,0,0,1,0,0,1,0,0
3,0.076308,0.042928,0.920000,0.342697,0.006623,0.000000,0.00,0.064710,0.000000,0.000001,...,0,0,0,0,1,0,0,0,1,0
4,0.067441,0.034356,0.918896,0.000000,0.000000,0.000000,0.00,0.064710,0.000000,0.000000,...,0,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
348373,0.061641,0.048286,0.719540,0.826240,0.006590,0.008476,0.00,0.064772,0.000013,0.000097,...,1,0,0,0,0,0,0,0,0,1
348374,0.059494,0.045773,0.716414,0.829071,0.002376,0.007474,0.00,0.064758,0.000012,0.000066,...,1,0,0,0,0,0,0,0,0,1
348375,0.093114,0.064947,0.868833,0.002253,0.000260,0.000819,0.00,0.065270,0.000059,0.000054,...,1,0,0,0,0,0,0,1,0,0
348376,0.076487,0.055308,0.804821,0.013651,0.001578,0.004963,0.00,0.068102,0.000357,0.000330,...,1,0,0,0,0,0,0,1,0,0


In [145]:
y_resampled_ADS

0         0
1         1
2         0
3         1
4         1
         ..
348373    1
348374    1
348375    1
348376    1
348377    1
Name: loan_default, Length: 348378, dtype: int64

In [146]:
X_resampled

Unnamed: 0,disbursed_amount,asset_cost,ltv,PERFORM_CNS.SCORE,PRI.NO.OF.ACCTS,PRI.ACTIVE.ACCTS,PRI.OVERDUE.ACCTS,PRI.CURRENT.BALANCE,PRI.SANCTIONED.AMOUNT,PRIMARY.INSTAL.AMT,...,MID__120,MID__145,MID__152,MID__153,ET__Self employed,RL__Low Risk,RL__Medium Risk,RL__Risk_Unavaliable,RL__Very High Risk,RL__Very Low Risk
0,0.064333,0.031555,0.933129,0.000000,0.000000,0.000000,0.00,0.064710,0.000000,0.000000e+00,...,0,0,0,0,0,0,0,1,0,0
1,0.058406,0.042098,0.732883,0.671910,0.002208,0.006944,0.04,0.064978,0.000050,7.764361e-05,...,0,0,0,0,1,0,1,0,0,0
2,0.068995,0.035919,0.934110,0.000000,0.000000,0.000000,0.00,0.064710,0.000000,0.000000e+00,...,0,0,0,0,1,0,0,1,0,0
3,0.076308,0.042928,0.920000,0.342697,0.006623,0.000000,0.00,0.064710,0.000000,1.208916e-06,...,0,0,0,0,1,0,0,0,1,0
4,0.067441,0.034356,0.918896,0.000000,0.000000,0.000000,0.00,0.064710,0.000000,0.000000e+00,...,0,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
353045,0.033056,0.048176,0.415315,0.000000,0.000000,0.000000,0.00,0.064710,0.000000,0.000000e+00,...,0,0,0,0,0,0,0,1,0,0
353046,0.063594,0.079104,0.527153,0.827508,0.008710,0.012058,0.00,0.064756,0.000044,2.887016e-05,...,0,0,0,0,1,0,0,0,0,1
353047,0.103370,0.077413,0.857709,0.685965,0.002208,0.006238,0.00,0.069691,0.000997,0.000000e+00,...,1,0,0,0,0,0,1,0,0,0
353048,0.061017,0.044613,0.762726,0.821485,0.007888,0.016853,0.00,0.065696,0.000143,2.458838e-07,...,0,0,0,0,0,0,0,0,0,1


# Splitting into training and testing

In [None]:
x_train, x_test, y_train, y_test = train_test_split(X_resampled_ADS, y_resampled_ADS, test_size=0.30, random_state=42)

In [148]:
x_train.head()

Unnamed: 0,disbursed_amount,asset_cost,ltv,PERFORM_CNS.SCORE,PRI.NO.OF.ACCTS,PRI.ACTIVE.ACCTS,PRI.OVERDUE.ACCTS,PRI.CURRENT.BALANCE,PRI.SANCTIONED.AMOUNT,PRIMARY.INSTAL.AMT,...,MID__120,MID__145,MID__152,MID__153,ET__Self employed,RL__Low Risk,RL__Medium Risk,RL__Risk_Unavaliable,RL__Very High Risk,RL__Very Low Risk
123200,0.053232,0.043814,0.676687,0.734831,0.002208,0.006944,0.0,0.06795,0.00051,0.0,...,0,0,0,0,0,1,0,0,0,0
57811,0.075172,0.04553,0.877301,0.0,0.0,0.0,0.0,0.06471,0.0,0.0,...,0,0,0,0,1,0,0,1,0,0
301303,0.063157,0.042957,0.785812,0.0,0.0,0.0,0.0,0.06471,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
175635,0.100316,0.064437,0.93816,0.0,0.0,0.0,0.0,0.06471,0.0,0.0,...,0,0,0,0,1,0,0,1,0,0
111117,0.021841,0.041291,0.362822,0.0,0.0,0.0,0.0,0.06471,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0


In [149]:
Counter(y_train)

Counter({0: 123707, 1: 120157})

In [150]:
Counter(y_test)

Counter({0: 52818, 1: 51696})

# Logistic Regression

In [None]:
log1 = LogisticRegression(penalty='l1',solver="liblinear", max_iter=1000).fit(x_train,y_train)

In [152]:
log1.score(x_train, y_train)

0.5852442344913559

In [None]:
predTrain = log1.predict(x_train)

In [151]:
predTrain[:5]

array([0, 1, 1, 1, 0])

In [None]:
PredictionsTrain = pd.DataFrame(columns=["Prediction","Actual"])
PredictionsTrain["Prediction"]=predTrain
PredictionsTrain["Actual"] = y_train.tolist()


In [153]:
PredictionsTrain.head()

Unnamed: 0,Prediction,Actual
0,0,0
1,1,0
2,1,1
3,1,0
4,0,0


In [154]:
log1.score(x_test, y_test)

0.5848881489561207

In [None]:
predTest = log1.predict(x_test)


In [155]:
predTest

array([0, 0, 0, ..., 0, 1, 0])

In [None]:
PredictionsTest = pd.DataFrame(columns=["Prediction","Actual"])
PredictionsTest["Prediction"]=predTest
PredictionsTest["Actual"] = y_test.tolist()


In [156]:
PredictionsTest.head()

Unnamed: 0,Prediction,Actual
0,0,0
1,0,1
2,0,1
3,1,1
4,1,0


In [157]:
CoeffLogR = pd.DataFrame(columns=["Variable","Coefficients"])
CoeffLogR["Variable"]=X.columns
CoeffLogR["Coefficients"]=log1.coef_.tolist()[0]
CoeffLogR.sort_values("Coefficients", ascending = False)


Unnamed: 0,Variable,Coefficients
12,NO.OF_INQUIRIES,3.641794
1,asset_cost,3.574409
11,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,2.929137
6,PRI.OVERDUE.ACCTS,2.517903
2,ltv,2.314781
14,AVERAGE.ACCT.AGE_Years,1.117873
8,PRI.SANCTIONED.AMOUNT,0.453854
16,MID__48,0.260368
25,ET__Self employed,0.173926
27,RL__Medium Risk,0.110099


In [158]:
# confusion matrix
print('Confusion Matrix')
print(pd.DataFrame(confusion_matrix(y_test, predTest)))

Confusion Matrix
       0      1
0  29166  23652
1  19733  31963


In [159]:
print('Accuracy',accuracy_score(y_test, predTest))
print('Recall',recall_score(y_test, predTest))
print('F1_score',f1_score(y_test, predTest))
print('ROC-AUC_score',roc_auc_score(y_test, predTest))


Accuracy 0.5848881489561207
Recall 0.61828768183225
F1_score 0.59570780255519
ROC-AUC_score 0.5852428980557365


# Random Forest Classifier

In [None]:
regrRM = RandomForestClassifier(n_estimators=300)
regrRM.fit(x_train, y_train)

In [160]:
regrRM.score(x_train, y_train)

0.9995284256798872

In [161]:
regrRM.score(x_test, y_test)

0.7806801002736475

In [162]:
predTestRF = regrRM.predict(x_test)
predTestRF

array([1, 1, 1, ..., 0, 1, 0])

In [163]:
print('Accuracy',accuracy_score(y_test, predTestRF))
print('Recall',recall_score(y_test, predTestRF))
print('F1_score',f1_score(y_test, predTestRF))
print('ROC-AUC_score',roc_auc_score(y_test, predTestRF))


Accuracy 0.7806801002736475
Recall 0.7418562364593005
F1_score 0.7699148799486067
ROC-AUC_score 0.7802677372989069


In [164]:
# Confusion matrix
print('Confusion Matrix')
print(pd.DataFrame(confusion_matrix(y_test, predTestRF)))

Confusion Matrix
       0      1
0  43241   9577
1  13345  38351


# Crossvalidation

In [None]:
from sklearn.model_selection import RandomizedSearchCV

n_estimators = [int(x) for x in np.linspace(start = 200, stop = 1000, num = 5)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 6)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
# Create the random grid
rm_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [None]:

rf2 = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
rf2_random = RandomizedSearchCV(estimator = rf2, param_distributions = rm_grid, n_iter = 2, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf2_random.fit(x_train, y_train)

In [165]:
rf2_random.best_params_

{'n_estimators': 400,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 110,
 'bootstrap': False}

In [166]:
rf2_random.score(x_train, y_train)

0.9995284256798872

In [167]:
rf2_random.score(x_test, y_test)

0.7853014907093786

# Adaboost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
adb = AdaBoostClassifier(n_estimators=300,learning_rate=0.1, random_state=42)

In [None]:
adb.fit(x_train, y_train)

In [168]:
adb.score(x_train, y_train)

0.6440721057638684

In [169]:
adb.score(x_test, y_test)

0.6446026369672962

In [None]:
predTestadb = adb.predict(x_test)

In [170]:
predTestadb

array([1, 1, 1, ..., 0, 1, 0])

In [171]:
print('Accuracy',accuracy_score(y_test, predTestadb))
print('Recall',recall_score(y_test, predTestadb))
print('F1_score',f1_score(y_test, predTestadb))
print('ROC-AUC_score',roc_auc_score(y_test, predTestadb))

Accuracy 0.6446026369672962
Recall 0.6215761374187558
F1_score 0.6337244847648161
ROC-AUC_score 0.6443580637868136


In [172]:
print('Confusion Matrix')
print(pd.DataFrame(confusion_matrix(y_test, predTestadb)))

Confusion Matrix
       0      1
0  35237  17581
1  19563  32133


# Gradient Boost

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
gbt = GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=200)

In [175]:
gbt.fit(x_train, y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=200,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [176]:
gbt.score(x_train, y_train)

0.7186546599744119

In [177]:
gbt.score(x_test, y_test)

0.7185066115544329

In [178]:
predTestgbt = gbt.predict(x_test)
predTestgbt

array([0, 0, 1, ..., 0, 1, 0])

In [179]:
print('Accuracy',accuracy_score(y_test, predTestgbt))
print('Recall',recall_score(y_test, predTestgbt))
print('F1_score',f1_score(y_test, predTestgbt))
print('ROC-AUC_score',roc_auc_score(y_test, predTestgbt))

Accuracy 0.7185066115544329
Recall 0.6874613122872176
F1_score 0.707257855877729
ROC-AUC_score 0.7181768676624093


In [None]:
##Fitting to Test data provided

In [None]:
scaled_testing=dftest_onehot1.copy()

In [180]:
scaled_training.head()

Unnamed: 0,disbursed_amount,asset_cost,ltv,PERFORM_CNS.SCORE,PRI.NO.OF.ACCTS,PRI.ACTIVE.ACCTS,PRI.OVERDUE.ACCTS,PRI.CURRENT.BALANCE,PRI.SANCTIONED.AMOUNT,PRIMARY.INSTAL.AMT,...,MID__120,MID__145,MID__152,MID__153,ET__Self employed,RL__Low Risk,RL__Medium Risk,RL__Risk_Unavaliable,RL__Very High Risk,RL__Very Low Risk
0,0.064333,0.031555,0.933129,0.0,0.0,0.0,0.0,0.06471,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
1,0.058406,0.042098,0.732883,0.67191,0.002208,0.006944,0.04,0.064978,5e-05,7.8e-05,...,0,0,0,0,1,0,1,0,0,0
2,0.068995,0.035919,0.93411,0.0,0.0,0.0,0.0,0.06471,0.0,0.0,...,0,0,0,0,1,0,0,1,0,0
3,0.076308,0.042928,0.92,0.342697,0.006623,0.0,0.0,0.06471,0.0,1e-06,...,0,0,0,0,1,0,0,0,1,0
4,0.067441,0.034356,0.918896,0.0,0.0,0.0,0.0,0.06471,0.0,0.0,...,0,0,0,0,1,0,0,1,0,0


In [181]:
scaled_testing.head()

Unnamed: 0,disbursed_amount,asset_cost,ltv,PERFORM_CNS.SCORE,PRI.NO.OF.ACCTS,PRI.ACTIVE.ACCTS,PRI.OVERDUE.ACCTS,PRI.CURRENT.BALANCE,PRI.SANCTIONED.AMOUNT,PRIMARY.INSTAL.AMT,...,MID__120,MID__145,MID__152,MID__153,ET__Self employed,RL__Low Risk,RL__Medium Risk,RL__Risk_Unavaliable,RL__Very High Risk,RL__Very Low Risk
0,0.045061,0.019036,0.900435,0.0,0.0,0.0,0.0,0.029104,0.005721,0.0,...,0,0,0,0,0,0,0,1,0,0
1,0.047251,0.018715,0.934683,0.852105,0.015385,0.026316,0.0,0.029738,0.006301,6.6e-05,...,0,0,0,0,1,0,0,0,0,1
2,0.057766,0.035895,0.822643,0.0,0.0,0.0,0.0,0.029104,0.005721,0.0,...,0,0,0,0,0,0,0,1,0,0
3,0.037986,0.01931,0.783218,0.015927,0.007692,0.026316,0.045455,0.031019,0.008751,0.0,...,0,0,0,0,1,0,0,1,0,0
4,0.042854,0.01931,0.895022,0.0,0.0,0.0,0.0,0.029104,0.005721,0.0,...,0,0,0,0,0,0,0,1,0,0


In [182]:
scaled_testing=scaled_testing.rename(columns={"AgeInYears": "AgeinYrs"})

In [None]:
scaled_testing[variables] = scaler.fit_transform(scaled_testing[variables])

In [183]:
pred = rf2_random.predict(scaled_testing)
pred

array([0, 1, 0, ..., 0, 0, 0])