Based on the insights from the Data Cleaning step, we keep only a subset of the available columns. This avoids leaking future information, i.e. information that is not yet available when the contract is first introduced in the secondary market on the P2P Lending Platform.

In [1]:
# import the necessary packages
import numpy as np
import os
import sys
import time
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier

from sklearn import preprocessing as pp
from sklearn.pipeline import Pipeline
from sklearn.model_selection import TimeSeriesSplit

from sklearn.model_selection import GridSearchCV

In [2]:
# Read the cleaned loan-status extract and report its dimensions.
loan_csv_path = '../Data/Loan_status_Cleaned.csv'
df = pd.read_csv(loan_csv_path)
df.shape

(50000, 13)

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,home_ownership,annual_inc,issue_d,loan_status,purpose,dti,total_acc,pub_rec_bankruptcies
0,16000,60 months,17.58%,402.65,D,RENT,65000.0,Dec-2011,Fully Paid,debt_consolidation,15.56,16.0,0.0
1,12000,36 months,14.27%,411.71,C,MORTGAGE,70000.0,Dec-2011,Charged Off,other,8.4,12.0,0.0
2,3500,36 months,7.90%,109.52,A,MORTGAGE,61440.0,Dec-2011,Fully Paid,vacation,6.46,19.0,0.0
3,10000,36 months,13.49%,339.31,C,MORTGAGE,50000.0,Dec-2011,Fully Paid,credit_card,5.88,15.0,0.0
4,6000,36 months,10.65%,195.44,B,RENT,36000.0,Dec-2011,Fully Paid,other,14.43,9.0,0.0


# Feature Engineering

In [6]:
# Parse the issue month strings (e.g. "Dec-2011") into timestamps.
issue_dates = pd.to_datetime(df['issue_d'], format='%b-%Y')
df['issue_d'] = issue_dates

In [8]:
# Convert interest-rate strings such as "17.58%" into floats.
# The dtype guard keeps the cell idempotent: once the column is numeric the
# `.str` accessor would raise, so a second run must skip the conversion.
if not pd.api.types.is_numeric_dtype(df['int_rate']):
    # regex=False: '%' is removed as a literal character, not as a pattern
    df['int_rate'] = (
        df['int_rate']
        .str.replace('%', '', regex=False)
        .astype('float')
    )
df['int_rate'].head()

0    17.58
1    14.27
2     7.90
3    13.49
4    10.65
Name: int_rate, dtype: float64

In [9]:
# Cast annual income to an integer column and preview the result.
annual_income = df['annual_inc']
df['annual_inc'] = annual_income.astype('int')
df['annual_inc'].head()

0    65000
1    70000
2    61440
3    50000
4    36000
Name: annual_inc, dtype: int32

In [24]:
# TODO: replace missing values with zero before casting to integer (target column not stated; presumably total_acc - confirm)

IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [21]:
# inspect the dtype pandas inferred for total_acc
df.dtypes['total_acc']

dtype('float64')

In [26]:
# numeric features
# NOTE(review): none of these column names appear in the frame previewed after
# loading Loan_status_Cleaned.csv; this cell presumably ran against a different
# dataset - confirm which frame `df` refers to here.

# Columns in which a value lower than or equal to 0 is replaced with NaN.
positive_only_cols = [
    'Amount',
    'AmountOfPreviousLoansBeforeLoan',
    'CreditScoreEeMini',
    'FreeCash',
    'IncomeTotal',
    'LiabilitiesTotal',
    'MonthlyPayment',
    'NoOfPreviousLoansBeforeLoan',
    'NrOfScheduledPayments',
    'PreviousEarlyRepaymentsCountBeforeLoan',
    'PreviousRepaymentsBeforeLoan',
]
for col in positive_only_cols:
    non_positive = df[col] <= 0
    df.loc[non_positive, col] = np.nan

# log transform: log(1 + x) on every column above plus Duration, which is
# transformed without the non-positive masking step
log_cols = positive_only_cols + ['Duration']
for col in log_cols:
    df[col] = np.log1p(df[col])

# cap interest rate at 100%
above_cap = df['Interest'] > 100
df.loc[above_cap, 'Interest'] = 100

# divide interest rate by 100
df['Interest'] = df['Interest'] / 100

In [27]:
# count the observations the dataset contains for each loan year
loan_years = df['LoanDate'].dt.year
loan_years.value_counts()

LoanDate
2019    55839
2020    27519
2018    25103
2021    21584
2017    17933
2016    10514
2015     8046
2014     7455
2013     2512
2010     1157
2009      665
2012      457
2011      451
Name: count, dtype: int64

In [28]:
# dimensions of the working frame as (rows, columns)
df.shape

(179235, 28)

In [29]:
# Order the loans chronologically and index the frame by loan date.
df = df.sort_values(by='LoanDate').set_index('LoanDate')

# Train-Test Split
Loans issued in 2019 or later are used as the test set, whereas all the years before 2019 are used as the train set.

In [30]:
# Split on the loan year taken from the datetime index:
# 2019 and later go to the test set, everything earlier to the train set.
loan_year = df.index.year
in_test_period = loan_year >= 2019
in_train_period = loan_year < 2019
test_set = df.loc[in_test_period]
train_set = df.loc[in_train_period]

# Exploratory Data Analysis

In [31]:
# Summary statistics of the training data, one row per feature.
train_set.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,74293.0,39.941233,12.100845,0.0,30.0,38.0,49.0,77.0
Amount,74293.0,7.361657,0.925665,2.000128,6.398595,7.496652,8.066521,9.27153
AmountOfPreviousLoansBeforeLoan,37053.0,8.097002,1.01131,2.623668,7.438972,8.193124,8.886133,10.570342
CreditScoreEeMini,41053.0,6.853395,0.124113,6.216606,6.908755,6.908755,6.908755,6.908755
FreeCash,34287.0,5.455615,1.568414,0.00995,4.988628,5.891644,6.480045,11.975084
IncomeTotal,74230.0,7.151165,0.676233,0.693147,6.694562,7.09091,7.53356,13.827459
Interest,74293.0,0.339273,0.209317,0.02,0.2,0.2894,0.3786,1.0
LiabilitiesTotal,65196.0,6.139703,0.937905,1.386294,5.66296,6.216606,6.748771,12.058216
MonthlyPayment,63976.0,4.424172,0.942273,2.60417,3.646168,4.51623,5.121028,7.770451
NewCreditCustomer,74293.0,0.557186,0.496722,0.0,0.0,1.0,1.0,1.0
