Based on the insights from Data Cleaning, we keep only a subset of the available columns. This avoids leaking future information that is not yet known when a contract is first introduced in the secondary market on the P2P lending platform.

In [30]:
# import the necessary packages
import numpy as np
import os
import sys
import time
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier

from sklearn import preprocessing as pp
from sklearn.pipeline import Pipeline
from sklearn.model_selection import TimeSeriesSplit

from sklearn.model_selection import GridSearchCV

In [31]:
# Read the cleaned loan-level extract and display its (rows, columns) dimensions.
df = pd.read_csv(filepath_or_buffer='../Data/Loan_status_Cleaned.csv')
df.shape

(50000, 15)

In [32]:
# Preview the first five rows of the loan data.
df.head(n=5)

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,home_ownership,annual_inc,issue_d,loan_status,purpose,dti,total_acc,acc_now_delinq,pct_tl_nvr_dlq,pub_rec_bankruptcies
0,15000,36 months,7.90%,469.36,A,RENT,45000.0,Dec-2011,Fully Paid,debt_consolidation,8.48,27,0,,0.0
1,16000,60 months,19.91%,423.11,E,RENT,81000.0,Dec-2011,Fully Paid,credit_card,20.52,21,0,,0.0
2,7100,36 months,16.77%,252.33,D,MORTGAGE,33000.0,Dec-2011,Fully Paid,debt_consolidation,24.0,8,0,,0.0
3,10000,36 months,8.90%,317.54,A,RENT,24000.0,Dec-2011,Fully Paid,debt_consolidation,7.0,21,0,,0.0
4,2500,36 months,14.27%,85.78,C,RENT,49500.0,Dec-2011,Fully Paid,debt_consolidation,10.33,20,0,,0.0


Categorical columns:
- term
- grade
- home_ownership
- purpose

Numeric columns:
- loan_amnt
- installment
- annual_inc
- dti
- total_acc
- acc_now_delinq
- pct_tl_nvr_dlq

Date column:
- issue_d

# Feature Engineering

In [33]:
# Parse the 'Mon-YYYY' issue-date strings (e.g. 'Dec-2011') into timestamps.
issue_dates = pd.to_datetime(df['issue_d'], format='%b-%Y')
df['issue_d'] = issue_dates

In [34]:
# Remove the percent sign and store the rate as a float (still expressed in percent).
rate_text = df['int_rate'].str.replace('%', '', regex=False)
df['int_rate'] = rate_text.astype(float)
df['int_rate'].head()

0     7.90
1    19.91
2    16.77
3     8.90
4    14.27
Name: int_rate, dtype: float64

In [35]:
# Store annual income as whole numbers (the integer cast discards any fractional part).
annual_income = df['annual_inc'].astype(int)
df['annual_inc'] = annual_income
annual_income.head()

0    45000
1    81000
2    33000
3    24000
4    49500
Name: annual_inc, dtype: int32

In [36]:
# replace any missing value in total_acc with 0 and change the type to int
# (the fill must happen before the cast: astype('int') raises on NaN)
df['total_acc'] = df['total_acc'].fillna(0).astype('int')

In [37]:
# numeric features
# NOTE: this cell is not idempotent -- it log-transforms columns in place and
# drops acc_now_delinq, so run it only once per fresh load of df.

# replace values lower than or equal to 0 with NaN
# (cast to float first: several of these columns are integer-typed, and writing
# NaN into an integer column through .loc relies on implicit upcasting, which
# recent pandas versions deprecate)
# NOTE(review): for acc_now_delinq this also blanks out a count of 0, so the
# ratio below is NaN for borrowers with no delinquent account -- confirm intended.
for col in ['loan_amnt', 'acc_now_delinq', 'annual_inc', 'dti']:
    as_float = df[col].astype('float64')
    df[col] = as_float.where(as_float > 0)

# log transform
for col in ['annual_inc', 'loan_amnt', 'installment']:
    df[col] = np.log1p(df[col])

# create account delinquency ratio
# (a total_acc of 0 is treated as missing so the ratio is NaN instead of inf)
total_accounts = df['total_acc'].astype('float64')
df['acc_delinq_ratio'] = df['acc_now_delinq'] / total_accounts.where(total_accounts > 0)

# take log of total account
df['total_acc'] = np.log1p(df['total_acc'])

# drop acc_now_delinq
df = df.drop(columns='acc_now_delinq')

# cap interest rate at 100%
df['int_rate'] = df['int_rate'].clip(upper=100)

# divide interest rate by 100
df['int_rate'] = df['int_rate'] / 100

# divide percentage of never delinquent accounts by 100
df['pct_tl_nvr_dlq'] = df['pct_tl_nvr_dlq'] / 100

In [38]:
# check how many observations the dataset contains for each issue year
issue_year = df['issue_d'].dt.year
issue_year.value_counts()

issue_d
2019    8758
2018    8536
2017    7575
2016    7484
2015    7176
2014    4032
2020    2487
2013    2330
2012     911
2011     354
2010     207
2009      96
2008      47
2007       7
Name: count, dtype: int64

In [39]:
# confirm the (rows, columns) dimensions of df at this point
df.shape

(50000, 15)

Add macroeconomic variables: the GDP growth rate, lagged by one year, and the volatility index (VIX), lagged by one month.

In [49]:
# read the macroeconomic series (GDP and VIX by date) from disk
macro = pd.read_csv(filepath_or_buffer='../Data/Macro.csv')

# preview the first five rows
macro.head(n=5)

Unnamed: 0,GDP,Date,VIX
0,,2006-12-01,
1,2.782811,2007-01-01,11.56
2,2.782811,2007-02-01,10.42
3,2.782811,2007-03-01,15.42
4,2.782811,2007-04-01,14.64


In [None]:
# merge macroeconomic data with loan data
# 'issue_d' is datetime64 while macro['Date'] is read from the CSV as text;
# pandas refuses to merge keys of different dtypes, so parse the macro key first.
macro_keyed = macro.assign(Date=pd.to_datetime(macro['Date']))

# validate='m:1' asserts that each date occurs once in the macro table, so the
# left join can never duplicate loan rows.
df = df.merge(macro_keyed, how='left', left_on='issue_d', right_on='Date', validate='m:1')

df.head()

In [41]:
# Order the loans chronologically and use the issue date as the index.
df = df.sort_values(by='issue_d').set_index('issue_d')

In [42]:
# inspect the index, which the preceding cell set to the sorted issue dates
df.index

DatetimeIndex(['2007-07-01', '2007-07-01', '2007-10-01', '2007-10-01',
               '2007-11-01', '2007-12-01', '2007-12-01', '2008-01-01',
               '2008-01-01', '2008-01-01',
               ...
               '2020-09-01', '2020-09-01', '2020-09-01', '2020-09-01',
               '2020-09-01', '2020-09-01', '2020-09-01', '2020-09-01',
               '2020-09-01', '2020-09-01'],
              dtype='datetime64[ns]', name='issue_d', length=50000, freq=None)

In [43]:
# date six months before the most recent issue date in the dataset
last_issue_date = df.index[-1]
last_issue_date - pd.DateOffset(months=6)

Timestamp('2020-03-01 00:00:00')

In [44]:
# dimensions of the subset issued strictly after the six-month cutoff
six_months_back = df.index[-1] - pd.DateOffset(months=6)
issued_recently = df.index > six_months_back
df.loc[issued_recently].shape

(728, 14)

In [45]:
# extract contracts in the last six months for portfolio management and validation
portfolio_start = df.index[-1] - pd.DateOffset(months=6)
in_portfolio_window = df.index > portfolio_start
df_portfolio = df.loc[in_portfolio_window].copy()

In [None]:
# keep only the contracts issued on or before the six-month cutoff
# (the cutoff is derived from the current last date, so re-running this cell
# trims a further six months)
holdout_cutoff = df.index[-1] - pd.DateOffset(months=6)
on_or_before_cutoff = df.index <= holdout_cutoff
df = df.loc[on_or_before_cutoff].copy()
# show the most recent issue date that remains
df.index[-1]

# Train-Test Split

In [47]:
# number of observations per issue year, taken from the date index
issue_years = df.index.year
issue_years.value_counts()

issue_d
2019    8758
2018    8536
2017    7575
2016    7484
2015    7176
2014    4032
2020    2487
2013    2330
2012     911
2011     354
2010     207
2009      96
2008      47
2007       7
Name: count, dtype: int64

In [30]:

# chronological hold-out: loans issued from 2019 onwards form the test set,
# everything issued earlier is used for training.
# Boolean masks are label-style selections, so use .loc, and take explicit
# copies (as done for df_portfolio) so later edits to either split do not
# trigger SettingWithCopyWarning or touch df.
issue_year = df.index.year
test_set = df.loc[issue_year >= 2019].copy()
train_set = df.loc[issue_year < 2019].copy()

# Exploratory Data Analysis

In [31]:
# summary statistics of the training set, one row per feature
train_set.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,74293.0,39.941233,12.100845,0.0,30.0,38.0,49.0,77.0
Amount,74293.0,7.361657,0.925665,2.000128,6.398595,7.496652,8.066521,9.27153
AmountOfPreviousLoansBeforeLoan,37053.0,8.097002,1.01131,2.623668,7.438972,8.193124,8.886133,10.570342
CreditScoreEeMini,41053.0,6.853395,0.124113,6.216606,6.908755,6.908755,6.908755,6.908755
FreeCash,34287.0,5.455615,1.568414,0.00995,4.988628,5.891644,6.480045,11.975084
IncomeTotal,74230.0,7.151165,0.676233,0.693147,6.694562,7.09091,7.53356,13.827459
Interest,74293.0,0.339273,0.209317,0.02,0.2,0.2894,0.3786,1.0
LiabilitiesTotal,65196.0,6.139703,0.937905,1.386294,5.66296,6.216606,6.748771,12.058216
MonthlyPayment,63976.0,4.424172,0.942273,2.60417,3.646168,4.51623,5.121028,7.770451
NewCreditCustomer,74293.0,0.557186,0.496722,0.0,0.0,1.0,1.0,1.0
