# Predicting LendingClub Defaults

## Cleaning Data

In [1]:
import pandas as pd
import numpy as np

# Load the raw loan export into a DataFrame.
# low_memory=False asks pandas to parse the file in one pass instead of in
# chunks, which avoids per-chunk dtype guessing on mixed-type columns.
loans_2007 = pd.read_csv('loans_2007.csv',low_memory=False)

In [2]:
# Peek at the first row to see which columns the raw file provides.
loans_2007.head(1)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,last_pymnt_amnt,last_credit_pull_d,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,10.65%,162.87,B,B2,...,171.62,Jun-2016,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0


In [3]:
# Number of columns before any cleaning.
len(loans_2007.columns)

52

In [4]:
def listify(string):
    """
    Turn a newline-delimited block of text into a list of its lines.

    Empty lines (including the ones produced by a leading or trailing
    newline) are discarded; surrounding whitespace on a line is kept as-is.
    """
    return [line for line in string.split("\n") if line != ""]

In [5]:
# First batch of columns to discard. Stored as one newline-delimited string
# (with a leading and trailing newline) so it can be handed to listify().
drop1 = "\n".join([
    "",
    "id",
    "member_id",
    "funded_amnt",
    "funded_amnt_inv",
    "grade",
    "sub_grade",
    "emp_title",
    "issue_d",
    "",
])

In [6]:
# Second batch of columns to discard, in the same newline-delimited string
# form that listify() splits apart.
drop2 = "\n".join([
    "",
    "zip_code",
    "out_prncp",
    "out_prncp_inv",
    "total_pymnt",
    "total_pymnt_inv",
    "total_rec_prncp",
    "",
])

In [7]:
# Third batch of columns to discard, in the same newline-delimited string
# form that listify() splits apart.
drop3 = "\n".join([
    "",
    "total_rec_int",
    "total_rec_late_fee",
    "recoveries",
    "collection_recovery_fee",
    "last_pymnt_d",
    "last_pymnt_amnt",
    "",
])

In [8]:
# Flatten the three newline-delimited batches into one list of column names,
# preserving batch order, then remove those columns in a single pass.
drop = [name for batch in (drop1, drop2, drop3) for name in listify(batch)]
loans_2007 = loans_2007.drop(drop, axis=1)

In [9]:
# Confirm the drop: show the first remaining row, then the column count.
first_row = loans_2007.head(1)
print(first_row)
print(loans_2007.shape[1])

   loan_amnt        term int_rate  installment emp_length home_ownership  \
0     5000.0   36 months   10.65%       162.87  10+ years           RENT   

   annual_inc verification_status loan_status pymnt_plan    ...      \
0     24000.0            Verified  Fully Paid          n    ...       

  initial_list_status last_credit_pull_d collections_12_mths_ex_med  \
0                   f           Jun-2016                        0.0   

   policy_code  application_type acc_now_delinq  chargeoff_within_12_mths  \
0          1.0        INDIVIDUAL            0.0                       0.0   

   delinq_amnt  pub_rec_bankruptcies  tax_liens  
0          0.0                   0.0        0.0  

[1 rows x 32 columns]
32


In [10]:
# Show how many loans fall under each loan_status value.
print(loans_2007["loan_status"].value_counts())

Fully Paid                                             33136
Charged Off                                             5634
Does not meet the credit policy. Status:Fully Paid      1988
Current                                                  961
Does not meet the credit policy. Status:Charged Off      761
Late (31-120 days)                                        24
In Grace Period                                           20
Late (16-30 days)                                          8
Default                                                    3
Name: loan_status, dtype: int64


In [11]:
# Only loans with one of these two statuses are retained; every other row
# (including rows whose status is missing) is dropped by index label.
keep = ["Fully Paid","Charged Off"]
has_other_status = ~loans_2007["loan_status"].isin(keep)
drop_row = loans_2007.index[has_other_status].tolist()
loans_2007 = loans_2007.drop(drop_row)

In [12]:
# Encode the target as integers: 1 for a repaid loan, 0 for a charged-off one.
# The outer key restricts the replacement to the loan_status column.
status_codes = {'Fully Paid': 1, 'Charged Off': 0}
mapping = {'loan_status': status_codes}

loans_2007 = loans_2007.replace(mapping)

In [13]:
# A column with exactly one distinct non-null value carries no signal.
# nunique() ignores nulls, so a column that is constant apart from missing
# entries is also caught; an all-null column (0 distinct values) is not.
drop_columns = [
    column
    for column in loans_2007.columns
    if loans_2007[column].nunique() == 1
]
loans_2007 = loans_2007.drop(drop_columns,axis=1)
print(drop_columns)

['pymnt_plan', 'initial_list_status', 'collections_12_mths_ex_med', 'policy_code', 'application_type', 'acc_now_delinq', 'chargeoff_within_12_mths', 'delinq_amnt', 'tax_liens']


In [14]:
# Checkpoint the filtered frame. index=False leaves the row labels out of the
# file, so re-reading it yields a fresh 0..n-1 index.
loans_2007.to_csv('filtered_loans_2007.csv',index=False)

## Feature Preparation

In [15]:
# Reload the checkpoint written at the end of the cleaning section.
loans = pd.read_csv('filtered_loans_2007.csv')

In [16]:
# Preview the first rows of the filtered data.
loans.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,emp_length,home_ownership,annual_inc,verification_status,loan_status,purpose,...,delinq_2yrs,earliest_cr_line,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,total_acc,last_credit_pull_d,pub_rec_bankruptcies
0,5000.0,36 months,10.65%,162.87,10+ years,RENT,24000.0,Verified,1,credit_card,...,0.0,Jan-1985,1.0,3.0,0.0,13648.0,83.7%,9.0,Jun-2016,0.0
1,2500.0,60 months,15.27%,59.83,< 1 year,RENT,30000.0,Source Verified,0,car,...,0.0,Apr-1999,5.0,3.0,0.0,1687.0,9.4%,4.0,Sep-2013,0.0
2,2400.0,36 months,15.96%,84.33,10+ years,RENT,12252.0,Not Verified,1,small_business,...,0.0,Nov-2001,2.0,2.0,0.0,2956.0,98.5%,10.0,Jun-2016,0.0
3,10000.0,36 months,13.49%,339.31,10+ years,RENT,49200.0,Source Verified,1,other,...,0.0,Feb-1996,1.0,10.0,0.0,5598.0,21%,37.0,Apr-2016,0.0
4,5000.0,36 months,7.90%,156.46,3 years,RENT,36000.0,Source Verified,1,wedding,...,0.0,Nov-2004,3.0,9.0,0.0,7963.0,28.3%,12.0,Jan-2016,0.0


In [17]:
# Missing values per column, followed by the total row count for scale.
null_counts = loans.isnull().sum()
total_rows = loans.shape[0]
print(null_counts)
print(total_rows)

loan_amnt                 0
term                      0
int_rate                  0
installment               0
emp_length                0
home_ownership            0
annual_inc                0
verification_status       0
loan_status               0
purpose                   0
title                    10
addr_state                0
dti                       0
delinq_2yrs               0
earliest_cr_line          0
inq_last_6mths            0
open_acc                  0
pub_rec                   0
revol_bal                 0
revol_util               50
total_acc                 0
last_credit_pull_d        2
pub_rec_bankruptcies    697
dtype: int64
38770


In [18]:
# Remove the pub_rec_bankruptcies column outright, then discard any row that
# still has a missing value in one of the remaining columns.
loans = (
    loans
    .drop('pub_rec_bankruptcies', axis=1)
    .dropna()
)

In [19]:
# Count how many columns there are of each dtype.
print(loans.dtypes.value_counts())

object     11
float64    10
int64       1
dtype: int64


In [20]:
# Pull out the object-dtype (text) columns and show one sample row of them.
object_columns_df = loans.select_dtypes(include=['object'])
object_columns_df.head(1)

Unnamed: 0,term,int_rate,emp_length,home_ownership,verification_status,purpose,title,addr_state,earliest_cr_line,revol_util,last_credit_pull_d
0,36 months,10.65%,10+ years,RENT,Verified,credit_card,Computer,AZ,Jan-1985,83.7%,Jun-2016


In [21]:
# The two month-year text columns are dropped rather than parsed.
date_like_columns = ["earliest_cr_line","last_credit_pull_d"]
loans = loans.drop(date_like_columns, axis=1)

# Print the value distribution of each candidate categorical column.
cols = ['home_ownership', 'verification_status', 'emp_length', 'term', 'addr_state']

for column_name in cols:
    distribution = loans[column_name].value_counts()
    print(distribution)

RENT        18513
MORTGAGE    17112
OWN          2984
OTHER          96
NONE            3
Name: home_ownership, dtype: int64
Not Verified       16696
Verified           12290
Source Verified     9722
Name: verification_status, dtype: int64
10+ years    8545
< 1 year     4513
2 years      4303
3 years      4022
4 years      3353
5 years      3202
1 year       3176
6 years      2177
7 years      1714
8 years      1442
9 years      1229
n/a          1032
Name: emp_length, dtype: int64
 36 months    29041
 60 months     9667
Name: term, dtype: int64
CA    6958
NY    3713
FL    2791
TX    2667
NJ    1798
IL    1483
PA    1473
VA    1376
GA    1364
MA    1301
OH    1179
MD    1026
AZ     850
WA     822
CO     770
NC     753
CT     730
MI     712
MO     671
MN     603
NV     481
SC     462
WI     441
AL     437
OR     436
LA     430
KY     315
OK     290
KS     260
UT     254
AR     237
DC     209
RI     196
NM     184
WV     172
NH     166
HI     166
DE     113
MT      83
WY      80
AK      

In [22]:
# Print the value distribution of purpose, then of title.
for column_name in ('purpose', 'title'):
    print(loans[column_name].value_counts())

debt_consolidation    18130
credit_card            5039
other                  3864
home_improvement       2897
major_purchase         2155
small_business         1762
car                    1510
wedding                 929
medical                 680
moving                  576
vacation                375
house                   369
educational             320
renewable_energy        102
Name: purpose, dtype: int64
Debt Consolidation                          2104
Debt Consolidation Loan                     1632
Personal Loan                                642
Consolidation                                494
debt consolidation                           485
Credit Card Consolidation                    353
Home Improvement                             346
Debt consolidation                           324
Small Business Loan                          310
Credit Card Loan                             305
Personal                                     302
Consolidation Loan                       

In [23]:
# Drop title and addr_state instead of encoding them.
# NOTE(review): presumably because title is free text overlapping `purpose`
# and addr_state would add one dummy column per state - confirm.
loans = loans.drop(['title','addr_state'],axis=1)

In [24]:
# Percentage columns arrive as text such as "10.65%"; strip the trailing
# percent sign and store the number as a float.
for percent_column in ('int_rate', 'revol_util'):
    as_text = loans[percent_column].str.rstrip('%')
    loans[percent_column] = as_text.astype(float)

# Employment length becomes a number of years. Both "< 1 year" and "n/a"
# are mapped to 0.
emp_length_years = {
    "n/a": 0,
    "< 1 year": 0,
    "1 year": 1,
    "2 years": 2,
    "3 years": 3,
    "4 years": 4,
    "5 years": 5,
    "6 years": 6,
    "7 years": 7,
    "8 years": 8,
    "9 years": 9,
    "10+ years": 10,
}
mapping_dict = {"emp_length": emp_length_years}

loans = loans.replace(mapping_dict)

In [25]:
# One-hot encode the remaining categorical columns.
dummies = ["home_ownership", "verification_status", "purpose", "term"]

# Cast each of them to the category dtype before encoding.
as_categories = loans.assign(
    **{name: loans[name].astype('category') for name in dummies}
)

# Encode, then swap the original columns for the indicator columns: the
# untouched columns keep their order and the indicators are appended.
dummy_df = pd.get_dummies(as_categories[dummies])
loans = pd.concat([as_categories.drop(dummies, axis=1), dummy_df], axis=1)

In [26]:
# Checkpoint the fully numeric frame. index=False leaves the row labels out
# of the file, so re-reading it yields a fresh 0..n-1 index.
loans.to_csv('cleaned_loans_2007.csv', index=False)

## Making Predictions

In [27]:
# Reload the checkpoint written at the end of feature preparation.
loans = pd.read_csv('cleaned_loans_2007.csv')

In [28]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
loans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38708 entries, 0 to 38707
Data columns (total 38 columns):
loan_amnt                              38708 non-null float64
int_rate                               38708 non-null float64
installment                            38708 non-null float64
emp_length                             38708 non-null int64
annual_inc                             38708 non-null float64
loan_status                            38708 non-null int64
dti                                    38708 non-null float64
delinq_2yrs                            38708 non-null float64
inq_last_6mths                         38708 non-null float64
open_acc                               38708 non-null float64
pub_rec                                38708 non-null float64
revol_bal                              38708 non-null float64
revol_util                             38708 non-null float64
total_acc                              38708 non-null float64
home_ownership_MORTGAGE    

In [29]:
def classify(y, predictions):
    """
    Count the four confusion-matrix cells for binary (0/1) labels.

    The two inputs are compared position by position. Any pandas index on
    either argument is ignored, so a Series of predictions with a fresh
    0..n-1 index can be scored against labels whose index has gaps.

    INPUT:
    y: correctly classified data (0/1 labels)
    predictions: predicted data (0/1 labels), same length as y

    OUTPUT:
    true_negative, true_positive, false_negative, false_positive

    RAISES:
    ValueError: if y and predictions do not have the same length
    """
    # Work on plain arrays: boolean masks built from two pandas objects would
    # align on index labels rather than on row position.
    actual = np.asarray(y).ravel()
    predicted = np.asarray(predictions).ravel()
    if actual.shape != predicted.shape:
        raise ValueError(
            "y and predictions must have the same length, got %d and %d"
            % (actual.size, predicted.size)
        )

    tn = int(np.sum((predicted == 0) & (actual == 0)))
    tp = int(np.sum((predicted == 1) & (actual == 1)))
    fn = int(np.sum((predicted == 0) & (actual == 1)))
    fp = int(np.sum((predicted == 1) & (actual == 0)))

    return tn, tp, fn, fp

In [30]:
# Naive baseline: predict 1 for every loan.
row_count = len(loans)
predictions = pd.Series(np.ones(row_count))

tn, tp, fn, fp = classify(loans["loan_status"],predictions)

# fpr: share of actual 0s predicted as 1; tpr: share of actual 1s predicted as 1.
fpr = fp / (fp + tn)
tpr = tp / (tp + fn)

for rate in (fpr, tpr):
    print(rate)

1.0
1.0


In [31]:
from sklearn.linear_model import LogisticRegression
# Logistic regression fitted on every column except the target and scored on
# the very rows it was trained on (no hold-out here).
lr = LogisticRegression()

target = loans['loan_status']
feature_cols = loans.columns[loans.columns != 'loan_status'].tolist()
features = loans[feature_cols]

# fit() returns the estimator itself, so the prediction can be chained.
predictions = lr.fit(features, target).predict(features)

tn, tp, fn, fp = classify(loans["loan_status"], predictions)

# fpr: share of actual 0s predicted as 1; tpr: share of actual 1s predicted as 1.
fpr = fp / (fp + tn)
tpr = tp / (tp + fn)

for rate in (fpr, tpr):
    print(rate)

0.996260017809439
0.9985495421992566


In [32]:
from sklearn.model_selection import cross_val_predict, KFold

# Logistic regression scored on out-of-fold predictions.
lr = LogisticRegression()
# KFold only consults random_state when shuffle=True. Current scikit-learn
# raises ValueError when a seed is given with shuffle left at False, and
# older releases silently ignored it, so the seed is dropped. The folds stay
# contiguous and unshuffled.
kf = KFold()
# cross_val_predict fits its own clones of `lr`, one per fold; this
# full-data fit does not influence the predictions and only leaves `lr`
# itself in a fitted state.
lr.fit(features,target)
predictions = cross_val_predict(lr,features,target,cv=kf)

predictions = pd.Series(predictions)

tn, tp, fn, fp = classify(loans["loan_status"],predictions)

# fpr: share of actual 0s predicted as 1; tpr: share of actual 1s predicted as 1.
fpr = fp / (fp + tn)
tpr = tp / (tp + fn)

print(fpr)
print(tpr)

0.9969723953695458
0.9988517209077449


In [33]:
from sklearn.model_selection import cross_val_predict, KFold

# Same cross-validated setup, with class_weight='balanced' passed to the
# logistic regression.
lr = LogisticRegression(class_weight='balanced')
# KFold only consults random_state when shuffle=True. Current scikit-learn
# raises ValueError when a seed is given with shuffle left at False, and
# older releases silently ignored it, so the seed is dropped. The folds stay
# contiguous and unshuffled.
kf = KFold()
# cross_val_predict fits its own clones of `lr`, one per fold; this
# full-data fit does not influence the predictions and only leaves `lr`
# itself in a fitted state.
lr.fit(features,target)
predictions = cross_val_predict(lr,features,target,cv=kf)

predictions = pd.Series(predictions)

tn, tp, fn, fp = classify(loans["loan_status"],predictions)

# fpr: share of actual 0s predicted as 1; tpr: share of actual 1s predicted as 1.
fpr = fp / (fp + tn)
tpr = tp / (tp + fn)

print(fpr)
print(tpr)

0.3976847729296527
0.6732239446408605


In [34]:
# Manual class weights: label 0 is weighted 10x relative to label 1.
penalty = {
    0: 10,
    1: 1
}

lr = LogisticRegression(class_weight=penalty)
# KFold only consults random_state when shuffle=True. Current scikit-learn
# raises ValueError when a seed is given with shuffle left at False, and
# older releases silently ignored it, so the seed is dropped. The folds stay
# contiguous and unshuffled.
kf = KFold()
# cross_val_predict fits its own clones of `lr`, one per fold; this
# full-data fit does not influence the predictions and only leaves `lr`
# itself in a fitted state.
lr.fit(features,target)
predictions = cross_val_predict(lr,features,target,cv=kf)

predictions = pd.Series(predictions)

tn, tp, fn, fp = classify(loans["loan_status"],predictions)

# fpr: share of actual 0s predicted as 1; tpr: share of actual 1s predicted as 1.
fpr = fp / (fp + tn)
tpr = tp / (tp + fn)

print(fpr)
print(tpr)

0.0780053428317008
0.21974435681261897


In [35]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with class_weight='balanced'; random_state=1 seeds the forest
# itself so repeated runs build the same trees.
clf = RandomForestClassifier(class_weight='balanced',random_state=1)
# KFold only consults random_state when shuffle=True. Current scikit-learn
# raises ValueError when a seed is given with shuffle left at False, and
# older releases silently ignored it, so the seed is dropped. The folds stay
# contiguous and unshuffled.
kf = KFold()
# cross_val_predict fits its own clones of `clf`, one per fold; this
# full-data fit does not influence the predictions and only leaves `clf`
# itself in a fitted state.
clf.fit(features,target)
predictions = cross_val_predict(clf,features,target,cv=kf)

predictions = pd.Series(predictions)

tn, tp, fn, fp = classify(loans["loan_status"],predictions)

# fpr: share of actual 0s predicted as 1; tpr: share of actual 1s predicted as 1.
fpr = fp / (fp + tn)
tpr = tp / (tp + fn)

print(fpr)
print(tpr)

0.9330365093499555
0.971806726498051


In [36]:
# Random forest using the manual `penalty` class weights; random_state=1
# seeds the forest itself so repeated runs build the same trees.
clf = RandomForestClassifier(class_weight=penalty,random_state=1)
# KFold only consults random_state when shuffle=True. Current scikit-learn
# raises ValueError when a seed is given with shuffle left at False, and
# older releases silently ignored it, so the seed is dropped. The folds stay
# contiguous and unshuffled.
kf = KFold()
# cross_val_predict fits its own clones of `clf`, one per fold; this
# full-data fit does not influence the predictions and only leaves `clf`
# itself in a fitted state.
clf.fit(features,target)
predictions = cross_val_predict(clf,features,target,cv=kf)

predictions = pd.Series(predictions)

tn, tp, fn, fp = classify(loans["loan_status"],predictions)
# fpr: share of actual 0s predicted as 1; tpr: share of actual 1s predicted as 1.
fpr = fp / (fp + tn)
tpr = tp / (tp + fn)

print(fpr)
print(tpr)

0.9408726625111309
0.9737104523615266
