# Become A Wise Investor on Lending Club

LendingClub is a US peer-to-peer lending company, headquartered in San Francisco, California. It was the first peer-to-peer lender to register its offerings as securities with the Securities and Exchange Commission (SEC), and to offer loan trading on a secondary market. LendingClub is the world's largest peer-to-peer lending platform.

Given historical data on loans, including whether or not the borrower defaulted (charge-off), I will build a model that can predict whether or not a borrower will pay back their loan. This way, when there is a new potential customer in the future, I can assess whether or not they are likely to pay back the loan. The dataset can be obtained from [Kaggle](https://www.kaggle.com/wordsforthewise/lending-club).


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline

In [None]:
# Load the raw loan records from the CSV export into a DataFrame.
csv_path = 'accepted_2007_to_2018Q4.csv'
df = pd.read_csv(csv_path)

## Data Cleaning

In [None]:
# Columns treated as redundant or as leaking information about the outcome.
redundant_or_leaky = [
    "desc", "url", "id", "member_id",
    "funded_amnt", "funded_amnt_inv",
    "grade", "sub_grade", "emp_title", "issue_d", "zip_code",
    "out_prncp", "out_prncp_inv",
    "total_pymnt", "total_pymnt_inv",
    "total_rec_prncp", "total_rec_int", "total_rec_late_fee",
    "recoveries", "collection_recovery_fee",
    "last_pymnt_d", "last_pymnt_amnt",
]
df.drop(columns=redundant_or_leaky, inplace=True)

# Keep only the columns that are populated in at least half of the rows,
# i.e. remove columns with more than 50% missing values.
min_non_null = df.shape[0] * 0.5
df.dropna(axis="columns", thresh=min_non_null, inplace=True)

# Columns retained for modelling: the candidate features plus 'loan_status'.
features_labels = [
    'loan_amnt', 'term', 'int_rate', 'installment',
    'emp_length', 'home_ownership', 'annual_inc', 'verification_status',
    'loan_status', 'pymnt_plan', 'purpose', 'title',
    'addr_state', 'dti', 'delinq_2yrs', 'earliest_cr_line',
    'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal',
    'revol_util', 'total_acc', 'initial_list_status', 'last_credit_pull_d',
    'collections_12_mths_ex_med', 'policy_code', 'application_type',
    'acc_now_delinq', 'chargeoff_within_12_mths', 'delinq_amnt',
    'pub_rec_bankruptcies', 'tax_liens',
]

# Name of the column that directly describes whether a loan was paid off.
labels = 'loan_status'

In [None]:
# Restrict the frame to the selected feature and label columns.
df = df[features_labels]

# Treat the problem as binary classification: keep only the loans whose
# status is 'Fully Paid' or 'Charged Off'.
status_replace = {"loan_status": {"Fully Paid": 1, "Charged Off": 0}}
status_codes = status_replace["loan_status"]
df = df[df['loan_status'].isin(list(status_codes))]

# Encode the label as integers (1 = fully paid, 0 = charged off).
# Mapping just this column produces an integer column directly; the previous
# DataFrame-wide replace() depended on pandas silently downcasting an object
# column, which is deprecated and would leave the label as object dtype.
df = df.assign(loan_status=df['loan_status'].map(status_codes))

In [None]:
# Display the current (rows, columns) of the DataFrame as the cell output.
df.shape

In [None]:
# Columns with a single distinct non-null value cannot help the model.
drop_columns = [col for col in df.columns if df[col].nunique(dropna=True) == 1]

# Columns that offer very little variability.
low_variability = ['delinq_amnt', 'acc_now_delinq', 'collections_12_mths_ex_med',
                   'chargeoff_within_12_mths', 'tax_liens', 'application_type']

# Remove both groups with one drop call. A low-variability column that is also
# single-valued is listed only once; dropping it in a second, separate call
# would raise KeyError because it would already be gone.
drop_columns.extend(col for col in low_variability if col not in drop_columns)
df.drop(columns=drop_columns, inplace=True)



In [None]:
# Discard every row that still has at least one missing value.
df.dropna(axis="index", how="any", inplace=True)

In [None]:
# Print pandas' summary of the DataFrame (column names and dtypes).
df.info()

In [None]:
# Explore the object columns, i.e. the ones that contain text
object_columns_df = df.select_dtypes(include=["object"])
print(object_columns_df.head())

# Drop categorical columns that contain too many values
df.drop(columns='addr_state', inplace=True)

# Title and purpose columns are repeated information. Remove title column
print(df["title"].value_counts())
print(df["purpose"].value_counts())
df.drop(columns='title', inplace=True)

# Extract the year (the last four characters) from the date-like text columns,
# replacing each source column with its integer year.
for date_col, year_col in (('earliest_cr_line', 'earliest_cr_year'),
                           ('last_credit_pull_d', 'last_credit_pull_year')):
    df[year_col] = df[date_col].str[-4:].astype(int)
    df = df.drop(columns=date_col)

# Convert emp_length to integers: strip the optional '+' and the ' year(s)'
# suffix, then map '< 1' to 0.
# regex= is passed explicitly because the default of Series.str.replace changed
# to literal matching in pandas 2.0; without it the pattern is not applied and
# the int conversion fails.
df['emp_length'] = (
    df['emp_length']
    .str.replace(r'\+*\syears*', '', regex=True)
    .str.replace('< 1', '0', regex=False)
    .astype(int)
)

In [None]:
# One-hot encode the categorical columns.
# home_ownership: fold the NONE and ANY levels into OTHER first, then swap the
# column for its indicator columns (the first level is dropped).
df['home_ownership'] = df['home_ownership'].replace(['NONE', 'ANY'], 'OTHER')
ownership_dummies = pd.get_dummies(df['home_ownership'], drop_first=True)
df = pd.concat([df.drop(columns='home_ownership'), ownership_dummies], axis=1)

# Remaining categorical columns, encoded together in the same way.
categorical_cols = ['verification_status', 'initial_list_status', 'purpose', 'term']
dummies = pd.get_dummies(df[categorical_cols], drop_first=True)
df = pd.concat([df.drop(columns=categorical_cols), dummies], axis=1)

In [None]:
# Continue with a 10% random sample of the rows (fixed seed, so repeatable).
df = df.sample(frac=0.1, random_state=101)

# Split into the label column and everything else.
target = df['loan_status']
features = df.drop(columns='loan_status')
df.shape

In [None]:
# Renumber the rows 0..n-1 and discard the old index. The prediction Series
# built in the modelling cells use a default index and are compared
# element-wise with df["loan_status"], so the two must line up.
df = df.reset_index(drop=True)

In [None]:
# Rebuild the label and the feature matrix from the re-indexed frame.
target = df['loan_status']
features = df.drop(columns=['loan_status'])

## Logistic Model

In [None]:
# Baseline logistic regression without any class weighting.
lr = LogisticRegression(max_iter=500)

# Out-of-fold predictions from 10-fold cross validation, wrapped in a Series
# so they can be compared element-wise with the labels.
predictions = pd.Series(cross_val_predict(lr, features, target, cv=10))

# Boolean masks for the four confusion-matrix cells
# (predicted value vs. actual loan_status).
actual = df["loan_status"]
fp_filter = predictions.eq(1) & actual.eq(0)   # false positives
tp_filter = predictions.eq(1) & actual.eq(1)   # true positives
fn_filter = predictions.eq(0) & actual.eq(1)   # false negatives
tn_filter = predictions.eq(0) & actual.eq(0)   # true negatives

# Counting the True entries gives the size of each cell.
fp, tp, fn, tn = (
    int(mask.sum()) for mask in (fp_filter, tp_filter, fn_filter, tn_filter)
)

# True positive rate and false positive rate.
tpr = tp / (tp + fn)
fpr = fp / (fp + tn)

print(tpr, fpr, sep="\n")

In [None]:
# Logistic regression with class_weight='balanced' to account for the
# imbalance between the two classes.
lr = LogisticRegression(class_weight='balanced', max_iter=500)

# Out-of-fold predictions from 10-fold cross validation, wrapped in a Series
# so they can be compared element-wise with the labels.
predictions = pd.Series(cross_val_predict(lr, features, target, cv=10))

# Boolean masks for the four confusion-matrix cells
# (predicted value vs. actual loan_status).
actual = df["loan_status"]
fp_filter = predictions.eq(1) & actual.eq(0)   # false positives
tp_filter = predictions.eq(1) & actual.eq(1)   # true positives
fn_filter = predictions.eq(0) & actual.eq(1)   # false negatives
tn_filter = predictions.eq(0) & actual.eq(0)   # true negatives

# Counting the True entries gives the size of each cell.
fp, tp, fn, tn = (
    int(mask.sum()) for mask in (fp_filter, tp_filter, fn_filter, tn_filter)
)

# True positive rate and false positive rate.
tpr = tp / (tp + fn)
fpr = fp / (fp + tn)

print(tpr, fpr, sep="\n")

In [None]:
# To improve the FPR, weight the classes manually: a penalty of 10 for
# misclassifying a 0 and a penalty of 1 for misclassifying a 1.
penalty = {0: 10, 1: 1}
lr = LogisticRegression(class_weight=penalty, max_iter=500)

# Out-of-fold predictions from 10-fold cross validation, wrapped in a Series
# so they can be compared element-wise with the labels.
predictions = pd.Series(cross_val_predict(lr, features, target, cv=10))

# Boolean masks for the four confusion-matrix cells
# (predicted value vs. actual loan_status).
actual = df["loan_status"]
fp_filter = predictions.eq(1) & actual.eq(0)   # false positives
tp_filter = predictions.eq(1) & actual.eq(1)   # true positives
fn_filter = predictions.eq(0) & actual.eq(1)   # false negatives
tn_filter = predictions.eq(0) & actual.eq(0)   # true negatives

# Counting the True entries gives the size of each cell.
fp, tp, fn, tn = (
    int(mask.sum()) for mask in (fp_filter, tp_filter, fn_filter, tn_filter)
)

# True positive rate and false positive rate.
tpr = tp / (tp + fn)
fpr = fp / (fp + tn)

print(tpr, fpr, sep="\n")

## Random Forest Model

In [None]:
# Random forest with class_weight="balanced" to account for the imbalance
# between the two classes.
rf = RandomForestClassifier(class_weight="balanced")

# Out-of-fold predictions from 10-fold cross validation, wrapped in a Series
# so they can be compared element-wise with the labels.
predictions = pd.Series(cross_val_predict(rf, features, target, cv=10))

# Boolean masks for the four confusion-matrix cells
# (predicted value vs. actual loan_status).
actual = df["loan_status"]
fp_filter = predictions.eq(1) & actual.eq(0)   # false positives
tp_filter = predictions.eq(1) & actual.eq(1)   # true positives
fn_filter = predictions.eq(0) & actual.eq(1)   # false negatives
tn_filter = predictions.eq(0) & actual.eq(0)   # true negatives

# Counting the True entries gives the size of each cell.
fp, tp, fn, tn = (
    int(mask.sum()) for mask in (fp_filter, tp_filter, fn_filter, tn_filter)
)

# True positive rate and false positive rate.
tpr = tp / (tp + fn)
fpr = fp / (fp + tn)

print(tpr, fpr, sep="\n")

## Neural Networks Model

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation,Dropout
from tensorflow.keras.constraints import max_norm
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Feed-forward binary classifier: three ReLU Dense layers (36, 18 and 9
# units), each followed by 20% dropout, and a single sigmoid output unit.
model = Sequential([
    # first Dense layer (receives the input features)
    Dense(36, activation='relu'),
    Dropout(0.2),
    # hidden layer
    Dense(18, activation='relu'),
    Dropout(0.2),
    # hidden layer
    Dense(9, activation='relu'),
    Dropout(0.2),
    # output layer
    Dense(units=1, activation='sigmoid'),
])

# Compile with binary cross-entropy loss and the Adam optimizer.
model.compile(optimizer='adam', loss='binary_crossentropy')

In [None]:
from sklearn.model_selection import KFold

# 10-fold cross validation with shuffled, reproducible splits.
kf = KFold(n_splits=10, shuffle=True, random_state=1)

# Confusion-matrix counts accumulated over all folds.
fp = 0
tp = 0
fn = 0
tn = 0

for train_index, test_index in kf.split(features):
    X_train, X_test = features.iloc[train_index].values, features.iloc[test_index].values
    y_train, y_test = target.iloc[train_index].values, target.iloc[test_index].values

    # Data transformation: fit the scaler on the training fold only and
    # apply the same scaling to the held-out fold.
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Train a freshly initialised copy of the architecture for every fold.
    # Fitting the same model object repeatedly would carry weights over from
    # earlier folds, whose training data contains this fold's test rows.
    fold_model = tf.keras.models.clone_model(model)
    fold_model.compile(loss='binary_crossentropy', optimizer='adam')
    fold_model.fit(x=X_train, y=y_train, epochs=100, batch_size=250)

    # Sequential.predict_classes no longer exists in current TensorFlow;
    # thresholding the sigmoid output at 0.5 is the equivalent.
    predictions = (fold_model.predict(X_test) > 0.5).astype("int32").ravel()

    # Rows are the actual class, columns the predicted class. Passing the
    # labels explicitly keeps the matrix 2x2 even if a class is absent.
    results = confusion_matrix(y_test, predictions, labels=[0, 1])

    # False positives.
    fp += results[0][1]

    # True positives.
    tp += results[1][1]

    # False negatives.
    fn += results[1][0]

    # True negatives
    tn += results[0][0]

# Rates
tpr = tp / (tp + fn)
fpr = fp / (fp + tn)
print(tpr)
print(fpr)

## Summary

Without accounting for imbalanced classes, the 3 models are good at identifying the good loans (high true positive rate), but they also incorrectly classify most of the bad loans as good (high false positive rate). The neural network model proves to be the best of the three (lowest FPR).

When I account for imbalanced classes, the logistic model can lower the FPR to 5.2%, and thus lower the risk. Note that this comes at the expense of the true positive rate (19.2%).