In [117]:
#We are trying to answer pertinent questions about this data. 
#What are some relevant predictions that can be derived? 
#Like what might be the likelihood that an investment would be repaid,
#or what would be the likelihood that a loan application would be accepted? 

%matplotlib inline
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [118]:
#Setting user directory

os.chdir("/Users/EagleFace/Documents")


In [119]:
#First we are going to analyze the accepted loans data frame,
#and look at loan repayment likelihoods, before doing any 
#cross-matrix analysis, it will be good to 
#get traction with one matrix at a time and 
#see what insights can be communicated.


In [120]:
data = pd.read_csv('Loan_Data/LoanStats3a.csv', skiprows = 1, low_memory = False)


In [121]:
data_target = data['loan_status'] == 'Fully Paid'
data_target = data_target.astype(int)
data['target'] = data_target

#This sets up the classifier properly

In [122]:
#This drops some extraneous data

data = data.dropna(axis = 1, how = 'all')
data = data.drop('id', axis = 1)


In [123]:
#Identifying potentially categorical columns

data_categorical = data.select_dtypes(include=['object'])
print(data_categorical.columns)

Index(['term', 'int_rate', 'grade', 'sub_grade', 'emp_title', 'emp_length',
       'home_ownership', 'verification_status', 'issue_d', 'loan_status',
       'pymnt_plan', 'desc', 'purpose', 'title', 'zip_code', 'addr_state',
       'earliest_cr_line', 'revol_util', 'initial_list_status', 'last_pymnt_d',
       'next_pymnt_d', 'last_credit_pull_d', 'application_type',
       'hardship_flag', 'disbursement_method', 'debt_settlement_flag',
       'debt_settlement_flag_date', 'settlement_status', 'settlement_date'],
      dtype='object')


In [124]:
print(data_categorical['purpose'].unique())

print(data_categorical['term'].unique())

['credit_card' 'car' 'small_business' 'other' 'wedding'
 'debt_consolidation' 'home_improvement' 'major_purchase' 'medical'
 'moving' 'vacation' 'house' 'renewable_energy' 'educational' nan]
[' 36 months' ' 60 months' nan]


In [125]:
#Dopping more extraneous data

data_categorical = data_categorical.drop(['initial_list_status', 'hardship_flag', 'pymnt_plan', 'grade', 'application_type', 'next_pymnt_d', 'title', 'desc', 'zip_code', 'addr_state', 'emp_title'], axis=1) 

In [126]:
#Dropping 2 rows and converting int_rate from string to numerical

data_categorical = data_categorical.drop(data_categorical.tail(1).index)
data_categorical = data_categorical.drop(data_categorical.tail(1).index)

data_categorical.int_rate = data_categorical.int_rate.str.rstrip('%').astype(float) / 100.0 

In [127]:
data_categorical['term'] = data_categorical['term'] == ' 60 months' 
data_categorical['term'] = data_categorical['term'].astype(int)

In [128]:
home_ownership = pd.get_dummies(data_categorical['home_ownership']) 
data_categorical = data_categorical.join(home_ownership)
data_categorical = data_categorical.drop('home_ownership', axis = 1)

In [129]:
data_categorical['verification_status'] = data_categorical['verification_status'] == 'Verified'
data_categorical['verification_status'] = data_categorical['verification_status'].astype(int)

In [130]:
#To properly format this variable
data_categorical.revol_util = data_categorical.revol_util.str.rstrip('%').astype(float) / 100.0 

In [131]:
#Next we have to deal with employment length, 
#which is a quasi-numerical category. 

data_categorical['emp_length'] = data_categorical['emp_length'].fillna('n/a')
emp_length = pd.get_dummies(data_categorical['emp_length'])
data_categorical = data_categorical.join(emp_length)
data_categorical = data_categorical.drop('emp_length', axis = 1)


In [132]:
purpose = pd.get_dummies(data_categorical['purpose'])
data_categorical = data_categorical.join(purpose)
data_categorical = data_categorical.drop('purpose', axis = 1)

In [133]:
data_categorical['sub_grade'] = data_categorical['sub_grade'].astype('category', categories = [grade+str(subgrade) for grade in 'ABCDEFG' for subgrade in range(1, 6)], ordered = True) 

In [134]:
#Dropping more extraneous data

data_categorical = data_categorical.drop('n/a', axis=1)
data_categorical = data_categorical.drop('last_pymnt_d', axis=1)
data_categorical = data_categorical.drop('earliest_cr_line', axis=1)
data_categorical = data_categorical.drop('issue_d', axis=1)
data_categorical = data_categorical.drop('last_credit_pull_d', axis=1)

In [135]:
loan_status = pd.get_dummies(data_categorical['loan_status'])
data_categorical = data_categorical.join(loan_status)
data_categorical = data_categorical.drop('loan_status', axis=1)
data_categorical = data_categorical.drop('Does not meet the credit policy. Status:Charged Off', axis=1)
data_categorical = data_categorical.drop('Does not meet the credit policy. Status:Fully Paid', axis=1)
data_categorical.head()


Unnamed: 0,term,int_rate,sub_grade,verification_status,revol_util,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,...,major_purchase,medical,moving,other,renewable_energy,small_business,vacation,wedding,Charged Off,Fully Paid
0,0,0.1065,B2,1,0.837,Cash,N,,,,...,0,0,0,0,0,0,0,0,0,1
1,1,0.1527,C4,0,0.094,Cash,N,,,,...,0,0,0,0,0,0,0,0,1,0
2,0,0.1596,C5,0,0.985,Cash,N,,,,...,0,0,0,0,0,1,0,0,0,1
3,0,0.1349,C1,0,0.21,Cash,N,,,,...,0,0,0,1,0,0,0,0,0,1
4,1,0.1269,B5,0,0.539,Cash,N,,,,...,0,0,0,1,0,0,0,0,0,1


In [136]:
#Checking which variables are homogenous

data_numerical = data.select_dtypes(include=['int', 'float64'])
data_numerical.std() == 0.0


loan_amnt                     False
funded_amnt                   False
funded_amnt_inv               False
installment                   False
annual_inc                    False
dti                           False
delinq_2yrs                   False
inq_last_6mths                False
mths_since_last_delinq        False
mths_since_last_record        False
open_acc                      False
pub_rec                       False
revol_bal                     False
total_acc                     False
out_prncp                      True
out_prncp_inv                  True
total_pymnt                   False
total_pymnt_inv               False
total_rec_prncp               False
total_rec_int                 False
total_rec_late_fee            False
recoveries                    False
collection_recovery_fee       False
last_pymnt_amnt               False
collections_12_mths_ex_med     True
policy_code                    True
acc_now_delinq                False
chargeoff_within_12_mths    

In [137]:
data_numerical = data_numerical.drop(['out_prncp', 'out_prncp_inv', 'collections_12_mths_ex_med', 'policy_code', 'chargeoff_within_12_mths'], axis=1)

In [138]:
data_numerical['mths_since_last_delinq'] = data_numerical['mths_since_last_delinq'].fillna(120.0)

In [139]:
data_numerical['mths_since_last_record'] = data_numerical['mths_since_last_record'].fillna(129.0)
data_numerical['delinq_2yrs'] = data_numerical['delinq_2yrs'].fillna(0.0)
data_numerical = data_numerical.drop('tax_liens', axis=1)
data_numerical['funded_amnt'] = data_numerical['funded_amnt'].fillna(0.0)
data_numerical['loan_amnt'] = data_numerical['loan_amnt'].fillna(0.0)
data_numerical = data_numerical.fillna(0.0)



In [140]:
#This should complete the training set

data_train = data_numerical.join(data_categorical)
data_train = data_train.drop(data_train.loc[data_train['int_rate'].isnull()].index)
data_train = data_train.dropna(axis=0)
data_train.head()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,...,major_purchase,medical,moving,other,renewable_energy,small_business,vacation,wedding,Charged Off,Fully Paid
105,18825.0,18825.0,18800.0,465.53,38000.0,23.18,0.0,0.0,120.0,129.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
188,16000.0,16000.0,16000.0,536.72,58500.0,20.64,0.0,0.0,120.0,129.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
255,10000.0,10000.0,10000.0,334.16,39000.0,18.31,0.0,1.0,120.0,129.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
362,25000.0,25000.0,23505.293842,654.31,60000.0,20.04,0.0,0.0,120.0,129.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
430,33425.0,20675.0,19010.821218,475.63,75000.0,25.71,0.0,3.0,120.0,129.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [141]:
#Now with these values cleaned up we can look at the correlation
#coefficient matrix

corr = data_numerical.corr()
plt.figure(figsize=(16,16))
sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns)

ValueError: Arrays were different lengths: 29 vs 0

<matplotlib.figure.Figure at 0x1276d3e48>

In [142]:
from sklearn.cross_validation import train_test_split
x_train, x_test = train_test_split(data_train)

train_target = x_train['target']
x_train = x_train.drop('target', axis=1)
test_target = x_test['target']
x_test = x_test.drop('target', axis=1)



In [143]:
#Trying out a random forest classifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
parameters = { 'max_depth': [1, 2, 3] }
rf = RandomForestClassifier()
clf = GridSearchCV(rf, parameters)
clf.fit(x_train, train_target)

ValueError: could not convert string to float: 'May-2015'