In [2]:
import csv

# Read the whole loan export into memory as a list of dicts, one per CSV row,
# keyed by the column names from the header line.
# newline='' is what the csv module asks for when it is handed a file object;
# without it, quoted fields that contain line breaks can be split incorrectly.
with open('LoanStats3b.csv', newline='') as csv_file:
    csv_data = list(csv.DictReader(csv_file))

# Peek at the first record to see which columns are available.
print(csv_data[0])

{'bc_util': '89.4', 'mo_sin_old_rev_tl_op': '290', 'open_il_24m': '', 'last_pymnt_d': 'May-2016', 'num_il_tl': '9', 'out_prncp_inv': '2478.36', 'revol_util': '44.4%', 'out_prncp': '2478.36', 'grade': 'B', 'pymnt_plan': 'n', 'total_rec_int': '1469.69', 'percent_bc_gt_75': '66.7', 'pct_tl_nvr_dlq': '77.3', 'acc_now_delinq': '0', 'mths_since_recent_revol_delinq': '11', 'mths_since_recent_inq': '8', 'num_actv_rev_tl': '4', 'num_bc_sats': '3', 'recoveries': '0.0', 'mths_since_recent_bc': '25', 'max_bal_bc': '', 'annual_inc': '102000', 'inq_last_12m': '', 'last_credit_pull_d': 'Apr-2016', 'funded_amnt_inv': '10000', 'total_pymnt': '8991.33', 'mths_since_rcnt_il': '', 'avg_cur_bal': '4349', 'dti_joint': '', 'mo_sin_rcnt_rev_tl_op': '23', 'collections_12_mths_ex_med': '0', 'delinq_amnt': '0', 'mort_acc': '0', 'desc': '', 'num_actv_bc_tl': '3', 'mths_since_last_major_derog': '54', 'tot_hi_cred_lim': '58486', 'dti': '15.55', 'emp_length': '7 years', 'total_bal_ex_mort': '39143', 'inq_last_6mths'

In [3]:
# Cleanup: keep only rows that carry a loan status and a known employment length.
# The status test runs first, so emp_length is only inspected on rows that passed it.
csv_data = [
    record
    for record in csv_data
    if record['loan_status'] is not None and record['emp_length'] != 'n/a'
]

# Show which verification categories occur in the remaining rows.
print({record['verification_status'] for record in csv_data})
# Sanity check: count of 'n/a' employment lengths still present (should print 0).
print(sum(1 for record in csv_data if record['emp_length'] == 'n/a'))

# Maps each loan_status label to the binary modelling target:
# 1 for charged-off, defaulted and 31-120 days late loans, 0 for every other status.
numeric_status = {
    'Charged Off': 1,
    'Current': 0,
    'Default': 1,
    'Fully Paid': 0,
    'In Grace Period': 0,
    'Late (16-30 days)': 0,
    'Late (31-120 days)': 1,
}


def _leading_int(text):
    """Return the integer that starts a label such as ' 36 months' or '10+ years'.

    Parsing the first whitespace-separated token (minus a trailing '+') avoids
    the truncation that fixed-width slicing causes for wider numbers.

    Raises:
        ValueError: if the label is empty or does not start with an integer.
    """
    tokens = text.split()
    if not tokens:
        raise ValueError('expected a label starting with a number, got %r' % (text,))
    return int(tokens[0].rstrip('+'))


def fix_data(row):
    """Convert the raw string fields of one loan record into model-ready values.

    The record is modified in place and also returned. Calling the function
    again on an already converted record leaves it unchanged.

    Args:
        row: dict for one loan. Reads 'loan_status', 'annual_inc',
            'annual_inc_joint', 'loan_amnt', 'emp_length', 'term' and
            'verification_status'.

    Returns:
        The same dict, with 'num_status' and 'income_verified' added, the
        income and amount fields converted to float, and 'emp_length' and
        'term' converted to int.

    Raises:
        KeyError: if a required field is missing or 'loan_status' is not a
            key of ``numeric_status``.
        ValueError: if a numeric field cannot be parsed.
    """
    row['num_status'] = numeric_status[row['loan_status']]

    # When the joint income field is empty, reuse the applicant's own income.
    if not row['annual_inc_joint']:
        row['annual_inc_joint'] = row['annual_inc']

    for key in ('annual_inc', 'annual_inc_joint', 'loan_amnt'):
        row[key] = float(row[key])

    # The isinstance guards skip values that an earlier call already converted.
    if not isinstance(row['emp_length'], int):
        if row['emp_length'] == '< 1 year':
            row['emp_length'] = 0
        else:
            row['emp_length'] = _leading_int(row['emp_length'])

    if not isinstance(row['term'], int):
        row['term'] = _leading_int(row['term'])

    # Only 'Verified' counts as verified; 'Not Verified' and 'Source Verified' map to 0.
    row['income_verified'] = 1 if row['verification_status'] == 'Verified' else 0

    return row

# Normalise every record; fix_data modifies the dict it is given, so the
# return value is not needed here.
for record in csv_data:
    fix_data(record)

# Show the distinct target values now present in the data.
print({record['num_status'] for record in csv_data})

# Feature columns, in the order they appear in the model's input matrix.
variables_of_interest = [
    # Annual income disclosed on the application.
    'annual_inc',
    # Total annual income of the co-borrowers.
    'annual_inc_joint',
    # Employment tenure in years.
    'emp_length',
    # Whether the income was verified.
    'income_verified',
    # Number of payments on the loan, in months (either 36 or 60).
    'term',
    # Listed amount of the loan applied for by the borrower; if the credit
    # department later reduces the loan amount, this value reflects it.
    'loan_amnt',
]

import numpy as np
# Feature matrix: one row per loan, columns ordered as in variables_of_interest.
X = np.array([
    [record[name] for name in variables_of_interest]
    for record in csv_data
])
print(X.shape)

# Target vector: the binary status assigned to each loan, in the same row order as X.
y = np.array([record['num_status'] for record in csv_data])
print(y.shape)

{'Not Verified', 'Source Verified', 'Verified'}
0
{0, 1}
(180246, 6)
(180246,)


In [4]:
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was deprecated and later removed, so
# importing from it fails on current scikit-learn. The fallback keeps the cell
# working on installations that predate model_selection.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler

# NOTE(review): test_size=0.8 holds out 80% of the rows for testing and trains
# on only 20% - confirm this is intended rather than test_size=0.2.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=0)

# Learn the scaling parameters from the training split only, then apply the
# same transformation to both splits.
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

In [5]:
# Class balance of the test split: number of positive labels, number of rows,
# and the share of rows labelled 0 (the accuracy of always predicting 0).
test_positives = sum(y_test)
test_total = len(y_test)
print(test_positives, test_total, 1 - float(test_positives / test_total))

20092 144197 0.8606628431936865


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Logistic regression on the standardised training features.
lr = LogisticRegression(C=10.0, random_state=0)
lr.fit(X_train_std, y_train)
y_pred = lr.predict(X_test_std)

# Number of predicted positives, number of predictions, share predicted as 0.
# In the recorded run no positives were predicted, so the accuracy below equals
# the share of 0 labels in the test split.
lr_positives = sum(y_pred)
lr_total = len(y_pred)
print(lr_positives, lr_total, 1 - float(lr_positives / lr_total))
accuracy_score(y_test, y_pred)

0 144197 1.0


0.86066284319368647

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Small random forest (5 entropy-criterion trees) trained on the same
# standardised features as the logistic regression.
forest = RandomForestClassifier(
    criterion='entropy',
    n_estimators=5,
    random_state=0,
    n_jobs=1,
)
forest.fit(X_train_std, y_train)
y_pred = forest.predict(X_test_std)

# Number of predicted positives, number of predictions, share predicted as 0.
forest_positives = sum(y_pred)
forest_total = len(y_pred)
print(forest_positives, forest_total, 1 - float(forest_positives / forest_total))
accuracy_score(y_test, y_pred)

10836 144197 0.9248528055368697


0.81082824191904135