In [1]:
import datetime
import pickle
import numpy as np
np.set_printoptions(suppress=True)
import pandas as pd
from datetime import datetime as dt
from io import BytesIO
import boto3
import multiprocessing as mp

%run src/columns.py
%run src/data-cleaning.py
%run src/feature-engineering.py
%run src/modeling.py
%run src/payments.py

# Load Data

In [2]:
# S3 keys of the loan-stat exports: the four 'LoanStats3a'..'LoanStats3d'
# files followed by one file per quarter from 2016Q1 through 2018Q4.
csv_file_names = tuple(
    ['LoanStats3{}_securev1.csv'.format(suffix) for suffix in 'abcd']
    + ['LoanStats_securev1_{}Q{}.csv'.format(year, quarter)
       for year in (2016, 2017, 2018)
       for quarter in (1, 2, 3, 4)]
)

In [3]:
def load_all_loan_data_from_s3(csv_files, columns, number_of_rows=None,
                               bucket='loan-analysis-data', s3_client=None):
    '''
    Download several loan CSV files from S3 and stack them into one DataFrame.

    Parameters
    ----------
    csv_files : iterable of str
        S3 object keys of the CSV files, in the order they are concatenated.
    columns : list-like or callable
        Forwarded to ``pd.read_csv(usecols=...)`` to pick the columns to keep.
    number_of_rows : int, optional
        Forwarded to ``pd.read_csv(nrows=...)``. The limit applies to each
        file separately, not to the combined result. ``None`` reads all rows.
    bucket : str, optional
        Name of the S3 bucket that holds the files.
    s3_client : object, optional
        Client exposing ``get_object(Bucket=..., Key=...)``. When omitted, a
        single client is created with ``boto3.client('s3')`` and reused for
        every file.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all files. Each file keeps its own row
        index, so index labels repeat across files.

    Raises
    ------
    ValueError
        If ``csv_files`` is empty (raised by ``pd.concat``).
    '''
    # Create the client once instead of once per file.
    if s3_client is None:
        s3_client = boto3.client('s3')

    frames = []
    for filename in csv_files:
        response = s3_client.get_object(Bucket=bucket, Key=filename)
        buffer = BytesIO(response['Body'].read())
        # header=1: the first line of each file is skipped and the second
        # line supplies the column names.
        frame = pd.read_csv(buffer, header=1, low_memory=False, na_values='n/a',
                            usecols=columns, nrows=number_of_rows)
        frames.append(frame)
    return pd.concat(frames)

In [15]:
# number_of_rows is applied to each CSV separately, so this reads at most
# 5000 rows per file rather than 5000 rows in total.
# NOTE(review): columns_to_use is not defined in this notebook; presumably it
# comes from src/columns.py (loaded via %run) - confirm.
loans = load_all_loan_data_from_s3(csv_file_names, columns_to_use, number_of_rows=5000)

In [16]:
# Preview the first rows of the raw (not yet cleaned) loan data.
loans.head()

Unnamed: 0,id,loan_amnt,term,int_rate,installment,grade,emp_length,home_ownership,annual_inc,verification_status,...,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
0,1077501,5000,36 months,10.65%,162.87,B,10+ years,RENT,24000.0,Verified,...,,,,,0,0,,,,
1,1077430,2500,60 months,15.27%,59.83,C,< 1 year,RENT,30000.0,Source Verified,...,,,,,0,0,,,,
2,1077175,2400,36 months,15.96%,84.33,C,10+ years,RENT,12252.0,Not Verified,...,,,,,0,0,,,,
3,1076863,10000,36 months,13.49%,339.31,C,10+ years,RENT,49200.0,Source Verified,...,,,,,0,0,,,,
4,1075358,3000,60 months,12.69%,67.79,B,1 year,RENT,80000.0,Source Verified,...,,,,,0,0,,,,


# Clean Data

In [17]:
# Cleaning and feature-engineering pipeline. Every step receives the frame
# produced by the previous one, so the order below matters. The helper
# functions are defined outside this notebook (presumably in the src/*.py
# scripts loaded with %run).
loans = (
    loans
    .pipe(drop_loan_status)
    .pipe(drop_joint_applicant_loans)
    .pipe(fix_rate_cols)
    # Rows without an issue date are dropped before the date columns are fixed.
    .dropna(subset=['issue_d'])
    .pipe(fix_date_cols)
    .pipe(exclude_loans_before_2010)
    .pipe(clean_loan_term_col)
    .pipe(only_include_36_month_loans)
    .pipe(clean_employment_length)
    .pipe(create_missing_data_boolean_columns)
    .pipe(fill_nas, value=-99)
    # Disabled step: .pipe(add_issue_date_and_month)  # Ditch this?
    .pipe(add_supplemental_rate_data)
    .pipe(create_rate_difference_cols)
    .pipe(create_months_since_earliest_cl_col)
    # Disabled step: .pipe(create_loan_life_months_col)
    .pipe(change_data_types)
    .pipe(create_dummy_cols)
    .pipe(drop_unnecessary_cols)
    # Order by issue date, then key the rows by loan id.
    .sort_values(by='issue_d')
    .set_index('id')
)

In [18]:
# Preview the cleaned loans (indexed by loan id after the cleaning cell).
loans.head()
# TODO: Save the clean data as a pickle.
# TODO: Create separate dataframe for EDA.

Unnamed: 0_level_0,loan_amnt,int_rate,installment,emp_length,annual_inc,issue_d,dti,delinq_2yrs,fico_range_low,fico_range_high,...,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
994821,20000,7.9,625.809998,10.0,150000.0,2011-10-01,4.09,0,735,739,...,0,0,0,0,0,0,0,0,0,0
1001335,4200,9.91,135.350006,4.0,42000.0,2011-10-01,20.83,1,695,699,...,0,0,0,0,0,1,0,0,0,0
1001320,10000,7.9,312.910004,2.0,57000.0,2011-10-01,13.09,0,720,724,...,0,0,0,0,0,0,0,0,0,0
1001516,2500,8.9,79.389999,2.0,31000.0,2011-10-01,22.49,0,715,719,...,0,0,0,0,0,1,0,0,0,0
1001151,3500,10.65,114.010002,10.0,55000.0,2011-10-01,9.03,3,685,689,...,0,0,0,0,0,0,0,1,0,0


# Split Training/Testing

Now that the data has been cleaned, I need to split it into training and testing sets and then figure out how to use my old ROI calculation functions.

In [22]:
# The split date is hard-coded here. How rows are assigned relative to that
# date is decided inside get_training_and_testing_data, which is defined
# outside this notebook (presumably src/modeling.py - confirm).
df_train, df_test = get_training_and_testing_data(loans, split_date='2016-04-01')

In [23]:
# Preview the first rows of the training split.
df_train.head()

Unnamed: 0_level_0,loan_amnt,int_rate,installment,emp_length,annual_inc,issue_d,dti,delinq_2yrs,fico_range_low,fico_range_high,...,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
994821,20000,7.9,625.809998,10.0,150000.0,2011-10-01,4.09,0,735,739,...,0,0,0,0,0,0,0,0,0,0
1001335,4200,9.91,135.350006,4.0,42000.0,2011-10-01,20.83,1,695,699,...,0,0,0,0,0,1,0,0,0,0
1001320,10000,7.9,312.910004,2.0,57000.0,2011-10-01,13.09,0,720,724,...,0,0,0,0,0,0,0,0,0,0
1001516,2500,8.9,79.389999,2.0,31000.0,2011-10-01,22.49,0,715,719,...,0,0,0,0,0,1,0,0,0,0
1001151,3500,10.65,114.010002,10.0,55000.0,2011-10-01,9.03,3,685,689,...,0,0,0,0,0,0,0,1,0,0


In [25]:
# Snapshot the training frame's index labels as an immutable tuple.
# NOTE(review): the cleaning cell sets 'id' as the index, so these are
# presumably loan ids - assumes get_training_and_testing_data keeps that
# index; confirm.
training_loan_ids = tuple(df_train.index)

# Payments Data

Now we need to read in the payments data to start calculating ROI.

In [11]:
def load_data_from_s3(filename, format='csv', columns_to_use=None,
                      bucket='loan-analysis-data', s3_client=None):
    '''
    Download one object from S3 and load it into a DataFrame.

    Parameters
    ----------
    filename : str
        S3 object key to download.
    format : {'csv', 'pkl.bz2'}, optional
        How to parse the downloaded bytes: ``'csv'`` uses ``pd.read_csv``;
        ``'pkl.bz2'`` uses ``pd.read_pickle`` with bz2 compression.
    columns_to_use : list-like or callable, optional
        Forwarded to ``pd.read_csv(usecols=...)``. Only used for
        ``format='csv'``; it is ignored for pickles.
    bucket : str, optional
        Name of the S3 bucket that holds the object.
    s3_client : object, optional
        Client exposing ``get_object(Bucket=..., Key=...)``. When omitted, a
        client is created with ``boto3.client('s3')``.

    Returns
    -------
    pandas.DataFrame (or whatever object the pickle contains)

    Raises
    ------
    ValueError
        If ``format`` is not one of the supported values. This is checked
        before anything is downloaded.
    '''
    supported_formats = ('csv', 'pkl.bz2')
    # Fail fast: previously an unknown format surfaced as an UnboundLocalError
    # on `df`, and only after the whole object had been downloaded.
    if format not in supported_formats:
        raise ValueError(
            'Unsupported format {!r}; expected one of {}'.format(format, supported_formats)
        )

    if s3_client is None:
        s3_client = boto3.client('s3')
    response = s3_client.get_object(Bucket=bucket, Key=filename)
    buffer = BytesIO(response['Body'].read())

    if format == 'csv':
        return pd.read_csv(buffer, low_memory=False, usecols=columns_to_use)

    # SECURITY: unpickling can execute arbitrary code. Only load pickles from
    # a bucket whose contents are trusted.
    return pd.read_pickle(buffer, compression='bz2')

In [12]:
# Only these payment-history columns are read from the CSV.
payment_history_columns = [
    'LOAN_ID',
    'RECEIVED_D',
    'PBAL_END_PERIOD_INVESTORS',
    'RECEIVED_AMT_INVESTORS',
    'IssuedDate',
]
df_payments = load_data_from_s3(
    'PMTHIST_INVESTOR_201904.csv',
    format='csv',
    columns_to_use=payment_history_columns,
)

In [13]:
# Preview the raw payment history as loaded from the CSV.
df_payments.head()

Unnamed: 0,LOAN_ID,RECEIVED_D,RECEIVED_AMT_INVESTORS,PBAL_END_PERIOD_INVESTORS,IssuedDate
0,54734,SEP2009,632.771017,18636.4093,AUG2009
1,54734,OCT2009,632.771017,18188.363925,AUG2009
2,54734,NOV2009,632.771017,17735.877487,AUG2009
3,54734,DEC2009,632.771017,17278.905966,AUG2009
4,54734,JAN2010,632.771017,16817.404904,AUG2009


In [14]:
# NOTE: this rebinds df_payments to the cleaned frame, so re-running the cell
# would pass already-cleaned data back into the helper (presumably not
# supported - confirm). Re-run the load cell first if this needs repeating.
df_payments = get_cleaned_payment_history_data(df_payments)
df_payments.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,PBAL_END_PERIOD_INVESTORS,RECEIVED_AMT_INVESTORS,mths_since_issue
RECEIVED_D,LOAN_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2007-07-01,72176,219.55983,7.189307,1
2007-07-01,73582,219.637436,7.289357,1
2007-07-01,74505,219.611313,7.25625,1
2007-07-01,77792,121.962997,3.975833,1
2007-07-01,81085,268.539795,9.03231,1


In [30]:
# Load payments data from a local pickle file. Its first rows match the
# cleaned df_payments preview, so this is presumably a saved copy of that
# frame - confirm.
x = pd.read_pickle('data/cleaned_payments_data.pkl.bz2')
x.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,PBAL_END_PERIOD_INVESTORS,RECEIVED_AMT_INVESTORS,mths_since_issue
RECEIVED_D,LOAN_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2007-07-01,72176,219.55983,7.189307,1
2007-07-01,73582,219.637436,7.289357,1
2007-07-01,74505,219.611313,7.25625,1
2007-07-01,77792,121.962997,3.975833,1
2007-07-01,81085,268.539795,9.03231,1


In [31]:
def get_one_loan_payment_data(payments, loan_id):
    '''
    Return the payment rows that belong to a single loan.

    ``LOAN_ID`` is looked up as an ordinary column first and, failing that,
    as a level of the index. The index form is needed because the cleaned
    payments frame carries ``LOAN_ID`` in its index, where
    ``payments['LOAN_ID']`` raises ``KeyError``.

    Parameters
    ----------
    payments : pandas.DataFrame
        Payment history containing the columns ``RECEIVED_AMT_INVESTORS`` and
        ``mths_since_issue``, with ``LOAN_ID`` as a column or an index level.
    loan_id : scalar
        Loan identifier to select.

    Returns
    -------
    pandas.DataFrame
        The matching rows, restricted to ``RECEIVED_AMT_INVESTORS`` and
        ``mths_since_issue``. Empty (but with both columns) when the loan has
        no rows.

    Raises
    ------
    KeyError
        If ``LOAN_ID`` is neither a column nor an index level, or if a
        required column is missing. These schema problems used to be
        swallowed by a bare ``except`` and returned as an empty frame, which
        hid the failure.
    '''
    if 'LOAN_ID' in payments.columns:
        loan_ids = payments['LOAN_ID']
    elif 'LOAN_ID' in payments.index.names:
        loan_ids = payments.index.get_level_values('LOAN_ID')
    else:
        raise KeyError("'LOAN_ID' is neither a column nor an index level of payments")

    return payments.loc[loan_ids == loan_id, ['RECEIVED_AMT_INVESTORS', 'mths_since_issue']]

In [32]:
# Try the lookup with a loan id that appears in the x.head() preview above.
get_one_loan_payment_data(x, 72176)

In [33]:
# LOAN_ID is a level of x's index (see the x.head() output above), not a
# column, so column-style access raises KeyError. The level's values can be
# read with x.index.get_level_values('LOAN_ID') instead.
x['LOAN_ID']

KeyError: 'LOAN_ID'