In [3]:
import datetime
import pickle
import numpy as np
np.set_printoptions(suppress=True)
import pandas as pd
from datetime import datetime as dt
from io import BytesIO
import boto3
import multiprocessing as mp

%run src/columns.py
%run src/data-cleaning.py
%run src/feature-engineering.py
%run src/modeling.py
%run src/payments.py

# Load Data

In [2]:
# S3 object keys of the loan CSV files, in the order they are loaded and
# concatenated: the four "LoanStats3a-3d" files first, then one file per
# quarter from 2016Q1 through 2018Q4.
csv_file_names = (
    'LoanStats3a_securev1.csv',
    'LoanStats3b_securev1.csv',
    'LoanStats3c_securev1.csv',
    'LoanStats3d_securev1.csv',
    # 2016
    'LoanStats_securev1_2016Q1.csv',
    'LoanStats_securev1_2016Q2.csv',
    'LoanStats_securev1_2016Q3.csv',
    'LoanStats_securev1_2016Q4.csv',
    # 2017
    'LoanStats_securev1_2017Q1.csv',
    'LoanStats_securev1_2017Q2.csv',
    'LoanStats_securev1_2017Q3.csv',
    'LoanStats_securev1_2017Q4.csv',
    # 2018
    'LoanStats_securev1_2018Q1.csv',
    'LoanStats_securev1_2018Q2.csv',
    'LoanStats_securev1_2018Q3.csv',
    'LoanStats_securev1_2018Q4.csv',
)

In [3]:
def load_all_loan_data_from_s3(csv_files, columns, number_of_rows=None,
                               bucket='loan-analysis-data'):
    '''
    Download several loan CSV files from S3 and stack them into one DataFrame.

    Parameters
    ----------
    csv_files : iterable of str
        S3 object keys of the CSV files, in the order they should be
        concatenated.
    columns : list-like or callable
        Forwarded to ``pd.read_csv`` as ``usecols``.
    number_of_rows : int, optional
        Forwarded to ``pd.read_csv`` as ``nrows``; the limit applies to each
        file separately. ``None`` (default) reads every row.
    bucket : str, optional
        Name of the S3 bucket holding the files.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all files. Each file keeps its own row
        index, so index labels repeat across files.

    Raises
    ------
    ValueError
        If ``csv_files`` yields no file names.
    '''
    # One client serves every download; it does not need to be rebuilt per file.
    s3 = boto3.client('s3')

    loan_data = []
    for filename in csv_files:
        obj = s3.get_object(Bucket=bucket, Key=filename)
        raw_bytes = obj['Body'].read()
        # header=1: column names are taken from the second line of the file;
        # the line before it is skipped.
        frame = pd.read_csv(BytesIO(raw_bytes), header=1, low_memory=False,
                            na_values='n/a', usecols=columns,
                            nrows=number_of_rows)
        loan_data.append(frame)

    if not loan_data:
        # pd.concat([]) would also raise ValueError, but with a message that
        # does not point at the real cause.
        raise ValueError('csv_files is empty: there is no loan data to load')

    return pd.concat(loan_data)

In [4]:
# Load every file in csv_file_names. number_of_rows is applied per file
# (it is passed to read_csv as nrows), so this reads at most 5000 rows from
# each CSV rather than 5000 rows in total.
# NOTE(review): columns_to_use is not defined in any cell visible here;
# presumably it comes from src/columns.py via %run -- confirm.
loans = load_all_loan_data_from_s3(csv_file_names, columns_to_use, number_of_rows=5000)

In [5]:
loans.head()

Unnamed: 0,id,loan_amnt,term,int_rate,installment,grade,emp_length,home_ownership,annual_inc,verification_status,...,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
0,1077501,5000,36 months,10.65%,162.87,B,10+ years,RENT,24000.0,Verified,...,,,,,0,0,,,,
1,1077430,2500,60 months,15.27%,59.83,C,< 1 year,RENT,30000.0,Source Verified,...,,,,,0,0,,,,
2,1077175,2400,36 months,15.96%,84.33,C,10+ years,RENT,12252.0,Not Verified,...,,,,,0,0,,,,
3,1076863,10000,36 months,13.49%,339.31,C,10+ years,RENT,49200.0,Source Verified,...,,,,,0,0,,,,
4,1075358,3000,60 months,12.69%,67.79,B,1 year,RENT,80000.0,Source Verified,...,,,,,0,0,,,,


# Clean Data

In [6]:
# Cleaning / feature-engineering pipeline. Every helper is called with the
# current `loans` frame and its return value is bound back to `loans`, so the
# statements have to run in exactly this order.
# The helpers are not defined in this notebook; presumably they come from the
# src/*.py scripts loaded with %run in the first cell -- TODO confirm which
# script owns which helper.
# NOTE(review): `loans` is rebound and mutated in place here, so re-running
# this cell without re-running the load cell feeds already-processed data
# back through the helpers.
#loans = drop_loan_status(loans)
loans = drop_joint_applicant_loans(loans)
loans = fix_rate_cols(loans)
# Rows with no issue date are removed before fix_date_cols runs
# (presumably the date handling cannot cope with missing values -- confirm).
loans.dropna(subset=['issue_d'], inplace=True)
loans = fix_date_cols(loans)
loans = exclude_loans_before_2010(loans)
loans = clean_loan_term_col(loans)
loans = only_include_36_month_loans(loans)
loans = clean_employment_length(loans)
# The missing-data boolean columns are created immediately before the fill;
# presumably so that missingness stays recoverable once fill_nas has run with
# the value -99 -- confirm against the helpers.
loans = create_missing_data_boolean_columns(loans)
loans = fill_nas(loans, value=-99)
loans = add_issue_date_and_month(loans)
loans = add_supplemental_rate_data(loans)
loans = create_rate_difference_cols(loans)
loans = create_months_since_earliest_cl_col(loans)
#loans = create_loan_life_months_col(loans)
loans = change_data_types(loans)
loans = create_dummy_cols(loans)
loans = drop_unnecessary_cols(loans)
# Order the rows by issue date, then make the loan `id` column the index.
loans.sort_values(by='issue_d', inplace=True)
loans.set_index('id', inplace=True)

In [7]:
loans.head()
# TODO: Save the cleaned data as a pickle.
# TODO: Create a separate dataframe for EDA.

Unnamed: 0_level_0,loan_amnt,int_rate,installment,emp_length,annual_inc,issue_d,loan_status,dti,delinq_2yrs,fico_range_low,...,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
994821,20000,7.9,625.809998,10.0,150000.0,2011-10-01,Fully Paid,4.09,0,735,...,0,0,0,0,0,0,0,0,0,0
1001335,4200,9.91,135.350006,4.0,42000.0,2011-10-01,Charged Off,20.83,1,695,...,0,0,0,0,0,1,0,0,0,0
1001320,10000,7.9,312.910004,2.0,57000.0,2011-10-01,Fully Paid,13.09,0,720,...,0,0,0,0,0,0,0,0,0,0
1001516,2500,8.9,79.389999,2.0,31000.0,2011-10-01,Fully Paid,22.49,0,715,...,0,0,0,0,0,1,0,0,0,0
1001151,3500,10.65,114.010002,10.0,55000.0,2011-10-01,Charged Off,9.03,3,685,...,0,0,0,0,0,0,0,1,0,0


# Split Training/Testing

Now that the data has been cleaned, I need to split it into training and testing sets and then figure out how to reuse my old ROI calculation functions.

In [9]:
# Split the cleaned loans into a training frame and a testing frame.
# get_training_and_testing_data is defined outside this notebook (presumably
# in src/modeling.py). NOTE(review): it presumably partitions on issue_d
# around split_date -- confirm which side receives loans issued exactly on
# 2016-04-01.
df_train, df_test = get_training_and_testing_data(loans, split_date='2016-04-01')

In [10]:
df_train.head()

Unnamed: 0_level_0,loan_amnt,int_rate,installment,emp_length,annual_inc,issue_d,loan_status,dti,delinq_2yrs,fico_range_low,...,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
994821,20000,7.9,625.809998,10.0,150000.0,2011-10-01,Fully Paid,4.09,0,735,...,0,0,0,0,0,0,0,0,0,0
1001335,4200,9.91,135.350006,4.0,42000.0,2011-10-01,Charged Off,20.83,1,695,...,0,0,0,0,0,1,0,0,0,0
1001320,10000,7.9,312.910004,2.0,57000.0,2011-10-01,Fully Paid,13.09,0,720,...,0,0,0,0,0,0,0,0,0,0
1001516,2500,8.9,79.389999,2.0,31000.0,2011-10-01,Fully Paid,22.49,0,715,...,0,0,0,0,0,1,0,0,0,0
1001151,3500,10.65,114.010002,10.0,55000.0,2011-10-01,Charged Off,9.03,3,685,...,0,0,0,0,0,0,0,1,0,0


# Payments Data

Now we need to read in the payments data to start calculating ROI.

In [4]:
def load_data_from_s3(filename, format='csv', bucket='loan-analysis-data',
                      **reader_kwargs):
    '''
    Download one object from S3 and load it into a DataFrame.

    The whole object is read into memory before it is parsed, so large files
    can exhaust RAM. For CSV input, pass ``nrows``, ``usecols`` or ``dtype``
    through ``reader_kwargs`` to keep the parsed frame small.

    Parameters
    ----------
    filename : str
        S3 object key.
    format : {'csv', 'pkl.bz2'}
        How to decode the object: a plain CSV file, or a bz2-compressed
        pickle.
    bucket : str, optional
        Name of the S3 bucket holding the object.
    **reader_kwargs
        Extra keyword arguments forwarded to ``pd.read_csv`` (``low_memory``
        defaults to False) or ``pd.read_pickle`` (``compression`` defaults
        to 'bz2').

    Returns
    -------
    The object returned by the pandas reader (a DataFrame for CSV input).

    Raises
    ------
    ValueError
        If ``format`` is not one of the supported values.
    '''
    # Validate before downloading: previously an unknown format fetched the
    # whole object and then failed with UnboundLocalError on `return df`.
    if format not in ('csv', 'pkl.bz2'):
        raise ValueError(
            "Unsupported format {!r}; expected 'csv' or 'pkl.bz2'".format(format))

    s3 = boto3.client('s3')
    obj = s3.get_object(Bucket=bucket, Key=filename)
    f = BytesIO(obj['Body'].read())

    if format == 'csv':
        reader_kwargs.setdefault('low_memory', False)
        return pd.read_csv(f, **reader_kwargs)

    # SECURITY: unpickling can execute arbitrary code. Only load pickles from
    # buckets and keys you control.
    reader_kwargs.setdefault('compression', 'bz2')
    return pd.read_pickle(f, **reader_kwargs)

In [5]:
# NOTE(review): the recorded run of this cell ended in MemoryError (see the
# output below). load_data_from_s3 reads the entire S3 object into memory and
# then parses all of it, so the machine needs room for both the raw bytes and
# the resulting frame.
df_payments = load_data_from_s3('PMTHIST_INVESTOR_201904.csv', format='csv')

MemoryError: 

In [None]:
df_payments.head()

In [6]:
# Second attempt with a different payments file (201808 instead of 201904).
# NOTE(review): the recorded run of this cell also ended in MemoryError, for
# the same reason -- the whole file is loaded and parsed in memory.
df_payments = load_data_from_s3('PMTHIST_INVESTOR_201808.csv', format='csv')

MemoryError: 