In [1]:
import datetime
import pickle
import numpy as np
np.set_printoptions(suppress=True)
import pandas as pd
from datetime import datetime as dt
from io import BytesIO
import boto3
import multiprocessing as mp

%run src/columns.py
%run src/data-cleaning.py
%run src/feature-engineering.py
%run src/modeling.py
%run src/payments.py

# Load Data

In [2]:
csv_file_names = ('LoanStats3a_securev1.csv', 'LoanStats3b_securev1.csv', 'LoanStats3c_securev1.csv', 'LoanStats3d_securev1.csv',
                  'LoanStats_securev1_2016Q1.csv', 'LoanStats_securev1_2016Q2.csv', 'LoanStats_securev1_2016Q3.csv',
                  'LoanStats_securev1_2016Q4.csv', 'LoanStats_securev1_2017Q1.csv', 'LoanStats_securev1_2017Q2.csv', 
                  'LoanStats_securev1_2017Q3.csv', 'LoanStats_securev1_2017Q4.csv', 'LoanStats_securev1_2018Q1.csv',
                  'LoanStats_securev1_2018Q2.csv', 'LoanStats_securev1_2018Q3.csv', 'LoanStats_securev1_2018Q4.csv')

In [3]:
def load_loan_data_from_s3(csv_files, columns, number_of_rows=None, bucket='loan-analysis-data'):
    loan_data = []
    for filename in csv_files:
        s3 = boto3.client('s3')
        obj = s3.get_object(Bucket=bucket, Key=filename)
        data = obj['Body'].read()
        f = BytesIO(data)
        data = pd.read_csv(f, header=1, low_memory=False, na_values='n/a',
                           usecols=columns, nrows=number_of_rows) 
        loan_data.append(data)
    loans = pd.concat(loan_data)
    # Loan IDs are unique and we can access specific loans much faster by setting them as the index.
    #loans.set_index('id', inplace=True)
    return loans

In [4]:
raw_loans = load_loan_data_from_s3(csv_file_names, columns_to_use, number_of_rows=500)

In [5]:
raw_loans.head()

Unnamed: 0,id,loan_amnt,term,int_rate,installment,grade,emp_length,home_ownership,annual_inc,verification_status,...,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
0,1077501,5000,36 months,10.65%,162.87,B,10+ years,RENT,24000.0,Verified,...,,,,,0,0,,,,
1,1077430,2500,60 months,15.27%,59.83,C,< 1 year,RENT,30000.0,Source Verified,...,,,,,0,0,,,,
2,1077175,2400,36 months,15.96%,84.33,C,10+ years,RENT,12252.0,Not Verified,...,,,,,0,0,,,,
3,1076863,10000,36 months,13.49%,339.31,C,10+ years,RENT,49200.0,Source Verified,...,,,,,0,0,,,,
4,1075358,3000,60 months,12.69%,67.79,B,1 year,RENT,80000.0,Source Verified,...,,,,,0,0,,,,


# Clean Data

In [6]:
def clean_and_prepare_raw_data_for_model(df):
    df = drop_loan_status(df)
    df = drop_joint_applicant_loans(df)
    df = fix_rate_cols(df)
    df.dropna(subset=['issue_d'], inplace=True)
    df = fix_date_cols(df)
    df.sort_values(by='issue_d', inplace=True)
    df = exclude_loans_before_2010(df)
    df = clean_loan_term_col(df)
    df = only_include_36_month_loans(df)
    df = clean_employment_length(df)
    # I doubt we need missing data boolean columns for tree models.
    df = create_missing_data_boolean_columns(df)
    df = fill_nas(df, value=-99)
    #df = add_issue_date_and_month(df) # Ditch this?
    df = add_supplemental_rate_data(df)
    df = create_rate_difference_cols(df)
    df = create_months_since_earliest_cl_col(df)
    #df = create_loan_life_months_col(df)
    df = change_data_types(df)
    df = create_dummy_cols(df)
    df = drop_unnecessary_cols(df)
    df.set_index('id', inplace=True)
    
    return df

In [7]:
cleaned_loans = clean_and_prepare_raw_data_for_model(raw_loans)
cleaned_loans.head()

Unnamed: 0_level_0,loan_amnt,int_rate,installment,emp_length,annual_inc,issue_d,dti,delinq_2yrs,fico_range_low,fico_range_high,...,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1077501,5000,10.65,162.869995,10.0,24000.0,2011-12-01,27.65,0,735,739,...,0,0,0,0,0,0,0,0,0,0
1065244,6000,14.65,206.970001,7.0,34000.0,2011-12-01,21.389999,0,670,674,...,0,0,0,0,0,0,0,0,0,0
1064873,14400,12.69,483.049988,9.0,40000.0,2011-12-01,20.16,0,695,699,...,0,0,0,0,0,0,0,0,0,0
1065254,24000,8.9,762.080017,10.0,116400.0,2011-12-01,3.94,0,765,769,...,0,0,0,0,0,0,0,0,0,0
1065103,10000,11.71,330.76001,5.0,32000.0,2011-12-01,21.83,0,695,699,...,0,0,0,0,0,0,0,0,0,0


# Split Training/Testing

Now that the data has been cleaned I need to split it up into training and testing data then figure out how to use my old ROI calculation functions.

In [8]:
df_train, df_test = split_training_and_testing_data(cleaned_loans, split_date='2016-04-01')

In [9]:
df_train.head()

Unnamed: 0_level_0,loan_amnt,int_rate,installment,emp_length,annual_inc,issue_d,dti,delinq_2yrs,fico_range_low,fico_range_high,...,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1077501,5000,10.65,162.869995,10.0,24000.0,2011-12-01,27.65,0,735,739,...,0,0,0,0,0,0,0,0,0,0
1065244,6000,14.65,206.970001,7.0,34000.0,2011-12-01,21.389999,0,670,674,...,0,0,0,0,0,0,0,0,0,0
1064873,14400,12.69,483.049988,9.0,40000.0,2011-12-01,20.16,0,695,699,...,0,0,0,0,0,0,0,0,0,0
1065254,24000,8.9,762.080017,10.0,116400.0,2011-12-01,3.94,0,765,769,...,0,0,0,0,0,0,0,0,0,0
1065103,10000,11.71,330.76001,5.0,32000.0,2011-12-01,21.83,0,695,699,...,0,0,0,0,0,0,0,0,0,0


In [10]:
training_loan_ids = tuple(df_train.index)

# Payments Data

Now we need to read in the payments data to start calculating ROI.

In [11]:
df_payments = load_raw_payments_data_from_s3('PMTHIST_INVESTOR_201904.csv')

In [12]:
df_payments.head()

Unnamed: 0,LOAN_ID,RECEIVED_D,RECEIVED_AMT_INVESTORS,PBAL_END_PERIOD_INVESTORS,IssuedDate
0,54734,SEP2009,632.771017,18636.4093,AUG2009
1,54734,OCT2009,632.771017,18188.363925,AUG2009
2,54734,NOV2009,632.771017,17735.877487,AUG2009
3,54734,DEC2009,632.771017,17278.905966,AUG2009
4,54734,JAN2010,632.771017,16817.404904,AUG2009


In [14]:
df_payments.to_pickle('data/raw_payments_data.pkl.bz2', compression='bz2')

In [13]:
df_payments_clean = get_cleaned_payment_history_data(df_payments)
df_payments_clean.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['mths_since_issue'] = df['mths_since_issue'].astype('uint8')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['RECEIVED_AMT_INVESTORS'] = df['RECEIVED_AMT_INVESTORS'].astype('float32')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documenta

Unnamed: 0_level_0,Unnamed: 1_level_0,RECEIVED_AMT_INVESTORS,PBAL_END_PERIOD_INVESTORS,mths_since_issue
RECEIVED_D,LOAN_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2007-07-01,72176,7.189307,219.55983,1
2007-07-01,73582,7.289357,219.637436,1
2007-07-01,74505,7.25625,219.611313,1
2007-07-01,77792,3.975833,121.962997,1
2007-07-01,81085,9.03231,268.539795,1


In [15]:
df_payments_clean.to_pickle('data/clean_payments_data.pkl.bz2', compression='bz2')

# Start Here Tonight

In [21]:
df_payments_clean = pd.read_pickle('data/clean_payments_data.pkl.bz2', compression='bz2')

Unnamed: 0_level_0,Unnamed: 1_level_0,RECEIVED_AMT_INVESTORS,PBAL_END_PERIOD_INVESTORS,mths_since_issue
RECEIVED_D,LOAN_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2007-07-01,72176,7.189307,219.55983,1
2007-07-01,73582,7.289357,219.637436,1
2007-07-01,74505,7.25625,219.611313,1
2007-07-01,77792,3.975833,121.962997,1
2007-07-01,81085,9.03231,268.539795,1


In [28]:
df_payments_clean.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,RECEIVED_AMT_INVESTORS,PBAL_END_PERIOD_INVESTORS,mths_since_issue
RECEIVED_D,LOAN_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2007-07-01,72176,7.189307,219.55983,1
2007-07-01,73582,7.289357,219.637436,1
2007-07-01,74505,7.25625,219.611313,1
2007-07-01,77792,3.975833,121.962997,1
2007-07-01,81085,9.03231,268.539795,1


In [30]:
df_payments_clean.Multi

AttributeError: 'DataFrame' object has no attribute 'dropindex'

In [25]:
def get_training_payments(df_payments_clean, training_loan_ids):
    cols = ('RECEIVED_AMT_INVESTORS', 'mths_since_issue')
    training_loan_payments = df_payments_clean.loc[pd.IndexSlice[:, training_loan_ids], :][cols]
    training_loan_payments = df_payments_clean.droplevel(0)
    return training_loan_payments

In [26]:
training_loan_ids = (72176, 73582)

training_loan_payments = get_relevant_payments(df_payments_clean, training_loan_ids)
training_loan_payments

KeyError: ('RECEIVED_AMT_INVESTORS', 'mths_since_issue')

In [None]:
def get_one_loan_payment_data(payments, loan_id):
    try:
        return payments[payments['LOAN_ID'] == loan_id][['RECEIVED_AMT_INVESTORS', 'mths_since_issue']]
    except:
        return pd.DataFrame()

In [32]:
get_one_loan_payment_data(x, 72176)

**From Local**

In [None]:
def get_one_loan_payment_data(payments_training_loans, loan_id):
    try:
        return payments_training_loans.loc[loan_id]
    except:
        return pd.DataFrame()
    
def calculative_npv_payments(loans_payments, r_guess):
    payments = loans_payments.RECEIVED_AMT_INVESTORS
    months = loans_payments.mths_since_issue
    return sum(payments/(1+r_guess)**(months/12))

def adjust_estimated_roi(roi_guess, roi_min, roi_max, npv):
    if npv > 0:
        new_guess = (roi_guess + roi_min)/2
        new_min = roi_min
        new_max = roi_guess
    elif npv < 0:
        new_guess = (roi_guess + roi_max)/2
        new_min = roi_guess
        new_max = roi_max
    else:
        return roi_guess
    
    return (new_guess, new_min, new_max)

def get_roi_for_loan_id(loan_id):
    loan_id = int(loan_id)
    loan_size = loans['loan_amnt'].loc[loan_id]
    loan_payments = get_one_loan_payment_data(payments, loan_id)
    
    if loan_payments.empty:
        return -100
    
    r_guess = .10
    r_min = -.999
    r_max = .50
    
    for _ in range(15):
        npv_payments = calculative_npv_payments(loan_payments, r_guess)
        npv = loan_size - npv_payments
        r_guess, r_min, r_max = adjust_estimated_roi(r_guess, r_min, r_max, npv)
    return r_guess*100

def get_rois_for_loans(loan_ids):
    return {loan_id:get_roi_for_loan_id(loan_id)for loan_id in loan_ids}