# Notebook for Training Models and Running Simulation

In [69]:
import pandas as pd
from xgboost import XGBRegressor

import shap
shap.initjs()

%run src/columns.py
%run src/data-cleaning.py
%run src/feature-engineering.py
%run src/modeling.py
%run src/payments.py
%run src/portfolio.py

ModuleNotFoundError: No module named 'shap'

In [22]:
# Training loans; per the file name this frame already carries the `roi` column
# that is used as the regression target further down.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files
# that were produced by this project.
df_train = pd.read_pickle('data/df_training_loans_with_roi.pkl.bz2', compression='bz2')

In [28]:
# Loans to be scored by the trained models. The preview below shows numeric
# columns plus 0/1 `purpose_*` indicator columns, indexed by loan id.
df_test = pd.read_pickle('data/df_testing_loans_cleaned.pkl.bz2', compression='bz2')

In [29]:
# issue_d is kept on df_test: it is only dropped from the feature matrix
# (X_test) further down, because df_test is later passed whole to
# create_dataframe_for_simulation, whose output is indexed by issue_d.
df_test.head()

Unnamed: 0_level_0,loan_amnt,int_rate,installment,emp_length,annual_inc,issue_d,dti,delinq_2yrs,fico_range_low,fico_range_high,...,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
78219622,15000.0,8.39,472.75,10.0,170000.0,2016-05-01,10.81,0,675.0,679.0,...,0,0,0,0,0,0,0,0,0,0
78608726,2100.0,15.31,73.120003,5.0,19600.0,2016-05-01,14.28,3,660.0,664.0,...,0,0,0,0,1,0,0,0,0,0
77180977,3600.0,7.89,112.629997,8.0,31300.0,2016-05-01,23.200001,0,670.0,674.0,...,0,0,0,0,0,0,0,0,0,0
77142437,20000.0,11.99,664.200012,8.0,76000.0,2016-05-01,16.07,6,695.0,699.0,...,0,0,0,0,0,0,0,0,0,0
77901770,10000.0,9.16,318.75,-99.0,125000.0,2016-05-01,6.79,0,690.0,694.0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Payment records; the preview below shows one row per (RECEIVED_D, LOAN_ID).
# Not referenced again in the visible part of this notebook.
df_payments_all = pd.read_pickle('data/df_payments_data_all_cleaned.pkl.bz2', compression='bz2')

In [11]:
# TODO: confirm this layout (one row per RECEIVED_D / LOAN_ID pair) is what the
# simulation needs; deliberately deferred until after model training.
df_payments_all.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,RECEIVED_AMT_INVESTORS,PBAL_END_PERIOD_INVESTORS,mths_since_issue
RECEIVED_D,LOAN_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2007-07-01,72176,7.189307,219.55983,1
2007-07-01,73582,7.289357,219.637436,1
2007-07-01,74505,7.25625,219.611313,1
2007-07-01,77792,3.975833,121.962997,1
2007-07-01,81085,9.03231,268.539795,1


In [23]:
def split_data_into_labels_and_target(df, target_col='roi', non_feature_cols=('issue_d',)):
    '''
    Split a prepared loan dataframe into model features (X) and the target (y).

    With the default arguments the target is the loan's ROI and `issue_d` is
    excluded from the features.

    Args:
        df (dataframe): Loan dataframe that has been cleaned and prepared for modeling.
            It is not modified; a new dataframe is returned for the features.
        target_col (str): Name of the column to predict. Defaults to 'roi'.
        non_feature_cols (str or iterable of str): Additional column(s) to exclude
            from the features besides the target. Defaults to ('issue_d',).

    Returns:
        tuple: (X, y) where X is a dataframe with every remaining column and y is
        the `target_col` column as a Series.

    Raises:
        KeyError: If `target_col` or any of `non_feature_cols` is not a column of df.
    '''
    # A bare string would otherwise be unpacked character by character.
    if isinstance(non_feature_cols, str):
        non_feature_cols = [non_feature_cols]

    excluded = [target_col, *non_feature_cols]
    X = df.drop(columns=excluded)
    y = df[target_col]
    return X, y

In [24]:
# Features exclude both the target (roi) and issue_d; y_train is the roi column.
X_train, y_train = split_data_into_labels_and_target(df_train)

In [25]:
# XGBoost regressor with library-default hyperparameters; only n_jobs is set.
# train_model is not defined in this notebook - presumably it comes from one of
# the %run scripts at the top (src/modeling.py?) - TODO confirm.
model = XGBRegressor(n_jobs=-1)
fit_model = train_model(model, X_train, y_train)

In [30]:
# Mirror the training features: issue_d was excluded from X_train, so exclude it
# here as well. drop() returns a new frame, so df_test keeps issue_d for the
# simulation dataframe built below.
X_test = df_test.drop(columns='issue_d')

In [33]:
# get_predictions is not defined in this notebook - presumably provided by one
# of the %run scripts. The output below shows a float32 NumPy array
# (presumably one value per row of X_test - TODO confirm).
predicted_rois = get_predictions(fit_model, X_test)
predicted_rois

array([ 3.0167866 ,  5.535616  ,  0.4722504 , ...,  2.9651687 ,
       -0.49046355,  2.7701457 ], dtype=float32)

In [36]:
# Pair the test loans with their predicted ROI. Per the preview below the result
# is indexed by issue_d and has id, loan_amnt and predicted_roi columns.
simulation_df = create_dataframe_for_simulation(df_test, predicted_rois)
simulation_df.head()

Unnamed: 0_level_0,id,loan_amnt,predicted_roi
issue_d,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-05-01,78219622,15000.0,3.016787
2016-05-01,78608726,2100.0,5.535616
2016-05-01,77180977,3600.0,0.47225
2016-05-01,77142437,20000.0,2.591496
2016-05-01,77901770,10000.0,4.453407


In [37]:
# Persist the XGBoost predictions (presumably read back by the simulation step -
# TODO confirm consumer).
simulation_df.to_pickle('data/model_xgb_predictions.pkl.bz2', compression='bz2')

#### Random Forest 10 Trees

In [44]:
from sklearn.ensemble import RandomForestRegressor
# random_state is pinned because a random forest bootstraps the training rows
# for every tree; without a seed each run of this cell produces a different
# model and therefore different predictions in the saved pickle. Outputs shown
# in this notebook from earlier unseeded runs will not be reproduced exactly.
# NOTE: rebinds `model` / `fit_model`, replacing the XGBoost model trained above.
model = RandomForestRegressor(n_estimators=10, n_jobs=-1, random_state=42)
fit_model = train_model(model, X_train, y_train)

In [45]:
# Overwrites predicted_rois (previously the XGBoost output) with the 10-tree
# forest's predictions; cells must be run in order.
predicted_rois = get_predictions(fit_model, X_test)

In [46]:
# Rebuild the simulation frame with the 10-tree forest's predictions
# (same layout as before: issue_d index; id, loan_amnt, predicted_roi columns).
simulation_df = create_dataframe_for_simulation(df_test, predicted_rois)
simulation_df.head()

Unnamed: 0_level_0,id,loan_amnt,predicted_roi
issue_d,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-05-01,78219622,15000.0,-11.758644
2016-05-01,78608726,2100.0,-7.360871
2016-05-01,77180977,3600.0,4.562841
2016-05-01,77142437,20000.0,11.46498
2016-05-01,77901770,10000.0,-0.79485


In [47]:
# Persist the 10-tree random forest predictions under their own file name.
simulation_df.to_pickle('data/model_rf_10_trees_predictions.pkl.bz2', compression='bz2')

#### Random Forest 100 Trees

In [48]:
# Same estimator as the 10-tree run but with 100 trees. RandomForestRegressor is
# expected to be in scope from the import in the earlier random-forest cell.
# random_state is pinned because the forest bootstraps the training rows for
# every tree; unseeded, each run would write different predictions to disk.
model = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42)
fit_model = train_model(model, X_train, y_train)

In [49]:
# Overwrites predicted_rois with the 100-tree forest's predictions.
predicted_rois = get_predictions(fit_model, X_test)

In [50]:
# Rebuild the simulation frame with the 100-tree forest's predictions.
simulation_df = create_dataframe_for_simulation(df_test, predicted_rois)
simulation_df.head()

Unnamed: 0_level_0,id,loan_amnt,predicted_roi
issue_d,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-05-01,78219622,15000.0,-0.674256
2016-05-01,78608726,2100.0,-3.251852
2016-05-01,77180977,3600.0,0.326274
2016-05-01,77142437,20000.0,-3.48703
2016-05-01,77901770,10000.0,1.69802


In [51]:
# Persist the 100-tree random forest predictions under their own file name.
simulation_df.to_pickle('data/model_rf_100_trees_predictions.pkl.bz2', compression='bz2')

#### Decision Tree

In [52]:
from sklearn.tree import DecisionTreeRegressor
# Single tree with default settings (no depth limit is set here).
# random_state is pinned because scikit-learn permutes the features at each
# split, so equally good splits can be chosen differently from run to run;
# seeding makes the saved predictions reproducible.
# NOTE: rebinds `model` / `fit_model`, replacing the random forest trained above.
model = DecisionTreeRegressor(random_state=42)
fit_model = train_model(model, X_train, y_train)

In [53]:
# Overwrites predicted_rois with the decision tree's predictions.
predicted_rois = get_predictions(fit_model, X_test)

In [54]:
# Rebuild the simulation frame with the decision tree's predictions.
simulation_df = create_dataframe_for_simulation(df_test, predicted_rois)
simulation_df.head()

Unnamed: 0_level_0,id,loan_amnt,predicted_roi
issue_d,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-05-01,78219622,15000.0,5.582938
2016-05-01,78608726,2100.0,-81.457004
2016-05-01,77180977,3600.0,6.870828
2016-05-01,77142437,20000.0,13.17749
2016-05-01,77901770,10000.0,9.285623


In [56]:
# Persist the decision tree predictions under their own file name.
simulation_df.to_pickle('data/model_dt_predictions.pkl.bz2', compression='bz2')

#### High Interest Rate Strategy

In [61]:
# Model-free baseline: each loan's interest rate is used directly as its score,
# so the `predicted_roi` column below holds int_rate, not a model prediction.
# NOTE(review): this passes a pandas Series, whereas the model cells pass a
# NumPy array - the preview looks correct, but confirm the helper handles both.
predictions = X_test.int_rate
simulation_df = create_dataframe_for_simulation(df_test, predictions)
simulation_df.head()

Unnamed: 0_level_0,id,loan_amnt,predicted_roi
issue_d,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-05-01,78219622,15000.0,8.39
2016-05-01,78608726,2100.0,15.31
2016-05-01,77180977,3600.0,7.89
2016-05-01,77142437,20000.0,11.99
2016-05-01,77901770,10000.0,9.16


In [64]:
# Persist the high-interest-rate baseline scores.
simulation_df.to_pickle('data/model_naive_high.pkl.bz2', compression='bz2')

#### Low Interest Rate Strategy

In [66]:
# Mirror image of the high-interest baseline: the interest rate is negated so
# that the lowest-rate loans receive the largest score.
predictions = X_test.int_rate
simulation_df = create_dataframe_for_simulation(df_test, -1*predictions)

In [67]:
# Sanity check: predicted_roi should equal the negated int_rate of each loan.
simulation_df.head()

Unnamed: 0_level_0,id,loan_amnt,predicted_roi
issue_d,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-05-01,78219622,15000.0,-8.39
2016-05-01,78608726,2100.0,-15.31
2016-05-01,77180977,3600.0,-7.89
2016-05-01,77142437,20000.0,-11.99
2016-05-01,77901770,10000.0,-9.16


In [68]:
# Persist the low-interest-rate baseline scores.
simulation_df.to_pickle('data/model_naive_low.pkl.bz2', compression='bz2')

#### SHAP Analysis