# Modeling

In [4]:
#IMPORTS
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(font_scale=1.15)
import warnings; warnings.filterwarnings('ignore')
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, LogisticRegressionCV, RidgeCV, LassoCV
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
pd.options.display.max_rows = 50
pd.options.display.max_columns = 50
pd.options.display.float_format = '{:.10f}'.format
plt.rcParams['figure.figsize'] = (12, 4)

In [5]:
#LOAD LOANSTATS
directory = '../../data/'
ls = pd.read_hdf(directory + 'ls_CLEAN.h5', 'ls_CLEAN')
ls.sort_index(axis=1, inplace=True)

## 1. Preprocessing

### 1A. Train-Test Split

In [6]:
from sklearn.model_selection import train_test_split
ls_train, ls_test = train_test_split(ls, test_size=0.2, stratify=ls['OUT_Class'], random_state=1)

### 1B. Standard Scaling

In [7]:
#STANDARD SCALING
scaler = StandardScaler()

#separate features and targets
outcome_var_list = sorted(out_var for out_var in ls.columns if "OUT_" in out_var)

#train features
X_train = ls_train[sorted(set(ls.columns)-set(outcome_var_list))]
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train),index=X_train.index, columns=X_train.columns)

#test features
X_test = ls_test[sorted(set(ls.columns)-set(outcome_var_list))]
X_test_scaled = pd.DataFrame(scaler.transform(X_test),index=X_test.index, columns=X_test.columns)

#train targets
y_train = ls_train[sorted(outcome_var_list)]
OUT_Class_train = y_train.iloc[:,0]
OUT_Principle_Repaid_Percentage_train = y_train.iloc[:,1]
OUT_Monthly_Rate_Of_Return_train = y_train.iloc[:,2]

#test targets
y_test = ls_test[sorted(outcome_var_list)]
OUT_Class_test = y_test.iloc[:,0]
OUT_Principle_Repaid_Percentage_test = y_test.iloc[:,1]
OUT_Monthly_Rate_Of_Return_test = y_test.iloc[:,2]

## 2. `OUT_Class`

### 2B. Logistic Regression Classification

In [9]:
#LOGISTIC REGRESSION
log_reg = LogisticRegressionCV(Cs=8, solver='lbfgs', max_iter=10000, class_weight='balanced', random_state=0, fit_intercept=False)
log_reg.fit(X_train_scaled, OUT_Class_train)

coeff_df['log_reg_2B'] = log_reg.coef_[0]

## 3 `OUT_Principle_Repaid_Percentage`

### 3D. Lasso Regression

In [10]:
lasso_reg = LassoCV(fit_intercept=False)
lasso_reg.fit(X_train_scaled, OUT_Principle_Repaid_Percentage_train)

coeff_df['lasso_reg_3D'] = lasso_reg.coef_

## 4. `OUT_Monthly_Rate_Of_Return`

### 4D. Lasso Regression

In [11]:
lasso_reg = LassoCV(fit_intercept=False)
lasso_reg.fit(X_train_scaled, OUT_Monthly_Rate_Of_Return_train)

coeff_df['lasso_reg_4D'] = lasso_reg.coef_

## 5. `Finding Best Predictors`

In [12]:
# from sklearn.metrics import accuracy_score, explained_variance_score

# def forward_selection(model, model_type, x_train, y_train, num_pred=5):
#     assert model_type in ['regressor', 'classifier']
#     def base_model():
#         return model
    
#     best_scores = []
#     best_models = []
    
#     all_predictors = set(independent_columns)
#     selected_good_predictors = set()
#     for i in range(num_pred):
#         print('finding pred {}'.format(i))
        
#         possible_scores = []
#         possible_predictors = list(selected_good_predictors ^ all_predictors)
#         for predictor in possible_predictors:
#             current_test_predictors = list(selected_good_predictors) + [predictor]
            
#             model = base_model()
#             model.fit(x_train[current_test_predictors], y_train)
#             model_pred = model.predict(x_train[current_test_predictors])
            
#             if model_type == 'classifier':
#                 score = accuracy_score(y_train, model_pred)
#             else:
#                 score = explained_variance_score(y_train, model_pred)
#             possible_scores.append(score)
        
#         best_predictor = possible_predictors[np.argmax(possible_scores)]
#         selected_good_predictors.add(best_predictor)
        
#         best_models.append(list(selected_good_predictors))
#         best_scores.append(np.max(possible_scores))
#     return list(zip(best_scores, best_models))


# # fs = forward_selection(LogisticRegressionCV(cv=5, random_state=0), 'classifier', x_train, y_train, 3)

In [13]:
coeff_df['sum'] = np.sum(np.abs(coeff_df[['log_reg_2B', 'lasso_reg_3D', 'lasso_reg_4D']]), axis=1)

In [14]:
num_keep = 5

ind = np.argsort(coeff_df['sum'].values)
preds_keep = coeff_df['sum'][ind][-num_keep:].index.values
preds_keep

array(['installment', 'open_acc', 'total_bal_ex_mort',
       'total_il_high_credit_limit', 'sub_grade'], dtype=object)

In [15]:


depths = {}
for i in range(1, 10):
    model = DecisionTreeRegressor(max_depth=i, random_state=0)
    model.fit(X_train_scaled[preds_keep], OUT_Principle_Repaid_Percentage_train)
    
    model_pred = model.predict(X_test_scaled[preds_keep])
    depths[i] = mean_absolute_error(OUT_Principle_Repaid_Percentage_test, model_pred)

best_depth = min(depths, key=depths.get)

In [16]:
model = DecisionTreeRegressor(max_depth=best_depth, random_state=0)
model.fit(X_train_scaled[preds_keep], OUT_Principle_Repaid_Percentage_train)

model_pred = model.predict(X_test_scaled[preds_keep])
model_loss = np.mean(np.abs(model_pred - OUT_Principle_Repaid_Percentage_test))
model_loss

0.014180342420874549

In [17]:
mean = np.mean(OUT_Principle_Repaid_Percentage_train)
random_loss = np.mean(np.abs(mean - OUT_Principle_Repaid_Percentage_test))
random_loss

0.014493387355386083

In [18]:
random_loss/model_loss - 1

0.0220759785074518

In [19]:
# 2.2 percent better then a random guessing the mean value... that's really not great...