# Investment Strategy

In [None]:
#IMPORTS
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(font_scale=1.15)
import warnings; warnings.filterwarnings('ignore')
from IPython.display import Markdown, display, SVG
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegressionCV, RidgeCV, LassoCV
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor, export_graphviz
from sklearn.ensemble import RandomForestRegressor
from graphviz import Source  # required by the decision-tree rendering cells
# Notebook-wide display defaults: table size limits, float formatting and figure size.
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 50)
pd.set_option('display.float_format', '{:.3f}'.format)
plt.rcParams.update({'figure.figsize': (12, 4)})

In [None]:
#LOAD LOANSTATS
# Read the cleaned loan table from the HDF5 store and order its columns alphabetically.
directory = '../../data/clean/'
ls = pd.read_hdf(directory + 'ls_CLEAN.h5', 'ls_CLEAN').sort_index(axis=1)

In [None]:
def model_scoring(model, feature, target, modeltype='C', cv=5):
    """Print the mean cross-validated score of `model` for a fixed set of metrics.

    Parameters
    ----------
    model : estimator passed straight to `cross_val_score`.
    feature, target : data passed straight to `cross_val_score`.
    modeltype : 'R' scores with neg_mean_squared_error and r2; any other
        value scores with accuracy, precision and recall.
    cv : cross-validation setting forwarded to `cross_val_score`.

    Returns
    -------
    None. One line per metric is printed.
    """
    if modeltype == 'R':  # regression
        metrics = ('neg_mean_squared_error', 'r2')
    else:  # classification
        metrics = ('accuracy', 'precision', 'recall')

    for metric in metrics:
        fold_scores = cross_val_score(model, feature, target, scoring=metric, cv=cv)
        print('\tCross-validation {}: {:.4}'.format(metric, fold_scores.mean()))

In [None]:
# Hold out 20% of the loans as a test set. The split is stratified on OUT_Class and
# pinned with random_state so it is the same on every run.
# (train_test_split is imported in the import cell at the top of the notebook.)
ls_train, ls_test = train_test_split(ls, test_size=0.2, stratify=ls['OUT_Class'], random_state=1)

In [None]:
#STANDARD SCALING
scaler = StandardScaler()

#separate features and targets
outcome_var_list = sorted(out_var for out_var in ls.columns if "OUT_" in out_var)
feature_var_list = sorted(set(ls.columns) - set(outcome_var_list))

#train features (the scaler is fitted on the training split only)
X_train = ls_train[feature_var_list]
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), index=X_train.index, columns=X_train.columns)

#test features (transformed with the statistics learned from the training split)
X_test = ls_test[feature_var_list]
X_test_scaled = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

# Targets are selected by column NAME. Positional selection from the sorted outcome
# list is unsafe: alphabetically 'OUT_Monthly_Rate_Of_Return' sorts before
# 'OUT_Principle_Repaid_Percentage', so positions 1 and 2 do not line up with the
# variable names they used to be assigned to.
# NOTE(review): assumes the return column is named exactly 'OUT_Monthly_Rate_Of_Return'
# as written in the notebook's markdown - confirm against ls.columns.

#train targets
y_train = ls_train[outcome_var_list]
OUT_Class_train = y_train['OUT_Class']
OUT_Principle_Repaid_Percentage_train = y_train['OUT_Principle_Repaid_Percentage']
OUT_Monthly_Rate_Of_Return_train = y_train['OUT_Monthly_Rate_Of_Return']

#test targets
y_test = ls_test[outcome_var_list]
OUT_Class_test = y_test['OUT_Class']
OUT_Principle_Repaid_Percentage_test = y_test['OUT_Principle_Repaid_Percentage']
OUT_Monthly_Rate_Of_Return_test = y_test['OUT_Monthly_Rate_Of_Return']

In [None]:
#GET POLYNOMIALS
dummy_var_list = sorted(dummy for dummy in ls.columns if "D_" in dummy)
numeric_var_list = sorted(set(ls.columns) - set(outcome_var_list) - set(dummy_var_list))


def squared_terms(features, numeric_cols):
    """Return the `numeric_cols` of `features` followed by their squares.

    The squared columns carry the original column name plus the suffix '_2'.
    Columns are selected by name, so the result does not depend on how many
    dummy columns precede the numeric ones in `features`.
    """
    numeric = features[numeric_cols]
    return pd.concat([numeric, (numeric ** 2).add_suffix('_2')], axis=1)


# Dummy columns are kept as they are; only numeric columns receive a squared term.
X_train_2 = pd.concat([ls_train[dummy_var_list], squared_terms(X_train, numeric_var_list)],
                      axis=1).sort_index(axis=1)
X_test_2 = pd.concat([ls_test[dummy_var_list], squared_terms(X_test, numeric_var_list)],
                     axis=1).sort_index(axis=1)

#STANDARD SCALING
# A fresh scaler is fitted on the expanded training features and reused for the test features.
scaler = StandardScaler()
X_train_scaled_2 = pd.DataFrame(scaler.fit_transform(X_train_2), index=X_train_2.index, columns=X_train_2.columns)
X_test_scaled_2 = pd.DataFrame(scaler.transform(X_test_2), index=X_test_2.index, columns=X_test_2.columns)

In [None]:
# Every estimator is created and fitted in a single assignment (fit returns the
# fitted estimator), so the cell ends on an assignment and displays nothing;
# no throwaway variable is needed to silence the output.

#CLASSIFIERS
dummy_cls = DummyClassifier(strategy='uniform', random_state=1).fit(X_train_scaled, OUT_Class_train)
log_cls = LogisticRegressionCV(Cs=8, solver='lbfgs', max_iter=10000, class_weight='balanced',
                               random_state=0, fit_intercept=False).fit(X_train_scaled, OUT_Class_train)

#REGRESSORS PRP
ridge_reg_PRP = RidgeCV(fit_intercept=False).fit(X_train_scaled, OUT_Principle_Repaid_Percentage_train)
lasso_reg_PRP = LassoCV(fit_intercept=False).fit(X_train_scaled, OUT_Principle_Repaid_Percentage_train)
lasso_reg2_PRP = LassoCV(fit_intercept=False).fit(X_train_scaled_2, OUT_Principle_Repaid_Percentage_train)

#REGRESSORS MRR
ridge_reg_MRR = RidgeCV(fit_intercept=False).fit(X_train_scaled, OUT_Monthly_Rate_Of_Return_train)
lasso_reg_MRR = LassoCV(fit_intercept=False).fit(X_train_scaled, OUT_Monthly_Rate_Of_Return_train)
lasso_reg2_MRR = LassoCV(fit_intercept=False).fit(X_train_scaled_2, OUT_Monthly_Rate_Of_Return_train)

## 1. Modeling Summary

In [None]:
# Read the first three sheets of the stored results workbook (two header rows each)
# and bind one sheet per outcome feature.
results = pd.read_excel(directory + 'ModelResults.xlsx', sheet_name=[0,1,2], header=[0,1])
OUT_Class_results, OUT_PRP_results, OUT_MRR_results = (results[sheet] for sheet in (0, 1, 2))

In the [Modeling](https://cs109group67.github.io/lendingclub/Modeling.html) section we performed a variety of classification and regression models on the three outcome features. A summary of the model scoring metrics is as follows:

In [None]:
# Render a bold heading, then show the results table read from sheet 0 as the cell output.
display(Markdown('**`OUT_Class`:**'))
OUT_Class_results

In [None]:
# Render a bold heading, then show the results table read from sheet 1 as the cell output.
display(Markdown('**`OUT_Principle_Repaid`:**'))
OUT_PRP_results

In [None]:
# Render a bold heading, then show the results table read from sheet 2 as the cell output.
display(Markdown('**`OUT_Monthly_Rate_Of_Return`:**'))
OUT_MRR_results

The classification and regression models did not perform well at predicting the target features, but they did provide important information about which variables potentially hold the most inferential information for a prospective investor. The visualizations below display the coefficients assigned across the models. The focus is on features that have the most significant magnitude across distinct models.

In [None]:
def plot_coefficients(feature_train, models, title, thresh=(0.0025,4), figsize=(10,4)):
    """Plot the coefficients that each fitted model assigns to each feature.

    Parameters
    ----------
    feature_train : DataFrame whose columns name the features, in the same
        order as each model's `coef_`.
    models : dict mapping a legend label to a fitted model exposing `coef_`.
    title : text appended to the figure title.
    thresh : pair `(magnitude, min_non_null)`. Coefficients whose absolute
        value is not greater than `magnitude` are blanked out; rows with fewer
        than `min_non_null` non-null cells are then dropped. The feature-name
        column always counts as one non-null cell.
    figsize : figure size forwarded to `plt.subplots`.

    Returns
    -------
    DataFrame with a 'coefs' column holding the feature names and one column
    per model holding the surviving coefficients (NaN where blanked out).
    """
    model_names = list(models)

    # one row per feature: its name plus one coefficient column per model
    coefs = pd.DataFrame({'coefs': feature_train.columns.tolist()})
    for name, model in models.items():
        coefs[name] = model.coef_.flatten()

    # Blank out small coefficients in the numeric columns only. Comparing the whole
    # frame against a number would also compare the string feature names, which
    # raises a TypeError on current pandas.
    if model_names:
        values = coefs[model_names]
        coefs[model_names] = values.where(values.abs() > thresh[0])

    # reverse the feature order and keep only features with enough surviving cells
    coefs = coefs.sort_index(ascending=False).dropna(thresh=thresh[1])

    # plot coefficients for each feature
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=figsize)
    for name in model_names:
        ax.scatter(coefs[name], coefs['coefs'], alpha=0.5, label=name, s=80)
    ax.legend(fontsize=10)
    ax.set_xlabel('Coefficient Value')
    ax.set_ylabel('Features')
    ax.set_title('Coefficients Assigned Across the {}'.format(title))
    ax.axvline(0, color='black')
    plt.show()
    return coefs

In [None]:
# Coefficients of the logistic classifier on the scaled training features.
models = {'Logistic_Classification': log_cls}
coefs1 = plot_coefficients(X_train_scaled, models=models, title='Classification Model', thresh=(0.15, 2))

In [None]:
# Coefficients of the four linear regressors (ridge and lasso, for each regression target).
models = {
    'Ridge_Regression_PRP': ridge_reg_PRP,
    'Lasso_Regression_PRP': lasso_reg_PRP,
    'Ridge_Regression_MRR': ridge_reg_MRR,
    'Lasso_Regression_MRR': lasso_reg_MRR,
}
coefs2 = plot_coefficients(X_train_scaled, models=models, title='Regression Models', thresh=(0.0025, 3))

In [None]:
# Coefficients of the two lasso models fitted on the features with squared terms.
models = {
    'Poly_Regression_PRP': lasso_reg2_PRP,
    'Poly_Regression_MRR': lasso_reg2_MRR,
}
coefs3 = plot_coefficients(X_train_scaled_2, models=models, title='Polynomial Models', thresh=(0.0003, 3))

## 2. Key Variables Driving Investment Decisions

The modeling has revealed key variables that should power the investment decisions: those which have significant coefficients across the distinct classification and regression models. These form the subset for our investment strategy formulation. This will aid interpretation and understanding for the investor at the cost of only a minimal loss in investment efficacy.

**Top 10 Key Variables**:

In [None]:
# The key variables are the feature names that survived the regression-coefficient filter,
# sorted alphabetically and re-indexed from zero.
key_variables = (
    coefs2['coefs']
    .sort_values()
    .reset_index(drop=True)
    .rename('key_variables')
)
X_train_key = X_train_scaled[key_variables]
print(key_variables)

With the key variables established, we apply decision tree and random forest analysis to this key subset to establish the investment strategy. We focus on the most comprehensive outcome feature, `OUT_Monthly_Rate_Of_Return`, since this feature takes into account the total amount repaid with interest over the effective term of the loan. We visualize both a larger tree (for completeness and demonstrative purposes) and a simplified tree.

In [None]:
# Fit a depth-limited regression tree on the key features and render it as an SVG.
keydecisiontree = DecisionTreeRegressor(
    random_state=0,
    max_depth=10,
    min_samples_split=.05,
).fit(X_train_key, OUT_Monthly_Rate_Of_Return_train)

display(Markdown('**FULL Key Feaure Decision Tree**:'))
print(keydecisiontree)

# export the fitted tree as DOT text and let graphviz draw it
graph = Source(
    export_graphviz(
        keydecisiontree,
        out_file=None,
        feature_names=X_train_key.columns,
        filled=True,
    )
)
display(SVG(graph.pipe(format='svg')))
print()

In [None]:
# Same tree settings as the full tree, additionally capped at 10 leaf nodes, rendered as an SVG.
keydecisiontree = DecisionTreeRegressor(
    random_state=0,
    max_depth=10,
    min_samples_split=.05,
    max_leaf_nodes=10,
).fit(X_train_key, OUT_Monthly_Rate_Of_Return_train)

display(Markdown('**SIMPLIFIED Key Feature Decision Tree**:'))
print(keydecisiontree)

# export the fitted tree as DOT text and let graphviz draw it
graph = Source(
    export_graphviz(
        keydecisiontree,
        out_file=None,
        feature_names=X_train_key.columns,
        filled=True,
    )
)
display(SVG(graph.pipe(format='svg')))
print()

In [None]:
# Fit a small random forest on the key features and chart the feature importances.
# random_state is fixed so the forest, and therefore the chart, is the same on every run.
keyrandomforest = RandomForestRegressor(n_estimators=10, random_state=0)
keyrandomforest.fit(X_train_key, OUT_Monthly_Rate_Of_Return_train)
display(Markdown('**Key Feature Random Forest**:'))
print(keyrandomforest)

importances = keyrandomforest.feature_importances_
indices = np.argsort(importances)[::-1]  # positions ordered from most to least important

plt.figure()
plt.title("Feature Importance of Key Features")
plt.bar(range(X_train_key.shape[1]), importances[indices])
# The bars are drawn in sorted order, so the tick labels must be reordered the same way;
# otherwise each bar is labelled with the wrong feature.
plt.xticks(range(X_train_key.shape[1]), X_train_key.columns[indices], rotation=90)
plt.show()

## 3. Investment Strategy

Based on the completed analysis, we propose a simple investment strategy: invest only in loans that meet all of the following criteria. To reduce risk and get the full benefit of diversification, we recommend that the minimum qualifying investment be made across as many qualifying loans as possible.

#### Non-Renter

Only invest in applications from non-renters.

In [None]:
# The RENT dummy is 1 for renters, so non-renters are the rows where it is 0.
non_renters = (ls['D_home_ownership_RENT'] == 0)

#### Shorter Term

Only invest in applications with term of 36 months.

In [None]:
# True for rows whose 36-month term dummy equals 1.
short_term = ls['D_term_ 36 months'].eq(1)

#### Few Recent Accounts Opened

Only invest in applications from borrowers with fewer than 4 accounts opened in the last 24 months.

In [None]:
# True for rows with fewer than 4 accounts opened in the past 24 months.
few_recent_accounts = ls['acc_open_past_24mths'].lt(4)

#### Low Debt-to-Income Ratio

Only invest in applications from borrowers with debt-to-income ratios of less than 20%.

In [None]:
# True for rows whose dti value is below 20.
low_dti = ls['dti'].lt(20)

#### Solidly Employed

Only invest in applications from borrowers who have been employed over 5 years.

In [None]:
# True for rows whose emp_length value is greater than 5.
solid_emp = ls['emp_length'].gt(5)

#### Low Grade

Only invest in loans of grade A, B, C or D.

In [None]:
# True for rows whose numeric sub_grade is at most 20.
# NOTE(review): presumably sub-grades are encoded 1..35 so that 1-20 covers grades A-D - confirm the encoding.
low_grade = ls['sub_grade'].le(20)

## 4. Predictive Quality of Strategy

In [None]:
# A loan qualifies only if it passes every one of the six criteria.
good_loans = np.logical_and.reduce([
    non_renters,
    short_term,
    few_recent_accounts,
    low_dti,
    solid_emp,
    low_grade,
])
num_good_loans = good_loans.sum()
print('num good loans: {}'.format(num_good_loans))

In [None]:
# Average principal-repaid percentage over the loans that pass the strategy.
mean_good_loan = ls.loc[good_loans, 'OUT_Principle_Repaid_Percentage'].mean()
mean_good_loan

In [None]:
# Baseline: draw, without replacement and with a fixed seed, as many loans as the
# strategy selected, then average their principal-repaid percentage.
np.random.seed(0)
rand_loan = np.random.choice(ls.index.values, replace=False, size=num_good_loans)

mean_rand_loan = ls.loc[rand_loan, 'OUT_Principle_Repaid_Percentage'].mean()
mean_rand_loan

In [None]:
# Relative improvement of the strategy over the random baseline, shown as a percentage.
mean_gain = (mean_good_loan / mean_rand_loan) - 1
100 * mean_gain