In [56]:
# Load general utilities
# ----------------------
import pandas as pd
from scipy import stats
import datetime
import math
import numpy as np
import pickle

# Load the data and engineer features

In [57]:
# Read the cleaned data and the feature-name lists back from the pickle file.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files
# that you created yourself or otherwise trust.
# The context manager closes the file handle as soon as loading finishes.
with open("Data/clean_data.pickle", "rb") as pickle_file:
    final_data, discrete_features, continuous_features, ret_cols = pickle.load(pickle_file)

In [58]:
# Create a feature for the length of a person's credit history, in months, at the
# time the loan is issued.
# A calendar month has no fixed duration, so the elapsed time is divided by an
# explicit average month (365.2425 / 12 days = 2,629,746 seconds) rather than by
# the ambiguous np.timedelta64(1, 'M') unit.
final_data['cr_hist'] = (final_data.issue_d - final_data.earliest_cr_line) / pd.Timedelta(seconds=2629746)

# Register the new column only once so that re-running this cell does not leave
# duplicate entries in the feature list.
if 'cr_hist' not in continuous_features:
    continuous_features.append('cr_hist')

# Function to Calculate P-Values

In [59]:
def getPValues (model, X_test, y_test):
    """Build a coefficient table with standard errors, t statistics and p-values.

    The statistics follow the classical ordinary-least-squares formulas: the
    residual variance is SSE / (n - p) and the coefficient covariance is that
    variance times the (pseudo-)inverse of X'X, where X is the design matrix
    with a leading constant column, n is the number of rows and p is the number
    of design columns (intercept included).

    Parameters
    ----------
    model : fitted estimator exposing ``intercept_``, ``coef_`` and ``predict``.
    X_test : pandas.DataFrame of predictors the statistics are computed on.
    y_test : array-like of observed targets, in the same row order as ``X_test``.

    Returns
    -------
    pandas.DataFrame indexed by ``'Intercept'`` followed by the column names of
    ``X_test``, with columns ``Coeff`` (rounded to 4 decimals), ``SE`` and
    ``t val`` (3 decimals) and ``Probs`` (two-sided p-value, 8 decimals).
    Coefficients that cannot be identified (for example an all-zero or perfectly
    collinear column) may show NaN or inf in the statistic columns.

    Raises
    ------
    ValueError
        If there are not more observations than estimated parameters, because
        the residual variance is then undefined.
    """
    # Intercept first, so the order matches the design matrix columns below.
    params = np.append(model.intercept_, model.coef_)
    predictions = model.predict(X_test)

    # Design matrix: a constant column followed by the predictors. The index is
    # reset so the join aligns rows by position rather than by the original labels.
    features = pd.DataFrame(X_test.reset_index(drop=True))
    newX = pd.DataFrame({"Constant": np.ones(len(features))}).join(features)

    # The constant column is already counted in the design matrix, so the
    # residual degrees of freedom are simply rows minus columns.
    n_obs, n_params = newX.shape
    dof = n_obs - n_params
    if dof <= 0:
        raise ValueError(
            "getPValues needs more observations ({}) than estimated parameters ({})".format(n_obs, n_params)
        )

    # Unbiased residual variance: sum of squared errors over the degrees of freedom.
    residuals = np.asarray(y_test, dtype=float).ravel() - np.asarray(predictions, dtype=float).ravel()
    sigma2 = np.dot(residuals, residuals) / dof

    design = newX.to_numpy(dtype=float)
    # Degenerate columns produce 0/0 or sqrt of a tiny negative number; let those
    # surface as NaN/inf in the table instead of emitting runtime warnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        var_b = sigma2 * np.linalg.pinv(np.dot(design.T, design)).diagonal()
        sd_b = np.sqrt(var_b)
        ts_b = params / sd_b

    # The survival function keeps precision for very small p-values, where
    # 1 - cdf would round to exactly zero.
    p_values = 2 * stats.t.sf(np.abs(ts_b), dof)

    names = ['Intercept']
    names.extend(list(X_test))

    return pd.DataFrame(
        {
            "Coeff": np.round(params, 4),
            "SE": np.round(sd_b, 3),
            "t val": np.round(ts_b, 3),
            "Probs": np.round(p_values, 8),
        },
        index=names,
    )

# X and y from the features code cells

In [60]:
from sklearn.preprocessing import MinMaxScaler

def minMaxScaleContinuous(continuousList):
    """Return the selected columns of ``final_data`` rescaled with a MinMaxScaler.

    A fresh scaler is fit on every row of the selected columns; the result is a
    DataFrame that keeps the original column names and row index.
    """
    selected = final_data[continuousList]
    scaled_values = MinMaxScaler().fit_transform(selected)
    return pd.DataFrame(scaled_values, columns=list(selected.columns), index=selected.index)

def createDiscreteDummies(discreteList, drop_first = False):
    """One-hot encode the selected discrete columns of ``final_data``.

    Parameters
    ----------
    discreteList : list of column names in ``final_data`` to encode.
    drop_first : bool, default False
        Forwarded to ``pandas.get_dummies``. The default keeps one indicator per
        level, as before. Pass True to drop the first level of every feature;
        with a full set of indicators each feature's dummies sum to one, which
        is perfectly collinear with a model intercept and makes linear-model
        coefficients unidentifiable.

    Returns
    -------
    pandas.DataFrame of indicator columns named ``<feature>::<level>``,
    including an indicator for missing values (``dummy_na=True``).
    """
    return pd.get_dummies(final_data[discreteList], dummy_na = True, prefix_sep = "::", drop_first = drop_first)

# Define which features to use in the modeling

In [61]:
# Discrete features used in modeling.
# To use every discrete feature, set discrete_features_touse = discrete_features instead.
discrete_features_touse = [
    'purpose',
    'term',
    'verification_status',
    'emp_length',
    'home_ownership',
]

# Continuous features used in modeling.
# To use every continuous feature, set continuous_features_touse = continuous_features instead.
continuous_features_touse = [
    'loan_amnt',
    'funded_amnt',
    'installment',
    'annual_inc',
    'dti',
    'revol_bal',
    'delinq_2yrs',
    'open_acc',
    'pub_rec',
    'fico_range_high',
    'fico_range_low',
    'revol_util',
    'cr_hist',
]

In [62]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Build the predictor dataframe X: continuous features plus dummy-encoded
# categorical features, concatenated column-wise.

# Use this line of code if you do not want to scale the continuous features
#X_continuous = final_data[continuous_features_touse]

# Use this line if you want to scale the continuous features using the MinMaxScaler in the function defined above.
# NOTE(review): the scaler is fit on every row of final_data, i.e. before the train/test
# split below, so the test rows influence the scaling of the training rows — confirm this is intended.
X_continuous = minMaxScaleContinuous(continuous_features_touse)

# create numeric dummy features for the discrete features to be used in modeling
X_discrete = createDiscreteDummies(discrete_features_touse)

# concatenate the continuous and discrete features into one dataframe
X = pd.concat([X_continuous, X_discrete], axis = 1)

# This is the target variable. Candidate return columns:
# 'ret_PESS', 'ret_OPT', 'ret_INTa', 'ret_INTb'

# Use this line of code if you do not want to scale the ret_cols
#y = final_data["ret_INTa"]

# Use this line if you want to scale the ret_cols using the MinMaxScaler in the function defined above
ret_data = minMaxScaleContinuous(ret_cols)
y=ret_data['ret_INTa']

# Create a test and train split of the transformed data: 30% of the rows are held out
# for testing, and random_state=0 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=.3)


# Multiple Linear Regression to predict M3(ret_INTa)

In [63]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the training split, using all CPU cores.
mlr_model = LinearRegression(n_jobs=-1)
mlr_model.fit(X_train, y_train)

# Show the fitted parameters: coefficients first, then the intercept.
for label, fitted_value in (("mlr.coef_:", mlr_model.coef_), ("mlr.intercept_:", mlr_model.intercept_)):
    print(label, fitted_value)

mlr.coef_: [ 4.48721847e-01 -3.25271388e-01 -2.34721074e-01  5.37513682e-01
 -2.20667804e-01  2.41300757e-01  2.50499498e-03 -3.09690772e-02
  7.10211971e-02 -1.90668524e+00  1.93571233e+00  1.20986956e-02
  1.52978785e-02 -5.16819969e+11 -5.16819969e+11 -5.16819969e+11
 -5.16819969e+11 -5.16819969e+11 -5.16819969e+11 -5.16819969e+11
 -5.16819969e+11 -5.16819969e+11 -5.16819969e+11 -5.16819969e+11
 -5.16819969e+11 -5.16819969e+11 -5.16819969e+11  5.63161428e+10
  1.36002711e+10  1.36002711e+10 -4.28086239e+10  5.66611060e+10
  5.66611060e+10  5.66611060e+10 -4.51274469e+09 -2.87190251e+10
 -2.87190251e+10 -2.87190251e+10 -2.87190251e+10 -2.87190251e+10
 -2.87190251e+10 -2.87190251e+10 -2.87190251e+10 -2.87190251e+10
 -2.87190251e+10 -2.87190251e+10 -2.87190251e+10  1.11824145e+11
  1.11824145e+11  1.11824145e+11  1.11824145e+11  1.11824145e+11
  0.00000000e+00]
mlr.intercept_: 363453471984.1035


In [64]:
# Report the model's score() on each split, training split first.
for score_message, features, target in (
    ("Training set score: {:.5f}", X_train, y_train),
    ("Test set score: {:.5f}", X_test, y_test),
):
    print(score_message.format(mlr_model.score(features, target)))


Training set score: 0.02084
Test set score: 0.02133


In [65]:
import math
from sklearn.metrics import explained_variance_score, mean_absolute_error, r2_score, mean_squared_error

# Score the held-out test split with several regression metrics.
predictions = mlr_model.predict(X_test)

r2 = r2_score(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
score = explained_variance_score(y_test, predictions)
# RMSE is the square root of the mean squared error.
rmse = math.sqrt(mean_squared_error(y_test, predictions))

print(
    "score = {:.5f} | MAE = {:.5f} | RMSE = {:.5f} | R2 = {:.5f}".format(
        score, mae, rmse, r2
    )
)

score = 0.02134 | MAE = 0.12218 | RMSE = 0.16711 | R2 = 0.02133


In [66]:
# Display the coefficient table (estimate, standard error, t statistic, p-value)
# for the fitted linear model, computed on the training split.
getPValues(mlr_model, X_train, y_train)

  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = (x >= _b) & cond0


Unnamed: 0,Coeff,SE,t val,Probs
Intercept,363453500000.0,0.012,29277420000000.0,0.0
loan_amnt,0.4487,0.817,0.549,0.582868
funded_amnt,-0.3253,0.817,-0.398,0.690559
installment,-0.2347,0.008,-31.195,0.0
annual_inc,0.5375,0.151,3.552,0.000382
dti,-0.2207,0.015,-14.3,0.0
revol_bal,0.2413,0.02,12.291,0.0
delinq_2yrs,0.0025,0.011,0.227,0.820146
open_acc,-0.031,0.004,-8.23,0.0
pub_rec,0.071,0.023,3.024,0.002492


# LASSO Regression GridsearchCV

In [None]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

# These are just example parameter settings. You can change these parameters or add others.
# The grid search uses a scoring method of R2. You can change that to another scoring method.
parameters = {'alpha' : [0.000000001, 0.00000001, 0.0000001]
             }

print("Parameter grid:\n{}".format(parameters),'\n')

grid =  GridSearchCV(Lasso(max_iter=10000), parameters, cv=5, return_train_score=True, scoring='r2', n_jobs=-1)

# perform grid search cv on training data.  The CV algorithm divides this into training and validation
ls_model = grid.fit(X_train, y_train)

print('best params ',ls_model.best_params_,'\n')
print('best estimator ',ls_model.best_estimator_,'\n')
print('best validation score ', ls_model.best_score_,'\n')
print('scoring method ', ls_model.scorer_)

# The search was configured with scoring='r2', so score() reports R2 on the
# test split, not a classification accuracy.
print("Test set R2 score: {:.7f}".format(ls_model.score(X_test, y_test)))

# NOTE(review): saveModel is not defined in the cells shown here — make sure it is
# defined or imported in an earlier cell, otherwise this line raises NameError.
saveModel('ls_model', ls_model)