In [1]:
# Load general utilities
# ----------------------
import pandas as pd
from scipy import stats
import datetime
import math
import numpy as np
import pickle

# Load the data and engineer features

In [2]:
# Read the cleaned data and the feature-name lists from the pickle file.
# The pickle is unpacked into four objects: the data frame plus three lists of column names
# (discrete features, continuous features, return columns).
# NOTE(review): pickle.load can execute arbitrary code - only load files from a trusted source.
# The `with` block guarantees the file handle is closed once the data has been read.
with open("Data/clean_data.pickle", "rb") as pickle_file:
    final_data, discrete_features, continuous_features, ret_cols = pickle.load(pickle_file)

In [3]:
# Create a feature for the length of a person's credit history (in months)
# at the time the loan is issued.
# The divisor is NumPy's average month (365.2425 / 12 days = 2629746 seconds), i.e. the
# same length np.timedelta64(1, 'M') stands for; it is spelled out in seconds because
# newer pandas versions reject the ambiguous 'M' unit in timedelta arithmetic.
final_data['cr_hist'] = (final_data.issue_d - final_data.earliest_cr_line) / pd.Timedelta(seconds=2629746)

# Guard the append so that re-running this cell does not register the feature twice.
if 'cr_hist' not in continuous_features:
    continuous_features.append('cr_hist')

# Take a smaller sample of the data due to time constraints

In [4]:
# Randomly sample 25% of the rows to keep the run time manageable.
# Change the frac parameter if you want a different % to sample.
# replace=False ensures the same row is never selected twice.
# random_state pins the draw so the results are reproducible on a fresh run.
# NOTE: this overwrites final_data, so re-running only this cell shrinks the sample again;
# re-run from the data-loading cell to start from the full data.
final_data = final_data.sample(frac=.25, replace=False, random_state=0).copy()

# Function to Calculate P-Values

In [5]:
def getPValues(model, X_test, y_test):
    """Return a coefficient table (estimate, SE, t value, p-value) for a fitted linear model.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``intercept_``, ``coef_`` and ``predict``.
    X_test : pandas.DataFrame
        Predictor rows the statistics are computed on. Its column labels become
        the row labels of the result.
    y_test : array-like
        Observed target values for the rows of ``X_test``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``'Intercept'`` followed by the columns of ``X_test``, with the
        columns ``"Coeff"``, ``"SE"``, ``"t val"`` and ``"Probs"`` (two-sided p-value),
        rounded to 4, 3, 3 and 8 decimals respectively.

    Raises
    ------
    ValueError
        If there are not more rows than estimated parameters, so no residual
        degrees of freedom are left.

    Notes
    -----
    Coefficients whose standard error comes out as zero or NaN yield inf/NaN
    t values and p-values in the table.
    """
    params = np.append(model.intercept_, model.coef_)

    # Work on plain float arrays so index labels cannot misalign the residuals
    # and bool/uint8 dummy columns do not break the linear algebra.
    predictions = np.asarray(model.predict(X_test), dtype=float).ravel()
    observed = np.asarray(y_test, dtype=float).ravel()

    # Design matrix: a constant column for the intercept followed by the predictors.
    design = np.column_stack([np.ones(len(X_test)), np.asarray(X_test, dtype=float)])
    n_obs, n_params = design.shape

    # The constant column is already counted in n_params, so nothing more is subtracted.
    dof = n_obs - n_params
    if dof <= 0:
        raise ValueError(
            "getPValues needs more rows ({}) than estimated parameters ({})".format(n_obs, n_params)
        )

    # Unbiased residual variance: residual sum of squares over the residual degrees
    # of freedom (a plain mean squared error would divide by n_obs instead).
    residual_var = np.sum((observed - predictions) ** 2) / dof

    # The pseudo-inverse keeps this computable when the design matrix is rank deficient.
    var_b = residual_var * np.linalg.pinv(np.dot(design.T, design)).diagonal()
    sd_b = np.sqrt(var_b)
    ts_b = params / sd_b

    # Two-sided p-value; the survival function avoids the precision loss of 1 - cdf.
    p_values = 2 * stats.t.sf(np.abs(ts_b), dof)

    names = ['Intercept'] + list(X_test)
    return pd.DataFrame(
        {
            "Coeff": np.round(params, 4),
            "SE": np.round(sd_b, 3),
            "t val": np.round(ts_b, 3),
            "Probs": np.round(p_values, 8),
        },
        index=names,
    )

# X and y from the features code cells

In [6]:
from sklearn.preprocessing import MinMaxScaler

def minMaxScaleContinuous(continuousList):
    """Min-max scale the requested columns of the notebook-level ``final_data`` frame.

    Parameters
    ----------
    continuousList : list of str
        Column labels to select from ``final_data``.

    Returns
    -------
    pandas.DataFrame
        The output of ``MinMaxScaler().fit_transform`` on the selected columns,
        carrying the same column labels and row index as the selection.
    """
    selected = final_data[continuousList]
    scaled_values = MinMaxScaler().fit_transform(selected)
    return pd.DataFrame(
        scaled_values,
        index=selected.index,
        columns=list(selected.columns),
    )

def createDiscreteDummies(discreteList):
    """Dummy-encode the requested columns of the notebook-level ``final_data`` frame.

    Parameters
    ----------
    discreteList : list of str
        Column labels to select from ``final_data``.

    Returns
    -------
    pandas.DataFrame
        ``pd.get_dummies`` output for the selection. Every level is kept
        (``drop_first=False``), a NaN indicator column is added per feature
        (``dummy_na=True``), and names are built as ``<feature>::<level>``.
    """
    dummy_options = {"dummy_na": True, "prefix_sep": "::", "drop_first": False}
    return pd.get_dummies(final_data[discreteList], **dummy_options)

# Define which features to use in the modeling

In [7]:
# Discrete (categorical) features used in modeling.
# To use every discrete feature instead, set discrete_features_touse = discrete_features
discrete_features_touse = [
    'purpose',
    'term',
    'verification_status',
    'emp_length',
    'home_ownership',
]

# Continuous features used in modeling.
# To use every continuous feature instead, set continuous_features_touse = continuous_features
# NOTE(review): in the recorded regression output loan_amnt and funded_amnt receive identical
# coefficients, and fico_range_high / fico_range_low receive large coefficients of opposite
# sign - these pairs look (near-)collinear; consider keeping one of each pair. TODO confirm.
continuous_features_touse = [
    'loan_amnt',
    'funded_amnt',
    'installment',
    'annual_inc',
    'dti',
    'revol_bal',
    'delinq_2yrs',
    'open_acc',
    'pub_rec',
    'fico_range_high',
    'fico_range_low',
    'revol_util',
    'cr_hist',
]

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Build the predictor frame X (continuous features + dummy-coded discrete features)
# and the target y, then split both into train and test sets.

# Use this line of code if you do not want to scale the continuous features
#X_continuous = final_data[continuous_features_touse]

# Use this line if you want to scale the continuous features using the MinMaxScaler in the function defined above.
# NOTE(review): minMaxScaleContinuous fits the scaler on every row of final_data, i.e. before the
# train/test split below, so the test rows influence the scaling - consider fitting on the
# training rows only.
X_continuous = minMaxScaleContinuous(continuous_features_touse)

# Create numeric dummy features for the discrete features to be used in modeling
X_discrete = createDiscreteDummies(discrete_features_touse)

# Concatenate the continuous and discrete features column-wise into one dataframe
X = pd.concat([X_continuous, X_discrete], axis = 1)

# This is the target variable; candidate return columns are:
# 'ret_PESS', 'ret_OPT', 'ret_INTa', 'ret_INTb'

# Use this line of code if you do not want to scale the ret_cols
#y=final_data["ret_OPT"]

# Use this line if you want to scale the ret_cols using the MinMaxScaler in the function defined above.
# Because y is min-max scaled here, error metrics computed from it are in scaled units,
# not in the original units of the return column.
ret_data = minMaxScaleContinuous(ret_cols)
y=ret_data['ret_OPT']

# Create a test and train split of the transformed data (30% test, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=.3)


# Multiple Linear Regression to predict M2

In [9]:
from sklearn.linear_model import LinearRegression

# Fit a multiple linear regression on the training split.
mlr_model = LinearRegression(n_jobs=-1)
mlr_model.fit(X_train, y_train)

# Show the fitted parameters: one coefficient per column of X_train, then the intercept.
print(
    "mlr.coef_:",
    mlr_model.coef_,
)
print(
    "mlr.intercept_:",
    mlr_model.intercept_,
)

mlr.coef_: [-3.96864470e-02 -3.96864470e-02  8.69385674e-02  1.45332441e-01
 -2.79605848e-02  8.76991979e-02 -2.49728772e-03 -1.60984324e-02
 -2.45239562e-02 -1.95714546e+00  1.95019136e+00 -1.12110028e-03
  8.60892491e-04  1.66227385e-03 -6.80821985e-04  3.55380963e-04
 -2.65881983e-03 -3.71639192e-04 -3.12831106e-03 -4.31738807e-03
 -2.10845565e-03  1.02224338e-03 -8.49784011e-04 -1.33626662e-02
  3.67294981e-04  2.40706928e-02  2.81628887e-13 -8.23850849e-03
  8.23850849e-03 -1.51684915e-12  5.30949577e-04 -1.15767649e-04
 -4.15181926e-04  1.52322599e-13  5.74082100e-04  1.95933258e-03
  8.34633979e-04 -3.32642642e-04  1.16899495e-03  3.98237102e-04
  1.45853136e-03  1.93967630e-03  1.11386984e-03  2.37360294e-03
 -1.59623176e-03 -9.89208674e-03 -1.00145406e-03 -4.49851188e-03
  3.02870388e-02 -9.50043615e-03 -1.52866367e-02  0.00000000e+00]
mlr.intercept_: 0.22036957304871074


In [10]:
# Report the model's score() on the training split and on the held-out test split.
# NOTE(review): the test value printed here matches the R2 that r2_score reports further
# down in the notebook, so score() appears to be R^2 - confirm.
print("Training set score: {:.5f}".format(mlr_model.score(X_train, y_train)))
print("Test set score: {:.5f}".format(mlr_model.score(X_test, y_test)))


Training set score: 0.01582
Test set score: 0.01381


In [11]:
import math
from sklearn.metrics import explained_variance_score, mean_absolute_error, r2_score, mean_squared_error

# Evaluate the linear model on the held-out test split.
# y was min-max scaled when it was built, so MAE and RMSE are in scaled units.
predictions = mlr_model.predict(X_test)

r2 = r2_score(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
score = explained_variance_score(y_test, predictions)
rmse = math.sqrt(
    mean_squared_error(y_test, predictions)
)

print(
    "score = {:.5f} | MAE = {:.5f} | RMSE = {:.5f} | R2 = {:.5f}".format(
        score, mae, rmse, r2
    )
)

score = 0.01398 | MAE = 0.04416 | RMSE = 0.06084 | R2 = 0.01381


In [12]:
# Coefficient table for the linear model. The training split is passed (into the parameters
# named X_test / y_test) so the statistics are computed on the rows the model was fit on.
getPValues(mlr_model, X_train, y_train)

  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = (x >= _b) & cond0


Unnamed: 0,Coeff,SE,t val,Probs
Intercept,0.2204,0.006,37.271,0.0
loan_amnt,-0.0397,0.002,-19.647,0.0
funded_amnt,-0.0397,0.002,-19.647,0.0
installment,0.0869,0.005,16.214,0.0
annual_inc,0.1453,0.022,6.55,0.0
dti,-0.028,0.011,-2.51,0.012061
revol_bal,0.0877,0.014,6.49,0.0
delinq_2yrs,-0.0025,0.006,-0.418,0.675804
open_acc,-0.0161,0.002,-6.575,0.0
pub_rec,-0.0245,0.013,-1.96,0.050013


# LASSO Regression with GridSearchCV

In [13]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

# These are just example parameter settings. You can change these parameters or add others.
# The grid search uses a scoring method of R2. You can change that to another scoring method.
# NOTE(review): the recorded run selected the largest alpha in this grid (1e-07), which
# suggests the best value may lie above the searched range - consider extending the grid.

parameters = {'alpha' : [0.000000001, 0.00000001, 0.0000001]
             }

print("Parameter grid:\n{}".format(parameters),'\n')

grid =  GridSearchCV(Lasso(max_iter=10000), parameters, cv=5, return_train_score=True, scoring='r2', n_jobs=-1)

# Perform grid search CV on the training data. The CV algorithm divides this into training and validation folds.
# fit() returns the search object itself, so ls_model and grid refer to the same fitted search.
ls_model = grid.fit(X_train, y_train)

print('best params ',ls_model.best_params_,'\n')
print('best estimator ',ls_model.best_estimator_,'\n')
print('best validation score ', ls_model.best_score_,'\n')
print('scoring method ', ls_model.scorer_)

# The search was built with scoring='r2', so score() reports R2 here - this is a regression
# problem and the value is not a classification accuracy.
print("Test set R2 score: {:.7f}".format(ls_model.score(X_test, y_test)))

#saveModel('ls_model', ls_model)

Parameter grid:
{'alpha': [1e-09, 1e-08, 1e-07]} 

best params  {'alpha': 1e-07} 

best estimator  Lasso(alpha=1e-07, copy_X=True, fit_intercept=True, max_iter=10000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False) 

best validation score  0.015118202282069726 

scoring method  make_scorer(r2_score)
Test set accuracy score: 0.0138347


  positive)
