In [1]:
# Load general utilities
# ----------------------
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.axes as ax
import datetime
import numpy as np
import pickle
import time
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
import joblib
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, recall_score, precision_score, fbeta_score, classification_report, plot_confusion_matrix


def minMaxScaleContinuous(continuousList, df=None):
    """Min-max scale the selected columns and return them as a new DataFrame.

    Parameters
    ----------
    continuousList : list
        Column labels to select and scale.
    df : pandas.DataFrame, optional
        Frame to read the columns from. When omitted, the module-level
        ``data`` frame is used, which is what the original version always did.

    Returns
    -------
    pandas.DataFrame
        The scaled values, carrying the column labels and the index of the
        selected columns.

    NOTE(review): a fresh MinMaxScaler is fit on every row of the frame it is
    given, so calling this on the full dataset before a train/test split lets
    the test rows influence the scaling.
    """
    # Fall back to the notebook-global frame so existing one-argument calls keep working
    frame = data if df is None else df
    selected = frame[continuousList]
    scaled_values = MinMaxScaler().fit_transform(selected)
    return pd.DataFrame(scaled_values,
                        columns=list(selected.columns),
                        index=selected.index)

def createDiscreteDummies(discreteList, df=None):
    """Dummy-encode the selected columns with ``pd.get_dummies``.

    Parameters
    ----------
    discreteList : list
        Column labels to encode.
    df : pandas.DataFrame, optional
        Frame to read the columns from. When omitted, the module-level
        ``data`` frame is used, which is what the original version always did.

    Returns
    -------
    pandas.DataFrame
        One indicator column per level, named ``<column>::<level>``. Because
        ``dummy_na=True`` a ``<column>::nan`` indicator is included as well,
        and because ``drop_first=False`` no level is dropped.
    """
    # Fall back to the notebook-global frame so existing one-argument calls keep working
    frame = data if df is None else df
    return pd.get_dummies(frame[discreteList], dummy_na = True, prefix_sep = "::", drop_first = False)

def saveModel(filename, model):
    """Write ``model`` to disk at ``filename`` using ``joblib.dump``."""
    joblib.dump(value=model, filename=filename)
 
 
def loadModel(filename):
    """Read an object back from ``filename`` with ``joblib.load`` and return it."""
    restored = joblib.load(filename)
    return restored

### Read in Data

In [2]:
# Read the data frame and the feature/column lists from the pickle file.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only point this at a pickle file that comes from a trusted source.
print("Reading in pickle file...")

# The context manager closes the file handle as soon as loading finishes
# (the bare open(...) call left it open until garbage collection).
with open("../Data/clean_data.pickle", "rb") as pickle_file:
    data, discrete_features, continuous_features, ret_cols = pickle.load(pickle_file)

Reading in pickle file...


In [3]:
# Outcome column: True where loan_status is "Charged Off" or "Default"
data["default"] = data["loan_status"].isin(
    ["Charged Off", "Default"]
)

In [4]:
# Create a feature for the length of a person's credit history, in months, at
# the time the loan is issued.
# A month is spelled out as 2,629,746 seconds (30.436875 days = 365.2425 / 12),
# the average month length that np.timedelta64(1, 'M') used to be converted to.
# Newer pandas versions reject the ambiguous 'M' unit in timedelta arithmetic,
# so the explicit Timedelta keeps the same values while staying runnable.
# assumes issue_d and earliest_cr_line are datetime columns — TODO confirm
average_month = pd.Timedelta(seconds=2629746)
data['cr_hist'] = (data.issue_d - data.earliest_cr_line) / average_month

# Guard the append so that re-running this cell does not list 'cr_hist' twice
# (a duplicate entry would select the column twice when building the features).
if 'cr_hist' not in continuous_features:
    continuous_features.append('cr_hist')

# define the discrete features you want to use in modeling.
# if you want to use all the discrete features, just set discrete_features_touse = discrete_features
discrete_features_touse = discrete_features
# discrete_features_touse =['purpose', 'term', 'verification_status', 'emp_length', 'home_ownership']

# define the continuous features to use in modeling
# if you want to use all the continuous features, just set the continuous_features_touse = continuous_features
continuous_features_touse = continuous_features
# continuous_features_touse = ['loan_amnt', 'funded_amnt','installment','annual_inc','dti','revol_bal','delinq_2yrs','open_acc',
#  'pub_rec','fico_range_high','fico_range_low','revol_util','cr_hist']

### Split training and test data

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Build the predictor dataframe X from the continuous features plus dummy
# columns for the categorical features.

# Unscaled alternative for the continuous features:
#X_continuous = data[continuous_features_touse]

# Continuous features, rescaled with the MinMaxScaler helper defined above
X_continuous = minMaxScaleContinuous(continuous_features_touse)

# Numeric dummy columns for the discrete features used in modeling
X_discrete = createDiscreteDummies(discrete_features_touse)

# Put the two groups of columns side by side in one dataframe
X = pd.concat(objs=[X_continuous, X_discrete], axis="columns")

# Target variable
target_col = 'default'
y = data[target_col]

# Hold out 40% of the transformed rows as the test set (fixed random_state)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

# Class counts for the full population and for each side of the split
for heading, labels in (("Population:\n", y), ("Train:\n", y_train), ("Test:\n", y_test)):
    print(heading, labels.value_counts())

Population:
 False    613255
True     164011
Name: default, dtype: int64
Train:
 False    367757
True      98602
Name: default, dtype: int64
Test:
 False    245498
True      65409
Name: default, dtype: int64


### Read in model

In [6]:
# Load the previously saved model from the file 'lr_model' (relative to the
# working directory) through the loadModel helper.
model = loadModel('lr_model')

#### Create array of grades

Grades A–G are represented as the numeric values 0–6 (A = 0, B = 1, …, G = 6).

In [11]:
# Collapse the grade dummy columns into one integer per test row:
# B -> 1, C -> 2, ..., G -> 6; rows with none of those columns set stay at 0.
grade = sum(
    X_test['grade::' + letter] * weight
    for weight, letter in enumerate('BCDEFG', start=1)
)
grade = grade.values
print(grade)

[0 3 3 ... 3 1 1]


#### Create array of scores (probabilities)

In [12]:
# Score each test row with the model and keep only column 1 of predict_proba
prob = model.predict_proba(X_test)[:, 1]
print(prob)

[9.99901627e-01 5.80504281e-04 1.86538550e-04 ... 1.00000000e+00
 9.97840278e-01 8.87630372e-04]


#### Rank Correlation Using Spearman's Correlation

Use the following function to compute the rank correlation between the Lending Club grades and the probability of default predicted by your model.

You can call the function with those values using your test dataset.

In [15]:
from scipy.stats import spearmanr

def getCorrelation (grades, scores, alpha=0.05):
    """Print Spearman's rank correlation between ``grades`` and ``scores``.

    Parameters
    ----------
    grades, scores : array-like
        The two samples handed to ``scipy.stats.spearmanr``.
    alpha : float, optional
        Significance level the p-value is compared against. Defaults to 0.05,
        the value that used to be hard-coded.

    Returns
    -------
    None
        The coefficient and the verdict are printed, not returned.
    """
    coef, p = spearmanr(grades, scores)
    print('Spearmans correlation coefficient: %.3f' % coef)

    # NaN compares False against everything, so without this guard an
    # undefined result (e.g. a constant input) would fall through to the
    # "correlated" branch below.
    if np.isnan(coef) or np.isnan(p):
        print('Correlation is undefined (is one input constant?) p=%.3f' % p)
    elif p > alpha:
        print('Samples are uncorrelated (fail to reject H0) p=%.3f' % p)
    else:
        print('Samples are correlated (reject H0) p=%.3f' % p)

In [16]:
# Rank-correlate the numeric grade array with the model's scores; both arrays
# were built from the test set (X_test).
getCorrelation(grade, prob)

Spearmans correlation coefficient: 0.126
Samples are correlated (reject H0) p=0.000
