In [None]:
# Import all the packages that we need
import pandas as pd
import numpy as np
import dask.dataframe as dd
import coiled
import joblib
from joblib import dump, load
pd.set_option('display.max_rows', 2000)
pd.set_option('display.max_columns', 500)
from sklearn.model_selection import train_test_split
from dask_ml.preprocessing import Categorizer, DummyEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import roc_auc_score
from collections import Counter
import matplotlib.pyplot as plt

In [None]:
# Remove NAs
def remove_na(dd):
    """
    This function removes rows with NAs in the required columns and rows
    whose annual_inc is an outlier.

    params: dd - a dask dataframe
    returns: dd - a dask dataframe with no NAs in the required columns and
             only rows where annual_inc is below 2e7
    """
    required_columns = ['annual_inc',
                        'dti',
                        'pub_rec',
                        'pub_rec_bankruptcies',
                        'int_rate',
                        'loan_amnt',
                        'grade',
                        'sub_grade',
                        'verification_status',
                        'term'
                        ]
    dd = dd.dropna(subset=required_columns)

    # Filter the rows themselves. Assigning the filtered series back to the
    # column keeps every row and leaves NaN where the income was an outlier,
    # which re-introduces the NAs that were just dropped.
    dd = dd[dd.annual_inc < 2e7]
    return dd

In [None]:
# Function to filter out grades F and G
def filter_grade(dd):
    """
    This function keeps only loans graded A to E (grades F and G are dropped)

    params: dd - a dask dataframe
    returns: a dask dataframe restricted to rows whose grade is A, B, C, D or E
    """
    kept_grades = ['A', 'B', 'C', 'D', 'E']
    grade_mask = dd['grade'].isin(kept_grades)
    return dd[grade_mask]

In [None]:
def filter_loan_status(dd):
    """
    This function keeps only loans whose loan_status is 'Charged Off' or 'Fully Paid'

    params: dd - a dask dataframe
    returns: a dask dataframe restricted to those two statuses
    """
    final_statuses = ['Charged Off', 'Fully Paid']
    status_mask = dd['loan_status'].isin(final_statuses)
    return dd[status_mask]

In [None]:
# Feature engineer days_since_earliest_credit
def get_days_first_credit(dd):
    """
    This function adds the column days_since_first_credit: the number of whole
    days between earliest_cr_line and issue_d.

    params: dd - a dask dataframe with datetime columns issue_d and earliest_cr_line
    returns: dd - the same dataframe with the new column added
    """
    credit_age = dd['issue_d'] - dd['earliest_cr_line']
    dd['days_since_first_credit'] = credit_age.dt.days
    return dd

In [None]:
# Function to clean emp_length
def clean_emp_length(dd):
    """
    This function converts emp_length from text to an integer number of years

    '< 1 year' becomes 0, the characters '<', '+', ' ' and the letters of
    'years' are stripped from both ends of the other values, and missing
    values become -1.

    params: dd - a dask dataframe
    returns: dd - the same dataframe with emp_length replaced by integers
    """
    years = (
        dd['emp_length']
        .replace(to_replace='< 1 year', value='0')
        .str.strip('<+ years')
        .fillna('-1')
        .astype(int)
    )
    dd['emp_length'] = years

    return dd

In [None]:
# Function to separate data by term
def separate_by_term(dd, max_issue_year_36=2015, max_issue_year_60=2013):
    """
    This function splits the loans into 36-month and 60-month frames

    The term column is converted to an integer number of months and each
    frame is restricted to loans issued no later than its cutoff year.
    NOTE(review): the default cutoffs presumably leave enough time for a loan
    to reach a final status before the data ends - confirm.

    params: dd - a dask dataframe with a text term column and a datetime issue_d column
            max_issue_year_36 - last issue year kept for 36-month loans
            max_issue_year_60 - last issue year kept for 60-month loans
    returns: (df_3, df_5) - the 36-month and 60-month dataframes
    """
    # Use assign so the caller's frame keeps its text term column; overwriting
    # it in place made a second call on the same frame fail on `.str`.
    dd = dd.assign(term=dd.term.str.strip(' months').astype(int))

    df_3 = dd[dd.term == 36]
    df_5 = dd[dd.term == 60]

    df_3 = df_3[df_3['issue_d'].dt.year <= max_issue_year_36]
    df_5 = df_5[df_5['issue_d'].dt.year <= max_issue_year_60]

    return df_3, df_5

In [None]:
# Function to select all the features that we want
def select_features(dd):
    """
    This function keeps only the columns used for modelling, plus the target

    params: dd - a dask dataframe
    returns: a dask dataframe with only the selected columns, in the order listed below

    Notes:
    1. grade is not selected since the information is already present in sub_grade
    2. open_acc is not selected since we believe that feature is updated throughout time
    3. emp_title is dropped since we cannot clean it
    4. Zip code is dropped since there are too many values and the state gives enough information
    NOTE(review): get_days_first_credit adds 'days_since_first_credit', but that
    column is not in this list - confirm it is meant to be left out.
    """
    selected_columns = [
        # Categorical features (to be dummified / binarized)
        'addr_state',
        'annual_inc',
        'disbursement_method',
        'dti',
        'emp_length',  # integer years after clean_emp_length
        'fico_range_high',
        'fico_range_low',
        'home_ownership',
        'initial_list_status',
        'installment',
        'int_rate',
        'loan_amnt',
        'pub_rec',
        'pub_rec_bankruptcies',
        'purpose',
        'sub_grade',  # to be dummified or ordinal encoded
        'verification_status',
        # Target
        'loan_status',
    ]

    return dd[selected_columns]



In [None]:
def encode_categorical(dd):
    """
    This function converts the categorical columns to category dtype and then
    dummy-encodes them.

    'sub_grade' is not part of the encoded columns.

    params: dd - a dask dataframe
    returns: dd - the dataframe returned by the DummyEncoder
    """
    categorical_columns = [
        'addr_state',
        'disbursement_method',
        'emp_length',
        'home_ownership',
        'initial_list_status',
        'purpose',
        'verification_status',
    ]

    categorizer = Categorizer(columns=list(categorical_columns))
    dd = categorizer.fit_transform(dd)

    dummy_encoder = DummyEncoder(columns=list(categorical_columns))
    dd = dummy_encoder.fit_transform(dd)

    return dd

In [None]:
def scale_features(dd):
    """
    This function fits a StandardScaler on dd and returns the scaled data

    params: dd - a dask dataframe
    returns: the output of StandardScaler.fit_transform on dd
    """
    return StandardScaler().fit_transform(dd)

In [None]:
# Display the installed version of the `distributed` package.
import distributed
distributed.__version__

In [None]:
# Start a Coiled cluster with 10 workers (the commented line names a specific
# cluster instead).
cluster = coiled.Cluster(n_workers=10)
# cluster = coiled.Cluster(name='DarishSakeesing-855fcb7f-8')

# Attach a Dask client to the cluster and print its dashboard link.
# NOTE(review): the later joblib 'dask' backend blocks presumably run on this client - confirm.
from dask.distributed import Client
client = Client(cluster)
print('Dashboard:', client.dashboard_link)

In [None]:
# Lazily read the accepted-loans CSV from S3 into a Dask dataframe.
# - 'desc', 'id' and 'sec_app_earliest_cr_line' are forced to object dtype
#   (NOTE(review): presumably to avoid dtype-inference problems - confirm).
# - 'issue_d' and 'earliest_cr_line' are parsed as dates; get_days_first_credit
#   subtracts them and separate_by_term filters on issue_d's year.
# - The bucket is read anonymously, in 16 MiB blocks.
raw_data = dd.read_csv(
    "s3://lending-club2/accepted_2007_to_2018Q4.csv",
    dtype={'desc': 'object', 
            'id': 'object',
            'sec_app_earliest_cr_line': 'object'}, 
    parse_dates = ['issue_d','earliest_cr_line'],
    low_memory=False,
    storage_options={"anon": True},
    blocksize="16 MiB",
)

In [None]:
# Apply the cleaning steps in order, printing a step counter after each one,
# then split the cleaned data into 36-month and 60-month loans.
cleaning_steps = [
    remove_na,
    filter_grade,
    filter_loan_status,
    get_days_first_credit,
    clean_emp_length,
]

print('0')
for step_number, cleaning_step in enumerate(cleaning_steps, start=1):
    raw_data = cleaning_step(raw_data)
    print(str(step_number))

df_3, df_5 = separate_by_term(raw_data)
print('6')

In [None]:
# Keep only the modelling columns (plus loan_status) for both loan terms.
df_3, df_5 = select_features(df_3), select_features(df_5)

In [None]:
# Materialize the selected columns. These copies are taken before
# loan_status is popped and before any encoding or scaling; save_df_3 is used
# later to look up the rows behind individual predictions.
save_df_3 = df_3.compute()
save_df_5 = df_5.compute()

In [None]:
# Replace the index with a fresh 0..n-1 range, discarding the old index.
save_df_3 = save_df_3.reset_index(drop=True)
save_df_5 = save_df_5.reset_index(drop=True)

In [None]:
# Split off the target column for each loan term.
# NOTE(review): pop is expected to remove loan_status from df_3/df_5, so this
# cell cannot be re-run without re-running the cells above - confirm.
y_3 = df_3.pop('loan_status')
y_5 = df_5.pop('loan_status')

In [None]:
# Columns that are converted to category dtype and then dummy-encoded.
# NOTE(review): 'sub_grade' is returned by select_features but is not listed
# here, so it is passed on un-encoded - confirm that this is intended.
categorical_columns = ['addr_state', 'disbursement_method', 'emp_length', 'home_ownership', 'initial_list_status', 'purpose', 'verification_status']

# 3-year loans
ce_3 = Categorizer(columns=list(categorical_columns))
df_3 = ce_3.fit_transform(df_3)
de_3 = DummyEncoder(columns=list(categorical_columns))
df_3 = de_3.fit_transform(df_3)

# 5-year loans: use their own encoders, so that ce_3/de_3 stay fitted on the
# 3-year data and ce_5/de_5 are the ones fitted on the 5-year data.
ce_5 = Categorizer(columns=list(categorical_columns))
df_5 = ce_5.fit_transform(df_5)
de_5 = DummyEncoder(columns=list(categorical_columns))
df_5 = de_5.fit_transform(df_5)

In [None]:
# df_3 = encode_categorical(df_3)
# df_5 = encode_categorical(df_5)

In [None]:
# df_3 = scale_features(df_3)
# df_5 = scale_features(df_5)

In [None]:
# Standardize the features of each loan term with its own scaler.
sc_3 = StandardScaler()
df_3 = sc_3.fit_transform(df_3)
sc_5 = StandardScaler()
# Store the scaled frame in df_5 (not sc_5), so the 5-year features are
# actually scaled and sc_5 stays bound to the fitted scaler.
df_5 = sc_5.fit_transform(df_5)

In [None]:
# Encode the loan_status target of each loan term as integer labels,
# keeping one fitted encoder per term for reading the class names back.
le_3, le_5 = LabelEncoder(), LabelEncoder()
y_3 = le_3.fit_transform(y_3)
y_5 = le_5.fit_transform(y_5)

In [None]:
# Materialize the feature frames and give them a fresh 0..n-1 index.
df_3 = df_3.compute()
df_3 = df_3.reset_index(drop=True)

df_5 = df_5.compute()
df_5 = df_5.reset_index(drop=True)

In [None]:
# 80/20 shuffled train/test split for each loan term.
# The seed is fixed so every run produces the same partition: a model saved
# below is later reloaded and evaluated on X_3_test, which is only a valid
# hold-out set if the split is reproducible.
with joblib.parallel_backend('dask', n_jobs=-1):
    X_3_train, X_3_test, y_3_train, y_3_test = train_test_split(df_3, y_3.compute(), test_size=0.2, shuffle=True, random_state=42)
    X_5_train, X_5_test, y_5_train, y_5_test = train_test_split(df_5, y_5.compute(), test_size=0.2, shuffle=True, random_state=42)

# Null Model

The models we build treat *charged_off* as the negative class and *fully_paid* as the positive class. 

Let's see the performance of a model that classifies everything as positive, i.e., the null model.

## 3-YEAR NULL Model

In [None]:
# Null model: predict label 1 for every 3-year test row.
y_3_pred_null = np.ones(len(y_3_test), dtype=int)

In [None]:
# Confusion matrix of the 3-year null model.
# labels=[0, 1] pins the matrix to 2x2 in a fixed order (as the LDA confusion
# matrices below do), so the class names always line up with rows and columns.
class_names_3 = le_3.classes_.compute()
cm_3_null = confusion_matrix(y_3_test, y_3_pred_null, labels=[0, 1])
cm_3_null_df = pd.DataFrame(cm_3_null, columns=[f'Pred_{label}' for label in class_names_3], index=[f'True_{label}' for label in class_names_3])
cm_3_null_df


In [None]:
# Precision of the class in row/column 1: correct predictions of that class
# divided by all predictions of that class.
tp_3_null = cm_3_null_df.iloc[1, 1]
fp_3_null = cm_3_null_df.iloc[0, 1]
precision_full_paid = tp_3_null / (tp_3_null + fp_3_null)
print('Precision of Fully Paid (Null Model):\n' + str(precision_full_paid))


## 5-YEAR NULL Model

In [None]:
# Null model for 5-year loans: predict label 1 for every test row.
y_5_pred_null = np.ones(len(y_5_test)).astype(int)
# labels=[0, 1] pins the matrix to 2x2 in a fixed order (as the LDA confusion
# matrices below do), so the class names always line up with rows and columns.
class_names_5 = le_5.classes_.compute()
cm_5_null = confusion_matrix(y_5_test, y_5_pred_null, labels=[0, 1])
cm_5_null_df = pd.DataFrame(cm_5_null, columns=[f'Pred_{label}' for label in class_names_5], index=[f'True_{label}' for label in class_names_5])
cm_5_null_df

In [None]:
# Precision of the class in row/column 1: correct predictions of that class
# divided by all predictions of that class.
tp_5_null = cm_5_null_df.iloc[1, 1]
fp_5_null = cm_5_null_df.iloc[0, 1]
precision_full_paid = tp_5_null / (tp_5_null + fp_5_null)
print('Precision of Fully Paid (Null Model):\n' + str(precision_full_paid))

# Linear Discriminant Analysis

## ROC_AUC Metric

In [None]:
# Constructing the priors list: 101 candidate pairs [p, 1 - p] with p going
# from 0.00 to 1.00 in steps of 0.01.
# NOTE(review): the grid includes the degenerate pairs [0.0, 1.0] and
# [1.0, 0.0] - confirm that these are wanted.
priors = [[pct / 100, (100 - pct) / 100] for pct in range(101)]

In [None]:
def custom_scoring(estimator, X, y):
    """
    Scorer for GridSearchCV: weighted ROC AUC of the estimator's predictions

    The score is computed from estimator.predict(X), i.e. from predicted
    labels rather than from probabilities or decision scores.
    NOTE(review): for a two-class target, ROC AUC on hard labels should equal
    balanced accuracy, which would make this scorer equivalent to the
    'balanced_accuracy' search below - confirm this is intended.

    params: estimator - a fitted estimator with a predict method
            X - features to predict on
            y - true labels
    returns: the value of roc_auc_score(y, predictions, average='weighted')
    """
    predicted_labels = estimator.predict(X)
    return roc_auc_score(y, predicted_labels, average='weighted')

In [None]:
# Grid-search the LDA class priors for each loan term, scoring every
# candidate in `priors` with custom_scoring under 3-fold cross-validation.
params = {'priors': priors}
lda_3 = LinearDiscriminantAnalysis()
lda_5 = LinearDiscriminantAnalysis()
print('Initialized estimators')


grid_search_3_roc = GridSearchCV(estimator=lda_3, param_grid=params, scoring=custom_scoring, n_jobs=-1, cv=3, verbose=5)
grid_search_5_roc = GridSearchCV(estimator=lda_5, param_grid=params, scoring=custom_scoring, n_jobs=-1, cv=3, verbose=5)
print('Initialized grid')

# Run both searches through joblib's 'dask' backend; the four training objects
# are passed via `scatter`.
# NOTE(review): presumably so they are sent to the workers once up front - confirm.
with joblib.parallel_backend('dask', n_jobs=-1, scatter=[X_3_train, y_3_train, X_5_train, y_5_train]):
    print('Entered parallel backend')
    grid_search_3_roc.fit(X_3_train, y_3_train)
    print('Finished 3, Starting 5')
    grid_search_5_roc.fit(X_5_train, y_5_train)

In [None]:
# Refit LDA on the 3-year training set with the priors selected by the ROC
# grid search, then build a labelled confusion matrix on the test set.
best_priors_3 = grid_search_3_roc.best_params_['priors']
lda_3 = LinearDiscriminantAnalysis(priors=best_priors_3)
lda_3.fit(X_3_train, y_3_train)

y_3_pred = lda_3.predict(X_3_test)
cm_3 = confusion_matrix(y_3_test, y_3_pred, labels=[0,1])

pred_names_3 = [f'Pred_{label}' for label in le_3.classes_.compute()]
true_names_3 = [f'True_{label}' for label in le_3.classes_.compute()]
cm_3_df = pd.DataFrame(cm_3, columns=pred_names_3, index=true_names_3)

In [None]:
# Refit LDA on the 5-year training set with the priors selected by the ROC
# grid search, then build a labelled confusion matrix on the test set.
best_priors_5 = grid_search_5_roc.best_params_['priors']
lda_5 = LinearDiscriminantAnalysis(priors=best_priors_5)
lda_5.fit(X_5_train, y_5_train)

y_5_pred = lda_5.predict(X_5_test)
cm_5 = confusion_matrix(y_5_test, y_5_pred, labels=[0,1])

pred_names_5 = [f'Pred_{label}' for label in le_5.classes_.compute()]
true_names_5 = [f'True_{label}' for label in le_5.classes_.compute()]
cm_5_df = pd.DataFrame(cm_5, columns=pred_names_5, index=true_names_5)

## Confusion Matrix for LDA 3-YEAR Loans (ROC_AUC)

In [None]:
cm_3_df

## Confusion Matrix for LDA 5-YEAR Loans (ROC_AUC)

In [None]:
cm_5_df

In [None]:
# save model
# dump(lda_3, 'lda_3_ROC.joblib')
# dump(lda_5, 'lda_5_ROC.joblib')

## Balanced Accuracy Metric

In [None]:
# Grid-search the LDA class priors for each loan term, scoring every
# candidate in `priors` with balanced accuracy under 3-fold cross-validation.
# lda_3 / lda_5 are rebound here, replacing the models from the ROC section.
params = {'priors': priors}
lda_3 = LinearDiscriminantAnalysis()
lda_5 = LinearDiscriminantAnalysis()
print('Initialized estimators')


grid_search_3 = GridSearchCV(estimator=lda_3, param_grid=params, scoring='balanced_accuracy', n_jobs=-1, cv=3, verbose=5)
grid_search_5 = GridSearchCV(estimator=lda_5, param_grid=params, scoring='balanced_accuracy', n_jobs=-1, cv=3, verbose=5)
print('Initialized grid')

# Run both searches through joblib's 'dask' backend; the four training objects
# are passed via `scatter`.
# NOTE(review): presumably so they are sent to the workers once up front - confirm.
with joblib.parallel_backend('dask', n_jobs=-1, scatter=[X_3_train, y_3_train, X_5_train, y_5_train]):
    print('Entered parallel backend')
    grid_search_3.fit(X_3_train, y_3_train)
    print('Finished 3, Starting 5')
    grid_search_5.fit(X_5_train, y_5_train)

In [None]:
# Refit LDA on the 3-year training set with the priors selected by the
# balanced-accuracy grid search, then build a labelled confusion matrix on
# the test set.
best_priors_3 = grid_search_3.best_params_['priors']
lda_3 = LinearDiscriminantAnalysis(priors=best_priors_3)
lda_3.fit(X_3_train, y_3_train)

y_3_pred = lda_3.predict(X_3_test)
cm_3 = confusion_matrix(y_3_test, y_3_pred, labels=[0,1])

pred_names_3 = [f'Pred_{label}' for label in le_3.classes_.compute()]
true_names_3 = [f'True_{label}' for label in le_3.classes_.compute()]
cm_3_df = pd.DataFrame(cm_3, columns=pred_names_3, index=true_names_3)

In [None]:
# Refit LDA on the 5-year training set with the priors selected by the
# balanced-accuracy grid search, then build a labelled confusion matrix on
# the test set.
best_priors_5 = grid_search_5.best_params_['priors']
lda_5 = LinearDiscriminantAnalysis(priors=best_priors_5)
lda_5.fit(X_5_train, y_5_train)

y_5_pred = lda_5.predict(X_5_test)
cm_5 = confusion_matrix(y_5_test, y_5_pred, labels=[0,1])

pred_names_5 = [f'Pred_{label}' for label in le_5.classes_.compute()]
true_names_5 = [f'True_{label}' for label in le_5.classes_.compute()]
cm_5_df = pd.DataFrame(cm_5, columns=pred_names_5, index=true_names_5)

## Confusion Matrix for LDA 3-YEAR Loans (Balanced_Accuracy)

In [None]:
cm_3_df

## Confusion Matrix for LDA 5-YEAR Loans (Balanced_Accuracy)

In [None]:
cm_5_df

In [None]:
# Best cross-validated score of the 3-year search that used custom_scoring.
grid_search_3_roc.best_score_

In [None]:
# Best cross-validated score of the 3-year search that used balanced accuracy.
grid_search_3.best_score_

In [None]:
# Save the models currently bound to lda_3 / lda_5 (the ones refit with the
# balanced-accuracy priors) to the working directory.
dump(lda_3, 'lda_3_balanced.joblib')
dump(lda_5, 'lda_5_balanced.joblib')

# INVESTIGATING ERRORS

In [None]:
# Reload the 3-year balanced-accuracy model written by the dump cell above.
# joblib.load unpickles the file, so only load files this notebook produced.
lda_3 = load('lda_3_balanced.joblib')

In [None]:
# Sort every 3-year test row into one of four buckets by comparing the model's
# prediction with the true label, storing the row's index label so the
# original record can be looked up afterwards.
false_positive, false_negative = [], []
true_positive, true_negative = [], []

predictions = lda_3.predict(X_3_test)
for index, pred, lbl in zip(X_3_test.index, predictions, y_3_test):
    if pred == lbl:
        # Correct prediction: label 0 is the negative class.
        if pred == 0:
            true_negative.append(index)
        else:
            true_positive.append(index)
    elif pred == 1 and lbl == 0:
        false_positive.append(index)
    else:
        false_negative.append(index)

In [None]:
len(false_positive)

In [None]:
# Unencoded 3-year rows whose index label is in the false-positive bucket.
fp_row_mask = save_df_3.index.isin(false_positive)
fp_df = save_df_3[fp_row_mask]

In [None]:
fp_df

In [None]:
# Unencoded 3-year rows whose index label is in the true-negative bucket.
tn_row_mask = save_df_3.index.isin(true_negative)
tn_df = save_df_3[tn_row_mask]

In [None]:
tn_df

In [None]:
# Side-by-side histograms of sub_grade: false positives vs. true negatives.
fig, axes = plt.subplots(1, 2, figsize=(15, 7))
ax_fp, ax_tn = axes

ax_fp.title.set_text('False Positive')
fp_df.sub_grade.sort_values().hist(ax=ax_fp)

ax_tn.title.set_text('True Negative')
tn_df.sub_grade.sort_values().hist(ax=ax_tn)

In [None]:
# Side-by-side histograms of addr_state: false positives vs. true negatives.
fig, axes = plt.subplots(1, 2, figsize=(15, 7))
ax_fp, ax_tn = axes

ax_fp.title.set_text('False Positive')
fp_df.addr_state.sort_values().hist(ax=ax_fp)

ax_tn.title.set_text('True Negative')
tn_df.addr_state.sort_values().hist(ax=ax_tn)

In [None]:
# Median annual income of the false-positive rows.
fp_df.annual_inc.median()

In [None]:
# Median annual income of the true-negative rows.
tn_df.annual_inc.median()

In [None]:
# Side-by-side box plots of annual_inc: false positives vs. true negatives.
fig, axes = plt.subplots(1, 2, figsize=(15, 7))
ax_fp, ax_tn = axes

ax_fp.title.set_text('False Positive')
fp_df.annual_inc.sort_values().plot.box(ax=ax_fp)

ax_tn.title.set_text('True Negative')
tn_df.annual_inc.sort_values().plot.box(ax=ax_tn)