In [1]:
# Import all the packages that we need
import pandas as pd
import numpy as np
import dask.dataframe as dd
import coiled
import joblib
pd.set_option('display.max_rows', 500)
from sklearn.model_selection import train_test_split
from dask_ml.preprocessing import Categorizer, DummyEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [2]:
# Remove NAs
def remove_na(dd):
    """
    This function removes NAs and outliers in annual_inc

    params: dd - a dask dataframe
    returns: dd - a dask dataframe with only grades we need
    """
    dd = dd.dropna(subset=['annual_inc',
                            'dti',
                            'pub_rec',
                            'pub_rec_bankruptcies',
                            'int_rate',
                            'loan_amnt',
                            'grade',
                            'sub_grade',
                            'verification_status',
                            'term'
                            ])
    dd.annual_inc = dd.annual_inc[dd.annual_inc < 2e7]
    return dd

In [3]:
# Function to filter out grades F and G
def filter_grade(dd):
    """
    This function filters out functions F and G

    params: dd - a dask dataframe
    returns: dd - a dask dataframe with only grades we need
    """
    dd = dd[dd['grade'].isin(['A', 'B', 'C', 'D', 'E'])]
    return dd

In [4]:
def filter_loan_status(dd):
    dd = dd[dd['loan_status'].isin(['Charged Off','Fully Paid'])]

    return dd

In [5]:
# Feature engineer days_since_earliest_credit
def get_days_first_credit(dd):
    """
    This function adds a new column that holds info on how many days has it been since first credit to loan issuance.

    params: dd - a dask dataframe
    returns: dd - a dask dataframe with the new column added
    """
    dd['days_since_first_credit'] = (dd['issue_d'] - dd['earliest_cr_line']).dt.days
    
    return dd

In [6]:
# Function to clean emp_length
def clean_emp_length(dd):
    """
    This function cleans emp_length

    params: dd - a dask dataframe
    returns: dd - a dask dataframe with the new column added
    """
    dd.emp_length = dd.emp_length.replace(to_replace='< 1 year', value='0')
    dd.emp_length = dd.emp_length.str.strip('<+ years')
    dd.emp_length = dd.emp_length.fillna('-1')
    dd.emp_length = dd.emp_length.astype(int)

    return dd

In [7]:
# Function to separate data by term
def separate_by_term(dd):
    dd.term = dd.term.str.strip(' months').astype(int)
     
    df_3 = dd[dd.term == 36]
    df_5 = dd[dd.term == 60]

    df_3 = df_3[df_3['issue_d'].dt.year <= 2015]
    df_5 = df_5[df_5['issue_d'].dt.year <= 2013]

    return df_3,df_5  

In [8]:
# Function to select all the features that we want
def select_features(dd):
    """
    This function selects only the features that we want for future modelling

    params: dd - a dask dataframe
    returns: dd - a dask dataframe with only the features selected

    Notes:
    1. We are not selecting grade since the information is already present in sub_grade
    2. We are not selecting open_acc since we believe that feature is updated throughout time
    3. Emp_title is dropped since we cannot clean it
    4. Zip code is dropped since there is too many and State would give enough information
    """
    dd = dd[[
            'addr_state', # Need to dummify
            'annual_inc',
            'disbursement_method', # Need to binarize
            'dti',
            'emp_length', # Need to convert to number and add NAs
            'fico_range_high', 
            'fico_range_low',
            'home_ownership', # Need to dummify
            'initial_list_status', # Need to dummify (binarize)
            'installment',
            'int_rate',
            'loan_amnt', 
            'pub_rec', 
            'pub_rec_bankruptcies',
            'purpose', # Need to dummify
            'sub_grade', # Need to dummify or be ordinal encoded
            'verification_status',
            'loan_status' # Need to dummify
    ]]

    return dd

In [9]:
def encode_categorical(dd):
    ce = Categorizer(columns=['addr_state', 'disbursement_method', 'emp_length', 'home_ownership', 'initial_list_status', 'purpose', 'verification_status', 'sub_grade'])
    dd = ce.fit_transform(dd)
    de = DummyEncoder(columns=['addr_state', 'disbursement_method', 'emp_length', 'home_ownership', 'initial_list_status', 'purpose', 'verification_status', 'sub_grade'])
    dd = de.fit_transform(dd)

    return dd

In [10]:
def scale_features(dd):
    scaler = StandardScaler()
    dd = scaler.fit_transform(dd)

    return dd

In [11]:

# cluster = coiled.Cluster(n_workers=10)
cluster = coiled.Cluster(name='DarishSakeesing-bc650b6e-1')

from dask.distributed import Client
client = Client(cluster)
print('Dashboard:', client.dashboard_link)



raw_data = dd.read_csv(
    "s3://lending-club/accepted_2007_to_2018Q4.csv",
    dtype={'desc': 'object', 
            'id': 'object',
            'sec_app_earliest_cr_line': 'object'}, 
    parse_dates = ['issue_d','earliest_cr_line'],
    low_memory=False,
    storage_options={"anon": True},
    blocksize="16 MiB",
)

Output()

Dashboard: http://ec2-3-142-95-126.us-east-2.compute.amazonaws.com:8787


In [12]:
raw_data = remove_na(raw_data)
raw_data = filter_grade(raw_data)
raw_data = filter_loan_status(raw_data)
raw_data = get_days_first_credit(raw_data)
raw_data = clean_emp_length(raw_data)
df_3, df_5 = separate_by_term(raw_data)

In [13]:
df_3 = select_features(df_3)
df_5 = select_features(df_5)

In [14]:
y_3 = df_3.pop('loan_status')
y_5 = df_5.pop('loan_status')

In [15]:
df_3 = encode_categorical(df_3)
df_5 = encode_categorical(df_5)

In [16]:
df_3 = scale_features(df_3)
df_5 = scale_features(df_5)

In [17]:
le_3 = LabelEncoder()
y_3 = le_3.fit_transform(y_3)
le_5 = LabelEncoder()
y_5 = le_5.fit_transform(y_5)

In [18]:
with joblib.parallel_backend('dask', n_jobs=-1):
    X_3_train, X_3_test, y_3_train, y_3_test = train_test_split(df_3.compute(), y_3.compute(), test_size=0.2, shuffle=True)
    X_5_train, X_5_test, y_5_train, y_5_test = train_test_split(df_5.compute(), y_5.compute(), test_size=0.2, shuffle=True)

# Null Model

In [19]:
pd.Series(y_3_train).value_counts()

1    423099
0     67397
dtype: int64

# Linear Discriminant Analysis

In [20]:
# Constucting Priors List
priors = []
for x in range(0, 101, 1):
    priors.append([x/100, (100-x)/100])

In [21]:
params = {'priors': priors}
lda_3 = LinearDiscriminantAnalysis()
lda_5 = LinearDiscriminantAnalysis()
print('Initialized estimators')

grid_search_3 = GridSearchCV(estimator=lda_3, param_grid=params, scoring='balanced_accuracy', n_jobs=-1, cv=3, verbose=5)
grid_search_5 = GridSearchCV(estimator=lda_5, param_grid=params, scoring='balanced_accuracy', n_jobs=-1, cv=3, verbose=5)
print('Initialized grid')

with joblib.parallel_backend('dask', n_jobs=-1, scatter=[X_3_train, y_3_train, X_5_train, y_5_train]):
    print('Entered parallel backend')
    grid_search_3.fit(X_3_train, y_3_train)
    print('Finished 3, Starting 5')
    grid_search_5.fit(X_5_train, y_5_train)

Initialized estimators
Initialized grid
Entered parallel backend
Fitting 3 folds for each of 101 candidates, totalling 303 fits
Finished 3, Starting 5
Fitting 3 folds for each of 101 candidates, totalling 303 fits


In [22]:
lda_3 = LinearDiscriminantAnalysis(priors=grid_search_3.best_params_['priors'])
lda_3.fit(X_3_train, y_3_train)
cm_3 = confusion_matrix(y_3_test, lda_3.predict(X_3_test), labels=[0,1])
cm_3_df = pd.DataFrame(cm_3, columns=[f'Pred_{label}' for label in le_3.classes_.compute()], index= [f'True_{label}' for label in le_3.classes_.compute()])

In [23]:
lda_5 = LinearDiscriminantAnalysis(priors=grid_search_5.best_params_['priors'])
lda_5.fit(X_5_train, y_5_train)
cm_5 = confusion_matrix(y_5_test, lda_5.predict(X_5_test), labels=[0,1])
cm_5_df = pd.DataFrame(cm_5, columns=[f'Pred_{label}' for label in le_5.classes_.compute()], index= [f'True_{label}' for label in le_5.classes_.compute()])

## Confusion Matrix for LDA 3-YEAR Loans

In [24]:
cm_3_df

Unnamed: 0,Pred_Charged Off,Pred_Fully Paid
True_Charged Off,10295,6544
True_Fully Paid,38134,67651


## Confusion Matrix for LDA 5-YEAR Loans

In [25]:
cm_5_df

Unnamed: 0,Pred_Charged Off,Pred_Fully Paid
True_Charged Off,1167,1023
True_Fully Paid,2705,4662


# Random Forest

In [29]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
forest_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

rfc_3 = RandomForestClassifier()
rfc_5 = RandomForestClassifier()
print('Initialized estimators')

rf_grid_3 = GridSearchCV(estimator = rfc_3, param_grid = forest_grid, cv = 3, verbose=3, n_jobs = -1)
rf_grid_5 = GridSearchCV(estimator = rfc_5, param_grid = forest_grid, cv = 3, verbose=3, n_jobs = -1)
print('Initialzed Grid')



with joblib.parallel_backend('dask', n_jobs=-1, scatter=[X_3_train, y_3_train, X_5_train, y_5_train]):
    print('Entered parallel backend')
    rf_grid_3.fit(X_3_train, y_3_train)
    print('Finished 3, Starting 5')
    rf_grid_5.fit(X_5_train, y_5_train)


Initialized estimators
Initialzed Grid
Entered parallel backend
Fitting 3 folds for each of 4320 candidates, totalling 12960 fits


KeyboardInterrupt: 

In [None]:
rfc_3 = rf_grid_3.best_estimator_
rfc_3.fit(X_3_train, y_3_train)
cm_3 = confusion_matrix(y_3_test, rfc_3.predict(X_3_test), labels=[0,1])
cm_3_df = pd.DataFrame(cm_3, columns=[f'Pred_{label}' for label in le_3.classes_.compute()], index= [f'True_{label}' for label in le_3.classes_.compute()])

In [None]:
rfc_5 = rf_grid_5.best_estimator_
rfc_5.fit(X_5_train, y_5_train)
cm_5 = confusion_matrix(y_5_test, rfc_5.predict(X_5_test), labels=[0,1])
cm_5_df = pd.DataFrame(cm_5, columns=[f'Pred_{label}' for label in le_5.classes_.compute()], index= [f'True_{label}' for label in le_5.classes_.compute()])