In [1]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import KNNImputer, SimpleImputer

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from category_encoders import LeaveOneOutEncoder
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score,
    fbeta_score,
)
from sklearn.ensemble import RandomForestClassifier

# Replace with whatever model import(s) you're using
from xgboost import XGBClassifier


%matplotlib inline
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999

<IPython.core.display.Javascript object>

In [3]:
def print_vif(x):
    import warnings
    import statsmodels.api as sm
    from statsmodels.stats.outliers_influence import variance_inflation_factor

    """Utility for checking multicollinearity assumption
    
    :param x: input features to check using VIF. This is assumed to be a pandas.DataFrame
    :return: nothing is returned the VIFs are printed as a pandas series
    """
    # Silence numpy FutureWarning about .ptp
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        x = sm.add_constant(x)

    vifs = []
    for i in range(x.shape[1]):
        vif = variance_inflation_factor(x.values, i)
        vifs.append(vif)

    print("VIF results\n-------------------------------")
    print(pd.Series(vifs, index=x.columns))
    print("-------------------------------\n")

<IPython.core.display.Javascript object>

In [4]:
loans = pd.read_csv("sba_loans_cleaned2.csv")
loans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522582 entries, 0 to 522581
Data columns (total 24 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   State               522582 non-null  object 
 1   BankState           522582 non-null  object 
 2   ApprovalFY          522582 non-null  int64  
 3   NoEmp               522582 non-null  int64  
 4   NewExist            522582 non-null  int64  
 5   RevLineCr           522049 non-null  float64
 6   LowDoc              518370 non-null  float64
 7   DisbursementGross   522582 non-null  float64
 8   MIS_Status          522582 non-null  int64  
 9   twoDigNAICS         522582 non-null  int64  
 10  is_franchise        522582 non-null  float64
 11  bank_out_of_state   522582 non-null  int64  
 12  Term_years          522582 non-null  float64
 13  job_category        522582 non-null  float64
 14  retained_category   522582 non-null  float64
 15  UrbanRural_cleaned  516078 non-nul

<IPython.core.display.Javascript object>

In [5]:
loans.head()

Unnamed: 0,State,BankState,ApprovalFY,NoEmp,NewExist,RevLineCr,LowDoc,DisbursementGross,MIS_Status,twoDigNAICS,is_franchise,bank_out_of_state,Term_years,job_category,retained_category,UrbanRural_cleaned,Disbr_year,Disbr_Month_sin,Disbr_Month_cos,sba_pre_approv,bank_size,percent_SBA,Appv_Month_sin,Appv_Month_cos
0,NC,MN,2006,2,1,0.0,0.0,253400.0,0,44,1.0,0,13.5,0.0,0.0,0.0,2006,0.866025,0.5,1.0,1.0,0.75,0.5,0.866025
1,AR,MS,2006,7,0,0.0,0.0,137300.0,0,72,0.0,0,10.5,0.0,0.0,0.0,2006,1.0,6.123234000000001e-17,1.0,0.0,0.85,0.5,0.866025
2,CA,SD,2006,18,1,1.0,0.0,438541.0,0,61,0.0,0,6.916667,1.0,2.0,0.0,2006,0.5,0.8660254,1.0,2.0,0.5,0.5,0.866025
3,FL,FL,2006,4,0,1.0,0.0,51440.0,0,23,0.0,1,7.0,0.0,1.0,0.0,2006,0.5,0.8660254,1.0,2.0,0.5,0.5,0.866025
4,LA,LA,2006,3,0,0.0,0.0,50000.0,0,53,0.0,1,5.0,0.0,0.0,0.0,2006,0.866025,-0.5,1.0,0.0,0.85,0.5,0.866025


<IPython.core.display.Javascript object>

In [6]:
cat_cols = [
    "State",
    "BankState",
    "twoDigNAICS",
]

ordinal_cols = [
    "job_category",
    "retained_category",
    "bank_size",
]

num_cols = [
    "ApprovalFY",
    "NoEmp",
    "DisbursementGross",
    "Term_years",  # untapped potential here
    "Disbr_year",  # may want to do some grouping by or transformation
    "Disbr_Month_sin",
    "Disbr_Month_cos",
    "Approval_Month_sin",
    "Approval_Month_cos",
]

bin_cols = ["NewExist", "is_franchise", "bank_out_of_state", "sba_pre_approv"]

# impute_cols = [
#     "RevLineCr",
#     "LowDoc",
#     "UrbanRural_cleaned",
# ]

# dropping na's until the imputer is working
loans = loans.dropna()

<IPython.core.display.Javascript object>

In [7]:
X = loans.drop(columns="MIS_Status")
y = loans["MIS_Status"]

<IPython.core.display.Javascript object>

In [8]:
print_vif(X.select_dtypes(include="number"))

VIF results
-------------------------------
const                 814829.290183
ApprovalFY                23.479557
NoEmp                      1.031137
NewExist                   1.116096
RevLineCr                  1.637751
LowDoc                     1.463996
DisbursementGross          1.440323
twoDigNAICS                1.062718
is_franchise               1.077374
bank_out_of_state          1.552891
Term_years                 1.726284
job_category               1.142698
retained_category          1.301262
UrbanRural_cleaned         1.085572
Disbr_year                23.010926
Disbr_Month_sin            2.115029
Disbr_Month_cos            1.924876
sba_pre_approv             1.136868
bank_size                  1.762733
percent_SBA                2.268133
Appv_Month_sin             1.949872
Appv_Month_cos             2.103376
dtype: float64
-------------------------------



<IPython.core.display.Javascript object>

In [9]:
# remove ApprovalFY.sba_pre_approv contains some of the same info, Disbr_year will also cover the loss
X = X.drop(columns="ApprovalFY")

<IPython.core.display.Javascript object>

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

<IPython.core.display.Javascript object>

In [11]:
preprocessing = ColumnTransformer(
    [
        ("leaveOneOut", LeaveOneOutEncoder(), cat_cols),
        # ("knnImptute", KNNImputer(n_neighbors=2), impute_cols),
        # ("simpleImptute", SimpleImputer(), impute_cols),
    ],
    remainder="passthrough",
)

<IPython.core.display.Javascript object>

In [12]:
n_trees = 50
learning_rate = 2 / n_trees

pipeline = Pipeline(
    [
        ("preprocessing", preprocessing),
        ("xgbClass", XGBClassifier(n_estimators=n_trees, learning_rate=learning_rate)),
    ]
)

<IPython.core.display.Javascript object>

In [13]:
grid = {
    "xgbClass__subsample": [0.5, 0.75, 1.0],
    #     "gbr__max_features": [0.5, 0.75, 1.0], # alternative
    "xgbClass__colsample_bytree": [0.6, 0.8, 1.0],
    "xgbClass__max_depth": [3, 4, 6],
}



<IPython.core.display.Javascript object>

In [14]:
# 2.6min runtime
pipeline_cv = GridSearchCV(pipeline, grid, verbose=1, n_jobs=-1, cv=3)
pipeline_cv.fit(X_train, y_train)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:  6.4min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocessing',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='passthrough',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('leaveOneOut',
                                                                         LeaveOneOutEncoder(cols=None,
                                                                                            drop_invariant=False,
                                                                                            handle_missing='value',
                                                                                            handle_unknown='value',
       

<IPython.core.display.Javascript object>

In [15]:
pipeline_cv.best_params_

{'xgbClass__colsample_bytree': 1.0,
 'xgbClass__max_depth': 4,
 'xgbClass__subsample': 0.75}

<IPython.core.display.Javascript object>

In [16]:
pipeline_cv.cv_results_

{'mean_fit_time': array([11.14649773, 10.12689479,  8.61228283, 13.15079935, 12.27082173,
        10.75022666, 20.2248648 , 18.46457728, 15.88099607, 15.26505852,
        13.71977409, 10.92937779, 17.14293639, 16.66236043, 14.34861644,
        27.07674249, 26.23725867, 21.69898256, 17.12930338, 16.26694417,
        14.06899285, 22.34263126, 21.06431985, 18.7009627 , 32.0627687 ,
        30.05552069, 24.46867299]),
 'std_fit_time': array([0.13867339, 0.09766044, 0.20397328, 0.05839843, 0.11642526,
        0.28431516, 0.13470499, 0.3694924 , 1.11315705, 0.31843679,
        0.16188948, 0.33042917, 0.19641443, 0.24176059, 0.13728599,
        0.41060316, 0.26062253, 0.56047987, 0.11176566, 0.14426073,
        0.09690255, 0.39407821, 0.22273741, 0.29697853, 0.2512847 ,
        0.17549418, 2.24044274]),
 'mean_score_time': array([0.37133924, 0.38496995, 0.37399912, 0.46143206, 0.44481007,
        0.46375855, 0.62865043, 0.64494061, 0.68053055, 0.44181784,
        0.42898655, 0.4086686 , 0.479

<IPython.core.display.Javascript object>

In [17]:
pipeline_cv.score(X_train, y_train)

0.8986342841064227

<IPython.core.display.Javascript object>

In [18]:
pipeline_cv.score(X_test, y_test)

0.8979701898501676

<IPython.core.display.Javascript object>

In [19]:
y_pred = pipeline_cv.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.95      0.93     96078
           1       0.84      0.73      0.78     31865

    accuracy                           0.90    127943
   macro avg       0.88      0.84      0.86    127943
weighted avg       0.90      0.90      0.90    127943



<IPython.core.display.Javascript object>

In [20]:
confusion_matrix(y_test, y_pred, normalize="true")

array([[0.95481796, 0.04518204],
       [0.2734348 , 0.7265652 ]])

<IPython.core.display.Javascript object>

In [21]:
importance_df = pd.DataFrame(
    {
        "feat": X_train.columns,
        "importance": pipeline_cv.best_estimator_.named_steps[
            "xgbClass"
        ].feature_importances_,
    }
)

importance_df.sort_values("importance", ascending=False)

Unnamed: 0,feat,importance
10,Term_years,0.277175
18,bank_size,0.239319
9,bank_out_of_state,0.180649
1,BankState,0.093512
14,Disbr_year,0.071084
0,State,0.048077
19,percent_SBA,0.028963
5,LowDoc,0.027991
7,twoDigNAICS,0.022567
6,DisbursementGross,0.010663


<IPython.core.display.Javascript object>

In [22]:
y.value_counts()

0    385081
1    126690
Name: MIS_Status, dtype: int64

<IPython.core.display.Javascript object>

Due to the large class imbalance, the majority class will be down sampled to the size of the minority class. 

In [23]:
X_train_0 = X_train[y_train == 0]
X_train_1 = X_train[y_train == 1]

n_0 = X_train_0.shape[0]
n_1 = X_train_1.shape[0]

# Sample majority class to have less observations
X_train_0_sample = X_train_0.sample(n_1, replace=False, random_state=42)

# # Sample minority class to have less observations
# X_train_1_sample = X_train_1.sample(n, replace=True, random_state=42)

X_train_resample = pd.concat((X_train_1, X_train_0_sample))
X_train_resample = X_train_resample.reset_index(drop=True)

y_train_resample = np.array([1] * n_1 + [0] * n_1)
y_train_resample.mean()

0.5

<IPython.core.display.Javascript object>

In [24]:
pipeline_cv_resample = GridSearchCV(pipeline, grid, verbose=1, n_jobs=-1, cv=3,)
pipeline_cv_resample.fit(X_train_resample, y_train_resample)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:  3.0min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocessing',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='passthrough',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('leaveOneOut',
                                                                         LeaveOneOutEncoder(cols=None,
                                                                                            drop_invariant=False,
                                                                                            handle_missing='value',
                                                                                            handle_unknown='value',
       

<IPython.core.display.Javascript object>

In [25]:
pipeline_cv_resample.best_params_

{'xgbClass__colsample_bytree': 0.8,
 'xgbClass__max_depth': 6,
 'xgbClass__subsample': 0.5}

<IPython.core.display.Javascript object>

In [26]:
pipeline_cv_resample.score(X_train_resample, y_train_resample)

0.8778223042446612

<IPython.core.display.Javascript object>

In [27]:
pipeline_cv_resample.score(X_test, y_test)

0.8460564470115598

<IPython.core.display.Javascript object>

In [28]:
y_pred = pipeline_cv_resample.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.81      0.89     96078
           1       0.63      0.94      0.75     31865

    accuracy                           0.85    127943
   macro avg       0.80      0.88      0.82    127943
weighted avg       0.89      0.85      0.85    127943



<IPython.core.display.Javascript object>

In [29]:
confusion_matrix(y_test, y_pred)

array([[78160, 17918],
       [ 1778, 30087]], dtype=int64)

<IPython.core.display.Javascript object>

In [30]:
importance_df = pd.DataFrame(
    {
        "feat": X_train.columns,
        "importance": pipeline_cv_resample.best_estimator_.named_steps[
            "xgbClass"
        ].feature_importances_,
    }
)

importance_df.sort_values("importance", ascending=False)

Unnamed: 0,feat,importance
10,Term_years,0.380492
9,bank_out_of_state,0.156057
1,BankState,0.106188
14,Disbr_year,0.100384
18,bank_size,0.071149
0,State,0.063261
19,percent_SBA,0.048667
5,LowDoc,0.015171
12,retained_category,0.014683
7,twoDigNAICS,0.013894


<IPython.core.display.Javascript object>

In [31]:
pipeline_cv_resample_f1 = GridSearchCV(
    pipeline, grid, verbose=1, n_jobs=-1, cv=3, scoring="f1"
)
pipeline_cv_resample_f1.fit(X_train_resample, y_train_resample)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:  3.1min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocessing',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='passthrough',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('leaveOneOut',
                                                                         LeaveOneOutEncoder(cols=None,
                                                                                            drop_invariant=False,
                                                                                            handle_missing='value',
                                                                                            handle_unknown='value',
       

<IPython.core.display.Javascript object>

In [32]:
pipeline_cv_resample_f1.best_params_

{'xgbClass__colsample_bytree': 0.8,
 'xgbClass__max_depth': 6,
 'xgbClass__subsample': 0.5}

<IPython.core.display.Javascript object>

In [33]:
pipeline_cv_resample_f1.score(X_train_resample, y_train_resample)

0.8855170778222998

<IPython.core.display.Javascript object>

In [34]:
pipeline_cv_resample_f1.score(X_test, y_test)

0.7533992738199575

<IPython.core.display.Javascript object>

In [35]:
y_pred = pipeline_cv_resample_f1.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.81      0.89     96078
           1       0.63      0.94      0.75     31865

    accuracy                           0.85    127943
   macro avg       0.80      0.88      0.82    127943
weighted avg       0.89      0.85      0.85    127943



<IPython.core.display.Javascript object>

In [36]:
importance_df = pd.DataFrame(
    {
        "feat": X_train.columns,
        "importance": pipeline_cv_resample_f1.best_estimator_.named_steps[
            "xgbClass"
        ].feature_importances_,
    }
)

importance_df.sort_values("importance", ascending=False)

Unnamed: 0,feat,importance
10,Term_years,0.380492
9,bank_out_of_state,0.156057
1,BankState,0.106188
14,Disbr_year,0.100384
18,bank_size,0.071149
0,State,0.063261
19,percent_SBA,0.048667
5,LowDoc,0.015171
12,retained_category,0.014683
7,twoDigNAICS,0.013894


<IPython.core.display.Javascript object>

In [None]:
rf_grid = {
    
}

In [37]:
rf_pipeline = Pipeline(
    [
        ('preprocessing', preprocessing),
        ('rf_class', RandomForestClassifier())
    ])


<IPython.core.display.Javascript object>

In [None]:
rf_cv = GridSearchCV