In [1]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

In [17]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import KNNImputer, SimpleImputer

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from category_encoders import LeaveOneOutEncoder
from sklearn.metrics import classification_report, confusion_matrix

# Replace with whatever model import(s) you're using
from xgboost import XGBClassifier


%matplotlib inline
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999

<IPython.core.display.Javascript object>

In [3]:
loans = pd.read_csv("sba_loans_cleaned2.csv")
loans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522582 entries, 0 to 522581
Data columns (total 24 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   State               522582 non-null  object 
 1   BankState           522582 non-null  object 
 2   ApprovalFY          522582 non-null  int64  
 3   NoEmp               522582 non-null  int64  
 4   NewExist            522582 non-null  int64  
 5   RevLineCr           522049 non-null  float64
 6   LowDoc              518370 non-null  float64
 7   DisbursementGross   522582 non-null  float64
 8   MIS_Status          522582 non-null  int64  
 9   twoDigNAICS         522582 non-null  int64  
 10  is_franchise        522582 non-null  float64
 11  bank_out_of_state   522582 non-null  int64  
 12  Term_years          522582 non-null  float64
 13  job_category        522582 non-null  float64
 14  retained_category   522582 non-null  float64
 15  UrbanRural_cleaned  516078 non-nul

<IPython.core.display.Javascript object>

In [4]:
loans.head()

Unnamed: 0,State,BankState,ApprovalFY,NoEmp,NewExist,RevLineCr,LowDoc,DisbursementGross,MIS_Status,twoDigNAICS,is_franchise,bank_out_of_state,Term_years,job_category,retained_category,UrbanRural_cleaned,Disbr_year,Disbr_Month_sin,Disbr_Month_cos,sba_pre_approv,bank_size,percent_SBA,Appv_Month_sin,Appv_Month_cos
0,NC,MN,2006,2,1,0.0,0.0,253400.0,0,44,1.0,0,13.5,0.0,0.0,0.0,2006,0.866025,0.5,1.0,1.0,0.75,0.5,0.866025
1,AR,MS,2006,7,0,0.0,0.0,137300.0,0,72,0.0,0,10.5,0.0,0.0,0.0,2006,1.0,6.123234000000001e-17,1.0,0.0,0.85,0.5,0.866025
2,CA,SD,2006,18,1,1.0,0.0,438541.0,0,61,0.0,0,6.916667,1.0,2.0,0.0,2006,0.5,0.8660254,1.0,2.0,0.5,0.5,0.866025
3,FL,FL,2006,4,0,1.0,0.0,51440.0,0,23,0.0,1,7.0,0.0,1.0,0.0,2006,0.5,0.8660254,1.0,2.0,0.5,0.5,0.866025
4,LA,LA,2006,3,0,0.0,0.0,50000.0,0,53,0.0,1,5.0,0.0,0.0,0.0,2006,0.866025,-0.5,1.0,0.0,0.85,0.5,0.866025


<IPython.core.display.Javascript object>

In [5]:
cat_cols = [
    "State",
    "BankState",
    "twoDigNAICS",
]

ordinal_cols = [
    "job_category",
    "retained_category",
    "bank_size",
]

num_cols = [
    "ApprovalFY",
    "NoEmp",
    "DisbursementGross",
    "Term_years",  # untapped potential here
    "Disbr_year",  # may want to do some grouping by or transformation
    "Disbr_Month_sin",
    "Disbr_Month_cos",
    "Approval_Month_sin",
    "Approval_Month_cos",
]

bin_cols = ["NewExist", "is_franchise", "bank_out_of_state", "sba_pre_approv"]

# impute_cols = [
#     "RevLineCr",
#     "LowDoc",
#     "UrbanRural_cleaned",
# ]

# dropping na's until the imputer is working
loans = loans.dropna()

<IPython.core.display.Javascript object>

In [37]:
X = loans.drop(columns="MIS_Status")
y = loans["MIS_Status"]

<IPython.core.display.Javascript object>

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

<IPython.core.display.Javascript object>

In [39]:
preprocessing = ColumnTransformer(
    [
        ("leaveOneOut", LeaveOneOutEncoder(), cat_cols),
        # ("knnImptute", KNNImputer(n_neighbors=2), impute_cols),
        # ("simpleImptute", SimpleImputer(), impute_cols),
    ],
    remainder="passthrough",
)

<IPython.core.display.Javascript object>

In [40]:
n_trees = 50
learning_rate = 2 / n_trees

pipeline = Pipeline(
    [
        ("preprocessing", preprocessing),
        ("xgbClass", XGBClassifier(n_estimators=n_trees, learning_rate=learning_rate)),
    ]
)

<IPython.core.display.Javascript object>

In [41]:
grid = {
    "xgbClass__subsample": [0.5, 0.75, 1.0],
    #     "gbr__max_features": [0.5, 0.75, 1.0], # alternative
    # "xgbClass__max_features": [0.4, 0.6, 0.8, 1.0],
    "xgbClass__max_depth": [3, 4, 6],
}



<IPython.core.display.Javascript object>

In [42]:
# 2.6min runtime
pipeline_cv = GridSearchCV(pipeline, grid, verbose=1, n_jobs=-1, cv=3)
pipeline_cv.fit(X_train, y_train)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:  2.6min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocessing',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='passthrough',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('leaveOneOut',
                                                                         LeaveOneOutEncoder(cols=None,
                                                                                            drop_invariant=False,
                                                                                            handle_missing='value',
                                                                                            handle_unknown='value',
       

<IPython.core.display.Javascript object>

In [46]:
pipeline_cv.best_params_

{'xgbClass__max_depth': 4, 'xgbClass__subsample': 1.0}

<IPython.core.display.Javascript object>

In [47]:
pipeline_cv.cv_results_

{'mean_fit_time': array([15.71028336, 14.72492107, 12.62022209, 20.0606362 , 19.88145288,
        17.93000889, 31.7094601 , 28.55955696, 22.68129063]),
 'std_fit_time': array([0.10242996, 0.08719009, 0.2125926 , 0.14850272, 0.35953109,
        1.09544751, 0.48900515, 0.74860047, 3.30054025]),
 'mean_score_time': array([0.38895901, 0.38962317, 0.37133813, 0.45012959, 0.48370465,
        0.45677694, 0.61335826, 0.6522549 , 0.49999523]),
 'std_score_time': array([0.00732964, 0.02206729, 0.00702096, 0.00734371, 0.01976332,
        0.00293604, 0.00373157, 0.04935151, 0.07208523]),
 'param_xgbClass__max_depth': masked_array(data=[3, 3, 3, 4, 4, 4, 6, 6, 6],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value='?',
             dtype=object),
 'param_xgbClass__subsample': masked_array(data=[0.5, 0.75, 1.0, 0.5, 0.75, 1.0, 0.5, 0.75, 1.0],
              mask=[False, False, False, False, False, False, False, False,
          

<IPython.core.display.Javascript object>

In [48]:
pipeline_cv.score(X_train, y_train)

0.8995409402128036

<IPython.core.display.Javascript object>

In [49]:
pipeline_cv.score(X_test, y_test)

0.8982593811306597

<IPython.core.display.Javascript object>

In [50]:
y_pred = pipeline_cv.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.96      0.93     96129
           1       0.85      0.72      0.78     31814

    accuracy                           0.90    127943
   macro avg       0.88      0.84      0.86    127943
weighted avg       0.90      0.90      0.90    127943



<IPython.core.display.Javascript object>

In [51]:
confusion_matrix(y_test, y_pred)

array([[92002,  4127],
       [ 8890, 22924]], dtype=int64)

<IPython.core.display.Javascript object>

In [53]:
importance_df = pd.DataFrame(
    {
        "feat": X_train.columns,
        "importance": pipeline_cv.best_estimator_.named_steps[
            "xgbClass"
        ].feature_importances_,
    }
)

importance_df.sort_values("importance", ascending=False)

Unnamed: 0,feat,importance
11,Term_years,0.257043
19,bank_size,0.204841
10,bank_out_of_state,0.18418
1,BankState,0.088663
15,Disbr_year,0.072887
3,NoEmp,0.057057
0,State,0.052542
20,percent_SBA,0.029604
8,twoDigNAICS,0.026781
6,LowDoc,0.026403


<IPython.core.display.Javascript object>

In [21]:
y.value_counts()

0    385081
1    126690
Name: MIS_Status, dtype: int64

<IPython.core.display.Javascript object>

Due to the large class imbalance, the majority class will be down sampled to the size of the minority class. 

In [34]:
X_train_0 = X_train[y_train == 0]
X_train_1 = X_train[y_train == 1]

n_0 = X_train_0.shape[0]
n_1 = X_train_1.shape[0]

# Sample majority class to have less observations
X_train_0_sample = X_train_0.sample(n_1, replace=False, random_state=42)

# # Sample minority class to have less observations
# X_train_1_sample = X_train_1.sample(n, replace=True, random_state=42)

X_train_resample = pd.concat((X_train_1, X_train_0_sample))
X_train_resample = X_train_resample.reset_index(drop=True)

y_train_resample = np.array([1] * n_1 + [0] * n_1)
y_train_resample.mean()

0.5

<IPython.core.display.Javascript object>

In [58]:
pipeline_cv_resample = GridSearchCV(pipeline, grid, verbose=1, n_jobs=-1, cv=3)
pipeline_cv_resample.fit(X_train_resample, y_train_resample)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:   51.7s finished


GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocessing',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='passthrough',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('leaveOneOut',
                                                                         LeaveOneOutEncoder(cols=None,
                                                                                            drop_invariant=False,
                                                                                            handle_missing='value',
                                                                                            handle_unknown='value',
       

<IPython.core.display.Javascript object>

In [59]:
pipeline_cv_resample.best_params_

{'xgbClass__max_depth': 4, 'xgbClass__subsample': 0.5}

<IPython.core.display.Javascript object>

In [60]:
pipeline_cv_resample.score(X_train_resample, y_train_resample)

0.8842030999915761

<IPython.core.display.Javascript object>

In [62]:
pipeline_cv_resample.score(X_test, y_test)

0.8565845728175828

<IPython.core.display.Javascript object>

In [63]:
y_pred = pipeline_cv_resample.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.83      0.90     96129
           1       0.64      0.94      0.77     31814

    accuracy                           0.86    127943
   macro avg       0.81      0.89      0.83    127943
weighted avg       0.89      0.86      0.86    127943



<IPython.core.display.Javascript object>