In [45]:
import os
import sys
import gdown
import shutil

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import StandardScaler
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn import set_config

from catboost import CatBoostClassifier



In [46]:

df = pd.read_csv('SBAcleaned.csv')

df['State'] = df['State'].astype('category')
df['NAICS'] = df['NAICS'].astype('category')
df['Name'] = df['Name'].astype('category')
df['Bank'] = df['Bank'].astype('category')
df['BankState'] = df['BankState'].astype('category')

In [47]:
y = df['MIS_Status']
X = df.drop('MIS_Status', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.05, random_state=42, stratify=y)

In [48]:


numerical_columns_selector = X.select_dtypes(exclude='object').columns
categorical_columns_selector = X.select_dtypes(include='object').columns

numerical_columns = X[numerical_columns_selector]
categorical_columns = X[categorical_columns_selector]

categorical_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numerical_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ("nums", numerical_preprocessor, numerical_columns_selector),
        ("cat", categorical_preprocessor, categorical_columns_selector)
    ],
    remainder='passthrough'
)

# Gradient Boosting

In [40]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('gbm', GradientBoostingClassifier())
])

In [41]:
param_grid = {
    'gbm__n_estimators': [100],  
    'gbm__learning_rate': [0.05, 0.1, 0.2],  
    'gbm__max_depth': [5],
    'gbm__subsample': [0.7, 1.0],
    'gbm__max_features': ['auto', 'sqrt', 'log2'],
    'gbm__loss': ['deviance', 'exponential'],
    'gbm__criterion': ['friedman_mse', 'mse', 'mae'],
    'gbm__init': ['zero', 'random'],
    'gbm__random_state': [42]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1_macro')
grid_search.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
print(f"Best hyperparameters : {grid_search.best_params_}")

print(f"Best accuracy : {grid_search.best_score_}")

Best hyperparameters : {'gbm__criterion': 'friedman_mse', 'gbm__init': 'zero', 'gbm__learning_rate': 0.2, 'gbm__loss': 'exponential', 'gbm__max_depth': 5, 'gbm__max_features': 'sqrt', 'gbm__n_estimators': 100, 'gbm__random_state': 42, 'gbm__subsample': 0.7}
Best accuracy : 0.8528162005184452


In [None]:
best_model = grid_search.best_estimator_

predictions = best_model.predict(X_test)

In [None]:
f1_macro = f1_score(y_test, predictions, average='macro')

print(f"Score F1 : {f1_macro}")

Score F1 : 0.8511125837608076


# Catboost Classifier

In [49]:
cat_features = ['State','NAICS','Name', 'City','Bank','BankState']

model = CatBoostClassifier(cat_features=cat_features)

model.fit(X_train, y_train)

Learning rate set to 0.123656
0:	learn: 0.5641357	total: 497ms	remaining: 8m 16s
1:	learn: 0.4623802	total: 696ms	remaining: 5m 47s
2:	learn: 0.3855870	total: 848ms	remaining: 4m 41s
3:	learn: 0.3411232	total: 1.07s	remaining: 4m 26s
4:	learn: 0.3064233	total: 1.26s	remaining: 4m 10s
5:	learn: 0.2850050	total: 1.5s	remaining: 4m 7s
6:	learn: 0.2633442	total: 1.74s	remaining: 4m 7s
7:	learn: 0.2514004	total: 2.01s	remaining: 4m 9s
8:	learn: 0.2386960	total: 2.23s	remaining: 4m 5s
9:	learn: 0.2303320	total: 2.39s	remaining: 3m 56s
10:	learn: 0.2218997	total: 2.55s	remaining: 3m 49s
11:	learn: 0.2170014	total: 2.67s	remaining: 3m 39s
12:	learn: 0.2089670	total: 2.82s	remaining: 3m 34s
13:	learn: 0.2043733	total: 2.96s	remaining: 3m 28s
14:	learn: 0.2002004	total: 3.11s	remaining: 3m 24s
15:	learn: 0.1963989	total: 3.23s	remaining: 3m 18s
16:	learn: 0.1930659	total: 3.35s	remaining: 3m 13s
17:	learn: 0.1904397	total: 3.52s	remaining: 3m 11s
18:	learn: 0.1885115	total: 3.67s	remaining: 3m 9

<catboost.core.CatBoostClassifier at 0x291e97fd0>

In [50]:
y_pred = model.predict(X_test)

In [51]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy : {accuracy}')

Accuracy : 0.9651556156968877


In [52]:
f1_macro = f1_score(y_test, y_pred, average="macro")

print(f"Score F1 : {f1_macro}")

Score F1 : 0.9328690780776092


In [53]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.86      0.89      2794
           1       0.97      0.98      0.98     14942

    accuracy                           0.97     17736
   macro avg       0.94      0.92      0.93     17736
weighted avg       0.96      0.97      0.96     17736

