In [34]:
import os
import sys
import gdown
import shutil

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import StandardScaler
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn import set_config

from catboost import CatBoostClassifier



In [25]:

df = pd.read_csv('SBAcleaned.csv')

df['State'] = df['State'].astype('category')
df['NAICS'] = df['NAICS'].astype('category')

In [23]:
y = df['MIS_Status']
X = df.drop('MIS_Status', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.05, random_state=42, stratify=y)

In [9]:


numerical_columns_selector = X.select_dtypes(exclude='object').columns
categorical_columns_selector = X.select_dtypes(include='object').columns

numerical_columns = X[numerical_columns_selector]
categorical_columns = X[categorical_columns_selector]

categorical_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numerical_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ("nums", numerical_preprocessor, numerical_columns_selector),
        ("cat", categorical_preprocessor, categorical_columns_selector)
    ],
    remainder='passthrough'
)

# Gradient Boosting

In [11]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('gbm', GradientBoostingClassifier())
])

In [13]:
param_grid = {
    'gbm__n_estimators': [100],  
    'gbm__learning_rate': [0.05, 0.1, 0.2],  
    'gbm__max_depth': [5],
    'gbm__subsample': [0.7, 1.0],
    'gbm__max_features': ['auto', 'sqrt', 'log2'],
    'gbm__loss': ['deviance', 'exponential'],
    'gbm__criterion': ['friedman_mse', 'mse', 'mae'],
    'gbm__init': ['zero', 'random'],
    'gbm__random_state': [42]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1_macro')
grid_search.fit(X_train, y_train)

1020 fits failed out of a total of 1080.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
90 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/sims/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/sims/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sims/anaconda3/lib/python3.11/site-packages/sklearn/pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/Users/sims/anaconda3/lib/python3.1

In [None]:
{'modelcolsample_bytree': 0.9, 'modellearning_rate': 0.1, 'modelmax_depth': 10, 'modeln_estimators': 200, 'model__subsample': 0.9}

In [15]:
print(f"Best hyperparameters : {grid_search.best_params_}")

print(f"Best accuracy : {grid_search.best_score_}")

Best hyperparameters : {'gbm__criterion': 'friedman_mse', 'gbm__init': 'zero', 'gbm__learning_rate': 0.2, 'gbm__loss': 'exponential', 'gbm__max_depth': 5, 'gbm__max_features': 'sqrt', 'gbm__n_estimators': 100, 'gbm__random_state': 42, 'gbm__subsample': 0.7}
Best accuracy : 0.8528162005184452


In [16]:
best_model = grid_search.best_estimator_

predictions = best_model.predict(X_test)

In [17]:
f1_macro = f1_score(y_test, predictions, average='macro')

print(f"Score F1 : {f1_macro}")

Score F1 : 0.8511125837608076


# Catboost Classifier

In [26]:
cat_features = ['State','NAICS']

model = CatBoostClassifier(cat_features=cat_features)

model.fit(X_train, y_train)

Learning rate set to 0.123656
0:	learn: 0.5421545	total: 131ms	remaining: 2m 10s
1:	learn: 0.4513404	total: 193ms	remaining: 1m 36s
2:	learn: 0.3957134	total: 276ms	remaining: 1m 31s
3:	learn: 0.3553377	total: 355ms	remaining: 1m 28s
4:	learn: 0.3211250	total: 432ms	remaining: 1m 26s
5:	learn: 0.2984405	total: 529ms	remaining: 1m 27s
6:	learn: 0.2786018	total: 606ms	remaining: 1m 25s
7:	learn: 0.2641366	total: 685ms	remaining: 1m 24s
8:	learn: 0.2536792	total: 768ms	remaining: 1m 24s
9:	learn: 0.2444888	total: 865ms	remaining: 1m 25s
10:	learn: 0.2373940	total: 945ms	remaining: 1m 24s
11:	learn: 0.2313808	total: 1s	remaining: 1m 22s
12:	learn: 0.2255927	total: 1.07s	remaining: 1m 21s
13:	learn: 0.2208442	total: 1.15s	remaining: 1m 20s
14:	learn: 0.2178839	total: 1.23s	remaining: 1m 21s
15:	learn: 0.2149325	total: 1.31s	remaining: 1m 20s
16:	learn: 0.2115116	total: 1.4s	remaining: 1m 21s
17:	learn: 0.2090949	total: 1.51s	remaining: 1m 22s
18:	learn: 0.2051862	total: 1.58s	remaining: 1m 

<catboost.core.CatBoostClassifier at 0x28f610b50>

In [27]:
y_pred = model.predict(X_test)

In [28]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy : {accuracy}')

Accuracy : 0.9541046459179071


In [33]:
f1_macro = f1_score(y_test, y_pred, average="macro")

print(f"Score F1 : {f1_macro}")

Score F1 : 0.9117378637471814


In [35]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.83      0.85      2794
           1       0.97      0.98      0.97     14942

    accuracy                           0.95     17736
   macro avg       0.92      0.90      0.91     17736
weighted avg       0.95      0.95      0.95     17736

