In [None]:
import os
import sys
import gdown
import shutil

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import StandardScaler
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
from sklearn import set_config



In [6]:

# Read 'SBAcleaned.csv' (path relative to the working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='SBAcleaned.csv')

In [7]:
# Separate the 'MIS_Status' target column from the remaining feature columns.
X = df.drop(columns='MIS_Status')
y = df['MIS_Status']

# Shuffled hold-out split stratified on the target: 5% of the rows go to the
# test set, and the fixed seed keeps the partition reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    shuffle=True,
    stratify=y,
    random_state=42,
)

In [9]:


# Partition the feature columns by dtype: object-typed columns are treated as
# categorical, every other dtype as numerical.
categorical_columns_selector = X.select_dtypes(include='object').columns
numerical_columns_selector = X.select_dtypes(exclude='object').columns

# Sub-frames holding each group of columns.
categorical_columns = X[categorical_columns_selector]
numerical_columns = X[numerical_columns_selector]

# Numerical branch: fill gaps with the column median, then standardise.
# NOTE(review): StandardScaler reaches this notebook through the
# sklearn.discriminant_analysis import; its documented home is
# sklearn.preprocessing -- consider importing it from there.
numerical_preprocessor = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical branch: fill gaps with the literal 'missing', then one-hot
# encode; handle_unknown='ignore' keeps transform from failing on categories
# that were not present when the encoder was fitted.
categorical_preprocessor = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Route each column group through its own branch; any column matched by
# neither selector is forwarded unchanged (remainder='passthrough').
preprocessor = ColumnTransformer(
    transformers=[
        ("nums", numerical_preprocessor, numerical_columns_selector),
        ("cat", categorical_preprocessor, categorical_columns_selector),
    ],
    remainder='passthrough',
)

In [11]:
# Full model: the preprocessing stage feeds a gradient-boosting classifier.
# The step names ('preprocessor', 'gbm') are the prefixes used when a step
# parameter is addressed as '<step>__<param>'.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('gbm', GradientBoostingClassifier()),
    ]
)

In [13]:
# Hyper-parameter grid for the 'gbm' step of the pipeline. Every key is
# prefixed with 'gbm__' so GridSearchCV forwards the value to the
# GradientBoostingClassifier inside the pipeline.
#
# Only values accepted by current scikit-learn releases are listed (the
# 'log_loss' / 'squared_error' spellings need scikit-learn >= 1.1). An invalid
# value does not get skipped: the candidate fails to fit and, depending on the
# scikit-learn version, either scores NaN or aborts the whole search.
#
# Size: 3 * 2 * 3 * 2 * 2 * 2 = 144 candidates, i.e. 720 fits with cv=5.
param_grid = {
    'gbm__n_estimators': [100],
    'gbm__learning_rate': [0.05, 0.1, 0.2],
    'gbm__max_depth': [5],
    'gbm__subsample': [0.7, 1.0],
    # 'auto' was an alias of 'sqrt' for the classifier and has been removed;
    # None (use all features) takes its place so three distinct settings remain.
    'gbm__max_features': [None, 'sqrt', 'log2'],
    # 'deviance' was renamed to 'log_loss'. 'exponential' only supports binary
    # targets -- assumes MIS_Status has exactly two classes, TODO confirm.
    'gbm__loss': ['log_loss', 'exponential'],
    # 'mse' was renamed to 'squared_error'; 'mae' is no longer supported.
    'gbm__criterion': ['friedman_mse', 'squared_error'],
    # init accepts an estimator, 'zero', or None (default prior-based
    # initialisation); 'random' was never a valid value.
    'gbm__init': [None, 'zero'],
    # Fixed seed so the row/feature sub-sampling is reproducible.
    'gbm__random_state': [42],
}

# Exhaustive 5-fold cross-validated search, ranked by macro-averaged F1.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1_macro')
grid_search.fit(X_train, y_train)

In [None]:
# Report the winning parameter combination found by the grid search.
print(f"Best hyperparameters : {grid_search.best_params_}")

# best_score_ is the mean cross-validated value of the metric passed as
# `scoring` to GridSearchCV ('f1_macro'), so it is labelled as macro F1
# rather than accuracy.
print(f"Best cross-validated macro F1 : {grid_search.best_score_}")

In [None]:
# Estimator exposed by the grid search for the best parameter combination.
best_model = grid_search.best_estimator_

# Predicted labels for the held-out test features.
predictions = best_model.predict(X=X_test)

In [None]:
# Macro-averaged F1 (unweighted mean of the per-class F1 scores) of the
# predictions against the held-out test labels.
f1_macro = f1_score(
    y_true=y_test,
    y_pred=predictions,
    average='macro',
)

print(f"Score F1 : {f1_macro}")