In [1]:
import os
import sys
sys.path.append('../Module')
folder_path = '../Dataset'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score, classification_report, accuracy_score
from sklearn import set_config

from catboost import CatBoostClassifier

from utils import generate_description
from pickle_job import sauvegarder_modele, charger_modele

df_0 = pd.read_csv(folder_path + '/SBA_Cleaned_0.csv')
df_1 = pd.read_csv(folder_path + '/SBA_Cleaned_1.csv')
df_2 = pd.read_csv(folder_path + '/SBA_Cleaned_2.csv')
df_pure = pd.read_csv(folder_path + '/SBA_Cleaned_Pure.csv')

In [2]:
df_pure.dropna(inplace=True)
generate_description(df_pure)

Unnamed: 0,feature,data_type,null,nulPct,unique,uniqueSample
0,City,object,0,0.0,28860,"[DALLAS, SALISBURY, INDIANAPOLIS, NORTH PLAINS..."
1,State,object,0,0.0,51,"[CA, MA, MI, HI]"
2,Bank,object,0,0.0,5221,"[SOVEREIGN BANK, COMERICA BANK, BRANCH BK. & T..."
3,BankState,object,0,0.0,55,"[DC, WI, CA, MN, VA]"
4,NAICS,object,0,0.0,20,"[44-45, 81, 54, 71]"
5,ApprovalDate,object,0,0.0,7367,"[22-02-89, 24-12-03, 30-01-04, 14-05-08, 05-04..."
6,ApprovalFY,int64,0,0.0,29,"[2000, 2007, 2003, 1992, 2005]"
7,Term,int64,0,0.0,378,"[130, 60, 84, 120, 240]"
8,NoEmp,int64,0,0.0,502,"[3, 2, 35, 11, 8]"
9,NewExist,int64,0,0.0,2,[1]


In [3]:
y = df_pure['Approve']
X = df_pure.drop('Approve', axis=1)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.05, random_state=42, stratify=y)

In [5]:
numerical_columns_selector = X.select_dtypes(exclude=['object', 'bool']).columns
categorical_columns_selector = X.select_dtypes(include=['object', 'bool']).columns

numerical_columns = X[numerical_columns_selector]
categorical_columns = X[categorical_columns_selector]

categorical_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numerical_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ("nums", numerical_preprocessor, numerical_columns_selector),
        ("cat", categorical_preprocessor, categorical_columns_selector)
    ],
    remainder='passthrough'
)

In [6]:
categorical_columns_indices = [X.columns.get_loc(col) for col in categorical_columns_selector]

# Initialize and fit the model
model = CatBoostClassifier(cat_features=categorical_columns_indices)
model.fit(X_train, y_train)


Learning rate set to 0.164957
0:	learn: 0.5326729	total: 352ms	remaining: 5m 51s
1:	learn: 0.4280727	total: 620ms	remaining: 5m 9s
2:	learn: 0.3673866	total: 854ms	remaining: 4m 43s
3:	learn: 0.3306180	total: 1.05s	remaining: 4m 22s
4:	learn: 0.3003553	total: 1.27s	remaining: 4m 12s
5:	learn: 0.2802671	total: 1.52s	remaining: 4m 11s
6:	learn: 0.2666509	total: 1.66s	remaining: 3m 55s
7:	learn: 0.2505848	total: 1.88s	remaining: 3m 53s
8:	learn: 0.2404257	total: 2.19s	remaining: 4m
9:	learn: 0.2280862	total: 2.48s	remaining: 4m 5s
10:	learn: 0.2235376	total: 2.72s	remaining: 4m 4s
11:	learn: 0.2184816	total: 3.06s	remaining: 4m 11s
12:	learn: 0.2134023	total: 3.22s	remaining: 4m 4s
13:	learn: 0.2107578	total: 3.45s	remaining: 4m 3s
14:	learn: 0.2075128	total: 3.63s	remaining: 3m 58s
15:	learn: 0.2032454	total: 3.83s	remaining: 3m 55s
16:	learn: 0.2005273	total: 4.01s	remaining: 3m 52s
17:	learn: 0.1973175	total: 4.2s	remaining: 3m 49s
18:	learn: 0.1945793	total: 4.35s	remaining: 3m 44s
19

<catboost.core.CatBoostClassifier at 0x1f6890bf110>

In [7]:
y_pred = model.predict(X_test)

In [8]:
# Convert y_pred to boolean
y_pred_bool = np.array([True if pred == 'True' else False for pred in y_pred])

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_bool)
print(f'Accuracy : {accuracy}')

# Calculate F1 score
f1_macro = f1_score(y_test, y_pred_bool, average="macro")
print(f"Score F1 : {f1_macro}")

# Print classification report
print(classification_report(y_test, y_pred_bool))


Accuracy : 0.9626471432672983
Score F1 : 0.9429697800407499
              precision    recall  f1-score   support

       False       0.93      0.89      0.91      7344
        True       0.97      0.98      0.98     27486

    accuracy                           0.96     34830
   macro avg       0.95      0.94      0.94     34830
weighted avg       0.96      0.96      0.96     34830



In [9]:
# Ensuite, sauvegardez-le
sauvegarder_modele(model, 'catboost_model.pkl')
# Plus tard, chargez le modèle à partir du fichier
loaded_model = charger_modele('catboost_model.pkl')

Le modèle a été sauvegardé dans 'catboost_model.pkl'.
Le modèle a été chargé à partir de 'catboost_model.pkl'.
