In [2]:
import os
import sys
sys.path.append('../Module')
folder_path = '../Dataset'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score, classification_report, accuracy_score
from sklearn import set_config

from catboost import CatBoostClassifier

from utils import generate_description
from pickle_job import sauvegarder_modele, charger_modele

df_0 = pd.read_csv(folder_path + '/SBA_Cleaned_0.csv')
df_1 = pd.read_csv(folder_path + '/SBA_Cleaned_1.csv')
df_2 = pd.read_csv(folder_path + '/SBA_Cleaned_2.csv')
df_pure = pd.read_csv(folder_path + '/SBA_Cleaned_Pure.csv')

In [3]:
df_pure.dropna(inplace=True)
generate_description(df_pure)

Unnamed: 0,feature,data_type,null,nulPct,unique,uniqueSample
0,Name,object,0,0.0,611131,"[SOUTHWEST DENTAL ASSOCIATES S, Greenlight Opt..."
1,City,object,0,0.0,28860,"[SHREWSBURY, PEARLAND, OXFORD, GARDENA, AUSTIN]"
2,State,object,0,0.0,51,"[MN, MD, WA, CA]"
3,Bank,object,0,0.0,5221,"[WELLS FARGO BANK NATL ASSOC, BBCN BANK, THE B..."
4,BankState,object,0,0.0,55,"[OH, NC, VT, GA, TX]"
5,NAICS,object,0,0.0,20,"[44-45, 72, 81]"
6,ApprovalDate,object,0,0.0,7367,"[10-12-03, 21-09-05, 13-05-04, 16-11-06, 16-10..."
7,ApprovalFY,int64,0,0.0,29,"[2008, 1990, 2007, 2005]"
8,Term,int64,0,0.0,378,"[300, 32, 84]"
9,NoEmp,int64,0,0.0,502,"[8, 1, 3, 11]"


In [4]:
y = df_pure['Approve']
X = df_pure.drop('Approve', axis=1)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.05, random_state=42, stratify=y)

In [6]:
numerical_columns_selector = X.select_dtypes(exclude=['object', 'bool']).columns
categorical_columns_selector = X.select_dtypes(include=['object', 'bool']).columns

numerical_columns = X[numerical_columns_selector]
categorical_columns = X[categorical_columns_selector]

categorical_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numerical_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ("nums", numerical_preprocessor, numerical_columns_selector),
        ("cat", categorical_preprocessor, categorical_columns_selector)
    ],
    remainder='passthrough'
)

In [7]:
categorical_columns_indices = [X.columns.get_loc(col) for col in categorical_columns_selector]

model = CatBoostClassifier(cat_features=categorical_columns_indices)
model.fit(X_train, y_train)


Learning rate set to 0.164956
0:	learn: 0.5284301	total: 706ms	remaining: 11m 44s
1:	learn: 0.4242194	total: 950ms	remaining: 7m 54s
2:	learn: 0.3589203	total: 1.49s	remaining: 8m 15s
3:	learn: 0.3202413	total: 1.68s	remaining: 6m 59s
4:	learn: 0.2978130	total: 2.17s	remaining: 7m 10s
5:	learn: 0.2771180	total: 2.61s	remaining: 7m 12s
6:	learn: 0.2591597	total: 3.09s	remaining: 7m 18s
7:	learn: 0.2427013	total: 3.57s	remaining: 7m 22s
8:	learn: 0.2355862	total: 3.98s	remaining: 7m 18s
9:	learn: 0.2301515	total: 4.34s	remaining: 7m 10s
10:	learn: 0.2242157	total: 4.83s	remaining: 7m 14s
11:	learn: 0.2201500	total: 5.26s	remaining: 7m 12s
12:	learn: 0.2166442	total: 5.69s	remaining: 7m 12s
13:	learn: 0.2122358	total: 6.22s	remaining: 7m 18s
14:	learn: 0.2072176	total: 6.79s	remaining: 7m 26s
15:	learn: 0.2012689	total: 7.2s	remaining: 7m 22s
16:	learn: 0.1985113	total: 7.74s	remaining: 7m 27s
17:	learn: 0.1961195	total: 8.03s	remaining: 7m 18s
18:	learn: 0.1937842	total: 8.63s	remaining:

<catboost.core.CatBoostClassifier at 0x18b1389ccd0>

In [8]:
y_pred = model.predict(X_test)

In [9]:
y_pred_bool = np.array([True if pred == 'True' else False for pred in y_pred])

accuracy = accuracy_score(y_test, y_pred_bool)
print(f'Accuracy : {accuracy}')

f1_macro = f1_score(y_test, y_pred_bool, average="macro")
print(f"Score F1 : {f1_macro}")

print(classification_report(y_test, y_pred_bool))

Accuracy : 0.964512331677625
Score F1 : 0.9458493612951737
              precision    recall  f1-score   support

       False       0.93      0.90      0.91      7344
        True       0.97      0.98      0.98     27485

    accuracy                           0.96     34829
   macro avg       0.95      0.94      0.95     34829
weighted avg       0.96      0.96      0.96     34829



In [10]:
sauvegarder_modele(model, 'catboost_model.pkl')

loaded_model = charger_modele('catboost_model.pkl')

Le modèle a été sauvegardé dans '..\Model\catboost_model.pkl.pckl'.
Le modèle a été chargé à partir de '..\Model\catboost_model.pkl.pckl'.
