In [1]:
import os
import sys
sys.path.append('../Module')
folder_path = '../Dataset'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score, classification_report, accuracy_score
from sklearn import set_config

from catboost import CatBoostClassifier

from utils import generate_description
from pickle_job import sauvegarder_modele, charger_modele

df_0 = pd.read_csv(folder_path + '/SBA_Cleaned_0.csv')
df_1 = pd.read_csv(folder_path + '/SBA_Cleaned_1.csv')
df_2 = pd.read_csv(folder_path + '/SBA_Cleaned_2.csv')
df_pure = pd.read_csv(folder_path + '/SBA_Cleaned_Pure.csv')

In [2]:
df_pure = df_pure.drop('Name',axis=1)

In [3]:
df_pure.dropna(inplace=True)
generate_description(df_pure)

Unnamed: 0,feature,data_type,null,nulPct,unique,uniqueSample
0,City,object,0,0.0,28860,"[FORT WORTH, LOS ANGELES, BRIER, MCALLEN, FENTON]"
1,State,object,0,0.0,51,"[MA, CA, MT, FL, WI]"
2,Bank,object,0,0.0,5221,"[BBCN BANK, USC CU, BANK OF AMERICA NATL ASSOC..."
3,BankState,object,0,0.0,55,"[IN, CA, NC, CT]"
4,NAICS,object,0,0.0,20,"[72, 42, 48-49, 54, 31-33]"
5,ApprovalDate,object,0,0.0,7367,"[27-07-99, 30-01-06, 19-03-07, 24-05-95, 18-04..."
6,ApprovalFY,int64,0,0.0,29,"[1995, 2006, 2002, 1994, 2001]"
7,Term,int64,0,0.0,378,"[120, 84, 107, 60]"
8,NoEmp,int64,0,0.0,502,"[14, 3, 25, 1, 11]"
9,NewExist,bool,0,0.0,2,[True]


In [4]:
y = df_pure['Approve']
X = df_pure.drop('Approve', axis=1)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.05, random_state=42, stratify=y)

In [6]:
numerical_columns_selector = X.select_dtypes(exclude=['object', 'bool']).columns
categorical_columns_selector = X.select_dtypes(include=['object', 'bool']).columns

numerical_columns = X[numerical_columns_selector]
categorical_columns = X[categorical_columns_selector]

categorical_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numerical_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ("nums", numerical_preprocessor, numerical_columns_selector),
        ("cat", categorical_preprocessor, categorical_columns_selector)
    ],
    remainder='passthrough'
)

In [7]:
categorical_columns_indices = [X.columns.get_loc(col) for col in categorical_columns_selector]

model = CatBoostClassifier(cat_features=categorical_columns_indices)
model.fit(X_train, y_train)


Learning rate set to 0.164956
0:	learn: 0.5305364	total: 386ms	remaining: 6m 25s
1:	learn: 0.4261477	total: 664ms	remaining: 5m 31s
2:	learn: 0.3647838	total: 937ms	remaining: 5m 11s
3:	learn: 0.3215861	total: 1.27s	remaining: 5m 15s
4:	learn: 0.2945982	total: 1.54s	remaining: 5m 6s
5:	learn: 0.2747997	total: 1.84s	remaining: 5m 4s
6:	learn: 0.2596856	total: 2.16s	remaining: 5m 6s
7:	learn: 0.2442870	total: 2.45s	remaining: 5m 4s
8:	learn: 0.2355695	total: 2.65s	remaining: 4m 52s
9:	learn: 0.2280952	total: 3.06s	remaining: 5m 2s
10:	learn: 0.2232100	total: 3.23s	remaining: 4m 50s
11:	learn: 0.2181247	total: 3.46s	remaining: 4m 44s
12:	learn: 0.2144671	total: 3.63s	remaining: 4m 35s
13:	learn: 0.2086584	total: 3.89s	remaining: 4m 34s
14:	learn: 0.2050862	total: 4.08s	remaining: 4m 27s
15:	learn: 0.2033642	total: 4.37s	remaining: 4m 28s
16:	learn: 0.2014908	total: 4.63s	remaining: 4m 27s
17:	learn: 0.1974947	total: 4.8s	remaining: 4m 21s
18:	learn: 0.1938388	total: 5.09s	remaining: 4m 22

<catboost.core.CatBoostClassifier at 0x1ed920311d0>

In [8]:
y_pred = model.predict(X_test)

In [9]:
y_pred_bool = np.array([True if pred == 'True' else False for pred in y_pred])

accuracy = accuracy_score(y_test, y_pred_bool)
print(f'Accuracy : {accuracy}')

f1_macro = f1_score(y_test, y_pred_bool, average="macro")
print(f"Score F1 : {f1_macro}")

print(classification_report(y_test, y_pred_bool))

Accuracy : 0.9625886473915415
Score F1 : 0.9428109592164164
              precision    recall  f1-score   support

       False       0.93      0.89      0.91      7344
        True       0.97      0.98      0.98     27485

    accuracy                           0.96     34829
   macro avg       0.95      0.94      0.94     34829
weighted avg       0.96      0.96      0.96     34829



In [10]:
sauvegarder_modele(model, 'catboost_model')

Le modèle a été sauvegardé dans '..\Model\catboost_model.pkl'.
