In [2]:
import os
import sys
sys.path.append('../Module')
folder_path = '../Dataset'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score, classification_report, accuracy_score
from sklearn import set_config

from catboost import CatBoostClassifier

from utils import generate_description
from pickle_job import sauvegarder_modele, charger_modele

df_0 = pd.read_csv(folder_path + '/SBA_Cleaned_0.csv')
df_1 = pd.read_csv(folder_path + '/SBA_Cleaned_1.csv')
df_2 = pd.read_csv(folder_path + '/SBA_Cleaned_2.csv')
df_pure = pd.read_csv(folder_path + '/SBA_Cleaned_Pure.csv')

In [3]:
df_pure = df_pure.drop('Name',axis=1)

In [4]:
df_pure.dropna(inplace=True)
generate_description(df_pure)

Unnamed: 0,feature,data_type,null,nulPct,unique,uniqueSample
0,City,object,0,0.0,28860,"[LYNDONVILLE, EPPING (CENSUS NAME FOR EPPING, ..."
1,State,object,0,0.0,51,"[FL, PA, TX, OH, RI]"
2,Bank,object,0,0.0,5221,"[BRIDGE COMMUNITY BANK, U.S. BANK NATIONAL ASS..."
3,BankState,object,0,0.0,55,"[MD, CA, NY, MN, IL]"
4,NAICS,object,0,0.0,20,"[62, 42, 51, 81]"
5,ApprovalDate,object,0,0.0,7367,"[17-09-02, 09-10-03, 27-09-07, 08-03-07, 24-09..."
6,ApprovalFY,int64,0,0.0,29,"[1999, 2003, 2012, 2006, 2005]"
7,Term,int64,0,0.0,378,"[229, 60, 300, 240, 123]"
8,NoEmp,int64,0,0.0,502,"[4, 29, 1, 2]"
9,NewExist,bool,0,0.0,2,[True]


In [5]:
y = df_pure['Approve']
X = df_pure.drop('Approve', axis=1)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.05, random_state=42, stratify=y)

In [7]:
numerical_columns_selector = X.select_dtypes(exclude=['object', 'bool']).columns
categorical_columns_selector = X.select_dtypes(include=['object', 'bool']).columns

numerical_columns = X[numerical_columns_selector]
categorical_columns = X[categorical_columns_selector]

categorical_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numerical_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ("nums", numerical_preprocessor, numerical_columns_selector),
        ("cat", categorical_preprocessor, categorical_columns_selector)
    ],
    remainder='passthrough'
)

In [8]:
categorical_columns_indices = [X.columns.get_loc(col) for col in categorical_columns_selector]

model = CatBoostClassifier(cat_features=categorical_columns_indices)
model.fit(X_train, y_train)


Learning rate set to 0.164956
0:	learn: 0.5305364	total: 705ms	remaining: 11m 44s
1:	learn: 0.4261477	total: 1.3s	remaining: 10m 51s
2:	learn: 0.3647838	total: 1.97s	remaining: 10m 53s
3:	learn: 0.3215861	total: 2.62s	remaining: 10m 51s
4:	learn: 0.2945982	total: 3.19s	remaining: 10m 35s
5:	learn: 0.2747997	total: 3.81s	remaining: 10m 30s
6:	learn: 0.2596856	total: 4.49s	remaining: 10m 37s
7:	learn: 0.2442870	total: 5.04s	remaining: 10m 24s
8:	learn: 0.2355695	total: 5.49s	remaining: 10m 4s
9:	learn: 0.2280952	total: 6.27s	remaining: 10m 21s
10:	learn: 0.2238673	total: 6.72s	remaining: 10m 4s
11:	learn: 0.2184313	total: 7.36s	remaining: 10m 5s
12:	learn: 0.2129270	total: 7.85s	remaining: 9m 55s
13:	learn: 0.2097481	total: 8.35s	remaining: 9m 48s
14:	learn: 0.2040654	total: 8.85s	remaining: 9m 41s
15:	learn: 0.2013739	total: 9.3s	remaining: 9m 31s
16:	learn: 0.1988038	total: 9.71s	remaining: 9m 21s
17:	learn: 0.1961517	total: 10.2s	remaining: 9m 14s
18:	learn: 0.1930094	total: 10.6s	rem

<catboost.core.CatBoostClassifier at 0x7f70a73539a0>

In [9]:
y_pred = model.predict(X_test)

In [10]:
y_pred_bool = np.array([True if pred == 'True' else False for pred in y_pred])

accuracy = accuracy_score(y_test, y_pred_bool)
print(f'Accuracy : {accuracy}')

f1_macro = f1_score(y_test, y_pred_bool, average="macro")
print(f"Score F1 : {f1_macro}")

print(classification_report(y_test, y_pred_bool))

Accuracy : 0.9627896293318786
Score F1 : 0.9432090029811182
              precision    recall  f1-score   support

       False       0.93      0.89      0.91      7344
        True       0.97      0.98      0.98     27485

    accuracy                           0.96     34829
   macro avg       0.95      0.94      0.94     34829
weighted avg       0.96      0.96      0.96     34829



In [11]:
sauvegarder_modele(model, 'catboost_model')

Le modèle a été sauvegardé dans '../Model/catboost_model.pkl'.
