In [1]:
import os
import sys
sys.path.append('../Module')
folder_path = '../Dataset'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score, classification_report, accuracy_score
from sklearn import set_config

from catboost import CatBoostClassifier

from utils import generate_description
from pickle_job import sauvegarder_modele, charger_modele

df_0 = pd.read_csv(folder_path + '/SBA_Cleaned_0.csv')
df_1 = pd.read_csv(folder_path + '/SBA_Cleaned_1.csv')
df_2 = pd.read_csv(folder_path + '/SBA_Cleaned_2.csv')
df_pure = pd.read_csv(folder_path + '/SBA_Cleaned_Pure.csv')

In [2]:
df_pure = df_pure.drop('Name',axis=1)

In [3]:
df_pure = df_pure.drop('ApprovalDate',axis=1)
df_pure = df_pure.drop('ApprovalFY',axis=1)

In [4]:
df_pure.dropna(inplace=True)
generate_description(df_pure)

Unnamed: 0,feature,data_type,null,nulPct,unique,uniqueSample
0,City,object,0,0.0,28860,"[DALLAS, PORTLAND, SEATTLE, SAN DIEGO, SLATE H..."
1,State,object,0,0.0,51,"[NJ, KY, UT, CO, GA]"
2,Bank,object,0,0.0,5221,"[ROCKLAND TRUST COMPANY, UNITED CENTRAL BANK, ..."
3,BankState,object,0,0.0,55,"[CA, IL, PA, OH, AR]"
4,NAICS,object,0,0.0,20,"[56, 31-33, 81, 62, 72]"
5,Term,int64,0,0.0,378,"[67, 120, 84, 216]"
6,NoEmp,int64,0,0.0,502,"[5, 2, 3, 1]"
7,NewExist,bool,0,0.0,2,[True]
8,CreateJob,int64,0,0.0,218,"[0, 22]"
9,RetainedJob,int64,0,0.0,325,"[2, 1, 3]"


In [5]:
y = df_pure['Approve']
X = df_pure.drop('Approve', axis=1)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.05, random_state=42, stratify=y)

In [7]:
numerical_columns_selector = X.select_dtypes(exclude=['object', 'bool']).columns
categorical_columns_selector = X.select_dtypes(include=['object', 'bool']).columns

numerical_columns = X[numerical_columns_selector]
categorical_columns = X[categorical_columns_selector]

categorical_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numerical_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ("nums", numerical_preprocessor, numerical_columns_selector),
        ("cat", categorical_preprocessor, categorical_columns_selector)
    ],
    remainder='passthrough'
)

In [8]:
categorical_columns_indices = [X.columns.get_loc(col) for col in categorical_columns_selector]

model = CatBoostClassifier(cat_features=categorical_columns_indices)
model.fit(X_train, y_train)


Learning rate set to 0.164956
0:	learn: 0.5379369	total: 703ms	remaining: 11m 42s
1:	learn: 0.4321330	total: 1.28s	remaining: 10m 40s
2:	learn: 0.3720179	total: 1.68s	remaining: 9m 17s
3:	learn: 0.3382316	total: 2.08s	remaining: 8m 36s
4:	learn: 0.3038444	total: 2.67s	remaining: 8m 51s
5:	learn: 0.2834936	total: 3.21s	remaining: 8m 52s
6:	learn: 0.2637925	total: 3.69s	remaining: 8m 44s
7:	learn: 0.2520120	total: 4.16s	remaining: 8m 35s
8:	learn: 0.2418088	total: 4.65s	remaining: 8m 32s
9:	learn: 0.2349106	total: 5.07s	remaining: 8m 21s
10:	learn: 0.2303163	total: 5.46s	remaining: 8m 11s
11:	learn: 0.2234932	total: 5.96s	remaining: 8m 10s
12:	learn: 0.2201837	total: 6.39s	remaining: 8m 5s
13:	learn: 0.2169529	total: 6.83s	remaining: 8m 1s
14:	learn: 0.2124745	total: 7.28s	remaining: 7m 58s
15:	learn: 0.2082895	total: 7.72s	remaining: 7m 55s
16:	learn: 0.2050954	total: 8.17s	remaining: 7m 52s
17:	learn: 0.2027978	total: 8.6s	remaining: 7m 48s
18:	learn: 0.2004511	total: 9.02s	remaining: 

<catboost.core.CatBoostClassifier at 0x7f7e503472e0>

In [9]:
y_pred = model.predict(X_test)

In [10]:
y_pred_bool = np.array([True if pred == 'True' else False for pred in y_pred])

accuracy = accuracy_score(y_test, y_pred_bool)
print(f'Accuracy : {accuracy}')

f1_macro = f1_score(y_test, y_pred_bool, average="macro")
print(f"Score F1 : {f1_macro}")

print(classification_report(y_test, y_pred_bool))

Accuracy : 0.9596026299922479
Score F1 : 0.9383354386009064
              precision    recall  f1-score   support

       False       0.92      0.88      0.90      7344
        True       0.97      0.98      0.97     27485

    accuracy                           0.96     34829
   macro avg       0.95      0.93      0.94     34829
weighted avg       0.96      0.96      0.96     34829



In [11]:
sauvegarder_modele(model, 'catboost_model')

Le modèle a été sauvegardé dans '../Model/catboost_model.pkl'.
