In [2]:
import os
import sys
sys.path.append('../Module')
folder_path = '../Dataset'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score, classification_report, accuracy_score
from sklearn import set_config

from catboost import CatBoostClassifier

from utils import generate_description
from pickle_job import sauvegarder_modele, charger_modele

df_0 = pd.read_csv(folder_path + '/SBA_Cleaned_0.csv')
df_1 = pd.read_csv(folder_path + '/SBA_Cleaned_1.csv')
df_2 = pd.read_csv(folder_path + '/SBA_Cleaned_2.csv')
df_pure = pd.read_csv(folder_path + '/SBA_Cleaned_Pure.csv')

In [3]:
df_pure = df_pure.drop('Name',axis=1)

In [4]:
df_pure = df_pure.drop('ApprovalDate',axis=1)
df_pure = df_pure.drop('ApprovalFY',axis=1)

In [5]:
df_pure.dropna(inplace=True)
generate_description(df_pure)

Unnamed: 0,feature,data_type,null,nulPct,unique,uniqueSample
0,City,object,0,0.0,28860,"[RICHMOND, DAVIS, SNOHOMISH, HAYWARD, VENTURA]"
1,State,object,0,0.0,51,"[CA, ID, IN, TN, IL]"
2,Bank,object,0,0.0,5221,"[CITIZENS BANK NATL ASSOC, JPMORGAN CHASE BANK..."
3,BankState,object,0,0.0,55,"[IL, PA, SC, OR]"
4,NAICS,object,0,0.0,20,"[72, 11, 44-45, 81, 23]"
5,Term,int64,0,0.0,378,"[84, 73]"
6,NoEmp,int64,0,0.0,502,"[1, 6, 2, 5, 10]"
7,NewExist,bool,0,0.0,2,[True]
8,CreateJob,int64,0,0.0,218,"[7, 0, 3]"
9,RetainedJob,int64,0,0.0,325,"[8, 1, 5, 3]"


In [6]:
y = df_pure['Approve']
X = df_pure.drop('Approve', axis=1)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.05, random_state=42, stratify=y)

In [8]:
numerical_columns_selector = X.select_dtypes(exclude=['object', 'bool']).columns
categorical_columns_selector = X.select_dtypes(include=['object', 'bool']).columns

numerical_columns = X[numerical_columns_selector]
categorical_columns = X[categorical_columns_selector]

categorical_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numerical_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ("nums", numerical_preprocessor, numerical_columns_selector),
        ("cat", categorical_preprocessor, categorical_columns_selector)
    ],
    remainder='passthrough'
)

In [9]:
categorical_columns_indices = [X.columns.get_loc(col) for col in categorical_columns_selector]

model = CatBoostClassifier(cat_features=categorical_columns_indices)
model.fit(X_train, y_train)


Learning rate set to 0.164956
0:	learn: 0.5379369	total: 668ms	remaining: 11m 7s
1:	learn: 0.4321330	total: 1.23s	remaining: 10m 15s
2:	learn: 0.3720179	total: 1.61s	remaining: 8m 55s
3:	learn: 0.3382316	total: 2.13s	remaining: 8m 49s
4:	learn: 0.3038444	total: 2.69s	remaining: 8m 54s
5:	learn: 0.2834936	total: 3.23s	remaining: 8m 54s
6:	learn: 0.2637925	total: 3.75s	remaining: 8m 51s
7:	learn: 0.2520120	total: 4.24s	remaining: 8m 45s
8:	learn: 0.2418088	total: 4.77s	remaining: 8m 45s
9:	learn: 0.2349106	total: 5.2s	remaining: 8m 34s
10:	learn: 0.2303163	total: 5.62s	remaining: 8m 25s
11:	learn: 0.2234932	total: 6.13s	remaining: 8m 24s
12:	learn: 0.2201837	total: 6.57s	remaining: 8m 19s
13:	learn: 0.2169529	total: 7.03s	remaining: 8m 14s
14:	learn: 0.2124745	total: 7.47s	remaining: 8m 10s
15:	learn: 0.2082895	total: 7.92s	remaining: 8m 7s
16:	learn: 0.2050954	total: 8.38s	remaining: 8m 4s
17:	learn: 0.2027978	total: 8.81s	remaining: 8m
18:	learn: 0.2004511	total: 9.36s	remaining: 8m 3s

<catboost.core.CatBoostClassifier at 0x7fa227fca3e0>

In [10]:
y_pred = model.predict(X_test)

In [11]:
y_pred_bool = np.array([True if pred == 'True' else False for pred in y_pred])

accuracy = accuracy_score(y_test, y_pred_bool)
print(f'Accuracy : {accuracy}')

f1_macro = f1_score(y_test, y_pred_bool, average="macro")
print(f"Score F1 : {f1_macro}")

print(classification_report(y_test, y_pred_bool))

Accuracy : 0.9596026299922479
Score F1 : 0.9383354386009064
              precision    recall  f1-score   support

       False       0.92      0.88      0.90      7344
        True       0.97      0.98      0.97     27485

    accuracy                           0.96     34829
   macro avg       0.95      0.93      0.94     34829
weighted avg       0.96      0.96      0.96     34829



In [12]:
sauvegarder_modele(model, 'catboost_model')

Le modèle a été sauvegardé dans '../Model/catboost_model.pkl'.
