In [1]:
import os
import sys
sys.path.append('../Module')
folder_path = '../Dataset'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score, classification_report, accuracy_score
from sklearn import set_config

from catboost import CatBoostClassifier

from utils import generate_description
from pickle_job import sauvegarder_modele, charger_modele

# Read the four SBA_Cleaned_* CSV files from the dataset folder; the file
# names differ only by their suffix, and the reads happen in the listed order.
df_0, df_1, df_2, df_pure = (
    pd.read_csv(f'{folder_path}/SBA_Cleaned_{suffix}.csv')
    for suffix in ('0', '1', '2', 'Pure')
)

In [2]:
# Remove the 'Name' column from the working frame.
df_pure = df_pure.drop(columns=['Name'])

In [3]:
# Remove both approval-date columns in a single pass.
df_pure = df_pure.drop(columns=['ApprovalDate', 'ApprovalFY'])

In [4]:
# Keep only the rows that have no missing value in any column, then display
# the per-feature summary (dtype, null count, unique values) of what remains.
df_pure = df_pure.dropna()
generate_description(df_pure)

Unnamed: 0,feature,data_type,null,nulPct,unique,uniqueSample
0,City,object,0,0.0,28860,"[BROOKLYN, POINT PLEASANT BEACH, ROCKAWAY, BOU..."
1,State,object,0,0.0,51,"[MA, PA, CA]"
2,Bank,object,0,0.0,5221,"[READYCAP LENDING, LLC, FLUSHING BANK, BUSINES..."
3,BankState,object,0,0.0,55,"[OH, UT, CA, IL]"
4,NAICS,object,0,0.0,20,"[71, 42, 44-45, 48-49, 81]"
5,Term,int64,0,0.0,378,"[60, 240, 36]"
6,NoEmp,int64,0,0.0,502,"[1, 10, 12, 2]"
7,NewExist,bool,0,0.0,2,[True]
8,CreateJob,int64,0,0.0,218,"[2, 0]"
9,RetainedJob,int64,0,0.0,325,"[0, 2, 4, 5, 8]"


In [5]:
# Separate the 'Approve' target from the remaining feature columns.
target_column = 'Approve'
X = df_pure.drop(columns=[target_column])
y = df_pure[target_column]

In [6]:
# Shuffled hold-out split stratified on the target: 5% of the rows go to the
# test set, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    shuffle=True,
    stratify=y,
    random_state=42,
)

In [7]:
# Split the feature names by dtype: object/bool columns are treated as
# categorical, every other dtype as numerical.
categorical_dtypes = ['object', 'bool']
numerical_columns_selector = X.select_dtypes(exclude=categorical_dtypes).columns
categorical_columns_selector = X.select_dtypes(include=categorical_dtypes).columns

# Sub-frames of X restricted to each group of columns.
numerical_columns = X.loc[:, numerical_columns_selector]
categorical_columns = X.loc[:, categorical_columns_selector]

# Numerical branch: median imputation followed by standardisation.
numerical_preprocessor = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical branch: fill gaps with the literal 'missing', then one-hot
# encode; handle_unknown='ignore' keeps transform from failing on categories
# that were not seen during fit.
categorical_preprocessor = Pipeline(
    [
        ('imputer', SimpleImputer(fill_value='missing', strategy='constant')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Route each column group to its branch; any column belonging to neither
# group is passed through unchanged.
column_branches = [
    ("nums", numerical_preprocessor, numerical_columns_selector),
    ("cat", categorical_preprocessor, categorical_columns_selector),
]
preprocessor = ColumnTransformer(column_branches, remainder='passthrough')

In [8]:
# Tell CatBoost which features are categorical by giving it the positional
# index of every object/bool column within the feature frame.
categorical_columns_indices = [X.columns.get_loc(col) for col in categorical_columns_selector]

model = CatBoostClassifier(cat_features=categorical_columns_indices)
# Report the training loss every 100 iterations instead of on every single
# boosting round, so the cell output stays readable. This only changes the
# logging period, not the fitted model.
model.fit(X_train, y_train, verbose=100)


Learning rate set to 0.164956
0:	learn: 0.5379369	total: 528ms	remaining: 8m 47s
1:	learn: 0.4321330	total: 881ms	remaining: 7m 19s
2:	learn: 0.3719982	total: 1.11s	remaining: 6m 9s
3:	learn: 0.3334925	total: 1.42s	remaining: 5m 54s
4:	learn: 0.3113326	total: 1.72s	remaining: 5m 42s
5:	learn: 0.2843986	total: 2.1s	remaining: 5m 48s
6:	learn: 0.2678681	total: 2.39s	remaining: 5m 38s
7:	learn: 0.2565025	total: 2.8s	remaining: 5m 47s
8:	learn: 0.2489724	total: 3.23s	remaining: 5m 55s
9:	learn: 0.2427822	total: 3.56s	remaining: 5m 52s
10:	learn: 0.2305218	total: 3.89s	remaining: 5m 49s
11:	learn: 0.2264070	total: 4.12s	remaining: 5m 38s
12:	learn: 0.2230230	total: 4.4s	remaining: 5m 33s
13:	learn: 0.2170782	total: 4.65s	remaining: 5m 27s
14:	learn: 0.2105175	total: 4.9s	remaining: 5m 21s
15:	learn: 0.2082212	total: 5.11s	remaining: 5m 13s
16:	learn: 0.2060524	total: 5.5s	remaining: 5m 17s
17:	learn: 0.2037854	total: 5.72s	remaining: 5m 12s
18:	learn: 0.2013495	total: 5.91s	remaining: 5m 4s

<catboost.core.CatBoostClassifier at 0x29ae4b6d0>

In [9]:
# Predict a class label for every row of the held-out test set with the fitted model.
y_pred = model.predict(X_test)

In [10]:
# Normalise the predicted labels to a 1-D boolean array. Casting to str first
# keeps the comparison valid whether predict() returned the strings
# 'True'/'False' or actual booleans (a plain `pred == 'True'` test would
# silently turn every real boolean into False). ravel() flattens the result in
# case the labels come back as an (n, 1) column.
y_pred_bool = np.asarray(y_pred).astype(str).ravel() == 'True'

accuracy = accuracy_score(y_test, y_pred_bool)
print(f'Accuracy : {accuracy}')

# Macro average: unweighted mean of the per-class F1 scores, so the minority
# class counts as much as the majority class.
f1_macro = f1_score(y_test, y_pred_bool, average="macro")
print(f"Score F1 : {f1_macro}")

print(classification_report(y_test, y_pred_bool))

Accuracy : 0.9601194406959718
Score F1 : 0.939136855102799
              precision    recall  f1-score   support

       False       0.92      0.88      0.90      7344
        True       0.97      0.98      0.97     27485

    accuracy                           0.96     34829
   macro avg       0.95      0.93      0.94     34829
weighted avg       0.96      0.96      0.96     34829



In [11]:
# Persist the fitted model through the project helper under the name
# 'catboost_model' (the recorded output reports '../Model/catboost_model.pkl').
sauvegarder_modele(model, 'catboost_model')

Le modèle a été sauvegardé dans '../Model/catboost_model.pkl'.
