In [1]:
import os
import sys
sys.path.append('../Module')
folder_path = '../Dataset'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score, classification_report, accuracy_score
from sklearn import set_config

from catboost import CatBoostClassifier

from data_to_csv import df_to_csv
from utils import generate_description

df_0 = pd.read_csv(folder_path + '/SBA_Cleaned_0.csv')
df_1 = pd.read_csv(folder_path + '/SBA_Cleaned_1.csv')
df_2 = pd.read_csv(folder_path + '/SBA_Cleaned_2.csv')
df_pure = pd.read_csv(folder_path + '/SBA_Cleaned_Pure.csv')

In [2]:
df_pure.dropna(inplace=True)
generate_description(df_pure)

Unnamed: 0,feature,data_type,null,nulPct,unique,uniqueSample
0,City,object,0,0.0,28860,"[STAR, PAYSON, ST. GEORGE, CAMBRIDGE, CHANDLER]"
1,State,object,0,0.0,51,"[NJ, MN, TX, CA, MD]"
2,Bank,object,0,0.0,5221,"[PIONEER BANK, FREEDOM NATIONAL BANK, SUNTRUST..."
3,BankState,object,0,0.0,55,"[CT, OH, CA, MN, RI]"
4,NAICS,object,0,0.0,20,"[54, 62, 56, 42]"
5,ApprovalDate,object,0,0.0,7367,"[10-09-04, 28-04-06, 19-10-00, 02-08-07, 01-07..."
6,ApprovalFY,int64,0,0.0,29,"[1990, 2004, 2002, 1996, 2005]"
7,Term,int64,0,0.0,378,"[70, 63, 65, 180]"
8,NoEmp,int64,0,0.0,502,"[1, 2, 4, 18]"
9,NewExist,bool,0,0.0,2,[True]


In [3]:
y = df_pure['Approve']
X = df_pure.drop('Approve', axis=1)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.05, random_state=42, stratify=y)

In [5]:
cat_features_indices = ['City','State','Bank','BankState','NAICS','ApprovalDate','UrbanRural']
model = CatBoostClassifier(cat_features=cat_features_indices)
model.fit(X_train,y_train)

Learning rate set to 0.164957
0:	learn: 0.5326729	total: 638ms	remaining: 10m 37s
1:	learn: 0.4280727	total: 1.31s	remaining: 10m 53s
2:	learn: 0.3673866	total: 1.86s	remaining: 10m 17s
3:	learn: 0.3306180	total: 2.4s	remaining: 9m 56s
4:	learn: 0.3003553	total: 2.99s	remaining: 9m 55s
5:	learn: 0.2802671	total: 3.68s	remaining: 10m 10s
6:	learn: 0.2666509	total: 4.15s	remaining: 9m 48s
7:	learn: 0.2505848	total: 4.71s	remaining: 9m 44s
8:	learn: 0.2404257	total: 5.4s	remaining: 9m 54s
9:	learn: 0.2280862	total: 6.07s	remaining: 10m 1s
10:	learn: 0.2235376	total: 6.67s	remaining: 9m 59s
11:	learn: 0.2184931	total: 7.46s	remaining: 10m 14s
12:	learn: 0.2134122	total: 7.93s	remaining: 10m 2s
13:	learn: 0.2104018	total: 8.37s	remaining: 9m 49s
14:	learn: 0.2045908	total: 8.86s	remaining: 9m 42s
15:	learn: 0.2026451	total: 9.44s	remaining: 9m 40s
16:	learn: 0.1995157	total: 9.92s	remaining: 9m 33s
17:	learn: 0.1965470	total: 10.4s	remaining: 9m 27s
18:	learn: 0.1933651	total: 10.9s	remaini

<catboost.core.CatBoostClassifier at 0x7f1c79845cf0>

In [6]:
y_pred = model.predict(X_test)

In [8]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy : {accuracy}')

# Calculate F1 score
f1_macro = f1_score(y_test, y_pred, average="macro")
print(f"Score F1 : {f1_macro}")

# Print classification report
print(classification_report(y_test, y_pred))

TypeError: Labels in y_true and y_pred should be of the same type. Got y_true=[False  True] and y_pred=['False' 'True']. Make sure that the predictions provided by the classifier coincides with the true labels.

In [None]:
numerical_columns_selector = X.select_dtypes(exclude='object').columns
categorical_columns_selector = X.select_dtypes(include='object').columns

numerical_columns = X[numerical_columns_selector]
categorical_columns = X[categorical_columns_selector]

categorical_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numerical_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ("nums", numerical_preprocessor, numerical_columns_selector),
        ("cat", categorical_preprocessor, categorical_columns_selector)
    ],
    remainder='passthrough'
)

In [None]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', CatBoostClassifier(random_state=42, verbose=0))
])

param_grid = {
    'model__n_estimators': [100, 200],  
    'model__max_depth': [5, 10],  
    'model__learning_rate': [0.1, 0.01],  
    'model__subsample': [0.8, 0.9],  
    'model__colsample_bylevel': [0.8, 0.9]  
}

Traceback (most recent call last):
  File "/home/utilisateur/projet/Brief_Prêt/Model_pret/.venv/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 113, in _check_targets
    unique_values = _union1d(y_true, y_pred, xp)
  File "/home/utilisateur/projet/Brief_Prêt/Model_pret/.venv/lib/python3.10/site-packages/sklearn/utils/_array_api.py", line 118, in _union1d
    return xp.asarray(numpy.union1d(a, b))
  File "/home/utilisateur/projet/Brief_Prêt/Model_pret/.venv/lib/python3.10/site-packages/numpy/lib/arraysetops.py", line 932, in union1d
    return unique(np.concatenate((ar1, ar2), axis=None))
  File "/home/utilisateur/projet/Brief_Prêt/Model_pret/.venv/lib/python3.10/site-packages/numpy/lib/arraysetops.py", line 274, in unique
    ret = _unique1d(ar, return_index, return_inverse, return_counts,
  File "/home/utilisateur/projet/Brief_Prêt/Model_pret/.venv/lib/python3.10/site-packages/numpy/lib/arraysetops.py", line 336, in _unique1d
    ar.sort()
TypeError: '<' not su

In [None]:
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1_macro')
grid_search.fit(X_train, y_train)

print(f"Best hyperparameters : {grid_search.best_params_}")
print(f"Best accuracy : {grid_search.best_score_}")

In [None]:
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)

In [None]:
y_pred = best_model.predict(X_test)

In [None]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy : {accuracy}')

In [None]:
f1_macro = f1_score(y_test, y_pred, average="macro")
print(f"Score F1 : {f1_macro}")

In [None]:
print(classification_report(y_test, y_pred))