In [1]:
import os
import sys
sys.path.append('../Module')
folder_path = '../Dataset'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score
from sklearn import set_config

from catboost import CatBoostClassifier

from data_to_csv import df_to_csv
from utils import generate_description

df_0 = pd.read_csv(folder_path + '/SBA_Cleaned_0.csv')
df_1 = pd.read_csv(folder_path + '/SBA_Cleaned_1.csv')
df_2 = pd.read_csv(folder_path + '/SBA_Cleaned_2.csv')
df_pure = pd.read_csv(folder_path + '/SBA_Cleaned_Pure.csv')

In [2]:
df_0.dropna(inplace=True)
generate_description(df_0)

Unnamed: 0,feature,data_type,null,nulPct,unique,uniqueSample
0,NAICS,object,0,0.0,20,"[62, 52, 31-33, 42, 53]"
1,Term,int64,0,0.0,377,"[60, 282, 84, 300]"
2,NoEmp,int64,0,0.0,499,"[11, 1, 5, 3, 4]"
3,NewExist,float64,0,0.0,2,[1.0]
4,CreateJob,int64,0,0.0,216,"[0, 2, 18]"
5,RetainedJob,int64,0,0.0,323,"[2, 12, 6, 1]"
6,UrbanRural,int64,0,0.0,3,"[1, 0]"
7,RevLineCr,object,0,0.0,2,"[N, Y]"
8,LowDoc,object,0,0.0,2,"[N, Y]"
9,GrAppv,int64,0,0.0,16880,"[550000, 150000, 50000, 35000, 60000]"


In [3]:
y = df_0['Approve']
X = df_0.drop('Approve', axis=1)

bool_columns = X.select_dtypes(include='bool').columns
X[bool_columns] = X[bool_columns].astype(object)

In [4]:
print(y.value_counts())

Approve
True     540752
False    134274
Name: count, dtype: int64


In [5]:
y.isna().sum()

0

In [6]:
y.dtype

dtype('bool')

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.05, random_state=42, stratify=y)

In [8]:
numerical_columns_selector = X.select_dtypes(exclude='object').columns
categorical_columns_selector = X.select_dtypes(include='object').columns

numerical_columns = X[numerical_columns_selector]
categorical_columns = X[categorical_columns_selector]

categorical_preprocessor = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
numerical_preprocessor = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_preprocessor, numerical_columns_selector),
        ("cat", categorical_preprocessor, categorical_columns_selector)
    ]
)

In [9]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', CatBoostClassifier(random_state=42, verbose=0))
])

param_grid = {
    'model__n_estimators': [100, 200],  
    'model__max_depth': [5, 10],  
    'model__learning_rate': [0.1, 0.01],  
    'model__subsample': [0.8, 0.9],  
    'model__colsample_bylevel': [0.8, 0.9]  
}

# Perform GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1_macro')
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and best score
print("Meilleurs hyperparamètres trouvés :")
print(grid_search.best_params_)
print("Meilleure précision trouvée :", grid_search.best_score_)

# Obtain the best model
best_model = grid_search.best_estimator_

# Obtain predictions using the best model
predictions = best_model.predict(X_test, predict="Class")

# Calculate f1-score
f1_macro = f1_score(y_test, predictions, average='macro')
print("F1-score macro sur l'ensemble de test :", f1_macro)

Traceback (most recent call last):
  File "/home/utilisateur/projet/Brief_Prêt/Model_pret/.venv/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 113, in _check_targets
    unique_values = _union1d(y_true, y_pred, xp)
  File "/home/utilisateur/projet/Brief_Prêt/Model_pret/.venv/lib/python3.10/site-packages/sklearn/utils/_array_api.py", line 118, in _union1d
    return xp.asarray(numpy.union1d(a, b))
  File "/home/utilisateur/projet/Brief_Prêt/Model_pret/.venv/lib/python3.10/site-packages/numpy/lib/arraysetops.py", line 932, in union1d
    return unique(np.concatenate((ar1, ar2), axis=None))
  File "/home/utilisateur/projet/Brief_Prêt/Model_pret/.venv/lib/python3.10/site-packages/numpy/lib/arraysetops.py", line 274, in unique
    ret = _unique1d(ar, return_index, return_inverse, return_counts,
  File "/home/utilisateur/projet/Brief_Prêt/Model_pret/.venv/lib/python3.10/site-packages/numpy/lib/arraysetops.py", line 336, in _unique1d
    ar.sort()
TypeError: '<' not su

In [None]:
df_1.dropna(inplace=True)
generate_description(df_1)

In [None]:
y = df_1['Approve']
X = df_1.drop('Approve', axis=1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.05, random_state=42, stratify=y)

In [None]:
numerical_columns_selector = X.select_dtypes(exclude='object').columns
categorical_columns_selector = X.select_dtypes(include='object').columns

numerical_columns = X[numerical_columns_selector]
categorical_columns = X[categorical_columns_selector]

categorical_preprocessor = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
numerical_preprocessor = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_preprocessor, numerical_columns_selector),
        ("cat", categorical_preprocessor, categorical_columns_selector)
    ]
)

In [None]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', CatBoostClassifier(random_state=42, verbose=0))
])

param_grid = {
    'model__n_estimators': [100, 200],  
    'model__max_depth': [5, 10],  
    'model__learning_rate': [0.1, 0.01],  
    'model__subsample': [0.8, 0.9],  
    'model__colsample_bylevel': [0.8, 0.9]  
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1_macro')
grid_search.fit(X_train, y_train)

print("Meilleurs hyperparamètres trouvés :")
print(grid_search.best_params_)

print("Meilleure précision trouvée :", grid_search.best_score_)

best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)
f1_macro = f1_score(y_test, predictions, average='macro')
print("F1-score macro sur l'ensemble de test :", f1_macro)

In [None]:
df_2.dropna(inplace=True)
generate_description(df_2)

In [None]:
y = df_2['Approve']
X = df_2.drop('Approve', axis=1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.05, random_state=42, stratify=y)

In [None]:
numerical_columns_selector = X.select_dtypes(exclude='object').columns
categorical_columns_selector = X.select_dtypes(include='object').columns

numerical_columns = X[numerical_columns_selector]
categorical_columns = X[categorical_columns_selector]

categorical_preprocessor = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
numerical_preprocessor = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_preprocessor, numerical_columns_selector),
        ("cat", categorical_preprocessor, categorical_columns_selector)
    ]
)

In [None]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', CatBoostClassifier(random_state=42, verbose=0))
])

param_grid = {
    'model__n_estimators': [100, 200],  
    'model__max_depth': [5, 10],  
    'model__learning_rate': [0.1, 0.01],  
    'model__subsample': [0.8, 0.9],  
    'model__colsample_bylevel': [0.8, 0.9]  
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1_macro')
grid_search.fit(X_train, y_train)

print("Meilleurs hyperparamètres trouvés :")
print(grid_search.best_params_)

print("Meilleure précision trouvée :", grid_search.best_score_)

best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)
f1_macro = f1_score(y_test, predictions, average='macro')
print("F1-score macro sur l'ensemble de test :", f1_macro)

In [None]:
df_pure.dropna(inplace=True)
generate_description(df_pure)

In [None]:
y = df_pure['Approve']
X = df_pure.drop('Approve', axis=1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.05, random_state=42, stratify=y)

In [None]:
numerical_columns_selector = X.select_dtypes(exclude='object').columns
categorical_columns_selector = X.select_dtypes(include='object').columns

numerical_columns = X[numerical_columns_selector]
categorical_columns = X[categorical_columns_selector]

categorical_preprocessor = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
numerical_preprocessor = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_preprocessor, numerical_columns_selector),
        ("cat", categorical_preprocessor, categorical_columns_selector)
    ]
)

In [None]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', CatBoostClassifier(random_state=42, verbose=0))
])

param_grid = {
    'model__n_estimators': [100, 200],  
    'model__max_depth': [5, 10],  
    'model__learning_rate': [0.1, 0.01],  
    'model__subsample': [0.8, 0.9],  
    'model__colsample_bylevel': [0.8, 0.9]  
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1_macro')
grid_search.fit(X_train, y_train)

print("Meilleurs hyperparamètres trouvés :")
print(grid_search.best_params_)

print("Meilleure précision trouvée :", grid_search.best_score_)

best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)
f1_macro = f1_score(y_test, predictions, average='macro')
print("F1-score macro sur l'ensemble de test :", f1_macro)