In [10]:
import os
import sys
sys.path.append('../Module')
folder_path = '../Dataset'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn import set_config

df = pd.read_csv(folder_path + '/SBA_Cleaned_maybe.csv')

In [None]:
maybe_leaking_feature = ['RetainedJob', 'CreateJob']
maybe_leaking_fea = ['SBA_Appv', 'GrAppv']
df = df.drop(maybe_leaking_feature, axis=1)

In [11]:
y = df['Approve']
X = df.drop('Approve', axis=1)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [13]:
numerical_columns_selector = X.select_dtypes(exclude='object').columns.tolist()
categorical_columns_selector = X.select_dtypes(include='object').columns.tolist()

numeric_features = X[numerical_columns_selector]
categorical_features = X[categorical_columns_selector]

In [14]:
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [15]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [16]:
pipelines = {
    'RandomForest': Pipeline(steps=[('preprocessor', preprocessor),
                                    ('classifier', RandomForestClassifier())]),
    'SVM': Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', SVC())]),
    'LogisticRegression': Pipeline(steps=[('preprocessor', preprocessor),
                                          ('classifier', LogisticRegression())])
}

In [17]:
param_grids = {
    'RandomForest': {
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [None, 10, 20, 25],  
        'classifier__min_samples_leaf': [1, 2, 5, 80],  
        'classifier__min_samples_split': [2, 3],  
        'classifier__max_samples': [None, 0.7],  
        'classifier__random_state': [42]
    },
    'SVM': {
        'classifier__C': [0.1, 1, 10],
        'classifier__gamma': [0.1, 0.01, 'scale']
    },
    'LogisticRegression': {
        'classifier__C': [0.1, 1, 10],
        'classifier__penalty': ['l1', 'l2']
    }
}

In [18]:
best_models = {}
for model_name, pipeline in pipelines.items():
    grid_search = GridSearchCV(pipeline, param_grid=param_grids[model_name], cv=5, scoring='precision')
    grid_search.fit(X_train, y_train)  
    best_models[model_name] = grid_search.best_estimator_

best_model_name = max(best_models, key=lambda x: best_models[x].best_score_)

print("Meilleur modèle:", best_model_name)
print("Précision du meilleur modèle sur l'ensemble de validation croisée:", best_models[best_model_name].best_score_)
print("Meilleurs hyperparamètres:", best_models[best_model_name].best_params_)

ValueError: 
All the 640 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
640 fits failed with the following error:
Traceback (most recent call last):
  File "/home/utilisateur/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/utilisateur/.local/lib/python3.10/site-packages/sklearn/pipeline.py", line 401, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/home/utilisateur/.local/lib/python3.10/site-packages/sklearn/pipeline.py", line 359, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/home/utilisateur/.local/lib/python3.10/site-packages/joblib/memory.py", line 353, in __call__
    return self.func(*args, **kwargs)
  File "/home/utilisateur/.local/lib/python3.10/site-packages/sklearn/pipeline.py", line 893, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "/home/utilisateur/.local/lib/python3.10/site-packages/sklearn/utils/_set_output.py", line 140, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "/home/utilisateur/.local/lib/python3.10/site-packages/sklearn/compose/_column_transformer.py", line 724, in fit_transform
    self._validate_column_callables(X)
  File "/home/utilisateur/.local/lib/python3.10/site-packages/sklearn/compose/_column_transformer.py", line 426, in _validate_column_callables
    transformer_to_input_indices[name] = _get_column_indices(X, columns)
  File "/home/utilisateur/.local/lib/python3.10/site-packages/sklearn/utils/__init__.py", line 406, in _get_column_indices
    key_dtype = _determine_key_type(key)
  File "/home/utilisateur/.local/lib/python3.10/site-packages/sklearn/utils/__init__.py", line 282, in _determine_key_type
    raise ValueError(err_msg)
ValueError: No valid specification of the columns. Only a scalar, list or slice of all integers or all strings, or boolean mask is allowed
