In [None]:
import sys
sys.path.append("..")

from catboost import CatBoostClassifier

from sklearn.ensemble import (AdaBoostClassifier, BaggingClassifier,
                              HistGradientBoostingClassifier,
                              RandomForestClassifier, StackingClassifier)
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import (classification_report, make_scorer, recall_score)
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC


from global_vars import X_test, X_train, y_test, y_train

Saving test_data15variables_dropFirst_stratified.csv to test_data15variables_dropFirst_stratified.csv
Saving train_data15variables_dropFirst_stratified.csv to train_data15variables_dropFirst_stratified.csv


In [None]:
# The same scorer is used for every model in this notebook: recall restricted
# to class label 2 (with a single label, the macro average is that label's recall).
scorer = make_scorer(
    recall_score,
    labels=[2],
    average="macro",
)

In [None]:
# CatBoost with library defaults; the grid below varies the learning rate,
# tree depth and number of iterations (3 x 3 x 3 = 27 candidates).
catboost = CatBoostClassifier()

param_grid = {
    "learning_rate": [0.1, 0.01, 0.001],
    "depth": [4, 6, 8],
    "iterations": [100, 200, 300],
}

# 5-fold cross-validated search, ranked with the shared `scorer`.
grid_search = GridSearchCV(
    estimator=catboost,
    param_grid=param_grid,
    cv=5,
    scoring=scorer,
)

In [None]:
# Run the cross-validated search on the training split.
grid_search.fit(X_train, y_train)

In [None]:
# Show the hyper-parameter combination with the best cross-validated score.
print(
    "Best parameters:",
    grid_search.best_params_,
)

Best parameters: {'depth': 4, 'iterations': 200, 'learning_rate': 0.001}


In [None]:
# Keep a handle on the estimator that GridSearchCV built with the best
# parameters found by the search.
best_model = grid_search.best_estimator_

In [None]:
# NOTE(review): grid_search was built without overriding `refit`, so
# best_estimator_ should already be fitted on (X_train, y_train); this call
# retrains the same configuration from scratch and looks redundant.
best_model.fit(X_train, y_train)

In [None]:
# Score the tuned CatBoost model on the held-out test split.
y_pred = best_model.predict(X_test)
print(
    "Classification Report sur l'ensemble de test avec CatboostClassifier(): \n",
    classification_report(y_test, y_pred),
)

Classification Report sur l'ensemble de test avec CatboostClassifier(): 
               precision    recall  f1-score   support

           1       0.57      0.62      0.60      6163
           2       0.38      0.70      0.49      6163
           3       0.41      0.12      0.18      6164
           4       0.41      0.31      0.35      6163

    accuracy                           0.44     24653
   macro avg       0.44      0.44      0.40     24653
weighted avg       0.44      0.44      0.40     24653



# ADABOOST

In [None]:
# AdaBoost with its default base learner. The seed is fixed so that reruns
# of the search (Restart & Run All) select the same model.
adaboost = AdaBoostClassifier(random_state=42)

# 3 x 3 = 9 candidates over ensemble size and learning rate.
param_grid = {"n_estimators": [50, 100, 200], "learning_rate": [0.1, 0.01, 0.001]}

# 5-fold cross-validated search, ranked with the shared `scorer`.
grid_search = GridSearchCV(
    estimator=adaboost, param_grid=param_grid, cv=5, scoring=scorer
)

In [None]:
# Run the AdaBoost search: 9 candidates x 5 folds = 45 cross-validation fits.
grid_search.fit(X_train, y_train)

In [None]:
# Show the hyper-parameter combination with the best cross-validated score.
print(
    "Best parameters:",
    grid_search.best_params_,
)

Best parameters: {'learning_rate': 0.001, 'n_estimators': 50}


In [None]:
# GridSearchCV refits the best configuration on the full training set by
# default (refit=True), so best_estimator_ is already trained; fitting it
# again would only repeat that work.
best_model = grid_search.best_estimator_

In [None]:
# Predict with the tuned AdaBoost model before reporting. Without this line
# `y_pred` still holds the predictions of the previous (CatBoost) section and
# the report below would describe the wrong model.
y_pred = best_model.predict(X_test)
print("Classification Report sur l'ensemble de test avec Adaboost(): \n", classification_report(y_test, y_pred))

Classification Report sur l'ensemble de test avec Adaboost(): 
               precision    recall  f1-score   support

           1       0.57      0.62      0.60      6163
           2       0.38      0.70      0.49      6163
           3       0.41      0.12      0.18      6164
           4       0.41      0.31      0.35      6163

    accuracy                           0.44     24653
   macro avg       0.44      0.44      0.40     24653
weighted avg       0.44      0.44      0.40     24653



# SGD

In [None]:
# Logistic-regression loss trained with SGD.
# - "log_loss" is the current name of the loss; the old "log" alias is
#   deprecated and rejected by recent scikit-learn releases.
# - eta0 is the initial learning rate used by the "constant" and "adaptive"
#   schedules searched below. Left at 0 it makes every such candidate fail
#   with "eta0 must be > 0" (240 of the 360 fits in the original run).
# - random_state makes the shuffling, and therefore the search, reproducible.
sgd = SGDClassifier(loss="log_loss", eta0=0.01, random_state=42)

# 4 x 2 x 3 x 3 = 72 candidates.
param_grid = {
    "alpha": [0.0001, 0.001, 0.01, 0.1],
    "penalty": ["l1", "l2"],
    "max_iter": [1000, 2000, 3000],
    "learning_rate": ["constant", "optimal", "adaptive"],
}

# 5-fold cross-validated search, ranked with the shared `scorer`.
grid_search = GridSearchCV(estimator=sgd, param_grid=param_grid, cv=5, scoring=scorer)

In [None]:
# Run the SGD search: 72 candidates x 5 folds = 360 cross-validation fits.
grid_search.fit(X_train, y_train)

240 fits failed out of a total of 360.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
240 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_stochastic_gradient.py", line 892, in fit
    self._more_validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_stochastic_gradient.py", line 149, in _more_validate_params
    raise ValueError("eta0 must be > 0")
ValueError: eta0 must be > 0

 0.57092409 0.5892204  0.58800394 0.59266514 0.63428161 0.59331581
   

In [None]:
# Show the hyper-parameter combination with the best cross-validated score.
print(
    "Best parameters:",
    grid_search.best_params_,
)

Best parameters: {'alpha': 0.1, 'learning_rate': 'optimal', 'max_iter': 2000, 'penalty': 'l2'}


In [None]:
# GridSearchCV refits the best configuration on the full training set by
# default (refit=True), so best_estimator_ is already trained; fitting it
# again would only repeat that work.
best_model = grid_search.best_estimator_

In [None]:
# Score the tuned SGD model on the held-out test split.
y_pred = best_model.predict(X_test)
print(
    "Classification Report sur l'ensemble de test avec SGD(): \n",
    classification_report(y_test, y_pred),
)

Classification Report sur l'ensemble de test avec SGD(): 
               precision    recall  f1-score   support

           1       0.54      0.71      0.61      6163
           2       0.46      0.65      0.54      6163
           3       0.40      0.17      0.24      6164
           4       0.45      0.37      0.41      6163

    accuracy                           0.48     24653
   macro avg       0.46      0.48      0.45     24653
weighted avg       0.46      0.48      0.45     24653



# HGB

In [None]:
# Histogram-based gradient boosting. The seed is fixed so that reruns of the
# search (Restart & Run All) select the same model.
hgb = HistGradientBoostingClassifier(random_state=42)

# 3 x 3 x 3 = 27 candidates; max_depth=None leaves the tree depth unconstrained.
param_grid = {
    "max_iter": [50, 100, 200],
    "learning_rate": [0.1, 0.01, 0.001],
    "max_depth": [None, 5, 10],
}

# 5-fold cross-validated search, ranked with the shared `scorer`.
grid_search = GridSearchCV(estimator=hgb, param_grid=param_grid, cv=5, scoring=scorer)

In [None]:
# Run the HGB search: 27 candidates x 5 folds = 135 cross-validation fits.
grid_search.fit(X_train, y_train)

In [None]:
# Show the hyper-parameter combination with the best cross-validated score.
print(
    "Best parameters:",
    grid_search.best_params_,
)

Best parameters: {'learning_rate': 0.01, 'max_depth': 10, 'max_iter': 200}


In [None]:
# GridSearchCV refits the best configuration on the full training set by
# default (refit=True), so best_estimator_ is already trained; fitting it
# again would only repeat that work.
best_model = grid_search.best_estimator_

In [None]:
# Predict the held-out test split with the tuned HGB model; the next cell
# reports on these predictions.
y_pred = best_model.predict(X_test)

In [None]:
# Per-class precision / recall / F1 of the current `y_pred` against y_test.
print(
    "Classification Report sur l'ensemble de test avec HGB(): \n",
    classification_report(y_test, y_pred),
)

Classification Report sur l'ensemble de test avec HGB(): 
               precision    recall  f1-score   support

           1       0.64      0.74      0.69      6163
           2       0.52      0.66      0.59      6163
           3       0.41      0.28      0.33      6164
           4       0.48      0.43      0.46      6163

    accuracy                           0.53     24653
   macro avg       0.51      0.53      0.52     24653
weighted avg       0.51      0.53      0.52     24653



# Bagging Classifier avec base model par défaut (Decision Tree)

In [None]:
# Bagging with the default base estimator. The seed is fixed so that the
# random sample/feature draws, and therefore the search, are reproducible.
bagging = BaggingClassifier(random_state=42)

In [None]:
# Search space for bagging: 3 x 3 x 3 x 2 x 2 = 108 candidates.
# Float values for max_samples / max_features are fractions of the training
# rows / columns given to each base estimator.
param_grid = {
    "n_estimators": [10, 50, 100],
    "max_samples": [0.5, 0.7, 1.0],
    "max_features": [0.5, 0.7, 1.0],
    "bootstrap": [True, False],
    "bootstrap_features": [True, False],
}


In [None]:
# Recall restricted to class label 2 (same definition as the scorer created
# at the top of the notebook).
scorer = make_scorer(
    recall_score,
    labels=[2],
    average="macro",
)

In [None]:
# 5-fold cross-validated search over the bagging grid, ranked with `scorer`.
grid_search = GridSearchCV(
    estimator=bagging,
    param_grid=param_grid,
    cv=5,
    scoring=scorer,
)


In [None]:
# Run the bagging search: 108 candidates x 5 folds = 540 cross-validation fits.
grid_search.fit(X_train, y_train)

In [None]:
# Show the hyper-parameter combination with the best cross-validated score.
print(
    "Best parameters:",
    grid_search.best_params_,
)

Best parameters: {'bootstrap': False, 'bootstrap_features': True, 'max_features': 0.5, 'max_samples': 0.5, 'n_estimators': 100}


In [None]:
# Keep a handle on the estimator that GridSearchCV built with the best
# parameters found by the search.
best_model = grid_search.best_estimator_

In [None]:
# NOTE(review): grid_search was built without overriding `refit`, so
# best_estimator_ should already be fitted on (X_train, y_train); this call
# retrains the same configuration from scratch and looks redundant.
best_model.fit(X_train, y_train)

In [None]:
# Predict the held-out test split with the tuned bagging model; the next cell
# reports on these predictions.
y_pred = best_model.predict(X_test)

In [None]:
# Per-class precision / recall / F1 of the current `y_pred` against y_test.
print(
    "Classification Report sur l'ensemble de test avec BaggingClassifier(): \n",
    classification_report(y_test, y_pred),
)

Classification Report sur l'ensemble de test avec BaggingClassifier(): 
               precision    recall  f1-score   support

           1       0.59      0.78      0.67      6163
           2       0.52      0.64      0.57      6163
           3       0.43      0.27      0.33      6164
           4       0.49      0.39      0.43      6163

    accuracy                           0.52     24653
   macro avg       0.51      0.52      0.50     24653
weighted avg       0.51      0.52      0.50     24653



# Stacking

In [None]:
# Base learners for the stacking ensemble, each with fixed hyper-parameters
# (presumably tuned elsewhere -- TODO confirm where these values come from).
logistic_regression = LogisticRegression(
    C=0.001,
    penalty="l2",
    max_iter=1000,
)
SVM = SVC(kernel="linear", C=0.1)
knn_model = KNeighborsClassifier(n_neighbors=17)

In [None]:
# (name, estimator) pairs for the first level of the stack.
base_estimators = [
    ("logistic_regression", logistic_regression),
    ("svm", SVM),
    ("knn", knn_model),
]

# A random forest combines the base learners' outputs. Its seed is fixed so
# that reruns of the search (Restart & Run All) select the same model.
stacking = StackingClassifier(
    estimators=base_estimators,
    final_estimator=RandomForestClassifier(random_state=42),
)

In [None]:
# Only the meta-learner is tuned: the `final_estimator__` prefix routes each
# parameter to the RandomForest inside the stack (3 x 3 x 3 = 27 candidates).
param_grid = {
    "final_estimator__n_estimators": [10, 50, 100],
    "final_estimator__max_depth": [None, 5, 10],
    "final_estimator__min_samples_split": [2, 5, 10],
}

# error_score="raise" aborts the search on the first failing fit instead of
# recording NaN for that candidate.
grid_search = GridSearchCV(
    estimator=stacking,
    param_grid=param_grid,
    cv=5,
    scoring=scorer,
    error_score="raise",
)

In [None]:
# Run the stacking search: 27 candidates x 5 folds = 135 cross-validation fits.
# Each fit trains all three base learners as well as the meta-learner, so this
# cell is likely by far the slowest in the notebook.
grid_search.fit(X_train, y_train)