In [1]:
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
import Utils as ut
from sklearn.base import BaseEstimator, ClassifierMixin
import pandas as pd



In [2]:
class CorrectedMultiOutputClassifier(BaseEstimator, ClassifierMixin):
    """Wrap a multi-output classifier and apply label correction after prediction.

    Parameters
    ----------
    base_estimator : estimator
        Multi-output classifier exposing ``fit``, ``predict`` and
        ``predict_proba``. It is fitted in place by :meth:`fit`.
    correction_estimator : object or None, default=None
        Object exposing ``transform(df)`` and, optionally, ``fit(X)``.
        When ``None``, a ``ut.LabelCorrection()`` is created at fit time.

    Attributes
    ----------
    correction_estimator_ : object
        The corrector that :meth:`fit` actually used (the supplied one, or the
        default ``ut.LabelCorrection()`` when none was supplied).
    """

    def __init__(self, base_estimator, correction_estimator=None):
        # scikit-learn contract: __init__ stores its arguments untouched so that
        # get_params()/set_params()/clone() round-trip correctly. The default
        # corrector is therefore resolved lazily, not here.
        self.base_estimator = base_estimator
        self.correction_estimator = correction_estimator

    def _resolve_corrector(self):
        """Return the corrector to use at prediction time."""
        if self.correction_estimator is not None:
            return self.correction_estimator
        # No corrector supplied: reuse the default built (and possibly fitted)
        # by fit(); fall back to a fresh default if fit() was never called.
        fitted_default = getattr(self, "correction_estimator_", None)
        if fitted_default is None:
            fitted_default = ut.LabelCorrection()
        return fitted_default

    def fit(self, X, y):
        """Fit the base estimator on ``(X, y)`` and the corrector on ``X``.

        The corrector is only fitted when it exposes a ``fit`` method.

        Returns
        -------
        self
        """
        self.base_estimator.fit(X, y)
        corrector = self.correction_estimator
        if corrector is None:
            corrector = ut.LabelCorrection()
        if hasattr(corrector, "fit"):
            corrector.fit(X)
        self.correction_estimator_ = corrector
        return self

    def predict(self, X):
        """Predict labels with the base estimator, then apply the corrector.

        Returns
        -------
        numpy.ndarray
            Values of the ``ut.LABEL_COLUMNS`` columns after correction.
        """
        raw_preds = self.base_estimator.predict(X)
        df = pd.DataFrame(raw_preds, columns=ut.LABEL_COLUMNS)
        # The corrector receives the predictions plus a "rating" column.
        # NOTE(review): when this estimator is the last step of a Pipeline, X is
        # the output of the previous step; the real ratings are only used if that
        # output is still a DataFrame with a "rating" column -- confirm.
        if isinstance(X, pd.DataFrame) and "rating" in X.columns:
            df["rating"] = X["rating"].values
        else:
            df["rating"] = 3  # neutral default value
        df = self._resolve_corrector().transform(df)
        return df[ut.LABEL_COLUMNS].values

    def predict_proba(self, X):
        """Return the base estimator's probabilities; no correction is applied."""
        return self.base_estimator.predict_proba(X)


In [3]:
%%time
df_train = pd.read_csv(ut.PATH_LABELISED_SET,index_col=0)
# 4. Fit sur ton set d'entraînement
df_train = ut.Preprocessor().transform(df_train)
X_train = df_train[["revue", "rating"]]
y_train = df_train[ut.LABEL_COLUMNS]
corrected_model = CorrectedMultiOutputClassifier(
    base_estimator=MultiOutputClassifier(
        XGBClassifier(
            n_jobs=1,
            use_label_encoder=False,  # pour éviter le warning
            eval_metric='logloss',   # évite erreur pour multiclass
            random_state=42,
            verbosity=0,
            n_estimators = 100,
            subsample = 0.8,
            colsample_bytree = 0.8
        )
    ),
    correction_estimator=ut.LabelCorrection()
)
pipeline = Pipeline([
    ('features', ut.final_features),
    ('model', corrected_model)
])
param_grid = {
    "model__base_estimator__estimator__max_depth": [3, 5, 7,9],
    "model__base_estimator__estimator__learning_rate": [0.03, 0.05, 0.1,0.15]
}
grid = GridSearchCV(pipeline, param_grid, cv=3, scoring="f1_micro", verbose=2)
grid.fit(X_train, y_train)
cv_results = pd.DataFrame(grid.cv_results_)
cv_results = cv_results.sort_values(by="mean_test_score", ascending=False)


Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] END model__base_estimator__estimator__learning_rate=0.03, model__base_estimator__estimator__max_depth=3; total time=  24.6s
[CV] END model__base_estimator__estimator__learning_rate=0.03, model__base_estimator__estimator__max_depth=3; total time=  23.2s
[CV] END model__base_estimator__estimator__learning_rate=0.03, model__base_estimator__estimator__max_depth=3; total time=  22.9s
[CV] END model__base_estimator__estimator__learning_rate=0.03, model__base_estimator__estimator__max_depth=5; total time=  23.6s
[CV] END model__base_estimator__estimator__learning_rate=0.03, model__base_estimator__estimator__max_depth=5; total time=  23.6s
[CV] END model__base_estimator__estimator__learning_rate=0.03, model__base_estimator__estimator__max_depth=5; total time=  23.0s
[CV] END model__base_estimator__estimator__learning_rate=0.03, model__base_estimator__estimator__max_depth=7; total time=  26.7s
[CV] END model__base_estimator__esti

In [4]:
# Show the first five rows of cv_results, which was sorted by
# mean_test_score in descending order when it was built.
cv_results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__base_estimator__estimator__learning_rate,param_model__base_estimator__estimator__max_depth,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
8,15.716364,0.976158,7.270354,1.204419,0.1,3,{'model__base_estimator__estimator__learning_r...,0.762234,0.822391,0.69992,0.761515,0.050001,1
12,15.330872,0.778245,7.60321,1.157875,0.15,3,{'model__base_estimator__estimator__learning_r...,0.761788,0.813089,0.694162,0.756346,0.048704,2
6,17.598683,1.411276,7.126128,1.099047,0.05,7,{'model__base_estimator__estimator__learning_r...,0.751557,0.817943,0.697987,0.755829,0.049065,3
11,17.804376,1.248942,7.195942,1.128949,0.1,9,{'model__base_estimator__estimator__learning_r...,0.749814,0.828909,0.687853,0.755525,0.057727,4
15,17.299503,1.538912,7.168088,1.079545,0.15,9,{'model__base_estimator__estimator__learning_r...,0.75453,0.820016,0.690247,0.754931,0.052979,5
