# Test et Évaluation des Modèles ML

In [3]:
import sys
import json
import pickle
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

## Chemins du projet + import de DatasetLoader


In [4]:
# --- Project layout ----------------------------------------------------------
# The notebook is assumed to sit one directory below PROJECT_ROOT; adjust
# PROJECT_ROOT if the notebook lives elsewhere.
NOTEBOOK_DIR = Path.cwd()
PROJECT_ROOT = NOTEBOOK_DIR.parent

# Inputs: the scoring dataset and the pickled best model.
DATASET_PATH = PROJECT_ROOT / "datasets" / "dataset_scoring.csv"
SAVED_MODELS_DIR = PROJECT_ROOT / "saved_models"
BEST_MODEL_PATH = SAVED_MODELS_DIR / "best_model.pkl"

# Output directory for the evaluation artefacts; created here if missing.
REPORT_DIR = PROJECT_ROOT / "evaluation"
REPORT_DIR.mkdir(parents=True, exist_ok=True)

# --- Import path for the local `ml_models` package ---------------------------
# Two candidate roots are kept for backward compatibility:
#   * SCORING_DIR: a "Scoring" folder *inside* PROJECT_ROOT (may not exist,
#     depending on where the notebook is run from);
#   * scoring_dir: two levels above the notebook directory.
SCORING_DIR = PROJECT_ROOT / "Scoring"

current_dir = Path().resolve()
ml_models_dir = current_dir.parent
scoring_dir = ml_models_dir.parent

# Listed lowest priority first: each insert(0, ...) pushes earlier entries
# down, so scoring_dir ends up ahead of SCORING_DIR, as in the original cell.
# Non-existent directories and entries already on sys.path are skipped, so
# re-running this cell does not keep growing sys.path.
for _import_root in (SCORING_DIR, scoring_dir):
    _import_root_str = str(_import_root)
    if _import_root.is_dir() and _import_root_str not in sys.path:
        sys.path.insert(0, _import_root_str)

# Must come after the sys.path setup above, hence not in the top import cell.
from ml_models.dataset_loader import DatasetLoader

## Charger le dataset et préparer les features

In [5]:
# Hold-out configuration: 20% of the rows are reserved for testing, with a
# fixed seed so the split is the same on every run.
TEST_SIZE = 0.2
RANDOM_STATE = 42

# Read the raw CSV, then let the project loader turn it into X and y.
df = pd.read_csv(DATASET_PATH)

loader = DatasetLoader()
X, y = loader.prepare_features(df)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

# Quick sanity check: shapes of what was loaded and where artefacts live.
print("df:", df.shape)
print("X:", X.shape, "y:", y.shape)
print("Best model:", BEST_MODEL_PATH)
print("Report dir:", REPORT_DIR)

df: (10000, 12)
X: (10000, 10) y: (10000,)
Best model: c:\Users\hp\Desktop\EcoLabel-MS-Score-environnemental-des-produits\Scoring\ml_models\saved_models\best_model.pkl
Report dir: c:\Users\hp\Desktop\EcoLabel-MS-Score-environnemental-des-produits\Scoring\ml_models\evaluation


## Charger le meilleur modèle

In [6]:
# Deserialize the persisted best model from disk.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load model files that come from a trusted source.
with BEST_MODEL_PATH.open("rb") as model_file:
    best_model = pickle.load(model_file)

# Bare last expression so the notebook renders the loaded object.
best_model

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


## Prédictions + métriques sur TEST

In [7]:
# Predict on the held-out split with the loaded model.
y_pred = best_model.predict(X_test)

# Regression metrics on the test split, cast to built-in floats.
# RMSE is the square root of the mean squared error.
mse = mean_squared_error(y_test, y_pred)
rmse = float(np.sqrt(mse))
mae = float(mean_absolute_error(y_test, y_pred))
r2 = float(r2_score(y_test, y_pred))

for label, value in (("RMSE:", rmse), ("MAE :", mae), ("R2  :", r2)):
    print(label, value)

RMSE: 5.17560162118052
MAE : 4.130355686060208
R2  : 0.9685332202200759


## Analyse des erreurs (résidus) + statistiques rapides

In [8]:
# Signed residuals (true minus predicted) and their magnitudes.
residuals = y_test - y_pred
abs_residuals = np.abs(residuals)

# Each entry applies one NumPy reducer to either the signed or the absolute
# residuals and stores the result as a built-in float.
summary = {
    key: float(reducer(values))
    for key, reducer, values in (
        ("residual_mean", np.mean, residuals),
        ("residual_std", np.std, residuals),
        ("residual_min", np.min, residuals),
        ("residual_max", np.max, residuals),
        ("abs_error_mean", np.mean, abs_residuals),
        ("abs_error_median", np.median, abs_residuals),
    )
}

summary

{'residual_mean': 0.03558143170314589,
 'residual_std': 5.175479311414971,
 'residual_min': -18.766308579592092,
 'residual_max': 19.19516089008426,
 'abs_error_mean': 4.130355686060208,
 'abs_error_median': 3.4300505739837757}

## Top erreurs (les pires prédictions)

In [9]:
# Use the raw values when y_test exposes `.values` (e.g. a pandas Series),
# otherwise use y_test as-is, so both columns line up by position.
y_true_values = getattr(y_test, "values", y_test)

# Per-sample table: truth, prediction, signed error and absolute error.
errors_df = (
    pd.DataFrame({"y_true": y_true_values, "y_pred": y_pred})
    .assign(error=lambda frame: frame["y_true"] - frame["y_pred"])
    .assign(abs_error=lambda frame: frame["error"].abs())
)

# Show the 15 rows with the largest absolute error.
worst_predictions = errors_df.sort_values("abs_error", ascending=False).head(15)
display(worst_predictions)

Unnamed: 0,y_true,y_pred,error,abs_error
233,29.097417,9.902256,19.195161,19.195161
919,70.621647,89.387955,-18.766309,18.766309
1945,54.927282,36.42382,18.503462,18.503462
151,10.332573,27.299173,-16.9666,16.9666
27,61.667341,45.256628,16.410712,16.410712
609,70.00566,54.520307,15.485353,15.485353
1362,33.486246,48.566507,-15.080262,15.080262
1027,31.746752,46.77869,-15.031937,15.031937
1552,47.757207,32.958993,14.798214,14.798214
1219,54.393382,69.070278,-14.676896,14.676896


## Sauvegarder les prédictions + le rapport JSON

In [10]:
# Write the per-sample predictions/errors table to CSV (index not written).
pred_path = REPORT_DIR / "predictions.csv"
errors_df.to_csv(pred_path, index=False)

# Assemble the full evaluation report: input paths, split sizes, test
# metrics, residual statistics and the feature names.
metrics = {"rmse": rmse, "mae": mae, "r2": r2}
report = {
    "dataset_path": str(DATASET_PATH),
    "best_model_path": str(BEST_MODEL_PATH),
    "n_samples_total": len(X),
    "n_train": len(X_train),
    "n_test": len(X_test),
    "metrics": metrics,
    "residuals_summary": summary,
    "features": list(X.columns),
}

# Serialize as pretty-printed UTF-8 JSON, keeping non-ASCII characters as-is.
report_path = REPORT_DIR / "evaluation_report.json"
report_path.write_text(
    json.dumps(report, indent=2, ensure_ascii=False),
    encoding="utf-8",
)

for saved_path in (pred_path, report_path):
    print("Saved:", saved_path)

Saved: c:\Users\hp\Desktop\EcoLabel-MS-Score-environnemental-des-produits\Scoring\ml_models\evaluation\predictions.csv
Saved: c:\Users\hp\Desktop\EcoLabel-MS-Score-environnemental-des-produits\Scoring\ml_models\evaluation\evaluation_report.json
