##### Projet CO2 par Polina, Vincent, Denis

Ce notebook:
entraine un mod√®le de classification pour pr√©diction par Gradient Boosting Machines Catboost  
Prend en entr√©e les fichiers:
    (processed)/X_test_scaled.csv, X_train_scaled.csv, y_test_cat.csv, y_train_cat.csv : les donn√©es scal√©es et donc forc√©ment pr√©alablement s√©par√©es en jeux de train/test.

Fournit en sortie les fichiers:
    (models)/<nom_de_modele>.pkl


In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# charge les chemins vers les fichiers de donn√©es : base_processed, base_raw, base_models...
%run init_notebook.py

In [4]:
from common_co2 import load_our_data_cat, display_norm_matrix, display_roc

# Classification par Gradient Boosting Machines Catboost

In [5]:
import os
os.environ["LOKY_MAX_CPU_COUNT"] = "16"  # nombre de c≈ìurs physiques r√©els


In [6]:
X_train_scaled, X_test_scaled, y_train, y_test=load_our_data_cat()

In [7]:
# Cette case est une r√©f√©rence pour les hyperparam√®tres du mod√®le
hyperparams = {
    'iterations': 1000,  # Nombre d'it√©rations
    'learning_rate': 0.1,  # Taux d'apprentissage
    'depth': 6,  # Profondeur des arbres
    'l2_leaf_reg': 3,  # R√©gularisation L2
    'random_seed': 42,  # Seed pour la reproductibilit√©
}

In [None]:
!pip install catboost

In [8]:
from catboost import CatBoostClassifier

# Initialiser le mod√®le
model = CatBoostClassifier(**hyperparams)

ModuleNotFoundError: No module named 'catboost'

In [None]:
# Fonction pour entra√Æner le mod√®le
def train_model(model, X_train_scaled, y_train):
    """
    Entra√Æne le mod√®le.
    """
    model.fit(X_train_scaled, y_train, verbose=0)  # verbose=0 pour √©viter trop de logs
    return model

# Fonction pour √©valuer les performances du mod√®le
from sklearn.metrics import classification_report, f1_score, recall_score
def evaluate_model(model, X_test_scaled, y_test):
    """
    Calcule et affiche les m√©triques du mod√®le.
    """
    # Pr√©dictions sur les donn√©es de test
    y_pred = model.predict(X_test_scaled)
    
    # Calcul et affichage de la pr√©cision
    accuracy = model.score(X_test_scaled, y_test)
    print(f"Pr√©cision du mod√®le : {accuracy:.2f}")
    
    # Calcul et affichage du F1-score
    f1 = f1_score(y_test, y_pred, average='weighted')  # 'weighted' pour les classes d√©s√©quilibr√©es
    print(f"F1-score : {f1:.2f}")
    
    # Calcul et affichage du recall
    recall = recall_score(y_test, y_pred, average='weighted')
    print(f"Recall : {recall:.2f}")
    
    # Affichage du rapport de classification
    report = classification_report(y_test, y_pred)
    print("Rapport de classification :")
    print(report)

    return accuracy, f1, recall, y_pred

In [None]:
# info pour les repr√©sentations graphiques
name="CatBoost"


In [None]:
"""
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
import numpy as np

# Fonction pour afficher la matrice de confusion
def display_confusion_matrix(model, X_test_scaled, y_test, name="Mod√®le", params=None):
    """
    Affiche la matrice de confusion normalis√©e.
    """
    # Pr√©dictions
    y_pred = model.predict(X_test_scaled)

    # Matrice de confusion brute
    cm = confusion_matrix(y_test, y_pred)

    # Normalisation par le nombre total de chaque classe
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    print(f"\nüîπ Matrice de confusion pour {name} avec {params} üîπ")

    # Affichage
    plt.figure(figsize=(8, 6))

    # Heatmap avec les deux annotations
    sns.heatmap(cm_normalized, annot=True, fmt=".2f", cmap="Blues",
                xticklabels=np.unique(y_test), yticklabels=np.unique(y_test))

    """
    # si version trop r√©cente de matplotlib, ajout des valeurs sur une deuxi√®me couche d'annotations
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            plt.text(j + 0.5, i + 0.5, f"{cm[i, j]}",
                     ha="center", va="center", color="black", fontsize=10)
    """

    plt.xlabel("Pr√©dictions")
    plt.ylabel("Vraies classes")
    plt.title(f"Matrice de confusion normalis√©e, {name}")
    plt.show()
    """

In [None]:
model=train_model(model, X_train_scaled, y_train)
accuracy, f1, recall, y_pred=evaluate_model(model, X_test_scaled, y_test)

In [None]:
text_hyperparams = f"iterations={hyperparams['iterations']}, depth={hyperparams['depth']}, l2_leaf_reg={hyperparams['l2_leaf_reg']}"
display_norm_matrix(name, y_pred, y_test, text_hyperparams)
# display_confusion_matrix(model, X_test_scaled, y_test, name=name, params=hyperparams)

Quelle est la qualit√© du r√©sultat ?  

In [None]:
# Nouveau set d'hyperparam√®tres du mod√®le
hyperparams = {
    'iterations': 1500,  # Nombre d'it√©rations
    'learning_rate': 0.1,  # Taux d'apprentissage
    'depth': 6,  # Profondeur des arbres
    'l2_leaf_reg': 2,  # R√©gularisation L2
    'random_seed': 42,  # Seed pour la reproductibilit√©
}

model = CatBoostClassifier(**hyperparams)
model=train_model(model, X_train_scaled, y_train)
accuracy, f1, recall, y_pred=evaluate_model(model, X_test_scaled, y_test)

In [None]:
text_hyperparams = f"iterations={hyperparams['iterations']}, depth={hyperparams['depth']}, l2_leaf_reg={hyperparams['l2_leaf_reg']}"
display_norm_matrix(name, y_pred, y_test, text_hyperparams)
# display_confusion_matrix(model, X_test_scaled, y_test, name=name, params=hyperparams)

In [None]:
# Nouveau set d'hyperparam√®tres du mod√®le
hyperparams = {
    'iterations': 1500,  # Nombre d'it√©rations
    'learning_rate': 0.1,  # Taux d'apprentissage
    'depth': 6,  # Profondeur des arbres
    'l2_leaf_reg': 2,  # R√©gularisation L2
    'random_seed': 42,  # Seed pour la reproductibilit√©
}

from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)


model = CatBoostClassifier(**hyperparams)
model=train_model(model, X_train_resampled, y_train_resampled)
accuracy, f1, recall, y_pred=evaluate_model(model, X_test_scaled, y_test)


In [None]:
text_hyperparams = f"iterations={hyperparams['iterations']}, depth={hyperparams['depth']}, l2_leaf_reg={hyperparams['l2_leaf_reg']}"
text_hyperparams += f", SMOTE"
display_norm_matrix(name, y_pred, y_test, text_hyperparams)
# display_confusion_matrix(model, X_test_scaled, y_test, name=name, params=hyperparams)

In [None]:
display_roc(X_test_scaled, y_test, y_pred, model)

# Tentative d'am√©lioration par p√©nalit√©

La classe 2 est sous repr√©sent√©e par rapport √† la classe 3,  
on applique donc une p√©nalit√© aux probabilit√©s

In [11]:
# param√®tres pour ce chapitre:
threshold = 0.2 # par exemple 0.1 pour 10% de favorisation

In [None]:
# Pr√©dire les classes sur les donn√©es de test
y_prob = model.predict_proba(X_test_scaled)

y_adjusted_pred = []

# Custom logic: on priorisera class 2 si c'est celle qui est la plus probable au threshold pr√®s
for prob in y_prob:
    # La classe avec la plus haute probe est:
    max_prob_class_index = np.argmax(prob)

    # Check if class 2 is close enough to the maximum probability
    if prob[1] >= prob[max_prob_class_index] - threshold:  # Close enough to the max probability
        y_adjusted_pred.append(2)  # Favor class 2
    else:
        y_adjusted_pred.append(max_prob_class_index+1)  # Stick to the class with the highest probability


In [None]:
# for info 
display(y_prob)
# k-NN with k=5 (our best results) is not very good for improvement by penalty,
# because the granularity of the probabilities is low.

In [None]:
from sklearn.metrics import accuracy_score, f1_score, recall_score

# Calculate new metrics
adjusted_accuracy = accuracy_score(y_test, y_adjusted_pred)
adjusted_f1 = f1_score(y_test, y_adjusted_pred, average='weighted')
adjusted_recall = recall_score(y_test, y_adjusted_pred, average='weighted')  # Include recall calculation

# Print the metrics
print(f"Adjusted Accuracy: {adjusted_accuracy:.2f}")
print(f"Adjusted F1-Score: {adjusted_f1:.2f}")
print(f"Adjusted Recall: {adjusted_recall:.2f}")


In [None]:
# pour info, les lignes qui ont boug√© avec la p√©nalisation

import pandas as pd

# Create a DataFrame to compare the Series
comparison_df = pd.DataFrame({
    "Original Prediction": y_pred,
    "Adjusted Prediction": y_adjusted_pred
})

# Add a column to indicate differences
comparison_df["Difference"] = comparison_df["Original Prediction"] != comparison_df["Adjusted Prediction"]

# Display rows with differences
differences = comparison_df[comparison_df["Difference"]]
display(differences)

In [None]:
# affichage de la matrice
text_hyperparams = f"iterations={hyperparams['iterations']}, depth={hyperparams['depth']}, l2_leaf_reg={hyperparams['l2_leaf_reg']}"
text_hyperparams += f", p√©nalit√© {threshold} %"
display_norm_matrix(name, y_pred, y_test, text_hyperparams)

NameError: name 'y_pred' is not defined

# Sauvegarde du mod√®le

In [None]:
import joblib

# Chemin pour enregistrer le mod√®le
model_path = base_models + 'rf_cat.pkl'

# Enregistrer le mod√®le
joblib.dump(random_forest_model, model_path)

print(f"Mod√®le random_forest enregistr√© dans {model_path}")
