# Monitoring

In [3]:
import pandas as pd 
from evidently.metric_preset import ClassificationPreset, DataDriftPreset
from evidently.report import Report
from IPython.display import IFrame
#import train_model

# Utility helper that avoids a KeyError when the column to drop is missing
def drop_column_if_exists(df, col):
    """Return ``df`` without the column ``col`` when that column is present.

    Args:
        df: DataFrame to drop the column from.
        col: Label of the column to remove.

    Returns:
        A new DataFrame lacking ``col`` if it was among ``df.columns``;
        otherwise the very same ``df`` object, untouched.
    """
    if col not in df.columns:
        # Nothing to remove: hand back the input as-is (no copy is made).
        return df
    return df.drop(columns=col)

# Smoothed target encoding of a categorical column
def target_encode_smooth(df, col, target, alpha=40):
    """Encode ``col`` as smoothed per-class frequencies of ``target``.

    For every non-null class ``y`` of ``target`` one output column named
    ``f"{col}_enc_{y}"`` is produced.  For a row whose ``col`` value is ``c``
    the encoded value is::

        (n_cy + alpha * p_y) / (n_c + alpha)

    where ``n_cy`` is the number of rows with category ``c`` and class ``y``,
    ``n_c`` is the number of rows with category ``c`` (and a non-null
    target), and ``p_y`` is the overall relative frequency of class ``y``.

    Args:
        df: DataFrame containing both ``col`` and ``target``.
        col: Name of the categorical column to encode.
        target: Name of the target column.
        alpha: Smoothing strength; larger values pull the per-category
            frequencies towards the global class frequencies.

    Returns:
        A DataFrame sharing ``df``'s index, with one column per non-null
        target class, in order of first appearance of the class.  Rows whose
        ``col`` value has no statistics (e.g. a missing category) are NaN.
    """
    # value_counts() ignores missing targets, so the priors only cover
    # non-null classes.
    global_probas = df[target].value_counts(normalize=True)
    # Iterate over non-null classes only: a NaN class has no prior in
    # `global_probas` and would otherwise raise a KeyError.
    classes = df[target].dropna().unique()

    # Rows = categories of `col`, columns = target classes, values = counts.
    stats = df.groupby(col)[target].value_counts().unstack().fillna(0)
    totals = stats.sum(axis=1)

    encoded = pd.DataFrame(index=df.index)

    for cls in classes:
        # A class never observed alongside a non-null category has no column
        # in `stats`; its count is then zero for every category.
        n_cy = stats.get(cls, 0)
        p_y = global_probas[cls]
        smooth = (n_cy + alpha * p_y) / (totals + alpha)
        encoded[f"{col}_enc_{cls}"] = df[col].map(smooth)

    return encoded


def encode_features(df, target_col='account_status', alpha=10):
    """Build the model-ready feature table from a raw DataFrame.

    The input is left unmodified.  The result is the column-wise
    concatenation of, in this order:

    1. one-hot indicators for the six categorical columns listed below;
    2. the smoothed target encoding of ``country`` (see
       ``target_encode_smooth``);
    3. every remaining column, with integer columns cast to ``float64``;

    followed by the ``target_col`` column itself.

    Args:
        df: Raw DataFrame holding the categorical columns, ``country`` and
            ``target_col``.
        target_col: Name of the target column used for the target encoding.
        alpha: Smoothing strength forwarded to ``target_encode_smooth``.

    Returns:
        The encoded DataFrame, with ``target_col`` as its last column.
    """
    frame = df.copy()
    categorical = ['gender', 'marital_status', 'employment_status',
                   'education_level', 'subscription_type', 'age_group']

    one_hot = pd.get_dummies(frame[categorical], prefix=categorical)
    country_block = target_encode_smooth(
        frame, col='country', target=target_col, alpha=alpha
    )

    # Everything that is neither one-hot nor target encoded passes through;
    # integer columns are promoted to float64 on the way.
    passthrough = frame.drop(columns=categorical + ['country', target_col]).copy()
    int_columns = passthrough.select_dtypes('int').columns
    passthrough = passthrough.astype(dict.fromkeys(int_columns, 'float64'))

    encoded = pd.concat([one_hot, country_block, passthrough], axis=1)
    encoded[target_col] = frame[target_col]

    return encoded

# Load the raw data once, then draw the reference and current samples from it
# (same rows as sampling two separate reads with the same random_state, but
# the CSV is parsed only once).
DATA_PATH = "/home/sacko/Documents/ProjetAchats/Donnees/df_train_cleaned.csv"
raw_df = pd.read_csv(DATA_PATH)
# NOTE(review): both samples come from the same training file, so they may
# share rows — confirm this is the intended reference/current split.
ref_raw = raw_df.sample(200, random_state=1)
cur_raw = raw_df.sample(200, random_state=42)

# Replace missing account_status values with 'Unknown' before target encoding
ref_raw['account_status'] = ref_raw['account_status'].fillna('Unknown')
cur_raw['account_status'] = cur_raw['account_status'].fillna('Unknown')

# Encode both datasets, then remove the target column from the features
ref_encoded = drop_column_if_exists(encode_features(ref_raw, target_col='account_status', alpha=40), 'account_status')
cur_encoded = drop_column_if_exists(encode_features(cur_raw, target_col='account_status', alpha=40), 'account_status')

# Align the current dataset's columns on the reference ones
# (columns missing from the current data are added and filled with 0)
cur_encoded = cur_encoded.reindex(columns=ref_encoded.columns, fill_value=0)

# Build the Evidently data-drift report
report = Report(metrics=[DataDriftPreset()])
report.run(reference_data=ref_encoded, current_data=cur_encoded)

# Save the report and display it inline (the IFrame must stay the last
# expression of the cell to be rendered by the notebook)
report.save_html("rapport_drift.html")
IFrame(src='rapport_drift.html', width='100%', height=600)