In [4]:
import os
import pandas as pd
import shap
import joblib

In [5]:
# Project root. Defaults to the original location, but can be redirected with the
# PROJET_ACHATS_DIR environment variable so the notebook also runs on other machines.
PROJECT_DIR = os.environ.get("PROJET_ACHATS_DIR", "/home/sacko/Documents/ProjetAchats")
DATA_DIR = os.path.join(PROJECT_DIR, "Donnees")

# Load the trained model.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
model = joblib.load(os.path.join(PROJECT_DIR, "models", "model.joblib"))

# Training data
df_train_cleaned = pd.read_csv(os.path.join(DATA_DIR, "df_train_cleaned.csv"))

# Test data
df_test_cleaned = pd.read_csv(os.path.join(DATA_DIR, "df_test_cleaned.csv"))

In [6]:
# Split each cleaned frame into the target column and the remaining feature columns
y_train = df_train_cleaned["account_status"]
X_train = df_train_cleaned.drop(columns="account_status")

y_test = df_test_cleaned["account_status"]
X_test = df_test_cleaned.drop(columns="account_status")

# Smoothed target-encoding helper
def target_encode_smooth(df, col, target, alpha=40):
    """Target-encode a categorical column with additive smoothing, one column per class.

    For every class ``y`` of ``target`` and every category ``c`` of ``col`` the
    encoded value is::

        (count(c, y) + alpha * P(y)) / (count(c) + alpha)

    where ``P(y)`` is the class frequency over the non-missing targets of ``df``
    and ``count(c)`` is the number of rows of category ``c`` with a non-missing
    target.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both ``col`` and ``target``. The statistics are computed
        from these same rows, so each row's own label contributes to its encoding.
    col : str
        Name of the categorical column to encode.
    target : str
        Name of the target column.
    alpha : float, default 40
        Smoothing strength; larger values pull the encoding towards ``P(y)``.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df``, with one column ``f"{col}_enc_{cls}"`` per class, in
        order of first appearance of the class in ``df[target]``. Rows whose
        ``col`` value has no statistics (e.g. missing) are NaN. The frame has no
        columns when ``target`` holds no non-missing value.
    """
    encoded = pd.DataFrame(index=df.index)

    # Missing targets are skipped: value_counts() below ignores them too, so
    # looking up their prior used to raise KeyError.
    classes = df[target].dropna().unique()
    if len(classes) == 0:
        return encoded

    global_probas = df[target].value_counts(normalize=True)

    # Rows: categories of `col`; columns: classes; values: co-occurrence counts.
    stats = df[[col, target]].groupby(col)[target].value_counts().unstack().fillna(0)
    totals = stats.sum(axis=1)

    for cls in classes:
        # A class that never co-occurs with a non-missing category has no column.
        n_cy = stats[cls] if cls in stats.columns else 0
        p_y = global_probas[cls]
        smooth = (n_cy + alpha * p_y) / (totals + alpha)
        encoded[f"{col}_enc_{cls}"] = df[col].map(smooth)

    return encoded

# Feature encoding: one-hot encode the nominal categorical columns and target-encode `country`
def encode_features(df, target_col='account_status', alpha=10, fit_df=None):
    """Build the encoded feature frame: one-hot dummies, country target encoding, numerics.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. Must contain the nominal columns listed in ``dummy_cols``
        below, a ``country`` column and ``target_col``. It is not modified.
    target_col : str, default 'account_status'
        Name of the target column; it is carried over unchanged into the result.
    alpha : float, default 10
        Smoothing strength forwarded to ``target_encode_smooth``.
    fit_df : pandas.DataFrame, optional
        Frame whose ``country`` / ``target_col`` columns provide the target-encoding
        statistics. When omitted, the statistics come from ``df`` itself (the
        original behaviour). Pass the labelled training frame when encoding a
        validation or test split so that its own labels do not leak into its
        features. Countries absent from ``fit_df`` receive the class prior of
        ``fit_df``; rows with a missing country stay NaN.

    Returns
    -------
    pandas.DataFrame
        Dummy columns, ``country_enc_<class>`` columns, the remaining columns
        (integer ones cast to float64) and finally ``target_col``.
    """
    df = df.copy()  # work on a copy so the caller's frame is never modified
    dummy_cols = ['gender', 'marital_status', 'employment_status',
                  'education_level', 'subscription_type', 'age_group']

    df_dummies = pd.get_dummies(df[dummy_cols], prefix=dummy_cols)

    if fit_df is None:
        country_enc = target_encode_smooth(df, col='country', target=target_col, alpha=alpha)
    else:
        # Encode the fitting frame, then reduce it to one row of encodings per
        # country (the encoding is constant within a country).
        fit_enc = target_encode_smooth(fit_df, col='country', target=target_col, alpha=alpha)
        lookup = fit_enc.groupby(fit_df['country']).first()
        country_enc = df[['country']].join(lookup, on='country').drop(columns='country')

        # A country with zero observations smooths exactly to the class prior.
        priors = fit_df[target_col].value_counts(normalize=True)
        unseen = df['country'].notna() & ~df['country'].isin(lookup.index)
        for cls, prior in priors.items():
            enc_col = f"country_enc_{cls}"
            if enc_col in country_enc.columns:
                country_enc.loc[unseen, enc_col] = prior

    # Everything that is neither one-hot encoded, target encoded nor the target.
    numeric_cols = df.drop(columns=dummy_cols + ['country', target_col]).copy()
    numeric_cols = numeric_cols.astype({col: 'float64' for col in numeric_cols.select_dtypes('int').columns})

    final_df = pd.concat([df_dummies, country_enc, numeric_cols], axis=1)
    final_df[target_col] = df[target_col]

    return final_df


# Output folders
os.makedirs("artifacts", exist_ok=True)
os.makedirs("reports", exist_ok=True)

# Encode both splits
train_labeled = X_train.assign(account_status=y_train)
test_labeled = X_test.assign(account_status=y_test)

train_encoded = encode_features(train_labeled, target_col='account_status')
# The test split reuses the training statistics for the country encoding, so the
# test labels never feed into the test features.
test_encoded = encode_features(test_labeled, target_col='account_status', fit_df=train_labeled)

X_train_encoded = train_encoded.drop(columns='account_status')
y_train_encoded = train_encoded['account_status']
X_test_encoded = test_encoded.drop(columns='account_status')
y_test_encoded = test_encoded['account_status']

# Align the test columns (set and order) with the training columns; dummy
# columns absent from the test split are filled with 0.
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

In [None]:
# Build the SHAP explainer for the loaded model
# (TreeExplainer is used, so the model is presumably tree-based — TODO confirm)
explainer = shap.TreeExplainer(model)

# Compute the SHAP values for the encoded test features
shap_values = explainer.shap_values(X_test_encoded)

# Visualization: SHAP summary plot of those values against the test features
shap.summary_plot(shap_values, X_test_encoded)