# Projet 2 - Counterfactuals pour données tabulaires

Elyes KHALFALLAH & Edouard CHAPPON

MALIA

---
---


In [None]:
import sys
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

# Use the fully qualified option names: the short forms "max_rows" and
# "max_columns" are treated as regex patterns, which match several options in
# recent pandas releases (e.g. "styler.render.max_rows") and raise OptionError.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)

import matplotlib.pyplot as plt
import seaborn as sns
plt.rc("font", size=16)

from sklearn import metrics
from sklearn import model_selection
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

# Ajout de DiCE pour les explications contrefactuelles
import dice_ml
from dice_ml.utils import helpers


In [None]:
# Load the "Adult Income" dataset directly from the UCI repository.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
columns = [
    "age", "workclass", "fnlwgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "sex",
    "capital_gain", "capital_loss", "hours_per_week", "native_country",
    "income",
]
# Column names are supplied explicitly; " ?" (with its leading space) is
# parsed as a missing value.
df = pd.read_csv(url, names=columns, na_values=" ?")

# Every missing entry becomes 0.0.
# NOTE(review): for text columns this mixes the float 0.0 with strings —
# confirm this is intended rather than a dedicated "unknown" category.
df = df.replace({np.nan: 0.0})


In [None]:
# Split the frame into the target column and the feature columns.
cible = df["income"]
caracteristiques = df.drop(columns=["income"])

# Horizontal bar chart of the class balance, using readable class names.
sns.set_style("darkgrid")
fig, ax = plt.subplots(figsize=(12, 12))
counter = cible.value_counts().rename(
    index={" <=50K": "Low Income", " >50K": "High Income"}
)
counter.plot(kind="barh", ax=ax)
# Label the axis only after plotting: pandas' bar-plot code sets axis labels
# itself and can overwrite a label that was assigned beforehand.
ax.set_xlabel("# Cases", size=16)


In [None]:
# Séparation des données
# Hold out 30% of the rows for testing; stratify on the target so both
# subsets keep the same class proportions, with a fixed seed for repeatability.
options_split = {"test_size": 0.3, "random_state": 14, "stratify": cible}
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    caracteristiques, cible, **options_split
)


In [None]:
# Preprocessing: one-hot encode the categorical columns and hand the numeric
# columns to the model unchanged.
caracteristiques_categoric = ["workclass", "education", "marital_status", "occupation", "relationship", "race", "sex", "native_country"]
# Ordered list (dataframe column order) rather than a set difference, so the
# feature order is identical from one run to the next.
caracteristiques_numeriques = [
    col for col in caracteristiques.columns if col not in caracteristiques_categoric
]

# Category lists are taken from the full dataset so the encoder knows every
# value it can encounter, in train and test rows alike.
types_cat = [list(caracteristiques[cat].unique()) for cat in caracteristiques_categoric]
transfo_categorique = Pipeline(steps=[('onehot', OneHotEncoder(categories=types_cat))])
preprocesseur = ColumnTransformer(
    transformers=[
        ('categorical', transfo_categorique, caracteristiques_categoric),
        # Without this entry ColumnTransformer's default remainder="drop"
        # silently discards every numeric column before the classifier.
        ('numerical', 'passthrough', caracteristiques_numeriques),
    ]
)


In [None]:
# Train the model: a random forest chained after the preprocessing step.
params_rf = dict(n_estimators=50, max_depth=8, random_state=27)
rf = RandomForestClassifier(**params_rf)
clf_rf = Pipeline(
    steps=[
        ('preprocessor', preprocesseur),
        ('classifier', rf),
    ]
)
# Pipeline.fit returns the pipeline itself, so `modele` and `clf_rf` are the
# same fitted object.
clf_rf.fit(x_train, y_train)
modele = clf_rf


In [None]:
# Confusion matrix of the fitted pipeline on the held-out test rows.
y_hat = clf_rf.predict(x_test)
cf = metrics.confusion_matrix(y_test, y_hat)
noms_df = ["Low Income", "High Income"]
cf_df = pd.DataFrame(cf, index=noms_df, columns=noms_df)

# Annotated heatmap: rows are the true classes, columns the predictions.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cf_df, annot=True, fmt="d", ax=ax)
ax.set_title("Matrice de Confusion", size=20)
ax.set_xlabel("Prédiction du Modèle", size=16)
ax.set_ylabel("Valeur Réelle", size=16)


In [None]:
# Build the DiCE objects: the full table (features + outcome), the fitted
# scikit-learn pipeline, and an explainer using the "random" method.
X = pd.concat((caracteristiques, cible), axis="columns")
donnees_dice = dice_ml.Data(
    dataframe=X,
    continuous_features=list(caracteristiques_numeriques),
    outcome_name='income',
)
modele_exp = dice_ml.Model(model=modele, backend='sklearn')
exp = dice_ml.Dice(donnees_dice, modele_exp, method="random")


In [None]:
# Show the true and predicted income class for one individual.
patient_index = 2
# Select the row by position (like the true label below) and pass only the
# feature columns, not the target, to the model.
individu = caracteristiques.iloc[[patient_index]]
print("-------Revenu Réel du Patient------")
print(cible.iloc[patient_index])
print("-------Revenu Prédit du Patient------")
pred_class = clf_rf.predict(individu)[0]
pred_proba = clf_rf.predict_proba(individu)[0]
# Report the probability of the class that was actually predicted; a fixed
# index 1 would print the other class's probability whenever class 0 wins.
position_classe = list(clf_rf.classes_).index(pred_class)
print(f'Probabilité de {pred_class}: {round(pred_proba[position_classe],3)}')


In [None]:
# Generate two counterfactuals that flip the predicted class for the selected
# individual, allowing only the listed features to change.
caracteristiques_a_varier = [
    "education",
    "marital_status",
    "occupation",
    "relationship",
    "age",
]
instance_requete = caracteristiques.iloc[patient_index:patient_index + 1]
options_cf = {
    "total_CFs": 2,
    "desired_class": "opposite",
    "random_seed": 3434,
    "features_to_vary": caracteristiques_a_varier,
}
e1 = exp.generate_counterfactuals(instance_requete, **options_cf)
e1.visualize_as_dataframe(show_only_changes=False)


In [None]:
# Score the generated counterfactuals with the model and attach both class
# probabilities. Work on a copy: adding columns to `final_cfs_df` directly
# would modify the dataframe held by the DiCE result object.
CF_dice = e1.cf_examples_list[0].final_cfs_df.copy()
pred_cf = clf_rf.predict_proba(CF_dice.drop(columns=["income"]))
CF_dice[["Probabilité Low Income", "Probabilité High Income"]] = pred_cf

CF_dice
