In [1]:
import carla
from carla.data.api import Data
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from carla.recourse_methods.catalog.wachter import Wachter
from carla.models.api import MLModel
from sklearn.preprocessing import OneHotEncoder
from carla.recourse_methods import GrowingSpheres
from typing import Union


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  from .autonotebook import tqdm as notebook_tqdm
Using TensorFlow backend.


[INFO] Using Python-MIP package version 1.12.0 [model.py <module>]


In [2]:
class CustomData(Data):
    def __init__(self):
        # Load the data 
        self._df = pd.read_csv("brfss13.csv").drop("Unnamed: 0", axis=1)

        # Encoding the target column
        self._df['Heartdis'] = self._df['Heartdis'].map({'Yes': 0, 'No': 1}) #Ici j'ai inversé car le model met de base à 1

        # Défine columns
        self._target = "Heartdis"
        self._onehot_cols = ["Smoking", "Age", "Gender", "Diabetes", "Kidney", "Stroke"]
        self._continuous = ["BMI", "Sleep", "Exercise", "Alcohol", "Fruit"]

        # Initialize encoder
        self._encoder = OneHotEncoder(drop=None, sparse=False, handle_unknown="ignore")
        self._encoder.fit(self._df[self._onehot_cols])

        # Get encoded column names
        encoded_col_names = self._encoder.get_feature_names(self._onehot_cols)

        # Apply encoding
        encoded_cols = self._encoder.transform(self._df[self._onehot_cols])
        encoded_df = pd.DataFrame(encoded_cols, columns=encoded_col_names)

        # Make sure all columns are present
        encoded_df = encoded_df.reindex(columns=encoded_col_names, fill_value=0)

        # Delete original columns
        df_numerical = self._df.drop(columns=self._onehot_cols + [self._target])

        # Concatenate
        self._data = pd.concat([df_numerical.reset_index(drop=True), encoded_df.reset_index(drop=True)], axis=1)

        # Unchanging columns (gender, age)
        self._immutables = [col for col in encoded_df.columns if "Gender" in col or "Age" in col]

        # Blocks integer 
        self._data['Sleep'] = self._data['Sleep'].astype(int)
        self._data['Exercise'] = self._data['Exercise'].astype(int)
        self._data['Alcohol'] = self._data['Alcohol'].astype(int)
        self._data['Fruit'] = self._data['Fruit'].astype(int)


        # Target
        self._target_column = self._df[self._target]

        # Split train/test
        self._X_train, self._X_test, self._y_train, self._y_test = train_test_split(
            self._data, self._target_column, test_size=0.2, random_state=42
        )

    @property
    def raw(self):
        return self._df

    @property
    def df(self):
        return self._data

    @property
    def target(self):
        return self._target

    @property
    def categorical(self):
        return self._onehot_cols

    @property
    def continuous(self):
        return self._continuous

    @property
    def immutables(self):
        return self._immutables

    @property
    def name(self):
        return "heartdis"

    @property
    def encoder(self):
        return self._encoder

    def df_train(self):
        return self._X_train

    def df_test(self):
        return self._X_test

    def transform(self, x):
        return x

    def inverse_transform(self, x):
        return x

In [3]:
class SKLearnModel(MLModel):
    def __init__(self, model, data, predict_proba=True):
        self._model = model
        self._data = data
        self._predict_proba = predict_proba
        self._feature_input_order = data.df.columns.tolist()

    @property
    def data(self):
        return self._data

    @property
    def feature_input_order(self):
        return self._feature_input_order

    @property
    def raw_model(self):
        return self._model

    @property
    def backend(self):
        return "sklearn"

    def predict(self, x: pd.DataFrame) -> np.ndarray:
        return self._model.predict(x[self.feature_input_order])

    def predict_proba(self, x: Union[pd.DataFrame, np.ndarray]) -> np.ndarray:
        if self._predict_proba:
            # If x is an array, pass it as is
            if isinstance(x, np.ndarray):
                return self._model.predict_proba(x)
            # Otherwise we assume it's a DataFrame
            else:
                return self._model.predict_proba(x[self.feature_input_order])
        else:
            raise NotImplementedError("predict_proba is disabled.")


In [4]:
# Step 1 : instantiate data
data = CustomData()

In [5]:
# Step 2 : training a sklearn model
model_sklearn = RandomForestClassifier(n_estimators=100, random_state=0)
model_sklearn.fit(data.df, data.raw["Heartdis"])

RandomForestClassifier(random_state=0)

In [6]:
# Step 3 : wrap it up with Carla
model = SKLearnModel(model_sklearn, data)

In [7]:
# Step 4 : generate conterfactual (Wachter par exemple)
cf = GrowingSpheres(mlmodel=model)

In [8]:
import random

# We retrieve the index of instances with Heartdis == 0
heart_yes_indices = data.raw[data.raw["Heartdis"] == 0].index.tolist()

# A random index is selected from these
random_index = random.choice(heart_yes_indices)

# Retrieve the corresponding instance in data.df (encoded data)
instance = data.df.iloc[[random_index]]

# For verification, we can also display the raw target value
print(f"Instance choisie - index: {random_index}, Heartdis: {data.raw.iloc[random_index]['Heartdis']}")

Instance choisie - index: 3985, Heartdis: 0


In [9]:
# Example on one instance (first line)
cf_result = cf.get_counterfactuals(instance)

In [12]:
import pandas as pd
from IPython.display import display, HTML

def display_cf_comparison(original, cf, onehot_cols, encoder):
    # Discretize one-hot columns to make them easier to read
    def decode(df):
        df_cat = pd.DataFrame(
            encoder.inverse_transform(df[encoder.get_feature_names(onehot_cols)]),
            columns=onehot_cols
        )
        df_num = df.drop(columns=encoder.get_feature_names(onehot_cols))
        return pd.concat([df_num.reset_index(drop=True), df_cat.reset_index(drop=True)], axis=1)

    original_decoded = decode(original.copy())
    cf_decoded = decode(cf.copy())

    # List of all dataset columns
    all_columns = original_decoded.columns

    # Create a DataFrame for comparison with “-” for unmodified columns
    comparison_dict = {"Feature": [], "Original": [], "Counterfactual": []}
    
    for col in all_columns:
        original_value = original_decoded[col].iloc[0]
        cf_value = cf_decoded[col].iloc[0]
        
        # If the value in the counterfactual is None, it is considered unmodified.
        if pd.isna(cf_value):  # If cf_value is None or NaN
            cf_value = "-"  # Replace with a hyphen (“-”)
        
        if original_value == cf_value:
            comparison_dict["Feature"].append(col)
            comparison_dict["Original"].append(original_value)
            comparison_dict["Counterfactual"].append("-")  # Indicate “-” for unmodified values

    # Convert to DataFrame
    comparison_df = pd.DataFrame(comparison_dict)

    # Pretty display in HTML
    styled = comparison_df.style.set_properties(**{
        'background-color': '#f0f8ff',
        'color': 'black',
        'border-color': 'gray'
    }).set_table_styles([{
        'selector': 'th',
        'props': [('background-color', '#4682b4'), ('color', 'white')]
    }])

    display(HTML("<h3>🔍 Comparison: Original instance vs. counterfactual</h3>"))
    display(styled)


In [13]:
display_cf_comparison(instance, cf_result, data._onehot_cols, data.encoder)

Unnamed: 0,Feature,Original,Counterfactual
0,Age,Age 55 to 59,-
1,Gender,Male,-
2,Diabetes,Yes,-
3,Kidney,No,-
4,Stroke,No,-
