In [1]:
!pip install git+https://github.com/BirkhoffG/jax-relax.git

Collecting git+https://github.com/BirkhoffG/jax-relax.git
  Cloning https://github.com/BirkhoffG/jax-relax.git to /tmp/pip-req-build-htfut5wf
  Running command git clone --filter=blob:none --quiet https://github.com/BirkhoffG/jax-relax.git /tmp/pip-req-build-htfut5wf
  Resolved https://github.com/BirkhoffG/jax-relax.git to commit 9e70d353376bd075bd300970504daec4188b0bb2
  Running command git submodule update --init --recursive -q
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [2]:
!pip install nbdev



In [3]:
!pip install pytorch_lightning



In [4]:
from relax.methods import VanillaCF
from relax import DataModule, MLModule, generate_cf_explanations, benchmark_cfs
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import functools as ft
import jax

In [5]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from pytorch_lightning import LightningModule, Trainer
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from torchmetrics import Accuracy, MaxMetric

# Read and preprocess the data
dataframe = pd.read_csv('/content/final_novo.csv', sep=';')
dataframe = dataframe.drop(['Chave'], axis=1)
dataframe = dataframe.fillna(0)
dataframe['Anos educacao formal'] = dataframe['Anos educacao formal'].replace(-5, -1)
dataframe['sexo'].replace({'M': 0, 'F': 1}, inplace=True)

notears_selected = [
   "Suic_familia",
    "Capaz de tomar decisões importantes",
    "Estudante",
    "Hipocondriase",
    "Sentimentos_culpa",
    "Trabalho e interesses",
    'Dep_familia',
    'Alc_familia',
    'Capaz de desfrutar das coisas',
    'Droga',
    'Suicidio',
]

df_suic = dataframe[notears_selected].copy()
df_suic.dropna(inplace=True)
df_suic = df_suic.astype(int)

'''class MyDataset(Dataset):
    def __init__(self, input_dataframe, split="train", target="Suicidio", ignore_columns=[], train_ratio=0.8):
        self.split = split
        self.target = target
        self.ignore_columns = ignore_columns

        for coll in self.ignore_columns:
            if coll in input_dataframe.columns:
                input_dataframe = input_dataframe.drop(coll, axis=1)

        self.classification_dim = len(input_dataframe[self.target].unique())
        self.data_dim = len(input_dataframe.columns) - 1
        self.embedding_dim = input_dataframe.max().max() + 1

        y = input_dataframe[target].values
        x = input_dataframe.drop(target, axis=1).values

        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(x, y, test_size=1-train_ratio, random_state=42)

    def __len__(self):
        if self.split == "train":
            return len(self.x_train)
        elif self.split == "test":
            return len(self.x_test)
        else:
            raise ValueError("Split must be train or test")

    def __getitem__(self, idx):
        target = torch.zeros(self.classification_dim)
        if self.split == "train":
            target[self.y_train[idx]] = 1
            return (torch.tensor(self.x_train[idx]), target)
        elif self.split == "test":
            target[self.y_test[idx]] = 1
            return (torch.tensor(self.x_test[idx]), target)
        else:
            raise ValueError("Split must be train or test")

# Instantiate the dataset
train_dataset = MyDataset(df_suic, split="train", target="Suicidio", ignore_columns=[], train_ratio=0.8)
test_dataset = MyDataset(df_suic, split="test", target="Suicidio", ignore_columns=[], train_ratio=0.2)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)  # Shuffle training data
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

class MLP(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim=128, n_layers=2):
        super(MLP, self).__init__()
        self.layers = nn.ModuleList()
        self.layers.append(nn.Linear(input_dim, hidden_dim))
        for i in range(n_layers - 1):
            self.layers.append(nn.Linear(hidden_dim, hidden_dim))
            self.layers.append(nn.Dropout(0.5))
            self.layers.append(nn.LeakyReLU())
        self.layers.append(nn.Linear(hidden_dim, output_dim))

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

class ClassificationModel(nn.Module):
    def __init__(self, input_dim, output_dim, embedding_dim, hidden_out, hidden_dim=128, n_layers=2):
        super(ClassificationModel, self).__init__()
        self.embedding_layer = nn.Embedding(embedding_dim, hidden_out)
        self.mlp = MLP(input_dim * hidden_out, output_dim, hidden_dim, n_layers)

    def forward(self, x):
        x = x.long()  # Ensure x is Long
        x = self.embedding_layer(x)
        x = x.view(x.shape[0], -1)
        x = self.mlp(x)
        return F.softmax(x, dim=1)

class BaseModel(LightningModule):
    def __init__(self, input_dim, output_dim, embedding_dim, embedding_out, hidden_dim):
        super().__init__()
        self.model = ClassificationModel(input_dim, output_dim, embedding_dim, embedding_out, hidden_dim=hidden_dim, n_layers=2)
        self.lr = 1e-3
        self.save_hyperparameters()
        self.accuracy = Accuracy(task='multiclass',num_classes=5)  # Corrigir para o número certo de classes (0 a 4)
        self.val_acc_best = MaxMetric()

    def forward(self, x):
        return self.model(x)

    def step(self, batch):
        x, y = batch
        y_hat = self(x).squeeze()
        loss = F.cross_entropy(y_hat, y.argmax(dim=1))  # Use cross entropy
        acc = self.accuracy(y_hat, y.argmax(dim=1))
        return loss, acc

    def training_step(self, batch, batch_idx):
        loss, acc = self.step(batch)
        self.log('train_loss', loss)
        self.log('train_acc', acc)
        return loss

    def validation_step(self, batch, batch_idx):
        loss, acc = self.step(batch)
        self.log('val_loss', loss)
        self.log('val_acc', acc)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.lr)

    def pred_fn(X):
      model.eval()  # Colocar o modelo em modo de avaliação
      with torch.no_grad():
          X_tensor = torch.FloatTensor(X)  # Converter para tensor
          return model(X_tensor).numpy()  # Retornar como numpy array

# Initialize model
model = BaseModel(
    input_dim=train_dataset.data_dim,
    output_dim=train_dataset.classification_dim,
    embedding_dim=100,
    embedding_out=64,
    hidden_dim=128
)

# Initialize callbacks
checkpoint_callback = ModelCheckpoint(monitor='val_loss', dirpath='checkpoints/', filename='best-checkpoint', save_top_k=1, mode='min')
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.05, patience=10, mode='min')

# Initialize a trainer
trainer = Trainer(
    devices=1,
    check_val_every_n_epoch=10,
    log_every_n_steps=10,
    callbacks=[checkpoint_callback, early_stopping],
    enable_progress_bar=False
)

# Train the model
trainer.fit(model, train_loader, test_loader)'''


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataframe['sexo'].replace({'M': 0, 'F': 1}, inplace=True)
  dataframe['sexo'].replace({'M': 0, 'F': 1}, inplace=True)


'class MyDataset(Dataset):\n    def __init__(self, input_dataframe, split="train", target="Suicidio", ignore_columns=[], train_ratio=0.8):\n        self.split = split\n        self.target = target\n        self.ignore_columns = ignore_columns\n\n        for coll in self.ignore_columns:\n            if coll in input_dataframe.columns:\n                input_dataframe = input_dataframe.drop(coll, axis=1)\n\n        self.classification_dim = len(input_dataframe[self.target].unique())\n        self.data_dim = len(input_dataframe.columns) - 1\n        self.embedding_dim = input_dataframe.max().max() + 1\n\n        y = input_dataframe[target].values\n        x = input_dataframe.drop(target, axis=1).values\n\n        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(x, y, test_size=1-train_ratio, random_state=42)\n\n    def __len__(self):\n        if self.split == "train":\n            return len(self.x_train)\n        elif self.split == "test":\n            return len(s

In [6]:
'''def predic_fn(X):
    if not isinstance(X, np.ndarray):
        X = np.array(X)  # Converta para um array NumPy

    model.eval()  # Colocar o modelo em modo de avaliação
    with torch.no_grad():
        if X.ndim == 1:
            X = np.expand_dims(X, axis=0)  # Adiciona uma nova dimensão

        X_tensor = torch.FloatTensor(X)  # Converter para tensor
        return model(X_tensor).numpy()  # Retornar como numpy array
'''

'def predic_fn(X):\n    if not isinstance(X, np.ndarray):\n        X = np.array(X)  # Converta para um array NumPy\n\n    model.eval()  # Colocar o modelo em modo de avaliação\n    with torch.no_grad():\n        if X.ndim == 1:\n            X = np.expand_dims(X, axis=0)  # Adiciona uma nova dimensão\n\n        X_tensor = torch.FloatTensor(X)  # Converter para tensor\n        return model(X_tensor).numpy()  # Retornar como numpy array\n'

In [7]:
'''train_dataset.x_test[0].astype(float)
X_tensor = torch.tensor(train_dataset.x_test[0], dtype=torch.float32)  # `device` pode ser 'cpu' ou 'cuda'
'''

"train_dataset.x_test[0].astype(float)\nX_tensor = torch.tensor(train_dataset.x_test[0], dtype=torch.float32)  # `device` pode ser 'cpu' ou 'cuda'\n"

In [8]:
'''def pred_fn( x):
        x = torch.LongTensor(X.values).to(model.device)
        return model(x)
cf = vcf.generate_cf(X_test[:1], pred_fn, permit_int=True)'''

'def pred_fn( x):\n        x = torch.LongTensor(X.values).to(model.device)\n        return model(x)\ncf = vcf.generate_cf(X_test[:1], pred_fn, permit_int=True)'

In [12]:
df_suic['Suicidio']=np.where((df_suic['Suicidio'] >1),1,0)
target = "Suicidio"
X = df_suic.drop(columns=[target])
y = df_suic[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [13]:
model1 = MLModule()
model1.train((X_train, y_train), epochs=500, batch_size=64)

Epoch 1/500
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 73ms/step - accuracy: 0.6251 - loss: 1.2037
Epoch 2/500
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 0.6962 - loss: 0.6677
Epoch 3/500
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7084 - loss: 0.6229
Epoch 4/500
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7239 - loss: 0.6160
Epoch 5/500
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7542 - loss: 0.5874
Epoch 6/500
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7431 - loss: 0.5879
Epoch 7/500
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7460 - loss: 0.5845
Epoch 8/500
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.7672 - loss: 0.5664
Epoch 9/500
[1m44/44[0m [32m━━━━━━━━━━━━━━━

<relax.ml_model.MLModule at 0x7d34ee0d3310>

In [15]:
X_test =X_test.astype(float)

In [None]:
'''vcf = VanillaCF(config={'n_steps': 1000, 'lr': 0.05})
cf = vcf.generate_cf(X_test[:1].values, model1.pred_fn,permit_int =True)'''
#assert cf.shape == y_test[0].shape

In [None]:
'''data_module = DataModule.from_numpy(df_suic.drop(columns='Suicidio').values, df_suic.Suicidio.values)
exps = generate_cf_explanations(vcf, data_module, model1.pred_fn)
benchmark_cfs([exps])'''

In [None]:
'''import pandas as pd
import numpy as np

def save_counterfactuals_to_dataframe(explanation, data, pred_fn):
    # Extracting the original data (features and target values)
    original_data = data.xs  # Original features
    original_target = pred_fn(data.xs)  # Original target values (previsões)

    # Extracting the counterfactual data (features and counterfactual predictions)
    cfs = explanation.cfs  # Counterfactuals
    counterfactual_target = pred_fn(cfs)  # Predições para os contrafactuais

    # Verifique as dimensões para garantir que são 1D antes de adicioná-las ao DataFrame
    # Caso as features sejam 2D (como uma matriz), achate-as
    if original_data.ndim > 1:
        original_data = original_data.flatten()  # Achatar para 1D
    if cfs.ndim > 1:
        cfs = cfs.flatten()  # Achatar para 1D

    # Convert counterfactuals and original data into a dataframe
    data_dict = {
        "original_features": original_data.tolist(),
        "original_target": original_target.tolist(),
        "counterfactual_features": cfs.tolist(),
        "counterfactual_target": counterfactual_target.tolist()
    }

    # Criar o DataFrame com os dados
    df = pd.DataFrame(data_dict)

    # Se você tiver múltiplas colunas de features, pode expandir da seguinte forma
    for i in range(original_data.shape[1]):  # Caso seja um array multidimensional
        df[f"original_feature_{i}"] = original_data[:, i]
        df[f"counterfactual_feature_{i}"] = cfs[:, i]

    return df

# Chamada da função e armazenamento dos resultados em um DataFrame
explanation = generate_cf_explanations(vcf, data_module, model1.pred_fn)
df = save_counterfactuals_to_dataframe(explanation, data_module, model1.pred_fn)

# Visualizar o DataFrame
print(df.head())'''


In [None]:
 # Extracting the original data (features and target values)
'''original_data = data_module.xs  # Original features
original_target = pred_fn(data.xs)  # Original target values (previsões)

    # Extracting the counterfactual data (features and counterfactual predictions)
cfs = explanation.cfs  # Counterfactuals
counterfactual_target = pred_fn(cfs)  # Predições para os contrafactuais

    # Verifique as dimensões para garantir que são 1D antes de adicioná-las ao DataFrame
    # Caso as features sejam 2D (como uma matriz), achate-as
if original_data.ndim > 1:
        original_data = original_data.flatten()  # Achatar para 1D
if cfs.ndim > 1:
        cfs = cfs.flatten()  # Achatar para 1D

    # Convert counterfactuals and original data into a dataframe
data_dict = {
        "original_features": original_data.tolist(),
        "original_target": original_target.tolist(),
        "counterfactual_features": cfs.tolist(),
        "counterfactual_target": counterfactual_target.tolist()
    }

    # Criar o DataFrame com os dados
df = pd.DataFrame(data_dict)

    # Se você tiver múltiplas colunas de features, pode expandir da seguinte forma
for i in range(original_data.shape[1]):  # Caso seja um array multidimensional
        df[f"original_feature_{i}"] = original_data[:, i]
        df[f"counterfactual_feature_{i}"] = cfs[:, i]


# Chamada da função e armazenamento dos resultados em um DataFrame
explanation = generate_cf_explanations(vcf, data_module, model1.pred_fn)
df = save_counterfactuals_to_dataframe(explanation, data_module, model1.pred_fn)'''

In [16]:
from sklearn.model_selection import train_test_split
from relax.methods import VanillaCF
import numpy as np
import jax.numpy as jnp

# Definir a configuração do VanillaCF
vcf = VanillaCF(config={'n_steps': 1000, 'lr': 0.05})

# Converter DataFrame para NumPy para evitar strings como índices
X_test_np = X_test.values

# Mapear índices numéricos para os intervalos categóricos correspondentes
feature_ranges = {
    0: [0, 1],  # Suic_familia
    1: [0, 1],  # Dep_familia
    2: [0, 1],  # Estudante
    3: [0, 1],  # Alc_familia
    4: [1, 5],  # Capaz de tomar decisões importantes
    5: [1, 5],  # Capaz de desfrutar das coisas
    6: [0, 4],  # Hipocondriase
    7: [0, 4],  # Sentimentos_culpa
    8: [0, 4],  # Trabalho e interesses
    9: [0, 2]   # Droga
}

def round_to_category(values, feature_ranges):
    values_jax = jnp.array(values)  # Converte para um array JAX
    for feature_idx, ranges in feature_ranges.items():
        # Arredondar e aplicar limite, retornando um novo array JAX imutável
        values_jax = values_jax.at[:, feature_idx].set(
            jnp.round(values_jax[:, feature_idx]).clip(ranges[0], ranges[-1])
        )
    return values_jax

# Função para gerar e ajustar contrafactuais
def generate_and_adjust_cf(vcf, X_test_np, pred_fn, feature_ranges):
    cf_result = vcf.generate_cf(X_test_np[:1], model1.pred_fn, permit_int=True)
    cf_adjusted = round_to_category(cf_result, feature_ranges)
    return cf_adjusted

# Gerar e ajustar os contrafactuais
cf = generate_and_adjust_cf(vcf, X_test_np, model1, feature_ranges)

# Preparar o DataModule e gerar explicações contrafactuais com o ReLax
data_module = DataModule.from_numpy(X.values, y.values)
exps = generate_cf_explanations(vcf, data_module, model1.pred_fn)

# Executar o benchmark
benchmark_cfs([exps])

Running for 1,000 iterations:   0%|          | 0/1000 [00:00<?, ?it/s]

Running for 1,000 iterations:   0%|          | 0/1000 [00:00<?, ?it/s]

Unnamed: 0,Unnamed: 1,acc,validity,proximity
Explanation,VanillaCF,0.75664,0.806729,4.229963


In [17]:
import pandas as pd
import numpy as np

def round_to_category(values, feature_ranges):
    values_jax = jnp.array(values)  # Converte para um array JAX
    for feature_idx, ranges in feature_ranges.items():
        # Arredondar e aplicar limite, retornando um novo array JAX imutável
        values_jax = values_jax.at[:, feature_idx].set(
            jnp.round(values_jax[:, feature_idx]).clip(ranges[0], ranges[-1])
        )
    return values_jax

# 2. Converter os contrafactuais contínuos para categóricos
cfs_continuous = exps.cfs
cfs_categorical = round_to_category(cfs_continuous, feature_ranges)

# Supondo que 'cfs_categorical' seja um array tridimensional como mostrado
# Primeiro, vamos fazer a conversão para uma forma bidimensional (n_samples, n_features)
cfs_categorical_reshaped = cfs_categorical.squeeze()  # Remove a dimensão extra

# Agora, mapeia os índices de features originais para suas versões contrafactuais
original_features = df_suic.drop(columns=['Suicidio']).values

# Predição do target para os contrafactuais
# Supondo que o modelo esteja em 'model1'
predicted_target_cf = model1.pred_fn(cfs_categorical_reshaped)

# Salvando o DataFrame com as informações
df_result = pd.DataFrame({
    'ID': df_suic.index,  # ID do índice original de `df_suic`
})

# Adicionando as features originais e contrafactuais com sufixo '_cf'
for i, feature_name in enumerate(df_suic.drop(columns=['Suicidio']).columns):
    # Coluna para a feature original
    df_result[feature_name] = original_features[:, i]
    # Coluna para a feature contrafactual com o sufixo '_cf'
    df_result[f'{feature_name}_cf'] = cfs_categorical_reshaped[:, i]

# Adicionando os valores de 'Suicidio' original e predito para contrafactuais
df_result['Suicidio_Original'] = df_suic['Suicidio'].values

# Obter a classe com a maior probabilidade (classe com maior valor de probabilidade)
predicted_target_cf_class = np.argmax(predicted_target_cf, axis=1)

# Isso retorna o índice da classe com maior probabilidade para cada amostra.
# Se o seu target for multi-classe e as classes forem 0 e 1, predicted_target_cf_class será um array de 0s e 1s.

# Agora podemos adicionar isso ao DataFrame
df_result['Suicidio_CF'] = predicted_target_cf_class

In [18]:
df_result['Suicidio_CF'].value_counts()

Unnamed: 0_level_0,count
Suicidio_CF,Unnamed: 1_level_1
1,3189
0,764


In [None]:
'''import pandas as pd
import numpy as np


def round_to_category(values, feature_ranges):
    values_jax = jnp.array(values)  # Converte para um array JAX
    for feature_idx, ranges in feature_ranges.items():
        # Arredondar e aplicar limite, retornando um novo array JAX imutável
        values_jax = values_jax.at[:, feature_idx].set(
            jnp.round(values_jax[:, feature_idx]).clip(ranges[0], ranges[-1])
        )
    return values_jax

# 2. Converter os contrafactuais contínuos para categóricos
cfs_continuous = exps.cfs
cfs_categorical = round_to_category(cfs_continuous, feature_ranges)

# 3. Obter o target predito para cada contrafactual
predicted_target_cf = model1.pred_fn(cfs_categorical)

# 4. Construir o DataFrame final
# Assumindo que X_test é o conjunto original das features categóricas e y_test o target original
df_result = pd.DataFrame({
    'ID': df_suic.index,  # ID do índice original de `df_suic`
    'Original_Features': list(df_suic.drop(columns=['Suicidio']).values),  # Features originais categóricas
    'CF_Features': list(cfs_categorical),      # Features contrafactuais ajustadas para categorias
    'Suicidio_Original': df_suic.Suicidio.values,          # Target original
    'Suicidio_CF': predicted_target_cf  # Target predito após contrafactual
})

# Opcional: Expanda as colunas para facilitar a leitura
df_result = df_result.explode(['Original_Features', 'CF_Features'])

# Exibir o DataFrame final
print(df_result)'''
