In [1]:
#import libraries
import torch
import pandas as pd
import numpy as np

# Import dataset utils
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

import torch.nn as nn
import torch.nn.functional as F

import matplotlib.pyplot as plt

import importlib
if importlib.util.find_spec('ipywidgets') is not None:
    from tqdm.auto import tqdm
else:
    from tqdm import tqdm

import networkx as nx

from typing import Union

In [2]:
dataframe = pd.read_csv('/content/data.csv', sep=';')

dataframe = dataframe.fillna(-1)
dataframe['Anos educacao formal'] = dataframe['Anos educacao formal'].replace(-5, -1)
dataframe['sexo'].replace({'M': 0, 'F': 1}, inplace=True)
df_suic = dataframe.copy()
df_suic = df_suic.astype(float)
df_suic['Chave'] = df_suic.index
df_suic.shape

(3953, 69)

In [3]:
notears_selected = [
    "Suic_familia",
    "Drog_familia",
    "Capaz de tomar decisões importantes",
    "Estudante",
    "Hipocondriase",
    "Sentimentos_culpa",
    "Trabalho e interesses",
    "Energia",
    "Suicidio",
    "Ansiedade"]

# Read dataset

In [4]:
 df_novo = pd.DataFrame(columns = ["Suic_familia",
    "Drog_familia",
    "Capaz de tomar decisões importantes",
    "Estudante",
    "Hipocondriase",
    "Sentimentos_culpa",
    "Trabalho e interesses",
    "Energia",
    "Suicidio",
    "Ansiedade"])

# Reading features new individual

In [23]:
Suic_familia = int(input("Suicide in the family: Enter 1 for Yes and 0 for No.") )
Drog_familia = int(input("Drugs in the family: Enter 1 for Yes and 0 for No.") )
Capaz_tomar_decisoes = int(input("CAble to make important decisions: Enter a number from 1 to 5, with 1 being low capacity and 5 being high capacity.") )
Estudante = int(input("Student: Enter 1 for Yes and 0 for No.") )
Hipocondriase = int(input("Hypochondriasis: Enter a number from 0 to 4, with 0 being no and 4 being a high level.") )
Sentimento_culpa = int(input("Feelings of guilt: Enter a number from 0 to 4, with 0 being no and 4 being high.") )
Trabalho_interesses = int(input("Work and interests: Enter a number from 0 to 4, with 0 being no and 4 being high level.") )
Energia = int(input("Energy: Enter a number from 0 to 2, with 0 being no and 2 being high level.") )
Suicidio = int(input("Suicide: Enter a number from 0 to 4, where 0 has no ideation and 4 has already made an attempt.") )
Ansiedade = int(input("Anxiety: Enter a number from 0 to 4, with 0 being no and 4 being high level.") )

Suicide in the family: Enter 1 for Yes and 0 for No.0
Drugs in the family: Enter 1 for Yes and 0 for No.1
CAble to make important decisions: Enter a number from 1 to 5, with 1 being low capacity and 5 being high capacity.2
Student: Enter 1 for Yes and 0 for No.1
Hypochondriasis: Enter a number from 0 to 4, with 0 being no and 4 being a high level.3
Feelings of guilt: Enter a number from 0 to 4, with 0 being no and 4 being high.2
Work and interests: Enter a number from 0 to 4, with 0 being no and 4 being high level.4
Energy: Enter a number from 0 to 2, with 0 being no and 2 being high level.1
Suicide: Enter a number from 0 to 4, where 0 has no ideation and 4 has already made an attempt.2
Anxiety: Enter a number from 0 to 4, with 0 being no and 4 being high level.3


In [24]:
df_novo=df_novo.append({'Suic_familia' : float(Suic_familia), 'Drog_familia' : float(Drog_familia), 'Capaz de tomar decisões importantes' : float(Capaz_tomar_decisoes), 'Estudante' : float(Estudante),
                        'Hipocondriase' : float(Hipocondriase), 'Sentimentos_culpa' : float(Sentimento_culpa), 'Trabalho e interesses' : float(Trabalho_interesses), 'Energia' : float(Energia),
                        'Suicidio' : float(Suicidio), 'Ansiedade': float(Ansiedade)}, ignore_index = True)

  df_novo=df_novo.append({'Suic_familia' : float(Suic_familia), 'Drog_familia' : float(Drog_familia), 'Capaz de tomar decisões importantes' : float(Capaz_tomar_decisoes), 'Estudante' : float(Estudante),


In [25]:
df_novo['Suic_familia'] = df_novo['Suic_familia'].astype(float)
df_novo['Drog_familia'] = df_novo['Drog_familia'].astype(float)
df_novo['Capaz de tomar decisões importantes'] = df_novo['Capaz de tomar decisões importantes'].astype(float)
df_novo['Estudante'] = df_novo['Estudante'].astype(float)
df_novo['Hipocondriase'] = df_novo['Hipocondriase'].astype(float)
df_novo['Sentimentos_culpa'] = df_novo['Sentimentos_culpa'].astype(float)
df_novo['Trabalho e interesses'] = df_novo['Trabalho e interesses'].astype(float)
df_novo['Energia'] = df_novo['Energia'].astype(float)
df_novo['Suicidio'] = df_novo['Suicidio'].astype(float)
df_novo['Ansiedade'] = df_novo['Ansiedade'].astype(float)

# Create causal datasets

In [26]:
# Dataset from X
class CausalDataset(Dataset):
    def __init__(self, X, target:Union[list, int]):
        self.X = torch.tensor(X, dtype=torch.double)
        self.target = target
        if isinstance(target, int):
            self.target = [target]


        # X is every variable except the target list
        dims = list(range(self.X.shape[1]))
        self.x = self.X[:, [i for i in dims if i not in self.target]]
        self.y = self.X[:, self.target]

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return self.x[idx].double(), self.y[idx].double()

In [27]:
class MLP(torch.nn.Module):
    def __init__(self, dim_list:list, add_dropout:bool=False):
        super(MLP, self).__init__()
        torch.manual_seed(3)
        self.layers = torch.nn.ModuleList()
        for i in range(len(dim_list) - 2):
            self.layers.append(torch.nn.Linear(dim_list[i], dim_list[i+1]))
            self.layers.append(torch.nn.ReLU())
            if add_dropout:
                self.layers.append(torch.nn.Dropout(0.3))

        self.layers.append(torch.nn.Linear(dim_list[-2], dim_list[-1]))
        self.double()

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

# Out train pipeline

In [28]:
def trainPipeline(dataset, model, batch_size=64, train_split=0.8, verbose=True, max_epochs=100, patience=10, lr=0.001, weight_decay=0.0, loss_fn=nn.MSELoss()):
    train_size = int(train_split * len(dataset))
    test_size = len(dataset) - train_size
    torch.manual_seed(3)
    train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

    criterion = loss_fn
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    patience = patience

    train_losses = []
    test_losses = []
    for epoch in tqdm(range(max_epochs)):
        train_loss = 0
        model.train()
        for data, target in train_loader:
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss /= len(train_loader)
        train_losses.append(train_loss)

        test_loss = 0
        model.eval()
        with torch.no_grad():
            for data, target in test_loader:
                output = model(data)
                loss = criterion(output, target)
                test_loss += loss.item()
        test_loss /= len(test_loader)
        test_losses.append(test_loss)

        if verbose:
            print(f"Epoch {epoch} - Train loss: {train_loss} - Test loss: {test_loss}")

        # add early stopping with patience variable
        if epoch > 10:
            all_patience = test_losses[-patience:]
            test_loss = test_losses[-1]
            if all([test_loss > x for x in all_patience]):
                print("Early stopping")
                break

    # test model on test set and calculate accuracy
    model.eval()
    with torch.no_grad():
        y_pred_test = []
        y_true_test = []
        y_pred_train = []
        y_true_train = []

        for data, target in train_loader:
            output = model(data)
            y_pred_train.append(output)
            y_true_train.append(target)
        for data, target in test_loader:
            output = model(data)
            y_pred_test.append(output)
            y_true_test.append(target)

        y_pred_test = torch.cat(y_pred_test, dim=0)
        y_true_test = torch.cat(y_true_test, dim=0)
        y_pred_train = torch.cat(y_pred_train, dim=0)
        y_true_train = torch.cat(y_true_train, dim=0)

        train_error = torch.abs(y_pred_train - y_true_train)
        test_error = torch.abs(y_pred_test - y_true_test)

        # calculate accuracy for each target
        test_accuracy = []
        train_accuracy = []
        for i in range(len(y_true_test[0])):
            test_accuracy.append(torch.sum(test_error[:, i] < 0.5) / len(test_error))
            train_accuracy.append(torch.sum(train_error[:, i] < 0.5) / len(train_error))

        print(f"Final Train accuracy: {train_accuracy}")
        print(f"Final Test accuracy: {test_accuracy}")

    metrics = {
        "train_loss": train_losses,
        "test_loss": test_losses,
        "train_error": train_error,
        "test_error": test_error,
        "test_accuracy": test_accuracy
    }

    return model, metrics


## First train one model for each target and get errors value

## Now add the errors to the dataset and train the counterfactual model

In [29]:
# Create the counterfactual model

class CounterfactualModel(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(CounterfactualModel, self).__init__()
        self.individuality_model = MLP([input_size, hidden_size, hidden_size, hidden_size, 1], add_dropout=True)
        self.counterfactual_model = MLP([input_size+1, hidden_size, hidden_size, hidden_size, output_size], add_dropout=True)

    def individuality(self, x):
        return self.individuality_model(x)

    def counterfactual(self, x, i):
        x = torch.cat([x, i], dim=1)
        return self.counterfactual_model(x)

    def forward(self, x):
        i = self.individuality(x)
        y = self.counterfactual(x, i)
        return y

# Now we are going to redo everything, but with the NOTEAR features

In [30]:
df_suic_notears = df_suic.copy()
df_suic_notears = df_suic_notears[notears_selected]
df_suic_notears=pd.concat([df_suic_notears,df_novo], ignore_index=True)

# Create models for each target
targets = ["Suicidio", "Ansiedade"]
targets_idx = [df_suic_notears.columns.get_loc(c) for c in targets]

# Create the dataset
notears_suic_dataset = CausalDataset(df_suic_notears.values, targets_idx[0])
notears_anx_dataset = CausalDataset(df_suic_notears.values, targets_idx[1])
sample_x, sample_y = notears_suic_dataset[0]

# Now we predict the targets with a MLP
suic_causal_model = MLP([sample_x.shape[0], 64, 64, 64, 1], add_dropout=True)
anx_causal_model = MLP([sample_x.shape[0], 64, 64, 64, 1], add_dropout=True)

suic_causal_model, suic_causal_metrics = trainPipeline(notears_suic_dataset, suic_causal_model, verbose=False, max_epochs=1000, weight_decay=1e-5)
anx_causal_model, anx_causal_metrics = trainPipeline(notears_anx_dataset, anx_causal_model, verbose=False, max_epochs=1000, weight_decay=1e-5)



  0%|          | 0/1000 [00:00<?, ?it/s]

Final Train accuracy: [tensor(0.5057)]
Final Test accuracy: [tensor(0.4134)]


  0%|          | 0/1000 [00:00<?, ?it/s]

Final Train accuracy: [tensor(0.5307)]
Final Test accuracy: [tensor(0.4804)]


In [31]:
# calculate errors
suic_causal_erros = []
anx_causal_erros = []
with torch.no_grad():
    for data, y_true in notears_suic_dataset:
        y_pred = suic_causal_model(data)
        error = torch.abs(y_pred - y_true).item()
        suic_causal_erros.append(error)
    for data, y_true in notears_anx_dataset:
        y_pred = anx_causal_model(data)
        error = torch.abs(y_pred - y_true).item()
        anx_causal_erros.append(error)

suic_causal_erros = np.array(suic_causal_erros)
anx_causal_erros = np.array(anx_causal_erros)

# add errors to the dataframe
df_suic_notears_with_errors = df_suic_notears.copy()
df_suic_notears_with_errors["Suicidio_error"] = suic_causal_erros
df_suic_notears_with_errors["Ansiedade_error"] = anx_causal_erros

# create the causal dataset with the errors
targets_idx = [df_suic_notears_with_errors.columns.get_loc(c) for c in targets]
notears_causal_dataset_error = CausalDataset(df_suic_notears_with_errors.values, targets_idx)
sample_x, sample_y = notears_causal_dataset_error[0]

# create the counterfactual model
causal_cf_model = CounterfactualModel(sample_x.shape[0], 64, sample_y.shape[0])

# train the counterfactual model
causal_cf_model, causal_cf_metrics = trainPipeline(notears_causal_dataset_error, causal_cf_model, verbose=False, max_epochs=1000, weight_decay=1e-5)


  0%|          | 0/1000 [00:00<?, ?it/s]

Final Train accuracy: [tensor(0.8227), tensor(0.7453)]
Final Test accuracy: [tensor(0.7547), tensor(0.6713)]


In [32]:
# Load the dataset
targets = ["Suicidio", "Ansiedade"]
df = df_suic_notears_with_errors
# print available columns
# print possible values for each column
for col in df.columns:
    if "error" not in col:
        print(f"{col}: {sorted(df[col].unique())}")

# load models
counterfact = causal_cf_model
counterfact = counterfact.eval()

Suic_familia: [0.0, 1.0]
Drog_familia: [0.0, 1.0]
Capaz de tomar decisões importantes: [-1.0, 1.0, 2.0, 3.0, 4.0, 5.0]
Estudante: [-1.0, 0.0, 0.5, 1.0]
Hipocondriase: [-1.0, 0.0, 1.0, 2.0, 3.0, 4.0]
Sentimentos_culpa: [-1.0, 0.0, 1.0, 2.0, 3.0, 4.0]
Trabalho e interesses: [-1.0, 0.0, 1.0, 2.0, 3.0, 4.0]
Energia: [-1.0, 0.0, 1.0, 2.0]
Suicidio: [-1.0, 0.0, 1.0, 2.0, 3.0, 4.0]
Ansiedade: [-1.0, 0.0, 1.0, 2.0, 3.0, 4.0]


# Initial prediction

In [33]:
def contrafactual(df):
  df_suic_notears_with_errors = df
  # Select two individuals
  dataset = CausalDataset(df_suic_notears_with_errors.values, [df.columns.get_loc(c) for c in targets])

  idx = [0,3953]

  data1, y1 = dataset[idx[0]]
  data2, y2 = dataset[idx[1]]

  # predict the counterfactuals for crossed individuals
  with torch.no_grad():
      i1 = counterfact.individuality(data1.unsqueeze(0))
      i2 = counterfact.individuality(data2.unsqueeze(0))

      cf11 = counterfact.counterfactual(data1.unsqueeze(0), i1)
      cf12 = counterfact.counterfactual(data2.unsqueeze(0), i2)

      print(f"Counterfactual Suicide: {cf12.squeeze(0).round().numpy().tolist()[0]}")
      print()

In [34]:
print('Initial prediction')
contrafactual(df_suic_notears_with_errors)

Initial prediction
Counterfactual Suicide: 2.0



# Counterfactual simulations

In [35]:
print('Enter the value indicated according to the feature you want to modify')

Enter the value indicated according to the feature you want to modify


In [36]:
def alter_feature(df,Variavel,valor):
  df_suic_notears_with_errors = df
  if Variavel == '1':
    df_suic_notears_with_errors['Suic_familia'].loc[(df_suic_notears_with_errors.index==3953)] = float(Valor)
    print('Suicide in the family')
  elif Variavel == '2':
    df_suic_notears_with_errors['Drog_familia'].loc[(df_suic_notears_with_errors.index==3953)] = float(Valor)
    print('Drugs in the family')
  elif Variavel == '3':
    df_suic_notears_with_errors['Capaz de tomar decisões importantes'].loc[(df_suic_notears_with_errors.index==3953)] = float(Valor)
    print('Able to make important decisions')
  elif Variavel == '4':
    df_suic_notears_with_errors['Estudante'].loc[(df_suic_notears_with_errors.index==3953)] = float(Valor)
    print('Student')
  elif Variavel == '5':
    df_suic_notears_with_errors['Hipocondriase'].loc[(df_suic_notears_with_errors.index==3953)] = float(Valor)
    print('Hipocondriase')
  elif Variavel == '6':
    df_suic_notears_with_errors['Sentimentos_culpa'].loc[(df_suic_notears_with_errors.index==3953)] = float(Valor)
    print('Feelings of guilt')
  elif Variavel == '7':
    df_suic_notears_with_errors['Trabalho e interesses'].loc[(df_suic_notears_with_errors.index==3953)] = float(Valor)
    print('Work and interests')
  elif Variavel == '8':
    df_suic_notears_with_errors['Energia'].loc[(df_suic_notears_with_errors.index==3953)] = float(Valor)
    print('Energy')
  else:
    df_suic_notears_with_errors['Ansiedade'].loc[(df_suic_notears_with_errors.index==3953)] = float(Valor)
    print('Anxiety')
  return df_suic_notears_with_errors

In [38]:
for i in range(0,9):
  df_simula = df_suic_notears_with_errors.copy()
  print('Enter the value indicated according to the feature you want to modify')
  Variavel = input("1 Suicide in the family, 2 Drugs in the family, 3 Able to make important decisions, 4 Student, 5 Hypochondriasis, 6 Feelings of guilt, 7 Work and interests, 8 Energy and 9 Anxiety")
  Valor = input("Enter the new value")
  df_simula = alter_feature(df_simula, Variavel, Valor)
  contrafactual(df_simula)
  if i<7:
    simulacao = input("Do you want to run a new simulation? Enter 1 for Yes and 0 for No")
    if simulacao=='0':
      break

Enter the value indicated according to the feature you want to modify
1 Suicide in the family, 2 Drugs in the family, 3 Able to make important decisions, 4 Student, 5 Hypochondriasis, 6 Feelings of guilt, 7 Work and interests, 8 Energy and 9 Anxiety3
Enter the new value0
Able to make important decisions
Counterfactual Suicide: 2.0

Do you want to run a new simulation? Enter 1 for Yes and 0 for No1
Enter the value indicated according to the feature you want to modify
1 Suicide in the family, 2 Drugs in the family, 3 Able to make important decisions, 4 Student, 5 Hypochondriasis, 6 Feelings of guilt, 7 Work and interests, 8 Energy and 9 Anxiety7
Enter the new value0
Work and interests
Counterfactual Suicide: 2.0

Do you want to run a new simulation? Enter 1 for Yes and 0 for No1
Enter the value indicated according to the feature you want to modify
1 Suicide in the family, 2 Drugs in the family, 3 Able to make important decisions, 4 Student, 5 Hypochondriasis, 6 Feelings of guilt, 7 Work 