In [3]:
import numpy as np
import pandas as pd
from pathlib import Path
import os
import json

dataname = "hailfinder"

mi = 3      # The number of values a variable can take lies in the range [2, mi-1]
di = 5      # The Dirichlet alpha that controls the data distribution
n = 10      # The number of data silos

folderpath = f"./data/distributed/{dataname}/m{mi}_d{di}_n{n}"
groundtruth = np.loadtxt(f"./data/distributed/{dataname}/adj.txt", delimiter=' ')

# Fail fast with a clear error. Merely printing a message and continuing would
# only postpone the failure to pd.concat([]), which raises an unrelated
# "No objects to concatenate" ValueError.
if not Path(folderpath).is_dir():
    raise FileNotFoundError(f"Folder {folderpath} does not exist!")

# Rebuilt from scratch on every run, so re-executing the cell never duplicates silos.
silos = []

# NOTE: sorted() is lexicographic, so "silo-10.csv" would come before
# "silo-2.csv"; this only affects the position of each silo inside `silos`.
for file in sorted(os.listdir(folderpath)):
    # Skip anything that is not a CSV (checkpoint folders, OS metadata files, ...).
    if not file.endswith(".csv"):
        continue
    filename = os.path.join(folderpath, file)
    silo_data = pd.read_csv(filename)
    silos.append(silo_data)
    print("Loaded file:", filename)

if not silos:
    raise FileNotFoundError(f"No .csv silo files found in {folderpath}")

# Pool all silos row-wise, then order the columns by the integer that follows
# the one-character prefix of each column name.
merged_df = pd.concat(silos, axis=0)
merged_df = merged_df.reindex(sorted(merged_df.columns, key=lambda item: int(item[1:])), axis=1)
all_vars = merged_df.columns

Loaded file: ./data/distributed/hailfinder/m3_d5_n10/silo-0.csv
Loaded file: ./data/distributed/hailfinder/m3_d5_n10/silo-1.csv
Loaded file: ./data/distributed/hailfinder/m3_d5_n10/silo-2.csv
Loaded file: ./data/distributed/hailfinder/m3_d5_n10/silo-3.csv
Loaded file: ./data/distributed/hailfinder/m3_d5_n10/silo-4.csv
Loaded file: ./data/distributed/hailfinder/m3_d5_n10/silo-5.csv
Loaded file: ./data/distributed/hailfinder/m3_d5_n10/silo-6.csv
Loaded file: ./data/distributed/hailfinder/m3_d5_n10/silo-7.csv
Loaded file: ./data/distributed/hailfinder/m3_d5_n10/silo-8.csv
Loaded file: ./data/distributed/hailfinder/m3_d5_n10/silo-9.csv


#### Constraint-based PC

In [4]:
import bnlearn as bn

# Structure learning on the pooled data with bnlearn's methodtype='cs'.
model = bn.structure_learning.fit(merged_df, methodtype='cs', verbose=0)

# Square matrix over all variables; entry [i, j] is set to 1 for every
# (source, target) pair listed in model['dag_edges'].
n_vars = len(all_vars)
adj_mtx = np.zeros((n_vars, n_vars))
for source, target in model['dag_edges']:     # type:ignore
    # Node names are parsed as a one-character prefix followed by a 1-based index.
    adj_mtx[int(source[1:]) - 1, int(target[1:]) - 1] = 1

  0%|          | 0/5 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
from plot_utils import true_edge, spur_edge, fals_edge, miss_edge, swap_pos

# Compare the estimated adjacency matrix with the ground truth using the four
# plot_utils helpers, called in the order true, spur, fals, miss.
etrue, espur, efals, emiss = (
    edge_fn(groundtruth, adj_mtx)
    for edge_fn in (true_edge, spur_edge, fals_edge, miss_edge)
)

# One summary line with the sizes of the four results; note that the print
# order is etrue, espur, emiss, efals.
print(*map(len, (etrue, espur, emiss, efals)))

#### Invariant Causal Prediction

In [22]:
import causalicp as icp

# One array per silo, passed to ICP as the list of environments. Columns are
# selected through `all_vars` so that column position `var_idx` in every array
# is exactly the variable whose name is printed; the raw silo frames keep their
# CSV column order, which is not guaranteed to match the sorted `all_vars`.
# Built once, because the arrays do not change between iterations.
environments = [silo[all_vars].to_numpy() for silo in silos]

for var_idx, var_name in enumerate(all_vars):
    print("Testing for variable:", var_name, end="\t")
    result = icp.fit(environments, var_idx, alpha=0.05, precompute=True, verbose=False, color=False)
    # NOTE(review): per the causalicp documentation `accepted_sets` lists every
    # accepted predictor set, while the parent estimate is `result.estimate` -
    # confirm which of the two this "Causes" label is meant to report.
    print("Causes: ", result.accepted_sets)

Testing for variable: X1	Causes:  []
Testing for variable: X2	Causes:  []
Testing for variable: X3	Causes:  []
Testing for variable: X4	Causes:  []
Testing for variable: X5	Causes:  []


#### Causal discovery from nonstationary/heterogeneous data (CDNOD)

In [None]:
from causallearn.search.ConstraintBased.CDNOD import cdnod
from baselines.FL_FedCDH.mycausallearn.utils.data_utils import get_cpdag_from_cdnod, get_dag_from_pdag
from causallearn.utils.cit import fisherz

# Domain index for CDNOD: one row per pooled sample, holding the 1-based
# position of the silo the sample came from. Each silo contributes exactly
# len(silo) rows, so silos of different sizes are handled; a fixed
# len(silos) * len(silos[0]) row count would fail to reshape in that case.
silo_sizes = np.array([len(silo) for silo in silos], dtype=int)
c_indx = np.repeat(np.arange(1, len(silos) + 1), silo_sizes).reshape(-1, 1)

# Run CDNOD on the pooled samples with the per-sample domain index,
# significance level 0.05 and the `fisherz` independence test.
pooled_samples = merged_df.to_numpy()
cg = cdnod(pooled_samples, c_indx, 0.05, fisherz)

# Keep only the leading len(all_vars) x len(all_vars) block of the result.
# NOTE(review): presumably the remaining row/column belongs to the domain
# index that CDNOD adds as an extra node - confirm.
n_vars = len(all_vars)
est_graph = cg.G.graph[:n_vars, :n_vars]

# est_graph[i,j]=-1 & est_graph[j,i]=1  ->  est_cpdag[i,j]=1
est_cpdag = get_cpdag_from_cdnod(est_graph)

# Turn the PDAG into a DAG with the project helper.
est_dag_from_pdag = get_dag_from_pdag(est_cpdag)

In [None]:
from plot_utils import true_edge, spur_edge, fals_edge, miss_edge

# Compare the DAG derived from the CDNOD result with the ground truth using
# the four plot_utils helpers, called in the order true, spur, fals, miss.
etrue, espur, efals, emiss = (
    edge_fn(groundtruth, est_dag_from_pdag)
    for edge_fn in (true_edge, spur_edge, fals_edge, miss_edge)
)

# One summary line with the sizes of the four results; note that the print
# order is etrue, espur, emiss, efals.
print(*map(len, (etrue, espur, emiss, efals)))

#### Constraint-based Fast Causal Inference (FCI)

In [None]:
from causallearn.search.ConstraintBased.FCI import fci

# Run FCI on the pooled samples with the library's default parameters
# (only the data array is passed).
# fci() returns two values: `g` and `edges`. `edges` is iterated in a later
# cell to build an adjacency matrix; `g` is not used in the visible cells.
g, edges = fci(merged_df.to_numpy())

In [None]:
# for edge in edges:
#     print(edge, "\t", edge.get_node1(), edge.get_numerical_endpoint1(), edge.get_node2(), edge.get_numerical_endpoint2())

def _node_index(node):
    """Return the 0-based index parsed from a node name (one-character prefix + 1-based number)."""
    return int(node.get_name()[1:]) - 1


# Square matrix over all variables, filled from the FCI edge list.
n_vars = len(all_vars)
adj_mtx = np.zeros((n_vars, n_vars))
for edge in edges:     # type:ignore
    first = _node_index(edge.get_node1())
    second = _node_index(edge.get_node2())

    # Only the endpoint mark at node1 decides what is recorded.
    # NOTE(review): the codes -1 / 2 presumably correspond to causal-learn's
    # TAIL / CIRCLE endpoints - confirm against the installed version.
    mark = edge.get_numerical_endpoint1()
    if mark == -1:
        adj_mtx[first, second] = 1
    elif mark != 2:
        adj_mtx[second, first] = 1
    # mark == 2: the edge leaves no entry in the matrix.

In [None]:
from plot_utils import true_edge, spur_edge, fals_edge, miss_edge, swap_pos

# Compare the adjacency matrix built from the FCI edges with the ground truth
# using the four plot_utils helpers, called in the order true, spur, fals, miss.
etrue, espur, efals, emiss = (
    edge_fn(groundtruth, adj_mtx)
    for edge_fn in (true_edge, spur_edge, fals_edge, miss_edge)
)

# One summary line with the sizes of the four results; note that the print
# order is etrue, espur, emiss, efals.
print(*map(len, (etrue, espur, emiss, efals)))

#### Value-based HillClimb GES

In [None]:
import bnlearn as bn

# Hill-climb structure search (methodtype='hc') on the pooled data.
model = bn.structure_learning.fit(merged_df, methodtype='hc', verbose=0)

# model['adjmat'] is a labelled frame. Align it to `all_vars` by label before
# dropping the labels, so that row/column k of the array is always variable
# all_vars[k], whatever node order the library used. Variables absent from the
# learned graph get all-zero rows/columns.
adj_mtx = (
    model['adjmat']      # type:ignore
    .reindex(index=all_vars, columns=all_vars, fill_value=False)
    .astype(float)
    .to_numpy()
)

In [None]:
from plot_utils import true_edge, spur_edge, fals_edge, miss_edge

# NOTE(review): this cell scores the TRANSPOSE of the hill-climb estimate,
# whereas every other evaluation cell in this notebook passes its estimate
# untransposed - confirm that the flip is intended for this method.
estimate = adj_mtx.T

# Compare with the ground truth using the four plot_utils helpers, called in
# the order true, spur, fals, miss.
etrue, espur, efals, emiss = (
    edge_fn(groundtruth, estimate)
    for edge_fn in (true_edge, spur_edge, fals_edge, miss_edge)
)

# One summary line with the sizes of the four results; note that the print
# order is etrue, espur, emiss, efals.
print(*map(len, (etrue, espur, emiss, efals)))

#### Value-based Treesearch Chow-Liu Algorithm

In [None]:
import bnlearn as bn
# Preprocessing raw dataset: df2onehot returns two frames, of which only the
# second (`dfnum`) is used for structure learning below.
dfhot, dfnum = bn.df2onehot(merged_df)

# Structure learning (methodtype='cl', tree rooted at 'X1')
model = bn.structure_learning.fit(dfnum, methodtype='cl', verbose=0, root_node='X1')

# model['adjmat'] is a labelled frame. Align it to `all_vars` by label before
# dropping the labels, so that row/column k of the array is always variable
# all_vars[k]. NOTE(review): for a tree grown from a root node the library's
# node order presumably follows the traversal rather than the column order, in
# which case a purely positional conversion would be misaligned with the
# ground truth. Variables absent from the learned graph get all-zero
# rows/columns.
adj_mtx = (
    model['adjmat']      # type:ignore
    .reindex(index=all_vars, columns=all_vars, fill_value=False)
    .astype(float)
    .to_numpy()
)

In [None]:
from plot_utils import true_edge, spur_edge, fals_edge, miss_edge

# Compare the Chow-Liu estimate with the ground truth using the four
# plot_utils helpers, called in the order true, spur, fals, miss.
etrue, espur, efals, emiss = (
    edge_fn(groundtruth, adj_mtx)
    for edge_fn in (true_edge, spur_edge, fals_edge, miss_edge)
)

# One summary line with the sizes of the four results; note that the print
# order is etrue, espur, emiss, efals.
print(*map(len, (etrue, espur, emiss, efals)))

#### Value-based Tree-augmented Naive Bayes (TAN) 

In [None]:
import bnlearn as bn

# Structure learning (methodtype='tan' with 'X1' as the class node), followed
# by an independence test at alpha=0.05 with pruning enabled.
model = bn.structure_learning.fit(merged_df, methodtype='tan', class_node='X1', verbose=0)
pruned_model = bn.independence_test(model, merged_df, alpha=0.05, prune=True)

# pruned_model['adjmat'] is a labelled frame. Align it to `all_vars` by label
# before dropping the labels, so that row/column k of the array is always
# variable all_vars[k], whatever node order (or node subset) the library
# returns. Variables absent from the pruned graph get all-zero rows/columns.
adj_mtx = (
    pruned_model['adjmat']      # type:ignore
    .reindex(index=all_vars, columns=all_vars, fill_value=False)
    .astype(float)
    .to_numpy()
)

In [None]:
from plot_utils import true_edge, spur_edge, fals_edge, miss_edge

# Compare the pruned TAN estimate with the ground truth using the four
# plot_utils helpers, called in the order true, spur, fals, miss.
etrue, espur, efals, emiss = (
    edge_fn(groundtruth, adj_mtx)
    for edge_fn in (true_edge, spur_edge, fals_edge, miss_edge)
)

# One summary line with the sizes of the four results; note that the print
# order is etrue, espur, emiss, efals.
print(*map(len, (etrue, espur, emiss, efals)))

### CDT.CAUSALITY

#### Concave penalized Coordinate Descent with reparametrization (CCDR)

In [None]:
from cdt.causality.graph import CCDr

#### Greedy Interventional Equivalence Search algorithm (GIES)

In [None]:
from cdt.causality.graph import GIES
import networkx as nx

# Instantiate cdt's GIES estimator with its default constructor arguments and
# run it on the pooled data.
# NOTE(review): the names `obj` and `output` are reassigned by another cell of
# this notebook, so their content depends on which cell ran last.
obj = GIES()
output = obj.predict(merged_df)

In [None]:

# Print every entry of `output.edges`, one per line. `output` is whatever the
# most recently executed `predict` call returned.
for e in output.edges:
    print(e)

#### Structural Agnostic Model

In [None]:
from cdt.causality.graph import SAM

# Instantiate cdt's SAM estimator with its default constructor arguments and
# run it on the pooled data.
# NOTE(review): this overwrites the `obj` / `output` names also assigned by the
# GIES cell; no random seed is set in the visible cells - confirm whether the
# result needs to be reproducible.
obj = SAM()
output = obj.predict(merged_df)

#### Causal Generative Neural Networks

In [None]:
from cdt.causality.graph import CGNN