In [1]:
from tqdm import tqdm
import json
import numpy as np
import pandas as pd
import random

class Node:
    """A discrete variable of a Bayesian network with randomly drawn probability tables.

    Nodes without parents carry a marginal distribution in
    ``local_marginal_probs`` (columns ``X{id}`` and, once drawn, ``prob``).
    Nodes with parents carry a conditional probability table in
    ``local_conditional_probs`` (own column, one column per parent, ``prob``).
    """

    def __init__(self, num_vals, id, alpha=1.) -> None:
        """
            num_vals: Int (number of states; the states are 0..num_vals-1)
            id: Int (names the node's column, ``X{id}``)
            alpha: Float (dirichlet's alpha used for the conditional table)
        """
        self.id = id
        self.num_vals = num_vals
        self.local_marginal_probs = pd.DataFrame({f"X{self.id}": list(range(num_vals))})
        self.parents = []
        self.cur_val = -1
        self.alpha = alpha
        self.local_conditional_probs = None

    def set_parents(self, parents):
        """
            parents: List[Node]
        """
        self.parents = parents
        return

    def get_level(self):
        """Return the length of the longest parent chain above this node (0 for a root)."""
        if len(self.parents):
            return max([pa.get_level() for pa in self.parents]) + 1
        else:
            return 0

    def set_condprob(self):
        """Draw a fresh conditional probability table; does nothing for a node without parents.

        For every combination of parent values an independent
        Dirichlet(alpha, ..., alpha) vector over this node's states is drawn,
        so the ``prob`` column sums to 1 within each parent combination.
        """
        if not len(self.parents):
            return

        own_col = f"X{self.id}"
        # Only the value columns take part in the cross join; a parent whose
        # marginal was already drawn also has a 'prob' column that must not leak in.
        table = self.local_marginal_probs[[own_col]].copy()
        pa_comb = 1
        for pa in self.parents:
            pa_comb *= pa.num_vals
            table = table.merge(pa.local_marginal_probs[[f"X{pa.id}"]], how="cross")

        # After sorting by the parent columns, each consecutive run of
        # `num_vals` rows belongs to exactly one parent combination.
        table = table.sort_values(by=[f"X{pa.id}" for pa in self.parents])

        # Build the whole column first and assign it once. Writing through
        # `table['prob'].iloc[...]` is chained assignment, which pandas does not
        # guarantee to propagate (and never does under Copy-on-Write).
        table["prob"] = np.concatenate(
            [np.random.dirichlet([self.alpha] * self.num_vals) for _ in range(pa_comb)]
        )
        self.local_conditional_probs = table
        return

    def set_margprob(self, dirichlet_alpha=1):
        """Draw a fresh marginal distribution; does nothing for a node with parents.

        When a marginal already exists, new candidates are drawn until one lies
        at Euclidean distance >= 0.25 from the previous one. The loop is
        unbounded, so it only ends if such a draw is reachable for the given
        ``dirichlet_alpha`` and ``num_vals``.
        """
        if len(self.parents):
            return

        concentration = [dirichlet_alpha] * self.num_vals
        probs = np.random.dirichlet(concentration)
        # Explicit column check instead of a bare `except`, which would also
        # swallow KeyboardInterrupt raised while the loop below is running.
        if "prob" in self.local_marginal_probs.columns:
            old_probs = self.local_marginal_probs["prob"].to_numpy()
            while np.sqrt(np.sum((old_probs - probs) ** 2)) < 0.25:
                probs = np.random.dirichlet(concentration)

        self.local_marginal_probs["prob"] = probs
        return

    def sample(self):
        """Draw ``cur_val`` from this node's distribution given the parents' ``cur_val``."""
        own_col = f"X{self.id}"
        if len(self.parents):
            cpt = self.local_conditional_probs
            # Keep only the rows matching every parent's current value.
            mask = np.ones(len(cpt), dtype=bool)  # type:ignore
            for pa in self.parents:
                mask &= (cpt[f"X{pa.id}"] == pa.cur_val).to_numpy()  # type:ignore
            table = cpt[mask]  # type:ignore
        else:
            table = self.local_marginal_probs
        self.cur_val = np.random.choice(table[own_col], size=1, p=table["prob"]).item()
        return
        

class DAG:
    """A discrete Bayesian network with random probability tables over a given structure.

    ``adj_mtx[j][i] == 1`` marks node ``j`` as a parent of node ``i``. Node ids
    are 1-based (column ``X{i + 1}`` for matrix index ``i``). After
    construction ``self.nodes`` is sorted by level, so list position no longer
    matches the matrix index.
    """

    def __init__(self, adj_mtx, max_numvals, alpha) -> None:
        """
            adj_mtx: square 2-D array indexed as adj_mtx[parent][child]
            max_numvals: Int, upper bound handed to np.random.randint(2, max_numvals)
            alpha: Float (dirichlet's alpha for the conditional tables)
        """
        self.adj_mtx = adj_mtx
        # NOTE: np.random.randint excludes its upper bound, so each node gets
        # between 2 and max_numvals - 1 states (max_numvals=3 -> all binary).
        self.nodes = [Node(np.random.randint(2, max_numvals), i + 1, alpha) for i in range(adj_mtx.shape[0])]
        # Despite the name, this list collects the nodes WITHOUT parents.
        self.__endogeneous_nodes = []
        self.__render_adjmtx()
        self.__order_nodes()
        self.__init_condprobs()
        self.__init_margprob()

    def __render_adjmtx(self):
        """Wire up each node's parents from the adjacency matrix and record the parentless nodes."""
        size = self.adj_mtx.shape[0]
        for i in range(size):
            parents = [self.nodes[j] for j in range(size) if self.adj_mtx[j][i] == 1]
            self.nodes[i].set_parents(parents)  # type:ignore
            if len(parents) == 0:
                self.__endogeneous_nodes.append(self.nodes[i])
        return

    def __order_nodes(self):
        """Sort nodes by level so that every parent precedes its children when sampling."""
        self.nodes.sort(key=lambda item: item.get_level())
        return

    def __init_condprobs(self):
        for node in self.nodes:
            node.set_condprob()    # type:ignore
        return

    def __init_margprob(self):
        for node in self.nodes:
            node.set_margprob(1.0)    # type:ignore
        return

    def reinit_endoprob(self, dirichlet_alpha):
        """Redraw the marginal distribution of one uniformly chosen parentless node."""
        chosen_id = np.random.choice(len(self.__endogeneous_nodes), size=1).item()
        self.__endogeneous_nodes[chosen_id].set_margprob(dirichlet_alpha)
        return

    def disseminate(self, n):
        """Draw ``n`` joint samples by ancestral sampling.

        Returns a DataFrame with one row per sample and one column ``X{id}``
        per node, in the level-sorted order of ``self.nodes``.
        """
        columns = [f'X{node.id}' for node in self.nodes]
        # Collect rows in a list and build the frame once: appending with
        # df.loc[len(df)] copies the frame on every row and leaves object dtype.
        rows = []
        for _ in tqdm(range(n), leave=False):
            row = []
            for node in self.nodes:
                node.sample()
                row.append(node.cur_val)
            rows.append(row)
        return pd.DataFrame(rows, columns=columns)

In [2]:
import os
from pathlib import Path
from utils import is_acyclic

def gen_data(dag, n=10000, savepath="./data", filename="output.csv"):
    """Sample ``n`` rows from ``dag`` and optionally write them to CSV.

    Args:
        dag: object exposing ``disseminate(n)`` that returns a DataFrame.
        n: number of rows to sample.
        savepath: directory for the CSV; created (with parents) if missing.
        filename: CSV file name inside ``savepath``; ``None`` skips writing
            and leaves the file system untouched.

    Returns:
        The sampled DataFrame.
    """
    df = dag.disseminate(n)

    if filename is not None:
        # exist_ok=True avoids the check-then-create race of a separate exists() test.
        os.makedirs(savepath, exist_ok=True)
        df.to_csv(os.path.join(savepath, filename), index=False)

    return df

In [3]:
# from utils import compute_mll
from typing import List

def compute_mll(summary_with_ch: pd.DataFrame, potential_parent: list, num_env):
    """Turn a count table into probabilities and return the matching log-likelihood.

    Args:
        summary_with_ch: one row per observed value combination, with a
            ``count`` column holding the number of occurrences.
        potential_parent: columns to condition on; an empty list yields
            plain relative frequencies.
        num_env: suffix for the generated column names.

    Returns:
        ``(mll, table)``. ``mll`` is ``sum(count * log(prob))``. ``table`` is
        the input without its count column(s), with ``probs_{num_env}``
        prepended and, when conditioning, ``joint_{num_env}`` before it.
    """
    probs_col = f'probs_{num_env}'

    if not len(potential_parent):
        # No conditioning set: probabilities are relative frequencies.
        table = summary_with_ch.copy()
        table.insert(0, probs_col, table['count'] / table['count'].sum())
        log_lik = np.array(table['count']).dot(np.log(table[probs_col])).item()
        return log_lik, table.drop(['count'], axis=1)

    # Total count per parent configuration; after the merge this becomes
    # `count_y`, while the per-row count becomes `count_x`.
    parent_totals = summary_with_ch.groupby(potential_parent)['count'].sum().reset_index()
    table = summary_with_ch.merge(parent_totals, on=potential_parent, how='left')
    table.insert(0, probs_col, table['count_x'] / table['count_y'])
    table.insert(0, f'joint_{num_env}', table['count_x'] / table['count_x'].sum())
    log_lik = np.array(table['count_x']).dot(np.log(table[probs_col])).item()
    return log_lik, table.drop(['count_x', 'count_y'], axis=1)


def compute_variance_viasilos(silos: List[pd.DataFrame], variable: str, parents: list, verbose=False):
    """Measure how much the estimate of P(variable | parents) varies across silos.

    Args:
        silos: one DataFrame of samples per environment.
        variable: column name of the child variable.
        parents: column names of the conditioning variables (may be empty).
        verbose: print the per-silo probability table when true.

    Returns:
        ``(var_avg, mean_mll, conditional_probs_record)``.
        ``var_avg`` is the across-silo variance of each ``probs_{env}`` entry,
        averaged over the tracked value combinations. ``mean_mll`` is the mean
        of the per-silo log-likelihoods. ``conditional_probs_record`` has one
        row per value combination seen in ``silos[0]`` plus the columns
        produced by ``compute_mll`` for every silo. Combinations that occur
        only in later silos are not tracked; combinations missing from a silo
        show up as NaN.
    """
    keys = parents + [variable]
    conditional_probs_record = silos[0][keys].groupby(keys).count().reset_index()
    mll_list = []
    for env, data in enumerate(silos):
        # assign() works on a new frame, so the caller's data is never touched.
        summary_with_ch = data[keys].assign(count=1).groupby(keys)['count'].sum().reset_index()
        mll, output = compute_mll(summary_with_ch, parents, env)
        conditional_probs_record = conditional_probs_record.merge(output, on=keys, how='left')
        mll_list.append(mll)

    mean_mll = np.mean(mll_list)
    # Use only the conditional-probability columns. With parents present the
    # record also carries joint_{env} columns, and a positional slice would mix
    # joint and conditional probabilities into one variance.
    prob_cols = [f'probs_{env}' for env in range(len(silos))]
    var_avg = conditional_probs_record[prob_cols].var(axis=1, skipna=True).mean()
    if verbose:
        print(conditional_probs_record)
    return var_avg, mean_mll, conditional_probs_record


def compute_weighted_variance_viasilos(silos: List[pd.DataFrame], variable: str, parents: list, verbose=False):
    """Across-silo spread of P(variable | parents), weighted by the joint frequency.

    Without parents this is the value returned by
    ``compute_variance_viasilos``. With parents, each squared deviation of a
    silo's conditional probability from the row mean (NaNs ignored) is
    multiplied by that silo's joint frequency, and the non-NaN products are
    averaged.
    """
    variance, _, df = compute_variance_viasilos(silos, variable, parents, verbose=verbose)
    if not len(parents):
        return variance

    env_ids = range(len(silos))
    # Rows: value combinations; columns: silos.
    joint_mat = np.column_stack([df[f'joint_{i}'] for i in env_ids])
    probs_mat = np.column_stack([df[f'probs_{i}'] for i in env_ids])

    # Per-row mean over the silos where the combination was observed,
    # kept as a column vector so it broadcasts against probs_mat.
    row_means = np.array([[np.mean(row[~np.isnan(row)])] for row in probs_mat])

    weighted_sq_dev = joint_mat * (probs_mat - row_means) ** 2
    observed = ~np.isnan(weighted_sq_dev)
    return np.mean(weighted_sq_dev[observed])


def get_condprob(dag, node_id):
    """Return the probability table of the node at position ``node_id`` in ``dag.nodes``.

    ``node_id`` is used as a list index into ``dag.nodes``, not matched
    against ``Node.id``. A node with parents yields its conditional table,
    a node without parents its marginal table.
    """
    node = dag.nodes[node_id]
    return node.local_conditional_probs if len(node.parents) else node.local_marginal_probs
    

def compute_condprob(df: pd.DataFrame, variable: str, conditioned_vars: list):
    """Estimate P(variable | conditioned_vars) from the rows of ``df``.

    Returns the table produced by ``compute_mll`` for environment 0: one row
    per observed value combination with a ``probs_0`` column (and ``joint_0``
    when ``conditioned_vars`` is non-empty).
    """
    group_keys = [variable] + conditioned_vars
    # One unit of count per sample, summed per value combination.
    counted = df[conditioned_vars + [variable]].assign(count=1)
    summary_with_ch = counted.groupby(group_keys)['count'].sum().reset_index()
    return compute_mll(summary_with_ch, conditioned_vars, 0)[1]

### Generic graph

In [None]:
import bnlearn as bn

# Load the MUNIN network from its BIF file. Only the 'adjmat' entry of the
# result is used later in this notebook.
model = bn.import_DAG('../data/munin.bif', verbose=0)

In [None]:
# Name used for the output directory of this experiment.
dataname = "munin"
# Adjacency structure as a NumPy array; the `* 1` presumably casts a boolean
# table to 0/1 — TODO confirm the dtype returned by bnlearn.
adj_mtx = model['adjmat'].to_numpy() * 1
# Alternative source (disabled): ground-truth adjacency stored as JSON.
# data = json.load(open(f"../CausalBKAI/data/TestData/bnlearn_discrete_10000/truth_dag_adj/{dataname}.json", "r"))
# adj_mtx = np.array(data['Adj'])


# Alternative (disabled): a 3-node chain X1 -> X2 -> X3 for quick experiments.
# adj_mtx = np.array(
#     [[0,1,0],
#      [0,0,1],
#      [0,0,0]]
# )

# mi is passed as max_numvals; di is the Dirichlet alpha for the conditional
# tables and, in the silo loop, for redrawing root marginals.
mi = 3
di = 5

# NOTE(review): DAG draws each node's state count with
# np.random.randint(2, max_numvals), whose upper bound is exclusive, so mi = 3
# makes every node binary — confirm this is intended.
dag = DAG(adj_mtx, max_numvals=mi, alpha=di)
# get_condprob(dag, 2)
# get_condprob(dag, 2)

In [None]:
# track_endo = get_condprob(dag, 0)[['X1']].groupby(['X1']).sum().reset_index()
# Generate n silos (environments). Before each one, the marginal of one
# randomly chosen parentless node is redrawn; the conditional tables are left
# untouched, so silos differ only through root distributions.
n = 10
silos = []
for i in range(n):
    dag.reinit_endoprob(dirichlet_alpha=di)
    # track_endo = track_endo.merge(get_condprob(dag, 0).rename({"prob": f"prob{i}"}, axis=1), how='left', on=['X1'])
    # 5000 samples per silo, written to silo-{i}.csv and also kept in memory.
    df = gen_data(dag, 5000, savepath=f"../data/distributed/{dataname}/m{mi}_d{di}_n{n}", filename=f"silo-{i}.csv") # f"silo-{i}.csv"
    silos.append(df)

# track_endo

In [None]:
# Store the adjacency matrix next to the silo folders. The target directory
# is not created here; it exists only if the silo-generation cell ran first.
np.savetxt(f"../data/distributed/{dataname}/adj.txt", adj_mtx)

In [None]:
# Table of the node at position 0 of dag.nodes. The list is sorted by level,
# so this is a parentless node, not necessarily X1.
get_condprob(dag, 0)

In [None]:
# Table of the node at position 1 of the level-sorted dag.nodes list.
get_condprob(dag, 1)

In [None]:
# Table of the node at position 2 of the level-sorted dag.nodes list.
get_condprob(dag, 2)

In [None]:
# Joint-weighted across-silo variance of the estimated P(X2 | X1);
# verbose=True also prints the per-silo probability table.
variance = compute_weighted_variance_viasilos(silos, 'X2', ['X1'], verbose=True)
variance

In [None]:
# Same statistic for the reverse conditioning, P(X1 | X2), for comparison
# with the previous cell.
variance = compute_weighted_variance_viasilos(silos, 'X1', ['X2'], verbose=True)
variance

In [None]:
# Same statistic with two conditioning variables, P(X3 | X1, X2).
variance= compute_weighted_variance_viasilos(silos, 'X3', ['X1', 'X2'], verbose=False)
variance

### Categorical graph

In [None]:
import networkx as nx
import numpy as np
import random

In [None]:
# Random directed Erdos-Renyi graph; p = 1 / (num_node - 1) gives an expected
# out-degree of one edge per node.
num_node = 20
p = 1/(num_node - 1)

graph = nx.erdos_renyi_graph(n=num_node, p=p, seed=0, directed=True)

# Turn the graph into a DAG: while a directed cycle exists, delete one randomly
# chosen edge of it. nx.find_cycle signals "no cycle left" by raising
# NetworkXNoCycle, so that exception (and nothing else) ends the loop.
# The edge choice uses the unseeded `random` module, so the resulting DAG is
# not reproducible even though the initial graph is seeded.
while True:
    try:
        cycle_list = nx.find_cycle(graph, orientation='original')
    except nx.NetworkXNoCycle:
        break
    if not cycle_list:
        break
    # With an orientation argument each entry is (u, v, direction).
    a, b, _direction = random.choice(cycle_list)
    graph.remove_edge(a, b)

In [None]:
# Dense adjacency matrix: adj_mtx[a][b] = 1 encodes the edge a -> b.
adj_mtx = np.zeros([num_node, num_node])
for edge in graph.edges:
    a, b = edge
    adj_mtx[a][b] = 1
    
# NOTE(review): with adj_mtx[a][b] = 1 meaning a -> b, the axis=0 sum is the
# per-column total (incoming edges of each node) and the axis=1 sum is the
# per-row total (outgoing edges). The two names therefore look swapped —
# confirm which orientation the transpose below is meant to favour.
outdegrees = np.sum(adj_mtx, axis=0, keepdims=True)
indegrees = np.sum(adj_mtx, axis=1, keepdims=True)


# As written: reverse every edge when the largest column total is smaller
# than the largest row total.
if outdegrees.max() < indegrees.max():
    adj_mtx = adj_mtx.T

In [None]:
# mi is passed as max_numvals, di as the Dirichlet alpha of the conditional
# tables. These rebind the names used in the MUNIN section above.
mi = 3
di = 1
dag = DAG(adj_mtx, max_numvals=mi, alpha=di)

In [None]:
# Write n silos of 5000 samples each. One randomly chosen parentless node
# gets a new marginal before every silo.
n = 10
# silos = []
for i in range(n):
    # NOTE(review): alpha is hard-coded to 1 here rather than taken from `di`
    # (both are 1 in this section) — confirm they are meant to stay in sync.
    dag.reinit_endoprob(dirichlet_alpha=1)
    df = gen_data(dag, 5000, savepath=f"./data/distributed/erdos_renyi/d{num_node}_p{p}/m{mi}_d{di}_n{n}", filename=f"silo-{i}.csv") # f"silo-{i}.csv"
    # silos.append(df)

In [None]:
# Store the adjacency matrix one level above the silo folder. The directory
# is not created here; it exists only if the silo-generation cell ran first.
with open(f"./data/distributed/erdos_renyi/d{num_node}_p{p}/adj.txt", "w") as f:
    np.savetxt(f, adj_mtx)

### Continuous data

In [4]:
import networkx as nx
import numpy as np
import random
import pandas as pd
from tqdm import tqdm

In [5]:
import igraph as ig

def simulate_dag(d, s0):
    """Simulate the adjacency matrix of a random DAG.

    An undirected Erdos-Renyi graph is requested from igraph with ``n=d``
    nodes and ``m=s0`` edges. Its nodes are relabelled at random and only the
    strictly lower triangle is kept, which orients every edge and rules out
    cycles. A second random relabelling hides that ordering.

    Returns:
        A ``d x d`` array whose non-zero entries mark the directed edges.
    """
    def _shuffle_nodes(mat):
        # np.random.permutation shuffles the rows of the identity, giving a
        # permutation matrix; P.T @ M @ P relabels rows and columns together.
        perm = np.random.permutation(np.eye(mat.shape[0]))
        return perm.T @ mat @ perm

    undirected = np.array(ig.Graph.Erdos_Renyi(n=d, m=s0).get_adjacency().data)
    oriented = np.tril(_shuffle_nodes(undirected), k=-1)
    return _shuffle_nodes(oriented)


def simulate_parameter(B, w_ranges=((-2.0, -0.5), (0.5, 2.0))):
    """Attach random weights to the edges of a binary adjacency matrix.

    Args:
        B: adjacency matrix; non-zero entries mark edges.
        w_ranges: ``(low, high)`` intervals. Every entry is assigned one
            interval at random and its weight is drawn uniformly from it.

    Returns:
        A float array shaped like ``B`` that is zero wherever ``B`` is zero.
    """
    shape = B.shape
    weights = np.zeros(shape)
    # Index of the interval each entry draws from.
    range_choice = np.random.randint(len(w_ranges), size=shape)
    for range_idx, bounds in enumerate(w_ranges):
        lo, hi = bounds
        draws = np.random.uniform(low=lo, high=hi, size=shape)
        picked = range_choice == range_idx
        weights += B * picked * draws
    return weights

In [6]:
class Node:
    """A continuous variable of a linear structural model.

    A node without parents is sampled from a unit-variance normal centred on
    ``self.mean``, which is drawn once at construction. A node with parents
    is the weighted sum of its parents' current values; no noise term is
    added for such nodes.

    This definition rebinds the name ``Node`` used by the categorical section
    earlier in the notebook.
    """

    def __init__(self, weights, id) -> None:
        """
            weights: 1-D array; entry k is the weight applied to the node
                     whose id is k + 1
            id: Int (1-based)
        """
        self.id = id
        self.generative_weights = weights
        self.parents = []
        self.cur_val = -1
        self.mean = np.random.randn()

    def set_parents(self, parents):
        """
            parents: List[Node]
        """
        self.parents = parents
        return

    def get_level(self):
        """Return the length of the longest parent chain above this node (0 for a root)."""
        if len(self.parents):
            return max([pa.get_level() for pa in self.parents]) + 1
        else:
            return 0

    def sample(self):
        """Set ``cur_val`` from the parents' current values, or from the root distribution."""
        if len(self.parents):
            weights = np.asarray(self.generative_weights, dtype=float)
            # Always a float buffer: zeros_like on integer weights would
            # truncate the parents' real-valued samples on assignment.
            current_vals = np.zeros(weights.shape, dtype=float)
            for pa in self.parents:
                # Ids are 1-based, weight positions 0-based.
                current_vals[pa.id - 1] = pa.cur_val

            self.cur_val = (current_vals @ weights).item()
        else:
            self.cur_val = np.random.normal(self.mean, scale=1, size=1).item()
        return


class DAG:
    """A linear structural model over a weighted adjacency matrix.

    ``adj_mtx[j][i] != 0`` marks node ``j`` as a parent of node ``i`` with
    that weight. Node ids are 1-based. After construction ``self.nodes`` is
    sorted by level, so list position no longer matches the matrix index.

    This definition rebinds the name ``DAG`` used by the categorical section
    earlier in the notebook.
    """

    def __init__(self, adj_mtx:np.ndarray) -> None:
        self.adj_mtx = adj_mtx
        # Column i of the matrix holds the incoming weights of node i.
        self.nodes = [Node(adj_mtx[:,i].flatten(), i + 1) for i in range(adj_mtx.shape[0])]
        # Despite the name, this list collects the nodes WITHOUT parents.
        self.__endogeneous_nodes = []
        self.__render_adjmtx()
        self.__order_nodes()

    def __render_adjmtx(self):
        """Wire up each node's parents from the non-zero matrix entries and record the parentless nodes."""
        size = self.adj_mtx.shape[0]
        for i in range(size):
            parents = [self.nodes[j] for j in range(size) if self.adj_mtx[j][i] != 0]
            self.nodes[i].set_parents(parents)  # type:ignore
            if len(parents) == 0:
                self.__endogeneous_nodes.append(self.nodes[i])
        return

    def __order_nodes(self):
        """Sort nodes by level so that every parent precedes its children when sampling."""
        self.nodes.sort(key=lambda item: item.get_level())
        return

    def disseminate(self, n):
        """Draw ``n`` joint samples by ancestral sampling.

        Returns a DataFrame with one row per sample and one column ``X{id}``
        per node, in the level-sorted order of ``self.nodes``.
        """
        columns = [f'X{node.id}' for node in self.nodes]
        # Collect rows in a list and build the frame once: appending with
        # df.loc[len(df)] copies the frame on every row and leaves object dtype.
        rows = []
        for _ in tqdm(range(n), leave=False):
            row = []
            for node in self.nodes:
                node.sample()
                row.append(node.cur_val)
            rows.append(row)
        return pd.DataFrame(rows, columns=columns)

In [105]:
# d nodes and s edges requested for the random DAG. adj_mtx is the structure,
# w_mtx the same structure with a random weight on every edge.
d, s = 10, 15
adj_mtx = simulate_dag(d, s)
w_mtx = simulate_parameter(adj_mtx)

In [None]:
# Build the model from the weighted matrix and draw 10000 samples.
dag = DAG(w_mtx)
df = dag.disseminate(10000)
# disseminate() emits columns in level-sorted node order; reorder them
# numerically (X1, X2, ...) so column k matches row/column k of the matrices.
df = df.reindex(sorted(df.columns, key=lambda item: int(item[1:])), axis=1)

In [107]:
# NOTE(review): the file is named W_true but receives adj_mtx (the structure),
# while the data were generated from w_mtx (the weights) — confirm which
# matrix is intended. The target directories are not created here.
np.savetxt(f"../data/notears/linearGaussian/W_true_{d}_{s}.csv", adj_mtx, delimiter=",")
np.savetxt(f"../data/notears/linearGaussian/raw/X_{d}_{s}.csv", df.to_numpy(), delimiter=",")

In [None]:
# Display the sampled frame (pandas truncates the rendering of long frames).
df