In [1]:
import sys
import logging
import warnings
# Show pgmpy log records at WARNING level and above.
logging.getLogger('pgmpy').setLevel(logging.WARNING)
# Silence every Python warning for the whole session. The second, narrower
# filter (networkx backend RuntimeWarning) is redundant while the blanket
# filter above is active.
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=RuntimeWarning, module="networkx.utils.backends")


import os
import random
import itertools
import pysnooper
import pandas as pd
import pyagrum as gum
import concurrent.futures
from functools import reduce
import numpy as np
from time import *
import networkx as nx
from Decom_Tree import *
from copy import deepcopy
os.environ["NUMEXPR_MAX_THREADS"] = "16"
from pgmpy.utils import get_example_model
from pgmpy.sampling import BayesianModelSampling
from pgmpy.factors import factor_product,factor_divide
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.models import DiscreteBayesianNetwork,JunctionTree
from pgmpy.factors.discrete import DiscreteFactor, TabularCPD
from pgmpy.inference import VariableElimination, BeliefPropagation

In [2]:
def generate_state_names(g):
    """Map every node of ``g`` to the binary state labels ``[0, 1]``.

    Parameters
    ----------
    g : object exposing an iterable ``nodes`` attribute (e.g. a networkx graph).

    Returns
    -------
    dict
        ``{node: [0, 1]}`` with a separate list object per node.
    """
    state_names = {}
    for node in g.nodes:
        # A fresh list per node so that callers can mutate one entry safely.
        state_names[node] = [0, 1]
    return state_names

def get_random_cpds_with_labels(model, state_names, inplace=False, seed=None):
    """Draw one random ``TabularCPD`` per node of ``model``.

    Parameters
    ----------
    model : network object exposing ``nodes()``, ``predecessors(node)`` and
        ``add_cpds(*cpds)``.
    state_names : dict or None
        Maps a variable to its list of state labels. The cardinality of each
        variable is the number of labels given here; variables without an
        entry default to 2 states.
    inplace : bool, default False
        If True the CPDs are added to ``model`` and ``None`` is returned.
    seed : int or None
        If given, the result is reproducible. Every node receives its own
        seed derived from ``seed``; passing the same seed to every node would
        give identical tables to all nodes that share a table shape.

    Returns
    -------
    list of TabularCPD, or None when ``inplace`` is True.
    """
    nodes = list(model.nodes())
    labels = state_names or {}
    # Built once: cardinalities follow the supplied labels instead of a
    # hard-coded 2, so label lists of any length stay consistent.
    cardinality = {
        var: len(labels[var]) if var in labels else 2
        for var in nodes
    }

    # Deterministic stream of per-node seeds (only when a seed was requested).
    seeder = random.Random(seed) if seed is not None else None

    cpds = []
    for node in nodes:
        node_seed = seeder.randrange(2 ** 32) if seeder is not None else None
        cpds.append(
            TabularCPD.get_random(
                variable=node,
                evidence=list(model.predecessors(node)),
                cardinality=cardinality,
                state_names=state_names,
                seed=node_seed,
            )
        )

    if inplace:
        model.add_cpds(*cpds)
        return None
    return cpds
    
def mar_pro(t,r):
    """Marginalise a clique tree down to the variables in ``r``.

    Cliques are eliminated one leaf at a time. Each leaf sums out the
    variables that are neither shared with its neighbour nor requested in
    ``r``, and is then absorbed into the neighbour as
    ``leaf * neighbour / separator``. The last remaining clique is reduced
    to the variables in ``r`` and returned.

    Parameters
    ----------
    t : clique tree exposing ``factors``, ``nodes``, ``adj``, ``degree``,
        ``number_of_nodes()`` and ``remove_node()``. Every node is an iterable
        of variable names and every factor's scope must equal one node.
        **All nodes are removed from ``t``**, so pass a copy if the tree is
        needed afterwards. ``t.factors`` itself is not modified (a deep copy
        is used).
    r : iterable
        Variables to keep in the result.

    Returns
    -------
    The factor (from the internal deep copy) held by the last clique.

    Raises
    ------
    ValueError
        If ``t`` has no nodes, or if the lowest-degree node does not have
        exactly one neighbour while other nodes remain (i.e. ``t`` is not a
        connected tree).
    """
    keep = set(r)
    if t.number_of_nodes() == 0:
        raise ValueError("mar_pro needs a clique tree with at least one node")

    # Scope of each factor, used to find the factor belonging to a clique.
    factors_order = [set(factor.variables) for factor in t.factors]
    factor_copy = deepcopy(t.factors)

    while t.number_of_nodes() > 0:

        min_degree_node = min(t.nodes(), key=t.degree)
        self_index = factors_order.index(set(min_degree_node))

        if len(t.nodes) > 1:
            neighbours = list(t.adj[min_degree_node])
            if len(neighbours) != 1:
                raise ValueError(
                    "clique %r has %d neighbours; expected a leaf of a connected tree"
                    % (min_degree_node, len(neighbours))
                )
            neibor_set = set(neighbours[0])
            neibor_inder = factors_order.index(neibor_set)

            # Separator between the leaf and its neighbour, and its marginal,
            # taken before the leaf's own factor is reduced.
            S = set(min_degree_node) & neibor_set
            S_factor = deepcopy(factor_copy[self_index])
            S_factor.marginalize(set(factor_copy[self_index].variables) - S)

            # Sum out what is needed neither by the neighbour nor by the query.
            marginal_nodes = set(min_degree_node) - (S | keep)
            if len(marginal_nodes) > 0:
                factor_copy[self_index].marginalize(marginal_nodes)

            # Absorb the leaf: P(leaf) * P(neighbour) / P(separator)
            factor_copy[neibor_inder] = factor_product(factor_copy[self_index], factor_copy[neibor_inder])
            factor_copy[neibor_inder] = factor_divide(factor_copy[neibor_inder], S_factor)

        else:
            # Last clique: keep only the requested variables.
            marginal_nodes = set(factor_copy[self_index].variables) - keep
            if len(marginal_nodes) > 0:
                factor_copy[self_index].marginalize(marginal_nodes)

        t.remove_node(min_degree_node)

    return factor_copy[self_index]

In [3]:
# Load pgmpy's 'child' example network. Only its nodes and edges are copied
# below; the CPDs of the loaded model are not read in this cell.
model = get_example_model('child')
# Rebuild the structure as a plain networkx DiGraph (nodes first, then edges).
G = nx.DiGraph()
G.add_nodes_from(model.nodes)
G.add_edges_from(model.edges)
# Every node is given the two state labels [0, 1].
state_names = generate_state_names(G)
# Graph_Decom is not defined in this notebook -- presumably it comes from the
# star-imported Decom_Tree module (TODO confirm). `decom` is reused by later
# cells (spann_tree, PPDD); `atoms` is not referenced again in the cells shown.
decom = Graph_Decom(G)
atoms = decom.Decom()

In [20]:
# ---------------------------------------------------------------------------
# Simulation study.
# For every sample size and every random parameterisation of the network,
# repeatedly sample data, re-estimate parameters by maximum likelihood, and
# compare the estimate of P(R[0] | R[1] = 0) (index 0 of the query result)
# against the value obtained from the generating network `bn`:
#   * "in-cluster":    both variables come from one clique of T; the estimate
#                      uses a sub-network fitted on that clique only.
#   * "cross-cluster": two arbitrary variables; the estimate is assembled from
#                      per-clique fits with mar_pro().
# The "*_full" columns hold the same statistics for the fully fitted network.
#
# NOTE(review): neither `random` nor the pgmpy sampler is seeded here, so the
# numbers differ from run to run.
# ---------------------------------------------------------------------------
SAMPLE_SIZES = [100, 500, 1000, 2500, 5000, 7500, 10000]
N_DISTRIBUTIONS = 100   # random parameterisations per sample size (rows of `data`)
N_REPEATS = 100         # data sets sampled per parameterisation
METRIC_SUFFIXES = ["Bias", "RMSE", "Bias_full", "RMSE_full",
                   "Bias_1", "RMSE_1", "Bias_1_full", "RMSE_1_full"]

logging.getLogger('pgmpy').setLevel(logging.ERROR)
T = decom.spann_tree()

# `bn` generates the data, `learn_bn` is refitted on every sampled data set.
bn = DiscreteBayesianNetwork()
bn.add_nodes_from(list(G.nodes))
bn.add_edges_from(list(G.edges))

learn_bn = DiscreteBayesianNetwork()
learn_bn.add_nodes_from(list(G.nodes))
learn_bn.add_edges_from(list(G.edges))

# One column per (sample size, metric); the size list is shared with the main
# loop below so the two can no longer drift apart.
columns = [f"{size}_{suffix}" for size in SAMPLE_SIZES for suffix in METRIC_SUFFIXES]
data = pd.DataFrame(index=range(N_DISTRIBUTIONS), columns=columns)

# In-cluster queries draw two distinct variables from one clique, so cliques
# with fewer than two variables cannot be used.
eligible_cliques = [clique for clique in T.nodes if len(clique) >= 2]
if not eligible_cliques:
    raise ValueError("in-cluster queries need at least one clique with two or more variables")


def _full_network_estimate(df, target, evidence_var):
    """Refit ``learn_bn`` on ``df`` and query P(target | evidence_var = 0).

    Returns ``(estimate, seconds)`` where ``estimate`` is index 0 of the query
    result and ``seconds`` covers building the inference engine and running
    the query (the fit itself is not timed).
    """
    learn_bn.cpds = []
    learn_bn.fit(df, estimator=MaximumLikelihoodEstimator)
    start = time()
    full_infer = BeliefPropagation(learn_bn)
    estimate = full_infer.query(variables=[target], evidence={evidence_var: 0}, show_progress=False).values[0]
    return estimate, time() - start


# NOTE(review): these four lists are created but never filled in this cell;
# the timings accumulated in sub_T / full_T are overwritten per sample size.
sum_sub_T,sum_full_T,sum_Bias,sum_RMSE = [],[],[],[]
for sample_size in SAMPLE_SIZES:
    print(sample_size)

    sub_T,full_T = 0,0
    for row_index in range(N_DISTRIBUTIONS):

        # ----- In-cluster query ------------------------------------------
        # Bias accumulates signed errors, RMSE accumulates squared errors;
        # both are turned into mean / root-mean-square when written to `data`.
        Bias = RMSE = Bias_full = RMSE_full = 0
        atom = random.sample(eligible_cliques, 1)
        C = list(atom[0])
        R = random.sample(C, 2)

        # New random parameters for the generating network and the reference value.
        bn.cpds = []
        get_random_cpds_with_labels(bn, state_names, inplace=True)
        ori_infer = BeliefPropagation(bn)
        ori_query = ori_infer.query(variables = [R[0]], evidence= {R[1]:0}, show_progress=False).values[0]

        for k in range(N_REPEATS):

            df = BayesianModelSampling(bn).forward_sample(size=sample_size, show_progress=False)

            full_query, elapsed = _full_network_estimate(df, R[0], R[1])
            full_T += elapsed
            Bias_full += (full_query-ori_query)
            RMSE_full += (full_query-ori_query)**2

            # Sub-network restricted to the chosen clique, fitted on its columns only.
            sub_df = df[C]
            sub_model = DiscreteBayesianNetwork(list(G.subgraph(C).edges))
            sub_model.add_nodes_from(C)
            sub_model.cpds = []
            sub_model.fit(sub_df,estimator=MaximumLikelihoodEstimator)
            start = time()
            sub_infer = VariableElimination(sub_model)
            sub_query = sub_infer.query(variables = [R[0]], evidence= {R[1]:0}, show_progress=False).values[0]
            sub_T += time()-start

            Bias += (sub_query-ori_query)
            RMSE += (sub_query-ori_query)**2

        data.at[row_index, f"{sample_size}_Bias"] = Bias/N_REPEATS
        data.at[row_index, f"{sample_size}_RMSE"] = (RMSE/N_REPEATS)** 0.5
        data.at[row_index, f"{sample_size}_Bias_full"] = Bias_full/N_REPEATS
        data.at[row_index, f"{sample_size}_RMSE_full"] = (RMSE_full/N_REPEATS)** 0.5


        # ----- Cross-cluster query ---------------------------------------
        # Same generating network as above, but the two variables are drawn
        # from the whole graph.
        Bias = RMSE = Bias_full = RMSE_full = 0
        R = random.sample(list(G.nodes), 2)
        ori_query = ori_infer.query(variables = [R[0]], evidence= {R[1]:0}, show_progress=False).values[0]

        for k in range(N_REPEATS):

            df = BayesianModelSampling(bn).forward_sample(size=sample_size, show_progress=False)

            full_query, elapsed = _full_network_estimate(df, R[0], R[1])
            full_T += elapsed
            Bias_full += (full_query-ori_query)
            RMSE_full += (full_query-ori_query)**2

            # Fit one sub-network per clique and attach the product of its
            # CPDs to the junction tree as that clique's factor.
            clique_trees = JunctionTree(T.edges())
            evi_0 = None
            start1 = end1 = 0.0
            for anode in list(T.nodes):
                clique_vars = list(anode)
                sub_model = DiscreteBayesianNetwork(list(G.subgraph(clique_vars).edges))
                # Same as the in-cluster path: make sure variables without an
                # edge inside the clique are still part of the sub-model, so
                # the clique factor covers the whole clique.
                sub_model.add_nodes_from(clique_vars)
                sub_model.cpds = []
                sub_model.fit(data=df[clique_vars],estimator=MaximumLikelihoodEstimator)
                factors = reduce(factor_product, [cpd.to_factor() for cpd in sub_model.cpds])
                if R[1] in anode:
                    # Marginal of the evidence variable (index 0), used to
                    # turn the joint from mar_pro into a conditional.
                    start1 = time()
                    sub_infer = VariableElimination(sub_model)
                    evi_0 = sub_infer.query(variables = [R[1]], evidence= {}, show_progress=False).values[0]
                    end1 = time()
                clique_trees.add_factors(factors)

            if evi_0 is None:
                raise RuntimeError(f"no clique of T contains the evidence variable {R[1]!r}")

            # Keep only the cliques returned by decom.PPDD for the query variables.
            remove_node = set(T.nodes) - set(decom.PPDD(set(R)).nodes)
            P_T = clique_trees.copy()
            for rnode in remove_node:
                P_T.remove_node(rnode)

            start = time()
            OUR = (mar_pro(P_T,set(R)).values[0][0])/evi_0
            sub_T = sub_T+(time()-start)+(end1-start1)
            Bias += (OUR-ori_query)
            RMSE += (OUR-ori_query)**2

        data.at[row_index, f"{sample_size}_Bias_1"] = Bias/N_REPEATS
        data.at[row_index, f"{sample_size}_RMSE_1"] = (RMSE/N_REPEATS)** 0.5
        data.at[row_index, f"{sample_size}_Bias_1_full"] = Bias_full/N_REPEATS
        data.at[row_index, f"{sample_size}_RMSE_1_full"] = (RMSE_full/N_REPEATS)** 0.5


100
500
1000
2500
5000
7500
10000


In [22]:
# Display the result table: one row per random parameterisation and eight
# bias / RMSE columns per sample size.
data

Unnamed: 0,100_Bias,100_RMSE,100_Bias_full,100_RMSE_full,100_Bias_1,100_RMSE_1,100_Bias_1_full,100_RMSE_1_full,500_Bias,500_RMSE,...,7500_Bias_1_full,7500_RMSE_1_full,10000_Bias,10000_RMSE,10000_Bias_full,10000_RMSE_full,10000_Bias_1,10000_RMSE_1,10000_Bias_1_full,10000_RMSE_1_full
0,-0.003793,0.066298,-0.003793,0.066298,0.001979,0.021454,0.001987,0.021456,0.001089,0.02129,...,0.002347,0.011033,-0.000082,0.004105,-0.000082,0.004105,-0.000363,0.004885,-0.000363,0.004885
1,0.002371,0.072221,0.002371,0.072221,-0.004044,0.048325,-0.004038,0.048316,0.004039,0.028245,...,-0.000294,0.005674,-0.000089,0.002985,-0.000089,0.002985,0.000164,0.005908,0.000164,0.005908
2,-0.000063,0.03621,-0.000063,0.03621,0.006653,0.051399,0.006653,0.051399,-0.001331,0.032263,...,0.000946,0.005499,0.000186,0.004801,0.000186,0.004801,-0.00037,0.005776,-0.00037,0.005776
3,0.00026,0.047457,0.00026,0.047457,0.005704,0.08043,0.005704,0.08043,0.002415,0.026658,...,0.000218,0.005806,-0.000199,0.002249,-0.000171,0.002227,-0.000251,0.004721,-0.000248,0.004711
4,-0.004883,0.06475,-0.004883,0.06475,-0.003696,0.052247,-0.003481,0.052322,-0.000277,0.010259,...,0.000224,0.006013,-0.000065,0.004526,-0.000065,0.004526,-0.000029,0.004241,-0.000029,0.004241
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.004709,0.056117,0.004709,0.056117,-0.002138,0.048289,-0.001883,0.047091,0.001769,0.030069,...,-0.000324,0.00598,0.000754,0.006135,0.000754,0.006135,-0.000226,0.005058,-0.000227,0.005056
96,0.004261,0.055227,0.004261,0.055227,-0.005041,0.044788,-0.005041,0.044788,-0.001081,0.024972,...,-0.000478,0.006782,0.000289,0.008044,0.000289,0.008044,-0.000761,0.006999,-0.000761,0.006999
97,0.013068,0.083759,0.013068,0.083759,-0.012295,0.056916,-0.0123,0.056875,0.000502,0.020935,...,-0.00089,0.005765,0.001962,0.016311,0.001962,0.016311,0.000222,0.004062,0.000222,0.004062
98,0.002128,0.03614,0.002128,0.03614,-0.008149,0.058739,-0.008149,0.058739,-0.000223,0.030546,...,-0.000958,0.006083,0.00036,0.005277,0.00036,0.005277,-0.000699,0.004796,-0.000699,0.004796


In [24]:
# Write the result table to a CSV file in the current working directory; with
# pandas' defaults the row index is written as the first column.
data.to_csv("Bias_RMSE.csv")