In [81]:
import itertools
import logging
logging.getLogger('pgmpy').setLevel(logging.ERROR)
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning, module="networkx.utils.backends")
import pandas as pd
import concurrent.futures
import random
from Decom_Tree import *
import numpy as np
import networkx as nx
from scipy.linalg import inv, det
from pgmpy.global_vars import logger
from pgmpy.utils import get_example_model
from pgmpy.factors.continuous import LinearGaussianCPD
from pgmpy.models import  LinearGaussianBayesianNetwork

In [2]:
def get_random_cpds(self, loc=0, scale=1, inplace=False, seed=None):
    """
    Build one randomly parameterised LinearGaussianCPD for every node of a network.

    Parameters:
        self: Network object exposing nodes(), get_parents(var) and add_cpds(*cpds).
        loc: Forwarded unchanged to LinearGaussianCPD.get_random as `loc`.
        scale: Forwarded unchanged to LinearGaussianCPD.get_random as `scale`.
        inplace (bool): If True, the CPDs are attached with self.add_cpds and
            nothing is returned; otherwise the list of CPDs is returned.
        seed (int, optional): Base seed. The i-th node (in self.nodes() order)
            is generated with seed + i. When None, the base seed is the current
            wall-clock time in milliseconds.

    Returns:
        list or None: The generated CPDs when inplace is False, else None.
    """
    # Imported locally so the default-seed path works even though the
    # notebook's import cell does not import `time` explicitly.
    import time

    if seed is None:
        seed = int(time.time() * 1000)

    # Offsetting the base seed by the node position gives every node its own
    # seed while keeping the whole draw reproducible from a single value.
    cpds = [
        LinearGaussianCPD.get_random(
            variable=var,
            evidence=self.get_parents(var),
            loc=loc,
            scale=scale,
            seed=(seed + i),
        )
        for i, var in enumerate(self.nodes())
    ]

    if inplace:
        self.add_cpds(*cpds)
    else:
        return cpds

#ecoli70 46 26 19
#magic-niab 44 9 33
#magic-irri 64 9 43
#arth150 107 82 24
#[ "ecoli70","magic-niab","magic-irri","arth150"]

In [83]:
def kl_divergence(mu1, cov1, mu2, cov2):
    """
    Compute the Kullback-Leibler (KL) divergence between two multivariate Gaussian distributions.

    The closed form used is
        0.5 * (tr(S2^-1 S1) + (m2 - m1)^T S2^-1 (m2 - m1) - d + log(det S2 / det S1)).
    The log-determinant ratio is evaluated with slogdet, because the raw
    determinants of large covariance matrices easily underflow to 0 or
    overflow to inf, and linear systems are solved instead of forming an
    explicit inverse.

    Parameters:
        mu1 (array-like): Mean vector of the first distribution.
        cov1 (array-like): Covariance matrix of the first distribution.
        mu2 (array-like): Mean vector of the second distribution.
        cov2 (array-like): Covariance matrix of the second distribution.

    Returns:
        float: KL divergence D_KL(N(mu1, cov1) || N(mu2, cov2)). NaN when the
        two determinants have opposite signs (logarithm of a negative ratio).

    Raises:
        numpy.linalg.LinAlgError: If cov2 is singular.
    """
    mu1 = np.asarray(mu1, dtype=float).ravel()
    mu2 = np.asarray(mu2, dtype=float).ravel()
    cov1 = np.atleast_2d(np.asarray(cov1, dtype=float))
    cov2 = np.atleast_2d(np.asarray(cov2, dtype=float))

    d = mu1.size  # Dimensionality of the distributions
    diff = mu2 - mu1

    # Tr(Sigma_2^-1 * Sigma_1), without building Sigma_2^-1 explicitly
    trace_term = np.trace(np.linalg.solve(cov2, cov1))
    # (mu2 - mu1)^T * Sigma_2^-1 * (mu2 - mu1)
    maha_term = diff @ np.linalg.solve(cov2, diff)

    # log(det(Sigma_2) / det(Sigma_1)) computed as a difference of log-determinants
    sign1, logdet1 = np.linalg.slogdet(cov1)
    sign2, logdet2 = np.linalg.slogdet(cov2)
    if sign1 * sign2 < 0:
        log_det_ratio = np.nan
    else:
        log_det_ratio = logdet2 - logdet1

    return float(0.5 * (trace_term + maha_term - d + log_det_ratio))


def hellinger_distance(mu1, cov1, mu2, cov2):
    """
    Compute the Hellinger distance between two multivariate Gaussian distributions.

    With S = (S1 + S2) / 2 the squared distance is
        1 - det(S1)^(1/4) * det(S2)^(1/4) / det(S)^(1/2)
              * exp(-1/8 * (m1 - m2)^T S^-1 (m1 - m2)).
    The product of determinant powers is evaluated in log space via slogdet,
    since the raw determinants of large covariance matrices underflow or
    overflow, and the squared distance is clipped to [0, 1] so that rounding
    error cannot put a negative number under the square root.

    Parameters:
        mu1 (array-like): Mean vector of the first distribution.
        cov1 (array-like): Covariance matrix of the first distribution.
        mu2 (array-like): Mean vector of the second distribution.
        cov2 (array-like): Covariance matrix of the second distribution.

    Returns:
        float: Hellinger distance H(N(mu1, cov1), N(mu2, cov2)), in [0, 1].
        NaN when any of the three determinants is negative.

    Raises:
        numpy.linalg.LinAlgError: If the averaged covariance matrix is singular.
    """
    mu1 = np.asarray(mu1, dtype=float).ravel()
    mu2 = np.asarray(mu2, dtype=float).ravel()
    cov1 = np.atleast_2d(np.asarray(cov1, dtype=float))
    cov2 = np.atleast_2d(np.asarray(cov2, dtype=float))

    # Average of the two covariance matrices
    avg_cov = (cov1 + cov2) / 2

    # (mu1 - mu2)^T * Sigma_avg^-1 * (mu1 - mu2)
    diff_mu = mu1 - mu2
    maha_term = diff_mu @ np.linalg.solve(avg_cov, diff_mu)

    sign1, logdet1 = np.linalg.slogdet(cov1)
    sign2, logdet2 = np.linalg.slogdet(cov2)
    sign_avg, logdet_avg = np.linalg.slogdet(avg_cov)
    if sign1 < 0 or sign2 < 0 or sign_avg < 0:
        # Fractional powers of a negative determinant are undefined.
        return float("nan")

    # Logarithm of the Bhattacharyya coefficient
    log_bc = 0.25 * (logdet1 + logdet2) - 0.5 * logdet_avg - 0.125 * maha_term

    # Squared Hellinger distance, guarded against tiny negative rounding error
    hellinger_squared = np.clip(1.0 - np.exp(log_bc), 0.0, 1.0)
    return float(np.sqrt(hellinger_squared))


In [99]:
# Experiment grid. The result table's columns and the loops below are both
# driven by these constants so the two can never drift apart.
DATASETS = ["ecoli70", "magic-niab", "magic-irri", "arth150"]
SAMPLE_SIZES = [500, 1000, 2500, 5000, 7500, 10000]
METRICS = ["klPQ", "hellinger"]
N_REPEATS = 100  # repetitions per (dataset, sample size) pair

columns = pd.MultiIndex.from_product([DATASETS, SAMPLE_SIZES, METRICS])

# One row per repetition (labelled 1..N_REPEATS), one column per
# (dataset, sample size, metric) combination.
result_continue = pd.DataFrame(columns=columns)

for file in DATASETS:
    print(file)
    # Only the node and edge structure of the example model is reused; the
    # parameters are redrawn at random for every repetition below.
    model = get_example_model(file)
    G = nx.DiGraph()
    G.add_nodes_from(model.nodes)
    G.add_edges_from(model.edges)
    decom = Graph_Decom(G)
    # NOTE(review): presumably a collection of node subsets ("atoms") of G;
    # confirm against Decom_Tree.
    atoms = decom.Decom()
    for sample_size in SAMPLE_SIZES:
        for row in range(1, N_REPEATS + 1):
            # Ground-truth network: same structure, fresh random parameters.
            bn = LinearGaussianBayesianNetwork()
            bn.add_nodes_from(list(G.nodes))
            bn.add_edges_from(list(G.edges))
            get_random_cpds(bn, inplace=True)
            df = bn.simulate(sample_size)

            # Network that collects the CPDs estimated atom by atom.
            learn_bn = LinearGaussianBayesianNetwork()
            learn_bn.add_nodes_from(list(G.nodes))
            learn_bn.add_edges_from(list(G.edges))

            learn_bn.cpds = []
            learned_nodes = set()
            for atom in atoms:
                # Fit a sub-network on the subgraph induced by this atom,
                # using only the atom's columns of the simulated data.
                sub_model = LinearGaussianBayesianNetwork(list(G.subgraph(list(atom)).edges))
                sub_model.add_nodes_from(atom)
                sub_model.cpds = []
                sub_model.fit(df[list(atom)])
                for node in atom:
                    if node in learned_nodes:
                        continue
                    # A node's CPD is taken from this atom only when the atom
                    # contains all of the node's parents, i.e. the parent
                    # count in the sub-network equals that in the full model.
                    if len(model.get_parents(node)) == len(sub_model.get_parents(node)):
                        learned_nodes.add(node)
                        learn_bn.add_cpds(sub_model.get_cpds(node))

            # Fail with a clear message instead of an obscure error further
            # down if some node never received a CPD.
            missing = [node for node in G.nodes if node not in learned_nodes]
            if missing:
                raise RuntimeError(
                    f"No atom contains the full parent set of: {missing}"
                )

            mean, cov = bn.to_joint_gaussian()
            mean_learn, cov_learn = learn_bn.to_joint_gaussian()
            kl_value = kl_divergence(mean, cov, mean_learn, cov_learn)
            hellinger_value = hellinger_distance(mean, cov, mean_learn, cov_learn)
            for metric_name, value in [("klPQ", kl_value), ("hellinger", hellinger_value)]:
                result_continue.loc[row, (file, sample_size, metric_name)] = round(value, 4)

print(result_continue)

ecoli70
magic-niab
    ecoli70                                                                  \
      500               1000              2500              5000              
       klPQ hellinger    klPQ hellinger    klPQ hellinger    klPQ hellinger   
1    0.1887    0.2122  0.0747    0.1358  0.0361    0.0948  0.0171    0.0652   
2    0.1388    0.1846  0.0783    0.1386  0.0334    0.0909  0.0149    0.0609   
3    0.1799    0.2067  0.0874    0.1467  0.0288    0.0846  0.0151    0.0614   
4    0.1399    0.1851  0.0994    0.1562  0.0333    0.0911  0.0154     0.062   
5    0.1802    0.2072  0.0952    0.1528  0.0293    0.0857  0.0175     0.066   
..      ...       ...     ...       ...     ...       ...     ...       ...   
96    0.173     0.203  0.0912    0.1502  0.0343    0.0921  0.0165    0.0643   
97   0.1602    0.1955  0.0804     0.141   0.039    0.0985  0.0156    0.0623   
98   0.1762    0.2064  0.0901    0.1495   0.031    0.0881  0.0188    0.0684   
99   0.1722    0.2039  0.0701    