In [1]:
import json
import numpy as np
import pandas as pd
import itertools

from collections import defaultdict
from os.path import join as path_join
from tqdm.autonotebook import tqdm


def calculate_nodes_probs(bn_name_path):
    graph = json.loads(open(path_join(bn_name_path, 'graph.json'), 'r').read())
    nodes_probs = defaultdict(defaultdict)
    for node in graph:
        parent_nodes = node['parents']
        
        if not parent_nodes:
            probs = pd.read_csv(path_join(bn_name_path, node['name'] + ".csv")).set_index(
                node['name']).iloc[:, 0].to_dict()
            nodes_probs[node['name']] = probs
        elif len(parent_nodes) == 1:
            conditional_probs = pd.read_csv(path_join(bn_name_path, node['name'] + ".csv"))
            conditional_probs[parent_nodes] = conditional_probs[parent_nodes].astype(
                {col: 'str' for col in parent_nodes}
            )
            conditional_probs.set_index(parent_nodes[0], inplace=True)
            for value in node['values']:
                nodes_probs[node['name']][value] = 0
                for parent_value in list(nodes_probs[parent_nodes[0]].keys()):
                    nodes_probs[node['name']][value] += \
                    conditional_probs[value][parent_value] * nodes_probs[parent_nodes[0]][parent_value]
        else:
            conditional_probs = pd.read_csv(path_join(bn_name_path, node['name'] + ".csv"))
            conditional_probs[parent_nodes] = conditional_probs[parent_nodes].astype(
                {col: 'str' for col in parent_nodes}
            )
            conditional_probs.set_index(parent_nodes, inplace=True)
            for value in node['values']:
                nodes_probs[node['name']][value] = 0
                for parent_node_values in list(conditional_probs.index):
                    parent_probs = [
                        nodes_probs[p_node][p_node_value] for \
                        (p_node, p_node_value) in list(zip(parent_nodes, parent_node_values))
                    ]
                    nodes_probs[node['name']][value] += np.prod(parent_probs) * conditional_probs[value][parent_node_values]
    
    nodes_probs = {
        k: {
            value_name: round(value, 4) for value_name, value in proba_dist.items() 
        } for k, proba_dist in nodes_probs.items()
    }
    return nodes_probs

def calculate_joint_probs(bn_name_path):
    graph = json.loads(open(path_join(bn_name_path, 'graph.json'), 'r').read())
    nodes = list([node['name'] for node in graph])
    node2parents = {node['name']: node['parents'] for node in graph}
    nodes_all_states = list(itertools.product(*[
        node['values'] for node in graph
    ]))
    joint_probs = pd.DataFrame(data=nodes_all_states, columns=nodes)
    joint_probs["Probability"] = 1
    for i in tqdm(range(len(joint_probs))):
        nodes_states_set = list(zip(nodes,list(joint_probs[nodes].iloc[i].values)))
        for (node, node_state) in nodes_states_set:
            parent_nodes = node2parents[node]
            if (len(parent_nodes) == 0):
                probs = pd.read_csv(path_join(bn_name_path, node + ".csv")).set_index(node)
                joint_probs["Probability"].iloc[i] *= probs["Probability"][node_state]
            elif (len(parent_nodes) == 1):
                parent_node = parent_nodes[0]
                parent_node_State = joint_probs[parent_node].iloc[i]
                probs = pd.read_csv(path_join(bn_name_path, node + ".csv"))
                if (probs[parent_node].values.dtype == bool):
                    probs[parent_node] = list(map(str, probs[parent_node]))
                probs.set_index(parent_node, inplace=True)
                joint_probs["Probability"].iloc[i] *= probs[node_state][parent_node_State]
            elif (len(parent_nodes) > 1):
                parent_nodes_State = joint_probs[parent_nodes].iloc[i]
                probs = pd.read_csv(path_join(bn_name_path, node + ".csv"))
                for col_name in parent_nodes:
                    if (probs[col_name].values.dtype == bool):
                        probs[col_name] = list(map(str, probs[col_name]))
                probs.set_index(parent_nodes, inplace=True)
                joint_probs["Probability"].iloc[i] *= probs.loc[tuple(parent_nodes_State), node_state]
    joint_probs.index = range(1, len(joint_probs) + 1)
    joint_probs.Probability = joint_probs.Probability.apply(lambda x: round(x, 6))
    return joint_probs

  from tqdm.autonotebook import tqdm


In [2]:
nodes_probs = calculate_nodes_probs("animal")
print(json.dumps(nodes_probs))
with open(path_join('results', 'animal_nodes_probs.json'), 'w') as f:
    f.write(json.dumps(nodes_probs))

{"Animal": {"Monkey": 0.2, "Penguin": 0.2, "Platypus": 0.2, "Robin": 0.2, "Turtle": 0.2}, "Environment": {"Air": 0.1, "Land": 0.5, "Water": 0.4}, "HasShell": {"True": 0.2, "False": 0.8}, "BearsYoungAs": {"Live": 0.2, "Eggs": 0.8}, "Class": {"Bird": 0.4, "Mammal": 0.4, "Reptile": 0.2}, "WarmBlooded": {"True": 0.8, "False": 0.2}, "BodyCovering": {"Fur": 0.4, "Feathers": 0.4, "Scales": 0.2}}


In [3]:
joint_probs = calculate_joint_probs('animal')

joint_probs[joint_probs.Probability > 0]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1080.0), HTML(value='')))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  joint_probs["Probability"].iloc[i] *= probs["Probability"][node_state]





Unnamed: 0,Animal,Environment,HasShell,BearsYoungAs,Class,WarmBlooded,BodyCovering,Probability
115,Monkey,Land,False,Live,Mammal,True,Fur,0.2
344,Penguin,Land,False,Eggs,Bird,True,Feathers,0.1
416,Penguin,Water,False,Eggs,Bird,True,Feathers,0.1
637,Platypus,Water,False,Eggs,Mammal,True,Fur,0.2
704,Robin,Air,False,Eggs,Bird,True,Feathers,0.1
776,Robin,Land,False,Eggs,Bird,True,Feathers,0.1
972,Turtle,Land,True,Eggs,Reptile,False,Scales,0.1
1044,Turtle,Water,True,Eggs,Reptile,False,Scales,0.1


In [4]:
joint_probs.to_csv(path_join('results', 'animal_join_probs.csv'), index=False)

In [5]:
nodes_probs = calculate_nodes_probs("asia")
print(json.dumps(nodes_probs))
with open(path_join('results', 'asia_nodes_probs.json'), 'w') as f:
    f.write(json.dumps(nodes_probs))

{"VisitAsia": {"Visit": 0.01, "NoVisit": 0.99}, "Smoking": {"Smoking": 0.5, "NoSmoking": 0.5}, "Tuberculosis": {"Present": 0.0104, "Absent": 0.9896}, "LungCancer": {"Present": 0.055, "Absent": 0.945}, "Tb_or_Ca": {"True": 0.0648, "False": 0.9352}, "XRay": {"Abnormal": 0.1103, "Normal": 0.8897}, "Bronchitis": {"Present": 0.45, "Absent": 0.55}, "Dyspnea": {"True": 0.4393, "False": 0.5607}}


In [6]:
joint_probs = calculate_joint_probs('asia')

joint_probs[joint_probs.Probability > 0].sample(10)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=256.0), HTML(value='')))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  joint_probs["Probability"].iloc[i] *= probs["Probability"][node_state]





Unnamed: 0,VisitAsia,Smoking,Tuberculosis,LungCancer,Tb_or_Ca,XRay,Bronchitis,Dyspnea,Probability
132,NoVisit,Smoking,Present,Present,True,Abnormal,Absent,False,5.8e-05
215,NoVisit,NoSmoking,Present,Absent,True,Normal,Absent,True,4.8e-05
64,Visit,Smoking,Absent,Absent,False,Normal,Absent,False,0.001462
35,Visit,Smoking,Absent,Present,True,Abnormal,Absent,True,0.00013
123,Visit,NoSmoking,Absent,Absent,False,Abnormal,Absent,True,1.6e-05
84,Visit,NoSmoking,Present,Absent,True,Abnormal,Absent,False,5.1e-05
254,NoVisit,NoSmoking,Absent,Absent,False,Normal,Present,False,0.027654
145,NoVisit,Smoking,Present,Absent,True,Abnormal,Present,True,0.002358
133,NoVisit,Smoking,Present,Present,True,Normal,Present,True,5e-06
227,NoVisit,NoSmoking,Absent,Present,True,Abnormal,Absent,True,0.002353


In [7]:
joint_probs.to_csv(path_join('results', 'asia_join_probs.csv'), index=False)