In [287]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import warnings

from itertools import product

In [288]:
def create_factor(variable: str, parent_list: list):
    """
    Enumerates every joint assignment of a binary child node and its binary parents.

    Arguments:
    variable (string): The name of the child node, e.g. 'Xi'
    parent_list (list): A list of strings naming the parent nodes, e.g. ['Xj', 'Xk']

    Returns:
    pandas.DataFrame: One row per 0/1 assignment, with the parent columns first
    (in the order given) and the child column last
    """
    columns = parent_list + [variable]
    # product([0, 1], ...) yields the rows in lexicographic order, 2**len(columns) in total
    assignments = list(product([0, 1], repeat=len(columns)))
    return pd.DataFrame(assignments, columns=columns)

def cpd_factor(variable: str, parent_list: list, alpha: float):
    """
    Builds the conditional probability table of a binary node as a factor.

    The table encodes P(child = 1 | parents) = 1 - alpha ** (1 + k), where k is the
    number of parents equal to 1, and P(child = 0 | parents) as its complement.
    A node without parents is the special case k = 0, so P(child = 1) = 1 - alpha
    and P(child = 0) = alpha.

    Arguments:
    variable (string): The name of the child node
    parent_list (list): The names of the parent nodes (may be empty)
    alpha (float): The alpha parameter

    Returns:
    pandas.DataFrame: A factor with the parent columns, the child column and a
    'prob' column holding the factor value of each assignment
    """
    factor = create_factor(variable, parent_list)

    # Number of parents set to 1 in each row; an empty parent list sums to 0
    active_parents = factor[parent_list].sum(axis=1)
    prob_child_is_one = 1 - alpha ** (1 + active_parents)

    # Keep P(child=1 | parents) on rows where the child is 1 and use the complement
    # elsewhere. Assigning the result back (instead of a chained in-place `where`)
    # guarantees the column is actually updated regardless of pandas copy semantics.
    factor["prob"] = prob_child_is_one.where(factor[variable] == 1, 1 - prob_child_is_one)
    return factor



In [289]:
def factor_product(factor_1: pd.DataFrame, factor_2: pd.DataFrame):
    """
    Multiplies two factors.

    Rows are matched on the variables the two factors have in common. When the
    scopes are disjoint there is nothing to match on, so every row of one factor
    is paired with every row of the other (Cartesian product).

    Arguments:
    factor_1 (pandas.DataFrame): a factor with variable columns and a 'prob' column
    factor_2 (pandas.DataFrame): a factor with variable columns and a 'prob' column

    Returns:
    new_factor (pandas.DataFrame): A factor over the union of both scopes whose
    'prob' column is the product of the matched factor values
    """
    # Variables present in both scopes; 'prob' holds values, not a variable
    shared_nodes = [
        column for column in factor_1.columns
        if column in factor_2.columns and column != "prob"
    ]

    if shared_nodes:
        new_factor = factor_1.merge(factor_2, how="inner", on=shared_nodes)
    else:
        # Joining on an empty key list is an error in pandas; disjoint scopes need a cross join
        new_factor = factor_1.merge(factor_2, how="cross")

    # Both inputs carry 'prob', so the merge exposes them as prob_x / prob_y
    new_factor["prob"] = new_factor["prob_x"] * new_factor["prob_y"]
    return new_factor.drop(columns=["prob_x", "prob_y"])

In [290]:
# From algorithm 9.1 in the book (p. 298)
def sum_product_eliminate_var(factors : list, var : str, print_info = False):
    """
    Eliminates one variable: multiplies all factors that mention it and sums it out.

    Arguments:
    factors (list): The factors currently in the network, as (pandas.DataFrame, name) tuples
    var (str): The name of the variable to be eliminated
    print_info (bool): If True, print the columns and names of the factors that mention var

    Returns:
    list: (factor, name) tuples for the factors that do not mention var, followed by
    the new factor ('tau_' + var) obtained by summing var out of the product of the
    factors that do. If no factor mentions var the input factors are returned
    unchanged. If the product only contains var, the new factor is a single-row
    table with just a 'prob' column holding the total.
    """
    # Split the factors by whether the variable is in their scope
    in_scope = []
    out_of_scope = []
    for factor, name in factors:
        if var in factor.columns:
            in_scope.append((factor, name))
        else:
            out_of_scope.append((factor, name))

    # Debugging prints
    if print_info:
        for factor, _ in in_scope:
            print(factor.columns)
        print([name for _, name in in_scope])

    # Nothing mentions the variable, so there is nothing to multiply or sum out
    if not in_scope:
        return out_of_scope

    # Multiply all factors with the variable in scope
    joint = in_scope[0][0]
    for factor, _ in in_scope[1:]:
        joint = factor_product(joint, factor)

    # Variables that survive the marginalization
    remaining_vars = [column for column in joint.columns if column not in (var, "prob")]

    # Marginalize the variable / sum out variable
    if remaining_vars:
        tau = joint.groupby(remaining_vars, as_index=False)["prob"].sum()
    else:
        # groupby cannot group on zero keys; the result is just the grand total
        tau = pd.DataFrame({"prob": [joint["prob"].sum()]})

    return out_of_scope + [(tau, "tau_" + str(var))]

In [291]:
def sum_product_ve(factors: list, order: list):
    """
    Sum-product variable elimination: eliminates every variable in `order`, one
    after the other, and multiplies whatever factors are left into a single factor.

    Arguments:
    factors (list): list of (factor, name) tuples
    order (list): names of the variables to eliminate, in elimination order

    Returns:
    list: a one-element list [(factor, 'Sum-Product')] holding the product of the
    factors that remain after all eliminations
    """
    remaining = factors
    for variable in order:
        remaining = sum_product_eliminate_var(remaining, variable)

    # Only the tables matter from here on; the names are replaced by a fixed label
    tables = [entry[0] for entry in remaining]

    combined = tables[0]
    for table in tables[1:]:
        combined = factor_product(combined, table)

    return [(combined, 'Sum-Product')]

In [292]:
def cond_prob_ve (factors: list, queries: list, evidence: list, order: list):
    """
    Computes the conditional distribution of the query variables given the evidence
    by variable elimination. Corresponds to Algorithm 9.2 in PGM.

    Arguments:
    factors (list): list of (factor, name) tuples
    queries (list): names (strings) of the variables to be inferred
    evidence (list): list of (name, value) tuples, e.g. ('X1', 1) meaning X1 = 1
    order (list): names of the variables in elimination order; query and evidence
                  variables listed here are skipped. The list is not modified.

    Returns:
    alpha (pd.DataFrame): table with the query variables and a 'prob' column holding
                          their normalized conditional distribution given the evidence
    phi (list of tuples): the unnormalized result of sum_product_ve, a one-element
                          list of the form [(pd.DataFrame, name)]
    """
    # Restrict every factor to the rows that agree with the observed values
    restricted = []
    for table, label in factors:
        for observed_name, observed_value in evidence:
            if observed_name in table.columns:
                table = table[table[observed_name] == observed_value]
        restricted.append((table, label))

    # Work on a copy so the caller's elimination order is left untouched
    remaining_order = order.copy()

    # Evidence and query variables must survive, so take them out of the order
    protected = [entry[0] for entry in evidence] + list(queries)
    for name in protected:
        if name in remaining_order:
            remaining_order.remove(name)

    phi = sum_product_ve(restricted, remaining_order)

    # Sum out everything except the queries, then normalize to a distribution
    unnormalized = phi[0][0]
    posterior = unnormalized.groupby(queries, as_index = False)['prob'].sum()
    posterior['prob'] = posterior['prob'] / posterior['prob'].sum()

    return posterior, phi

# Test

In [293]:
# Parameter passed to cpd_factor for every node; there it sets
# P(X=1 | parents) = 1 - alpha ** (1 + number of parents equal to 1).
alpha = 0.5

In [294]:
# The scope of each factor consists of the node itself and its parents.
# Note: this list is reassigned in a later cell before it is passed to cond_prob_ve.
variables_to_eliminate = ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9']
# One (node, parents) tuple per node; each tuple is turned into a factor by
# cpd_factor(node, parents, alpha). An empty parent list marks a node without parents.
elimination_scope = [("X1", ['X7', 'X8']),("X2", ['X4', 'X8']),("X3", ['X9', 'X8']),
                     ("X5", ['X9', 'X8']),("X4", ['X11', 'X8', 'X9']),("X6", ['X7', 'X10']),
                     ("X7", []),("X8", []),("X9", ['X10', 'X6']), ("X10", ['X7']),("X11", ['X8', 'X9'])]


In [295]:
# Build one factor per node from its (node, parents) entry.
# Resulting list form: [(factor, 'phi_<node>'), ... ]
factor_list = [
    (cpd_factor(node, parents, alpha), 'phi_' + str(node))
    for node, parents in elimination_scope
]

In [296]:
#print(sum_product_ve(factor_list, variables_to_eliminate))

In [297]:
# evidence - observed values e for the variables in E, as (name, value) tuples
# Alternative evidence sets (uncomment one and change the values to your preference):
#evidence = [('X1', 1), ('X2',1), ('X3',1),('X4',1),("X5",1),('X6',1),('X7',1),('X8',1),('X9',1)] # Change the values to your preference.
evidence = [('X8',0),('X9',0)]
#evidence = [('X7',0)]

# order - elimination order.
# All nodes may be listed here: cond_prob_ve removes the query and evidence
# variables from a copy of this list before eliminating anything.
variables_to_eliminate = ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7','X8', 'X9', 'X10', 'X11']

# query_var - Set of query variables Y to be inferred
query_var = ['X11']
#query_var = ['X10']

# Returns (normalized distribution over query_var, unnormalized sum-product result)
cond_prob_ve(factor_list, query_var, evidence, variables_to_eliminate)

(   X11  prob
 0    0   0.5
 1    1   0.5,
 [(   X8  X9  X11      prob
   0   0   0    0  0.056152
   1   0   0    1  0.056152,
   'Sum-Product')])