In [1]:
import pandas as pd
import numpy as np
from pygobnilp.gobnilp import Gobnilp
import pgmpy
import itertools
from pgmpy.estimators.CITests import pearsonr
from pgmpy.estimators.CITests import chi_square
from pgmpy.estimators import PC
from pgmpy.base import DAG
from pgmpy.independencies import Independencies
import time
import collections

In [2]:
def csv_format_discrete(csv_file):
    '''
    Load a whitespace-delimited discrete dataset into a pandas dataframe.

    The row whose index label is 0 (the first line after the header) is dropped.
    NOTE(review): presumably that line holds the per-variable arities of the
    Gobnilp data format rather than an observation -- confirm against the .dat files.

    Parameters:
    csv_file -> path or file-like object accepted by pandas.read_csv

    Returns:
    pandas dataframe without the row labelled 0 (index labels therefore start at 1)
    '''
    # raw string: "\s" inside a normal string literal is an invalid escape sequence
    df = pd.read_csv(csv_file, sep=r"\s+")
    return df.drop([0])

#returns the csv_file in a pandas dataframe, formatted properly, discrete dataset only

In [3]:
#small network : n <= 20 nodes, medium network: 20 <= n <= 50 , large: 50 <= n <= 100, ...
# NOTE(review): the paths below are absolute paths on the author's machine;
# adjust them before re-running this notebook elsewhere.
# asia is loaded as the small network and alarm as the medium one.
df_small = csv_format_discrete(r"C:\Users\User\Documents\GitHub\ML_FYP\dataset\asia_10000.dat")
df_medium = csv_format_discrete(r"C:\Users\User\Documents\GitHub\ML_FYP\dataset\alarm_10000.dat")
# no large network has been selected yet
# df_large = csv_format_discrete(r"")

### chi-squared test

In [4]:
# Preview the small dataset (the index starts at 1 because csv_format_discrete drops row 0).
df_small

Unnamed: 0,One,Two,Three,Four,Five,Six,Seven,Eight
1,1,1,0,0,0,1,0,1
2,1,0,0,0,0,0,1,0
3,0,0,1,0,0,0,1,0
4,1,0,1,0,0,0,1,0
5,1,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...
9996,0,0,1,0,0,0,0,0
9997,1,0,1,0,0,0,1,1
9998,0,0,0,0,0,0,1,0
9999,1,0,0,0,0,0,1,0


In [5]:
#0th order chi2 test
def chi2bool(df, rho):
    '''
    0th order CI test: chi-squared test with an empty conditioning set on every
    unordered pair of columns.

    Parameters:
    df -> pandas dataframe
    rho -> significance level, only accept 0 <= rho <= 1.0
           (not validated here; passed straight through to chi_square)

    Returns:
    (true_0, false_0) -> two lists of (X, Y, result) tuples. true_0 holds the pairs
    whose test result is truthy, false_0 all other pairs. Per pgmpy's documented
    convention for boolean CI tests, a truthy result means independence was not rejected.
    '''
    true_0, false_0 = [], []
    for i, j in itertools.combinations(list(df), 2):
        chi = chi_square(X=i, Y=j, Z=[], data=df, significance_level=rho)
        # Classify on the test result itself. A membership test such as
        # `True in (i, j, chi)` also matches a column labelled 1 (True == 1).
        if chi:
            true_0.append((i, j, chi))
        else:
            false_0.append((i, j, chi))
    return true_0, false_0

### Pearson's Product Moment Correlation Coefficient

In [6]:
df_gaus = pd.read_csv(r"C:\Users\User\Documents\GitHub\ML_FYP\dataset\gaussian.dat", sep="\s+")
df_gaus

Unnamed: 0,A,B,C,D,E,F,G
0,1.113083,1.932164,7.074806,8.660411,0.881591,24.719501,9.216382
1,-0.247948,11.334343,24.347372,23.355432,7.040113,36.812996,3.678833
2,1.854508,3.032020,11.086473,11.055891,3.834530,22.017182,2.424513
3,0.833911,3.857970,11.224775,11.937471,1.005624,23.285643,6.085473
4,0.488614,4.512613,10.000476,12.537179,4.084746,24.537596,5.117575
...,...,...,...,...,...,...,...
4995,0.625860,1.587941,5.952431,8.516822,4.894385,16.950123,2.558535
4996,0.241723,-0.497071,1.695681,5.563204,7.454081,21.165266,4.867241
4997,2.527720,-2.706339,0.984988,1.597447,3.692427,18.808892,5.598200
4998,1.484585,1.468603,7.206672,8.486116,2.611856,21.528922,5.106188


In [7]:
#0th order Pearson correlation test
def PMCC(df, rho):
    '''
    0th order CI test: Pearson correlation test with an empty conditioning set on
    every unordered pair of columns.

    Parameters:
    df -> pandas dataframe
    rho -> significance level, only accept 0 <= rho <= 1.0
           (not validated here; passed straight through to pearsonr)

    Returns:
    (true_0, false_0) -> two lists of (X, Y, result) tuples. true_0 holds the pairs
    whose test result is truthy, false_0 all other pairs. Per pgmpy's documented
    convention for boolean CI tests, a truthy result means independence was not rejected.
    '''
    true_0, false_0 = [], []
    for i, j in itertools.combinations(list(df), 2):
        R = pearsonr(X=i, Y=j, Z=[], data=df, significance_level=rho)
        # Classify on the test result itself. A membership test such as
        # `True in (i, j, R)` also matches a column labelled 1 (True == 1).
        if R:
            true_0.append((i, j, R))
        else:
            false_0.append((i, j, R))
    return true_0, false_0

### Sorting Functions

In [8]:
def create_permutations(my_list):
    '''
    Reduce the entries of my_list to their first two items, de-duplicate them
    regardless of orientation, and return both orientations of every pair.

    The result lists each unique pair in sorted order first, followed by the
    reversed pairs in the same sequence.
    '''
    canonical = list({tuple(sorted(entry[:2])) for entry in my_list})
    mirrored = [(pair[1], pair[0]) for pair in canonical]
    return canonical + mirrored

In [9]:
#given a list of pairs, keep one sorted copy of each pair so that (X, Y) and (Y, X) are not both listed
def remove_permutations(list_):
    '''
    Collapse the entries of list_ to unique unordered pairs.

    Only the first two items of every entry are used; each pair is stored in
    sorted order and the resulting list of tuples is itself sorted.
    '''
    unique_pairs = {tuple(sorted(entry[:2])) for entry in list_}
    ordered = list(unique_pairs)
    ordered.sort()
    return ordered

### 1st order CI

In [10]:
#1st order CI
def cond_1_generate(df, rho):
    '''
    Generate the (X, Y, Z) triples to be tested by the 1st order CI test.

    Pairs (X, Y) that chi2bool reports in its first (truthy) list are left out, and
    every remaining pair is kept in one orientation only (the sorted one), combined
    with each possible third column Z.

    Parameters:
    df -> pandas dataframe
    rho -> significance level forwarded to chi2bool

    Returns:
    list of 3-tuples (X, Y, Z) of column labels
    '''
    # A set makes the per-triple membership check constant time instead of a list scan.
    order_0 = set(create_permutations(chi2bool(df, rho)[0]))
    generated = [x for x in itertools.permutations(list(df), 3) if x[:2] not in order_0]
    # Sorted form of every remaining pair; a triple survives only if its own
    # (X, Y) prefix is already in that sorted form.
    generated_0 = set(tuple(sorted(items[:2])) for items in generated)
    return [x for x in generated if x[:2] in generated_0]

In [11]:
def cond_1_test(df, rho):
    '''
    1st order CI test: chi-squared test of X and Y given a single variable Z for
    every triple produced by cond_1_generate.

    Parameters:
    df -> pandas dataframe
    rho -> significance level forwarded to cond_1_generate and chi_square

    Returns:
    (true_list, false_list) -> two lists of (X, Y, Z, result) tuples, split by
    whether the test result is truthy.
    '''
    true_list, false_list = [], []
    for i, j, k in cond_1_generate(df, rho):
        chi2 = chi_square(X=i, Y=j, Z=[k], data=df, significance_level=rho)
        # Classify on the test result itself. `True in (i, j, k, chi2)` would
        # also match a column labelled 1 (True == 1).
        if chi2:
            true_list.append((i, j, k, chi2))
        else:
            false_list.append((i, j, k, chi2))
    return true_list, false_list

### CONDITIONAL MAIN CODE

In [12]:
def PC_(df, n, rho):
    '''
    Collect every pair of variables found independent by chi-squared CI tests of
    increasing order.

    Note:
    This function will start from at least 0th order CI (taken from chi2bool).
    Then, for every N with 3 <= N <= n, each pair that is still dependent is tested
    given every set of N - 2 other variables (N = 3 is the 1st order test).
    With n < 3 only the 0th order result is returned.

    Each unordered pair is tested once per conditioning set, and testing of a pair
    stops as soon as one conditioning set yields independence. This assumes the CI
    test is symmetric in X/Y and in the order of Z, in which case the returned
    pairs are the same as when every ordering is tested.

    Parameters:
    df (pandas dataframe)
    n an integer, the stopping point of the loop (largest N = 2 + size of the conditioning set)
    rho (the significance level, only accepts values between 0 and 1 inclusive)

    Returns:
    A sorted list which contains every independent X and Y, one sorted (X, Y)
    tuple per unordered pair
    '''
    v = list(df)
    # Pairs already judged independent, stored in sorted orientation for O(1) lookup.
    independent = {tuple(sorted(x[:2])) for x in chi2bool(df, rho)[0]}
    for N in range(3, n + 1):
        # Pairs found at this order are merged only after the order is complete,
        # so every pair in one order is judged against the same starting set.
        found = set()
        for x, y in itertools.combinations(v, 2):
            pair = tuple(sorted((x, y)))
            if pair in independent:
                continue
            others = [c for c in v if c != x and c != y]
            for z in itertools.combinations(others, N - 2):
                # Use the test result directly; `True in (x, y, z, result)` would
                # also match a column labelled 1.
                if chi_square(X=x, Y=y, Z=list(z), data=df, significance_level=rho):
                    found.add(pair)
                    break
        independent |= found
    return sorted(independent)  # returns 1 set of permutations

#ON MEDIUM SIZED NETWORKS:
#1st order CI takes 2minutes to run
#2nd order CI 9-15minutes to run
#nP5 takes ??? minutes to run
#nP6 not doable

### small network

In [None]:
%%time
# Learn a network, run the 0th order chi-squared test on every learned adjacency,
# and register each pair with a truthy test result as an obligatory independence on
# the model. With N = 1 there is a single round, so the constraints added at the end
# are not followed by another learn call inside this cell.
n = 0
N = 1  # number of learn/test rounds
chi_list = []  # (X, Y, result) for every adjacency tested so far, across rounds
m = Gobnilp()
while n < N:
    m.learn(r'C:\Users\User\Documents\GitHub\ML_FYP\dataset\alarm_10000.dat')
    # Adjacencies selected by the solver. Solver values are floats that are only
    # integral up to a tolerance, so use a threshold rather than `== 1.0`.
    empty_list = [list(pair) for pair, var in m.adjacency.items() if var.X >= 0.9]
    #chi2 test
    for i, j in empty_list:
        chi2 = chi_square(X=i, Y=j, Z=[], data=df_medium, significance_level=0.05)
        chi_list.append((i, j, chi2))
    # Select on the test result itself rather than `True in x`, which also
    # compares True against the two node labels.
    true_list = [x[:2] for x in chi_list if x[2]]
    for i, j in true_list:
        m.add_obligatory_independence([i], [j])
    n += 1

In [None]:
# Inspect the model's adjacency mapping (node pair -> solver variable).
m.adjacency

In [None]:
# Edges selected by the solver, each as a two-element list [u, v].
# A value of (approximately) 1 on an adjacency variable implies there is an edge
# between the nodes; the threshold tolerates values that are not exactly 1.0.
k_list = [list(pair) for pair, var in m.adjacency.items() if var.X >= 0.9]
N = len(k_list)

X_adjacent = []   # (edge, neighbours of edge[0] other than edge[1], count)
Y_adjacent = []   # (edge, neighbours of edge[1] other than edge[0], count)
XY_adjacent = []  # (edge, de-duplicated union of both neighbour lists, count)
for edge in k_list:
    t_0, t_1 = edge[0], edge[1]

    ##X##
    # Other edges touching t_0 (but not t_1), with t_0 itself removed. Node labels
    # are compared with != : a substring test such as `t_0 not in label` drops a
    # neighbour whose name merely contains t_0's name, and fails for non-string labels.
    empty_X = [[label for label in other if label != t_0]
               for other in k_list if t_0 in other and t_1 not in other]
    X_adjacent.append((edge, empty_X, len(empty_X)))

    ##Y##
    empty_Y = [[label for label in other if label != t_1]
               for other in k_list if t_1 in other and t_0 not in other]
    Y_adjacent.append((edge, empty_Y, len(empty_Y)))

    ##XY##
    # Union of both neighbourhoods; going through a set removes duplicates, so the
    # order of the combined list is arbitrary.
    h1 = set(tuple(x) for x in empty_X + empty_Y)
    h2 = [list(x) for x in h1]
    XY_adjacent.append((edge, h2, len(h2)))

In [None]:
# Keep the neighbour list (middle field) of every XY_adjacent entry, skipping any
# list that contains an empty neighbour entry.
empty = [neighbours for _, neighbours, _ in XY_adjacent if [] not in neighbours]

In [None]:
# Show the filtered neighbour lists.
empty

In [None]:
# Print the combined neighbour list of every edge, one edge per line.
for entry in XY_adjacent:
    print(entry[1])

In [None]:
%%time
# Learn a network, run the 0th order chi-squared test on every learned adjacency,
# and register each pair with a truthy test result as an obligatory independence on
# the model. With N = 1 there is a single round, so the constraints added at the end
# are not followed by another learn call inside this cell.
n = 0
N = 1  # number of learn/test rounds
chi_list = []  # (X, Y, result) for every adjacency tested so far, across rounds
m = Gobnilp()
while n < N:
    m.learn(r'C:\Users\User\Documents\GitHub\ML_FYP\dataset\asia_10000.dat')
    # Adjacencies selected by the solver. Solver values are floats that are only
    # integral up to a tolerance, so use a threshold rather than `== 1.0`.
    empty_list = [list(pair) for pair, var in m.adjacency.items() if var.X >= 0.9]
    #chi2 test
    for i, j in empty_list:
        chi2 = chi_square(X=i, Y=j, Z=[], data=df_small, significance_level=0.05)
        chi_list.append((i, j, chi2))
    # Select on the test result itself rather than `True in x`, which also
    # compares True against the two node labels.
    true_list = [x[:2] for x in chi_list if x[2]]
    for i, j in true_list:
        m.add_obligatory_independence([i], [j])
    n += 1

In [None]:
# chi2bool returns (true_0, false_0); hold it as a two-element list for display.
new_list = list(chi2bool(df_small, 0.4))
new_list

In [None]:
%%time
# Forbid an adjacency between every pair that the 0th order test (rho = 0.4)
# places in its first result list, then learn the asia network.
m_small = Gobnilp()
chi_list = [(first, second) for first, second, _ in chi2bool(df_small, 0.4)[0]]
for pair in chi_list:
    m_small.add_forbidden_adjacency(pair)
m_small.learn(r'C:\Users\User\Documents\GitHub\ML_FYP\dataset\asia_10000.dat')

In [None]:
# Adjacency mapping of the model that was learned with forbidden adjacencies.
m_small.adjacency

In [None]:
# Learn the asia network with a fresh model; no constraints are added before learn
# (presumably the unconstrained baseline for comparison -- confirm).
s_small = Gobnilp()
s_small.learn(r'C:\Users\User\Documents\GitHub\ML_FYP\dataset\asia_10000.dat')

### medium-sized network

In [None]:
# Fresh Gobnilp model for the medium-sized network; it is configured and learned in later cells.
m_medium = Gobnilp()

In [None]:
# 0th order chi-squared test on the medium dataset at the 0.05 level;
# displays the (true_0, false_0) result lists.
chi2bool(df_medium, 0.05)

In [None]:
%%time 
# PC_ is called with n = 2, so its loop (which starts at N = 3) does not run and
# only the 0th order result is returned.
y_medium = PC_(df_medium, 2, 0.05)
for i, j in y_medium:
    # `m_medium.a(...)` is not a Gobnilp method; register the independence with the
    # same call the other cells use for ([X], [Y]) pairs.
    m_medium.add_obligatory_independence([i], [j])
y_medium

In [None]:
%%time
# Learn the alarm network with m_medium, subject to whatever constraints were
# registered on it in earlier cells.
m_medium.learn(r'C:\Users\User\Documents\GitHub\ML_FYP\dataset\alarm_10000.dat')

In [None]:
%%time
# Call learn again on m_medium, starting from the 'MIP solution' stage.
# NOTE(review): presumably this resumes from the result of the previous learn call
# instead of re-reading the data -- confirm against the pygobnilp documentation for `start`.
m_medium.learn(start='MIP solution')

In [None]:
# Fresh Gobnilp model with no constraints.
m0 = Gobnilp()

In [None]:
%%time
# Learn the asia network with m0.
m0.learn(r'C:\Users\User\Documents\GitHub\ML_FYP\dataset\asia_10000.dat')

In [None]:
%%time
# NOTE(review): `m` is not created in this section; this relies on the model left in
# the kernel by the small-network cells above.
# Re-run learn on it from the 'MIP solution' stage with palim=10
# (presumably the parent-set size limit -- confirm against the pygobnilp documentation).
m.learn(start='MIP solution', palim=10)

In [None]:
# List the contents of m.forbidden_arrows.
(list(m.forbidden_arrows))

### PC Algorithm

In [None]:
# Build pgmpy's PC estimator on the small dataset.
# NOTE(review): `c` is not used again in the cells visible here.
c = PC(data=df_small)

In [None]:
%%time
# Constrain a fresh model with every pair from the first result list of the
# 0th order test (rho = 0.01), then learn the alarm network with palim=5.
s = Gobnilp()
V = [(first, second) for first, second, _ in chi2bool(df_medium, 0.01)[0]]
V_0 = [pair[0] for pair in V]
V_1 = [pair[1] for pair in V]
for x, y in V:
    s.add_obligatory_independence([x], [y])
s.learn(r'C:\Users\User\Documents\GitHub\ML_FYP\dataset\alarm_10000.dat', palim=5)
#start 1500

In [None]:
# Inspect s.forbidden_arrows after learning.
s.forbidden_arrows

In [None]:
%%time
# Learn the asia network once more with a fresh model and no constraints (timed by %%time).
kprime = Gobnilp()
kprime.learn(r'C:\Users\User\Documents\GitHub\ML_FYP\dataset\asia_10000.dat')