In [None]:
import numpy as np
import pandas as pd
import scipy as sp
from scipy.stats import chi2
from math import floor
from joblib import Parallel, delayed
from sklearn.svm import LinearSVC
import random
import os
from sklearn.metrics import precision_recall_fscore_support as full_score
from sklearn.model_selection import train_test_split
%load_ext pycodestyle_magic

In [None]:
# Load the preselected target nodes from the semicolon-separated selection file.
main_directory= '/home/a20114261/alarm_datasets/Alarm10/'
save_folder='/home/a20114261/alarm_datasets/log_results/'

if not os.path.exists(main_directory):
    print("Bad routing.")

# Each entry is [node_str, balance_ranking]:
# field 0 of a line is kept as text, field 1 is parsed as a float.
preselected_nodes=[]
# The context manager closes the file even if a malformed line makes float() raise.
with open(save_folder+'mmpc_partitioned/alarm10_binary_nodes_selected.txt','r') as ff:
    for e in ff.read().split('\n'):
        spl_line = e.split(';')
        # Skip lines whose first field is empty (e.g. the trailing blank line).
        if len(spl_line[0]) == 0:
            continue
        preselected_nodes.append([spl_line[0],float(spl_line[1])])

In [None]:
# Read the true-graph matrix and label its columns 'Node1' .. 'NodeN'.
X_graph = pd.read_csv('/home/a20114261/Alarm10_graph.txt',delimiter='  ',header=None)
graph_heads = ['Node'+str(column_number) for column_number in range(1,X_graph.shape[1]+1)]
X_graph.columns = graph_heads

In [None]:
def related_nodes(X_graph,graph_heads,ind):
    """Return the nodes linked to the ``ind``-th node in an adjacency matrix.

    Two passes are made over the matrix: first down column ``ind`` (every row
    position holding a 1), then along row ``ind`` (every column position
    holding a 1). Matches are printed as they are found and returned in that
    order. A node flagged in both directions is therefore listed twice.

    Parameters
    ----------
    X_graph : pandas.DataFrame
        Adjacency matrix; it is only read, never modified. Row positions are
        mapped to names through the column labels, so the matrix is expected
        to be square.
    graph_heads : sequence of str
        Node names by position, used to name the matches of the row pass.
    ind : int
        Zero-based position of the node of interest.

    Returns
    -------
    list
        Names of the related nodes (column-pass matches first).
    """
    own_labels = X_graph.keys()
    print("Acoording to the true graph, the PC set for ",own_labels[ind],"is:")
    related = []

    # Positional access (.iloc) avoids depending on the frame's row labels;
    # integer keys on a label-indexed Series are deprecated in pandas.
    for pos, flag in enumerate(X_graph.iloc[:, ind].tolist()):
        if flag == 1:
            print(own_labels[pos])
            related.append(own_labels[pos])

    # Row ``ind`` is what the original obtained by transposing the frame and
    # reading column ``ind``; reading the row directly needs no copy.
    for pos, flag in enumerate(X_graph.iloc[ind, :].tolist()):
        if flag == 1:
            print(graph_heads[pos])
            related.append(graph_heads[pos])

    return related

In [None]:
# Gather the true PC set of every node in the graph, then show the whole list.
testing_related = []
for node_pos in range(X_graph.shape[1]):
    print("====================================================")
    pc_of_node = related_nodes(X_graph,graph_heads,node_pos)
    testing_related.append(pc_of_node)

print(testing_related)
    

In [None]:
# Print the true PC set of every node; the returned lists are discarded.
for node_pos in range(X_graph.shape[1]):
    print("====================================================")
    related_nodes(X_graph,graph_heads,node_pos)

In [None]:
%%time
alpha=0.05
separator = "======================================================================================"
print(separator)
print("True MB Evaluation")
print(separator)


# One record per (dataset file, preselected node) pair; the dataset itself is
# not read here, only its file name is attached to the node's true PC set.
PCs=[]
for filename in os.listdir(main_directory):
    print("Linking "+filename+" to the Markov Blanket of each preselected node")
    print(separator)
    # Node identifier: the text before '@' in each preselected entry.
    target_names = [entry[0].split('@')[0] for entry in preselected_nodes]
    for TargetNodeSelected in target_names:
        # 'NodeK' maps to zero-based position K-1 in the graph.
        node_position = int(TargetNodeSelected.split('Node')[1])-1
        PCs.append({
            'TargetNode': TargetNodeSelected,
            'PC': related_nodes(X_graph,graph_heads,node_position),
            'filename': filename,
        })

In [45]:
# Discard, in place, every record whose PC set is empty.
PCs[:] = [record for record in PCs if len(record['PC']) != 0]

In [53]:
%%capture
# Score every record in parallel; each record's own file is the training set.
separator = "======================================================================================"
print(separator)
print("True-MB Scoring phase")
print(separator)

scoring_jobs = (delayed(CandidateScore_Parallel)(record) for record in PCs)
PC_mean_score = Parallel(n_jobs=40)(scoring_jobs)

In [55]:
PC_mean_score[0]

{'PC': ['Node23', 'Node76', 'Node96', 'Node128'],
 'TargetNode': 'Node101',
 'class_precisions': [[[[0, 26, 0.0], [1, 974, 1.0]], 'Alarm10_s1000_v1.txt'],
  [[[0, 15, 0.0], [1, 985, 1.0]], 'Alarm10_s1000_v6.txt'],
  [[[0, 22, 0.0], [1, 978, 1.0]], 'Alarm10_s1000_v5.txt'],
  [[[0, 23, 0.0], [1, 977, 1.0]], 'Alarm10_s1000_v10.txt'],
  [[[0, 24, 0.0], [1, 976, 1.0]], 'Alarm10_s1000_v4.txt'],
  [[[0, 22, 0.0], [1, 978, 1.0]], 'Alarm10_s1000_v3.txt'],
  [[[0, 34, 0.0], [1, 966, 1.0]], 'Alarm10_s1000_v7.txt'],
  [[[0, 18, 0.0], [1, 982, 1.0]], 'Alarm10_s1000_v2.txt'],
  [[[0, 31, 0.0], [1, 969, 1.0]], 'Alarm10_s1000_v8.txt']],
 'filename': 'Alarm10_s1000_v9.txt'}

In [57]:
# Write the scored results to a log file, one ';'-separated line per record:
#   filename;TargetNode;<pc nodes, each followed by '_'>;<per-test-file scores>
# where each per-test-file score is
#   <test filename>_<class>\t<samples>\t<rate>\t ... _
from operator import itemgetter
log_directory="/home/a20114261/alarm_datasets/log_results/mmpc_partitioned"
# The context manager guarantees the log is flushed and closed even if a
# record is malformed and formatting raises part-way through.
with open(log_directory+'/scored_true-mb_log.txt','w') as ff:
    for e in sorted(PC_mean_score, key=itemgetter('filename'),reverse=True):
        pc_field = ''.join(node+'_' for node in e['PC'])
        score_field = ''
        for class_pred in e['class_precisions']:
            # class_pred is [per-class triples, test filename].
            score_field += class_pred[1]+'_'
            for arr_acc in class_pred[0]:
                score_field += str(arr_acc[0])+'\t'+str(arr_acc[1])+'\t'+str(arr_acc[2])+'\t'
            score_field += '_'
        ff.write(e['filename']+';'+e['TargetNode']+';'+pc_field+';'+score_field+'\n')

In [52]:
def CandidateScore_Parallel(PobDict):
    """Fit a LinearSVC on one dataset and score it on its sibling datasets.

    The classifier predicts ``PobDict['TargetNode']`` from the columns listed
    in ``PobDict['PC']``, using the file ``PobDict['filename']`` (inside
    ``main_directory``) as the training set. It is then evaluated on every
    other file in ``main_directory`` whose second '_'-separated name component
    matches that of the training file.

    For each test file and each target value seen in the training set, the
    recorded triple is ``[value, n_samples, rate]``: ``n_samples`` is the
    number of test rows whose target equals ``value`` and ``rate`` is the
    fraction of those rows predicted as ``value``. A value with no rows in the
    test file is recorded as ``[value, 0, 0]``.

    Parameters
    ----------
    PobDict : dict
        Must contain 'TargetNode', 'PC' and 'filename'. It is modified in
        place: the key 'class_precisions' is added.

    Returns
    -------
    dict
        The same ``PobDict``, with 'class_precisions' set to a list of
        ``[triples, test_filename]`` pairs. The list is empty if the
        classifier could not be fitted.
    """
    target = PobDict['TargetNode']
    features = PobDict['PC']

    # training and scoring
    X_train_df = pd.read_csv(main_directory+PobDict['filename'],delimiter='  ',header=None)
    X_train_df.columns=graph_heads

    Y_train = X_train_df[target]
    clf = LinearSVC()

    try:
        clf.fit(X_train_df[features],Y_train)
    except Exception as fit_error:
        # Best effort: a record that cannot be fitted gets no scores. Only
        # ordinary errors are caught, so KeyboardInterrupt/SystemExit still
        # stop the run, and the reason is reported instead of being dropped.
        print("Could not fit "+str(target)+" on "+str(PobDict['filename'])+": "+repr(fit_error))
        PobDict['class_precisions']=[]
        return PobDict

    PobDict['class_precisions']=[]

    # Scan the directory once. The [1:2] slice yields [] for names without an
    # underscore, so such entries are skipped instead of raising IndexError.
    size_tag = PobDict['filename'].split('_')[1]
    sibling_files = [name for name in os.listdir(main_directory)
                     if name.split('_')[1:2] == [size_tag]]
    values_counter = set(Y_train)

    for filename in sibling_files:
        print(len(sibling_files))
        # Never test on the training file itself.
        if (filename == PobDict['filename']):
            continue
        print("*****")
        print()
        print("Testing on dataset: "+filename)
        X_test = pd.read_csv(main_directory+filename,delimiter='  ',header=None)
        X_test.columns = graph_heads

        # get score values
        precision = []
        for val in values_counter:
            # Boolean mask instead of a string-built query expression.
            class_rows = X_test[X_test[target] == val]
            if len(class_rows) == 0:
                precision.append([val,0,0])
                continue
            Y_pred_c = clf.predict(class_rows[features])
            precision.append([val,len(Y_pred_c),Y_pred_c.tolist().count(val)/len(Y_pred_c)])

        for class_value, sample_count, rate in precision:
            print('precision for class '+str(class_value)+' with '+str(sample_count)+' samples in dataset : '+str(rate))

        print("G-mean score: "+str(g_mean(precision)))
        print("*****")
        print()
        PobDict['class_precisions'].append([precision,filename])
    return PobDict

In [None]:
def balance_scoring(size1, size2):
    """Return the balance ratio between two sizes.

    Both sizes are converted to float. The result is ``size1/size2`` when that
    quotient is at most 1, otherwise ``size2/size1``.

    If either size is zero the function returns 0.0 instead of raising
    ZeroDivisionError (the original raised whenever ``size2`` was zero).

    Parameters
    ----------
    size1, size2 : int or float
        The two sizes to compare.

    Returns
    -------
    float
        The balance ratio, or 0.0 when either size is zero.
    """
    f_size1 = float(size1)
    f_size2 = float(size2)
    # Guard the divisions: an empty side means there is no balance at all.
    if f_size1 == 0 or f_size2 == 0:
        return 0.0
    ratio = f_size1/f_size2
    if ratio <= 1:
        return ratio
    return f_size2/f_size1

In [None]:
def g_mean(precision_arr):
    """Return the geometric mean of the third element of each entry.

    Parameters
    ----------
    precision_arr : sequence
        Entries indexable at position 2; that value is the one multiplied.

    Returns
    -------
    float
        ``(product of entry[2]) ** (1 / len(precision_arr))``. An empty input
        returns 0.0 instead of raising ZeroDivisionError.
    """
    # No entries: the exponent 1/len(...) would divide by zero.
    if len(precision_arr) == 0:
        return 0.0

    product = 1
    for entry in precision_arr:
        product = product*entry[2]

    return product**(1/len(precision_arr))