In [None]:
import reliable as rc
import datetime
import os
import gc
import time
import sys
import random
import joblib
import json as js
import networkx as nx
from statistics import mean
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from csv import reader
from matplotlib import pyplot as plt
from collections import deque
from joblib import Parallel, delayed

In [None]:
# Root folder; each dataset lives in its own sub-directory of graph snapshots.
data_path = 'Data/'

# Per-dataset reference value of k. The experiment cells multiply it by a ratio
# and truncate to an int to obtain the k passed to the search routines.
Config = dict(
    StackOverFlow=dict(k=27),
    HepPh=dict(k=118),
    Reddit=dict(k=12),
    Email_EU=dict(k=8),
)

# Datasets to process, in iteration order.
data_pick = ['Email_EU', 'Reddit', 'HepPh', 'StackOverFlow']

# Pre-generated queries; later cells index this object by dataset name.
# NOTE(review): joblib.load unpickles its input -- only load files you trust.
query_all = joblib.load('input/query_all.pkl.zip')

# Index Construction

In [None]:
# Build the WCF index for the first 20 snapshots of every dataset and persist
# it under input/ so the experiment cells can reload it instead of rebuilding.
for data in data_pick:
    graph_path = data_path + data + '/'
    # os.listdir returns names in arbitrary order; sorting gives a stable,
    # reproducible snapshot order.
    graphs = sorted(os.listdir(graph_path))
    print(graphs)
    list_G_ori = Parallel(n_jobs=-1, verbose=0)(delayed(nx.read_gml)(graph_path+g) for g in graphs[:20])

    print('Constructing Index for: {} with {} instances.'.format(data, len(list_G_ori)))
    # Step 1: one threshold table per snapshot.
    theta_thres_all = Parallel(n_jobs=-1, verbose=1)(delayed(rc.theta_thres_table)(g) for g in list_G_ori)
    # Step 2: one tree per (threshold table, snapshot) pair. zip keeps the
    # pairing explicit and avoids reusing the outer loop's index name.
    wcf_indices = Parallel(n_jobs=-1, verbose=1)(
        delayed(rc.theta_tree)(thres, g) for thres, g in zip(theta_thres_all, list_G_ori)
    )
    # Entry j of the saved list belongs to the j-th snapshot in sorted
    # filename order.
    joblib.dump(wcf_indices, 'input/wcf_indices-'+data+'.pkl.zip')

# Varying parameters

In [None]:
# Varying k
#
# For every dataset and every ratio in k_all, record the mean per-query
# running time of rc.EEF and rc.WCF_search in the vary_k table, with
# k = int(Config[data]['k'] * k_ratio) and theta fixed.

theta = 0.4
k_all = [0.2,0.4,0.6,0.8]
data_pick = ['Email_EU', 'Reddit', 'HepPh', 'StackOverFlow']
data_pick.reverse()
iterables = [data_pick, k_all]
index = pd.MultiIndex.from_product(iterables, names=['data', 'k_ratio'])

vary_k = pd.DataFrame(columns=['EEF', 'WCF'], index=index)

for data in data_pick:
    print('For dataset {}'.format(data))
    graph_path = data_path + data + '/'
    # os.listdir order is arbitrary. The saved indices were built from the
    # sorted listing, so sort here too; otherwise snapshot j and index j may
    # belong to different graphs.
    graphs = sorted(os.listdir(graph_path))
    list_G_ori = Parallel(n_jobs=-1, verbose=10)(delayed(nx.read_gml)(graph_path+g) for g in graphs[:10])

    # construction
    wcf_indices = joblib.load('input/wcf_indices-'+data+'.pkl.zip')[:10]

    for k_ratio in tqdm(k_all, desc='Processing query K'):
        k = int(Config[data]['k']*k_ratio)
        list_G = list_G_ori.copy()
        # Fresh timing lists per ratio: reusing one list across ratios would
        # make each stored mean a blend of all ratios processed so far.
        EB_time = []
        IB_time = []
        for query in tqdm(query_all[data]):

            # perf_counter is the monotonic clock intended for intervals.
            start_time1 = time.perf_counter()
            g1, score1 = rc.EEF(list_G, query, theta, k)
            EB_time.append(time.perf_counter() - start_time1)

            start_time2 = time.perf_counter()
            g2, score2 = rc.WCF_search(list_G, wcf_indices, query, theta, k)
            IB_time.append(time.perf_counter() - start_time2)

        vary_k.loc[(data, k_ratio)] = {'EEF': mean(EB_time), 'WCF': mean(IB_time)}

In [None]:
# Varying theta
#
# For every dataset and every theta in theta_all, record the mean per-query
# running time of rc.EEF and rc.WCF_search in the vary_theta table, with
# k fixed at int(Config[data]['k'] * k_ratio).

k_ratio = 0.4
theta_all = [0.0,0.2,0.4,0.6,0.8]
data_pick = ['Email_EU', 'Reddit', 'HepPh', 'StackOverFlow']
iterables = [data_pick, theta_all]
index = pd.MultiIndex.from_product(iterables, names=['data', 'theta'])

vary_theta = pd.DataFrame(columns=['EEF', 'WCF'], index=index)

for data in data_pick:
    print('For dataset {}'.format(data))
    graph_path = data_path + data + '/'
    # os.listdir order is arbitrary. The saved indices were built from the
    # sorted listing, so sort here too; otherwise snapshot j and index j may
    # belong to different graphs.
    graphs = sorted(os.listdir(graph_path))
    list_G_ori = Parallel(n_jobs=-1)(delayed(nx.read_gml)(graph_path+g) for g in graphs[:10])

    # construction
    wcf_indices = joblib.load('input/wcf_indices-'+data+'.pkl.zip')[:10]

    # k does not depend on theta, so compute it once per dataset.
    k = int(Config[data]['k']*k_ratio)
    for theta in tqdm(theta_all, desc='Processing query theta'):
        list_G = list_G_ori.copy()
        # Fresh timing lists per theta: reusing one list across thetas would
        # make each stored mean a blend of all thetas processed so far.
        EB_time = []
        IB_time = []
        for query in tqdm(query_all[data]):

            # perf_counter is the monotonic clock intended for intervals.
            start_time1 = time.perf_counter()
            g1, score1 = rc.EEF(list_G, query, theta, k)
            EB_time.append(time.perf_counter() - start_time1)

            start_time2 = time.perf_counter()
            g2, score2 = rc.WCF_search(list_G, wcf_indices, query, theta, k)
            IB_time.append(time.perf_counter() - start_time2)

        vary_theta.loc[(data, theta)] = {'EEF': mean(EB_time), 'WCF': mean(IB_time)}

In [None]:
# Varying T
#
# For every dataset and every t in t_all, record the mean per-query running
# time of rc.EEF and rc.WCF_search when only the first t snapshots (and their
# indices) are used. theta and the k ratio are fixed.

theta = 0.4
# Named here so the cell does not rely on a value left over from another cell.
k_ratio = 0.4
t_all = [2,5,10,15,20]
data_pick = ['Email_EU', 'Reddit', 'HepPh', 'StackOverFlow']
iterables = [data_pick, t_all]
index = pd.MultiIndex.from_product(iterables, names=['data', 't'])

vary_t = pd.DataFrame(columns=['EEF', 'WCF'], index=index)

for data in data_pick:
    print('For dataset {}'.format(data))
    graph_path = data_path + data + '/'
    # os.listdir order is arbitrary. The saved indices were built from the
    # sorted listing, so sort here too; otherwise snapshot j and index j may
    # belong to different graphs.
    graphs = sorted(os.listdir(graph_path))
    list_G_ori = Parallel(n_jobs=-1)(delayed(nx.read_gml)(graph_path+g) for g in graphs[:20])

    # construction
    wcf_indices_ori = joblib.load('input/wcf_indices-'+data+'.pkl.zip')[:20]

    # k does not depend on t, so compute it once per dataset.
    k = int(Config[data]['k']*k_ratio)
    for t in tqdm(t_all, desc='Processing query t'):
        # Restrict both the snapshots and their indices to the first t entries.
        list_G = list_G_ori[:t].copy()
        wcf_indices = wcf_indices_ori[:t].copy()
        # Fresh timing lists per t: reusing one list across values of t would
        # make each stored mean a blend of all values processed so far.
        EB_time = []
        IB_time = []
        for query in tqdm(query_all[data]):

            # perf_counter is the monotonic clock intended for intervals.
            start_time1 = time.perf_counter()
            g1, score1 = rc.EEF(list_G, query, theta, k)
            EB_time.append(time.perf_counter() - start_time1)

            start_time2 = time.perf_counter()
            g2, score2 = rc.WCF_search(list_G, wcf_indices, query, theta, k)
            IB_time.append(time.perf_counter() - start_time2)

        vary_t.loc[(data, t)] = {'EEF': mean(EB_time), 'WCF': mean(IB_time)}

# Index Maintenance

In [None]:
# Compare the cost of rebuilding the index from scratch with the cost of
# rc.maintenance_index after re-weighting a random sample of edges in one
# snapshot. Timings are collected in recons_time.
data = 'Reddit'
graph_path = data_path + data + '/'
# os.listdir order is arbitrary; sort so the same snapshot is picked every run.
graph_test = sorted(os.listdir(graph_path))[0]
g_test = nx.read_gml(graph_path+graph_test)
theta_thres_df_old = rc.theta_thres_table(g_test)
# NOTE(review): wcf_index (the index of the unmodified graph) is never used
# below; the maintenance call receives the freshly rebuilt index instead.
# Confirm against rc.maintenance_index which one it expects.
wcf_index = rc.theta_tree(theta_thres_df_old,g_test)

# Number of edges to re-weight per experiment (absolute counts, not fractions).
ratios = [100, 200, 300, 500, 1000]
recons_time = pd.DataFrame(columns=['Reconstruct','Maintenance'], index=ratios)

# random.sample needs a sequence: passing the edge view directly raises
# TypeError on Python 3.11+, so materialise it once up front.
edge_pool = list(g_test.edges)

for ratio in ratios:
    sampled_edges = random.sample(edge_pool, ratio)
    g_test_new = g_test.copy()

    # Vertices whose index entries may be affected by the weight changes.
    update_v_list = set()
    for e in tqdm(sampled_edges):
        new_weight = random.randint(0,10)*0.1
        g_test_new.add_edge(*e, weight=new_weight)

        # update from removing: candidates come from rc.sub_core of both
        # endpoints, kept only if any of their old thresholds is <= the
        # edge's old weight.
        theta1 = g_test[e[0]][e[1]]['weight']
        subcore = []
        subcore.extend(list(rc.sub_core(g_test,e[0])[1].keys()))
        subcore.extend(list(rc.sub_core(g_test,e[1])[1].keys()))
        subcore = [v for v in subcore if (theta_thres_df_old.loc[v]<=theta1).any()]
        update_v_list.update(subcore)

        # update from inserting: same filter with rc.pure_core and the
        # edge's new weight.
        theta2 = new_weight
        purecore = []
        purecore.extend(list(rc.pure_core(g_test,e[0])[1].keys()))
        purecore.extend(list(rc.pure_core(g_test,e[1])[1].keys()))
        purecore = [v for v in purecore if (theta_thres_df_old.loc[v]<=theta2).any()]
        update_v_list.update(purecore)
    print(len(update_v_list))

    # Full reconstruction on the modified graph.
    ts = time.perf_counter()
    tt0 = rc.theta_thres_table(g_test_new)
    new_wcf_index0 = rc.theta_tree(tt0, g_test_new)
    t0 = time.perf_counter()-ts

    # Incremental maintenance restricted to the affected vertices.
    ts = time.perf_counter()
    new_wcf_index1 = rc.maintenance_index(list(update_v_list),g_test_new,new_wcf_index0, theta_thres_df_old)
    t1 = time.perf_counter()-ts
    recons_time.loc[ratio] = {'Reconstruct':t0, 'Maintenance':t1}

# Index Compression

In [None]:
# Build the WCF indices for every snapshot of every dataset, then write the
# uncompressed indices, the compressed indices and the auxiliary table as JSON.

# Placeholder prefix prepended verbatim to every output file name below;
# presumably meant to be replaced with a real output location before running.
save_path = '...'
for data in tqdm(data_pick):
    graph_path = data_path + data + '/'
    # os.listdir order is arbitrary; sort so the exported index order is
    # deterministic and consistent with the index-construction cell.
    graphs = sorted(os.listdir(graph_path))
    list_G_ori = Parallel(n_jobs=-1)(delayed(nx.read_gml)(graph_path+g) for g in graphs)

    # construction: one threshold table per snapshot, then one tree per
    # (threshold table, snapshot) pair.
    theta_thres_all = Parallel(n_jobs=-1, verbose=10)(delayed(rc.theta_thres_table)(g) for g in list_G_ori)
    wcf_indices = Parallel(n_jobs=-1, verbose=10)(
        delayed(rc.theta_tree)(thres, g) for thres, g in zip(theta_thres_all, list_G_ori)
    )

    # Uncompressed indices, serialised for comparison.
    old_indices = rc.indices_to_json(wcf_indices)
    with open(save_path + data+'_old_indices.json', 'w') as fp:
        js.dump(old_indices, fp)

    # Compressed indices plus the auxiliary table returned alongside them.
    compressed_index, auxiliary_table = rc.compress_wcf_indices(wcf_indices, 1)
    new_indices = rc.indices_to_json(compressed_index)
    with open(save_path + data+'_new_indices.json', 'w') as fp:
        js.dump(new_indices, fp)

    with open(save_path + data+'_auxiliary_table.json', 'w') as fp:
        js.dump(auxiliary_table, fp)