In [144]:
import numpy as np 
import pandas as pd 
from functools import reduce
import operator
from scipy import stats
from tqdm import tqdm

In [145]:
# Per-dataset timestamp cutoffs, keyed by dataset name.
# f_ATD / f_ACD keep only the rows whose 'ts' is strictly greater than the
# cutoff of the selected dataset.
test_time_start = dict(
    reddit=2261813.658,
    Contacts=2047800,
    wikipedia=2218288.6,
    uci=6714558.3,
    SocialEvo=18711359,
    mooc=2250151.6,
    lastfm=120235473,
    enron=93431801,
    Flights=106,
    UNvote=2019686400,
    CanParl=347155200,
    USLegis=10,
    UNtrade=883612800,
)

In [146]:
def f_ATD(E, E_, dataset, test_start=None):
    """Normalised mean time gap between real test edges and their nearest fake edge.

    Both frames are restricted to rows with ``ts`` strictly greater than the
    test cutoff. For every remaining real edge ``(u, i, ts)`` the smallest
    absolute time difference to a fake edge with the same ``(u, i)`` pair is
    accumulated; the total is divided by ``n * T`` where ``n`` is the number of
    real test edges and ``T`` is the span (max - min) of their timestamps.

    Parameters
    ----------
    E : pandas.DataFrame
        Real edges; must provide the columns ``'u'``, ``'i'`` and ``'ts'``.
    E_ : pandas.DataFrame
        Distorted ("fake") edges with the same three columns.
    dataset : str
        Key into the module-level ``test_time_start`` mapping. Ignored when
        ``test_start`` is given.
    test_start : float, optional
        Explicit cutoff that overrides the ``test_time_start`` lookup.

    Returns
    -------
    float
        ``Q / (n * T)``. When all real test timestamps coincide (``T == 0``)
        the division yields ``nan``/``inf`` exactly as plain numpy division does.

    Raises
    ------
    ValueError
        If no real edge lies after the cutoff, or if a real test edge has no
        fake counterpart with the same ``(u, i)`` pair after the cutoff.
    """
    start = test_time_start[dataset] if test_start is None else test_start
    E_test = E[E['ts'] > start]
    E_fake_test = E_[E_['ts'] > start]

    n = len(E_test)
    if n == 0:
        raise ValueError(f"no real edges with ts > {start}; ATD is undefined")

    real_ts = E_test['ts'].to_numpy()
    T = np.max(real_ts) - np.min(real_ts)

    # Group the fake timestamps by (u, i) once, instead of re-scanning the
    # whole fake frame with a boolean mask for every real edge.
    fake_times = {
        pair: times.to_numpy()
        for pair, times in E_fake_test.groupby(['u', 'i'], sort=False)['ts']
    }

    Q = 0.0
    for u, v, t in zip(E_test['u'].to_numpy(), E_test['i'].to_numpy(), real_ts):
        T_uv = fake_times.get((u, v))
        if T_uv is None:
            # np.min on an empty array would raise an opaque ValueError here;
            # keep the exception type but say which pair is missing.
            raise ValueError(
                f"pair (u={u}, i={v}) has no fake edge with ts > {start}"
            )
        Q += np.min(np.abs(T_uv - t))

    return Q / (n * T)

In [147]:
def f_ACD(E, E_, dataset, test_start=None):
    """Mean absolute difference in local edge counts between real and fake data.

    Both frames are restricted to rows with ``ts`` strictly greater than the
    test cutoff. With ``n`` real test edges spanning ``T = max(ts) - min(ts)``,
    the window half-width is ``T_bar = T / n``. For every real test edge
    ``(u, i, t)`` the number of real and of fake edges with the same ``(u, i)``
    pair and ``t - T_bar < ts < t + T_bar`` (open interval) is counted, and the
    absolute difference of the two counts is averaged over the real test edges.

    Parameters
    ----------
    E : pandas.DataFrame
        Real edges; must provide the columns ``'u'``, ``'i'`` and ``'ts'``.
    E_ : pandas.DataFrame
        Distorted ("fake") edges with the same three columns.
    dataset : str
        Key into the module-level ``test_time_start`` mapping. Ignored when
        ``test_start`` is given.
    test_start : float, optional
        Explicit cutoff that overrides the ``test_time_start`` lookup.

    Returns
    -------
    float
        ``Q / n`` where ``Q`` is the summed absolute count difference.

    Raises
    ------
    ValueError
        If no real edge lies after the cutoff.
    """
    start = test_time_start[dataset] if test_start is None else test_start
    E_test = E[E['ts'] > start]
    E_fake_test = E_[E_['ts'] > start]

    n = len(E_test)
    if n == 0:
        raise ValueError(f"no real edges with ts > {start}; ACD is undefined")

    real_ts = E_test['ts'].to_numpy()
    T = np.max(real_ts) - np.min(real_ts)
    T_bar = T / n

    def sorted_times_by_pair(frame):
        # One sorted timestamp array per (u, i) pair, built in a single pass.
        return {
            pair: np.sort(times.to_numpy())
            for pair, times in frame.groupby(['u', 'i'], sort=False)['ts']
        }

    def count_inside(times, lo, hi):
        # Number of entries strictly inside (lo, hi) in a sorted array.
        if times is None:
            return 0
        below_hi = np.searchsorted(times, hi, side='left')    # entries <  hi
        up_to_lo = np.searchsorted(times, lo, side='right')   # entries <= lo
        # With a zero-width window (lo == hi) the difference goes negative;
        # an empty open interval contains nothing, so clamp at 0.
        return max(below_hi - up_to_lo, 0)

    real_times = sorted_times_by_pair(E_test)
    fake_times = sorted_times_by_pair(E_fake_test)

    Q = 0.0
    for u, v, t in zip(E_test['u'].to_numpy(), E_test['i'].to_numpy(), real_ts):
        lo, hi = t - T_bar, t + T_bar
        count_real = count_inside(real_times.get((u, v)), lo, hi)
        count_fake = count_inside(fake_times.get((u, v)), lo, hi)
        Q += np.abs(count_real - count_fake)

    return Q / n

In [148]:
# Dataset names that can be evaluated; each is used as a key of
# `test_time_start` and as a directory / file-name component when loading CSVs.
datasets = [
    "wikipedia",
    "reddit",
    "uci",
]
# Distortion prefixes of the generated edge files.
distortions = [
    'intense_5',
    'shuffle',
]

In [149]:
# Select the dataset / distortion pair to evaluate (second entry of each list).
dataset = datasets[1]
distortion = distortions[1]

# Real edge list for the selected dataset.
E_real = pd.read_csv(f"/home/chri6578/Documents/gttp/data/{dataset}/ml_{dataset}.csv") 

# One ATD and one ACD score per distorted sample; sample files are numbered 1..10.
ATD_all, ACD_all = [], []
for sample in tqdm(range(1, 11)):
    E_distort = pd.read_csv(f"/home/chri6578/Documents/gttp/data/{dataset}/{distortion}_{sample}_ml_{dataset}.csv")
    atd_score = f_ATD(E_real, E_distort, dataset)
    ATD_all.append(atd_score)
    acd_score = f_ACD(E_real, E_distort, dataset)
    ACD_all.append(acd_score)


100%|██████████| 10/10 [16:40<00:00, 100.10s/it]


In [150]:
# Show the per-sample ATD scores collected in the evaluation loop.
ATD_all

[0.09971171953691128,
 0.09839795047943865,
 0.09912133218912875,
 0.09906911349777363,
 0.09900999985162606,
 0.0985220263954334,
 0.09937001318235185,
 0.09902831996743215,
 0.09852367890855039,
 0.0985590201865276]

In [151]:
# Show the per-sample ACD scores collected in the evaluation loop.
ACD_all

[1.0329344582469986,
 1.032825403749492,
 1.03297411442791,
 1.0331823093776953,
 1.0330137706088216,
 1.033063340834961,
 1.0331624812872395,
 1.0330236846540495,
 1.0330038565635937,
 1.0328848880208592]

In [152]:
def confidence95(metric_list):
    """Return ``(mean, margin_of_error)`` for a two-sided 95% t-interval.

    The margin of error is the Student-t critical value at ``len - 1`` degrees
    of freedom multiplied by the standard error of the mean (``scipy.stats.sem``),
    so the interval is ``mean ± margin_of_error``.
    """
    level = 0.95
    dof = len(metric_list) - 1

    centre = np.mean(metric_list)
    std_err = stats.sem(metric_list)

    # Upper-tail quantile for a two-sided interval: (1 + level) / 2 = 0.975.
    t_crit = stats.t.ppf((1 + level) / 2, dof)

    half_width = t_crit * std_err
    return centre, half_width

In [153]:
# Mean and 95% margin of error of the ATD scores across samples.
confidence95(ATD_all)

(0.09893131741951737, 0.00030433736045848805)

In [154]:
# Mean and 95% margin of error of the ACD scores across samples.
confidence95(ACD_all)

(1.033006830777162, 8.003200026644353e-05)