In [14]:
import numpy as np 
import pandas as pd 
from functools import reduce
import operator
from scipy import stats
from tqdm import tqdm

In [15]:
test_time_start = {
    "reddit": 2261813.658,
    "Contacts": 2047800,
    "wikipedia": 2218288.6,
    "uci": 6714558.3,
    "SocialEvo": 18711359,
    "mooc": 2250151.6,
    "lastfm": 120235473,
    "enron": 93431801,
    "Flights": 106,
    "UNvote": 2019686400,
    "CanParl": 347155200,
    "USLegis": 10,
    "UNtrade": 883612800
}

In [16]:
def f_ATD(E, E_, dataset):
    E_test = E[E['ts']>test_time_start[dataset]]
    E_fake_test = E_[E_['ts']>test_time_start[dataset]]
    
    T = np.max(np.array(E_test['ts'])) - np.min(np.array(E_test['ts']))
    n = len(E_test)
    
    Q = 0.0
    for index, row in E_test.iterrows():
        u = row['u']
        v = row['i']
        t = row['ts']
        
        T_uv = np.array(E_fake_test[(E_fake_test['u']==u)&(E_fake_test['i']==v)]['ts'])
        
        Q += np.min(np.abs(T_uv -t))
        
    return Q/(n*T)

In [17]:
def f_ACD(E, E_, dataset):
    E_test = E[E['ts']>test_time_start[dataset]]
    E_fake_test = E_[E_['ts']>test_time_start[dataset]]
    
    T = np.max(np.array(E_test['ts'])) - np.min(np.array(E_test['ts']))
    n = len(E_test)
    T_bar = T/n
    
    Q = 0.0
    for index, row in E_test.iterrows():
        u = row['u']
        v = row['i']
        t = row['ts']
        
        count_real = len(E_test[(E_test['u']==u)&(E_test['i']==v)&(E_test['ts']>t-T_bar)&(E_test['ts']<t+T_bar)])
        count_fake = len(E_fake_test[(E_fake_test['u']==u)&(E_fake_test['i']==v)&(E_fake_test['ts']>t-T_bar)&(E_fake_test['ts']<t+T_bar)])
        
        Q += np.abs(count_real - count_fake)
    
    return Q/n

In [18]:
datasets = [ "wikipedia", "reddit", "uci", "lastfm", "mooc"]
distortions = ['intense_5', 'shuffle']

In [50]:
dataset = datasets[3]
distortion = distortions[1]
E_real = pd.read_csv(f"/home/chri6578/Documents/gttp/data/{dataset}/ml_{dataset}.csv") 

ATD_all = []
ACD_all = []
for sample in tqdm(range(1,11)):
    E_distort = pd.read_csv(f"/home/chri6578/Documents/gttp/data/{dataset}/{distortion}_{sample}_ml_{dataset}.csv")
    ATD_all.append(f_ATD(E_real, E_distort, dataset))
    ACD_all.append(f_ACD(E_real, E_distort, dataset))


100%|██████████| 10/10 [43:48<00:00, 262.83s/it]


In [51]:
ATD_all

[0.07999215425097446,
 0.0800863057123326,
 0.08023985374303896,
 0.08016883669822782,
 0.07945098730221332,
 0.08018698750731404,
 0.08007953780818729,
 0.08002210789113358,
 0.08020953375042707,
 0.07983265602747991]

In [52]:
ACD_all

[1.0011342193992763,
 1.000871286720353,
 1.0010465751729685,
 1.001149686027448,
 1.0007784869513214,
 1.0009640864893847,
 1.001453863048163,
 1.0009537754039368,
 1.0011342193992763,
 1.001180619283792]

In [53]:
def confidence95(metric_list):
    # Step 1: Calculate the mean
    mean = np.mean(metric_list)
    # Step 2: Calculate the standard error of the mean (SEM)
    sem = stats.sem(metric_list)
    # Step 3: Find the critical value for a 95% confidence interval
    confidence_level = 0.95
    degrees_of_freedom = len(metric_list) - 1
    critical_value = stats.t.ppf((1 + confidence_level) / 2, degrees_of_freedom)

    # Step 4: Calculate the margin of error
    margin_of_error = critical_value * sem
    return mean, margin_of_error

In [54]:
confidence95(ATD_all)

(0.0800268960691329, 0.00016884958616460372)

In [55]:
confidence95(ACD_all)

(1.0010666817895921, 0.00013569106443766214)