In [46]:
import matplotlib.pyplot as plt
import pandas as pd
import os
import re
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import sys
sys.path.append('../')
from utils import get_mat
import networkx as nx
from numpy.random import laplace

In [47]:
directory_path = '../baselines'

# Get a list of all CSV files in the directory
csv_files = [file for file in os.listdir(directory_path) if file.endswith('.csv')]

# Initialize an empty list to store individual DataFrames
baseline = []

# Loop through each CSV file and read it into a DataFrame
for file in csv_files:
    file_path = os.path.join(directory_path, file)
    df = pd.read_csv(file_path)
    
    # Define the pattern
    pattern = re.compile(r'([\w-]+)_(\d+)_(\d+\.\d)_(\d+\.\d{2})_(\d+\.\d{2})_(\d+)\.csv')

    # Use the pattern to extract values
    match = pattern.match(file)

    # Check if the string matches the pattern
    if match:
        # Extract values from the match object
        name, N, t, e1, e2, exp = match.groups()
        e1 = float(e1)
        e2 = float(e2)
        # skip incomplete datasets
        df['dataset'] = name
        df['N'] = N
        df['e1'] = e1
        df['e2'] = e2
        df['e3'] = 1.0 - e1 - e2
    else:
        print(file + " doesn't match!")
        continue
    
    
    baseline.append(df)

# Concatenate the list of DataFrames into a single DataFrame
baseline = pd.concat(baseline, ignore_index=True)

# take the average of all experiments
baseline = baseline.groupby(['eps', 'N', 'dataset', 'e1', 'e2']).mean().reset_index()
baseline = baseline.drop('exper', axis=1)

baseline

Unnamed: 0,eps,N,dataset,e1,e2,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel,e3
0,0.5,20,Bitcoin,0.33,0.33,0.132971,0.045946,0.031959,1.238628,0.28,0.345366,0.15444,0.34
1,0.5,20,CA-HepPh,0.33,0.33,0.13133,0.115,0.028148,1.505061,0.530769,0.982799,0.677937,0.34
2,0.5,20,Chamelon,0.33,0.33,0.112688,0.154545,0.023305,2.359417,0.527273,0.847022,0.599987,0.34
3,0.5,20,Congress,0.33,0.33,0.067174,0.1,0.023021,4.101716,0.0,0.469515,0.290453,0.34
4,0.5,20,Enron,0.33,0.33,0.111488,0.030952,0.017772,0.893006,0.430769,0.925084,0.592444,0.34
5,0.5,20,facebook,0.33,0.33,0.097231,0.2075,0.023416,2.155904,0.375,0.90572,0.698587,0.34
6,1.0,20,Bitcoin,0.33,0.33,0.13022,0.156757,0.023377,1.270408,0.27,0.611684,0.141137,0.34
7,1.0,20,Chamelon,0.33,0.33,0.177857,0.6,0.006136,1.673495,0.481818,0.186871,0.354664,0.34
8,1.0,20,Congress,0.33,0.33,0.049764,0.175,0.021643,4.363562,0.05,0.533857,0.360186,0.34
9,1.5,20,Bitcoin,0.33,0.33,0.124946,0.327027,0.021545,1.335093,0.24,0.633558,0.105724,0.34


In [48]:
directory_path = '../our_params'

# Get a list of all CSV files in the directory
csv_files = [file for file in os.listdir(directory_path) if file.endswith('.csv')]

# Initialize an empty list to store individual DataFrames
dfs = []

# Loop through each CSV file and read it into a DataFrame
for file in csv_files:
    file_path = os.path.join(directory_path, file)
    df = pd.read_csv(file_path)
    
    # Define the pattern
    pattern = re.compile(r'([\w-]+)_(\d+\.\d+)_(\d+\.\d+)_(\d+\.\d+)_(\d+)\.csv')
    pattern2 = re.compile(r'([\w-]+)_(\d+)_(\d+\.\d)_(\d+\.\d{2})_(\d+\.\d{2})_(\d+)\.csv')

    # Use the pattern to extract values
    match = pattern.match(file)
    match2 = pattern2.match(file)

    # Check if the string matches the pattern
    if match2:
        name, N, t, e1, e2, exp = match2.groups()
        e1 = float(e1)
        e2 = float(e2)
        df['dataset'] = name
        df['N'] = N
        df['e1'] = e1
        df['e2'] = e2
        df['e3'] = 1.0 - e1 - e2
    
    elif match:
        # Extract values from the match object
        name = match.group(1)
        # skip incomplete datasets
        df['dataset'] = name

        e1 = float(match.group(3))
        df['e1'] = e1
        e2 = float(match.group(4))
        df['e2'] = e2
        df['e3'] = 1.0 - e1 - e2
    
  
    else:
        print(file + " doesn't match!")
        continue
    
    
    dfs.append(df)

# Concatenate the list of DataFrames into a single DataFrame
dfs = pd.concat(dfs, ignore_index=True)

# take the average of all experiments
dfs = dfs.groupby(['eps', 'N', 'dataset', 'e1', 'e2']).mean().reset_index()
dfs = dfs.drop('exper', axis=1)

dfs

Unnamed: 0,eps,N,dataset,e1,e2,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel,e3,exper,N.1
0,0.49,35.0,Congress,0.1,0.7,0.089323,0.05,0.03177,4.060461,0.075,0.24149,0.278525,0.2,,
1,0.49,40.0,Chamelon,0.3,0.6,0.170594,0.204545,0.020017,2.219902,0.490909,0.248995,0.434066,0.1,,
2,0.49,55.0,Bitcoin,0.1,0.7,0.148409,0.051351,0.036189,1.240916,0.23,0.848224,0.070023,0.2,,
3,0.49,30.0,Facebook,0.5,0.4,0.188861,0.25,0.014942,1.12928,0.4125,0.577457,0.488736,0.1,,
4,0.49,35.0,Congress,0.1,0.7,0.087509,0.125,0.025169,4.36473,0.075,0.301302,0.211387,0.2,,
5,0.49,45.0,CA-HepPh,0.3,0.6,0.184496,0.589167,0.008475,0.975896,0.523077,0.534889,0.352215,0.1,4.5,45.0
6,0.49,50.0,Enron,0.1,0.7,0.124655,0.049405,0.021584,0.761232,0.453846,0.730503,0.537604,0.2,,
7,1.96,15.0,Facebook,0.6,0.3,0.222654,0.6375,0.003531,0.379884,0.2875,0.491493,0.288392,0.1,,
8,1.96,25.0,Congress,0.2,0.7,0.179506,0.425,0.023369,2.295741,0.175,0.146244,0.274172,0.1,,
9,1.96,30.0,Chamelon,0.3,0.6,0.25671,0.831818,0.003859,1.143631,0.327273,0.06592,0.193583,0.1,,


In [49]:
reduced_eps = [0.49, 1.96, 3.43]
metrics = ['nmi','evc_overlap','evc_MAE','deg_kl', 'diam_rel','cc_rel','mod_rel']
datasets = list(set(dfs['dataset'].unique()) & set(baseline['dataset'].unique()))

eps = [0.5, 2.0, 3.5]
datasets

['Congress', 'Chamelon', 'CA-HepPh', 'Bitcoin', 'Enron']

In [50]:
baseline.loc[(baseline['dataset'] == 'Congress') & (baseline['eps'] == eps[0])]

Unnamed: 0,eps,N,dataset,e1,e2,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel,e3
3,0.5,20,Congress,0.33,0.33,0.067174,0.1,0.023021,4.101716,0.0,0.469515,0.290453,0.34


In [51]:
dfs.loc[(dfs['dataset'] == 'Congress') & (dfs['eps'] == reduced_eps[0])]

Unnamed: 0,eps,N,dataset,e1,e2,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel,e3,exper,N.1
0,0.49,35.0,Congress,0.1,0.7,0.089323,0.05,0.03177,4.060461,0.075,0.24149,0.278525,0.2,,
4,0.49,35.0,Congress,0.1,0.7,0.087509,0.125,0.025169,4.36473,0.075,0.301302,0.211387,0.2,,


In [52]:
from numpy import NaN, inf


changes = pd.DataFrame()
for dataset in datasets:
    for i in range(len(eps)):
        print(dataset, eps[i], reduced_eps[i])
        b = baseline.loc[(baseline['dataset'] == dataset) & (baseline['eps'] == eps[i])][metrics]
        o = dfs.loc[(dfs['dataset'] == dataset) & (dfs['eps'] == reduced_eps[i])][metrics]
        if (len(o)+ len(b) < 2):
            continue
        display(b)
        display(o)
        # display(o.iloc[0].div(b.iloc[0]))
        
        row = baseline[(baseline['dataset'] == dataset) & (baseline['eps'] == eps[i])].copy()
        row[metrics] = o.iloc[0].div(b.iloc[0])
        row.replace(inf, NaN, inplace=True)
        # display(row)
        changes = changes.append(row, ignore_index=True)
changes    
    

Congress 0.5 0.49


Unnamed: 0,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel
3,0.067174,0.1,0.023021,4.101716,0.0,0.469515,0.290453


Unnamed: 0,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel
0,0.089323,0.05,0.03177,4.060461,0.075,0.24149,0.278525
4,0.087509,0.125,0.025169,4.36473,0.075,0.301302,0.211387


Congress 2.0 1.96


Unnamed: 0,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel
15,0.099529,0.4,0.022606,4.280953,0.175,0.378314,0.201917


Unnamed: 0,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel
8,0.179506,0.425,0.023369,2.295741,0.175,0.146244,0.274172


Congress 3.5 3.43


Unnamed: 0,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel
27,0.136142,0.425,0.019708,3.39397,0.1,0.290462,0.331283


Unnamed: 0,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel
14,0.295844,0.4,0.023318,2.560122,0.1,0.100047,0.158672


Chamelon 0.5 0.49


Unnamed: 0,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel
2,0.112688,0.154545,0.023305,2.359417,0.527273,0.847022,0.599987


Unnamed: 0,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel
1,0.170594,0.204545,0.020017,2.219902,0.490909,0.248995,0.434066


Chamelon 2.0 1.96


Unnamed: 0,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel
14,0.250432,0.863636,0.002191,1.166399,0.418182,0.113346,0.180686


Unnamed: 0,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel
9,0.25671,0.831818,0.003859,1.143631,0.327273,0.06592,0.193583


Chamelon 3.5 3.43


Unnamed: 0,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel
26,0.284222,0.9,0.003017,1.001662,0.318182,0.053704,0.142273


Unnamed: 0,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel
15,0.304278,0.818182,0.00379,1.143416,0.236364,0.059696,0.15774


CA-HepPh 0.5 0.49


Unnamed: 0,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel
1,0.13133,0.115,0.028148,1.505061,0.530769,0.982799,0.677937


Unnamed: 0,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel
5,0.184496,0.589167,0.008475,0.975896,0.523077,0.534889,0.352215


CA-HepPh 2.0 1.96


Unnamed: 0,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel
13,0.211123,0.946667,0.004078,0.625294,0.384615,0.541411,0.252721


Unnamed: 0,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel
13,0.256237,0.895,0.004054,0.670381,0.292308,0.570015,0.221093


CA-HepPh 3.5 3.43
Bitcoin 0.5 0.49


Unnamed: 0,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel
0,0.132971,0.045946,0.031959,1.238628,0.28,0.345366,0.15444


Unnamed: 0,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel
2,0.148409,0.051351,0.036189,1.240916,0.23,0.848224,0.070023


Bitcoin 2.0 1.96


Unnamed: 0,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel
12,0.139286,0.535135,0.021229,1.190983,0.16,0.475302,0.126383


Unnamed: 0,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel
10,0.175153,0.57027,0.014031,1.037409,0.24,0.765557,0.110991


Bitcoin 3.5 3.43


Unnamed: 0,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel
24,0.150744,0.572973,0.019304,1.155929,0.15,0.223973,0.060773


Unnamed: 0,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel
16,0.175561,0.735135,0.010327,1.003181,0.19,0.515809,0.059981


Enron 0.5 0.49


Unnamed: 0,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel
4,0.111488,0.030952,0.017772,0.893006,0.430769,0.925084,0.592444


Unnamed: 0,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel
6,0.124655,0.049405,0.021584,0.761232,0.453846,0.730503,0.537604


Enron 2.0 1.96


Unnamed: 0,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel
16,0.144096,0.677083,0.00524,0.628397,0.292308,0.423605,0.415548


Unnamed: 0,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel
11,0.186419,0.657143,0.003503,0.553139,0.238462,0.432393,0.397928


Enron 3.5 3.43


Unnamed: 0,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel
28,0.164491,0.675298,0.003049,0.501648,0.161538,0.23884,0.285417


Unnamed: 0,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel
17,0.205891,0.726488,0.002407,0.501798,0.161538,0.325748,0.335208


Unnamed: 0,eps,N,dataset,e1,e2,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel,e3
0,0.5,20,Congress,0.33,0.33,1.329734,0.5,1.380002,0.989942,,0.514339,0.958935,0.34
1,2.0,20,Congress,0.33,0.33,1.803552,1.0625,1.03376,0.536269,1.0,0.386568,1.357845,0.34
2,3.5,20,Congress,0.33,0.33,2.173052,0.941176,1.183147,0.754315,1.0,0.344442,0.47896,0.34
3,0.5,20,Chamelon,0.33,0.33,1.513857,1.323529,0.85891,0.940869,0.931034,0.293965,0.723458,0.34
4,2.0,20,Chamelon,0.33,0.33,1.025071,0.963158,1.761574,0.980481,0.782609,0.581586,1.071381,0.34
5,3.5,20,Chamelon,0.33,0.33,1.070564,0.909091,1.256222,1.141519,0.742857,1.111575,1.108712,0.34
6,0.5,20,CA-HepPh,0.33,0.33,1.404831,5.123188,0.301099,0.64841,0.985507,0.54425,0.51954,0.34
7,2.0,20,CA-HepPh,0.33,0.33,1.213685,0.945423,0.994042,1.072105,0.76,1.052833,0.87485,0.34
8,0.5,20,Bitcoin,0.33,0.33,1.116095,1.117647,1.132372,1.001848,0.821429,2.456018,0.453396,0.34
9,2.0,20,Bitcoin,0.33,0.33,1.257504,1.065657,0.660928,0.871053,1.5,1.610677,0.878209,0.34


In [53]:
invert = ['evc_MAE','deg_kl', 'diam_rel', 'cc_rel', 'mod_rel'] 
changes[invert] = 1.0-changes[invert]
changes[['nmi','evc_overlap']] = changes[['nmi','evc_overlap']] - 1.0

In [54]:
changes

Unnamed: 0,eps,N,dataset,e1,e2,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel,e3
0,0.5,20,Congress,0.33,0.33,0.329734,-0.5,-0.380002,0.010058,,0.485661,0.041065,0.34
1,2.0,20,Congress,0.33,0.33,0.803552,0.0625,-0.03376,0.463731,0.0,0.613432,-0.357845,0.34
2,3.5,20,Congress,0.33,0.33,1.173052,-0.058824,-0.183147,0.245685,0.0,0.655558,0.52104,0.34
3,0.5,20,Chamelon,0.33,0.33,0.513857,0.323529,0.14109,0.059131,0.06896552,0.706035,0.276542,0.34
4,2.0,20,Chamelon,0.33,0.33,0.025071,-0.036842,-0.761574,0.019519,0.2173913,0.418414,-0.071381,0.34
5,3.5,20,Chamelon,0.33,0.33,0.070564,-0.090909,-0.256222,-0.141519,0.2571429,-0.111575,-0.108712,0.34
6,0.5,20,CA-HepPh,0.33,0.33,0.404831,4.123188,0.698901,0.35159,0.01449275,0.45575,0.48046,0.34
7,2.0,20,CA-HepPh,0.33,0.33,0.213685,-0.054577,0.005958,-0.072105,0.24,-0.052833,0.12515,0.34
8,0.5,20,Bitcoin,0.33,0.33,0.116095,0.117647,-0.132372,-0.001848,0.1785714,-1.456018,0.546604,0.34
9,2.0,20,Bitcoin,0.33,0.33,0.257504,0.065657,0.339072,0.128947,-0.5,-0.610677,0.121791,0.34


In [55]:
# mean of the mean
changes[changes['dataset'] == 'Enron'][metrics].mean().mean()

0.08655531764454794

In [56]:
# mean of the mean
changes[metrics].mean(skipna = True).mean()


0.13109714957442814

In [57]:
changes.describe()

Unnamed: 0,eps,e1,e2,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel,e3
count,14.0,14.0,14.0,14.0,14.0,14.0,14.0,13.0,14.0,14.0,14.0
mean,1.892857,0.33,0.33,0.338292,0.34835,0.01648,0.104454,0.026195,-0.026681,0.11059,0.34
std,1.243113,0.0,0.0,0.313535,1.11413,0.380119,0.163069,0.215412,0.695936,0.26667,0.0
min,0.5,0.33,0.33,0.025071,-0.5,-0.761574,-0.141519,-0.5,-1.456018,-0.357845,0.34
25%,0.5,0.33,0.33,0.12974,-0.050144,-0.206635,0.00229,0.0,-0.300803,-0.050278,0.34
50%,2.0,0.33,0.33,0.254594,0.064078,-0.013901,0.089446,0.014493,0.094797,0.067484,0.34
75%,3.125,0.33,0.33,0.386057,0.241676,0.301328,0.143707,0.184211,0.478183,0.238694,0.34
max,3.5,0.33,0.33,1.173052,4.123188,0.698901,0.463731,0.257143,0.706035,0.546604,0.34


In [58]:
changes[changes['dataset'] == 'Enron'][metrics].mean()

nmi            0.221168
evc_overlap    0.214169
evc_MAE        0.109251
deg_kl         0.089008
diam_rel       0.043546
cc_rel        -0.058095
mod_rel       -0.013160
dtype: float64

In [59]:
changes[changes['dataset'] == 'Chamelon'][metrics].mean().mean()

0.07231040183354767

In [64]:
changes[changes['eps'] == 0.5][metrics].mean().mean()

0.25492475644190665

In [61]:
changes[changes['eps'] == 2.0][metrics].mean().mean()

0.07117763921961283

In [62]:
changes[changes['eps'] == 3.5][metrics].mean().mean()

0.052137724231141616

In [63]:
changes[metrics].std()*100

nmi             31.353482
evc_overlap    111.413023
evc_MAE         38.011859
deg_kl          16.306906
diam_rel        21.541232
cc_rel          69.593572
mod_rel         26.667028
dtype: float64