In [246]:
import matplotlib.pyplot as plt
import pandas as pd
import os
import re
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import sys
sys.path.append('../')
from utils import get_mat
import networkx as nx
from numpy.random import laplace

In [247]:
directory_path = '../baselines'

# Get a list of all CSV files in the directory
csv_files = [file for file in os.listdir(directory_path) if file.endswith('.csv')]

# Initialize an empty list to store individual DataFrames
baseline = []

# Loop through each CSV file and read it into a DataFrame
for file in csv_files:
    file_path = os.path.join(directory_path, file)
    df = pd.read_csv(file_path)
    
    # Define the pattern
    pattern = re.compile(r'([\w-]+)_(\d+)_(\d+\.\d)_(\d+\.\d{2})_(\d+\.\d{2})_(\d+)\.csv')

    # Use the pattern to extract values
    match = pattern.match(file)

    # Check if the string matches the pattern
    if match:
        # Extract values from the match object
        name, N, t, e1, e2, exp = match.groups()
        e1 = float(e1)
        e2 = float(e2)
        # skip incomplete datasets
        df['dataset'] = name
        df['N'] = N
        df['e1'] = e1
        df['e2'] = e2
        df['e3'] = 1.0 - e1 - e2
    else:
        print(file + " doesn't match!")
        continue
    
    
    baseline.append(df)

# Concatenate the list of DataFrames into a single DataFrame
baseline = pd.concat(baseline, ignore_index=True)

# take the average of all experiments
baseline = baseline.groupby(['eps', 'N', 'dataset', 'e1', 'e2']).mean().reset_index()
baseline = baseline.drop('exper', axis=1)

baseline

Unnamed: 0,eps,N,dataset,e1,e2,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel,e3
0,0.5,20,Bitcoin,0.33,0.33,0.132971,0.045946,0.031959,1.238628,0.28,0.345366,0.15444,0.34
1,0.5,20,CA-HepPh,0.33,0.33,0.13133,0.115,0.028148,1.505061,0.530769,0.982799,0.677937,0.34
2,0.5,20,Chamelon,0.33,0.33,0.112688,0.154545,0.023305,2.359417,0.527273,0.847022,0.599987,0.34
3,0.5,20,Congress,0.33,0.33,0.067174,0.1,0.023021,4.101716,0.0,0.469515,0.290453,0.34
4,0.5,20,Enron,0.33,0.33,0.111488,0.030952,0.017772,0.893006,0.430769,0.925084,0.592444,0.34
5,0.5,20,Facebook,0.33,0.33,0.097231,0.2075,0.023416,2.155904,0.375,0.90572,0.698587,0.34
6,1.0,20,Bitcoin,0.33,0.33,0.13022,0.156757,0.023377,1.270408,0.27,0.611684,0.141137,0.34
7,1.0,20,Chamelon,0.33,0.33,0.177857,0.6,0.006136,1.673495,0.481818,0.186871,0.354664,0.34
8,1.0,20,Congress,0.33,0.33,0.049764,0.175,0.021643,4.363562,0.05,0.533857,0.360186,0.34
9,1.5,20,Bitcoin,0.33,0.33,0.124946,0.327027,0.021545,1.335093,0.24,0.633558,0.105724,0.34


In [248]:
directory_path = '../our_params'

# Get a list of all CSV files in the directory
csv_files = [file for file in os.listdir(directory_path) if file.endswith('.csv')]

# Initialize an empty list to store individual DataFrames
dfs = []

# Loop through each CSV file and read it into a DataFrame
for file in csv_files:
    file_path = os.path.join(directory_path, file)
    df = pd.read_csv(file_path)
    
    # Define the pattern
    pattern = re.compile(r'([\w-]+)_(\d+\.\d+)_(\d+\.\d+)_(\d+\.\d+)_(\d+)\.csv')
    pattern2 = re.compile(r'([\w-]+)_(\d+)_(\d+\.\d)_(\d+\.\d{2})_(\d+\.\d{2})_(\d+)\.csv')

    # Use the pattern to extract values
    match = pattern.match(file)
    match2 = pattern2.match(file)

    # Check if the string matches the pattern
    if match2:
        name, N, t, e1, e2, exp = match2.groups()
        e1 = float(e1)
        e2 = float(e2)
        df['dataset'] = name
        df['N'] = N
        df['e1'] = e1
        df['e2'] = e2
        df['e3'] = 1.0 - e1 - e2
    
    elif match:
        # Extract values from the match object
        name = match.group(1)
        # skip incomplete datasets
        df['dataset'] = name

        e1 = float(match.group(3))
        df['e1'] = e1
        e2 = float(match.group(4))
        df['e2'] = e2
        df['e3'] = 1.0 - e1 - e2
    
  
    else:
        print(file + " doesn't match!")
        continue
    
    
    dfs.append(df)

# Concatenate the list of DataFrames into a single DataFrame
dfs = pd.concat(dfs, ignore_index=True)

# take the average of all experiments
dfs = dfs.groupby(['eps', 'N', 'dataset', 'e1', 'e2']).mean().reset_index()
dfs = dfs.drop('exper', axis=1)

dfs

Unnamed: 0,eps,N,dataset,e1,e2,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel,e3
0,0.49,35.0,Congress,0.1,0.7,0.089323,0.05,0.03177,4.060461,0.075,0.24149,0.278525,0.2
1,0.49,40.0,Chamelon,0.3,0.6,0.170594,0.204545,0.020017,2.219902,0.490909,0.248995,0.434066,0.1
2,0.49,55.0,Bitcoin,0.1,0.7,0.148409,0.051351,0.036189,1.240916,0.23,0.848224,0.070023,0.2
3,0.49,30.0,Facebook,0.5,0.4,0.188861,0.25,0.014942,1.12928,0.4125,0.577457,0.488736,0.1
4,0.49,50.0,Enron,0.1,0.7,0.124655,0.049405,0.021584,0.761232,0.453846,0.730503,0.537604,0.2
5,1.96,25.0,Congress,0.2,0.7,0.179506,0.425,0.023369,2.295741,0.175,0.146244,0.274172,0.1
6,1.96,30.0,Chamelon,0.3,0.6,0.25671,0.831818,0.003859,1.143631,0.327273,0.06592,0.193583,0.1
7,1.96,35.0,Bitcoin,0.3,0.6,0.175153,0.57027,0.014031,1.037409,0.24,0.765557,0.110991,0.1
8,1.96,15.0,Facebook,0.6,0.3,0.220393,0.655,0.003902,0.375575,0.2625,0.501964,0.294857,0.1
9,3.43,10.0,Congress,0.4,0.5,0.295844,0.4,0.023318,2.560122,0.1,0.100047,0.158672,0.1


In [249]:
reduced_eps = [0.49, 1.96, 3.43]
metrics = ['nmi','evc_overlap','evc_MAE','deg_kl', 'diam_rel','cc_rel','mod_rel']
datasets = list(set(dfs['dataset'].unique()) & set(baseline['dataset'].unique()))

eps = [0.5, 2.0, 3.5]
datasets

['Congress', 'Bitcoin', 'Facebook', 'Chamelon', 'Enron']

In [263]:
from numpy import NaN, inf


changes = pd.DataFrame()
for dataset in datasets:
    for i in range(len(eps)):
        print(dataset, eps[i], reduced_eps[i])
        b = baseline.loc[(baseline['dataset'] == dataset) & (baseline['eps'] == eps[i])][metrics]
        o = dfs.loc[(dfs['dataset'] == dataset) & (dfs['eps'] == reduced_eps[i])][metrics]
        if (len(o)+ len(b) != 2):
            continue
        # display(b)
        # display(o)
        # display(o.iloc[0].div(b.iloc[0]))
        
        row = baseline[(baseline['dataset'] == dataset) & (baseline['eps'] == eps[i])].copy()
        row[metrics] = o.iloc[0].div(b.iloc[0])
        row.replace(inf, NaN, inplace=True)
        # display(row)
        changes = changes.append(row, ignore_index=True)
changes    
    

Congress 0.5 0.49
Congress 2.0 1.96
Congress 3.5 3.43
Bitcoin 0.5 0.49
Bitcoin 2.0 1.96
Bitcoin 3.5 3.43
Facebook 0.5 0.49
Facebook 2.0 1.96
Facebook 3.5 3.43
Chamelon 0.5 0.49
Chamelon 2.0 1.96
Chamelon 3.5 3.43
Enron 0.5 0.49
Enron 2.0 1.96
Enron 3.5 3.43


Unnamed: 0,eps,N,dataset,e1,e2,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel,e3
0,0.5,20,Congress,0.33,0.33,1.329734,0.5,1.380002,0.989942,,0.514339,0.958935,0.34
1,2.0,20,Congress,0.33,0.33,1.803552,1.0625,1.03376,0.536269,1.0,0.386568,1.357845,0.34
2,3.5,20,Congress,0.33,0.33,2.173052,0.941176,1.183147,0.754315,1.0,0.344442,0.47896,0.34
3,0.5,20,Bitcoin,0.33,0.33,1.116095,1.117647,1.132372,1.001848,0.821429,2.456018,0.453396,0.34
4,2.0,20,Bitcoin,0.33,0.33,1.257504,1.065657,0.660928,0.871053,1.5,1.610677,0.878209,0.34
5,3.5,20,Bitcoin,0.33,0.33,1.164632,1.283019,0.534974,0.867857,1.266667,2.302993,0.98697,0.34
6,0.5,20,Facebook,0.33,0.33,1.942389,1.204819,0.638106,0.523808,1.1,0.637567,0.699607,0.34
7,3.5,20,Facebook,0.33,0.33,1.226663,1.457711,0.14456,1.217666,0.75,0.420687,0.656018,0.34
8,0.5,20,Chamelon,0.33,0.33,1.513857,1.323529,0.85891,0.940869,0.931034,0.293965,0.723458,0.34
9,2.0,20,Chamelon,0.33,0.33,1.025071,0.963158,1.761574,0.980481,0.782609,0.581586,1.071381,0.34


In [264]:
invert = ['evc_MAE','deg_kl', 'diam_rel', 'cc_rel', 'mod_rel'] 
changes[invert] = 1.0-changes[invert]
changes[['nmi','evc_overlap']] = changes[['nmi','evc_overlap']] - 1.0

In [265]:
changes

Unnamed: 0,eps,N,dataset,e1,e2,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel,e3
0,0.5,20,Congress,0.33,0.33,0.329734,-0.5,-0.380002,0.010058,,0.485661,0.041065,0.34
1,2.0,20,Congress,0.33,0.33,0.803552,0.0625,-0.03376,0.463731,0.0,0.613432,-0.357845,0.34
2,3.5,20,Congress,0.33,0.33,1.173052,-0.058824,-0.183147,0.245685,0.0,0.655558,0.52104,0.34
3,0.5,20,Bitcoin,0.33,0.33,0.116095,0.117647,-0.132372,-0.001848,0.1785714,-1.456018,0.546604,0.34
4,2.0,20,Bitcoin,0.33,0.33,0.257504,0.065657,0.339072,0.128947,-0.5,-0.610677,0.121791,0.34
5,3.5,20,Bitcoin,0.33,0.33,0.164632,0.283019,0.465026,0.132143,-0.2666667,-1.302993,0.01303,0.34
6,0.5,20,Facebook,0.33,0.33,0.942389,0.204819,0.361894,0.476192,-0.1,0.362433,0.300393,0.34
7,3.5,20,Facebook,0.33,0.33,0.226663,0.457711,0.85544,-0.217666,0.25,0.579313,0.343982,0.34
8,0.5,20,Chamelon,0.33,0.33,0.513857,0.323529,0.14109,0.059131,0.06896552,0.706035,0.276542,0.34
9,2.0,20,Chamelon,0.33,0.33,0.025071,-0.036842,-0.761574,0.019519,0.2173913,0.418414,-0.071381,0.34


In [266]:
# mean of the mean
changes[changes['dataset'] == 'Enron'][metrics].mean().mean()

0.06401605527025465

In [268]:
# mean of the mean
changes[metrics].mean(skipna = True).mean()


0.1100360688249118

In [255]:
changes.describe()

Unnamed: 0,eps,e1,e2,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel,e3
count,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0
mean,2.0,0.33,0.33,0.38407,0.115405,0.031665,0.101664,-inf,0.014311,0.118818,0.34
std,1.369306,0.0,0.0,0.365851,0.275283,0.42165,0.203112,,0.739345,0.269633,0.0
min,0.5,0.33,0.33,0.025071,-0.5,-0.761574,-0.217666,-inf,-1.456018,-0.357845,0.34
25%,0.5,0.33,0.33,0.118109,-0.036842,-0.214465,-0.000299,-0.1,-0.363879,-0.071381,0.34
50%,2.0,0.33,0.33,0.251684,0.075804,-0.03376,0.059131,0.0,0.362433,0.092566,0.34
75%,3.5,0.33,0.33,0.513857,0.283019,0.339072,0.147562,0.178571,0.579313,0.300393,0.34
max,3.5,0.33,0.33,1.173052,0.596154,0.85544,0.476192,0.257143,0.706035,0.546604,0.34


In [256]:
changes[changes['dataset'] == 'Bitcoin'][metrics].mean()

nmi            0.179410
evc_overlap    0.155441
evc_MAE        0.223909
deg_kl         0.086414
diam_rel      -0.196032
cc_rel        -1.123229
mod_rel        0.227141
dtype: float64

In [257]:
changes[changes['dataset'] == 'Chamelon'][metrics].mean().mean()

0.07231040183354767

In [258]:
changes[changes['eps'] == 0.5][metrics].mean().mean()

-inf

In [259]:
changes[changes['eps'] == 2.0][metrics].mean().mean()

0.055452504902368784

In [260]:
changes[changes['eps'] == 3.5][metrics].mean().mean()

0.11300856906384858

In [261]:
changes[metrics].mean().mean()

-inf