In [132]:
import matplotlib.pyplot as plt
import pandas as pd
import os
import re
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import sys
sys.path.append('../')
from utils import get_mat
import networkx as nx
from numpy.random import laplace

In [96]:
directory_path = '../baselines'

# Get a list of all CSV files in the directory
csv_files = [file for file in os.listdir(directory_path) if file.endswith('.csv')]

# Initialize an empty list to store individual DataFrames
baseline = []

# Loop through each CSV file and read it into a DataFrame
for file in csv_files:
    file_path = os.path.join(directory_path, file)
    df = pd.read_csv(file_path)
    
    # Define the pattern
    pattern = re.compile(r'([\w-]+)_(\d+)_(\d+\.\d)_(\d+\.\d{2})_(\d+\.\d{2})_(\d+)\.csv')

    # Use the pattern to extract values
    match = pattern.match(file)

    # Check if the string matches the pattern
    if match:
        # Extract values from the match object
        name, N, t, e1, e2, exp = match.groups()
        e1 = float(e1)
        e2 = float(e2)
        # skip incomplete datasets
        df['dataset'] = name
        df['N'] = N
        df['e1'] = e1
        df['e2'] = e2
        df['e3'] = 1.0 - e1 - e2
    else:
        print(file + " doesn't match!")
        continue
    
    
    baseline.append(df)

# Concatenate the list of DataFrames into a single DataFrame
baseline = pd.concat(baseline, ignore_index=True)

# take the average of all experiments
baseline = baseline.groupby(['eps', 'N', 'dataset', 'e1', 'e2']).mean().reset_index()
baseline = baseline.drop('exper', axis=1)

baseline

Unnamed: 0,eps,N,dataset,e1,e2,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel,e3
0,0.5,20,Bitcoin,0.33,0.33,0.132971,0.045946,0.031959,1.238628,0.28,0.345366,0.15444,0.34
1,0.5,20,CA-HepPh,0.33,0.33,0.13133,0.115,0.028148,1.505061,0.530769,0.982799,0.677937,0.34
2,0.5,20,Chamelon,0.33,0.33,0.112688,0.154545,0.023305,2.359417,0.527273,0.847022,0.599987,0.34
3,0.5,20,Congress,0.33,0.33,0.067174,0.1,0.023021,4.101716,0.0,0.469515,0.290453,0.34
4,0.5,20,Enron,0.33,0.33,0.111488,0.030952,0.017772,0.893006,0.430769,0.925084,0.592444,0.34
5,1.0,20,Bitcoin,0.33,0.33,0.13022,0.156757,0.023377,1.270408,0.27,0.611684,0.141137,0.34
6,1.0,20,Chamelon,0.33,0.33,0.177857,0.6,0.006136,1.673495,0.481818,0.186871,0.354664,0.34
7,1.0,20,Congress,0.33,0.33,0.049764,0.175,0.021643,4.363562,0.05,0.533857,0.360186,0.34
8,1.5,20,Bitcoin,0.33,0.33,0.124946,0.327027,0.021545,1.335093,0.24,0.633558,0.105724,0.34
9,1.5,20,Chamelon,0.33,0.33,0.233951,0.822727,0.002665,1.304295,0.427273,0.121476,0.248759,0.34


In [97]:
directory_path = '../our_params'

# Get a list of all CSV files in the directory
csv_files = [file for file in os.listdir(directory_path) if file.endswith('.csv')]

# Initialize an empty list to store individual DataFrames
dfs = []

# Loop through each CSV file and read it into a DataFrame
for file in csv_files:
    file_path = os.path.join(directory_path, file)
    df = pd.read_csv(file_path)
    
    # Define the pattern
    pattern = re.compile(r'([\w-]+)_(\d+\.\d+)_(\d+\.\d+)_(\d+\.\d+)_(\d+)\.csv')
    pattern2 = re.compile(r'([\w-]+)_(\d+)_(\d+\.\d)_(\d+\.\d{2})_(\d+\.\d{2})_(\d+)\.csv')

    # Use the pattern to extract values
    match = pattern.match(file)
    match2 = pattern2.match(file)

    # Check if the string matches the pattern
    if match2:
        name, N, t, e1, e2, exp = match2.groups()
        e1 = float(e1)
        e2 = float(e2)
        df['dataset'] = name
        df['N'] = N
        df['e1'] = e1
        df['e2'] = e2
        df['e3'] = 1.0 - e1 - e2
    
    elif match:
        # Extract values from the match object
        name = match.group(1)
        # skip incomplete datasets
        df['dataset'] = name

        e1 = float(match.group(3))
        df['e1'] = e1
        e2 = float(match.group(4))
        df['e2'] = e2
        df['e3'] = 1.0 - e1 - e2
    
  
    else:
        print(file + " doesn't match!")
        continue
    
    
    dfs.append(df)

# Concatenate the list of DataFrames into a single DataFrame
dfs = pd.concat(dfs, ignore_index=True)

# take the average of all experiments
dfs = dfs.groupby(['eps', 'N', 'dataset', 'e1', 'e2']).mean().reset_index()
dfs = dfs.drop('exper', axis=1)

dfs

Unnamed: 0,eps,N,dataset,e1,e2,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel,e3
0,0.49,55.0,Bitcoin,0.1,0.7,0.148409,0.051351,0.036189,1.240916,0.23,0.848224,0.070023,0.2
1,0.49,30.0,Facebook,0.5,0.4,0.188861,0.25,0.014942,1.12928,0.4125,0.577457,0.488736,0.1
2,0.49,50.0,Enron,0.1,0.7,0.124655,0.049405,0.021584,0.761232,0.453846,0.730503,0.537604,0.2
3,1.96,35.0,Bitcoin,0.3,0.6,0.175153,0.57027,0.014031,1.037409,0.24,0.765557,0.110991,0.1
4,1.96,15.0,Facebook,0.6,0.3,0.220393,0.655,0.003902,0.375575,0.2625,0.501964,0.294857,0.1
5,3.43,30.0,Bitcoin,0.3,0.6,0.175561,0.735135,0.010327,1.003181,0.19,0.515809,0.059981,0.1
6,3.43,5.0,Facebook,0.7,0.2,0.282119,0.7325,0.001154,0.370045,0.1875,0.245302,0.209613,0.1


In [98]:
reduced_eps = [0.49, 1.96, 3.43]
metrics = ['nmi','evc_overlap','evc_MAE','deg_kl', 'diam_rel','cc_rel','mod_rel']
datasets = list(set(dfs['dataset'].unique()) & set(baseline['dataset'].unique()))

eps = [0.5, 2.0, 3.5]
datasets

['Enron', 'Bitcoin']

In [99]:
baseline.loc[(baseline['dataset'] == 'Facebook') ]

Unnamed: 0,eps,N,dataset,e1,e2,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel,e3


In [131]:
changes = pd.DataFrame()
for dataset in datasets:
    for i in range(len(eps)):
        print(dataset, eps[i], reduced_eps[i])
        b = baseline.loc[(baseline['dataset'] == dataset) & (baseline['eps'] == eps[i])][metrics]
        o = dfs.loc[(dfs['dataset'] == dataset) & (dfs['eps'] == reduced_eps[i])][metrics]
        if (len(o)+ len(b) != 2):
            continue
        # display(b)
        # display(o)
        # display(o.iloc[0].div(b.iloc[0]))
        
        row = baseline[(baseline['dataset'] == dataset) & (baseline['eps'] == eps[i])].copy()
        row[metrics] = o.iloc[0].div(b.iloc[0])
        # display(row)
        changes = changes.append(row, ignore_index=True)
changes    
    

Enron 0.5 0.49
Enron 2.0 1.96
Enron 3.5 3.43
Bitcoin 0.5 0.49
Bitcoin 2.0 1.96
Bitcoin 3.5 3.43


Unnamed: 0,eps,N,dataset,e1,e2,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel,e3
0,0.5,20,Enron,0.33,0.33,0.785179,0.88907,0.797429,0.343047,6.545471,1.290157,183.150278,0.34
1,0.5,20,Bitcoin,0.33,0.33,0.934796,0.9241,1.337039,0.559215,3.317112,1.498066,23.855194,0.34
2,2.0,20,Bitcoin,0.33,0.33,1.103255,10.262369,0.51839,0.467505,3.461334,1.352067,37.812329,0.34
3,3.5,20,Bitcoin,0.33,0.33,1.105822,13.229215,0.38155,0.45208,2.740223,0.910981,20.434394,0.34


In [129]:
# mean of the mean
changes[metrics].mean().mean()

11.444916663896018

In [130]:
changes[metrics].describe()

Unnamed: 0,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel
count,4.0,4.0,4.0,4.0,4.0,4.0,4.0
mean,0.982263,6.326188,0.758602,0.455462,4.016035,1.262818,66.313049
std,0.153841,6.37417,0.422676,0.088616,1.714831,0.250229,78.253327
min,0.785179,0.88907,0.38155,0.343047,2.740223,0.910981,20.434394
25%,0.897392,0.915342,0.48418,0.424822,3.172889,1.195363,22.999994
50%,1.019025,5.593234,0.65791,0.459793,3.389223,1.321112,30.833762
75%,1.103896,11.004081,0.932331,0.490433,4.232368,1.388567,74.146816
max,1.105822,13.229215,1.337039,0.559215,6.545471,1.498066,183.150278
