In [47]:
import matplotlib.pyplot as plt
import pandas as pd
import os
import re
from scipy.stats import zscore


In [57]:
directory_path = '../our_results'

# Get a list of all CSV files in the directory
csv_files = [file for file in os.listdir(directory_path) if file.endswith('.csv')]

# Initialize an empty list to store individual DataFrames
dfs = []

# Loop through each CSV file and read it into a DataFrame
for file in csv_files:
    file_path = os.path.join(directory_path, file)
    df = pd.read_csv(file_path)
    
    # Define the pattern
    pattern = re.compile(r'([\w-]+)_(\d+\.\d+)_(\d+\.\d+)_(\d+\.\d+)_(\d+)\.csv')

    # Use the pattern to extract values
    match = pattern.match(file)

    # Check if the string matches the pattern
    if match:
        # Extract values from the match object
        df['dataset'] = match.group(1)
        e1 = float(match.group(3))
        df['e1'] = e1
        e2 = float(match.group(4))
        df['e2'] = e2
        df['e3'] = 1.0 - e1 - e2
    else:
        print(file + " doesn't match!")
        continue
        
    dfs.append(df)

# Concatenate the list of DataFrames into a single DataFrame
dfs = pd.concat(dfs, ignore_index=True)

# Display the concatenated DataFrame
dfs


Unnamed: 0,eps,exper,N,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel,dataset,e1,e2,e3
0,3.5,0.0,5.0,0.331420,0.863636,0.000962,1.073651,0.545455,0.023995,0.046424,Chamelon,0.1,0.8,0.1
1,3.5,1.0,5.0,0.311205,0.681818,0.004607,0.847193,0.454545,0.203141,0.267037,Chamelon,0.1,0.8,0.1
2,3.5,2.0,5.0,0.314336,0.818182,0.003958,1.099951,0.363636,0.163873,0.290641,Chamelon,0.1,0.8,0.1
3,3.5,3.0,5.0,0.293502,0.636364,0.007585,0.994947,0.454545,0.237213,0.264279,Chamelon,0.1,0.8,0.1
4,3.5,4.0,5.0,0.348520,0.863636,0.001869,1.473699,0.272727,0.103795,0.157197,Chamelon,0.1,0.8,0.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11615,0.5,5.0,35.0,0.134843,0.366667,0.032736,1.466145,0.538462,0.943421,0.636785,CA-HepPh,0.1,0.5,0.4
11616,0.5,6.0,35.0,0.151100,0.408333,0.028070,1.100746,0.538462,0.899806,0.569705,CA-HepPh,0.1,0.5,0.4
11617,0.5,7.0,35.0,0.147346,0.558333,0.022033,1.353301,0.538462,0.800710,0.545442,CA-HepPh,0.1,0.5,0.4
11618,0.5,8.0,35.0,0.169561,0.600000,0.013512,1.360518,0.538462,0.638606,0.524008,CA-HepPh,0.1,0.5,0.4


In [58]:
# take the average of all experiments
dfs = dfs.groupby(['eps', 'N', 'dataset', 'e1', 'e2']).mean().reset_index()
dfs = dfs.drop('exper', axis=1)
dfs

Unnamed: 0,eps,N,dataset,e1,e2,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel,e3
0,0.5,5.0,CA-HepPh,0.1,0.1,0.107244,0.014167,0.023399,3.981196,0.384615,0.999407,0.818220,0.8
1,0.5,5.0,CA-HepPh,0.1,0.2,0.110734,0.012500,0.024522,4.259935,0.438462,0.999385,0.813684,0.7
2,0.5,5.0,CA-HepPh,0.1,0.3,0.109254,0.008333,0.013324,3.505275,0.446154,0.998994,0.810664,0.6
3,0.5,5.0,CA-HepPh,0.1,0.4,0.111198,0.013333,0.030615,4.025057,0.453846,0.999141,0.813282,0.5
4,0.5,5.0,CA-HepPh,0.1,0.5,0.107882,0.010833,0.023944,3.901634,0.446154,0.999174,0.815921,0.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1157,3.5,35.0,Facebook,0.4,0.2,0.200293,0.677500,0.006733,0.343749,0.200000,0.582618,0.347632,0.4
1158,3.5,35.0,Facebook,0.4,0.3,0.222160,0.727500,0.006170,0.329198,0.212500,0.537673,0.311460,0.3
1159,3.5,35.0,Facebook,0.5,0.1,0.154159,0.712500,0.005143,0.451522,0.212500,0.525142,0.410727,0.4
1160,3.5,35.0,Facebook,0.5,0.3,0.207987,0.665000,0.008607,0.363541,0.225000,0.605318,0.335231,0.2


In [59]:
# apply Z-Score Normalization for the metrics at the dataset level
datasets = dfs['dataset'].unique()
metrics = cols = ['nmi','evc_overlap','evc_MAE','deg_kl', 'diam_rel','cc_rel','mod_rel']
for dataset in datasets:
    dfs.loc[dfs['dataset'] == dataset, metrics] = dfs.loc[dfs['dataset'] == dataset, metrics].apply(zscore)

dfs

Unnamed: 0,eps,N,dataset,e1,e2,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel,e3
0,0.5,5.0,CA-HepPh,0.1,0.1,-1.478023,-0.821154,-0.721442,1.878104,-3.398509,0.542830,1.405300,0.8
1,0.5,5.0,CA-HepPh,0.1,0.2,-1.192883,-0.830602,-0.495401,2.198478,-1.648663,0.542588,1.348403,0.7
2,0.5,5.0,CA-HepPh,0.1,0.3,-1.313792,-0.854221,-2.750757,1.331095,-1.398686,0.538132,1.310513,0.6
3,0.5,5.0,CA-HepPh,0.1,0.4,-1.154940,-0.825878,0.731965,1.928516,-1.148708,0.539806,1.343358,0.5
4,0.5,5.0,CA-HepPh,0.1,0.5,-1.425881,-0.840050,-0.611755,1.786658,-1.398686,0.540177,1.376464,0.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1157,3.5,35.0,Facebook,0.4,0.2,-0.171250,0.046459,0.171621,-0.309275,-0.701880,0.335364,-0.092440,0.4
1158,3.5,35.0,Facebook,0.4,0.3,0.246658,0.520156,0.042339,-0.329880,-0.455274,-0.082073,-0.447621,0.3
1159,3.5,35.0,Facebook,0.5,0.1,-1.052911,0.378047,-0.193111,-0.156661,-0.455274,-0.198456,0.527104,0.4
1160,3.5,35.0,Facebook,0.5,0.3,-0.024201,-0.071966,0.601362,-0.281247,-0.208667,0.546186,-0.214212,0.2


In [60]:
# invert certain metrics that are better when the values are smaller
for col in ['evc_MAE','deg_kl', 'diam_rel','cc_rel','mod_rel']:
    dfs[col] = -dfs[col]

dfs

Unnamed: 0,eps,N,dataset,e1,e2,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel,e3
0,0.5,5.0,CA-HepPh,0.1,0.1,-1.478023,-0.821154,0.721442,-1.878104,3.398509,-0.542830,-1.405300,0.8
1,0.5,5.0,CA-HepPh,0.1,0.2,-1.192883,-0.830602,0.495401,-2.198478,1.648663,-0.542588,-1.348403,0.7
2,0.5,5.0,CA-HepPh,0.1,0.3,-1.313792,-0.854221,2.750757,-1.331095,1.398686,-0.538132,-1.310513,0.6
3,0.5,5.0,CA-HepPh,0.1,0.4,-1.154940,-0.825878,-0.731965,-1.928516,1.148708,-0.539806,-1.343358,0.5
4,0.5,5.0,CA-HepPh,0.1,0.5,-1.425881,-0.840050,0.611755,-1.786658,1.398686,-0.540177,-1.376464,0.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1157,3.5,35.0,Facebook,0.4,0.2,-0.171250,0.046459,-0.171621,0.309275,0.701880,-0.335364,0.092440,0.4
1158,3.5,35.0,Facebook,0.4,0.3,0.246658,0.520156,-0.042339,0.329880,0.455274,0.082073,0.447621,0.3
1159,3.5,35.0,Facebook,0.5,0.1,-1.052911,0.378047,0.193111,0.156661,0.455274,0.198456,-0.527104,0.4
1160,3.5,35.0,Facebook,0.5,0.3,-0.024201,-0.071966,-0.601362,0.281247,0.208667,-0.546186,0.214212,0.2


In [61]:
# calculate a score based on normalized metrics. The larger the score the better.
dfs['score'] = dfs[metrics].sum(axis=1)
dfs

Unnamed: 0,eps,N,dataset,e1,e2,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel,e3,score
0,0.5,5.0,CA-HepPh,0.1,0.1,-1.478023,-0.821154,0.721442,-1.878104,3.398509,-0.542830,-1.405300,0.8,-2.005461
1,0.5,5.0,CA-HepPh,0.1,0.2,-1.192883,-0.830602,0.495401,-2.198478,1.648663,-0.542588,-1.348403,0.7,-3.968889
2,0.5,5.0,CA-HepPh,0.1,0.3,-1.313792,-0.854221,2.750757,-1.331095,1.398686,-0.538132,-1.310513,0.6,-1.198311
3,0.5,5.0,CA-HepPh,0.1,0.4,-1.154940,-0.825878,-0.731965,-1.928516,1.148708,-0.539806,-1.343358,0.5,-5.375754
4,0.5,5.0,CA-HepPh,0.1,0.5,-1.425881,-0.840050,0.611755,-1.786658,1.398686,-0.540177,-1.376464,0.4,-3.958789
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1157,3.5,35.0,Facebook,0.4,0.2,-0.171250,0.046459,-0.171621,0.309275,0.701880,-0.335364,0.092440,0.4,0.471818
1158,3.5,35.0,Facebook,0.4,0.3,0.246658,0.520156,-0.042339,0.329880,0.455274,0.082073,0.447621,0.3,2.039323
1159,3.5,35.0,Facebook,0.5,0.1,-1.052911,0.378047,0.193111,0.156661,0.455274,0.198456,-0.527104,0.4,-0.198466
1160,3.5,35.0,Facebook,0.5,0.3,-0.024201,-0.071966,-0.601362,0.281247,0.208667,-0.546186,0.214212,0.2,-0.539587


In [63]:
# top 10 combos for Chamelon
dfs[dfs['dataset'] == "Chamelon"].nlargest(10, "score")
# for dataset in datasets:
#     print(dfs[dfs['dataset'] == dataset].nlargest(10, "score"))

Unnamed: 0,eps,N,dataset,e1,e2,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel,e3,score
803,3.5,10.0,Chamelon,0.3,0.5,2.005815,0.99868,0.794761,0.800869,2.170939,0.973776,1.263456,0.2,9.008297
865,3.5,15.0,Chamelon,0.3,0.4,1.740759,0.917874,0.820162,0.89043,2.280941,1.007604,1.289859,0.3,8.947628
813,3.5,10.0,Chamelon,0.5,0.4,1.82882,0.917874,0.791168,0.78814,2.170939,1.038448,1.376039,0.1,8.911427
809,3.5,10.0,Chamelon,0.4,0.5,1.595651,0.971745,0.755986,0.726771,2.500944,0.987041,1.316437,0.1,8.854576
942,3.5,20.0,Chamelon,0.6,0.3,1.348305,0.931341,0.826604,0.858485,2.280941,0.938734,1.265978,0.1,8.450388
749,3.5,5.0,Chamelon,0.5,0.3,1.760637,0.985212,0.82911,0.765067,1.840933,0.892002,1.350714,0.2,8.423675
804,3.5,10.0,Chamelon,0.3,0.6,1.896877,0.985212,0.754895,0.769119,1.840933,0.934993,1.211558,0.1,8.393588
808,3.5,10.0,Chamelon,0.4,0.4,1.534968,0.971745,0.797162,0.780896,2.060937,1.006799,1.241076,0.2,8.393582
876,3.5,15.0,Chamelon,0.5,0.4,1.369798,0.864003,0.74779,0.762788,2.390943,0.973364,1.271776,0.1,8.380462
816,3.5,10.0,Chamelon,0.6,0.3,1.533758,0.917874,0.771378,0.797423,2.060937,0.98476,1.251988,0.1,8.318118


In [64]:
# top 10 combos for Facebook
dfs[dfs['dataset'] == "Facebook"].nlargest(10, "score")

Unnamed: 0,eps,N,dataset,e1,e2,nmi,evc_overlap,evc_MAE,deg_kl,diam_rel,cc_rel,mod_rel,e3,score
782,3.5,5.0,Facebook,0.5,0.3,1.885185,0.899113,1.241515,0.211817,2.674733,3.033929,1.482169,0.2,11.428461
774,3.5,5.0,Facebook,0.3,0.3,1.76044,1.301756,1.118598,0.324561,-0.284546,2.154077,1.582684,0.4,7.957571
780,3.5,5.0,Facebook,0.4,0.3,1.689582,1.159647,1.219254,0.307417,-0.777759,2.45448,1.677378,0.3,7.729998
840,3.5,10.0,Facebook,0.3,0.6,1.508773,0.520156,0.895865,0.364139,2.18152,0.634966,1.405606,0.1,7.511025
776,3.5,5.0,Facebook,0.3,0.5,2.090015,0.567526,1.132868,0.324545,0.208667,1.624354,1.527218,0.2,7.475192
779,3.5,5.0,Facebook,0.4,0.2,1.226742,1.041223,1.211724,0.298068,-0.531153,2.717559,1.422191,0.4,7.386354
783,3.5,5.0,Facebook,0.7,0.1,0.154741,1.064907,1.180154,0.227935,-0.037939,2.9974,1.20989,0.2,6.797089
777,3.5,5.0,Facebook,0.3,0.6,2.343166,0.472786,1.007854,0.383833,-0.531153,1.076793,1.621767,0.1,6.375046
773,3.5,5.0,Facebook,0.3,0.2,0.912864,1.017538,1.112016,0.282607,-0.777759,2.120205,1.07265,0.5,5.74012
775,3.5,5.0,Facebook,0.3,0.4,1.590573,0.68595,0.507931,0.311949,-0.037939,1.197211,1.307174,0.3,5.562849
