In [8]:
import os
import pandas as pd
from local_python.general_utils import load_pd_from_json, number_to_string

In [9]:
metric_name = "f1_macro"
column_names_groupby = [
    "architecture",
    "downstream",
    "pre-training",
    "number_of_samples",
    "model_name",
]
aggregating_functions = ["mean", "std", "count"]
path_metrics = "../runs/"
path_f1_scores_grouped = "../results/f1_scores.csv"

In [10]:
def get_downstream(value):
    return get_feature_identifier_part(value, 0).replace("_", "-")


def get_architecture(value):
    return get_feature_identifier_part(value, 1)


def get_pretraining(value):
    return get_feature_identifier_part(value, 2)


def get_feature_identifier_part(value, idx):
    return os.path.splitext(os.path.basename(value))[0].split("-")[idx]

In [12]:
df_master = pd.DataFrame()

for name in os.listdir(path_metrics):
    if name.endswith(".txt") and "metrics" in name:
        metric_file_path = os.path.join(path_metrics, name)
        df_full = load_pd_from_json(metric_file_path)
        df_full["number_of_samples"] = df_full["number_of_samples"].apply(
            number_to_string, "All"
        )
        df_full["architecture"] = df_full["feature_identifier"].apply(get_architecture)
        df_full["downstream"] = df_full["feature_identifier"].apply(get_downstream)
        df_full["pre-training"] = df_full["feature_identifier"].apply(get_pretraining)
        df_groups = df_full.groupby(column_names_groupby).agg(
            {metric_name: aggregating_functions}
        )
        df_master = pd.concat(
            [df_master, df_groups.reset_index()], ignore_index=True, axis=0
        )
print(f"{len(df_master)} rows in master file")

Read 7500 entries from Cassava-ResNet50-metrics.txt
Read 9000 entries from Cassava-ViT_T16-student-metrics.txt
Read 7500 entries from DDI-ResNet50-metrics.txt
Read 9000 entries from DDI-ViT_T16-student-metrics.txt
Read 7500 entries from Fitzpatrick17k-ResNet50-metrics.txt
Read 9000 entries from Fitzpatrick17k-ViT_T16-student-metrics.txt
Read 7500 entries from HAM10000-ResNet50-metrics.txt
Read 9000 entries from HAM10000-ViT_T16-student-metrics.txt
Read 3520 entries from master-metrics.txt
Read 6000 entries from PAD_UFES_20-ResNet50-metrics.txt
Read 7200 entries from PAD_UFES_20-ViT_T16-student-metrics.txt
Read 6000 entries from PlantDataset-ResNet50-metrics.txt
Read 7200 entries from PlantDataset-ViT_T16-student-metrics.txt
Read 6000 entries from PlantDoc-ResNet50-metrics.txt
Read 7200 entries from PlantDoc-ViT_T16-student-metrics.txt
Read 7500 entries from PlantVillage-ResNet50-metrics.txt
Read 9000 entries from PlantVillage-ViT_T16-student-metrics.txt
1485 rows in master file


In [13]:
for column_name in df_master.columns:
    unique_values = df_master[column_name].unique()
    if 20 < len(unique_values):
        print(f"{column_name} has {len(unique_values)} unique values")
    else:
        print(f"{column_name}: {unique_values}")

('architecture', ''): ['ResNet50' 'ViT_T16']
('downstream', ''): ['Cassava' 'DDI' 'Fitzpatrick17k' 'HAM10000' 'PAD-UFES-20' 'PlantDataset'
 'PlantDoc' 'PlantVillage']
('pre-training', ''): ['Derma_SSL_SimCLR' 'ImageNet_1k_SL_V1' 'ImageNet_1k_SSL_SimCLR' 'PDDD'
 'Random' 'Derma' 'ImageNet_1k_SL_WinKawaks' 'ImageNet_1k_SSL_Dino'
 'ImageNet_AugReg' 'Plant']
('number_of_samples', ''): ['1' '10' '100' '3' '30' 'None']
('model_name', ''): ['dc' 'knn' 'lr']
('f1_macro', 'mean') has 1004 unique values
('f1_macro', 'std') has 989 unique values
('f1_macro', 'count'): [100  10  20]


In [14]:
df_master.columns = df_master.columns.map(lambda col: "_".join(col).strip("_"))
df_master

Unnamed: 0,architecture,downstream,pre-training,number_of_samples,model_name,f1_macro_mean,f1_macro_std,f1_macro_count
0,ResNet50,Cassava,Derma_SSL_SimCLR,1,dc,0.019217,0.000000,100
1,ResNet50,Cassava,Derma_SSL_SimCLR,1,knn,0.184686,0.035744,100
2,ResNet50,Cassava,Derma_SSL_SimCLR,1,lr,0.163515,0.045919,100
3,ResNet50,Cassava,Derma_SSL_SimCLR,10,dc,0.019217,0.000000,100
4,ResNet50,Cassava,Derma_SSL_SimCLR,10,knn,0.208679,0.018469,100
...,...,...,...,...,...,...,...,...
1480,ViT_T16,PlantVillage,Random,3,knn,0.128046,0.010907,100
1481,ViT_T16,PlantVillage,Random,3,lr,0.175155,0.013696,100
1482,ViT_T16,PlantVillage,Random,30,dc,0.000604,0.000000,100
1483,ViT_T16,PlantVillage,Random,30,knn,0.205410,0.007056,100


In [15]:
df_master.to_csv(path_f1_scores_grouped)