In [1]:
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import collections
from collections import Counter

In [2]:
def collect_metrics(root_dir, depth=2):
    """Gather the metrics stored in ``xai_metrics.json`` files under ``root_dir``.

    Only directories fewer than ``depth`` levels below ``root_dir`` are
    inspected; with the default of 2 that is ``root_dir`` itself and its
    direct children. The level is computed from the path relative to
    ``root_dir``, so a trailing separator on ``root_dir`` does not change
    which directories are considered.

    The name of the directory holding a metrics file is split on ``"_"``:
    the first piece selects the model (``"roberta"``, ``"camem"`` or
    ``"xlm"``; anything else is ignored) and the remaining pieces, joined by
    spaces, form the dataset name.

    Args:
        root_dir: Directory to scan.
        depth: Number of directory levels (counting ``root_dir`` as level 0)
            that are inspected.

    Returns:
        A tuple ``(roberta, camem, xlm)`` of dicts mapping dataset name to a
        dict with the keys ``"log odd"``, ``"anti log odd"``, ``"comp"`` and
        ``"suff"``.

    Raises:
        KeyError: If a metrics file lacks ``log_odd``, ``anti_log_odd``,
            ``comp`` or ``suff``.
    """
    results_by_model = {"roberta": {}, "camem": {}, "xlm": {}}

    for subdir, dirs, files in os.walk(root_dir):
        rel_path = os.path.relpath(subdir, root_dir)
        level = 0 if rel_path == os.curdir else rel_path.count(os.sep) + 1

        # Children of this directory would sit at level + 1; stop os.walk from
        # descending into them when they are beyond the requested depth.
        if level + 1 >= depth:
            dirs[:] = []

        if level >= depth or "xai_metrics.json" not in files:
            continue

        with open(os.path.join(subdir, "xai_metrics.json"), encoding="utf-8") as f:
            raw = json.load(f)

        # Read every metric before dispatching so that a malformed file is
        # reported even when its directory belongs to an unknown model.
        entry = {
            "log odd": raw["log_odd"],
            "anti log odd": raw["anti_log_odd"],
            "comp": raw["comp"],
            "suff": raw["suff"],
        }

        model_type, *name_parts = os.path.basename(subdir).split("_")
        target = results_by_model.get(model_type)
        if target is not None:
            target[" ".join(name_parts)] = entry

    return (
        results_by_model["roberta"],
        results_by_model["camem"],
        results_by_model["xlm"],
    )

def _average_dicts(dicts):
    """Average a sequence of dicts key by key.

    Values are summed per key over all dicts and every sum is divided by the
    total number of dicts (including dicts that do not contain the key).

    Args:
        dicts: Iterable of mappings with numeric values.

    Returns:
        A ``defaultdict`` (default value 0) mapping each key to its average.
        It is empty when ``dicts`` yields nothing.
    """
    totals = collections.defaultdict(lambda: 0)

    n_dicts = 0
    for entry in dicts:
        n_dicts += 1
        for key, value in entry.items():
            totals[key] += value

    # Only values are rewritten, so iterating over the keys stays safe.
    for key in totals:
        totals[key] /= n_dicts

    return totals

def average_over_users(res):
    """Average the metric dicts in ``res`` per group, separately per language.

    Entries whose name contains ``"nf"`` are skipped. Of the remaining ones,
    a name containing ``"en"`` goes to the English set, otherwise a name
    containing ``"it"`` goes to the Italian set. Within each set, entries are
    grouped by the first two space-separated tokens of their name and every
    group is averaged with ``_average_dicts``.

    Args:
        res: Mapping from entry name to a dict of metric values.

    Returns:
        A tuple ``(result_en, result_it)`` of dicts keyed by
        ``"avg en <group>"`` and ``"avg it <group>"`` respectively.
    """
    groups_en = collections.defaultdict(list)
    groups_it = collections.defaultdict(list)

    for name, metrics in res.items():
        if "nf" in name:
            continue
        if "en" in name:
            bucket = groups_en
        elif "it" in name:
            bucket = groups_it
        else:
            continue
        group_key = " ".join(name.split(" ")[:2])
        bucket[group_key].append(metrics)

    result_en = {
        "avg en " + group_key: _average_dicts(members)
        for group_key, members in groups_en.items()
    }
    result_it = {
        "avg it " + group_key: _average_dicts(members)
        for group_key, members in groups_it.items()
    }

    return result_en, result_it

def process_to_present(res, column_width="1.1cm", plot_transpose=True):
    """Print ``res`` as a LaTeX table.

    ``res`` is turned into a DataFrame and transposed; when
    ``plot_transpose`` is false it is transposed back again. Values are
    formatted with a precision of 2 and missing entries are shown as
    ``MISS``.

    Args:
        res: Dict of dicts to tabulate.
        column_width: Width used for every ``p{...}`` column of the table.
        plot_transpose: Whether to keep the transposed layout.

    Returns:
        None. The LaTeX source is written to standard output.
    """
    table = pd.DataFrame.from_dict(res).T
    table = table if plot_transpose else table.T

    # One left-aligned index column followed by a fixed-width column per field.
    column_format = "l" + f"p{{{column_width}}}" * len(table.columns)

    styler = table.style.format(na_rep='MISS', precision=2)
    print(styler.to_latex(column_format=column_format))

In [3]:
# Load the per-model metric dicts (roberta, camem, xlm) from the run's output directory.
dict_res_roberta, dict_res_camem, dict_res_xlm = collect_metrics("../output/sst2_2")

## ROBERTA

In [4]:
# Keep only the English averages (first return value) and append them to the
# individual RoBERTa entries; the Italian averages are discarded.
avg, _ = average_over_users(dict_res_roberta)
roberta_results = {**dict_res_roberta, ** avg}

In [5]:
# Show the results with one row per entry and one column per metric.
pd.DataFrame(roberta_results).T

Unnamed: 0,log odd,anti log odd,comp,suff
np f en6,-0.418355,-0.389205,0.202144,0.104042
p f en98,-0.84677,-0.455843,0.319497,0.21383
p f en6,-0.826747,-0.552587,0.32163,0.242892
p f en83,-0.782659,-0.668574,0.30144,0.258438
np f en57,-0.407685,-1.512118,0.303476,0.220999
p f en49,-0.821754,-0.487702,0.306726,0.264014
p f en57,-0.83351,-0.525586,0.320067,0.236585
np f en83,-0.56857,-1.396199,0.291696,0.175574
np f en49,-0.803487,-1.306139,0.310119,0.132209
np f en98,-0.513712,-1.281455,0.295593,0.210427


In [6]:
# Print the same results as a LaTeX table (values formatted with precision 2).
process_to_present(roberta_results)

\begin{tabular}{lp{1.1cm}p{1.1cm}p{1.1cm}p{1.1cm}}
 & log odd & anti log odd & comp & suff \\
np f en6 & -0.42 & -0.39 & 0.20 & 0.10 \\
p f en98 & -0.85 & -0.46 & 0.32 & 0.21 \\
p f en6 & -0.83 & -0.55 & 0.32 & 0.24 \\
p f en83 & -0.78 & -0.67 & 0.30 & 0.26 \\
np f en57 & -0.41 & -1.51 & 0.30 & 0.22 \\
p f en49 & -0.82 & -0.49 & 0.31 & 0.26 \\
p f en57 & -0.83 & -0.53 & 0.32 & 0.24 \\
np f en83 & -0.57 & -1.40 & 0.29 & 0.18 \\
np f en49 & -0.80 & -1.31 & 0.31 & 0.13 \\
np f en98 & -0.51 & -1.28 & 0.30 & 0.21 \\
np nf en & -0.12 & -1.71 & 0.25 & 0.25 \\
p nf en & -0.95 & -0.53 & 0.31 & 0.25 \\
avg en np f & -0.54 & -1.18 & 0.28 & 0.17 \\
avg en p f & -0.82 & -0.54 & 0.31 & 0.24 \\
\end{tabular}



## XLM

In [7]:
# Keep only the English averages (first return value) and append them to the
# individual XLM entries; the Italian averages are discarded.
avg, _ = average_over_users(dict_res_xlm)
xlm_results = {**dict_res_xlm, ** avg}

In [8]:
# Show the results with one row per entry and one column per metric.
pd.DataFrame(xlm_results).T

Unnamed: 0,log odd,anti log odd,comp,suff
p nf en,-0.406611,-0.49591,0.180338,0.284389
np f en49,-0.309384,-1.927145,0.163461,0.290093
p f en83,-0.143999,-0.33897,0.03284,0.106492
p f en49,-0.668597,-0.510669,0.230756,0.286485
np f en83,-0.254294,-1.893138,0.151784,0.2646
p f en98,-0.477138,-0.401201,0.170064,0.211433
np f en98,-0.362067,-1.749374,0.18694,0.271692
np f en6,-0.29974,-1.53692,0.145078,0.268389
np nf en,-0.65804,-1.351752,0.270937,0.273333
p f en57,-0.144373,-0.234512,0.087496,0.480919


In [9]:
# Print the same results as a LaTeX table (values formatted with precision 2).
process_to_present(xlm_results)

\begin{tabular}{lp{1.1cm}p{1.1cm}p{1.1cm}p{1.1cm}}
 & log odd & anti log odd & comp & suff \\
p nf en & -0.41 & -0.50 & 0.18 & 0.28 \\
np f en49 & -0.31 & -1.93 & 0.16 & 0.29 \\
p f en83 & -0.14 & -0.34 & 0.03 & 0.11 \\
p f en49 & -0.67 & -0.51 & 0.23 & 0.29 \\
np f en83 & -0.25 & -1.89 & 0.15 & 0.26 \\
p f en98 & -0.48 & -0.40 & 0.17 & 0.21 \\
np f en98 & -0.36 & -1.75 & 0.19 & 0.27 \\
np f en6 & -0.30 & -1.54 & 0.15 & 0.27 \\
np nf en & -0.66 & -1.35 & 0.27 & 0.27 \\
p f en57 & -0.14 & -0.23 & 0.09 & 0.48 \\
p f en6 & -0.22 & -0.24 & 0.14 & 0.33 \\
np f en57 & -0.30 & -1.71 & 0.15 & 0.28 \\
avg en np f & -0.31 & -1.76 & 0.16 & 0.27 \\
avg en p f & -0.33 & -0.34 & 0.13 & 0.28 \\
\end{tabular}

