In [2]:
import json
from os.path import join
from string import Template
import numpy as np

In [7]:
# Directory holding the experiment result files, and the file the generated
# LaTeX is written to.
datadir = "../data/experiments"
out_path = "../notebooks/latex_array.txt"

# File-name pattern of one result file; filled in with
# str.format(exp=..., dataset=...).
filepath = "f1_{exp}_{dataset}.json"

# Three index-aligned views of the datasets:
#   datasets            - key used in result file names and in LABEL_COLUMNS
#   dataset_label_names - suffix used in the LaTeX \label of a table
#   dataset_names       - display name used in table captions
datasets = [
    "doccano1",
    "hate_speech",
    "unhealthy",
]
dataset_label_names = [
    "doccano",
    "mhs",
    "uc",
]
dataset_names = [
    "Doccano 1.0",
    "MHS",
    "UC",
]

# Experiment keys (as used in result file names) and, index-aligned, the
# names shown in captions.
experiments = [
    "AA",
    "BA",
    "increment",
    "single_training",
    "threshold",
]
experiment_names = [
    "Self-supervised",
    "",
    "Incremental",
    "Single-label vs. Multi-label",
    "Threshold",
]

# Column headers (one per label) of each dataset's per-label tables.
LABEL_COLUMNS = {
    "doccano1": [
        "Positive",
        "Negative",
        "Joy",
        "Delight",
        "Inspiration",
        "Calm",
        "Surprise",
        "Compassion",
        "Fear",
        "Sadness",
        "Repulsion",
        "Anger",
        "Ironic",
        "Embarrassing",
        "Vulgar",
        "Political",
        "Interesting",
        "Understandable",
        "Incomprehensible",
        "Offensive to me",
        "Offensive to someone",
        "Funny to me",
        "Funny to someone",
    ],
    "hate_speech": [
        "Sentiment",
        "Respect",
        "Insult",
        "Humiliate",
        "Status",
        "Dehumanize",
        "Violence",
        "Genocide",
        "Attack Defend",
        "Hate Speech",
    ],
    "unhealthy": [
        "Antagonize",
        "Condescending",
        "Dismissive",
        "Generalisation",
        "Hostile",
        "Sarcastic",
    ],
}

# Variants of each experiment, keyed by experiment name.
# NOTE(review): not referenced by the code visible in this notebook - confirm
# whether it is still needed.
iterators = {
    "threshold": (0.1, 0.15, 0.2, 0.25),
    "increment": list(range(1, 9)),
    "single_training": LABEL_COLUMNS,
    "AA": None,
    "BA": None,
}

In [8]:
# table_template = """
# \begin{table*}%[H]
# \centering
# \begin{adjustbox}{width=\textwidth}
# \begin{tabular}{l|ccc}
# \hline
#  & F1 macro & F1 score (class 1) & F1 score (class 0) & F1 score (mean for 1 and 2 class) & & AER & AAL & MLARL\\
# \hline
# \multirow{2}{*}{Number of labels} & 23, i.e. delight, offensive to me, & 8, i.e. hostile, sarcastic,  & 10, i.e. violence,  \\

# \hline
# \end{tabular}
# \end{adjustbox}
# \caption{Dataset profiles after pre-processing. Each dataset contains a set number of labels, which are explained in full detail in Section \ref{sec:datasets}.}
# \label{tab:datasets}
# \end{table*}
# """


# LaTeX skeleton of one per-label results table. Placeholders:
#   $columns_c_string    - column spec appended after the two leading "l" columns
#   $columns_table       - header cells following "Metric"
#   $variant_num         - number of rows spanned by each \multirow metric cell
#   $f1_*_scores         - pre-rendered rows for each metric group
#   $experiment_name, $dataset_name - caption text
#   $dataset_label_name  - suffix of the table's \label
# The lines are raw strings so the LaTeX appears exactly as it is emitted; the
# empty first and last items reproduce the leading and trailing newline.
dataset_template = Template("\n".join([
    "",
    r"\begin{table}[H]",
    r"\centering",
    r"\begin{adjustbox}{width=\textwidth}",
    r"\begin{tabular}{l|l|$columns_c_string}",
    r"\hline",
    r"Metric & $columns_table\\",
    r"\hline",
    r"\multirow{$variant_num}{*}{F1 macro} $f1_macro_scores",
    r"\hline",
    r"\multirow{$variant_num}{*}{F1 score (class 0)} $f1_class0_scores ",
    r"\hline",
    r"\multirow{$variant_num}{*}{F1 score (class 1)} $f1_class1_scores ",
    r"\hline",
    r"\end{tabular}",
    r"\end{adjustbox}",
    r"\caption{$experiment_name results for dataset $dataset_name}",
    r"\label{tab:label_metrics_$dataset_label_name}",
    r"\end{table}",
    "",
]))

In [80]:
# with open(join(datadir, "test.txt"), "w") as f:
#     str_ = dataset_template.substitute(
#         columns_c_string="c"*2,
#         columns_table="cos & cos2",
#         f1_macro_scores="1&2",
#         f1_class0_scores="3&4",
#         f1_class1_scores="5&6",
#         dataset_name="Doccano 1.0",
#         dataset_label_name="doccano"
#     )
#     f.write(str_)


def get_mean_and_std(data, metric):
    """Aggregate one metric over every entry of ``data``.

    Args:
        data: Mapping whose values can each be indexed by ``metric``. The keys
            are ignored (presumably one entry per run/fold - confirm against
            the result files).
        metric: Key of the value to collect from each entry.

    Returns:
        A ``(mean, std)`` pair computed along the first axis of the stacked
        values, i.e. across the entries of ``data``.
    """
    stacked = np.stack([entry[metric] for entry in data.values()])
    return stacked.mean(axis=0), stacked.std(axis=0)


# Per-label F1 metric keys read from the result files, in the order the table
# template consumes them: macro, class 0, class 1.
PER_DIM_METRICS = ("macro_per_dim", "0_per_dim", "1_per_dim")
# Historical name, kept for code that still reads the global. A later cell
# rebinds `metrics` to the summary metric names, so the function below uses
# PER_DIM_METRICS and does not depend on which cell ran last.
metrics = PER_DIM_METRICS


def generate_single_table(dataset_idx, experiment_idx, experiment_column="$t$ value", experiment_template="th_{v}", experiment_iterator=(0.1, 0.15, 0.2, 0.25)):
    """Render the per-label LaTeX table of one experiment on one dataset.

    Args:
        dataset_idx: Index into ``datasets`` / ``dataset_names`` /
            ``dataset_label_names``.
        experiment_idx: Index into ``experiments`` / ``experiment_names``.
        experiment_column: Header of the column that names the variant.
        experiment_template: ``str.format`` pattern with a ``{v}`` field that
            turns a variant into a top-level key of the result JSON.
        experiment_iterator: Iterable of variants; one table row per variant
            within each metric group.

    Returns:
        The filled-in ``dataset_template`` as a string.

    Raises:
        KeyError: if a formatted variant key or a metric is missing from the
            result file.
    """
    dataset = datasets[dataset_idx]
    labels = LABEL_COLUMNS[dataset]
    # Materialise once: the variants are walked once per metric, and their
    # count is the \multirow span. This also accepts one-shot iterators and
    # no longer relies on a leaked loop index (NameError on empty input).
    variants = tuple(experiment_iterator)

    datafile = join(datadir, filepath.format(dataset=dataset, exp=experiments[experiment_idx]))
    with open(datafile, "r") as f:
        data = json.load(f)

    column_c_string = "c" * len(labels)
    columns_table = experiment_column + "&" + " & ".join(labels)

    metric_rows = {}
    for metric in PER_DIM_METRICS:
        rows = []
        for key in variants:
            means, stds = get_mean_and_std(data[experiment_template.format(v=key)], metric)
            cells = " & ".join(f"${mean:.2f}\\pm{std:.2f}$" for mean, std in zip(means, stds))
            # The leading "&" skips the \multirow metric column.
            rows.append("& " + str(key) + " & " + cells + "\\\\\n")
        metric_rows[metric] = "".join(rows)

    return dataset_template.substitute(
        columns_c_string=column_c_string,
        columns_table=columns_table,
        variant_num=len(variants),
        f1_macro_scores=metric_rows[PER_DIM_METRICS[0]],
        f1_class0_scores=metric_rows[PER_DIM_METRICS[1]],
        f1_class1_scores=metric_rows[PER_DIM_METRICS[2]],
        dataset_name=dataset_names[dataset_idx],
        dataset_label_name=dataset_label_names[dataset_idx],
        experiment_name=experiment_names[experiment_idx],
    )

def generate_single_training_table(dataset_idx, threshold_key="th_0.15"):
    """Render the single-label vs. multi-label comparison table for a dataset.

    Single-label results are read from the ``single_training`` result file and
    multi-label results from one entry of the ``threshold`` result file.

    Args:
        dataset_idx: Index into ``datasets`` / ``dataset_names`` /
            ``dataset_label_names``.
        threshold_key: Top-level key of the ``threshold`` result file used as
            the multi-label baseline. Defaults to the previously hard-coded
            ``"th_0.15"``.

    Returns:
        The filled-in ``dataset_template`` as a string.
    """
    # (single-label metric key, multi-label per-label metric key), in the
    # order the template consumes them: macro, class 0, class 1. Spelled out
    # locally because the global `metrics` is rebound by a later cell.
    metric_pairs = (
        ("macro", "macro_per_dim"),
        ("mean_for_0", "0_per_dim"),
        ("mean_for_1", "1_per_dim"),
    )

    experiment_column = "Classification type"
    dataset = datasets[dataset_idx]
    labels = LABEL_COLUMNS[dataset]
    # One extra column for the "All Labels (average)" cell.
    column_c_string = "c" * (len(labels) + 1)
    columns_table = experiment_column + "&" + " &".join(labels) + "& All Labels (average)"

    with open(join(datadir, filepath.format(dataset=dataset, exp="single_training")), "r") as f:
        single_data = json.load(f)
    with open(join(datadir, filepath.format(dataset=dataset, exp="threshold")), "r") as f:
        multi_data = json.load(f)[threshold_key]

    metric_rows = []
    for metric_single, metric_multi in metric_pairs:
        means, stds = get_mean_and_std(multi_data, metric_multi)
        multi_cells = list(zip(means, stds))

        single_row = "&Single-label"
        multi_row = "&Multi-label"
        single_means = []
        # Single-label entries are paired with the multi-label columns by
        # position. NOTE(review): assumes the keys of the single_training file
        # follow the same order as LABEL_COLUMNS[dataset] - confirm.
        for i, label in enumerate(single_data):
            label_mean, label_std = get_mean_and_std(single_data[label], metric_single)
            single_means.append(label_mean)
            single_row += f"&${label_mean:.2f}\\pm{label_std:.2f}$"
            multi_mean, multi_std = multi_cells[i]
            multi_row += f"&${multi_mean:.2f}\\pm{multi_std:.2f}$"

        # Final cell: mean and spread of the per-label means.
        single_row += f"&${np.mean(single_means):.2f}\\pm{np.std(single_means):.2f}$\\\\\n"
        multi_row += f"&${means.mean():.2f}\\pm{means.std():.2f}$\\\\\n"
        metric_rows.append(single_row + multi_row)

    return dataset_template.substitute(
        columns_c_string=column_c_string,
        columns_table=columns_table,
        variant_num=2,
        f1_macro_scores=metric_rows[0],
        f1_class0_scores=metric_rows[1],
        f1_class1_scores=metric_rows[2],
        dataset_name=dataset_names[dataset_idx],
        dataset_label_name=dataset_label_names[dataset_idx],
        # Looked up by key rather than by position from the end of the list.
        experiment_name=experiment_names[experiments.index("single_training")],
    )
            
            
def generate_AB_table(dataset_idx):
    """Render the supervised vs. self-supervised per-label table for a dataset.

    Supervised results are read from the ``AA`` result file and
    self-supervised results from the ``BA`` result file.

    Args:
        dataset_idx: Index into ``datasets`` / ``dataset_names`` /
            ``dataset_label_names``.

    Returns:
        The filled-in ``dataset_template`` as a string.
    """
    # (overall metric key, per-label metric key), in the order the template
    # consumes them: macro, class 0, class 1. Spelled out locally because the
    # global `metrics` is rebound by a later cell.
    metric_pairs = (
        ("macro", "macro_per_dim"),
        ("mean_for_0", "0_per_dim"),
        ("mean_for_1", "1_per_dim"),
    )

    experiment_column = "Evaluation type"
    dataset = datasets[dataset_idx]
    labels = LABEL_COLUMNS[dataset]
    # One extra column for the "All Labels (average)" cell.
    column_c_string = "c" * (len(labels) + 1)
    columns_table = experiment_column + "&" + " &".join(labels) + "& All Labels (average)"

    with open(join(datadir, filepath.format(dataset=dataset, exp="AA")), "r") as f:
        sup_data = json.load(f)
    with open(join(datadir, filepath.format(dataset=dataset, exp="BA")), "r") as f:
        selfsup_data = json.load(f)

    variants = (("&Supervised", sup_data), ("&Self-Supervised", selfsup_data))

    metric_rows = []
    for overall_metric, per_label_metric in metric_pairs:
        rows = []
        for row_start, results in variants:
            means, stds = get_mean_and_std(results, per_label_metric)
            cells = " & ".join(f"${mean:.2f}\\pm{std:.2f}$" for mean, std in zip(means, stds))
            # The last cell uses the overall metric stored in the file rather
            # than an average of the per-label cells.
            overall_mean, overall_std = get_mean_and_std(results, overall_metric)
            rows.append(
                row_start + " & " + cells
                + f"&${overall_mean:.2f}\\pm{overall_std:.2f}$"
                + "\\\\\n"
            )
        metric_rows.append("".join(rows))

    return dataset_template.substitute(
        columns_c_string=column_c_string,
        columns_table=columns_table,
        variant_num=2,
        f1_macro_scores=metric_rows[0],
        f1_class0_scores=metric_rows[1],
        f1_class1_scores=metric_rows[2],
        dataset_name=dataset_names[dataset_idx],
        dataset_label_name=dataset_label_names[dataset_idx],
        experiment_name="Self-Supervised",
    )
            
    
    

In [81]:
# out = generate_single_training_table(1)
# out = generate_AB_table(2)

In [82]:
# out = generate_single_table(0, -1) #, experiment_iterator=range(1,9), experiment_template="train_{v}", experiment_column="Train folds")

In [87]:
# LaTeX skeleton of the cross-dataset summary table. Placeholders:
#   $columns_c_string  - column spec appended after the leading "l" column
#   $category_num      - number of columns spanned by each metric header
#   $category_variants - header cells repeated under every metric
#   $doccano_scores, $mhs_scores, $uc_scores - pre-rendered cells of each row;
#       the template appends the "\\" row terminator itself, so the
#       substituted value should not end with one
#   $experiment_name   - caption text
#   $experiment_label  - suffix of the table's \label
# The second argument of \multicolumn must be a column specification ("c",
# "c|", ...); "*" is only valid as the width argument of \multirow.
results_template = Template("""
\\begin{table}[H]
\\centering
\\begin{adjustbox}{width=\\textwidth}
\\begin{tabular}{l|$columns_c_string}
\\hline
 & \\multicolumn{$category_num}{c|}{F1 macro} & \\multicolumn{$category_num}{c|}{AER} & \\multicolumn{$category_num}{c|}{AAL} & \\multicolumn{$category_num}{c}{MLRAL}\\\\
\\hline
Dataset & $category_variants & $category_variants & $category_variants & $category_variants\\\\
\\hline
Doccano 1.0  $doccano_scores\\\\
\\hline
MHS  $mhs_scores\\\\
\\hline
UC  $uc_scores\\\\
\\hline
\\end{tabular}
\\end{adjustbox}
\\caption{$experiment_name results for every dataset}
\\label{tab:datasets_metrics_$experiment_label}
\\end{table}
""")

In [93]:
# Summary metric keys read from the AA/BA result files, in the column order of
# results_template.
SUMMARY_METRICS = ("macro", "AER", "AAL", "MLRAL")
# Still bound under the historical name for backward compatibility. An earlier
# cell binds `metrics` to the per-label metric keys, so the function below
# reads SUMMARY_METRICS and does not depend on which cell ran last.
metrics = list(SUMMARY_METRICS)


def generate_AB_sum_table():
    """Render the supervised vs. self-supervised summary table for all datasets.

    For every dataset in ``datasets`` the ``AA`` (supervised) and ``BA``
    (self-supervised) result files are loaded, and each summary metric
    contributes two ``mean \\pm std`` cells, supervised first.

    Returns:
        The filled-in ``results_template`` as a string.

    Raises:
        ValueError: if ``datasets`` does not contain exactly three entries,
            since the template has three fixed rows.
    """
    category_variants = "Supervised&Self-Supervised"
    category_num = 2
    columns_c_string = "cc|cc|cc|cc"
    experiment_name = "Self-Supervised"
    experiment_label = "self_sup"

    dataset_rows = []
    for dataset in datasets:
        with open(join(datadir, filepath.format(dataset=dataset, exp="AA")), "r") as f:
            sup_data = json.load(f)
        with open(join(datadir, filepath.format(dataset=dataset, exp="BA")), "r") as f:
            selfsup_data = json.load(f)

        cells = []
        for metric in SUMMARY_METRICS:
            for results in (sup_data, selfsup_data):
                mean, std = get_mean_and_std(results, metric)
                cells.append(f"&${mean:.2f}\\pm{std:.2f}$")
        # No "\\" terminator here: results_template already ends every dataset
        # row with one, and a second one would add an empty table row.
        dataset_rows.append("".join(cells))

    # Rows are matched to the template's fixed rows by position in `datasets`.
    doccano_scores, mhs_scores, uc_scores = dataset_rows

    return results_template.substitute(
        columns_c_string=columns_c_string,
        category_num=category_num,
        category_variants=category_variants,
        doccano_scores=doccano_scores,
        mhs_scores=mhs_scores,
        uc_scores=uc_scores,
        experiment_name=experiment_name,
        experiment_label=experiment_label,
    )

In [94]:
# Build the supervised vs. self-supervised summary table; the resulting LaTeX
# string is written to `out_path` by the next cell.
out = generate_AB_sum_table()

In [95]:
# Persist the generated LaTeX. The encoding is pinned so the written file does
# not depend on the platform's default locale encoding.
with open(out_path, "w", encoding="utf-8") as f:
    f.write(out)