# Aggregate results from GLUE evaluation

The model and adaptation results on GLUE have already been obtained; we simply need to collect them from the different log files.

Move to the root folder of the repository.

In [None]:
# Change the working directory to the parent folder so that the relative
# "GLUE/..." paths used further down resolve. Note that this is relative to the
# current directory, so re-running this cell moves up one more level.
%cd ..

In [1]:
import json
import os
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

### Helper functions

In [2]:
# Metric associated with each GLUE task. `get_eval_score` looks the value up in
# the eval results file under the key "eval_<metric>".
TASK_METRICS = {
    "cola": "matthews_correlation",
    "mnli": "accuracy",
    "mrpc": "combined_score",
    "qnli": "accuracy",
    "qqp": "combined_score",
    "rte": "accuracy",
    "sst2": "accuracy",
    "stsb": "combined_score",
    "wnli": "accuracy",
}

# Base model names. A log directory name is "<model><adaptation suffix>".
UNIMODAL_MODELS = ["bert-base-uncased"]
MULTIMODAL_MODELS = ["clipbert", "lxmert", "visualbert"]

# Adaptation suffixes expected for every multimodal model.
MULTIMODAL_ADAPTATIONS = [
    "-avg-visual-features",
    "-finetuned-lxmert-visual-features",
    "-finetuned-wikipedia-visual-features",
    "-no-visual-features",
    "-no-visual-features-finetuned-lxmert",
    "-no-visual-features-finetuned-wikipedia",
    "-zero-image-visual-features",
    "-zeroed-visual-features",
]

# Additional adaptation suffixes that only exist for particular multimodal models.
MODEL_SPECIFIC_MULTIMODAL_ADAPTATIONS = {
    "clipbert": ["-imagined-visual-features"],
    "lxmert": [],
    "visualbert": [],
}

# Adaptation suffixes expected for every unimodal model ("" is the unadapted model).
UNIMODAL_ADAPTATIONS = [
    "",
    "-trained-lxmert",
    "-trained-lxmert-scratch",
    "-trained-wikipedia",
]

In [3]:
def get_eval_score(filename, task):
    """Return the score stored for ``task`` in a JSON eval results file.

    Args:
        filename: Path of the JSON file to read.
        task: Task name; must be a key of ``TASK_METRICS``.

    Returns:
        The value stored under the key ``"eval_" + TASK_METRICS[task]``.
    """
    with open(filename, 'r') as results_file:
        eval_results = json.load(results_file)
    return eval_results["eval_" + TASK_METRICS[task]]

In [4]:
def get_eval_filenames(dirname):
    """Map each task to the ``eval_results.json`` file found for it under ``dirname``.

    Only entries of ``dirname`` whose name contains ``"GLUE-benchmark-"`` are
    considered, and only if they contain an ``eval_results.json`` file.

    Args:
        dirname: Directory holding one run folder per task.

    Returns:
        Dict mapping task name to the path of its ``eval_results.json``.

    Raises:
        ValueError: If two run folders with an eval file exist for the same task.
        AssertionError: If the set of tasks found differs from ``TASK_METRICS``.
    """
    eval_filenames = {}
    for entry in os.listdir(dirname):
        if "GLUE-benchmark-" not in entry:
            continue
        # folders look as follows: 'GLUE-benchmark-rte-bert-base-uncased-2022-05-02T09-38'
        # so the task name is the third dash-separated field
        task_name = entry.split("-")[2]
        candidate = os.path.join(dirname, entry, "eval_results.json")
        if not os.path.exists(candidate):
            continue
        if task_name in eval_filenames:
            raise ValueError(f"Duplicate entries for task {task_name} found in {dirname}")
        eval_filenames[task_name] = candidate
    assert eval_filenames.keys() == TASK_METRICS.keys(), "All eval task files should be present in the given folder"
    return eval_filenames

In [5]:
def get_model_dirnames(dirname):
    """Map every expected model name to its log directory under ``dirname``.

    Model names are formed as ``<model><adaptation suffix>``: first every
    unimodal model with every unimodal adaptation, then, for each multimodal
    model, the shared multimodal adaptations followed by that model's specific
    adaptations.

    Args:
        dirname: Directory that contains one sub-directory per model name.

    Returns:
        Dict mapping model name to ``os.path.join(dirname, model_name)``.

    Raises:
        AssertionError: If an expected model directory does not exist.
    """
    def expected_model_names():
        # Generated lazily so names are produced in the documented order.
        for base in UNIMODAL_MODELS:
            for suffix in UNIMODAL_ADAPTATIONS:
                yield base + suffix
        for base in MULTIMODAL_MODELS:
            for suffix in MULTIMODAL_ADAPTATIONS:
                yield base + suffix
            for suffix in MODEL_SPECIFIC_MULTIMODAL_ADAPTATIONS[base]:
                yield base + suffix

    model_dirnames = {}
    for model_name in expected_model_names():
        model_dirname = os.path.join(dirname, model_name)
        assert os.path.exists(model_dirname), f"The directory '{model_dirname}' should exist"
        model_dirnames[model_name] = model_dirname
    return model_dirnames

In [6]:
def get_mnli_eval_results(dirname, logname_starter):
    """Extract the MNLI eval accuracies from the run log files in ``dirname``.

    Every file in ``dirname`` whose name starts with ``logname_starter`` and
    ends with ``_1.out`` is scanned. For each line containing
    ``" eval_accuracy "`` the last whitespace-separated token is parsed as a
    float.

    Args:
        dirname: Directory containing the log files.
        logname_starter: Prefix of the log file names to read. It may have any
            length.

    Returns:
        List with the two accuracies, in the order they were encountered
        (files are visited in sorted name order).

    Raises:
        ValueError: If ``logname_starter`` is None.
        AssertionError: If the number of accuracies found is not exactly two.
    """
    if logname_starter is None:
        raise ValueError("logname_starter cannot be None")
    eval_acc = []
    # sorted() makes the visiting order independent of the file system
    for file in sorted(os.listdir(dirname)):
        # startswith/endswith instead of fixed-width slices, so that the prefix
        # is not silently required to be exactly six characters long
        if not (file.startswith(logname_starter) and file.endswith("_1.out")):
            continue
        with open(os.path.join(dirname, file), "r") as f:
            for line in f:
                if " eval_accuracy " in line:
                    eval_acc.append(float(line.split()[-1]))
    assert len(eval_acc) == 2, f"There should be two mnli eval_accuracy values in {dirname}"
    return eval_acc

### Collect results

In [7]:
# Prefix of the log file names generated by `benchmark_model_GLUE.py`. It is needed
# to find the run log files from which the MNLI evaluation results are extracted.
logname_starter = None  # TODO: fill this in before running this cell

model_dirnames = get_model_dirnames("GLUE/data/logs")

# Collect one record per (model, task) and build the frame once at the end:
# `DataFrame.append` is quadratic in a loop and was removed in pandas 2.0.
records = []
for model, dirname in model_dirnames.items():
    eval_filenames = get_eval_filenames(dirname)
    for task, eval_filename in eval_filenames.items():
        if task == "mnli":
            # MNLI needs special handling: its eval results file is not usable because
            # the task has several eval sets, so the two accuracies are read from the
            # run log instead and averaged.
            mnli_scores = get_mnli_eval_results(dirname, logname_starter)
            score = sum(mnli_scores)/2
        else:
            score = get_eval_score(eval_filename, task)
        records.append({"model": model, "task": task, "score": score})

results = pd.DataFrame(records, columns=["model", "task", "score"])

In [9]:
# Display the collected (model, task, score) rows.
results

Unnamed: 0,model,task,score
0,bert-base-uncased,qqp,0.895803
1,bert-base-uncased,sst2,0.936927
2,bert-base-uncased,stsb,0.883918
3,bert-base-uncased,cola,0.611204
4,bert-base-uncased,mnli,0.845550
...,...,...,...
256,visualbert-zeroed-visual-features,qnli,0.902618
257,visualbert-zeroed-visual-features,sst2,0.911697
258,visualbert-zeroed-visual-features,mrpc,0.793530
259,visualbert-zeroed-visual-features,rte,0.599278


In [8]:
# Mean score over all tasks for each model. The "score" column is selected
# explicitly because the string "task" column cannot be averaged (pandas >= 2.0
# raises a TypeError instead of silently dropping it).
results.groupby("model")[["score"]].mean()

Unnamed: 0_level_0,score
model,Unnamed: 1_level_1
bert-base-uncased,0.805837
bert-base-uncased-trained-lxmert,0.781412
bert-base-uncased-trained-lxmert-scratch,0.629244
bert-base-uncased-trained-wikipedia,0.782761
clipbert-avg-visual-features,0.765934
clipbert-finetuned-lxmert-visual-features,0.765073
clipbert-finetuned-wikipedia-visual-features,0.768888
clipbert-imagined-visual-features,0.769229
clipbert-no-visual-features,0.765053
clipbert-no-visual-features-finetuned-lxmert,0.762133


## Save the results

In [10]:
# `to_csv` does not create missing directories, so make sure the output folder exists.
os.makedirs("GLUE/data/results", exist_ok=True)
results.to_csv("GLUE/data/results/results.csv", index=False)