# Aggregate results from GLUE evaluation

The model and adaptation results on GLUE have already been obtained, we simply need to collect the results from the different log files.

Move to root folder

In [1]:
%cd ../..

/home1/brizk/adapt-pre-trained-VL-models-to-text


In [2]:
import json
import os
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

### Helper functions

In [3]:
TASK_METRICS = {"cola": ["matthews_correlation"], 
                "mnli": ["accuracy"],
                "mnli_mm": ["accuracy"],
                "mrpc": ["accuracy","f1"], 
                "qnli": ["accuracy"], 
                "qqp":  ["accuracy","f1"], 
                "rte":  ["accuracy"], 
                "sst2": ["accuracy"], 
                "stsb": ["spearmanr"], 
                "wnli": ["accuracy"]}

UNIMODAL_MODELS = ["bert-base-uncased"]
MULTIMODAL_MODELS = ["clipbert", "lxmert", "visualbert"]
MULTIMODAL_ADAPTATIONS = ["-avg-visual-features", 
                          "-finetuned-lxmert-visual-features", 
                          "-finetuned-wikipedia-visual-features", 
                          "-no-visual-features",
                          "-no-visual-features-finetuned-lxmert", 
                          "-no-visual-features-finetuned-wikipedia", 
                          "-zero-image-visual-features",
                          "-zeroed-visual-features",
                          "-generated-imgs-features"]
MODEL_SPECIFIC_MULTIMODAL_ADAPTATIONS = {"clipbert": ["-imagined-visual-features"],
                                         "lxmert": [],
                                         "visualbert": []
                                        }
UNIMODAL_ADAPTATIONS = ["", "-trained-lxmert", "-trained-lxmert-scratch", "-trained-wikipedia"]

In [4]:
def get_eval_score(filename, task):
    with open(filename, 'r') as f:
        scores = json.load(f)
    eval_scores = {}
    for metric in TASK_METRICS[task]:
        metric_name = "eval_"+metric
        eval_scores[metric] = scores[metric_name]
    return eval_scores

In [5]:
def get_eval_filenames(dirname):
    eval_filenames = {}
    for file in os.listdir(dirname):
        if "GLUE-benchmark-" in file:
            # folders with run results look as follows: 'GLUE-benchmark-rte-bert-base-uncased-2022-05-02T09-38'
            task_name = file.split("-")[2]
            eval_filename = os.path.join(dirname, file, ("_").join([task_name, "eval_results.json"]))
            if os.path.exists(eval_filename):
                if task_name in eval_filenames:
                    raise ValueError(f"Duplicate entries for task {task_name} found in {dirname}")
                else:
                    eval_filenames[task_name] = eval_filename
                    # mnli-mm is evaluated together with mnli
                    if task_name == "mnli":
                        eval_filenames["mnli_mm"] = eval_filename.replace("mnli_eval_results", "mnli_mm_eval_results")
    if not eval_filenames.keys() == TASK_METRICS.keys():                    
        print(f"Warning: All eval task files should be present in the given folder '{dirname}'. Found:\n{eval_filenames.keys()}\nShould have:\n{TASK_METRICS.keys()}")
    return eval_filenames

In [6]:
def extract_filename(model, adaptation, dirname):
    model_name = model+adaptation
    model_dirname = os.path.join(dirname, model_name)
    
    if not os.path.exists(model_dirname):
        print(f"Warning: Missing results, the directory '{model_dirname}' should exist")
        return None, None
    return model_name, model_dirname
    
def get_model_dirnames(dirname):
    model_dirnames = {}
    for model in UNIMODAL_MODELS:
        for adaptation in UNIMODAL_ADAPTATIONS:
            model_name, model_dirname = extract_filename(model, adaptation, dirname)
            if model_name is not None:
                model_dirnames[model_name] = model_dirname
    for model in MULTIMODAL_MODELS:
        for adaptation in MULTIMODAL_ADAPTATIONS:
            model_name, model_dirname = extract_filename(model, adaptation, dirname)
            if model_name is not None:
                model_dirnames[model_name] = model_dirname
        for adaptation in MODEL_SPECIFIC_MULTIMODAL_ADAPTATIONS[model]:
            model_name, model_dirname = extract_filename(model, adaptation, dirname)
            if model_name is not None:
                model_dirnames[model_name] = model_dirname
            
    return model_dirnames

In [7]:
def get_mnli_eval_results(dirname, logname_starter):
    if logname_starter is None:
        raise ValueError("logname_starter cannot be None")
    eval_acc = []
    values_found = 0
    for file in os.listdir(dirname):
        if file[:6] == logname_starter and file[-6:] == "_1.out":
            with open(os.path.join(dirname, file), "r") as f:
                for line in f.readlines():
                    if " eval_accuracy " in line:
                        accuracy_part = line.split()[-1]
                        eval_acc.append(float(accuracy_part))
                        values_found += 1 
    assert values_found == 2, f"There should be two mnli eval_accuracy values in {dirname}"
    return eval_acc

### Collect results

In [8]:
results = pd.DataFrame(columns=["model", "task", "metric", "score"])

model_dirnames = get_model_dirnames("GLUE/data/logs")
for model, dirname in model_dirnames.items():
    eval_filenames = get_eval_filenames(dirname)
    for task, eval_filename in eval_filenames.items():
        score = get_eval_score(eval_filename, task)
        for key, val in score.items():
            results = results.append({"model": model, "task": task, "metric": key, "score": val}, ignore_index = True)

dict_keys(['wnli'])
Should have:
dict_keys(['cola', 'mnli', 'mnli_mm', 'mrpc', 'qnli', 'qqp', 'rte', 'sst2', 'stsb', 'wnli'])
dict_keys(['mrpc', 'qqp', 'stsb', 'cola', 'mnli', 'mnli_mm', 'wnli', 'sst2', 'rte'])
Should have:
dict_keys(['cola', 'mnli', 'mnli_mm', 'mrpc', 'qnli', 'qqp', 'rte', 'sst2', 'stsb', 'wnli'])
dict_keys(['mnli', 'mnli_mm', 'mrpc', 'cola', 'wnli'])
Should have:
dict_keys(['cola', 'mnli', 'mnli_mm', 'mrpc', 'qnli', 'qqp', 'rte', 'sst2', 'stsb', 'wnli'])
dict_keys(['qqp', 'rte', 'sst2', 'wnli', 'cola'])
Should have:
dict_keys(['cola', 'mnli', 'mnli_mm', 'mrpc', 'qnli', 'qqp', 'rte', 'sst2', 'stsb', 'wnli'])
dict_keys(['cola', 'mrpc', 'rte'])
Should have:
dict_keys(['cola', 'mnli', 'mnli_mm', 'mrpc', 'qnli', 'qqp', 'rte', 'sst2', 'stsb', 'wnli'])
dict_keys(['sst2', 'stsb', 'qqp', 'rte', 'cola', 'wnli', 'mrpc'])
Should have:
dict_keys(['cola', 'mnli', 'mnli_mm', 'mrpc', 'qnli', 'qqp', 'rte', 'sst2', 'stsb', 'wnli'])
dict_keys(['mrpc', 'cola', 'sst2', 'qqp', 'wnli', 'rt

In [9]:
sorted(results['model'].unique())

['bert-base-uncased',
 'bert-base-uncased-trained-lxmert',
 'bert-base-uncased-trained-lxmert-scratch',
 'bert-base-uncased-trained-wikipedia',
 'clipbert-avg-visual-features',
 'clipbert-finetuned-lxmert-visual-features',
 'clipbert-finetuned-wikipedia-visual-features',
 'clipbert-generated-imgs-features',
 'clipbert-imagined-visual-features',
 'clipbert-no-visual-features',
 'clipbert-no-visual-features-finetuned-lxmert',
 'clipbert-no-visual-features-finetuned-wikipedia',
 'clipbert-zero-image-visual-features',
 'clipbert-zeroed-visual-features',
 'lxmert-avg-visual-features',
 'lxmert-finetuned-lxmert-visual-features',
 'lxmert-finetuned-wikipedia-visual-features',
 'lxmert-no-visual-features',
 'lxmert-no-visual-features-finetuned-lxmert',
 'lxmert-no-visual-features-finetuned-wikipedia',
 'lxmert-zero-image-visual-features',
 'lxmert-zeroed-visual-features',
 'visualbert-avg-visual-features',
 'visualbert-finetuned-lxmert-visual-features',
 'visualbert-finetuned-wikipedia-visual-

In [15]:
task_of_choices=['cola', 'wnli', 'mrpc'] # removed sst2
task = task_of_choices[0]
results[results['model'].str.contains('visualbert')][results['task'].str.match(task)] #[['model','score']]

  results[results['model'].str.contains('visualbert')][results['task'].str.match(task)] #[['model','score']]


Unnamed: 0,model,task,metric,score
224,visualbert-avg-visual-features,cola,matthews_correlation,0.524198
244,visualbert-finetuned-lxmert-visual-features,cola,matthews_correlation,0.503307
254,visualbert-finetuned-wikipedia-visual-features,cola,matthews_correlation,0.511034
258,visualbert-no-visual-features,cola,matthews_correlation,0.511795
265,visualbert-no-visual-features-finetuned-lxmert,cola,matthews_correlation,0.488237
273,visualbert-no-visual-features-finetuned-wikipedia,cola,matthews_correlation,0.527953
276,visualbert-zero-image-visual-features,cola,matthews_correlation,0.505577
281,visualbert-zeroed-visual-features,cola,matthews_correlation,0.0


In [None]:
cols = ['model', 'metric']
results_removed_dups = results.set_index(cols, append=True).drop_duplicates()['score'].copy()
results_removed_dups.unstack()

In [None]:
results.groupby("model").count()

In [None]:
results.groupby("model").mean()

## Save the results

In [None]:
results.to_csv("GLUE/data/results.csv", index=False)