# Get GLUE Scores
We aggregate evaluation results that exist in each model folder to get the glue score.  
The GLUE score for a model is defined as the average of its per-task combined scores, where each task's combined score is the average of that task's metrics.  
Our procedure:
- Choose the model that has the smallest loss in each run
- Calculate the glue combined score for the model
- Average the glue combined score of three runs

In [None]:
import os
import json
import numpy as np

## Define some functions

In [None]:
# configuration for the GLUE tasks
# `tasks` lists the task names. The aggregation code iterates over it in this
# order and matches each name against the last path component of a model
# directory.
tasks = ['wnli', 'rte', 'mrpc', 'stsb', 'cola', 'sst2', 'qnli', 'qqp', 'mnli']
# `task_metrics` maps each task name to the evaluation-log keys that are
# averaged to form that task's combined score.
task_metrics = {
    'wnli': ['eval_accuracy'],
    'rte': ['eval_accuracy'],
    'mrpc': ['eval_f1', 'eval_accuracy'],
    'stsb': ['eval_pearson', 'eval_spearmanr'],
    'cola': ['eval_matthews_correlation'],
    'qnli': ['eval_accuracy'],
    'sst2': ['eval_accuracy'],
    'qqp': ['eval_f1', 'eval_accuracy'],
    'mnli': ['eval_accuracy', 'eval_mm_accuracy']
}

In [None]:
def collect_data(root_dir, prefixes_excluded=None):
    """Collect the evaluation logs stored in the model directories under root_dir.

    Model directories have their evaluation results in the trainer_state.json file.
    This function walks the sub directories of root_dir and reads every
    trainer_state.json it finds.
    We assume that a model_path has the format '(prefix)/(vl_model_name)/(task_name)'.

    Args:
        root_dir: Directory to walk recursively. A trailing path separator is allowed.
        prefixes_excluded: Optional iterable of paths relative to root_dir.
            A directory (and everything below it) is skipped when its path starts
            with one of them. This is a plain string-prefix match, so 'foo' also
            excludes 'foobar'.

    Returns:
        A dict whose key is the model directory path relative to root_dir ('.' for
        root_dir itself) and whose value is the 'log_history' list of that
        directory's trainer_state.json, i.e. a list of evaluation results
        (an empty list when the file has no 'log_history' entry).
        (The timing of evaluation depends on eval_strategy for the run_glue.py)
    """
    # str.startswith accepts a tuple; an empty tuple never matches.
    excluded = tuple(os.path.join(root_dir, p) for p in (prefixes_excluded or ()))

    data = {}
    target_file_name = 'trainer_state.json'
    for cur_dir, dirs, files in os.walk(root_dir):
        if cur_dir.startswith(excluded):
            # Every descendant path starts with the same prefix, so pruning the
            # walk here gives the same result as visiting and skipping them.
            dirs[:] = []
            continue
        if target_file_name not in files:
            continue
        # Use a context manager so the file handle is closed right away.
        with open(os.path.join(cur_dir, target_file_name), encoding='utf-8') as f:
            results = json.load(f)
        # relpath (instead of a textual replace) also works when root_dir has a
        # trailing separator or is otherwise not in normalized form.
        key = os.path.relpath(cur_dir, root_dir)
        data[key] = results.get('log_history', [])
    return data

In [None]:
def get_best_loss_epoch(data, task_name, skip_columns=('step', 'eval_runtime', 'eval_samples_per_second')):
    """Build a table holding the lowest-eval-loss log entry of each run of a task.

    We assume that a model_path has the format '(prefix)/(vl_model_name)/(task_name)'.
    This function filters the runs by task_name and, for every remaining run,
    picks the log entry with the smallest 'eval_loss'.

    Args:
        data: Dict mapping a model path to its list of log entries (dicts), as
            returned by collect_data.
        task_name: Only model paths ending with '/<task_name>' are used. When it
            is empty or None, every model path is used and kept unchanged.
        skip_columns: Log-entry keys that are left out of the table.

    Returns:
        A (columns, rows) tuple. columns is ['model'] followed by the sorted keys
        of the first selected log entry (minus skip_columns). Each row is the
        model path without the task suffix followed by the selected values
        formatted with '%.4f'. Rows are ordered by model path.
    """
    suffix = '/' + task_name if task_name else ''
    columns = []
    rows = []
    # Sort to get the same order in terms of model name each time.
    for k, v in sorted(data.items(), key=lambda kv: kv[0]):
        if suffix and not k.endswith(suffix):
            continue
        # A run without any evaluation entry (including an empty log) has no
        # best entry; skip it instead of crashing or picking a training-log row.
        if not any('eval_loss' in entry for entry in v):
            continue
        # Entries without 'eval_loss' (e.g. training logs) can never be the minimum.
        losses = np.asarray([entry.get('eval_loss', float('inf')) for entry in v])
        best = v[int(losses.argmin())]
        if not columns:
            # In the first loop, we initialize columns (sort for the same order in columns)
            columns = [c for c in sorted(best.keys()) if c not in skip_columns]
        # Strip only the trailing task suffix; str.replace would also remove
        # matches inside the prefix or model name (e.g. '/rte' in '/rte_model').
        model_prefix = k[:-len(suffix)] if suffix else k
        rows.append([model_prefix] + ['%.4f' % best[c] for c in columns])
    return ['model'] + columns, rows

In [None]:
def average_model_runs(tasks, task_metrics, table, model_name, only_combined=False, units='raw'):
    """Calculate the averaged GLUE task scores and GLUE score of a model specified by model_name.

    For every run, the combined score of a task is the mean of the task's metrics
    and the GLUE score is the mean of the combined scores over all tasks. The
    report gives the mean, standard deviation, minimum and maximum of these
    values across runs.

    Args:
        tasks: Task names to aggregate, in output order.
        task_metrics: Dict mapping a task name to the list of metric column names
            that are averaged into the task's combined score.
        table: Dict mapping a task name to a (columns, rows) tuple as returned by
            get_best_loss_epoch. The first cell of a row is the model label and
            the other cells are values convertible with float().
        model_name: Rows whose label equals model_name or ends with
            '/<model_name>' are treated as the runs of the model.
        only_combined: When True, the per-metric lines are left out and only the
            combined task scores and the GLUE score are reported.
        units: '%' multiplies every score by 100; any other value keeps the
            scores as they are. The value is also printed in the header line.

    Returns:
        A string of tab-separated lines: the model name, a header, the
        (optional) per-metric lines, one '<task>_combined' line per task and a
        final '<model_name>_glue' line.

    Raises:
        ValueError: If tasks is empty, if a task has no run of the model, or if
            the number of runs differs between tasks.
    """

    def get_statistics_across_runs(array):
        # the last axis is for runs
        return array.mean(axis=-1), array.std(axis=-1), array.min(axis=-1), array.max(axis=-1)

    def make_line(key, count, array):
        # One output row: label, number of runs, then avg/std/min/max.
        return (key, str(count)) + tuple('%.4f' % s for s in get_statistics_across_runs(array))

    if not tasks:
        raise ValueError('tasks must not be empty')

    scale = 100 if units == '%' else 1
    suffix = '/' + model_name

    lines = [
        ('model_name', model_name),
        ('(%s)' % units, 'n', 'avg', 'std', 'min', 'max'),
    ]

    # Calculate task scores
    glue_score_elements = []
    num_runs = None
    for task_name in tasks:
        columns, rows = table[task_name]
        model_rows = [row for row in rows if row[0] == model_name or row[0].endswith(suffix)]
        if not model_rows:
            raise ValueError('no runs of model %r found for task %r' % (model_name, task_name))
        # The GLUE score is computed per run, so every task needs the same runs.
        if num_runs is None:
            num_runs = len(model_rows)
        elif len(model_rows) != num_runs:
            raise ValueError(
                'task %r has %d runs of model %r but the previous tasks have %d'
                % (task_name, len(model_rows), model_name, num_runs)
            )

        # Calculate metrics for tasks
        metrics_for_a_task = []
        for metric in task_metrics[task_name]:
            task_col = columns.index(metric)
            arr = np.asarray([float(row[task_col]) * scale for row in model_rows])
            # The shape of arr = (the number of runs,)
            # We output the metric-specific scores only when only_combined is False.
            if not only_combined:
                key = task_name + '_' + metric.replace('eval_', '')
                lines.append(make_line(key, num_runs, arr))
            metrics_for_a_task.append(arr)
        metrics_for_a_task = np.asarray(metrics_for_a_task)
        # The shape of metrics_for_a_task = (the number of metrics, the number of runs)

        # Get combined score for a task by averaging across metrics for the task
        combined_score = metrics_for_a_task.mean(axis=0)
        lines.append(make_line(task_name + '_' + 'combined', num_runs, combined_score))
        # The GLUE score averages the combined task scores, not a single metric.
        glue_score_elements.append(combined_score)
    glue_score_elements = np.asarray(glue_score_elements)
    # The shape of glue_score_elements = (the number of tasks, the number of runs)

    # Calculate the GLUE score by averaging across tasks
    glue_scores = glue_score_elements.mean(axis=0)
    lines.append(make_line(model_name + '_' + 'glue', num_runs, glue_scores))

    return '\n'.join('\t'.join(line) for line in lines)

## Run aggregation

In [None]:
# Directory (relative path) that is walked for trainer_state.json files.
root_dir = '../../../volta_transformers/trained_models'
# Evaluation logs of every model directory found below root_dir.
data = collect_data(root_dir)
# For each task: the (columns, rows) table of the lowest-eval-loss entry per run.
table = {task: get_best_loss_epoch(data, task) for task in tasks}

In [None]:
# Models to report: five base model names followed by their '_reinit' counterparts.
model_names = [
    'ctrl_visual_bert_base',
    'ctrl_uniter_base',
    'ctrl_vl_bert_base',
    'ctrl_lxmert_base_new',
    'ctrl_vilbert_base',
    'ctrl_visual_bert_base_reinit',
    'ctrl_uniter_base_reinit',
    'ctrl_vl_bert_base_reinit',
    'ctrl_lxmert_base_new_reinit',
    'ctrl_vilbert_base_reinit',
]
# Print one tab-separated report per model (scores in percent), each followed
# by a blank line.
for model_name in model_names:
    results = average_model_runs(
        tasks,
        task_metrics,
        table,
        model_name,
        only_combined=False,
        units='%',
    )
    print(results, end='\n\n')