# Error Analysis
This notebook first creates prediction files on the validation set of each task and run.
Then it analyzes those prediction files.

## Make prediction files

In [None]:
# Enumerate the best models (those with the minimum loss) for each run.
# The resulting table, best_models.txt, is written to the analysis directory.
!python enumerate_best_models.py -r ../../vl_models/finetuned > best_models.txt

In [None]:
# Predict labels with those best models.
# A prediction directory will be created in the analysis directory, and the
# predictions will be stored in sub-directories that correspond to the model directories.
# Both stdout and stderr are redirected to dump.log because the output is too long to show in a notebook.
!./batch_dump.sh best_models.txt > dump.log 2>&1

## Calculate the distribution of some metrics
We analyze the distributions of sentence lengths and readability here.
- Make sentence sets (solvable by both, VL-only, BERT-only, and neither) for each model family from those dump files.
- Calculate their statistics

In [None]:
# We install packages for this analysis before entering the detail
# We use textstat to calculate readability
!pip install textstat pandas matplotlib

### Make sentence sets

In [None]:
import os
import csv
from datasets import load_dataset

In [None]:
# Configuration

# GLUE tasks to analyze (STS-B is intentionally left out).
# The two MNLI validation sets are written as '<task>/<subtask>'.
target_tasks = [
    'cola',
    'sst2',
    'mrpc',
    'qqp',
    'mnli/mnli_m',
    'mnli/mnli_mm',
    'qnli',
    'rte',
    'wnli',
]

# Names of the one or two sentence fields of each task
# (the second entry is None for single-sentence tasks).
keys_for_tasks = {
    'cola': ('sentence', None),
    'mnli': ('premise', 'hypothesis'),
    'mrpc': ('sentence1', 'sentence2'),
    'qnli': ('question', 'sentence'),
    'qqp': ('question1', 'question2'),
    'rte': ('sentence1', 'sentence2'),
    'sst2': ('sentence', None),
    'stsb': ('sentence1', 'sentence2'),
    'wnli': ('sentence1', 'sentence2'),
    'cc': ('sentence', None),
}

# Text-only baseline and the vision-and-language models compared against it.
target_lm = 'bert-base-uncased'
target_models = [
    'ctrl_visual_bert_base',
    'ctrl_uniter_base',
    'ctrl_vl_bert_base',
    'ctrl_lxmert_base_new',
    'ctrl_vilbert_base',
]
# Run sub-directories and the split prefix of the prediction file names.
target_runs = ['1', '2', '3']
target_split = 'valid'

# Directory that contains the prediction files.
root_path = 'prediction'
# Directory where the text files of the sentence sets are written.
output_dir_combined = 'sentence_sets_combined'
# A problem counts as solved when the fraction of correct runs is at least this value.
# With three runs, 0.5 means correct in two or three of them.
successful_threshold = 0.5

In [None]:
# Load the validation sentences of every target GLUE task.
# Both MNLI entries ('mnli/mnli_m' and 'mnli/mnli_mm') come from the single
# 'mnli' configuration, so it is loaded only once and both validation splits
# are registered together.
datasets = {}
for task in target_tasks:
    if task.startswith('mnli'):
        if 'mnli/mnli_m' in datasets:
            # Already loaded while handling the sibling MNLI entry.
            continue
        dataset = load_dataset('glue', 'mnli')
        datasets['mnli/mnli_m'] = dataset['validation_matched']
        datasets['mnli/mnli_mm'] = dataset['validation_mismatched']
    else:
        datasets[task] = load_dataset('glue', task)['validation']

In [None]:
def summarize_across_runs(root_path, model, task, runs, split):
    """Summarize the per-problem correctness of one model on one task over several runs.

    For every run, the tab-separated file
    ``<root_path>/<model>/<task>/<run>/<split>_results_<subtask>.txt`` is read.
    The first row is a header; in every following row the first three columns are
    the problem id, the prediction and the label.

    Args:
        root_path: Directory that contains the prediction files.
        model: Name of the model sub-directory.
        task: Task name. A name of the form ``'<task>/<subtask>'`` (used for the
            MNLI matched / mismatched sets) selects the directory ``<task>`` and
            the file suffix ``<subtask>``; otherwise the task name is used for both.
        runs: Sequence of run sub-directory names.
        split: Prefix of the result file name.

    Returns:
        A list, sorted by problem id, with one dict per problem:
        ``{'_id': id, 'sum': number of correct runs, 'total': number of runs,
        'values': list of per-run correctness flags}``.

    Raises:
        ValueError: If a prediction file is empty (has no header row), or if a
            problem does not have exactly one entry per run.
    """
    # subtask distinguishes MNLI's matched and mismatched sets
    subtask = task
    if '/' in task:
        task, subtask = task.split('/')

    # The file name does not depend on the run, so build it once.
    file_name = f'{split}_results_{subtask}.txt'

    # is_correct maps a problem id to the list of correctness flags, one per run
    is_correct = {}
    for run in runs:
        path = os.path.join(root_path, model, task, run, file_name)
        # newline='' lets the csv module handle line endings itself.
        with open(path, 'r', newline='') as f:
            reader = csv.reader(f, delimiter='\t')
            # Skip the header row; an empty file gets a clear error instead of
            # a bare StopIteration.
            if next(reader, None) is None:
                raise ValueError(f'prediction file is empty: {path}')
            for row in reader:
                is_correct.setdefault(int(row[0]), []).append(row[1] == row[2])

    len_runs = len(runs)
    results = []
    for key in sorted(is_correct):
        val = is_correct[key]
        if len(val) != len_runs:
            raise ValueError(f'total number of runs is not matched for {key}')
        results.append({'_id': key, 'sum': sum(val), 'total': len_runs, 'values': val})
    return results

In [None]:
def classify_problems(summary_a, summary_b, successful_threshold):
    """Classify problems by which of two systems solved them.

    A system solves a problem when ``sum / total`` of its summary entry is at
    least ``successful_threshold``.

    Args:
        summary_a: List of per-problem summaries (dicts with '_id', 'sum' and
            'total') of system A.
        summary_b: The same for system B, with the same problems in the same order.
        successful_threshold: Minimum fraction of correct runs to count as solved.

    Returns:
        A dict with 'total' (number of problems), the fractions 'both_correct',
        'only_a', 'only_b' and 'both_wrong' (all 0.0 when there are no problems),
        and 'ids', a tuple of the four id lists in the order
        (both_correct, only_a, only_b, both_wrong).

    Raises:
        ValueError: If the two summaries differ in length or their ids do not
            line up.
    """
    # zip() would silently drop the tail of the longer summary.
    if len(summary_a) != len(summary_b):
        raise ValueError('length unmatched')

    both_correct = []
    only_a = []
    only_b = []
    both_wrong = []
    for a, b in zip(summary_a, summary_b):
        if a['_id'] != b['_id']:
            raise ValueError('key unmatched')
        _id = a['_id']
        a_correct = a['sum'] / a['total'] >= successful_threshold
        b_correct = b['sum'] / b['total'] >= successful_threshold
        if a_correct and b_correct:
            both_correct.append(_id)
        elif b_correct:
            only_b.append(_id)
        elif a_correct:
            only_a.append(_id)
        else:
            both_wrong.append(_id)

    total = len(summary_a)

    def _fraction(ids):
        # No problems at all: report 0.0 instead of dividing by zero.
        return len(ids) / total if total else 0.0

    return {
        'total': total,
        'both_correct': _fraction(both_correct),
        'only_a': _fraction(only_a),
        'only_b': _fraction(only_b),
        'both_wrong': _fraction(both_wrong),
        'ids': (both_correct, only_a, only_b, both_wrong),
    }

In [None]:
def make_conbined_corpus(summary, target_lm, model_name, successful_threshold):
    """Collect the sentences of all tasks into four sets for one model.

    For every task in ``summary`` the problems are classified by comparing the
    summaries of ``target_lm`` and ``model_name``; the sentences of each problem
    are then appended to the matching set.

    Returns a list of four lists of sentences in the order
    "both, only target_lm (BERT), only model_name (VL), neither".
    """
    # One output bucket per class, in the order of classify_problems' 'ids'.
    buckets = [[] for _ in range(4)]

    # Sentences of all tasks go into the same four buckets.
    for task_key, per_model in summary.items():
        base_task = task_key.split('/')[0]
        classified = classify_problems(
            per_model[target_lm], per_model[model_name], successful_threshold
        )
        for bucket, id_group in zip(buckets, classified['ids']):
            for pid in id_group:
                # Single-sentence tasks have None as their second key; skip it.
                bucket.extend(
                    datasets[task_key][pid][field]
                    for field in keys_for_tasks[base_task]
                    if field is not None
                )
    return buckets

In [None]:
# Build summary[task][model] for the text-only model and every model in target_models.
summary = {}
for task in target_tasks:
    per_model = summary[task] = {}
    per_model[target_lm] = summarize_across_runs(
        root_path, target_lm, task, target_runs, target_split
    )
    for model_name in target_models:
        per_model[model_name] = summarize_across_runs(
            root_path, model_name, task, target_runs, target_split
        )

In [None]:
# Save sentence sets for all the models.
# Each model gets four text files (both / bert / vl / neither), one sentence per line.
# os.makedirs(..., exist_ok=True) creates the output directory only when it is missing
# (the previous os.path.exist call does not exist in the standard library).
os.makedirs(output_dir_combined, exist_ok=True)
for model_name in target_models:
    corpora = make_conbined_corpus(summary, target_lm, model_name, successful_threshold)
    # Same order as the list returned by make_conbined_corpus.
    filenames = [
        f'{model_name}_both.txt', f'{model_name}_bert.txt',
        f'{model_name}_vl.txt', f'{model_name}_neither.txt'
    ]
    for filename, corpus in zip(filenames, corpora):
        path = os.path.join(output_dir_combined, filename)
        with open(path, 'w') as f:
            f.write('\n'.join(corpus))
            f.write('\n')

### Show statistics

In [None]:
import textstat
import pandas
import matplotlib

In [None]:
# Draw a throwaway histogram of dummy data, then raise the global matplotlib font size.
# NOTE(review): the dummy plot is kept before the rcParams update on purpose; the
# reason is not visible here (presumably the plotting backend must be initialised
# first) -- confirm before reordering or removing it.
pandas.DataFrame([0], columns = ['dummy']).hist()
matplotlib.rcParams.update({'font.size': 18})

In [None]:
def _show_hist(func, prefix, name, ylim=None, xlim=(-5, 20)):
    """ Show a distribution of a metric after applying a function (func) to the sentence in the sets.
    We show the distributions of bert-only and vl model in a plot
    """
    plot = None
    
    for fname, n, color in [
        (prefix+'_bert.txt', 'BERT-only', 'r'),
        (prefix+'_vl.txt', name, 'b'),
    ]:
        with open(fname, 'r') as f:
            df = pandas.DataFrame((_.strip() for _ in f.readlines()), columns = [n])
        x = df[n].apply(func)
        print(n, color, len(x), x.mean(), x.std())
        plot = x.hist(density=1, range=xlim, bins=40, alpha=0.4, color=color, legend=True)
        if ylim is not None:
            plot.set_ylim(ylim)
    
    return plot

def show_hist_fkg(prefix, name, ylim=None, xlim=(-5, 20)):
    """Show the histograms of textstat's flesch_kincaid_grade for the two sets under prefix."""
    metric = textstat.flesch_kincaid_grade
    return _show_hist(metric, prefix, name, ylim, xlim)

def show_hist_len(prefix, name, ylim=(0, 0.020), xlim=(0, 500)):
    """Show the histograms of sentence length (len of each line) for the two sets under prefix."""
    metric = len
    return _show_hist(metric, prefix, name, ylim, xlim)

In [None]:
# Flesch-Kincaid grade

In [None]:
# Flesch-Kincaid grade histograms for the ctrl_visual_bert_base sentence sets.
show_hist_fkg(
    prefix=f'{output_dir_combined}/ctrl_visual_bert_base',
    name='VIS-only',
    ylim=(0, 0.15),
)

In [None]:
# Flesch-Kincaid grade histograms for the ctrl_uniter_base sentence sets.
show_hist_fkg(
    prefix=f'{output_dir_combined}/ctrl_uniter_base',
    name='UNI-only',
    ylim=(0, 0.15),
)

In [None]:
# Flesch-Kincaid grade histograms for the ctrl_vl_bert_base sentence sets.
show_hist_fkg(
    prefix=f'{output_dir_combined}/ctrl_vl_bert_base',
    name='VL-only',
    ylim=(0, 0.15),
)

In [None]:
# Flesch-Kincaid grade histograms for the ctrl_lxmert_base_new sentence sets.
show_hist_fkg(
    prefix=f'{output_dir_combined}/ctrl_lxmert_base_new',
    name='LX-only',
    ylim=(0, 0.15),
)

In [None]:
# Flesch-Kincaid grade histograms for the ctrl_vilbert_base sentence sets.
show_hist_fkg(
    prefix=f'{output_dir_combined}/ctrl_vilbert_base',
    name='VIL-only',
    ylim=(0, 0.15),
)

In [None]:
# Sentence length

In [None]:
# Sentence-length histograms for the ctrl_visual_bert_base sentence sets.
show_hist_len(
    prefix=f'{output_dir_combined}/ctrl_visual_bert_base',
    name='VIS-only',
)

In [None]:
# Sentence-length histograms for the ctrl_uniter_base sentence sets.
show_hist_len(
    prefix=f'{output_dir_combined}/ctrl_uniter_base',
    name='UNI-only',
)

In [None]:
# Sentence-length histograms for the ctrl_vl_bert_base sentence sets.
show_hist_len(
    prefix=f'{output_dir_combined}/ctrl_vl_bert_base',
    name='VL-only',
)

In [None]:
# Sentence-length histograms for the ctrl_lxmert_base_new sentence sets.
show_hist_len(
    prefix=f'{output_dir_combined}/ctrl_lxmert_base_new',
    name='LX-only',
)

In [None]:
# Sentence-length histograms for the ctrl_vilbert_base sentence sets.
show_hist_len(
    prefix=f'{output_dir_combined}/ctrl_vilbert_base',
    name='VIL-only',
)