# Error Analysis
This notebook first creates prediction files on the validation set for each task and run.
Then it analyzes those prediction files.

## Make prediction files

In [None]:
# Enumerate the best models, i.e. those with the minimum loss, for each run.
# The resulting table, best_models.txt, is written to the analysis directory.
!python enumerate_best_models.py -r ../../vl_models/finetuned > best_models.txt

In [None]:
# Predict labels with those best models.
# A prediction directory will be created in the analysis directory, and the
# predictions will be stored in sub-directories that correspond to the model directories.
# The output is redirected to dump.log because it is too long to show in a notebook.
!./batch_dump.sh best_models.txt > dump.log 2>&1

## Calculate distribution 
We analyze the distributions of sentence lengths and readability here.
- Make sentence sets (both-solvable, vl-only, bert-only, and neither) for each model family from those dump files.
- Calculate their statistics

In [None]:
# We install packages for this analysis before entering the detail
# We use textstat to calculate readability
!pip install textstat pandas matplotlib

### Make sentence sets

In [None]:
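# TODO: make the sentence sets (both-solvable, vl-only, bert-only, neither) for each
# model family from the dump files. The cells below read them from
# <prefix>_bert.txt and <prefix>_vl.txt.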

### Show statistics

In [None]:
import textstat
import pandas
import matplotlib

In [None]:
# Draw a throwaway histogram on dummy data, then raise the global font size.
# NOTE(review): presumably the dummy plot is here so that the plotting backend is
# initialised before rcParams is changed -- confirm whether it is still required.
pandas.DataFrame([0], columns = ['dummy']).hist()
# Set the default font size in matplotlib's global rcParams.
matplotlib.rcParams.update({'font.size': 18})

In [None]:
def _show_hist(func, prefix, name, ylim=None, xlim=(-5, 20), bins=40):
    """Plot overlaid density histograms of a per-sentence metric for two sentence sets.

    Two text files are read, one sentence per line (surrounding whitespace is
    stripped): ``prefix + '_bert.txt'`` (labelled 'BERT-only', drawn in red) and
    ``prefix + '_vl.txt'`` (labelled ``name``, drawn in blue). ``func`` is applied
    to every sentence and the resulting values are drawn as semi-transparent,
    normalised histograms on the same axes. For each set, the label, colour,
    number of sentences, mean and standard deviation of the metric are printed.

    Parameters
    ----------
    func : callable
        Maps one sentence (str) to a number, e.g. ``len``.
    prefix : str
        Path prefix shared by the two sentence-set files.
    name : str
        Legend label for the ``_vl.txt`` set.
    ylim : tuple or None
        y-axis limits; left to matplotlib when None.
    xlim : tuple
        Value range covered by the histogram bins (passed to ``hist`` as ``range``).
    bins : int
        Number of histogram bins (default 40).

    Returns
    -------
    The matplotlib Axes that holds both histograms.
    """
    axes = None

    for fname, label, color in (
        (prefix + '_bert.txt', 'BERT-only', 'r'),
        (prefix + '_vl.txt', name, 'b'),
    ):
        with open(fname, 'r') as f:
            # Iterate over the file object directly; no need for readlines().
            sentences = pandas.Series([line.strip() for line in f], name=label, dtype=object)
        values = sentences.apply(func)
        print(label, color, len(values), values.mean(), values.std())
        # First pass: axes is None, so pandas draws on the current axes.
        # Second pass: reuse those axes explicitly so both histograms are
        # guaranteed to end up in the same plot.
        axes = values.hist(
            ax=axes, density=True, range=xlim, bins=bins,
            alpha=0.4, color=color, legend=True,
        )

    # The y-limits only need to be applied once, after both histograms are drawn.
    if ylim is not None:
        axes.set_ylim(ylim)

    return axes

def show_hist_fkg(prefix, name, ylim=None, xlim=(-5, 20)):
    """Plot the Flesch-Kincaid grade distribution of the sentence sets under ``prefix``.

    Delegates to ``_show_hist`` with ``textstat.flesch_kincaid_grade`` as the
    per-sentence metric and returns whatever ``_show_hist`` returns.
    """
    metric = textstat.flesch_kincaid_grade
    return _show_hist(metric, prefix, name, ylim=ylim, xlim=xlim)

def show_hist_len(prefix, name, ylim=(0, 0.020), xlim=(0, 500)):
    """Plot the sentence-length distribution of the sentence sets under ``prefix``.

    Delegates to ``_show_hist`` with the built-in ``len`` as the per-sentence
    metric (number of characters in each stripped line) and returns its result.
    """
    metric = len
    return _show_hist(metric, prefix, name, ylim=ylim, xlim=xlim)

In [None]:
# Flesch-Kincaid grade

In [None]:
# Flesch-Kincaid grade: BERT-only vs. VIS-only sentence sets
show_hist_fkg(
    prefix='error_analysis_combined/ctrl_visual_bert_base',
    name='VIS-only',
    ylim=(0, 0.15),
)

In [None]:
# Flesch-Kincaid grade: BERT-only vs. UNI-only sentence sets
show_hist_fkg(
    prefix='error_analysis_combined/ctrl_uniter_base',
    name='UNI-only',
    ylim=(0, 0.15),
)

In [None]:
# Flesch-Kincaid grade: BERT-only vs. VL-only sentence sets
show_hist_fkg(
    prefix='error_analysis_combined/ctrl_vl_bert_base',
    name='VL-only',
    ylim=(0, 0.15),
)

In [None]:
# Flesch-Kincaid grade: BERT-only vs. LX-only sentence sets
show_hist_fkg(
    prefix='error_analysis_combined/ctrl_lxmert_base_new',
    name='LX-only',
    ylim=(0, 0.15),
)

In [None]:
# Flesch-Kincaid grade: BERT-only vs. VIL-only sentence sets
show_hist_fkg(
    prefix='error_analysis_combined/ctrl_vilbert_base',
    name='VIL-only',
    ylim=(0, 0.15),
)

In [None]:
# Sentence length

In [None]:
# Sentence length: BERT-only vs. VIS-only sentence sets
show_hist_len(
    prefix='error_analysis_combined/ctrl_visual_bert_base',
    name='VIS-only',
)

In [None]:
# Sentence length: BERT-only vs. UNI-only sentence sets
show_hist_len(
    prefix='error_analysis_combined/ctrl_uniter_base',
    name='UNI-only',
)

In [None]:
# Sentence length: BERT-only vs. VL-only sentence sets
show_hist_len(
    prefix='error_analysis_combined/ctrl_vl_bert_base',
    name='VL-only',
)

In [None]:
# Sentence length: BERT-only vs. LX-only sentence sets
show_hist_len(
    prefix='error_analysis_combined/ctrl_lxmert_base_new',
    name='LX-only',
)

In [None]:
# Sentence length: BERT-only vs. VIL-only sentence sets
show_hist_len(
    prefix='error_analysis_combined/ctrl_vilbert_base',
    name='VIL-only',
)