# Run Parameters

In [229]:
_model = 'roberta-large' # roberta-large or roberta-large-mnli
overwrite_plotting_data = False # Set to True if running for first time with new experiment data

# Import

In [230]:
%matplotlib widget

In [231]:
import os
import collections
import copy
import itertools

import pandas as pd
import numpy as np
import json
import joblib
import pickle
import torch

import matplotlib.pyplot as plt
import matplotlib.path as pth
import seaborn as sns
import matplotlib.patches as mpatches

from scipy.stats import ttest_ind, f_oneway

import statsmodels.api as sm
from statsmodels.formula.api import ols

In [232]:
sns.set_theme(style='whitegrid')

# Define Helper Functions

In [233]:
def get_val_summary(modifier, iteration, eval_dir, ):
    """Load the validation summary table written for a given round.

    Reads ``<eval_dir>/r<iteration>/tables/configs.<modifier>.csv`` using the
    first CSV column as the index, and keeps only the columns labelled
    ``'1'`` .. ``str(iteration)``, in that order.

    Args:
        modifier: Name fragment that selects the ``configs.*.csv`` file.
        iteration: Round number; selects both the ``r<iteration>`` directory
            and the highest column label that is kept.
        eval_dir: Root directory holding the per-round evaluation output.

    Returns:
        pandas.DataFrame restricted to the per-round columns.

    Raises:
        KeyError: If one of the expected round columns is absent.
    """
    csv_path = os.path.join(eval_dir, f'r{iteration}', 'tables', f'configs.{modifier}.csv')
    full_table = pd.read_csv(csv_path, index_col=0)

    round_columns = list(map(str, range(1, iteration + 1)))
    return full_table.loc[:, round_columns]


def get_itereval_summary(sub_keys, iteration, eval_dir, combined, ):
    """Load one iterative-evaluation summary table for a given round.

    The file name is derived from ``sub_keys``: its values are joined with
    ``'.'`` and then made file-system friendly by turning ``'/'`` into ``'-'``
    and ``';'`` into ``'--'``. The table is read from
    ``<eval_dir>/r<iteration>/tables/<combined>/iterevals.<key>.csv`` with the
    first CSV column as index, and trimmed to columns ``'1'`` ..
    ``str(iteration)``.

    Args:
        sub_keys: Mapping whose (string) values, in insertion order, form the
            file-name key.
        iteration: Round number; selects the directory and the kept columns.
        eval_dir: Root directory holding the per-round evaluation output.
        combined: Sub-directory of ``tables`` to read from.

    Returns:
        pandas.DataFrame restricted to the per-round columns.
    """
    # One translation table is equivalent to the two successive replacements:
    # neither replacement text contains a character that is itself replaced.
    sanitize = str.maketrans({'/': '-', ';': '--'})
    fname_key = '.'.join(sub_keys.values()).translate(sanitize)

    csv_path = os.path.join(
        eval_dir, f'r{iteration}', 'tables', combined, f'iterevals.{fname_key}.csv'
    )
    full_table = pd.read_csv(csv_path, index_col=0)

    round_columns = list(map(str, range(1, iteration + 1)))
    return full_table.loc[:, round_columns]
    

In [234]:
def get_mnli_tables(mnli_summary, subsetting='genre', model=None):
    """Build per-(model, comb, subset) accuracy tables from a JSON-lines summary.

    Each non-blank line of ``mnli_summary`` is parsed as one JSON record. For
    every value of the ``comb`` column and every value of the ``subsetting``
    column that occurs together with it, a table is built with one row per
    ``treat`` value and one column per ``iter`` value, holding ``acc``.

    Args:
        mnli_summary: Path to the JSON-lines summary file.
        subsetting: Column used to split each ``comb`` group (default
            ``'genre'``).
        model: Label used as the first element of every dictionary key. When
            omitted, the module-level variable ``model`` is used, which is
            what this function relied on before the parameter existed.

    Returns:
        Tuple ``(summary, mnli_tables)``: the full summary DataFrame and a
        dict mapping ``(model, comb, subset)`` to the per-treatment table.

    Raises:
        NameError: If ``model`` is not given and no module-level ``model``
            exists.
    """
    if model is None:
        # Backward compatibility with callers that relied on the notebook global.
        try:
            model = globals()['model']
        except KeyError:
            raise NameError(
                "get_mnli_tables needs `model`: pass it explicitly or define a "
                "module-level `model`"
            ) from None

    with open(mnli_summary, 'r') as f:
        # Skip blank lines (e.g. a trailing newline) instead of failing to parse them.
        summary = pd.DataFrame([json.loads(line) for line in f if line.strip()])

    mnli_tables = {}
    for comb in summary['comb'].unique():
        comb_sum = summary.loc[summary['comb'] == comb, :]

        # Only look at subsets that actually occur for this `comb`; iterating the
        # labels of the whole summary yields empty groups that pd.concat rejects.
        for subset in comb_sum[subsetting].unique():
            subset_sum = comb_sum.loc[comb_sum[subsetting] == subset, :]

            plot_tab = []
            for treat in subset_sum['treat'].unique():
                treat_sum = subset_sum.loc[subset_sum['treat'] == treat, :]
                per_iter = (
                    treat_sum[['iter', 'acc']]
                    .set_index('iter')
                    .rename({'acc': treat}, axis=1)
                    .transpose()
                )
                plot_tab.append(per_iter)

            if not plot_tab:
                # A missing (NaN) subset label never compares equal to itself,
                # so its group is empty; there is nothing to tabulate.
                continue

            mnli_tables[(model, comb, subset)] = pd.concat(plot_tab)

    return summary, mnli_tables

In [235]:
def split_run_name(run_name, split_by='_'):
    """Decompose a run name into ``(treat, iter, input_type, comb)``.

    The name is split on ``split_by``. The first two fields are returned
    unchanged (the second one stays a string). The remaining two values
    depend on the number of fields:

    * 2 fields: ``input_type='full'``, ``comb='combined'``.
    * 3 fields ending in ``'hyp'``: ``input_type='hyp'``, ``comb='combined'``.
    * 3 fields otherwise: ``input_type='full'``, ``comb`` is the last field.
    * any other count: ``input_type`` is the last field and ``comb`` the
      second to last.

    Raises:
        IndexError: If the name contains fewer than two fields.
    """
    parts = run_name.split(split_by)
    n_parts = len(parts)
    last = parts[-1]

    if n_parts == 2:
        input_type, comb = 'full', 'combined'
    elif n_parts == 3 and last == 'hyp':
        input_type, comb = last, 'combined'
    elif n_parts == 3:
        input_type, comb = 'full', last
    else:
        input_type, comb = last, parts[-2]

    return (parts[0], parts[1], input_type, comb)

In [236]:
def unique_itereval(df, keys=['case', 'subcase', 'label', 'dataset', 'treat', 'iter', 'comb', 'sample_type', 'sample_partition']):
    """Return ``df`` without rows that repeat the same combination of ``keys``.

    The first occurrence of each key combination is kept and the result is
    re-indexed from 0. The input frame is not modified.
    """
    first_occurrences = df.drop_duplicates(subset=keys)
    return first_occurrences.reset_index(drop=True)

def load_sampled_results(sampled_base):
    """Read the four sampled-result CSV files found in ``sampled_base``.

    Loads ``collected.csv``, ``itereval.csv``, ``mnli.csv`` and ``anli.csv``
    and derives the key columns used downstream:

    * ``collected`` gets ``treat``, ``iter`` (int), ``mod`` and ``combined``,
      all parsed from its ``run`` column with ``split_run_name``.
    * ``mnli`` / ``anli`` get a ``breakdown`` column copied from ``genre`` /
      ``tag`` with missing values replaced by ``'combined'``.
    * ``itereval`` is de-duplicated with ``unique_itereval``.

    Returns:
        Tuple ``(collected, itereval, mnli, anli)`` of DataFrames.
    """
    collected, itereval, mnli, anli = (
        pd.read_csv(os.path.join(sampled_base, csv_name))
        for csv_name in ('collected.csv', 'itereval.csv', 'mnli.csv', 'anli.csv')
    )

    # fill in keys: parse every run name once, then fan the pieces out to columns
    run_parts = collected['run'].apply(split_run_name)
    collected['treat'] = run_parts.apply(lambda parts: parts[0])
    collected['iter'] = run_parts.apply(lambda parts: int(parts[1]))
    collected['mod'] = run_parts.apply(lambda parts: parts[2])
    collected['combined'] = run_parts.apply(lambda parts: parts[3])

    mnli['breakdown'] = mnli['genre'].fillna('combined')
    anli['breakdown'] = anli['tag'].fillna('combined')

    return collected, unique_itereval(itereval), mnli, anli

In [237]:
def load_all_sampled(sampled_base, upto=5):
    """Load and stack the sampled results of rounds ``1 .. upto``.

    For each round ``r`` the directory ``<sampled_base>/r<r>`` is read with
    ``load_sampled_results``; the per-round frames of each kind are then
    concatenated with a fresh 0-based index.

    Returns:
        Dict with keys ``'collected'``, ``'itereval'``, ``'mnli'`` and
        ``'anli'``, each mapping to the concatenated DataFrame.
    """
    # Position of each table inside the tuple returned by load_sampled_results.
    table_names = ('collected', 'itereval', 'mnli', 'anli')

    per_round = [
        load_sampled_results(os.path.join(sampled_base, f'r{r}'))
        for r in range(1, upto + 1)
    ]

    return {
        name: pd.concat([tables[position] for tables in per_round], ignore_index=True)
        for position, name in enumerate(table_names)
    }
    

In [238]:
def get_ttest_pvals(dist_df, verbose=True):
    """Run pairwise independent t-tests on ``acc`` between the three treatments.

    The compared pairs are (baseline, LotS), (baseline, LitL) and
    (LotS, LitL), selected through the ``treat`` column of ``dist_df``.

    Args:
        dist_df: DataFrame with at least the columns ``treat`` and ``acc``.
        verbose: When true, print the t statistic and *half* of the p-value
            returned by ``ttest_ind`` for every pair.

    Returns:
        Dict mapping each pair to the unmodified ``ttest_ind`` result (the
        halving is applied to the printed value only).
    """
    comparisons = (
        ('baseline', 'LotS'),
        ('baseline', 'LitL'),
        ('LotS', 'LitL'),
    )

    def _acc_of(treatment):
        return dist_df.loc[dist_df['treat'] == treatment, 'acc']

    ttest_dict = {
        (first, second): ttest_ind(_acc_of(first), _acc_of(second))
        for first, second in comparisons
    }

    if verbose:
        for pair in ttest_dict:
            ttest_results = ttest_dict[pair]
            print('='*45)
            print(f"{pair}\nt: {ttest_results[0]:.5f} | p: {ttest_results[1]/2:.5f}")

    return ttest_dict


In [239]:
def two_way_anova(df, f1='iter', f2='treat', acc='acc', formula=None):
    """Fit an OLS model with two categorical factors and return its ANOVA table.

    Unless ``formula`` is supplied, the model is
    ``acc ~ C(f1) + C(f2) + C(f1):C(f2)`` (both main effects plus their
    interaction). The formula is printed before fitting. Only the columns
    ``f1``, ``f2`` and ``acc`` of ``df`` are handed to the model.

    Returns:
        Tuple ``(anova_table, fitted_model)`` where the table comes from
        ``anova_lm`` with ``typ=2``.
    """
    formula = formula or f'{acc} ~ C({f1}) + C({f2}) + C({f1}):C({f2})'
    print(formula)

    fitted = ols(formula, data=df[[f1, f2, acc]]).fit()
    anova_table = sm.stats.anova_lm(fitted, typ=2)

    return anova_table, fitted

def one_way_anova(df, f='iter', acc='acc', formula=None):
    """Fit an OLS model with one categorical factor and return its ANOVA table.

    Unless ``formula`` is supplied, the model is ``acc ~ C(f)``. The formula
    is printed before fitting. Only the columns ``f`` and ``acc`` of ``df``
    are handed to the model.

    Returns:
        Tuple ``(anova_table, fitted_model)`` where the table comes from
        ``anova_lm`` with ``typ=1``.
    """
    formula = formula or f'{acc} ~ C({f})'
    print(formula)

    fitted = ols(formula, data=df[[f, acc]]).fit()
    anova_table = sm.stats.anova_lm(fitted, typ=1)

    return anova_table, fitted

# Combine Plotting Data

In [240]:
# Locate the repository root. '__file__' is a quoted string literal here, not the
# module attribute, so abspath() resolves it against the current working directory
# and the two dirname() calls yield the parent of the working directory.
repo = os.path.dirname(os.path.dirname(os.path.abspath('__file__')))
sample_type = 'cross_eval'
# Highest round number loaded by the cells below.
iteration = 5
# Directory that receives the combined plotting tables written by this section.
plot_data = os.path.join(repo, 'eval_summary', 'plot_data')
os.makedirs(plot_data, exist_ok=True)

In [241]:
models = ['roberta-large', 'roberta-large-mnli']

In [242]:
distributions = {}

for model in models:
    eval_dir = os.path.join(repo, 'eval_summary', model)
    distributions[model] = load_all_sampled(
        os.path.join(eval_dir, 'sample', sample_type), upto=iteration
    )

## MNLI Only Training

In [243]:
pred_base = os.path.join(repo, 'predictions', 'roberta-large-mnli_only')
data_base = os.path.join(repo, 'tasks', 'data')
best_data = os.path.join(pred_base, 'best')

pred_dirs = [
    'baseline_5',
    'LotS_5',
    'LitL_5',
    'mnlieval_baseline_1',
    'anlieval_baseline_1',
    'eval_baseline_1',
]

data_dirs = [
    os.path.join('baseline_5', 'val_round5_base_combined.jsonl'),
    os.path.join('LotS_5', 'val_round5_LotS_combined.jsonl'),
    os.path.join('LitL_5', 'val_round5_LitL_combined.jsonl'),
    os.path.join('mnli_mismatched', 'val_mismatched_mnli.jsonl'),
    os.path.join('anli_combined', 'val_anli.jsonl'),
    os.path.join('iterative_eval', 'val_itercombined.jsonl'),
]

lrs = ['0.00001', '0.00002', '0.00003']
batches = ['16', '32']

n_trials = 10

In [244]:
def read_jsonl(file):
    """Read a JSON-lines file and return its records as a list.

    Every non-blank line is parsed with ``json.loads``. Blank lines (for
    example a trailing empty line at the end of the file) are skipped rather
    than raising a decode error.

    Args:
        file: Path of the JSON-lines file.

    Returns:
        List with one decoded object per non-blank line, in file order.
    """
    with open(file, 'r') as f:
        return [json.loads(line) for line in f if line.strip()]
    
def get_acc(
    preds,
    data,
    int2pred={0:'contradiction', 1:'entailment', 2:'neutral'}
):
    """Attach predicted labels and a correctness flag to the evaluation records.

    Args:
        preds: Sequence of integer class ids, one per record of ``data``.
        data: Records (e.g. a list of dicts) convertible to a DataFrame that
            has a ``label`` column.
        int2pred: Mapping from class id to label string.

    Returns:
        DataFrame built from ``data`` with two extra columns: ``preds`` (the
        mapped label) and ``correct`` (whether ``label`` equals ``preds``).

    Raises:
        KeyError: If a predicted id is missing from ``int2pred``.
    """
    scored = pd.DataFrame(data)
    predicted_labels = pd.Series(preds).apply(lambda class_id: int2pred[class_id])
    scored['preds'] = predicted_labels
    scored['correct'] = scored['label'] == scored['preds']
    return scored

In [245]:
if overwrite_plotting_data:
    # Score the MNLI-only model on every evaluation set and cache the result.
    # NOTE(review): torch.load unpickles the file; only load prediction files
    # from a trusted source.
    preds_and_data = {}

    for pred, data in zip(pred_dirs, data_dirs):
        temp_pred = torch.load(os.path.join(pred_base, pred, 'val_preds.p'))
        temp_data = read_jsonl(os.path.join(data_base, data))
        # Key by the part of the directory name before the first '_'.
        preds_and_data[pred.split('_')[0]] = {'pred': temp_pred, 'data':temp_data}

    accs = {
        key: get_acc(val['pred']['mnli']['preds'], val['data']) for key, val in preds_and_data.items()
    }

    eval_accs = accs['eval']
    accs['glue'] = eval_accs.loc[eval_accs['dataset'] == 'glue']

    # Take an explicit copy: the columns below are reassigned, and doing that on a
    # slice of accs['eval'] triggers pandas' SettingWithCopy warning.
    hans = eval_accs.loc[eval_accs['dataset'] == 'hans', :].copy()
    # HANS is scored with two labels, so 'neutral' predictions are folded into 'contradiction'.
    tempdict = {'contradiction':'contradiction', 'neutral':'contradiction', 'entailment':'entailment'}
    hans['preds'] = hans['preds'].apply(lambda x: tempdict[x])
    # Keeps the first element of each 'case' value.
    # NOTE(review): presumably 'case' is a one-element list here — confirm.
    hans['case'] = hans['case'].apply(lambda x: x[0])
    hans['correct'] = hans['label'].eq(hans['preds'])

    accs['hans'] = hans

    with open(os.path.join(plot_data, 'mnli-only-training_accs.p'), 'wb') as f:
        pickle.dump(accs, f)

In [246]:
if overwrite_plotting_data:
    # Same scoring as for the single MNLI-only run, repeated for every restart
    # stored under <best_data>/<prediction dir>/<trial>.
    # NOTE(review): torch.load unpickles the file; only load prediction files
    # from a trusted source.
    for trial in range(1, n_trials+1):
        preds_and_data = {}

        for pred, data in zip(pred_dirs, data_dirs):
            temp_pred = torch.load(os.path.join(best_data, pred, f'{trial}', 'val_preds.p'))
            temp_data = read_jsonl(os.path.join(data_base, data))
            # Key by the part of the directory name before the first '_'.
            preds_and_data[pred.split('_')[0]] = {'pred': temp_pred, 'data':temp_data}

        accs = {
            key: get_acc(val['pred']['mnli']['preds'], val['data']) for key, val in preds_and_data.items()
        }

        eval_accs = accs['eval']
        accs['glue'] = eval_accs.loc[eval_accs['dataset'] == 'glue']

        # Take an explicit copy: the columns below are reassigned, and doing that on
        # a slice of accs['eval'] triggers pandas' SettingWithCopy warning.
        hans = eval_accs.loc[eval_accs['dataset'] == 'hans', :].copy()
        # HANS is scored with two labels, so 'neutral' predictions are folded into 'contradiction'.
        tempdict = {'contradiction':'contradiction', 'neutral':'contradiction', 'entailment':'entailment'}
        hans['preds'] = hans['preds'].apply(lambda x: tempdict[x])
        # Keeps the first element of each 'case' value.
        # NOTE(review): presumably 'case' is a one-element list here — confirm.
        hans['case'] = hans['case'].apply(lambda x: x[0])
        hans['correct'] = hans['label'].eq(hans['preds'])

        accs['hans'] = hans

        mnli_out_dir = os.path.join(plot_data, 'mnli_restarts', 'best', f'{trial}')
        os.makedirs(mnli_out_dir, exist_ok = True)
        with open(os.path.join(mnli_out_dir, 'mnli-only-training_accs.p'), 'wb') as f:
            pickle.dump(accs, f)

## Collected

In [247]:
select2mod = {
    ('combined', 'full'): 'combined',
    ('combined', 'hyp'): 'hyp',
    ('separate', 'full'): 'separate',
    ('separate', 'hyp'): 'separate_hyp',
}

In [248]:
# Build collected.csv: for every model, the sampled 'collected' results plus the
# per-round validation summaries in long format (one row per treatment and round).
all_df = []

if overwrite_plotting_data:
    for model in models:
        collected = []
        eval_dir = os.path.join(repo, 'eval_summary', model)

        for combined, input_type in select2mod.keys():
            mod = select2mod[(combined, input_type)]
            temp = get_val_summary(mod, iteration, eval_dir, )
            # Each row of the summary is one treatment; its columns are the rounds.
            for idx, row in temp.iterrows():
                df = pd.DataFrame({
                    'acc': row,
                    'iter': [int(x) for x in row.index.values],
                    'treat':row.name,
                    'mod':input_type,
                    'combined':combined,
                    'model':model,
                })
                collected.append(df)
        collected_t = pd.concat(collected, ignore_index = True)
        # Note: this adds the column to the frame stored in `distributions` in place.
        distributions[model]['collected']['model'] = model
        all_df.append(pd.concat([distributions[model]['collected'], collected_t], ignore_index = True))
    df = pd.concat(all_df, ignore_index = True)
    # to_csv also writes the index, which is read back later as an 'Unnamed: 0' column.
    df.to_csv(os.path.join(plot_data, 'collected.csv'))

## GLUE

In [249]:
glue_keys = pd.read_csv('glue_case_keys.csv')
print(glue_keys)
glue_labels = ['combined', 'entailment', 'neutral', 'contradiction']

                            case                                subcase
0                       combined                               combined
1                      Knowledge                               combined
2                      Knowledge                           Common sense
3                      Knowledge                        World knowledge
4              Lexical Semantics                               combined
..                           ...                                    ...
64  Predicate-Argument Structure  Relative clauses;Anaphora/Coreference
65  Predicate-Argument Structure         Relative clauses;Restrictivity
66  Predicate-Argument Structure                          Restrictivity
67  Predicate-Argument Structure     Restrictivity;Anaphora/Coreference
68  Predicate-Argument Structure         Restrictivity;Relative clauses

[69 rows x 2 columns]


In [250]:
# Build glue.csv: for every model, the sampled itereval rows of this dataset plus
# the per-round itereval summaries in long format (one row per treatment and round).
dataset = 'glue'

combineds = ['combined', 'separate']
all_df = []

if overwrite_plotting_data:
    for model in models:
        collected = []
        eval_dir = os.path.join(repo, 'eval_summary', model)

        for _, caserow in glue_keys.iterrows():
            for label in glue_labels:
                for combined in combineds:
                    sub_keys = {
                        'dataset': dataset,     # either hans or glue
                        'case': caserow['case'],    # combined or specific to respective itereval set
                        'subcase': caserow['subcase'], # combined or specific to respective itereval set
                        'label': label,   # combined or [entailment, neutral, contradiction] for glue, [entailment, non-entailment] for hans
                    }

                    temp = get_itereval_summary(sub_keys, iteration, eval_dir, combined)
                    # Each row of the summary is one treatment; its columns are the rounds.
                    for treat, row in temp.iterrows():
                        collected.append(pd.DataFrame({
                            'acc': row,
                            'iter': [int(x) for x in row.index.values],
                            'treat': treat,
                            'case': sub_keys['case'],
                            'subcase': sub_keys['subcase'],
                            'label': sub_keys['label'],
                            'comb': combined,
                            'model': model,
                        }))

        # Concatenate once per model, after all pieces have been gathered, instead of
        # re-concatenating the growing list on every pass of the label loop.
        collected_t = pd.concat(collected, ignore_index = True)

        temp_sampled = distributions[model]['itereval']
        # Explicit copy so that adding the 'model' column does not write to a slice.
        temp_sampled = temp_sampled.loc[temp_sampled['dataset'] == dataset, :].copy()
        temp_sampled['model'] = model
        all_df.append(pd.concat([temp_sampled, collected_t], ignore_index=True))
    df = pd.concat(all_df, ignore_index = True)
    df.to_csv(os.path.join(plot_data, 'glue.csv'))

## HANS

In [251]:
hans_keys = pd.read_csv('hans_case_keys.csv')
print(hans_keys)
hans_labels = ['combined', 'entailment', 'non-entailment']

               case                         subcase
0          combined                        combined
1       constituent                       ce_adverb
2       constituent           ce_after_since_clause
3       constituent                  ce_conjunction
4       constituent         ce_embedded_under_since
5       constituent          ce_embedded_under_verb
6       constituent                       cn_adverb
7       constituent              cn_after_if_clause
8       constituent                  cn_disjunction
9       constituent            cn_embedded_under_if
10      constituent          cn_embedded_under_verb
11      constituent                        combined
12  lexical_overlap                        combined
13  lexical_overlap  le_around_prepositional_phrase
14  lexical_overlap       le_around_relative_clause
15  lexical_overlap                  le_conjunction
16  lexical_overlap                      le_passive
17  lexical_overlap              le_relative_clause
18  lexical_

In [252]:
# Build hans.csv: for every model, the sampled itereval rows of this dataset plus
# the per-round itereval summaries in long format (one row per treatment and round).
dataset = 'hans'

combineds = ['combined', 'separate']
all_df = []

if overwrite_plotting_data:
    for model in models:
        collected = []
        eval_dir = os.path.join(repo, 'eval_summary', model)

        for _, caserow in hans_keys.iterrows():
            for label in hans_labels:
                for combined in combineds:
                    sub_keys = {
                        'dataset': dataset,     # either glue or hans
                        'case': caserow['case'],    # combined or specific to respective itereval set
                        'subcase': caserow['subcase'], # combined or specific to respective itereval set
                        'label': label,   # combined or [entailment, neutral, contradiction] for glue, [entailment, non-entailment] for hans
                    }

                    temp = get_itereval_summary(sub_keys, iteration, eval_dir, combined)
                    # Each row of the summary is one treatment; its columns are the rounds.
                    for treat, row in temp.iterrows():
                        collected.append(pd.DataFrame({
                            'acc': row,
                            'iter': [int(x) for x in row.index.values],
                            'treat': treat,
                            'case': sub_keys['case'],
                            'subcase': sub_keys['subcase'],
                            'label': sub_keys['label'],
                            'comb': combined,
                            'model': model,
                        }))

        # Concatenate once per model, after all pieces have been gathered, instead of
        # re-concatenating the growing list on every pass of the label loop.
        collected_t = pd.concat(collected, ignore_index = True)

        temp_sampled = distributions[model]['itereval']
        # Explicit copy so that adding the 'model' column does not write to a slice.
        temp_sampled = temp_sampled.loc[temp_sampled['dataset'] == dataset, :].copy()
        temp_sampled['model'] = model
        all_df.append(pd.concat([temp_sampled, collected_t], ignore_index=True))
    df = pd.concat(all_df, ignore_index = True)
    df.to_csv(os.path.join(plot_data, 'hans.csv'))

## MNLI

In [253]:
# Build mnli.csv: for every model, the sampled MNLI results followed by the
# records of that model's eval_summaries.jsonl.
all_df = []

if overwrite_plotting_data:
    for model in models:
        eval_dir = os.path.join(repo, 'eval_summary', model)
        mnli_summary = os.path.join(eval_dir, 'mnli_evals', 'eval_summaries.jsonl')

        with open(mnli_summary, 'r') as f:
            summary = pd.DataFrame([json.loads(line) for line in f])
        # Rows without a genre are labelled 'combined', as for the sampled results.
        summary['breakdown'] = summary['genre'].fillna('combined')
        summary['iter'] = summary['iter'].apply(lambda x: int(x))

        temp = pd.concat([distributions[model]['mnli'], summary], ignore_index=True)
        temp['model'] = model

        all_df.append(temp)
    df = pd.concat(all_df, ignore_index = True)
    df.to_csv(os.path.join(plot_data, 'mnli.csv'))

## ANLI

In [254]:
# Build anli.csv: stack each model's anli_by_annotation.csv, tagged with the model name.
all_df = []

if overwrite_plotting_data:
    for model in models:
        eval_dir = os.path.join(repo, 'eval_summary', model)
        df = pd.read_csv(os.path.join(eval_dir, 'sample', sample_type, 'final', 'anli_by_annotation.csv'))
        df['model'] = model

        all_df.append(df)
    df = pd.concat(all_df, ignore_index = True)
    df.to_csv(os.path.join(plot_data, 'anli.csv'))

# Plot

## Plot Params

In [255]:
combined = 'combined'

In [256]:
repo = os.path.dirname(os.path.dirname(os.path.abspath('__file__')))
plot_dir = os.path.join(repo, 'eval_summary', 'plot_data')
plot_out = os.path.join(repo, 'eval_summary', 'plots', _model)
os.makedirs(plot_out, exist_ok=True)

In [257]:
acc_name = 'Performance'
diff_name = 'Over Baseline'

In [258]:
save_figs = True
figtype='pdf'

err_style='bars' # band or bars
err_kws={'elinewidth': 2, 'capsize': 3}

title_fontsize=18
label_fontsize=16
legend_fontsize=14

err_line_offset = 4 if _model == 'roberta-large' else 5
err_cap_offset = 0.05


In [259]:
# Display names used in the figures: column renames, treatment labels and model labels.
cols2plot = {'treat':'Protocol', 'model':'Model'}
# NOTE(review): the raw treatment 'LotS' is displayed as 'LitL' and the raw treatment
# 'LitL' as 'LitL Chat', so the string 'LitL' means different things before and after
# this mapping is applied.
treat2plot = {'baseline':'Baseline', 'LotS':'LitL', 'LitL':'LitL Chat'}
model2plot = {'roberta-large': r'RoBERTa$_{\rm{Lg}}$', 'roberta-large-mnli': r'RoBERTa$_{\rm{Lg+MNLI}}$'}

# Hue/style settings handed to the plotting calls (they refer to the renamed columns).
hue='Protocol'
hue_order=['Baseline', 'LitL', 'LitL Chat']
style_key='Model'
# Only the model selected in the run parameters is plotted.
style_order=[
    model2plot[_model]
]
boxplots=False
saveappend = "-box" if boxplots else ""

# NOTE(review): the key 'baseline' is lower-case while hue_order uses 'Baseline'.
palette={
        'baseline':'tab:blue',
        'LitL':'tab:orange',
        'LitL Chat':'tab:green',
    }
# Colours for the residual plots; contains raw treatment names and display names.
# NOTE(review): raw 'LotS' is green here, but treat2plot displays 'LotS' as 'LitL',
# which is orange in this same dict — confirm which colour is intended.
resid_palette={
        'baseline':'tab:blue',
        'Baseline':'tab:blue',
        'LitL':'tab:orange',
        'LotS':'tab:green',
        'LitL Chat':'tab:green',
    }
# Marker per round in the residual plots; only rounds 1 and 5 are defined.
symbols = {
    1:"D", 
    5:"^",
}

xlim = [0.8, 5.2]

In [260]:
with open(os.path.join(plot_dir, 'mnli-only-training_accs.p'), 'rb') as f:
    mnli_accs = pickle.load(f)

## Define Plotting Functions

In [261]:
def err_line_plots(
    plot_df,
    ylim=[0,1],
    ystep=0.1,
    xlim=[1,5],
    title=None,
    xlabel=None,
    ylabel=None,
    tabletitle=None,
    tableon=True,
    x='iter',
    y='acc',
    err_style='bars',
    ci=95,
    estimator='mean',
    markers=True,
    hue='treat',
    hue_order=['baseline', 'LotS', 'LitL'],
    iteration=5,
    bbox_to_anchor=(1.01, 1),
    palette=None,
    style_key='combined',
    style_order=['combined', 'separate'],
    yaxis_visible = True,
    xaxis_visible = True,
    ylabel_visible = True,
    xlabel_visible = True,
    legend_visible = True,
    ax=None,
    figsize=(6.4, 4.8),
    err_kws={'elinewidth': 1, 'capsize': 2},
    err_alpha=0.6,
    linewidth=2,
    markersize=7,
    loc='best',
    ncol=1,
    error_offsets = [
        {
            'line':-err_line_offset,
            'cap':-err_cap_offset,
        },
        {
            'line':0,
            'cap':0,
        },
        {
            'line':+err_line_offset,
            'cap':+err_cap_offset,
        }
    ],
    boxplots = False,
):
    """Draw accuracy per round as a line plot with error bars, or as box plots.

    Args:
        plot_df: Long-format DataFrame holding the columns named by ``x``,
            ``y``, ``hue`` and ``style_key``.
        ylim, ystep: y-axis limits and tick spacing.
        xlim: x-axis limits (line plot only).
        title, xlabel, ylabel: Axis texts; font sizes come from the
            module-level ``title_fontsize`` / ``label_fontsize``.
        x, y: Column names for the round and the plotted value.
        err_style, err_kws, ci, markers, linewidth, markersize: Passed to
            ``sns.lineplot``.
        hue, hue_order: Column and level order used for colour.
        iteration: Number of rounds; x ticks are placed at ``1 .. iteration``.
        style_key, style_order: Column and level order used for line style.
            If ``style_order`` has exactly one entry, ``plot_df`` is filtered
            to that value and the style is taken from ``hue`` instead.
        xaxis_visible, yaxis_visible: Hide the tick labels (and the axis
            label) of an axis when false.
        xlabel_visible, ylabel_visible: Hide only the axis label when false.
        legend_visible, bbox_to_anchor, loc, ncol: Legend settings; the font
            size comes from the module-level ``legend_fontsize``.
        ax: Axes to draw on. A new figure of size ``figsize`` is created when
            it is ``None``.
        err_alpha: Transparency applied to the error-bar artists.
        error_offsets: One ``{'line': ..., 'cap': ...}`` dict per error-bar
            container, used to shift the bars and caps of each series
            sideways so that they do not overlap. Pass a falsy value to
            disable the shifting.
        boxplots: Draw ``sns.boxplot`` instead of the line plot.
        tabletitle, tableon, estimator, palette: Accepted for call
            compatibility but not used by this function.

    Returns:
        The created Figure when ``ax`` was ``None``; otherwise ``None``.
    """
    kwargs = {}
    no_ax = ax is None
    if no_ax:
        fig, ax = plt.subplots(figsize=figsize)

    if len(style_order) == 1:
        # A single style level means "plot just this subset"; reuse the hue
        # grouping for the line style so every series still gets its own marker.
        plot_df = plot_df.loc[plot_df[style_key] == style_order[0], :]
        style_key, style_order = hue, hue_order

    round_ticks = np.arange(1, iteration + 1, 1)

    if boxplots:
        # Boxplots
        sns.boxplot(
            data=plot_df, x=x, y=y,
            hue=hue, hue_order=hue_order,
            ax=ax, **kwargs,
            width=0.4, saturation=1,
        )
    else:
        # Lineplot
        g = sns.lineplot(
            data=plot_df, x=x, y=y,
            hue=hue, hue_order=hue_order,
            style = style_key, style_order = style_order,
            err_style=err_style, err_kws=err_kws,
            ci=ci, markers=markers,
            ax=ax, **kwargs,
            linewidth=linewidth, markersize=markersize,
        )

        if error_offsets:
            assert len(g.containers) == len(error_offsets), (
                f'containers {len(g.containers)}, error_offsets {len(error_offsets)}'
            )

            for container, offsets in zip(g.containers, error_offsets):
                # Each container is indexed as (data line, cap lines, bar collections).
                # offset line
                plt.setp(container[2][0], offsets = [offsets['line'], 0.])

                # offset caps
                # NOTE(review): this rewrites private Line2D attributes (_xy, _path);
                # it may need adjusting when matplotlib is upgraded.
                for cap in container[1]:
                    temp = cap._xy
                    temp[:, 0] = temp[:, 0] + offsets['cap']
                    cap._path = pth.Path(temp)

        plt.setp(g.containers, alpha=err_alpha)

    ax.set_title(title, fontsize=title_fontsize)

    ax.set_xlabel(xlabel if xaxis_visible and xlabel_visible else '', fontsize=label_fontsize)
    if boxplots:
        ax.set_xticklabels([str(r) for r in round_ticks])
    else:
        ax.set_xlim(*xlim)
        ax.set_xticks(round_ticks)

    if not xaxis_visible:
        ax.xaxis.set_ticklabels([])

    ax.set_ylabel(ylabel if yaxis_visible and ylabel_visible else '', fontsize=label_fontsize)
    ax.set_yticks(np.arange(ylim[0], ylim[1]+ystep, ystep))
    ax.set_ylim(*ylim)
    if not yaxis_visible:
        ax.yaxis.set_ticklabels([])

    ax.legend(
        bbox_to_anchor=bbox_to_anchor,
        loc=loc,
        ncol=ncol,
        fontsize=legend_fontsize,
    ).set_visible(legend_visible)

    if no_ax:
        fig.tight_layout()
        return fig

def display_resid(
    iter_df, anova_lm, 
    treat_name="treat", iter_name="iter", ppalette=resid_palette
):
    """Scatter the residuals of a fitted model, one x position per factor group.

    ``iter_df`` is grouped by ``(treat_name, iter_name)``. The residuals of
    each group are drawn in a new 6x6 figure at the group's ordinal position,
    coloured by treatment through ``ppalette`` and shaped by round through
    the module-level ``symbols`` mapping.

    Args:
        iter_df: Data the model was fitted on; its index is used to look up
            the matching residuals.
        anova_lm: Fitted model exposing a ``resid`` attribute.
        treat_name: Column holding the treatment.
        iter_name: Column holding the round. Every value must be a key of
            ``symbols``.
        ppalette: Mapping from treatment value to colour. It is used for the
            points and, where it has the legend label as a key, for the
            legend patches.

    Raises:
        KeyError: If a round has no entry in ``symbols`` or a treatment has
            no entry in ``ppalette``.
    """
    factor_groups = iter_df.groupby([treat_name, iter_name])
    resid = anova_lm.resid
    plt.figure(figsize=(6, 6))
    for group_num, (values, group) in enumerate(factor_groups):
        treat_value, round_value = values
        plt.scatter(
            [group_num] * len(group),
            resid[group.index],
            marker=symbols[round_value],
            color=ppalette[treat_value],
            s=144,
            edgecolors="black",
        )
    plt.xlabel("Group")
    plt.ylabel("Residuals")

    # NOTE(review): the legend labels are display names, while the points above are
    # coloured by the values found in `treat_name`. If those are raw treatment names,
    # check that both resolve to the same colour in the palette.
    color_legend = ['Baseline', 'LitL', 'LitL Chat']
    marker_legend = [1, 5]
    # Colour the legend from the palette that coloured the points; fall back to the
    # module-level palette for labels that a custom palette does not define.
    colors = [
        mpatches.Patch(
            color=ppalette[treat] if treat in ppalette else resid_palette[treat]
        )
        for treat in color_legend
    ]
    markers = [plt.plot([], [], symbols[r], markerfacecolor='w',
                        markeredgecolor='k')[0] for r in marker_legend]
    plt.legend(
        colors + markers,
        color_legend + [f'Round {r}' for r in marker_legend],
        loc='best'
    )

## Combined In-Domain

In [262]:
indomain_l_offset = 0 if _model == 'roberta-large' else -1
indomain_c_offset = 0

error_offsets = [
        {
            'line':-(err_line_offset+indomain_l_offset),
            'cap':-(err_cap_offset+indomain_c_offset),
        },
        {
            'line':0,
            'cap':0,
        },
        {
            'line':+(err_line_offset+indomain_l_offset),
            'cap':+(err_cap_offset+indomain_c_offset),
        }
    ]

In [263]:
figsize=(7, 5)
fig, ax = plt.subplots(2, 1, figsize=figsize)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

### In-domain MNLI Only Training

In [264]:
# Collect the accuracy of every MNLI-only restart, per treatment and round.
mnli_collected = []
n_trials = 10

for trial in range(1, n_trials+1):
    # NOTE(review): pickle.load executes code from the file; only read caches
    # produced by this notebook.
    with open(os.path.join(plot_dir, 'mnli_restarts', 'best', f'{trial}', 'mnli-only-training_accs.p'), 'rb') as f:
        temp_mnli_accs = pickle.load(f)

    for treat in ['baseline', 'LotS', 'LitL']:
        temp_df = temp_mnli_accs[treat]
        # The round number is taken from the last character of the 'round' value,
        # so this only works for single-digit rounds.
        temp_df['iter'] = temp_df['round'].apply(lambda x: int(x[-1]))

        # The loop variable is deliberately not called `iteration`, so that the
        # notebook-level `iteration` setting is not overwritten.
        for round_num in temp_df['iter'].unique():
            subsets = {
                # 'combined': all examples collected up to and including this round
                'combined': temp_df.loc[temp_df['iter'] <= round_num, :],
                # 'separate': only the examples of this round
                'separate': temp_df.loc[temp_df['iter'] == round_num, :],
            }
            for comb_label, temp_acc in subsets.items():
                mnli_collected.append(
                    {
                        'treat': treat,
                        'iter': round_num,
                        'model': 'mnli-only',
                        'mod': 'full',
                        'combined': comb_label,
                        'trial': trial,
                        'acc': temp_acc['correct'].sum()/temp_acc.shape[0]
                    }
                )

mnli_df = pd.DataFrame(mnli_collected)

In [265]:
# Keep the cumulative ('combined') accuracies and print mean/std over the restarts.
mnli_df = mnli_df.loc[mnli_df['combined'] == 'combined', :]

for treat in mnli_df['treat'].unique():
    treat_df = mnli_df.loc[mnli_df['treat'] == treat, :]
    # The loop variable is deliberately not called `iteration`, so that the
    # notebook-level `iteration` setting is not overwritten.
    for round_num in mnli_df['iter'].unique():
        temp_df = treat_df.loc[treat_df['iter'] == round_num, :]

        print(treat, round_num, f"acc mean: {temp_df['acc'].mean():.3f}", f"acc std: {temp_df['acc'].std():.3f}")

baseline 1 acc mean: 0.841 acc std: 0.012
baseline 2 acc mean: 0.854 acc std: 0.006
baseline 3 acc mean: 0.860 acc std: 0.007
baseline 4 acc mean: 0.862 acc std: 0.008
baseline 5 acc mean: 0.866 acc std: 0.005
LotS 1 acc mean: 0.817 acc std: 0.011
LotS 2 acc mean: 0.812 acc std: 0.010
LotS 3 acc mean: 0.821 acc std: 0.008
LotS 4 acc mean: 0.827 acc std: 0.007
LotS 5 acc mean: 0.823 acc std: 0.006
LitL 1 acc mean: 0.860 acc std: 0.011
LitL 2 acc mean: 0.845 acc std: 0.009
LitL 3 acc mean: 0.838 acc std: 0.009
LitL 4 acc mean: 0.839 acc std: 0.010
LitL 5 acc mean: 0.836 acc std: 0.009


In [266]:
# Work on a copy so the renaming below does not touch mnli_df, which is used again
# for the ANOVA further down.
plot_df = copy.deepcopy(mnli_df)
# Relabel every row with the display name of the selected model; the plotting call
# below passes that name as its only style_order entry.
plot_df['model'] = model2plot[_model]
plot_df['treat'] = plot_df['treat'].apply(lambda x: treat2plot[x])
plot_df.rename(columns=cols2plot, inplace=True)

In [267]:
mnlifigsize=(5, 2)
figmnli, axmnli = plt.subplots(figsize=mnlifigsize)

ylims={
    'roberta-large': [0.8, 0.9],
    'roberta-large-mnli': [0.8, 0.9],
}
title=f'MNLI-only Trained'
xlabel='Round'
ylabel='Accuracy'
tabletitle='Median'
tableon=False

mnli_l_offset = 1.5
mnlierror_offsets = [
        {
            'line':-(err_line_offset+mnli_l_offset),
            'cap':-(err_cap_offset+0),
        },
        {
            'line':0,
            'cap':0,
        },
        {
            'line':+(err_line_offset+mnli_l_offset),
            'cap':+(err_cap_offset+0),
        }
    ]

bbox_to_anchor = (1.25, 1)

err_line_plots(
    plot_df,
    err_style=err_style,
    ylim=ylims[_model],
    title=title,
    xlabel=xlabel,
    ylabel=ylabel,
    tabletitle=tabletitle,
    palette=palette,
    tableon=tableon,
    style_key=style_key,
    style_order=style_order,
    figsize=mnlifigsize,
    hue=hue,
    hue_order=hue_order,
    xlim=xlim,
    err_kws=err_kws,
    ax=axmnli,
    ystep=0.02,
    legend_visible=True,
    xlabel_visible=True,
    error_offsets=mnlierror_offsets,
    boxplots=boxplots,
    bbox_to_anchor=bbox_to_anchor,
)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [268]:
# 2-way ANOVA (round x treatment) on the MNLI-only restarts, first vs. last round

iterations = [1, 5]

iter_df = mnli_df.loc[mnli_df['iter'].isin(iterations), :]
anova_table, anova_lm = two_way_anova(iter_df)

# Label the table with the model these rows belong to. Printing `model` here would
# show whatever value an earlier loop left behind, not the MNLI-only model.
print('mnli-only')
print(anova_table)
print('-'*90)

display_resid(iter_df, anova_lm)


acc ~ C(iter) + C(treat) + C(iter):C(treat)
roberta-large-mnli
                    sum_sq    df          F        PR(>F)
C(iter)           0.000083   1.0   0.926261  3.401266e-01
C(treat)          0.013026   2.0  72.745731  4.749797e-16
C(iter):C(treat)  0.006278   2.0  35.060489  1.741050e-10
Residual          0.004835  54.0        NaN           NaN
------------------------------------------------------------------------------------------


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

### In Domain

In [269]:
plot_df = pd.read_csv(os.path.join(plot_dir, 'collected.csv'))

In [270]:
input_type = 'full'

plot_df = plot_df.loc[plot_df['combined'] == combined, :]
plot_df = plot_df.loc[plot_df['mod'] == input_type, :]

In [271]:
print(plot_df)

      Unnamed: 0  Unnamed: 0.1         run hyperparams sample_type  \
0              0           0.0  baseline_1  0.00003_32  cross_eval   
1              1           1.0  baseline_1  0.00003_32  cross_eval   
2              2           2.0  baseline_1  0.00003_32  cross_eval   
3              3           3.0  baseline_1  0.00003_32  cross_eval   
4              4           4.0  baseline_1  0.00003_32  cross_eval   
...          ...           ...         ...         ...         ...   
1270        1270           NaN         NaN         NaN         NaN   
1271        1271           NaN         NaN         NaN         NaN   
1272        1272           NaN         NaN         NaN         NaN   
1273        1273           NaN         NaN         NaN         NaN   
1274        1274           NaN         NaN         NaN         NaN   

      sample_partition       acc     treat  iter   mod  combined  \
0                  0.1  0.778905  baseline     1  full  combined   
1                  0.2 

In [272]:
# 2-way ANOVA (round x treatment), run separately for each model on the
# in-domain rows selected above; only the rounds in `iterations` are compared.

iterations = [1, 5]
models = ['roberta-large', 'roberta-large-mnli']

for model in models:
    iter_df = plot_df.loc[plot_df['iter'].isin(iterations), :]
    iter_df = iter_df.loc[iter_df['model'] == model, :]
    anova_table, anova_lm = two_way_anova(iter_df)
    
    print(model)
    print(anova_table)
    print('-'*90)

    # One residual figure per model.
    display_resid(iter_df, anova_lm)

acc ~ C(iter) + C(treat) + C(iter):C(treat)
roberta-large
                    sum_sq    df         F    PR(>F)
C(iter)           0.009287   1.0  0.631617  0.429895
C(treat)          0.038552   2.0  1.310954  0.277172
C(iter):C(treat)  0.117642   2.0  4.000362  0.023395
Residual          0.882239  60.0       NaN       NaN
------------------------------------------------------------------------------------------


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

acc ~ C(iter) + C(treat) + C(iter):C(treat)
roberta-large-mnli
                    sum_sq    df           F        PR(>F)
C(iter)           0.000037   1.0    0.988386  3.241299e-01
C(treat)          0.018550   2.0  245.427812  1.298386e-29
C(iter):C(treat)  0.007667   2.0  101.446507  5.638533e-20
Residual          0.002267  60.0         NaN           NaN
------------------------------------------------------------------------------------------


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [273]:
# Swap raw model/protocol identifiers for their display labels, then rename
# the columns to the names used by the plotting code.
plot_df = (
    plot_df
    .assign(
        model=lambda frame: frame['model'].apply(lambda name: model2plot[name]),
        treat=lambda frame: frame['treat'].apply(lambda name: treat2plot[name]),
    )
    .rename(columns=cols2plot)
)

In [274]:
# y-axis limits per model; the box-plot variant uses one shared, wider range.
if boxplots:
    ylims = {
        'roberta-large': [0.3, 0.9],
        'roberta-large-mnli': [0.3, 0.9],
    }
else:
    ylims = {
        'roberta-large': [0.6, 0.9],
        'roberta-large-mnli': [0.8, 1.0],
    }

title = 'In-domain Validation'
xlabel = 'Round'
ylabel = 'Accuracy'
tabletitle = 'Median'
tableon = False

# Draw into ax[0]: legend shown, x label hidden.
err_line_plots(
    plot_df,
    ax=ax[0],
    figsize=figsize,
    title=title,
    xlabel=xlabel,
    ylabel=ylabel,
    xlabel_visible=False,
    legend_visible=True,
    tabletitle=tabletitle,
    tableon=tableon,
    xlim=xlim,
    ylim=ylims[_model],
    ystep=0.05 if _model == 'roberta-large-mnli' else 0.1,
    hue=hue,
    hue_order=hue_order,
    palette=palette,
    style_key=style_key,
    style_order=style_order,
    err_style=err_style,
    err_kws=err_kws,
    error_offsets=error_offsets,
    boxplots=boxplots,
)

### Hyp

In [275]:
# Reload the collected results from disk so the filters below start from the unfiltered table.
plot_df = pd.read_csv(os.path.join(plot_dir, 'collected.csv'))

In [276]:
input_type = 'hyp'

# Keep rows for the selected `combined` setting and the hypothesis-only input type.
plot_df = plot_df.loc[
    (plot_df['combined'] == combined) & (plot_df['mod'] == input_type), :
]

In [277]:
# Two-way ANOVA, fitted separately for each model on the rounds in `iterations`.

iterations = [1, 5]
models = ['roberta-large', 'roberta-large-mnli']

for model in models:
    # One combined mask: selected rounds AND the current model.
    iter_df = plot_df.loc[
        plot_df['iter'].isin(iterations) & (plot_df['model'] == model), :
    ]
    anova_table, anova_lm = two_way_anova(iter_df)

    for printable in (model, anova_table, '-' * 90):
        print(printable)

    display_resid(iter_df, anova_lm)

acc ~ C(iter) + C(treat) + C(iter):C(treat)
roberta-large
                    sum_sq    df          F    PR(>F)
C(iter)           0.023914   1.0  10.343665  0.002095
C(treat)          0.075884   2.0  16.410916  0.000002
C(iter):C(treat)  0.014604   2.0   3.158400  0.049640
Residual          0.138719  60.0        NaN       NaN
------------------------------------------------------------------------------------------


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

acc ~ C(iter) + C(treat) + C(iter):C(treat)
roberta-large-mnli
                    sum_sq    df           F        PR(>F)
C(iter)           0.000900   1.0    7.935376  6.553143e-03
C(treat)          0.039800   2.0  175.528151  8.462490e-26
C(iter):C(treat)  0.004758   2.0   20.986339  1.230333e-07
Residual          0.006802  60.0         NaN           NaN
------------------------------------------------------------------------------------------


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [278]:
# Swap raw model/protocol identifiers for their display labels, then rename
# the columns to the names used by the plotting code.
plot_df = (
    plot_df
    .assign(
        model=lambda frame: frame['model'].apply(lambda name: model2plot[name]),
        treat=lambda frame: frame['treat'].apply(lambda name: treat2plot[name]),
    )
    .rename(columns=cols2plot)
)

In [279]:
# Both models share the same y-range for the hypothesis-only panel.
ylims = {
    model_name: [0.3, 0.7]
    for model_name in ('roberta-large', 'roberta-large-mnli')
}
title = 'Hypothesis-only Input'
xlabel = 'Round'
ylabel = 'Accuracy'
tabletitle = 'Median'
tableon = False

bbox_to_anchor = (0.5, -1.2)

# Draw into ax[1]: x label shown, legend hidden.
err_line_plots(
    plot_df,
    ax=ax[1],
    figsize=figsize,
    title=title,
    xlabel=xlabel,
    ylabel=ylabel,
    xlabel_visible=True,
    legend_visible=False,
    tabletitle=tabletitle,
    tableon=tableon,
    xlim=xlim,
    ylim=ylims[_model],
    hue=hue,
    hue_order=hue_order,
    palette=palette,
    style_key=style_key,
    style_order=style_order,
    err_style=err_style,
    err_kws=err_kws,
    error_offsets=error_offsets,
    boxplots=boxplots,
)

In [280]:
# Majority-class baseline per protocol and round: the share of validation
# examples that carry the most frequent label in that round's file.
append = '_combined' if combined == 'combined' else ''
nli_data = os.path.join(repo, 'NLI_data')
protocol2dir = {
    'base':'1_Baseline_protocol',
    'LotS':'2_Ling_on_side_protocol',
    'LitL':'3_Ling_in_loop_protocol',
}
rounds = range(1,6)

majority_class = []

# File-name protocol keys -> protocol names used in the plots.
file2plot = {'base':'Baseline', 'LotS':'LitL', 'LitL':'LitL Chat'}

for protocol, protocol_dir in protocol2dir.items():
    for r in rounds:
        val_name = f'val_round{r}_{protocol}{append}.jsonl'
        val_path = os.path.join(nli_data, protocol_dir, val_name)

        # Stream the JSONL file instead of loading it whole, and skip blank
        # lines (e.g. a trailing newline) that would make json.loads fail.
        with open(val_path, 'r') as f:
            labels2count = collections.Counter(
                json.loads(line)['label'] for line in f if line.strip()
            )
        if not labels2count:
            # Fail with the offending path rather than an opaque max() error.
            raise ValueError(f'no labelled examples found in {val_path}')

        majority_class.append({
            'Protocol':file2plot[protocol],
            'Iteration':r,
            'Majority Class':max(labels2count.values())/sum(labels2count.values())
        })

In [281]:
majority_class_df = pd.DataFrame(majority_class)
# Average the majority-class share over protocols for each round. The numeric
# column is selected explicitly because the frame also holds the string
# 'Protocol' column, which pandas >= 2.0 refuses to average (numeric_only
# defaults to False there); older versions silently dropped it.
avg_majority = majority_class_df.groupby(by='Iteration')[['Majority Class']].mean()

xvals = avg_majority.index.values
if boxplots:
    # NOTE(review): presumably box plots are positioned from 0 while rounds start at 1 — confirm.
    xvals = xvals - 1

# Dashed black reference line on the second panel; the legend this creates is hidden again.
ax[1].plot(
    xvals, avg_majority['Majority Class'],
    c='k', ls='--'
)
ax[1].legend().set_visible(False)

### Save

In [282]:
# Finalise the layout and, when saving is enabled, write the in-domain figure into plot_out.
fig.tight_layout()
if save_figs:
    fig.savefig(os.path.join(plot_out, f'{combined}_indomain{saveappend}.{figtype}'))

In [55]:
# Finalise the layout of figmnli; when saving is enabled it is written to the
# parent directory of plot_out (not plot_out itself).
figmnli.tight_layout()
if save_figs:
    figmnli.savefig(os.path.join(os.path.dirname(plot_out), f'{combined}_mnlionly{saveappend}.{figtype}'))

## Diagnostic Sets

In [56]:
# Rounds compared in the ANOVAs of this section, and the model labels to loop over.
iterations = [1, 5]
models = style_order
# Model-dependent corrections that are added to err_line_offset / err_cap_offset
# when the error_offsets list is built.
iter_l_offset = -0.75 if _model == 'roberta-large' else -1.75
iter_c_offset = +0.02

In [57]:
# Offsets handed to err_line_plots as `error_offsets`: a negative shift, a
# zero shift and the mirrored positive shift for the error-bar line and cap.
# NOTE(review): presumably one entry per hue level (three protocols) — confirm in err_line_plots.
error_offsets = [
        {
            'line':-(err_line_offset+iter_l_offset),
            'cap':-(err_cap_offset+iter_c_offset),
        },
        {
            'line':0,
            'cap':0,
        },
        {
            'line':+(err_line_offset+iter_l_offset),
            'cap':+(err_cap_offset+iter_c_offset),
        }
    ]

In [58]:
# 2x4 grid of panels for the diagnostic-set figure.
figsize = (15, 5)
fig, ax = plt.subplots(2, 4, figsize=figsize) # top is GLUE bottom is HANS
# The HANS row only fills three panels, so the last axes of the bottom row is blanked.
ax[1, 3].set_axis_off()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

### GLUE

In [59]:
# Load the GLUE results table from plot_dir.
plot_df = pd.read_csv(os.path.join(plot_dir, 'glue.csv'))

In [60]:
# Accuracy of the MNLI-only model on GLUE, per case: one row over all examples
# of the case ('combined') followed by one row per gold label.
temp_df = mnli_accs['glue']
# The first entry of each 'case' sequence names the case; empty sequences map to ''.
temp_df['case_text'] = temp_df['case'].apply(lambda tags: tags[0] if len(tags) > 0 else '')

mnli_collected = []
for case in temp_df['case_text'].unique():
    temp_acc = temp_df[temp_df['case_text'] == case]
    per_label = [
        (gold, temp_acc[temp_acc['label'] == gold])
        for gold in temp_acc['label'].unique()
    ]
    for label_name, rows in [('combined', temp_acc)] + per_label:
        mnli_collected.append({
            'acc': rows['correct'].sum() / rows.shape[0],
            'subcase': 'combined',
            'label': label_name,
            'model': 'mnli-only',
            'case': case,
        })
mnli_df = pd.DataFrame(mnli_collected)

In [61]:
label = 'combined'

# Keep rows for the selected `combined` setting, the chosen label and the 'combined' subcase.
plot_df = plot_df.loc[
    (plot_df['comb'] == combined)
    & (plot_df['label'] == label)
    & (plot_df['subcase'] == 'combined'),
    :,
]

In [62]:
# Apply the same label / subcase selection to the MNLI-only reference table.
mnli_df = mnli_df.loc[
    (mnli_df['label'] == label) & (mnli_df['subcase'] == 'combined'), :
]

In [63]:
# Swap raw model/protocol identifiers for their display labels, then rename
# the columns to the names used by the plotting code.
plot_df = (
    plot_df
    .assign(
        model=lambda frame: frame['model'].apply(lambda name: model2plot[name]),
        treat=lambda frame: frame['treat'].apply(lambda name: treat2plot[name]),
    )
    .rename(columns=cols2plot)
)

In [64]:
# Per-model, per-category y-limits (the same values for both models).
ylims = {
    model_name: {
        'Knowledge': [0.4, 0.7],
        'Lexical Semantics': [0.5, 0.8],
        'Logic': [0.4, 0.7],
        'Predicate-Argument Structure': [0.5, 0.8],
    }
    for model_name in ('roberta-large', 'roberta-large-mnli')
}
title = ""
xlabel = 'Round'
ylabel = 'GLUE'
tabletitle = 'median'
tableon = False

glue_keys = pd.read_csv('glue_case_keys.csv')

i = 0  # column of the top row that the next category is drawn into
for case in glue_keys['case'].unique():
    if case != 'combined':
        temp_df = plot_df.loc[plot_df['case'] == case, :]
        err_line_plots(
            temp_df,
            ax=ax[0, i],
            title=f"{case}",
            xlabel=xlabel if i == len(glue_keys['case'].unique()) - 2 else None,
            ylabel=ylabel,
            ylabel_visible=(i == 0),
            legend_visible=False,
            tabletitle=tabletitle,
            tableon=tableon,
            xlim=xlim,
            ylim=ylims[_model][case],
            hue=hue,
            hue_order=hue_order,
            palette=palette,
            style_key=style_key,
            style_order=style_order,
            err_style=err_style,
            err_kws=err_kws,
            error_offsets=error_offsets,
            boxplots=boxplots,
        )

        # The MNLI-only reference line is drawn only for the MNLI-pretrained model.
        if _model == 'roberta-large-mnli':
            temp_mnli = mnli_df.loc[mnli_df['case'] == case, :]
            ax[0, i].hlines(temp_mnli['acc'], xlim[0], xlim[1], label='mnli-only', zorder=10)

        i += 1

        # Two-way ANOVA on the selected rounds for each plotted model.
        for model in models:
            iter_df = temp_df.loc[
                temp_df['iter'].isin(iterations) & (temp_df['Model'] == model), :
            ]
            anova_table, anova_lm = two_way_anova(iter_df, f2='Protocol')

            print(model, case)
            print(anova_table)
            print('-' * 90)

            display_resid(iter_df, anova_lm, treat_name="Protocol")

acc ~ C(iter) + C(Protocol) + C(iter):C(Protocol)
RoBERTa$_{\rm{Lg}}$ Knowledge
                       sum_sq    df         F    PR(>F)
C(iter)              0.024076   1.0  7.718398  0.007286
C(Protocol)          0.002086   2.0  0.334297  0.717166
C(iter):C(Protocol)  0.036334   2.0  5.824089  0.004880
Residual             0.187158  60.0       NaN       NaN
------------------------------------------------------------------------------------------


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

acc ~ C(iter) + C(Protocol) + C(iter):C(Protocol)
RoBERTa$_{\rm{Lg}}$ Lexical Semantics
                       sum_sq    df         F    PR(>F)
C(iter)              0.014581   1.0  2.795388  0.099744
C(Protocol)          0.000287   2.0  0.027477  0.972909
C(iter):C(Protocol)  0.040338   2.0  3.866817  0.026327
Residual             0.312956  60.0       NaN       NaN
------------------------------------------------------------------------------------------


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

acc ~ C(iter) + C(Protocol) + C(iter):C(Protocol)
RoBERTa$_{\rm{Lg}}$ Logic
                       sum_sq    df         F    PR(>F)
C(iter)              0.025046   1.0  8.978601  0.003968
C(Protocol)          0.003934   2.0  0.705050  0.498131
C(iter):C(Protocol)  0.016440   2.0  2.946749  0.060153
Residual             0.167374  60.0       NaN       NaN
------------------------------------------------------------------------------------------


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

acc ~ C(iter) + C(Protocol) + C(iter):C(Protocol)
RoBERTa$_{\rm{Lg}}$ Predicate-Argument Structure
                       sum_sq    df         F    PR(>F)
C(iter)              0.034306   1.0  8.866052  0.004186
C(Protocol)          0.010377   2.0  1.340938  0.269326
C(iter):C(Protocol)  0.031917   2.0  4.124316  0.020976
Residual             0.232160  60.0       NaN       NaN
------------------------------------------------------------------------------------------


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

### HANS non-entailment

In [65]:
# Load the HANS results table from plot_dir.
# NOTE(review): the recorded output shows the tail of a warning raised by this
# cell — presumably a pandas DtypeWarning for mixed-type columns; confirm.
plot_df = pd.read_csv(os.path.join(plot_dir, 'hans.csv'))

  interactivity=interactivity, compiler=compiler, result=result)


In [66]:
# Accuracy of the MNLI-only model on HANS, per case: one row over all examples
# of the case ('combined') followed by one row per gold label, with
# 'contradiction' reported as 'non-entailment'.
temp_df = mnli_accs['hans']
temp_df['case_text'] = temp_df['case']

mnli_collected = []
for case in temp_df['case_text'].unique():
    temp_acc = temp_df[temp_df['case_text'] == case]
    per_label = [
        ('non-entailment' if gold == 'contradiction' else gold,
         temp_acc[temp_acc['label'] == gold])
        for gold in temp_acc['label'].unique()
    ]
    for label_name, rows in [('combined', temp_acc)] + per_label:
        mnli_collected.append({
            'acc': rows['correct'].sum() / rows.shape[0],
            'subcase': 'combined',
            'label': label_name,
            'model': 'mnli-only',
            'case': case,
        })
mnli_df = pd.DataFrame(mnli_collected)

In [67]:
label = 'non-entailment'

# Keep rows for the selected `combined` setting, the chosen label and the 'combined' subcase.
plot_df = plot_df.loc[
    (plot_df['comb'] == combined)
    & (plot_df['label'] == label)
    & (plot_df['subcase'] == 'combined'),
    :,
]

In [68]:
# Apply the same label / subcase selection to the MNLI-only reference table.
mnli_df = mnli_df.loc[
    (mnli_df['label'] == label) & (mnli_df['subcase'] == 'combined'), :
]

In [69]:
# Swap raw model/protocol identifiers for their display labels, then rename
# the columns to the names used by the plotting code.
plot_df = (
    plot_df
    .assign(
        model=lambda frame: frame['model'].apply(lambda name: model2plot[name]),
        treat=lambda frame: frame['treat'].apply(lambda name: treat2plot[name]),
    )
    .rename(columns=cols2plot)
)

In [70]:
# Per-model, per-heuristic y-limits for the HANS non-entailment panels.
ylims = {
    'roberta-large': {
        'constituent': [0.0, 0.6],
        'lexical_overlap': [0.0, 0.8],
        'subsequence': [0.0, 0.6],
    },
    'roberta-large-mnli': {
        'constituent': [0.1, 0.5],
        'lexical_overlap': [0.6, 1.0],
        'subsequence': [0.1, 0.5],
    },
}
title = ""
xlabel = 'Round'
ylabel = 'HANS Non-Entailment'
tabletitle = 'median'
tableon = False

bbox_to_anchor = (1.15, 1)

hans_keys = pd.read_csv('hans_case_keys.csv')

# Raw case keys -> panel titles.
case2title = {
    'constituent': 'Constituent',
    'lexical_overlap': 'Lexical Overlap',
    'subsequence': 'Subsequence',
}

i = 0  # column of the bottom row that the next case is drawn into
for case in hans_keys['case'].unique():
    if case != 'combined':
        temp_df = plot_df.loc[plot_df['case'] == case, :]
        err_line_plots(
            temp_df,
            ax=ax[1, i],
            title=f"{case2title[case]}",
            xlabel=xlabel,
            ylabel=ylabel,
            ylabel_visible=(i == 0),
            legend_visible=(i == len(hans_keys['case'].unique()) - 2),
            bbox_to_anchor=bbox_to_anchor,
            tabletitle=tabletitle,
            tableon=tableon,
            xlim=xlim,
            ylim=ylims[_model][case],
            ystep=0.2,
            hue=hue,
            hue_order=hue_order,
            palette=palette,
            style_key=style_key,
            style_order=style_order,
            err_style=err_style,
            err_kws=err_kws,
            error_offsets=error_offsets,
            boxplots=boxplots,
        )

        # The MNLI-only reference line is drawn only for the MNLI-pretrained model.
        if _model == 'roberta-large-mnli':
            temp_mnli = mnli_df.loc[mnli_df['case'] == case, :]
            ax[1, i].hlines(temp_mnli['acc'], xlim[0], xlim[1], label='mnli-only', zorder=10)

        i += 1

        # Two-way ANOVA on the selected rounds for each plotted model.
        for model in models:
            iter_df = temp_df.loc[
                temp_df['iter'].isin(iterations) & (temp_df['Model'] == model), :
            ]
            anova_table, anova_lm = two_way_anova(iter_df, f2='Protocol')

            print(model, case)
            print(anova_table)
            print('-' * 90)

            display_resid(iter_df, anova_lm, treat_name="Protocol")

acc ~ C(iter) + C(Protocol) + C(iter):C(Protocol)
RoBERTa$_{\rm{Lg}}$ constituent
                       sum_sq    df         F    PR(>F)
C(iter)              0.000099   1.0  0.002805  0.957941
C(Protocol)          0.126587   2.0  1.785701  0.176475
C(iter):C(Protocol)  0.003211   2.0  0.045291  0.955752
Residual             2.126678  60.0       NaN       NaN
------------------------------------------------------------------------------------------


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

acc ~ C(iter) + C(Protocol) + C(iter):C(Protocol)
RoBERTa$_{\rm{Lg}}$ lexical_overlap
                       sum_sq    df          F    PR(>F)
C(iter)              1.169550   1.0  20.783012  0.000026
C(Protocol)          0.462926   2.0   4.113124  0.021183
C(iter):C(Protocol)  0.161721   2.0   1.436899  0.245723
Residual             3.376459  60.0        NaN       NaN
------------------------------------------------------------------------------------------


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

acc ~ C(iter) + C(Protocol) + C(iter):C(Protocol)
RoBERTa$_{\rm{Lg}}$ subsequence
                       sum_sq    df         F    PR(>F)
C(iter)              0.121056   1.0  3.532371  0.065040
C(Protocol)          0.068691   2.0  1.002187  0.373136
C(iter):C(Protocol)  0.051253   2.0  0.747773  0.477780
Residual             2.056221  60.0       NaN       NaN
------------------------------------------------------------------------------------------


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

### Save

In [71]:
# Tighten the layout, then override the horizontal spacing between panels;
# when saving is enabled the figure is written into plot_out.
fig.tight_layout()
fig.subplots_adjust(wspace=2.5e-1)
if save_figs:
    fig.savefig(os.path.join(plot_out, f'{combined}_itereval{saveappend}.{figtype}'))

## Combined Held-out

In [72]:
# Two stacked panels, one per held-out evaluation set plotted in this section.
figsize=(7, 5)
fig, ax = plt.subplots(2, 1, figsize=figsize)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [73]:
# Rounds compared in the ANOVAs of this section, and the model labels to loop over.
iterations = [1, 5]
models = style_order

In [74]:
# Entry of mnli_accs for the MNLI evaluation set.
# NOTE(review): the plotting loop in the following cell reassigns mnli_df for
# every case, so this assignment looks redundant — confirm before removing.
mnli_df = mnli_accs['mnlieval']

In [75]:
# Per-model y-limits for each held-out evaluation set.
ylims = {
    'roberta-large': {'mnli': [0.6, 0.9], 'anli': [0.2, 0.5]},
    'roberta-large-mnli': {'mnli': [0.8, 1.0], 'anli': [0.3, 0.4]},
}
title = ""
xlabel = 'Round'
ylabel = 'Accuracy'
tabletitle = 'median'
tableon = False

bbox_to_anchor = (1.01, 1)

# CSV stem / mnli_accs key prefix -> panel title.
case2title = {
    'mnli': 'MNLI-mismatched',
    'anli': 'ANLI',
}

i = 0  # index of the panel the next evaluation set is drawn into
for case, title_name in case2title.items():
    plot_df = pd.read_csv(os.path.join(plot_dir, f'{case}.csv'))
    # Selected `combined` setting, overall ('combined') breakdown only.
    plot_df = plot_df.loc[
        (plot_df['comb'] == combined) & (plot_df['breakdown'] == 'combined'), :
    ]
    mnli_df = mnli_accs[f'{case}eval']

    # Display labels for models/protocols, then the column names used for plotting.
    plot_df = (
        plot_df
        .assign(
            model=lambda frame: frame['model'].apply(lambda name: model2plot[name]),
            treat=lambda frame: frame['treat'].apply(lambda name: treat2plot[name]),
        )
        .rename(columns=cols2plot)
    )

    err_line_plots(
        plot_df,
        ax=ax[i],
        title=f"{title_name}",
        xlabel=xlabel,
        ylabel=ylabel,
        xlabel_visible=(i == 1),
        legend_visible=(i == 0),
        bbox_to_anchor=bbox_to_anchor,
        tabletitle=tabletitle,
        tableon=tableon,
        xlim=xlim,
        ylim=ylims[_model][case],
        ystep=0.05 if _model == 'roberta-large-mnli' else 0.1,
        hue=hue,
        hue_order=hue_order,
        palette=palette,
        style_key=style_key,
        style_order=style_order,
        err_style=err_style,
        err_kws=err_kws,
        boxplots=boxplots,
    )

    # The MNLI-only reference line is drawn only for the MNLI-pretrained model.
    if _model == 'roberta-large-mnli':
        mnli_only_acc = mnli_df['correct'].sum() / mnli_df.shape[0]
        ax[i].hlines(mnli_only_acc, xlim[0], xlim[1], label='mnli-only', zorder=10)

    i += 1

    # Two-way ANOVA on the selected rounds for each plotted model.
    for model in models:
        iter_df = plot_df.loc[
            plot_df['iter'].isin(iterations) & (plot_df['Model'] == model), :
        ]
        anova_table, anova_lm = two_way_anova(iter_df, f2='Protocol')

        print(model, case)
        print(anova_table)
        print('-' * 90)

        display_resid(iter_df, anova_lm, treat_name="Protocol")

acc ~ C(iter) + C(Protocol) + C(iter):C(Protocol)
RoBERTa$_{\rm{Lg}}$ mnli
                       sum_sq    df         F    PR(>F)
C(iter)              0.025885   1.0  1.654439  0.203297
C(Protocol)          0.006798   2.0  0.217236  0.805370
C(iter):C(Protocol)  0.106554   2.0  3.405210  0.039739
Residual             0.938743  60.0       NaN       NaN
------------------------------------------------------------------------------------------


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

acc ~ C(iter) + C(Protocol) + C(iter):C(Protocol)
RoBERTa$_{\rm{Lg}}$ anli
                       sum_sq    df          F        PR(>F)
C(iter)              0.003980   1.0  30.828636  6.773534e-07
C(Protocol)          0.000229   2.0   0.886923  4.172518e-01
C(iter):C(Protocol)  0.001047   2.0   4.055168  2.229188e-02
Residual             0.007746  60.0        NaN           NaN
------------------------------------------------------------------------------------------


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

### Save

In [76]:
# Finalise the layout and, when saving is enabled, write the held-out figure into plot_out.
fig.tight_layout()
if save_figs:
    fig.savefig(os.path.join(plot_out, f'{combined}_val{saveappend}.{figtype}'))

## ANLI Breakdown

In [77]:
# Corrections added to err_line_offset / err_cap_offset for the ANLI breakdown
# panels (the line correction is zero).
anli_l_offset = -0
anli_c_offset = +0.075

In [78]:
# Offsets handed to err_line_plots as `error_offsets`: a negative shift, a
# zero shift and the mirrored positive shift for the error-bar line and cap.
# NOTE(review): presumably one entry per hue level (three protocols) — confirm in err_line_plots.
error_offsets = [
    {
        'line':-(err_line_offset+anli_l_offset),
        'cap':-(err_cap_offset+anli_c_offset),
    },
    {
        'line':0,
        'cap':0,
    },
    {
        'line':+(err_line_offset+anli_l_offset),
        'cap':+(err_cap_offset+anli_c_offset),
    },
    ]

In [79]:
# Output directory for the ANLI breakdown (not model-specific).
anli_plot_out = os.path.join(repo, 'eval_summary', 'plots')
# Display labels of the models shown in the ANLI breakdown, one figure row each.
anli_style_order=[
    r'RoBERTa$_{\rm{Lg}}$', 
    r'RoBERTa$_{\rm{Lg+MNLI}}$'
]

In [80]:
# Load the ANLI results table from plot_dir.
plot_df = pd.read_csv(os.path.join(plot_dir, 'anli.csv'))

In [81]:
# Breakdown categories to plot. 'combined' stays in the list but is skipped by
# the plotting loop; 'EventCoref' is currently disabled.
breakdowns = [
        'combined',
        'Basic',
#         'EventCoref',
        'Imperfection',
        'Numerical',
        'Reasoning',
        'Reference',
        'Tricky',
    ]

In [82]:
# Entry of mnli_accs for the ANLI evaluation set; used for the 'mnli-only' reference lines.
mnli_df = mnli_accs['anlieval']

In [83]:
# The ANLI annotation file is expected one directory above the repository root.
repo_up = os.path.dirname(repo)
anli_annot_fname = os.path.join(repo_up, 'anli_annot_v0.2_combined_A1A2')
# NOTE(security): joblib.load is pickle-based and can execute code from the
# file — only load a trusted, locally produced annotation dump.
anli_annot = joblib.load(anli_annot_fname)

In [84]:
def get_anli_breakdown_acc(pred_df, anli_annot, breakdown):
    """Return the accuracy over the examples annotated for one breakdown.

    Predictions are joined to the annotations on 'uid' (inner join); rows
    whose `breakdown` column equals 'none' are discarded and the fraction of
    the remaining rows marked correct is returned.

    pred_df    -- frame with 'uid' and 'correct' columns
    anli_annot -- frame with 'uid' and a column named `breakdown`
    breakdown  -- name of the annotation column to evaluate
    """
    merged = pd.merge(
        anli_annot[['uid', breakdown]],
        pred_df[['uid', 'correct']],
        on='uid',
    )
    tagged = merged[merged[breakdown] != 'none']
    n_correct = tagged['correct'].sum()
    return n_correct / len(tagged)

In [85]:
# Keep only the rows whose 'comb' value matches the selected `combined` setting.
plot_df = plot_df.loc[plot_df['comb'] == combined, :]

In [86]:
# Swap raw model/protocol identifiers for their display labels, then rename
# the columns to the names used by the plotting code.
plot_df = (
    plot_df
    .assign(
        model=lambda frame: frame['model'].apply(lambda name: model2plot[name]),
        treat=lambda frame: frame['treat'].apply(lambda name: treat2plot[name]),
    )
    .rename(columns=cols2plot)
)

In [87]:
# The ANLI breakdown figure shows both models at once, so it is only built
# during the 'roberta-large' run.
if _model == 'roberta-large':
    ylim = [0.25, 0.4]
    title = ""
    xlabel = 'Round'
    ylabel = 'Accuracy'
    tabletitle = 'median'
    tableon = False

    bbox_to_anchor = (1.01, 1)
    figsize = (15, 6)

    # One column per plotted breakdown ('combined' is skipped), one row per model.
    n_cols = len(breakdowns) - 1
    fig, ax = plt.subplots(2, n_cols, figsize=figsize)

    i = 0  # running panel index, row-major
    for anli_model in anli_style_order:
        temp_df = plot_df.loc[plot_df['Model'] == anli_model, :]
        for case in breakdowns:
            if case == 'combined':
                continue

            row, col = divmod(i, n_cols)
            print(row, col)
            temp_ax = ax[row, col]
            temptemp_df = temp_df.loc[temp_df['breakdown'] == case, :]
            err_line_plots(
                temptemp_df,
                ax=temp_ax,
                title=f"{case}",
                xlabel=xlabel,
                ylabel=anli_model,
                yaxis_visible=(col == 0),
                legend_visible=(i == n_cols - 1),
                xlabel_visible=(row == 1),
                bbox_to_anchor=bbox_to_anchor,
                tabletitle=tabletitle,
                tableon=tableon,
                xlim=xlim,
                ylim=ylim,
                ystep=0.05,
                hue=hue,
                hue_order=hue_order,
                palette=palette,
                style_key=style_key,
                style_order=[anli_model],
                err_style=err_style,
                err_kws=err_kws,
                error_offsets=error_offsets,
                boxplots=boxplots,
            )

            # Reference line: MNLI-only accuracy on this breakdown.
            mnli_only_acc = get_anli_breakdown_acc(mnli_df, anli_annot, case)
            temp_ax.hlines(mnli_only_acc, xlim[0], xlim[1], label='mnli-only', zorder=10)

            i += 1

    fig.tight_layout()
    if save_figs:
        # Saved next to (not inside) the per-model plot directory.
        out_dir = os.path.dirname(plot_out)
        print(out_dir)
        fig.savefig(os.path.join(out_dir, f'{combined}_anli_breakdown{saveappend}.{figtype}'))

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

0 0
0 1
0 2
0 3
0 4
0 5
1 0
1 1
1 2
1 3
1 4
1 5
C:\Users\Willi\Documents\NYU\2020_Fall\semantics_seminar\lip\ling_in_loop\eval_summary\plots


## HANS Entailment

In [88]:
# Rounds compared in the ANOVAs of this section, and the model labels to loop over.
iterations = [1, 5]
models = style_order
# Model-dependent corrections that are added to err_line_offset / err_cap_offset
# when the error_offsets list is built.
iter_l_offset = -0.75 if _model == 'roberta-large' else -1.75
iter_c_offset = +0.02

In [89]:
# Offsets handed to err_line_plots as `error_offsets`: a negative shift, a
# zero shift and the mirrored positive shift for the error-bar line and cap.
# NOTE(review): presumably one entry per hue level (three protocols) — confirm in err_line_plots.
error_offsets = [
        {
            'line':-(err_line_offset+iter_l_offset),
            'cap':-(err_cap_offset+iter_c_offset),
        },
        {
            'line':0,
            'cap':0,
        },
        {
            'line':+(err_line_offset+iter_l_offset),
            'cap':+(err_cap_offset+iter_c_offset),
        }
    ]

In [90]:
# Font sizes for titles, axis labels and legends.
# NOTE(review): these are not passed to any call visible in this section —
# presumably err_line_plots reads them as globals; confirm.
title_fontsize=16
label_fontsize=14
legend_fontsize=12

In [91]:
# 2x3 grid for the HANS entailment figure; the RoBERTa section below draws into row 0.
figsize = (15, 5)
fig, ax = plt.subplots(2, 3, figsize=figsize)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

### RoBERTa HANS Entailment

#### Read Data

In [92]:
# Run parameters for this sub-section; note this overrides the notebook-level `_model`.
combined = 'combined'
_model = 'roberta-large'

In [93]:
# '__file__' is a string literal here (not the module attribute), so abspath()
# resolves it against the current working directory; the two dirname() calls
# therefore yield the parent of the working directory.
repo = os.path.dirname(os.path.dirname(os.path.abspath('__file__')))
plot_dir = os.path.join(repo, 'eval_summary', 'plot_data')
plot_out = os.path.join(repo, 'eval_summary', 'plots', _model)
# Create the per-model output directory if it does not exist yet.
os.makedirs(plot_out, exist_ok=True)

In [94]:
# Display names for the accuracy and difference-from-baseline quantities.
# NOTE(review): not referenced by any cell visible in this section — confirm they are still used.
acc_name = 'Performance'
diff_name = 'Over Baseline'

In [95]:
# Whether figures are written to disk, and in which file format.
save_figs = True
figtype='pdf'

# Error display settings forwarded to err_line_plots.
err_style='bars' # band or bars
err_kws={'elinewidth': 2, 'capsize': 3}

# Base offsets from which the error_offsets list is built (model-dependent for the line).
err_line_offset = 4 if _model == 'roberta-large' else 5
err_cap_offset = 0.05


In [96]:
# Raw column names / identifiers -> names and labels shown in the plots.
cols2plot = {'treat':'Protocol', 'model':'Model'}
treat2plot = {'baseline':'Baseline', 'LotS':'LitL', 'LitL':'LitL Chat'}
model2plot = {'roberta-large': r'RoBERTa$_{\rm{Lg}}$', 'roberta-large-mnli': r'RoBERTa$_{\rm{Lg+MNLI}}$'}

# Colour encodes the protocol; line style encodes the model (only the current one is plotted).
hue='Protocol'
hue_order=['Baseline', 'LitL', 'LitL Chat']
style_key='Model'
style_order=[
    model2plot[_model]
]

# NOTE(review): the palette is keyed by lowercase 'baseline' while hue_order
# uses 'Baseline' — confirm err_line_plots resolves this as intended.
palette={
        'baseline':'tab:blue',
        'LitL':'tab:orange',
        'LitL Chat':'tab:green',
    }

# x-range of the panels; also used as the extent of the 'mnli-only' reference lines.
xlim = [0.8, 5.2]

In [97]:
# Load the MNLI-only model's results, a mapping indexed by evaluation-set key (e.g. 'hans').
# NOTE(security): pickle.load can execute arbitrary code from the file — only
# load a trusted, locally generated pickle.
with open(os.path.join(plot_dir, 'mnli-only-training_accs.p'), 'rb') as f:
    mnli_accs = pickle.load(f)

#### Plot

In [98]:
# Rounds compared in the ANOVAs of this section, and the model labels to loop over.
iterations = [1, 5]
models = style_order
# Model-dependent corrections that are added to err_line_offset / err_cap_offset
# when the error_offsets list is built.
iter_l_offset = -0.75 if _model == 'roberta-large' else -1.75
iter_c_offset = +0.02

In [99]:
# Offsets handed to err_line_plots as `error_offsets`: a negative shift, a
# zero shift and the mirrored positive shift for the error-bar line and cap.
# NOTE(review): presumably one entry per hue level (three protocols) — confirm in err_line_plots.
error_offsets = [
        {
            'line':-(err_line_offset+iter_l_offset),
            'cap':-(err_cap_offset+iter_c_offset),
        },
        {
            'line':0,
            'cap':0,
        },
        {
            'line':+(err_line_offset+iter_l_offset),
            'cap':+(err_cap_offset+iter_c_offset),
        }
    ]

In [100]:
# Load the HANS results table from plot_dir.
# NOTE(review): the recorded output shows the tail of a warning raised by this
# cell — presumably a pandas DtypeWarning for mixed-type columns; confirm.
plot_df = pd.read_csv(os.path.join(plot_dir, 'hans.csv'))

  interactivity=interactivity, compiler=compiler, result=result)


In [101]:
# Accuracy of the MNLI-only model on HANS, per case: one row over all examples
# of the case ('combined') followed by one row per gold label, with
# 'contradiction' reported as 'non-entailment'.
temp_df = mnli_accs['hans']
temp_df['case_text'] = temp_df['case']

mnli_collected = []
for case in temp_df['case_text'].unique():
    temp_acc = temp_df[temp_df['case_text'] == case]
    per_label = [
        ('non-entailment' if gold == 'contradiction' else gold,
         temp_acc[temp_acc['label'] == gold])
        for gold in temp_acc['label'].unique()
    ]
    for label_name, rows in [('combined', temp_acc)] + per_label:
        mnli_collected.append({
            'acc': rows['correct'].sum() / rows.shape[0],
            'subcase': 'combined',
            'label': label_name,
            'model': 'mnli-only',
            'case': case,
        })
mnli_df = pd.DataFrame(mnli_collected)

In [102]:
label = 'entailment'

# Filter into a separate frame (plot_dff) so the loaded plot_df stays untouched:
# selected `combined` setting, the chosen label and the 'combined' subcase.
plot_dff = plot_df.loc[
    (plot_df['comb'] == combined)
    & (plot_df['label'] == label)
    & (plot_df['subcase'] == 'combined'),
    :,
]

In [103]:
# Apply the same label / subcase selection to the MNLI-only reference table.
mnli_df = mnli_df.loc[
    (mnli_df['label'] == label) & (mnli_df['subcase'] == 'combined'), :
]

In [104]:
# Swap raw model/protocol identifiers for their display labels, then rename
# the columns to the names used by the plotting code.
plot_dff = (
    plot_dff
    .assign(
        model=lambda frame: frame['model'].apply(lambda name: model2plot[name]),
        treat=lambda frame: frame['treat'].apply(lambda name: treat2plot[name]),
    )
    .rename(columns=cols2plot)
)

In [105]:
# Per-model, per-heuristic y-limits for the HANS entailment panels
# (the same range everywhere).
ylims={
    'roberta-large':{
        'constituent': [0.6, 1.05],
        'lexical_overlap': [0.6, 1.05],
        'subsequence': [0.6, 1.05],
    },
    'roberta-large-mnli':{
        'constituent': [0.6, 1.05],
        'lexical_overlap': [0.6, 1.05],
        'subsequence': [0.6, 1.05],
    },
}
title=f""
xlabel='Round'
# The y label names the model because each figure row shows one model.
ylabel= r'RoBERTa$_{\rm{Lg}}$'
tabletitle='median'
tableon=False

bbox_to_anchor = (1.055, 1)

hans_keys = pd.read_csv('hans_case_keys.csv')

# Raw case keys -> panel titles.
case2title = {
    'constituent': 'Constituent',
    'lexical_overlap': 'Lexical Overlap',
    'subsequence': 'Subsequence',
}

i = 0  # column of the top row that the next case is drawn into
for case in hans_keys['case'].unique():
    if case == 'combined':
        continue
    
    temp_df = plot_dff.loc[plot_dff['case'] == case, :]
    err_line_plots(
        temp_df,
        err_style=err_style,
        ylim=ylims[_model][case],
        title=f"{case2title[case]}",
        xlabel=xlabel,
        ylabel=ylabel,
        tabletitle=tabletitle,
        palette=palette,
        tableon=tableon,
        style_key=style_key,
        style_order=style_order,
        ax=ax[0, i],
        ylabel_visible = i == 0,
        legend_visible = False,
        bbox_to_anchor=bbox_to_anchor,
        err_kws=err_kws,
        hue=hue,
        hue_order=hue_order,
        xlim=xlim,
        ystep=0.2,
        error_offsets=error_offsets,
        boxplots=boxplots,
    )
    
    if _model == 'roberta-large-mnli':
        temp_mnli = mnli_df.loc[mnli_df['case'] == case, :]
        # Draw the reference line on the same axes as the panel above (row 0);
        # the previous ax[1, i] target was a copy-paste row mismatch.
        ax[0, i].hlines(temp_mnli['acc'], xlim[0], xlim[1], label='mnli-only', zorder=10)
    
    i += 1
    
    # Two-way ANOVA on the selected rounds for each plotted model.
    for model in models:
        iter_df = temp_df.loc[temp_df['iter'].isin(iterations), :]
        iter_df = iter_df.loc[iter_df['Model'] == model, :]
        anova_table, anova_lm = two_way_anova(iter_df, f2='Protocol')

        print(model, case)
        print(anova_table)
        print('-'*90)
        
        display_resid(iter_df, anova_lm, treat_name="Protocol")

acc ~ C(iter) + C(Protocol) + C(iter):C(Protocol)
RoBERTa$_{\rm{Lg}}$ constituent
                       sum_sq    df         F    PR(>F)
C(iter)              0.063998   1.0  2.175201  0.145479
C(Protocol)          0.020829   2.0  0.353979  0.703346
C(iter):C(Protocol)  0.020775   2.0  0.353062  0.703983
Residual             1.765290  60.0       NaN       NaN
------------------------------------------------------------------------------------------




Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

acc ~ C(iter) + C(Protocol) + C(iter):C(Protocol)
RoBERTa$_{\rm{Lg}}$ lexical_overlap
                       sum_sq    df         F    PR(>F)
C(iter)              0.052475   1.0  1.745497  0.191460
C(Protocol)          0.020630   2.0  0.343109  0.710944
C(iter):C(Protocol)  0.026087   2.0  0.433871  0.650013
Residual             1.803769  60.0       NaN       NaN
------------------------------------------------------------------------------------------




Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

acc ~ C(iter) + C(Protocol) + C(iter):C(Protocol)
RoBERTa$_{\rm{Lg}}$ subsequence
                       sum_sq    df         F    PR(>F)
C(iter)              0.044387   1.0  1.471280  0.229898
C(Protocol)          0.033026   2.0  0.547337  0.581349
C(iter):C(Protocol)  0.016709   2.0  0.276925  0.759075
Residual             1.810158  60.0       NaN       NaN
------------------------------------------------------------------------------------------




Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

### RoBERTa MNLI HANS Entailment

#### Read Data

In [106]:
# `combined` is matched against the `comb` column of the plot data and is part
# of the saved file name; `_model` picks the model-specific labels and limits.
combined, _model = 'combined', 'roberta-large-mnli'

In [107]:
# The repository root is taken to be the parent of the kernel's working
# directory (assumes the notebook is run from a folder directly below it).
repo = os.path.dirname(os.getcwd())
plot_dir = os.path.join(repo, 'eval_summary', 'plot_data')
plot_out = os.path.join(repo, 'eval_summary', 'plots', _model)
os.makedirs(plot_out, exist_ok=True)

In [108]:
# Display names for the raw score and for the score relative to the baseline.
acc_name, diff_name = 'Performance', 'Over Baseline'

In [109]:
# Figure export switches.
save_figs, figtype = True, 'pdf'

# Error rendering: 'band' or 'bars', plus the keyword arguments for the bars.
err_style = 'bars'
err_kws = dict(elinewidth=2, capsize=3)

# Base offsets that are later combined into `error_offsets`; the line offset
# is 4 for 'roberta-large' and 5 for any other model name.
err_line_offset = {'roberta-large': 4}.get(_model, 5)
err_cap_offset = 0.05


In [110]:
# Raw column names / raw values -> the labels shown in the figures.
cols2plot = dict(treat='Protocol', model='Model')
treat2plot = dict(baseline='Baseline', LotS='LitL', LitL='LitL Chat')
model2plot = {
    'roberta-large': r'RoBERTa$_{\rm{Lg}}$',
    'roberta-large-mnli': r'RoBERTa$_{\rm{Lg+MNLI}}$',
}

# Colour is keyed on the (renamed) protocol column, line style on the model.
hue = cols2plot['treat']
hue_order = list(treat2plot.values())
style_key = cols2plot['model']
style_order = [model2plot[_model]]

# NOTE(review): the first key is lower-case 'baseline' while the hue level in
# `hue_order` is 'Baseline' -- confirm how `err_line_plots` consumes this dict.
palette = {
    'baseline': 'tab:blue',
    'LitL': 'tab:orange',
    'LitL Chat': 'tab:green',
}

xlim = [0.8, 5.2]

In [111]:
# NOTE: unpickling can execute arbitrary code -- only load files that were
# produced by this project.
with open(os.path.join(plot_dir, 'mnli-only-training_accs.p'), mode='rb') as f:
    mnli_accs = pickle.loads(f.read())

#### Plot

In [112]:
# Rounds kept for the ANOVA, and the model labels it is run for.
iterations, models = [1, 5], style_order

# Per-model adjustments added to the base error-bar offsets.
iter_l_offset = {'roberta-large': -0.75}.get(_model, -1.75)
iter_c_offset = 0.02

In [113]:
# Three offset entries: a negative shift, no shift, and the mirrored positive
# shift. Each entry carries one value for the error line and one for its caps.
error_offsets = [
    {'line': line_shift, 'cap': cap_shift}
    for line_shift, cap_shift in (
        (-(err_line_offset + iter_l_offset), -(err_cap_offset + iter_c_offset)),
        (0, 0),
        (+(err_line_offset + iter_l_offset), +(err_cap_offset + iter_c_offset)),
    )
]

In [114]:
# NOTE(review): the recorded output of this cell looks like the tail of a
# warning raised during parsing -- check whether an explicit dtype= is needed.
plot_df = pd.read_csv(
    filepath_or_buffer=os.path.join(plot_dir, 'hans.csv'),
)

  interactivity=interactivity, compiler=compiler, result=result)


In [115]:
# Build one accuracy row per HANS case, plus one row per (case, label), from
# the 'hans' entry of the loaded mnli-only results. Accuracy is the sum of the
# `correct` column divided by the number of rows (presumably a 0/1 indicator).
temp_df = mnli_accs['hans']
temp_df['case_text'] = temp_df['case']  # adds the column to the loaded frame itself

mnli_collected = []
for case in temp_df['case_text'].unique():
    temp_acc = temp_df[temp_df['case_text'] == case]

    # All examples of this case, regardless of label.
    mnli_collected.append(dict(
        acc=temp_acc['correct'].sum() / len(temp_acc),
        subcase='combined',
        label='combined',
        model='mnli-only',
        case=case,
    ))

    for label in temp_acc['label'].unique():
        temptemp_acc = temp_acc[temp_acc['label'] == label]
        mnli_collected.append(dict(
            acc=temptemp_acc['correct'].sum() / len(temptemp_acc),
            subcase='combined',
            # 'contradiction' is reported under the 'non-entailment' label.
            label={'contradiction': 'non-entailment'}.get(label, label),
            model='mnli-only',
            case=case,
        ))

mnli_df = pd.DataFrame(mnli_collected)

In [116]:
label = 'entailment'

# Narrow the plot data step by step: chosen `comb`, chosen label, and only the
# rows whose subcase is the aggregated 'combined' one.
plot_dff = (
    plot_df
    .loc[lambda frame: frame['comb'] == combined, :]
    .loc[lambda frame: frame['label'] == label, :]
    .loc[lambda frame: frame['subcase'] == 'combined', :]
)

In [117]:
# Keep only the mnli-only rows for the selected label and the aggregated
# 'combined' subcase.
mnli_df = (
    mnli_df
    .loc[lambda frame: frame['label'] == label, :]
    .loc[lambda frame: frame['subcase'] == 'combined', :]
)

In [118]:
# Translate raw model / treatment identifiers into their display labels and
# rename the columns, producing a new frame instead of writing columns in
# place. `plot_dff` is a filtered selection of `plot_df`, so in-place column
# assignment on it is prone to pandas' SettingWithCopyWarning.
# An identifier missing from `model2plot` / `treat2plot` still raises KeyError.
plot_dff = (
    plot_dff
    .assign(
        model=plot_dff['model'].apply(lambda x: model2plot[x]),
        treat=plot_dff['treat'].apply(lambda x: treat2plot[x]),
    )
    .rename(columns=cols2plot)
)

In [119]:
# Draw one panel per HANS case into row 1 of the existing axes grid `ax`
# (created in an earlier cell), then run a two-way ANOVA (rounds x protocol)
# for each case and model.

# y-axis limits, keyed by model checkpoint name and then by HANS case.
ylims = {
    'roberta-large': {
        'constituent': [0.6, 1.05],
        'lexical_overlap': [0.6, 1.05],
        'subsequence': [0.6, 1.05],
    },
    'roberta-large-mnli': {
        'constituent': [0.6, 1.05],
        'lexical_overlap': [0.6, 1.05],
        'subsequence': [0.6, 1.05],
    },
}
title = ""
xlabel = 'Round'
# Take the label from the active model rather than hard-coding the MNLI one:
# the rest of this cell is keyed on `_model`, so the label must follow it.
ylabel = model2plot[_model]
tabletitle = 'median'
tableon = False

bbox_to_anchor = (-.75, -1)

# Relative path: resolved against the kernel's working directory.
hans_keys = pd.read_csv('hans_case_keys.csv')

case2title = {
    'constituent': 'Constituent',
    'lexical_overlap': 'Lexical Overlap',
    'subsequence': 'Subsequence',
}

# Computed once instead of three times per loop iteration.
case_names = hans_keys['case'].unique()
n_case_keys = len(case_names)

i = 0  # column index of the panel; only advanced for cases that are plotted
for case in case_names:
    if case == 'combined':
        continue

    temp_df = plot_dff.loc[plot_dff['case'] == case, :]
    err_line_plots(
        temp_df,
        err_style=err_style,
        ylim=ylims[_model][case],
        title=case2title[case],
        xlabel=xlabel,
        ylabel=ylabel,
        tabletitle=tabletitle,
        palette=palette,
        tableon=tableon,
        style_key=style_key,
        style_order=style_order,
        ax=ax[1, i],
        ylabel_visible=i == 0,
        # NOTE(review): `- 2` makes this the last panel only if 'combined' is
        # one of the keys in hans_case_keys.csv -- confirm.
        legend_visible=i == n_case_keys - 2,
        loc='lower center',
        ncol=n_case_keys,
        bbox_to_anchor=bbox_to_anchor,
        err_kws=err_kws,
        hue=hue,
        hue_order=hue_order,
        xlim=xlim,
        ystep=0.2,
        error_offsets=error_offsets,
        boxplots=boxplots,
    )

    if _model == 'roberta-large-mnli':
        # Horizontal reference line at the mnli-only accuracy for this case.
        temp_mnli = mnli_df.loc[mnli_df['case'] == case, :]
        ax[1, i].hlines(temp_mnli['acc'], xlim[0], xlim[1], label='mnli-only', zorder=10)

    i += 1

    # The round filter does not depend on the model, so apply it once per case.
    rounds_df = temp_df.loc[temp_df['iter'].isin(iterations), :]
    for model in models:
        iter_df = rounds_df.loc[rounds_df['Model'] == model, :]
        anova_table, anova_lm = two_way_anova(iter_df, f2='Protocol')

        print(model, case)
        print(anova_table)
        print('-'*90)

        display_resid(iter_df, anova_lm, treat_name="Protocol")

acc ~ C(iter) + C(Protocol) + C(iter):C(Protocol)
RoBERTa$_{\rm{Lg+MNLI}}$ constituent
                           sum_sq    df          F        PR(>F)
C(iter)              7.333333e-08   1.0   0.007216  9.325869e-01
C(Protocol)          3.824085e-04   2.0  18.814153  4.542187e-07
C(iter):C(Protocol)  1.688885e-04   2.0   8.309161  6.524667e-04
Residual             6.097673e-04  60.0        NaN           NaN
------------------------------------------------------------------------------------------




Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

acc ~ C(iter) + C(Protocol) + C(iter):C(Protocol)
RoBERTa$_{\rm{Lg+MNLI}}$ lexical_overlap
                       sum_sq    df         F    PR(>F)
C(iter)              0.000053   1.0  2.268828  0.137247
C(Protocol)          0.000109   2.0  2.342036  0.104862
C(iter):C(Protocol)  0.000007   2.0  0.142687  0.867319
Residual             0.001395  60.0       NaN       NaN
------------------------------------------------------------------------------------------




Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

acc ~ C(iter) + C(Protocol) + C(iter):C(Protocol)
RoBERTa$_{\rm{Lg+MNLI}}$ subsequence
                       sum_sq    df         F    PR(>F)
C(iter)              0.000005   1.0  0.770350  0.383610
C(Protocol)          0.000023   2.0  1.941397  0.152412
C(iter):C(Protocol)  0.000006   2.0  0.463798  0.631126
Residual             0.000357  60.0       NaN       NaN
------------------------------------------------------------------------------------------




Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

### Save

In [120]:
# Point `plot_out` at a dedicated HANS-entailment folder (it previously held
# the per-model plots folder) and make sure the folder exists.
plot_out = os.path.join(
    os.path.join(repo, 'eval_summary', 'plots'),
    'HANS_entailment',
)
os.makedirs(name=plot_out, exist_ok=True)

In [121]:
# Finalise the layout of the shared figure and optionally write it to disk.
fig.tight_layout()
fig.subplots_adjust(wspace=2e-1)  # applied after tight_layout so this spacing wins
if save_figs:
    # Use the configured `figtype` for the extension instead of a hard-coded
    # '.pdf', so the export format follows the notebook's configuration.
    fig.savefig(os.path.join(plot_out, f'{combined}_HANS_entailment{saveappend}.{figtype}'))