# Import

In [1]:
%matplotlib widget

In [2]:
import os

import pandas as pd
import numpy as np
import json

import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import ttest_ind

In [3]:
# Use seaborn's white-grid theme for every figure in this notebook.
sns.set_theme(style='whitegrid')

# Define

In [4]:
def my_plot(table, plot_args, title, xlabel='iteration', ylabel='accuracy', ylim=[0, 1]):
    """Draw one line per row of ``table`` with a table of the same values underneath.

    Args:
        table: DataFrame whose rows are plotted as lines against its column
            labels. It must contain the rows 'baseline', 'LotS' and 'LitL',
            which are rendered (in that order) in the table panel.
        plot_args: Keyword arguments forwarded to ``Axes.plot``.
        title: Title of the line plot.
        xlabel: X-axis label of the line plot.
        ylabel: Y-axis label of the line plot.
        ylim: ``(low, high)`` limits for the y-axis of the line plot.

    Returns:
        The matplotlib figure holding both panels.
    """
    fig, (line_ax, table_ax) = plt.subplots(2, 1)

    # Upper panel: one line per table row, with a legend entry per row label.
    lines = line_ax.plot(table.columns.values, table.transpose(), **plot_args)
    line_ax.legend(iter(lines), table.index.values, loc='best')
    line_ax.set_title(title)
    line_ax.set_xlabel(xlabel)
    line_ax.set_ylabel(ylabel)
    line_ax.set_ylim(*ylim)

    # Lower panel: the same numbers as percentages, in a fixed treatment order.
    order = ['baseline', 'LotS', 'LitL']
    cell_text = [
        [f'{acc*100:.2f}%' for acc in table.loc[treat, :]]
        for treat in order
    ]
    table_ax.table(cellText=cell_text, colLabels=table.columns, rowLabels=order, loc='center')
    table_ax.axis('off')

    plt.tight_layout()
    return fig
    

In [5]:
def get_val_summary(modifier, iteration, eval_dir, ):
    """Load the ``configs.<modifier>.csv`` summary for one round.

    Args:
        modifier: Name fragment of the CSV file to read.
        iteration: Round number; selects the ``r<iteration>`` directory and
            the columns ``'1'`` .. ``str(iteration)`` that are kept.
        eval_dir: Directory containing the ``r<iteration>/tables`` folders.

    Returns:
        DataFrame indexed by the first CSV column, restricted to the round
        columns ``'1'`` .. ``str(iteration)``.
    """
    csv_path = os.path.join(eval_dir, f'r{iteration}', 'tables', f'configs.{modifier}.csv')
    round_columns = list(map(str, range(1, iteration+1)))
    return pd.read_csv(csv_path, index_col=0)[round_columns]


def get_itereval_summary(sub_keys, iteration, eval_dir, combined, ):
    """Load the ``iterevals.<key>.csv`` summary for one evaluation slice.

    The file key is the values of ``sub_keys`` joined with ``'.'`` (in the
    dict's own order), with ``'/'`` replaced by ``'-'`` and ``';'`` by ``'--'``.

    Args:
        sub_keys: Dict whose values identify the slice.
        iteration: Round number; selects the ``r<iteration>`` directory and
            the columns ``'1'`` .. ``str(iteration)`` that are kept.
        eval_dir: Directory containing the ``r<iteration>/tables`` folders.
        combined: Sub-directory of ``tables`` to read from.

    Returns:
        DataFrame indexed by the first CSV column, restricted to the round
        columns ``'1'`` .. ``str(iteration)``.
    """
    # Characters that are replaced before the key is used in the file name.
    substitutions = (('/', '-'), (';', '--'))

    key = '.'.join(sub_keys.values())
    for old, new in substitutions:
        key = key.replace(old, new)

    csv_path = os.path.join(eval_dir, f'r{iteration}', 'tables', combined, f'iterevals.{key}.csv')
    round_columns = list(map(str, range(1, iteration+1)))
    return pd.read_csv(csv_path, index_col=0)[round_columns]
    

In [6]:
def get_mnli_tables(mnli_summary, subsetting='genre', model=None):
    """Read a JSON-lines MNLI summary and pivot it into per-subset accuracy tables.

    Args:
        mnli_summary: Path to a JSON-lines file; every line is one record with
            at least the keys 'comb', 'treat', 'iter', 'acc' and the column
            named by ``subsetting``.
        subsetting: Column used to split each 'comb' group into subsets.
        model: First element of every key in the returned dict. When omitted,
            the notebook-level global ``model`` is used, which is what earlier
            versions of this function always did.

    Returns:
        ``(summary, mnli_tables)``: ``summary`` is the raw records as a
        DataFrame; ``mnli_tables`` maps ``(model, comb, subset)`` to a
        DataFrame with one row per treatment and one column per 'iter' value.

    Raises:
        NameError: If ``model`` is not passed and no global ``model`` exists.

    Note:
        Records whose ``subsetting`` value is missing never match the equality
        filter, so they do not appear in any table; fill them beforehand if
        they should be kept.
    """
    if model is None:
        # Backward compatibility with callers that rely on the notebook global.
        try:
            model = globals()['model']
        except KeyError:
            raise NameError(
                "get_mnli_tables: pass `model=` or define a global `model`"
            ) from None

    with open(mnli_summary, 'r') as f:
        summary = pd.DataFrame([json.loads(line) for line in f])

    mnli_tables = {}
    for comb in summary['comb'].unique():
        comb_sum = summary.loc[summary['comb'] == comb, :]

        # Only visit subsets that occur for this comb; iterating over the
        # whole summary yields empty selections that pd.concat rejects.
        for subset in comb_sum[subsetting].unique():
            subset_sum = comb_sum.loc[comb_sum[subsetting] == subset, :]

            plot_tab = []
            for treat in subset_sum['treat'].unique():
                treat_sum = subset_sum.loc[subset_sum['treat'] == treat, :]
                # One-row frame: index is the treatment, columns are the iterations.
                s = treat_sum[['iter', 'acc']].set_index('iter').rename({'acc': treat}, axis=1).transpose()
                plot_tab.append(s)

            if not plot_tab:
                # Happens for a missing (NaN) subset value, which matches no row.
                continue

            mnli_tables[(model, comb, subset)] = pd.concat(plot_tab)

    return summary, mnli_tables

In [7]:
def split_run_name(run_name, split_by='_'):
    """Split a run name into ``(treat, iter, input_type, comb)``.

    The first two fields are always the first two pieces of the name. The
    remaining two depend on how many pieces there are:

    * 2 pieces: ``input_type='full'``, ``comb='combined'``.
    * 3 pieces: a trailing ``'hyp'`` is the input type (``comb='combined'``);
      any other trailing piece is ``comb`` (``input_type='full'``).
    * otherwise: the last piece is the input type, the one before it is ``comb``.

    Args:
        run_name: Run identifier to split.
        split_by: Separator between the pieces.

    Returns:
        Tuple of four strings ``(treat, iter, input_type, comb)``.
    """
    parts = run_name.split(split_by)
    n_parts = len(parts)
    tail = parts[-1]

    if n_parts == 2:
        input_type, comb = 'full', 'combined'
    elif n_parts == 3:
        input_type, comb = (tail, 'combined') if tail == 'hyp' else ('full', tail)
    else:
        input_type, comb = tail, parts[-2]

    return (parts[0], parts[1], input_type, comb)

In [8]:
def load_sampled_results(sampled_base):
    """Read the three sampled-result CSV files stored in ``sampled_base``.

    Args:
        sampled_base: Directory containing ``collected.csv``, ``itereval.csv``
            and ``mnli.csv``.

    Returns:
        Tuple ``(collected, itereval, mnli)`` of DataFrames, one per file.
    """
    file_names = ('collected.csv', 'itereval.csv', 'mnli.csv')
    return tuple(
        pd.read_csv(os.path.join(sampled_base, file_name))
        for file_name in file_names
    )

In [9]:
def get_ttest_pvals(dist_df, verbose=True):
    """Run pairwise independent-sample t-tests between the three treatments.

    Args:
        dist_df: DataFrame with a 'treat' column (treatment name) and an 'acc'
            column (accuracy values).
        verbose: When true, print the t statistic and the p-value for each pair.

    Returns:
        Dict mapping ``(treat_a, treat_b)`` to the result of
        ``ttest_ind(acc_a, acc_b)`` called with its default arguments.

    Note:
        The printed p-value is the returned p-value divided by two, whatever
        the sign of t; the returned result itself is not halved.
    """
    pairs = [
        ('baseline', 'LotS'),
        ('baseline', 'LitL'),
        ('LotS', 'LitL'),
    ]

    def _accs(treat):
        # Accuracy values belonging to a single treatment.
        return dist_df.loc[dist_df['treat'] == treat, 'acc']

    ttest_dict = {
        (first, second): ttest_ind(_accs(first), _accs(second))
        for first, second in pairs
    }

    if verbose:
        for pair, ttest_results in ttest_dict.items():
            t_stat = ttest_results[0]
            halved_p = ttest_results[1]/2
            print('='*45)
            print(f"{pair}\nt: {t_stat:.5f} | p: {halved_p:.5f}")

    return ttest_dict
    

# Plot

In [10]:
def single_round_box(
    plot_arrays, 
    plot_alltraining,
    ylim=[0,1],
    title=None,
    xlabel=None,
    ylabel=None,
    tabletitle=None,
    tableon=True,
):
    """Box-plot the accuracies per treatment and mark the full-training values.

    Args:
        plot_arrays: Long-form DataFrame with 'treat' and 'acc' columns; one
            box is drawn per treatment.
        plot_alltraining: DataFrame with 'treat' and 'acc' columns holding one
            accuracy per treatment; drawn as red markers on top of the boxes
            and, when ``tableon`` is true, listed in a table below the plot.
        ylim: ``(low, high)`` limits for the accuracy axis.
        title: Title of the box plot.
        xlabel: X-axis label of the box plot.
        ylabel: Y-axis label of the box plot.
        tabletitle: Title shown above the table panel.
        tableon: When true, add a second panel with the table.

    Returns:
        The matplotlib figure.
    """
    if tableon:
        fig, (box_ax, table_ax) = plt.subplots(2, 1)
    else:
        fig, box_ax = plt.subplots()
        table_ax = None

    # The box plot and its markers are the same with or without the table.
    sns.boxplot(x='treat', y='acc', data=plot_arrays, ax=box_ax)
    box_ax.scatter(x=plot_alltraining['treat'], y=plot_alltraining['acc'], c='r', s=75)

    box_ax.set_title(title)
    box_ax.set_xlabel(xlabel)
    box_ax.set_ylabel(ylabel)
    box_ax.set_ylim(*ylim)

    if table_ax is not None:
        # Take labels and values from the same rows so each percentage sits
        # under its own treatment. Previously the cells followed a fixed
        # treatment order while the labels followed the data order.
        col_labels = list(plot_alltraining['treat'].values)
        cell_text = [f'{acc*100:.2f}%' for acc in plot_alltraining['acc'].values]

        table = table_ax.table(cellText=[cell_text], colLabels=col_labels, loc='upper center')
        table.scale(1, 2)
        table_ax.axis('off')

        table_ax.set_title(tabletitle)

    fig.tight_layout()

    return fig

In [11]:
# Notebook-wide configuration: which model's evaluation summaries to read and which round to plot.
model='roberta-large-mnli'
# NOTE: '__file__' is a string literal here, so abspath() resolves it against the current working
# directory; the two dirname() calls therefore give the parent of the working directory.
repo = os.path.dirname(os.path.dirname(os.path.abspath('__file__')))
eval_dir = os.path.join(repo, 'eval_summary', model)
sample_type = 'cross_eval'
iteration = 5

mnli_summary = os.path.join(eval_dir, 'mnli_evals', 'eval_summaries.jsonl')

# Figures are saved next to the sampled results of this round; create the folder up front.
plots_dir = os.path.join(eval_dir, 'sample', sample_type, f'r{iteration}', 'plots')
os.makedirs(plots_dir, exist_ok=True)

In [12]:
# Load collected.csv, itereval.csv and mnli.csv from the sampled-results folder of this round.
collected_errs, itereval_errs, mnli_errs = load_sampled_results(
    os.path.join(eval_dir, 'sample', sample_type, f'r{iteration}')
)

In [13]:
# Derive the selection columns from each run name; split_run_name returns
# (treat, iter, input_type, comb) and 'iter' is converted to int for numeric comparison.
collected_errs['treat'] = collected_errs['run'].apply(lambda x: split_run_name(x)[0])
collected_errs['iter'] = collected_errs['run'].apply(lambda x: int(split_run_name(x)[1]))
collected_errs['mod'] = collected_errs['run'].apply(lambda x: split_run_name(x)[2])
collected_errs['combined'] = collected_errs['run'].apply(lambda x: split_run_name(x)[3])

In [14]:
# Rows without a genre are labelled 'combined' (presumably the all-genre aggregate — confirm).
mnli_errs['genre'] = mnli_errs['genre'].fillna('combined')

## Hyp

In [15]:
# Maps (combined, input_type) to the modifier passed to get_val_summary,
# i.e. the <modifier> part of the configs.<modifier>.csv file name.
select2mod = {
    ('combined', 'full'): 'combined',
    ('combined', 'hyp'): 'hyp',
    ('separate', 'full'): 'separate',
    ('separate', 'hyp'): 'separate_hyp',
}

### Combined

In [16]:
# Selection for this section: 'combined' runs with the 'hyp' input type.
combined = 'combined'
input_type = 'hyp'

mod = select2mod[(combined, input_type)]

# Summary table for that selection, restricted to rounds 1..iteration.
collected = get_val_summary(mod, iteration, eval_dir, )

In [17]:
# Keep only the column of the selected round; the index holds the treatment names.
collected_best = collected[str(iteration)]

In [18]:
# Narrow the sampled results step by step: selected round, then input type, then combined/separate.
collected_iter_errs = collected_errs.loc[collected_errs['iter'] == iteration, :]
collected_mod_errs = collected_iter_errs.loc[collected_iter_errs['mod'] == input_type, :]
collected_err = collected_mod_errs.loc[collected_mod_errs['combined'] == combined, :]

In [19]:
# Build the long-form accuracy frame (one row per accuracy value) and the
# per-treatment accuracies taken from the summary table.
# NOTE(review): the summary-table accuracy is pooled with the sampled ones, so it
# also enters the t-tests computed from plot_arrays — confirm this is intended.
plot_arrays = []
plot_alltraining = []

for treat in collected_best.index.values:
    best_acc = collected_best[treat]
    sampled_accs = collected_err.loc[collected_err['treat'] == treat, 'acc']
    # Series.append was removed in pandas 2.0; pd.concat produces the same Series.
    accs = pd.concat([sampled_accs, pd.Series([best_acc])], ignore_index=True)
    accs = accs.to_frame(name='acc')
    accs['treat'] = treat

    plot_arrays.append(accs)
    plot_alltraining.append({'acc': best_acc, 'treat': treat})

plot_arrays = pd.concat(plot_arrays, ignore_index=True)
plot_alltraining = pd.DataFrame(plot_alltraining)

In [20]:
# Pairwise t-tests between treatments; prints t and the halved p-value for each pair.
ttests = get_ttest_pvals(plot_arrays)

('baseline', 'LotS')
t: 20.87870 | p: 0.00000
('baseline', 'LitL')
t: 15.65192 | p: 0.00000
('LotS', 'LitL')
t: 0.21880 | p: 0.41451


In [21]:
# Figure settings; the accuracy axis is narrowed to 0.3-0.7 for this plot.
ylim=[0.3,0.7]
title=f'{combined} | {input_type} input'
xlabel=''
ylabel='Accuracy'
tabletitle='Full Training Data'

figtype='jpg'

# Draw the box plot with its table and save it under plots_dir.
fig = single_round_box(plot_arrays, plot_alltraining, ylim=ylim, title=title, xlabel=xlabel, ylabel=ylabel, tabletitle=tabletitle)
fig.savefig(os.path.join(plots_dir, f'collected-{combined}-{input_type}.{figtype}'))

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

### Separate

In [22]:
# Selection for this section: 'separate' runs with the 'hyp' input type.
combined = 'separate'
input_type = 'hyp'

mod = select2mod[(combined, input_type)]

# Summary table for that selection, restricted to rounds 1..iteration.
collected = get_val_summary(mod, iteration, eval_dir, )

In [23]:
# Keep only the column of the selected round; the index holds the treatment names.
collected_best = collected[str(iteration)]

In [24]:
# Narrow the sampled results step by step: selected round, then input type, then combined/separate.
collected_iter_errs = collected_errs.loc[collected_errs['iter'] == iteration, :]
collected_mod_errs = collected_iter_errs.loc[collected_iter_errs['mod'] == input_type, :]
collected_err = collected_mod_errs.loc[collected_mod_errs['combined'] == combined, :]

In [25]:
# Build the long-form accuracy frame (one row per accuracy value) and the
# per-treatment accuracies taken from the summary table.
# NOTE(review): the summary-table accuracy is pooled with the sampled ones, so it
# also enters the t-tests computed from plot_arrays — confirm this is intended.
plot_arrays = []
plot_alltraining = []

for treat in collected_best.index.values:
    best_acc = collected_best[treat]
    sampled_accs = collected_err.loc[collected_err['treat'] == treat, 'acc']
    # Series.append was removed in pandas 2.0; pd.concat produces the same Series.
    accs = pd.concat([sampled_accs, pd.Series([best_acc])], ignore_index=True)
    accs = accs.to_frame(name='acc')
    accs['treat'] = treat

    plot_arrays.append(accs)
    plot_alltraining.append({'acc': best_acc, 'treat': treat})

plot_arrays = pd.concat(plot_arrays, ignore_index=True)
plot_alltraining = pd.DataFrame(plot_alltraining)

In [26]:
# Pairwise t-tests between treatments; prints t and the halved p-value for each pair.
ttests = get_ttest_pvals(plot_arrays)

('baseline', 'LotS')
t: 25.19341 | p: 0.00000
('baseline', 'LitL')
t: 16.37855 | p: 0.00000
('LotS', 'LitL')
t: -4.07685 | p: 0.00029


In [27]:
# Figure settings; the accuracy axis is narrowed to 0.3-0.7 for this plot.
ylim=[0.3,0.7]
title=f'{combined} | {input_type} input'
xlabel=''
ylabel='Accuracy'
tabletitle='Full Training Data'

figtype='jpg'

# Draw the box plot with its table and save it under plots_dir.
fig = single_round_box(plot_arrays, plot_alltraining, ylim=ylim, title=title, xlabel=xlabel, ylabel=ylabel, tabletitle=tabletitle)
fig.savefig(os.path.join(plots_dir, f'collected-{combined}-{input_type}.{figtype}'))

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## HANS - combined

In [28]:
# Tables sub-directory read by get_itereval_summary throughout this section; also used in titles and file names.
combined_iterations = 'combined'

### lexical_overlap

#### entailment

In [29]:
# Slice analysed in this subsection: HANS, lexical_overlap case, entailment label.
sub_keys = {
    'dataset': 'hans',     # either hans or glue
    'case': 'lexical_overlap',    # combined or specific to respective itereval set
    'subcase': 'combined', # combined or specific to respective itereval set
    'label': 'entailment',   # combined or [entailment, neutral, contradiction] for glue, [entailment, non-entailment] for hans
}


# Summary table for that slice, restricted to rounds 1..iteration.
hans = get_itereval_summary(sub_keys, iteration, eval_dir, combined_iterations)

In [30]:
# Keep only the column of the selected round; the index holds the treatment names.
collected_best = hans[str(iteration)]

In [31]:
# Narrow the sampled itereval results to the selected round and to the slice described by sub_keys.
# NOTE(review): nothing here filters on `combined_iterations` — confirm the sampled rows apply to both settings.
round_errs = itereval_errs.loc[itereval_errs['iter'] == iteration, :]
dataset_errs = round_errs.loc[round_errs['dataset'] == sub_keys['dataset'], :]
case_errs = dataset_errs.loc[dataset_errs['case'] == sub_keys['case'], :]
subcase_errs = case_errs.loc[case_errs['subcase'] == sub_keys['subcase'], :]
collected_err = subcase_errs.loc[subcase_errs['label'] == sub_keys['label'], :]

In [32]:
# Build the long-form accuracy frame (one row per accuracy value) and the
# per-treatment accuracies taken from the summary table.
# NOTE(review): the summary-table accuracy is pooled with the sampled ones, so it
# also enters the t-tests computed from plot_arrays — confirm this is intended.
plot_arrays = []
plot_alltraining = []

for treat in collected_best.index.values:
    best_acc = collected_best[treat]
    sampled_accs = collected_err.loc[collected_err['treat'] == treat, 'acc']
    # Series.append was removed in pandas 2.0; pd.concat produces the same Series.
    accs = pd.concat([sampled_accs, pd.Series([best_acc])], ignore_index=True)
    accs = accs.to_frame(name='acc')
    accs['treat'] = treat

    plot_arrays.append(accs)
    plot_alltraining.append({'acc': best_acc, 'treat': treat})

plot_arrays = pd.concat(plot_arrays, ignore_index=True)
plot_alltraining = pd.DataFrame(plot_alltraining)

In [33]:
# Pairwise t-tests between treatments; prints t and the halved p-value for each pair.
ttests = get_ttest_pvals(plot_arrays)

('baseline', 'LotS')
t: 2.15641 | p: 0.01856
('baseline', 'LitL')
t: 2.46001 | p: 0.00916
('LotS', 'LitL')
t: -0.65339 | p: 0.25862


In [34]:
# Figure settings; the title names the setting, dataset, case and label of this slice.
ylim=[0,1]
title=f"{combined_iterations} | {sub_keys['dataset']} | {sub_keys['case']} | {sub_keys['label']}"
xlabel=''
ylabel='Accuracy'
tabletitle='Full Training Data'

figtype='jpg'

# Draw the box plot with its table and save it under plots_dir.
fig = single_round_box(plot_arrays, plot_alltraining, ylim=ylim, title=title, xlabel=xlabel, ylabel=ylabel, tabletitle=tabletitle)
fig.savefig(os.path.join(plots_dir, f"{sub_keys['dataset']}-{combined_iterations}-{sub_keys['case']}-{sub_keys['label']}.{figtype}"))

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

#### non-entailment

In [35]:
# Slice analysed in this subsection: HANS, lexical_overlap case, non-entailment label.
sub_keys = {
    'dataset': 'hans',     # either hans or glue
    'case': 'lexical_overlap',    # combined or specific to respective itereval set
    'subcase': 'combined', # combined or specific to respective itereval set
    'label': 'non-entailment',   # combined or [entailment, neutral, contradiction] for glue, [entailment, non-entailment] for hans
}

# Summary table for that slice, restricted to rounds 1..iteration.
hans = get_itereval_summary(sub_keys, iteration, eval_dir, combined_iterations)

In [36]:
# Keep only the column of the selected round; the index holds the treatment names.
collected_best = hans[str(iteration)]

In [37]:
# Narrow the sampled itereval results to the selected round and to the slice described by sub_keys.
# NOTE(review): nothing here filters on `combined_iterations` — confirm the sampled rows apply to both settings.
round_errs = itereval_errs.loc[itereval_errs['iter'] == iteration, :]
dataset_errs = round_errs.loc[round_errs['dataset'] == sub_keys['dataset'], :]
case_errs = dataset_errs.loc[dataset_errs['case'] == sub_keys['case'], :]
subcase_errs = case_errs.loc[case_errs['subcase'] == sub_keys['subcase'], :]
collected_err = subcase_errs.loc[subcase_errs['label'] == sub_keys['label'], :]

In [38]:
# Build the long-form accuracy frame (one row per accuracy value) and the
# per-treatment accuracies taken from the summary table.
# NOTE(review): the summary-table accuracy is pooled with the sampled ones, so it
# also enters the t-tests computed from plot_arrays — confirm this is intended.
plot_arrays = []
plot_alltraining = []

for treat in collected_best.index.values:
    best_acc = collected_best[treat]
    sampled_accs = collected_err.loc[collected_err['treat'] == treat, 'acc']
    # Series.append was removed in pandas 2.0; pd.concat produces the same Series.
    accs = pd.concat([sampled_accs, pd.Series([best_acc])], ignore_index=True)
    accs = accs.to_frame(name='acc')
    accs['treat'] = treat

    plot_arrays.append(accs)
    plot_alltraining.append({'acc': best_acc, 'treat': treat})

plot_arrays = pd.concat(plot_arrays, ignore_index=True)
plot_alltraining = pd.DataFrame(plot_alltraining)

In [39]:
# Pairwise t-tests between treatments; prints t and the halved p-value for each pair.
ttests = get_ttest_pvals(plot_arrays)

('baseline', 'LotS')
t: -3.69738 | p: 0.00033
('baseline', 'LitL')
t: -5.68781 | p: 0.00000
('LotS', 'LitL')
t: -1.41058 | p: 0.08305


In [40]:
# Figure settings; the title names the setting, dataset, case and label of this slice.
ylim=[0,1]
title=f"{combined_iterations} | {sub_keys['dataset']} | {sub_keys['case']} | {sub_keys['label']}"
xlabel=''
ylabel='Accuracy'
tabletitle='Full Training Data'

figtype='jpg'

# Draw the box plot with its table and save it under plots_dir.
fig = single_round_box(plot_arrays, plot_alltraining, ylim=ylim, title=title, xlabel=xlabel, ylabel=ylabel, tabletitle=tabletitle)
fig.savefig(os.path.join(plots_dir, f"{sub_keys['dataset']}-{combined_iterations}-{sub_keys['case']}-{sub_keys['label']}.{figtype}"))

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

### subsequence

#### entailment

In [41]:
# Slice analysed in this subsection: HANS, subsequence case, entailment label.
sub_keys = {
    'dataset': 'hans',     # either hans or glue
    'case': 'subsequence',    # combined or specific to respective itereval set
    'subcase': 'combined', # combined or specific to respective itereval set
    'label': 'entailment',   # combined or [entailment, neutral, contradiction] for glue, [entailment, non-entailment] for hans
}

# Summary table for that slice, restricted to rounds 1..iteration.
hans = get_itereval_summary(sub_keys, iteration, eval_dir, combined_iterations)

In [42]:
# Keep only the column of the selected round; the index holds the treatment names.
collected_best = hans[str(iteration)]

In [43]:
# Narrow the sampled itereval results to the selected round and to the slice described by sub_keys.
# NOTE(review): nothing here filters on `combined_iterations` — confirm the sampled rows apply to both settings.
round_errs = itereval_errs.loc[itereval_errs['iter'] == iteration, :]
dataset_errs = round_errs.loc[round_errs['dataset'] == sub_keys['dataset'], :]
case_errs = dataset_errs.loc[dataset_errs['case'] == sub_keys['case'], :]
subcase_errs = case_errs.loc[case_errs['subcase'] == sub_keys['subcase'], :]
collected_err = subcase_errs.loc[subcase_errs['label'] == sub_keys['label'], :]

In [44]:
# Build the long-form accuracy frame (one row per accuracy value) and the
# per-treatment accuracies taken from the summary table.
# NOTE(review): the summary-table accuracy is pooled with the sampled ones, so it
# also enters the t-tests computed from plot_arrays — confirm this is intended.
plot_arrays = []
plot_alltraining = []

for treat in collected_best.index.values:
    best_acc = collected_best[treat]
    sampled_accs = collected_err.loc[collected_err['treat'] == treat, 'acc']
    # Series.append was removed in pandas 2.0; pd.concat produces the same Series.
    accs = pd.concat([sampled_accs, pd.Series([best_acc])], ignore_index=True)
    accs = accs.to_frame(name='acc')
    accs['treat'] = treat

    plot_arrays.append(accs)
    plot_alltraining.append({'acc': best_acc, 'treat': treat})

plot_arrays = pd.concat(plot_arrays, ignore_index=True)
plot_alltraining = pd.DataFrame(plot_alltraining)

In [45]:
# Pairwise t-tests between treatments; prints t and the halved p-value for each pair.
ttests = get_ttest_pvals(plot_arrays)

('baseline', 'LotS')
t: 2.48796 | p: 0.00856
('baseline', 'LitL')
t: 1.82994 | p: 0.03736
('LotS', 'LitL')
t: -1.76133 | p: 0.04291


In [46]:
# Figure settings; the title names the setting, dataset, case and label of this slice.
ylim=[0,1]
title=f"{combined_iterations} | {sub_keys['dataset']} | {sub_keys['case']} | {sub_keys['label']}"
xlabel=''
ylabel='Accuracy'
tabletitle='Full Training Data'

figtype='jpg'

# Draw the box plot with its table and save it under plots_dir.
fig = single_round_box(plot_arrays, plot_alltraining, ylim=ylim, title=title, xlabel=xlabel, ylabel=ylabel, tabletitle=tabletitle)
fig.savefig(os.path.join(plots_dir, f"{sub_keys['dataset']}-{combined_iterations}-{sub_keys['case']}-{sub_keys['label']}.{figtype}"))

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

#### non-entailment

In [47]:
# Slice analysed in this subsection: HANS, subsequence case, non-entailment label.
sub_keys = {
    'dataset': 'hans',     # either hans or glue
    'case': 'subsequence',    # combined or specific to respective itereval set
    'subcase': 'combined', # combined or specific to respective itereval set
    'label': 'non-entailment',   # combined or [entailment, neutral, contradiction] for glue, [entailment, non-entailment] for hans
}

# Summary table for that slice, restricted to rounds 1..iteration.
hans = get_itereval_summary(sub_keys, iteration, eval_dir, combined_iterations)

In [48]:
# Keep only the column of the selected round; the index holds the treatment names.
collected_best = hans[str(iteration)]

In [49]:
# Narrow the sampled itereval results to the selected round and to the slice described by sub_keys.
# NOTE(review): nothing here filters on `combined_iterations` — confirm the sampled rows apply to both settings.
round_errs = itereval_errs.loc[itereval_errs['iter'] == iteration, :]
dataset_errs = round_errs.loc[round_errs['dataset'] == sub_keys['dataset'], :]
case_errs = dataset_errs.loc[dataset_errs['case'] == sub_keys['case'], :]
subcase_errs = case_errs.loc[case_errs['subcase'] == sub_keys['subcase'], :]
collected_err = subcase_errs.loc[subcase_errs['label'] == sub_keys['label'], :]

In [50]:
# Build the long-form accuracy frame (one row per accuracy value) and the
# per-treatment accuracies taken from the summary table.
# NOTE(review): the summary-table accuracy is pooled with the sampled ones, so it
# also enters the t-tests computed from plot_arrays — confirm this is intended.
plot_arrays = []
plot_alltraining = []

for treat in collected_best.index.values:
    best_acc = collected_best[treat]
    sampled_accs = collected_err.loc[collected_err['treat'] == treat, 'acc']
    # Series.append was removed in pandas 2.0; pd.concat produces the same Series.
    accs = pd.concat([sampled_accs, pd.Series([best_acc])], ignore_index=True)
    accs = accs.to_frame(name='acc')
    accs['treat'] = treat

    plot_arrays.append(accs)
    plot_alltraining.append({'acc': best_acc, 'treat': treat})

plot_arrays = pd.concat(plot_arrays, ignore_index=True)
plot_alltraining = pd.DataFrame(plot_alltraining)

In [51]:
# Pairwise t-tests between treatments; prints t and the halved p-value for each pair.
ttests = get_ttest_pvals(plot_arrays)

('baseline', 'LotS')
t: -3.24106 | p: 0.00120
('baseline', 'LitL')
t: -7.32403 | p: 0.00000
('LotS', 'LitL')
t: -3.13003 | p: 0.00163


In [52]:
# Figure settings; the title names the setting, dataset, case and label of this slice.
ylim=[0,1]
title=f"{combined_iterations} | {sub_keys['dataset']} | {sub_keys['case']} | {sub_keys['label']}"
xlabel=''
ylabel='Accuracy'
tabletitle='Full Training Data'

figtype='jpg'

# Draw the box plot with its table and save it under plots_dir.
fig = single_round_box(plot_arrays, plot_alltraining, ylim=ylim, title=title, xlabel=xlabel, ylabel=ylabel, tabletitle=tabletitle)
fig.savefig(os.path.join(plots_dir, f"{sub_keys['dataset']}-{combined_iterations}-{sub_keys['case']}-{sub_keys['label']}.{figtype}"))

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

### constituent

#### entailment

In [53]:
# Slice analysed in this subsection: HANS, constituent case, entailment label.
sub_keys = {
    'dataset': 'hans',     # either hans or glue
    'case': 'constituent',    # combined or specific to respective itereval set
    'subcase': 'combined', # combined or specific to respective itereval set
    'label': 'entailment',   # combined or [entailment, neutral, contradiction] for glue, [entailment, non-entailment] for hans
}

# Summary table for that slice, restricted to rounds 1..iteration.
hans = get_itereval_summary(sub_keys, iteration, eval_dir, combined_iterations)

In [54]:
# Keep only the column of the selected round; the index holds the treatment names.
collected_best = hans[str(iteration)]

In [55]:
# Narrow the sampled itereval results to the selected round and to the slice described by sub_keys.
# NOTE(review): nothing here filters on `combined_iterations` — confirm the sampled rows apply to both settings.
round_errs = itereval_errs.loc[itereval_errs['iter'] == iteration, :]
dataset_errs = round_errs.loc[round_errs['dataset'] == sub_keys['dataset'], :]
case_errs = dataset_errs.loc[dataset_errs['case'] == sub_keys['case'], :]
subcase_errs = case_errs.loc[case_errs['subcase'] == sub_keys['subcase'], :]
collected_err = subcase_errs.loc[subcase_errs['label'] == sub_keys['label'], :]

In [56]:
# Build the long-form accuracy frame (one row per accuracy value) and the
# per-treatment accuracies taken from the summary table.
# NOTE(review): the summary-table accuracy is pooled with the sampled ones, so it
# also enters the t-tests computed from plot_arrays — confirm this is intended.
plot_arrays = []
plot_alltraining = []

for treat in collected_best.index.values:
    best_acc = collected_best[treat]
    sampled_accs = collected_err.loc[collected_err['treat'] == treat, 'acc']
    # Series.append was removed in pandas 2.0; pd.concat produces the same Series.
    accs = pd.concat([sampled_accs, pd.Series([best_acc])], ignore_index=True)
    accs = accs.to_frame(name='acc')
    accs['treat'] = treat

    plot_arrays.append(accs)
    plot_alltraining.append({'acc': best_acc, 'treat': treat})

plot_arrays = pd.concat(plot_arrays, ignore_index=True)
plot_alltraining = pd.DataFrame(plot_alltraining)

In [57]:
# Pairwise t-tests between treatments; prints t and the halved p-value for each pair.
ttests = get_ttest_pvals(plot_arrays)

('baseline', 'LotS')
t: -4.42137 | p: 0.00004
('baseline', 'LitL')
t: 1.97077 | p: 0.02785
('LotS', 'LitL')
t: 5.70621 | p: 0.00000


In [58]:
# Figure settings; the title names the setting, dataset, case and label of this slice.
ylim=[0,1.0]
title=f"{combined_iterations} | {sub_keys['dataset']} | {sub_keys['case']} | {sub_keys['label']}"
xlabel=''
ylabel='Accuracy'
tabletitle='Full Training Data'

figtype='jpg'

# Draw the box plot with its table and save it under plots_dir.
fig = single_round_box(plot_arrays, plot_alltraining, ylim=ylim, title=title, xlabel=xlabel, ylabel=ylabel, tabletitle=tabletitle)
fig.savefig(os.path.join(plots_dir, f"{sub_keys['dataset']}-{combined_iterations}-{sub_keys['case']}-{sub_keys['label']}.{figtype}"))

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

#### non-entailment

In [59]:
# Slice analysed in this subsection: HANS, constituent case, non-entailment label.
sub_keys = {
    'dataset': 'hans',     # either hans or glue
    'case': 'constituent',    # combined or specific to respective itereval set
    'subcase': 'combined', # combined or specific to respective itereval set
    'label': 'non-entailment',   # combined or [entailment, neutral, contradiction] for glue, [entailment, non-entailment] for hans
}

# Summary table for that slice, restricted to rounds 1..iteration.
hans = get_itereval_summary(sub_keys, iteration, eval_dir, combined_iterations)

In [60]:
# Keep only the column of the selected round; the index holds the treatment names.
collected_best = hans[str(iteration)]

In [61]:
# Narrow the sampled itereval results to the selected round and to the slice described by sub_keys.
# NOTE(review): nothing here filters on `combined_iterations` — confirm the sampled rows apply to both settings.
round_errs = itereval_errs.loc[itereval_errs['iter'] == iteration, :]
dataset_errs = round_errs.loc[round_errs['dataset'] == sub_keys['dataset'], :]
case_errs = dataset_errs.loc[dataset_errs['case'] == sub_keys['case'], :]
subcase_errs = case_errs.loc[case_errs['subcase'] == sub_keys['subcase'], :]
collected_err = subcase_errs.loc[subcase_errs['label'] == sub_keys['label'], :]

In [62]:
# Build the long-form accuracy frame (one row per accuracy value) and the
# per-treatment accuracies taken from the summary table.
# NOTE(review): the summary-table accuracy is pooled with the sampled ones, so it
# also enters the t-tests computed from plot_arrays — confirm this is intended.
plot_arrays = []
plot_alltraining = []

for treat in collected_best.index.values:
    best_acc = collected_best[treat]
    sampled_accs = collected_err.loc[collected_err['treat'] == treat, 'acc']
    # Series.append was removed in pandas 2.0; pd.concat produces the same Series.
    accs = pd.concat([sampled_accs, pd.Series([best_acc])], ignore_index=True)
    accs = accs.to_frame(name='acc')
    accs['treat'] = treat

    plot_arrays.append(accs)
    plot_alltraining.append({'acc': best_acc, 'treat': treat})

plot_arrays = pd.concat(plot_arrays, ignore_index=True)
plot_alltraining = pd.DataFrame(plot_alltraining)

In [63]:
# Pairwise t-tests between treatments; prints t and the halved p-value for each pair.
ttests = get_ttest_pvals(plot_arrays)

('baseline', 'LotS')
t: 1.92932 | p: 0.03040
('baseline', 'LitL')
t: -0.24893 | p: 0.40235
('LotS', 'LitL')
t: -1.94081 | p: 0.02968


In [64]:
# Figure settings; the title names the setting, dataset, case and label of this slice.
ylim=[0,1]
title=f"{combined_iterations} | {sub_keys['dataset']} | {sub_keys['case']} | {sub_keys['label']}"
xlabel=''
ylabel='Accuracy'
tabletitle='Full Training Data'

figtype='jpg'

# Draw the box plot with its table and save it under plots_dir.
fig = single_round_box(plot_arrays, plot_alltraining, ylim=ylim, title=title, xlabel=xlabel, ylabel=ylabel, tabletitle=tabletitle)
fig.savefig(os.path.join(plots_dir, f"{sub_keys['dataset']}-{combined_iterations}-{sub_keys['case']}-{sub_keys['label']}.{figtype}"))

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## HANS - separate

In [65]:
# Tables sub-directory read by get_itereval_summary throughout this section; also used in titles and file names.
combined_iterations = 'separate'

### lexical_overlap

#### entailment

In [66]:
# Slice analysed in this subsection: HANS, lexical_overlap case, entailment label.
sub_keys = {
    'dataset': 'hans',     # either hans or glue
    'case': 'lexical_overlap',    # combined or specific to respective itereval set
    'subcase': 'combined', # combined or specific to respective itereval set
    'label': 'entailment',   # combined or [entailment, neutral, contradiction] for glue, [entailment, non-entailment] for hans
}


# Summary table for that slice, restricted to rounds 1..iteration.
hans = get_itereval_summary(sub_keys, iteration, eval_dir, combined_iterations)

In [67]:
# Keep only the column of the selected round; the index holds the treatment names.
collected_best = hans[str(iteration)]

In [68]:
# Narrow the sampled itereval results to the selected round and to the slice described by sub_keys.
# NOTE(review): nothing here filters on `combined_iterations` — confirm the sampled rows apply to both settings.
round_errs = itereval_errs.loc[itereval_errs['iter'] == iteration, :]
dataset_errs = round_errs.loc[round_errs['dataset'] == sub_keys['dataset'], :]
case_errs = dataset_errs.loc[dataset_errs['case'] == sub_keys['case'], :]
subcase_errs = case_errs.loc[case_errs['subcase'] == sub_keys['subcase'], :]
collected_err = subcase_errs.loc[subcase_errs['label'] == sub_keys['label'], :]

In [69]:
# Build the long-form accuracy frame (one row per accuracy value) and the
# per-treatment accuracies taken from the summary table.
# NOTE(review): the summary-table accuracy is pooled with the sampled ones, so it
# also enters the t-tests computed from plot_arrays — confirm this is intended.
plot_arrays = []
plot_alltraining = []

for treat in collected_best.index.values:
    best_acc = collected_best[treat]
    sampled_accs = collected_err.loc[collected_err['treat'] == treat, 'acc']
    # Series.append was removed in pandas 2.0; pd.concat produces the same Series.
    accs = pd.concat([sampled_accs, pd.Series([best_acc])], ignore_index=True)
    accs = accs.to_frame(name='acc')
    accs['treat'] = treat

    plot_arrays.append(accs)
    plot_alltraining.append({'acc': best_acc, 'treat': treat})

plot_arrays = pd.concat(plot_arrays, ignore_index=True)
plot_alltraining = pd.DataFrame(plot_alltraining)

In [70]:
# Pairwise t-tests between treatments; prints t and the halved p-value for each pair.
ttests = get_ttest_pvals(plot_arrays)

('baseline', 'LotS')
t: 2.23237 | p: 0.01563
('baseline', 'LitL')
t: 2.58502 | p: 0.00674
('LotS', 'LitL')
t: -0.66205 | p: 0.25587


In [71]:
# Figure settings; the title names the setting, dataset, case and label of this slice.
ylim=[0.0,1.0]
title=f"{combined_iterations} | {sub_keys['dataset']} | {sub_keys['case']} | {sub_keys['label']}"
xlabel=''
ylabel='Accuracy'
tabletitle='Full Training Data'

figtype='jpg'

# Draw the box plot with its table and save it under plots_dir.
fig = single_round_box(plot_arrays, plot_alltraining, ylim=ylim, title=title, xlabel=xlabel, ylabel=ylabel, tabletitle=tabletitle)
fig.savefig(os.path.join(plots_dir, f"{sub_keys['dataset']}-{combined_iterations}-{sub_keys['case']}-{sub_keys['label']}.{figtype}"))

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

#### non-entailment

In [72]:
# Slice analysed in this subsection: HANS, lexical_overlap case, non-entailment label.
sub_keys = {
    'dataset': 'hans',     # either hans or glue
    'case': 'lexical_overlap',    # combined or specific to respective itereval set
    'subcase': 'combined', # combined or specific to respective itereval set
    'label': 'non-entailment',   # combined or [entailment, neutral, contradiction] for glue, [entailment, non-entailment] for hans
}

# Summary table for that slice, restricted to rounds 1..iteration.
hans = get_itereval_summary(sub_keys, iteration, eval_dir, combined_iterations)

In [73]:
# Keep only the column of the selected round; the index holds the treatment names.
collected_best = hans[str(iteration)]

In [74]:
# Narrow the sampled itereval results to the selected round and to the slice described by sub_keys.
# NOTE(review): nothing here filters on `combined_iterations` — confirm the sampled rows apply to both settings.
round_errs = itereval_errs.loc[itereval_errs['iter'] == iteration, :]
dataset_errs = round_errs.loc[round_errs['dataset'] == sub_keys['dataset'], :]
case_errs = dataset_errs.loc[dataset_errs['case'] == sub_keys['case'], :]
subcase_errs = case_errs.loc[case_errs['subcase'] == sub_keys['subcase'], :]
collected_err = subcase_errs.loc[subcase_errs['label'] == sub_keys['label'], :]

In [75]:
# Build the long-form accuracy frame (one row per accuracy value) and the
# per-treatment accuracies taken from the summary table.
# NOTE(review): the summary-table accuracy is pooled with the sampled ones, so it
# also enters the t-tests computed from plot_arrays — confirm this is intended.
plot_arrays = []
plot_alltraining = []

for treat in collected_best.index.values:
    best_acc = collected_best[treat]
    sampled_accs = collected_err.loc[collected_err['treat'] == treat, 'acc']
    # Series.append was removed in pandas 2.0; pd.concat produces the same Series.
    accs = pd.concat([sampled_accs, pd.Series([best_acc])], ignore_index=True)
    accs = accs.to_frame(name='acc')
    accs['treat'] = treat

    plot_arrays.append(accs)
    plot_alltraining.append({'acc': best_acc, 'treat': treat})

plot_arrays = pd.concat(plot_arrays, ignore_index=True)
plot_alltraining = pd.DataFrame(plot_alltraining)

In [76]:
# Pairwise t-tests between treatments; prints t and the halved p-value for each pair.
ttests = get_ttest_pvals(plot_arrays)

('baseline', 'LotS')
t: -3.95429 | p: 0.00015
('baseline', 'LitL')
t: -6.30470 | p: 0.00000
('LotS', 'LitL')
t: -1.76212 | p: 0.04285


In [77]:
# Figure settings; the title names the setting, dataset, case and label of this slice.
ylim=[0.0,1.0]
title=f"{combined_iterations} | {sub_keys['dataset']} | {sub_keys['case']} | {sub_keys['label']}"
xlabel=''
ylabel='Accuracy'
tabletitle='Full Training Data'

figtype='jpg'

# Draw the box plot with its table and save it under plots_dir.
fig = single_round_box(plot_arrays, plot_alltraining, ylim=ylim, title=title, xlabel=xlabel, ylabel=ylabel, tabletitle=tabletitle)
fig.savefig(os.path.join(plots_dir, f"{sub_keys['dataset']}-{combined_iterations}-{sub_keys['case']}-{sub_keys['label']}.{figtype}"))

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

### subsequence

#### entailment

In [78]:
# Slice analysed in this subsection: HANS, subsequence case, entailment label.
sub_keys = {
    'dataset': 'hans',     # either hans or glue
    'case': 'subsequence',    # combined or specific to respective itereval set
    'subcase': 'combined', # combined or specific to respective itereval set
    'label': 'entailment',   # combined or [entailment, neutral, contradiction] for glue, [entailment, non-entailment] for hans
}

# Summary table for that slice, restricted to rounds 1..iteration.
hans = get_itereval_summary(sub_keys, iteration, eval_dir, combined_iterations)

In [79]:
# Keep only the column of the selected round; the index holds the treatment names.
collected_best = hans[str(iteration)]

In [80]:
# Narrow the sampled itereval results to the selected round and to the slice described by sub_keys.
# NOTE(review): nothing here filters on `combined_iterations` — confirm the sampled rows apply to both settings.
round_errs = itereval_errs.loc[itereval_errs['iter'] == iteration, :]
dataset_errs = round_errs.loc[round_errs['dataset'] == sub_keys['dataset'], :]
case_errs = dataset_errs.loc[dataset_errs['case'] == sub_keys['case'], :]
subcase_errs = case_errs.loc[case_errs['subcase'] == sub_keys['subcase'], :]
collected_err = subcase_errs.loc[subcase_errs['label'] == sub_keys['label'], :]

In [81]:
# Build the long-form accuracy frame (one row per accuracy value) and the
# per-treatment accuracies taken from the summary table.
# NOTE(review): the summary-table accuracy is pooled with the sampled ones, so it
# also enters the t-tests computed from plot_arrays — confirm this is intended.
plot_arrays = []
plot_alltraining = []

for treat in collected_best.index.values:
    best_acc = collected_best[treat]
    sampled_accs = collected_err.loc[collected_err['treat'] == treat, 'acc']
    # Series.append was removed in pandas 2.0; pd.concat produces the same Series.
    accs = pd.concat([sampled_accs, pd.Series([best_acc])], ignore_index=True)
    accs = accs.to_frame(name='acc')
    accs['treat'] = treat

    plot_arrays.append(accs)
    plot_alltraining.append({'acc': best_acc, 'treat': treat})

plot_arrays = pd.concat(plot_arrays, ignore_index=True)
plot_alltraining = pd.DataFrame(plot_alltraining)

In [82]:
# Pairwise t-tests between treatments; prints t and the halved p-value for each pair.
ttests = get_ttest_pvals(plot_arrays)

('baseline', 'LotS')
t: 2.56379 | p: 0.00711
('baseline', 'LitL')
t: 1.91568 | p: 0.03129
('LotS', 'LitL')
t: -1.80483 | p: 0.03932


In [83]:
# Figure settings; the title names the setting, dataset, case and label of this slice.
ylim=[0.0,1.0]
title=f"{combined_iterations} | {sub_keys['dataset']} | {sub_keys['case']} | {sub_keys['label']}"
xlabel=''
ylabel='Accuracy'
tabletitle='Full Training Data'

figtype='jpg'

# Draw the box plot with its table and save it under plots_dir.
fig = single_round_box(plot_arrays, plot_alltraining, ylim=ylim, title=title, xlabel=xlabel, ylabel=ylabel, tabletitle=tabletitle)
fig.savefig(os.path.join(plots_dir, f"{sub_keys['dataset']}-{combined_iterations}-{sub_keys['case']}-{sub_keys['label']}.{figtype}"))

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

#### non-entailment

In [84]:
# Slice analysed in this subsection: HANS, subsequence case, non-entailment label.
sub_keys = {
    'dataset': 'hans',     # either hans or glue
    'case': 'subsequence',    # combined or specific to respective itereval set
    'subcase': 'combined', # combined or specific to respective itereval set
    'label': 'non-entailment',   # combined or [entailment, neutral, contradiction] for glue, [entailment, non-entailment] for hans
}

# Summary table for that slice, restricted to rounds 1..iteration.
hans = get_itereval_summary(sub_keys, iteration, eval_dir, combined_iterations)

In [85]:
# Keep only the column of the selected round; the index holds the treatment names.
collected_best = hans[str(iteration)]

In [86]:
# Narrow the sampled itereval results to the selected round and to the slice described by sub_keys.
# NOTE(review): nothing here filters on `combined_iterations` — confirm the sampled rows apply to both settings.
round_errs = itereval_errs.loc[itereval_errs['iter'] == iteration, :]
dataset_errs = round_errs.loc[round_errs['dataset'] == sub_keys['dataset'], :]
case_errs = dataset_errs.loc[dataset_errs['case'] == sub_keys['case'], :]
subcase_errs = case_errs.loc[case_errs['subcase'] == sub_keys['subcase'], :]
collected_err = subcase_errs.loc[subcase_errs['label'] == sub_keys['label'], :]

In [87]:
# Build the long-form accuracy frame (one row per accuracy value) and the
# per-treatment accuracies taken from the summary table.
# NOTE(review): the summary-table accuracy is pooled with the sampled ones, so it
# also enters the t-tests computed from plot_arrays — confirm this is intended.
plot_arrays = []
plot_alltraining = []

for treat in collected_best.index.values:
    best_acc = collected_best[treat]
    sampled_accs = collected_err.loc[collected_err['treat'] == treat, 'acc']
    # Series.append was removed in pandas 2.0; pd.concat produces the same Series.
    accs = pd.concat([sampled_accs, pd.Series([best_acc])], ignore_index=True)
    accs = accs.to_frame(name='acc')
    accs['treat'] = treat

    plot_arrays.append(accs)
    plot_alltraining.append({'acc': best_acc, 'treat': treat})

plot_arrays = pd.concat(plot_arrays, ignore_index=True)
plot_alltraining = pd.DataFrame(plot_alltraining)

In [88]:
# Pairwise t-tests between treatments; prints t and the halved p-value for each pair.
ttests = get_ttest_pvals(plot_arrays)

('baseline', 'LotS')
t: -3.51222 | p: 0.00056
('baseline', 'LitL')
t: -8.34286 | p: 0.00000
('LotS', 'LitL')
t: -3.54752 | p: 0.00050


In [89]:
# Figure settings; the title names the setting, dataset, case and label of this slice.
ylim=[0.0,1.0]
title=f"{combined_iterations} | {sub_keys['dataset']} | {sub_keys['case']} | {sub_keys['label']}"
xlabel=''
ylabel='Accuracy'
tabletitle='Full Training Data'

figtype='jpg'

# Draw the box plot with its table and save it under plots_dir.
fig = single_round_box(plot_arrays, plot_alltraining, ylim=ylim, title=title, xlabel=xlabel, ylabel=ylabel, tabletitle=tabletitle)
fig.savefig(os.path.join(plots_dir, f"{sub_keys['dataset']}-{combined_iterations}-{sub_keys['case']}-{sub_keys['label']}.{figtype}"))

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

### constituent

#### entailment

In [90]:
# Keys selecting which itereval summary table to load; the same values are
# matched against the same-named columns of itereval_errs in a later cell.
sub_keys = dict(
    dataset='hans',        # 'hans' or 'glue'
    case='constituent',    # 'combined', or a case specific to the chosen itereval set
    subcase='combined',    # 'combined', or a subcase specific to the chosen itereval set
    label='entailment',    # 'combined'; glue: entailment / neutral / contradiction; hans: entailment / non-entailment
)

hans = get_itereval_summary(
    sub_keys,
    iteration,
    eval_dir,
    combined_iterations,
)

In [91]:
# Column of the summary table for the current iteration (column labels are
# strings). The cells below iterate its index as treatment names.
collected_best = hans[str(iteration)]

In [92]:
# Narrow the itereval error table one column at a time: current iteration
# first, then the dataset / case / subcase / label chosen in sub_keys.
round_errs = itereval_errs[itereval_errs['iter'] == iteration]
dataset_errs = round_errs[round_errs['dataset'] == sub_keys['dataset']]
case_errs = dataset_errs[dataset_errs['case'] == sub_keys['case']]
subcase_errs = case_errs[case_errs['subcase'] == sub_keys['subcase']]
collected_err = subcase_errs[subcase_errs['label'] == sub_keys['label']]

In [93]:
# Build the two tables passed to get_ttest_pvals / single_round_box below:
#   plot_arrays      - long format, one row per accuracy value, tagged by treatment
#   plot_alltraining - one row per treatment with the value from collected_best
plot_arrays = []
plot_alltraining = []

for treat in collected_best.index.values:
    # Accuracies recorded for this treatment in the filtered error table.
    accs = collected_err.loc[collected_err['treat'] == treat, 'acc']
    # The collected_best value is added to the same sample, so it is part of
    # what the t-tests and the box plot see.
    # Series.append was removed in pandas 2.0; pd.concat is the equivalent.
    accs = pd.concat([accs, pd.Series([collected_best[treat]])], ignore_index=True)
    accs = accs.to_frame(name='acc')
    accs['treat'] = treat

    plot_arrays.append(accs)
    plot_alltraining.append({'acc': collected_best[treat], 'treat': treat})

plot_arrays = pd.concat(plot_arrays, ignore_index=True)
plot_alltraining = pd.DataFrame(plot_alltraining)

In [94]:
# get_ttest_pvals is defined earlier in the notebook; its printed output
# lists a t statistic and p-value for each pair of treatments.
ttests = get_ttest_pvals(plot_arrays)

('baseline', 'LotS')
t: -3.96532 | p: 0.00015
('baseline', 'LitL')
t: 2.29191 | p: 0.01363
('LotS', 'LitL')
t: 5.85281 | p: 0.00000


In [95]:
# Plot settings for this dataset / case / label combination.
ylim = [0, 1.0]
title = f"{combined_iterations} | {sub_keys['dataset']} | {sub_keys['case']} | {sub_keys['label']}"
xlabel = ''
ylabel = 'Accuracy'
tabletitle = 'Full Training Data'

figtype = 'jpg'

# single_round_box is defined earlier in the notebook and returns the figure,
# which is then written into plots_dir.
fig = single_round_box(
    plot_arrays,
    plot_alltraining,
    ylim=ylim,
    title=title,
    xlabel=xlabel,
    ylabel=ylabel,
    tabletitle=tabletitle,
)
fig.savefig(
    os.path.join(
        plots_dir,
        f"{sub_keys['dataset']}-{combined_iterations}-{sub_keys['case']}-{sub_keys['label']}.{figtype}",
    )
)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

#### non-entailment

In [96]:
# Keys selecting which itereval summary table to load; the same values are
# matched against the same-named columns of itereval_errs in a later cell.
sub_keys = dict(
    dataset='hans',          # 'hans' or 'glue'
    case='constituent',      # 'combined', or a case specific to the chosen itereval set
    subcase='combined',      # 'combined', or a subcase specific to the chosen itereval set
    label='non-entailment',  # 'combined'; glue: entailment / neutral / contradiction; hans: entailment / non-entailment
)

hans = get_itereval_summary(
    sub_keys,
    iteration,
    eval_dir,
    combined_iterations,
)

In [97]:
# Column of the summary table for the current iteration (column labels are
# strings). The cells below iterate its index as treatment names.
collected_best = hans[str(iteration)]

In [98]:
# Narrow the itereval error table one column at a time: current iteration
# first, then the dataset / case / subcase / label chosen in sub_keys.
round_errs = itereval_errs[itereval_errs['iter'] == iteration]
dataset_errs = round_errs[round_errs['dataset'] == sub_keys['dataset']]
case_errs = dataset_errs[dataset_errs['case'] == sub_keys['case']]
subcase_errs = case_errs[case_errs['subcase'] == sub_keys['subcase']]
collected_err = subcase_errs[subcase_errs['label'] == sub_keys['label']]

In [99]:
# Build the two tables passed to get_ttest_pvals / single_round_box below:
#   plot_arrays      - long format, one row per accuracy value, tagged by treatment
#   plot_alltraining - one row per treatment with the value from collected_best
plot_arrays = []
plot_alltraining = []

for treat in collected_best.index.values:
    # Accuracies recorded for this treatment in the filtered error table.
    accs = collected_err.loc[collected_err['treat'] == treat, 'acc']
    # The collected_best value is added to the same sample, so it is part of
    # what the t-tests and the box plot see.
    # Series.append was removed in pandas 2.0; pd.concat is the equivalent.
    accs = pd.concat([accs, pd.Series([collected_best[treat]])], ignore_index=True)
    accs = accs.to_frame(name='acc')
    accs['treat'] = treat

    plot_arrays.append(accs)
    plot_alltraining.append({'acc': collected_best[treat], 'treat': treat})

plot_arrays = pd.concat(plot_arrays, ignore_index=True)
plot_alltraining = pd.DataFrame(plot_alltraining)

In [100]:
# get_ttest_pvals is defined earlier in the notebook; its printed output
# lists a t statistic and p-value for each pair of treatments.
ttests = get_ttest_pvals(plot_arrays)

('baseline', 'LotS')
t: 1.44716 | p: 0.07782
('baseline', 'LitL')
t: -0.67058 | p: 0.25317
('LotS', 'LitL')
t: -2.09205 | p: 0.02141


In [101]:
# Plot settings for this dataset / case / label combination.
ylim = [0, 1]
title = f"{combined_iterations} | {sub_keys['dataset']} | {sub_keys['case']} | {sub_keys['label']}"
xlabel = ''
ylabel = 'Accuracy'
tabletitle = 'Full Training Data'

figtype = 'jpg'

# single_round_box is defined earlier in the notebook and returns the figure,
# which is then written into plots_dir.
fig = single_round_box(
    plot_arrays,
    plot_alltraining,
    ylim=ylim,
    title=title,
    xlabel=xlabel,
    ylabel=ylabel,
    tabletitle=tabletitle,
)
fig.savefig(
    os.path.join(
        plots_dir,
        f"{sub_keys['dataset']}-{combined_iterations}-{sub_keys['case']}-{sub_keys['label']}.{figtype}",
    )
)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## MNLI

### Combined

In [102]:
# Value matched against the 'comb' column of the MNLI summary and error
# tables below; also used in the plot title and output file name.
combined = 'combined'

In [103]:
# Value matched against the 'tag' column of the MNLI summary and the 'genre'
# column of the MNLI error table below; also used in the title and file name.
genre = 'combined'

In [104]:
# The MNLI summary file holds one JSON object per line; parse each line and
# stack the resulting records into a DataFrame.
with open(mnli_summary, 'r') as f:
    summary = pd.DataFrame(list(map(json.loads, f)))

In [105]:
# Narrow the MNLI summary to the current iteration (compared as a string
# here), then to the chosen 'comb' setting and genre.
iter_summary = summary[summary['iter'] == str(iteration)]
comb_summary = iter_summary[iter_summary['comb'] == combined]
# Author's note kept from the original: CHANGE 'tag' to 'genre'.
genre_summary = comb_summary[comb_summary['tag'] == genre]

In [106]:
# Narrow the MNLI error table to the current iteration, then to the chosen
# 'comb' setting and genre.
iter_errs = mnli_errs[mnli_errs['iter'] == iteration]
comb_errs = iter_errs[iter_errs['comb'] == combined]
collected_err = comb_errs[comb_errs['genre'] == genre]

In [107]:
# Inspect the accuracy values collected for the 'LitL' treatment.
print(collected_err.loc[collected_err['treat'] == 'LitL', 'acc'])

0      0.899715
36     0.894528
72     0.888019
108    0.892596
144    0.898698
180    0.878967
216    0.899410
252    0.886697
288    0.901241
324    0.897274
Name: acc, dtype: float64


In [108]:
# Build the two tables passed to get_ttest_pvals / single_round_box below:
#   plot_arrays      - long format, one row per accuracy value, tagged by treatment
#   plot_alltraining - one row per treatment with the value from genre_summary
plot_arrays = []
plot_alltraining = []

order = ['baseline', 'LotS', 'LitL']

for treat in order:
    # Accuracies recorded for this treatment in the filtered error table.
    accs = collected_err.loc[collected_err['treat'] == treat, 'acc']
    # Matching entry from the filtered summary, looked up once and reused.
    full_acc = genre_summary.loc[genre_summary['treat'] == treat, 'acc']

    # The summary value is added to the same sample, so it is part of what
    # the t-tests and the box plot see.
    # Series.append was removed in pandas 2.0; pd.concat is the equivalent.
    accs = pd.concat([accs, full_acc], ignore_index=True)
    accs = accs.to_frame(name='acc')
    accs['treat'] = treat

    plot_arrays.append(accs)
    plot_alltraining.append({'acc': full_acc.values[0], 'treat': treat})

plot_arrays = pd.concat(plot_arrays, ignore_index=True)
plot_alltraining = pd.DataFrame(plot_alltraining)

In [109]:
# get_ttest_pvals is defined earlier in the notebook; its printed output
# lists a t statistic and p-value for each pair of treatments.
ttests = get_ttest_pvals(plot_arrays)

('baseline', 'LotS')
t: 0.15991 | p: 0.43728
('baseline', 'LitL')
t: 0.04122 | p: 0.48376
('LotS', 'LitL')
t: -0.11620 | p: 0.45433


In [110]:
# Plot settings for this MNLI 'comb' setting / genre combination.
ylim = [0.7, 1]
title = f'mnli | {combined} | {genre} genre'
xlabel = ''
ylabel = 'Accuracy'
tabletitle = 'Full Training Data'

figtype = 'jpg'

# single_round_box is defined earlier in the notebook and returns the figure,
# which is then written into plots_dir. tableon=False is passed for MNLI.
fig = single_round_box(
    plot_arrays,
    plot_alltraining,
    ylim=ylim,
    title=title,
    xlabel=xlabel,
    ylabel=ylabel,
    tabletitle=tabletitle,
    tableon=False,
)
fig.savefig(
    os.path.join(plots_dir, f'mnli-{combined}-{genre}.{figtype}')
)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

### Separate

In [111]:
# Value matched against the 'comb' column of the MNLI summary and error
# tables below; also used in the plot title and output file name.
combined = 'separate'

In [112]:
# Value matched against the 'tag' column of the MNLI summary and the 'genre'
# column of the MNLI error table below; also used in the title and file name.
genre = 'combined'

In [113]:
# The MNLI summary file holds one JSON object per line; parse each line and
# stack the resulting records into a DataFrame.
with open(mnli_summary, 'r') as f:
    summary = pd.DataFrame(list(map(json.loads, f)))

In [114]:
# Narrow the MNLI summary to the current iteration (compared as a string
# here), then to the chosen 'comb' setting and genre.
iter_summary = summary[summary['iter'] == str(iteration)]
comb_summary = iter_summary[iter_summary['comb'] == combined]
# Author's note kept from the original: CHANGE 'tag' to 'genre'.
genre_summary = comb_summary[comb_summary['tag'] == genre]

In [115]:
# Narrow the MNLI error table to the current iteration, then to the chosen
# 'comb' setting and genre.
iter_errs = mnli_errs[mnli_errs['iter'] == iteration]
comb_errs = iter_errs[iter_errs['comb'] == combined]
collected_err = comb_errs[comb_errs['genre'] == genre]

In [116]:
# Build the two tables passed to get_ttest_pvals / single_round_box below:
#   plot_arrays      - long format, one row per accuracy value, tagged by treatment
#   plot_alltraining - one row per treatment with the value from genre_summary
plot_arrays = []
plot_alltraining = []

order = ['baseline', 'LotS', 'LitL']

for treat in order:
    # Accuracies recorded for this treatment in the filtered error table.
    accs = collected_err.loc[collected_err['treat'] == treat, 'acc']
    # Matching entry from the filtered summary, looked up once and reused.
    full_acc = genre_summary.loc[genre_summary['treat'] == treat, 'acc']

    # The summary value is added to the same sample, so it is part of what
    # the t-tests and the box plot see.
    # Series.append was removed in pandas 2.0; pd.concat is the equivalent.
    accs = pd.concat([accs, full_acc], ignore_index=True)
    accs = accs.to_frame(name='acc')
    accs['treat'] = treat

    plot_arrays.append(accs)
    plot_alltraining.append({'acc': full_acc.values[0], 'treat': treat})

plot_arrays = pd.concat(plot_arrays, ignore_index=True)
plot_alltraining = pd.DataFrame(plot_alltraining)

In [117]:
# get_ttest_pvals is defined earlier in the notebook; its printed output
# lists a t statistic and p-value for each pair of treatments.
ttests = get_ttest_pvals(plot_arrays)

('baseline', 'LotS')
t: 0.10853 | p: 0.45733
('baseline', 'LitL')
t: 0.06274 | p: 0.47530
('LotS', 'LitL')
t: -0.04505 | p: 0.48226


In [118]:
# Plot settings for this MNLI 'comb' setting / genre combination.
ylim = [0.7, 1]
title = f'mnli | {combined} | {genre} genre'
xlabel = ''
ylabel = 'Accuracy'
tabletitle = 'Full Training Data'

figtype = 'jpg'

# single_round_box is defined earlier in the notebook and returns the figure,
# which is then written into plots_dir. tableon=False is passed for MNLI.
fig = single_round_box(
    plot_arrays,
    plot_alltraining,
    ylim=ylim,
    title=title,
    xlabel=xlabel,
    ylabel=ylabel,
    tabletitle=tabletitle,
    tableon=False,
)
fig.savefig(
    os.path.join(plots_dir, f'mnli-{combined}-{genre}.{figtype}')
)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …