In [35]:
import pandas as pd
import scipy
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [2]:
# Maps each corpus/filter name to the list of BLiMP benchmark names associated
# with it. "full" maps to an empty list, i.e. it has no associated benchmarks.
blimp_corpus_map = {
    "full": [],
    "pp-mod-subj": ["distractor_agreement_relational_noun"],
    "rel-cl": ["distractor_agreement_relative_clause"],
    "re-irr-sv-agr": [
        "irregular_plural_subject_verb_agreement_1",
        "irregular_plural_subject_verb_agreement_2",
        "regular_plural_subject_verb_agreement_1",
        "regular_plural_subject_verb_agreement_2",
    ],
    "npi-only": ["only_npi_licensor_present", "only_npi_scope"],
    "npi-sent-neg": [
        "sentential_negation_npi_licensor_present",
        "sentential_negation_npi_scope",
    ],
    "npi-sim-ques": ["matrix_question_npi_licensor_present"],
    "superlative-quantifier": [
        "superlative_quantifiers_1",
        "superlative_quantifiers_2",
    ],
    "existential-there-quantifier": ["existential_there_quantifiers_1"],
    "binding-c-command": ["principle_A_c_command"],
    "binding-case": ["principle_A_case_1", "principle_A_case_2"],
    "binding-domain": [
        "principle_A_domain_1",
        "principle_A_domain_2",
        "principle_A_domain_3",
    ],
    "binding-reconstruction": ["principle_A_reconstruction"],
    "passive": ["passive_1", "passive_2"],
    # NOTE(review): "..._with_adjective_1" vs "..._with_adj_2" looks inconsistent;
    # presumably it mirrors the upstream benchmark names -- confirm before "fixing".
    "det-adj-noun": [
        "determiner_noun_agreement_with_adjective_1",
        "determiner_noun_agreement_with_adj_2",
        "determiner_noun_agreement_with_adj_irregular_1",
        "determiner_noun_agreement_with_adj_irregular_2",
    ],
    "det-noun": [
        "determiner_noun_agreement_1",
        "determiner_noun_agreement_2",
        "determiner_noun_agreement_irregular_1",
        "determiner_noun_agreement_irregular_2",
    ],
}

# Every corpus/filter name, including "full".
all_filters = set(blimp_corpus_map)
# Every benchmark that is listed under at least one filter.
benchmarks_with_filters = {
    benchmark
    for benchmarks in blimp_corpus_map.values()
    for benchmark in benchmarks
}

# Print in sorted order: raw set iteration order for strings changes between
# kernel sessions, which would make the saved cell output irreproducible.
print(sorted(all_filters))
print(sorted(benchmarks_with_filters))

{'binding-reconstruction', 'npi-sent-neg', 'npi-only', 'binding-case', 'passive', 'superlative-quantifier', 'pp-mod-subj', 'existential-there-quantifier', 'det-adj-noun', 'full', 'det-noun', 're-irr-sv-agr', 'binding-domain', 'rel-cl', 'binding-c-command', 'npi-sim-ques'}
{'determiner_noun_agreement_1', 'principle_A_domain_1', 'determiner_noun_agreement_with_adjective_1', 'determiner_noun_agreement_with_adj_irregular_2', 'sentential_negation_npi_licensor_present', 'regular_plural_subject_verb_agreement_1', 'principle_A_case_2', 'passive_1', 'determiner_noun_agreement_irregular_2', 'regular_plural_subject_verb_agreement_2', 'principle_A_c_command', 'determiner_noun_agreement_with_adj_irregular_1', 'matrix_question_npi_licensor_present', 'only_npi_scope', 'sentential_negation_npi_scope', 'determiner_noun_agreement_with_adj_2', 'principle_A_case_1', 'principle_A_domain_3', 'existential_there_quantifiers_1', 'passive_2', 'superlative_quantifiers_2', 'irregular_plural_subject_verb_agreement

# Read the main data

In [22]:
# Load the results table (the first CSV column becomes the index) and expose
# the `blimp_delta_all_seed_avg` column under the shorter name `blimp_delta`.
main_data = pd.read_csv("data/tidy_results.csv", index_col=0).rename(
    columns={"blimp_delta_all_seed_avg": "blimp_delta"}
)
# List the column names, then show the frame itself as the cell's output.
print(main_data.columns)
main_data

Index(['corpus', 'arch', 'seed', 'validation_loss', 'validation_ppl',
       'test_loss', 'test_ppl', 'blimp_benchmark', 'blimp_acc',
       'filter_target', 'z_validation_ppl', 'z_validation_loss', 'z_test_ppl',
       'z_test_loss', 'full_same_seed_acc', 'blimp_delta_same_seed',
       'full_all_seed_avg_acc', 'blimp_delta', 'field', 'linguistics_term',
       'corpus_tokens'],
      dtype='object')
Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/Users/shanest/opt/anaconda3/envs/corpus-filtering/lib/python3.9/site-packages/IPython/core/formatters.py", line 223, in catch_format_error
    r = method(self, *args, **kwargs)
  File "/Users/shanest/opt/anaconda3/envs/corpus-filtering/lib/python3.9/site-packages/IPython/core/formatters.py", line 344, in __call__
    return method()
  File "/Users/shanest/opt/anaconda3/envs/corpus-filtering/lib/python3.9/site-packages/pandas/core/frame.py", line 1106, in _repr_html_
  File "/Users/shanest/opt/anaconda3/envs/corpus-filtering/lib/python3.9/site-packages/pandas/io/formats/format.py", line 1110, in to_html
    When formatting an Index subclass
  File "/Users/shanest/opt/anaconda3/envs/corpus-filtering/lib/python3.9/site-packages/pandas/io/formats/html.py", line 88, in to_string
    lines = self.render()
  File "/Users/shanest/opt/anaconda3/envs/corpus-filtering/lib/python3.9/site-packages/pandas/io/formats/html.py", line 644, in render
    super(

        corpus         arch  seed  validation_loss  validation_ppl  test_loss  \
0         full  transformer     0         3.841752       46.607061   3.845174   
1         full  transformer     1         3.840523       46.549825   3.844156   
2         full  transformer     2         3.840050       46.527789   3.843551   
3         full  transformer     3         3.841511       46.595850   3.844735   
4         full  transformer     4         3.842284       46.631870   3.846043   
...        ...          ...   ...              ...             ...        ...   
10715  passive         lstm     0         3.975108       53.255876   3.977766   
10716  passive         lstm     1         3.975204       53.261006   3.978100   
10717  passive         lstm     2         3.974197       53.207382   3.977306   
10718  passive         lstm     3         3.975786       53.291974   3.978186   
10719  passive         lstm     4         3.977474       53.382045   3.980730   

        test_ppl           

In [36]:
# Mean validation perplexity for every (corpus, arch) pair.
ppls_by_arch = main_data.groupby(["corpus", "arch"])["validation_ppl"].mean()
# Pivot so each corpus is a row and each architecture is a column; this lines
# the two architectures up corpus-by-corpus for the paired test below.
ppl_unstacked = ppls_by_arch.unstack(level="arch")

# Mean perplexity per architecture, averaged over corpora.
# `Series.mean(level=...)` was deprecated in pandas 1.3 and removed in 2.0;
# grouping on the index level is the supported equivalent.
print(ppls_by_arch.groupby(level="arch").mean())
# Paired t-test: LSTM vs. transformer perplexity, paired by corpus.
print(scipy.stats.ttest_rel(ppl_unstacked["lstm"], ppl_unstacked["transformer"]))


arch
lstm           53.403396
transformer    46.970552
Name: validation_ppl, dtype: float64
TtestResult(statistic=271.4049864115043, pvalue=4.192319978874203e-29, df=15)


  print(ppls_by_arch.mean(level="arch"))
