In [1]:
import pandas as pd
import scipy
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [2]:
blimp_corpus_map = {
    "full": [],
    "pp-mod-subj": ["distractor_agreement_relational_noun"],
    "rel-cl": ["distractor_agreement_relative_clause"],
    "re-irr-sv-agr": [
        "irregular_plural_subject_verb_agreement_1",
        "irregular_plural_subject_verb_agreement_2",
        "regular_plural_subject_verb_agreement_1",
        "regular_plural_subject_verb_agreement_2",
    ],
    "npi-only": ["only_npi_licensor_present", "only_npi_scope"],
    "npi-sent-neg": [
        "sentential_negation_npi_licensor_present",
        "sentential_negation_npi_scope",
    ],
    "npi-sim-ques": ["matrix_question_npi_licensor_present"],
    "superlative-quantifier": [
        "superlative_quantifiers_1",
        "superlative_quantifiers_2",
    ],
    "existential-there-quantifier": ["existential_there_quantifiers_1"],
    "binding-c-command": ["principle_A_c_command"],
    "binding-case": ["principle_A_case_1", "principle_A_case_2"],
    "binding-domain": [
        "principle_A_domain_1",
        "principle_A_domain_2",
        "principle_A_domain_3",
    ],
    "binding-reconstruction": ["principle_A_reconstruction"],
    "passive": ["passive_1", "passive_2"],
    "det-adj-noun": [
        "determiner_noun_agreement_with_adjective_1",
        "determiner_noun_agreement_with_adj_2",
        "determiner_noun_agreement_with_adj_irregular_1",
        "determiner_noun_agreement_with_adj_irregular_2",
    ],
    "det-noun": [
        "determiner_noun_agreement_1",
        "determiner_noun_agreement_2",
        "determiner_noun_agreement_irregular_1",
        "determiner_noun_agreement_irregular_2",
    ],
}
all_filters = set(blimp_corpus_map.keys())
benchmarks_with_filters = set([_ for xs in blimp_corpus_map.values() for _ in xs])
print(all_filters)
print(benchmarks_with_filters)

{'binding-reconstruction', 'binding-domain', 'rel-cl', 'npi-sim-ques', 'binding-c-command', 'binding-case', 'passive', 'existential-there-quantifier', 'det-noun', 'npi-only', 'pp-mod-subj', 'full', 're-irr-sv-agr', 'det-adj-noun', 'npi-sent-neg', 'superlative-quantifier'}
{'sentential_negation_npi_scope', 'existential_there_quantifiers_1', 'matrix_question_npi_licensor_present', 'superlative_quantifiers_1', 'principle_A_case_1', 'principle_A_domain_1', 'principle_A_domain_3', 'principle_A_case_2', 'determiner_noun_agreement_with_adj_2', 'distractor_agreement_relative_clause', 'irregular_plural_subject_verb_agreement_2', 'determiner_noun_agreement_with_adj_irregular_1', 'determiner_noun_agreement_with_adjective_1', 'irregular_plural_subject_verb_agreement_1', 'regular_plural_subject_verb_agreement_2', 'sentential_negation_npi_licensor_present', 'principle_A_c_command', 'principle_A_reconstruction', 'superlative_quantifiers_2', 'determiner_noun_agreement_irregular_1', 'principle_A_domain

# Read the main data

In [3]:
main_data = pd.read_csv("data/tidy_results.csv", index_col=0)
main_data = main_data.rename(columns = {"blimp_delta_all_seed_avg": "blimp_delta"})
print(main_data.columns)
print(main_data)

filter_target_data = main_data[main_data["filter_target"] == True]
print(filter_target_data)

Index(['corpus', 'arch', 'seed', 'validation_loss', 'validation_ppl',
       'test_loss', 'test_ppl', 'blimp_benchmark', 'blimp_acc',
       'filter_target', 'z_validation_ppl', 'z_validation_loss', 'z_test_ppl',
       'z_test_loss', 'full_same_seed_acc', 'blimp_delta_same_seed',
       'full_all_seed_avg_acc', 'blimp_delta', 'field', 'linguistics_term',
       'corpus_tokens'],
      dtype='object')
        corpus         arch  seed  validation_loss  validation_ppl  test_loss  \
0         full  transformer     0         3.841752       46.607061   3.845174   
1         full  transformer     1         3.840523       46.549825   3.844156   
2         full  transformer     2         3.840050       46.527789   3.843551   
3         full  transformer     3         3.841511       46.595850   3.844735   
4         full  transformer     4         3.842284       46.631870   3.846043   
...        ...          ...   ...              ...             ...        ...   
10715  passive         lstm 

# Basic descriptive stats

In [4]:
# group data by corpus and architecture, take the mean test ppl across seeds
data_by_arch_and_corpus = main_data.groupby(["corpus", "arch"])

In [5]:
def cohen_d(group1: pd.Series, group2: pd.Series) -> float:
    n1, n2 = len(group1), len(group2)
    s1, s2 = group1.std(), group2.std()
    s = ((n1 - 1) * s1 ** 2 + (n2 - 1) * s2 ** 2) / (n1 + n2 - 2)
    return (group1.mean() - group2.mean()) / (s ** 0.5)

## Perplexities

In [6]:
ppls_by_arch = data_by_arch_and_corpus["test_ppl"].mean()
# ppls_by_arch = filter_target_data.groupby(["corpus", "arch"])["test_ppl"].mean()
# get the difference between lstm and transformer for each corpus
ppl_unstacked = ppls_by_arch.unstack(level="arch")

# print the mean ppl for each architecture
print(ppl_unstacked.mean())
print(scipy.stats.ttest_rel(ppl_unstacked["lstm"], ppl_unstacked["transformer"]))
print(cohen_d(ppl_unstacked["lstm"], ppl_unstacked["transformer"]))


arch
lstm           53.557030
transformer    47.132456
dtype: float64
TtestResult(statistic=270.8441367109314, pvalue=4.324417318161816e-29, df=15)
9.14590781117233


## BLiMP Accuracies

## All data

In [7]:
accs_by_arch = data_by_arch_and_corpus["blimp_acc"].mean()
accs_unstacked = accs_by_arch.unstack(level="arch")
print(accs_unstacked.mean())
print(scipy.stats.ttest_rel(accs_unstacked["lstm"], accs_unstacked["transformer"]))
print(cohen_d(accs_unstacked["lstm"], accs_unstacked["transformer"]))

arch
lstm           0.704122
transformer    0.718575
dtype: float64
TtestResult(statistic=-17.379131652852433, pvalue=2.3895394371004496e-11, df=15)
-3.243116580968632


## Filtered target data

In [8]:
filtered_accs_by_arch = filter_target_data.groupby(["corpus", "arch"])["blimp_acc"].mean()
filtered_accs_unstacked = filtered_accs_by_arch.unstack(level="arch")
print(filtered_accs_unstacked.mean())
print(scipy.stats.ttest_rel(filtered_accs_unstacked["lstm"], filtered_accs_unstacked["transformer"]))
print(cohen_d(filtered_accs_unstacked["lstm"], filtered_accs_unstacked["transformer"]))

arch
lstm           0.667328
transformer    0.688174
dtype: float64
TtestResult(statistic=-1.1791799116654738, pvalue=0.25797842647470387, df=14)
-0.11736564402587392


# Next steps

In [9]:
mixed_test_ppl_model = smf.mixedlm("test_ppl ~ corpus_tokens", main_data, groups=main_data["arch"])
mixed_test_ppl_result = mixed_test_ppl_model.fit()
print(mixed_test_ppl_result.summary())

fixed_test_ppl_model = smf.ols("test_ppl ~ corpus_tokens + arch + corpus_tokens*arch", main_data)
fixed_test_ppl_result = fixed_test_ppl_model.fit()
print(fixed_test_ppl_result.summary())

print(sm.stats.anova_lm(fixed_test_ppl_result, mixed_test_ppl_result))

lstm_test_ppl_model = smf.ols("test_ppl ~ corpus_tokens", main_data[main_data["arch"] == "lstm"])
lstm_test_ppl_result = lstm_test_ppl_model.fit()
print(lstm_test_ppl_result.summary())

print(scipy.stats.pearsonr(main_data[main_data["arch"]=="lstm"]["corpus_tokens"], main_data[main_data["arch"]=="lstm"]["test_ppl"]))

            Mixed Linear Model Regression Results
Model:              MixedLM   Dependent Variable:   test_ppl 
No. Observations:   10720     Method:               REML     
No. Groups:         2         Scale:                0.0268   
Min. group size:    5360      Log-Likelihood:       4158.2101
Max. group size:    5360      Converged:            Yes      
Mean group size:    5360.0                                   
-------------------------------------------------------------
               Coef.  Std.Err.    z     P>|z|  [0.025  0.975]
-------------------------------------------------------------
Intercept     110.442    2.130   51.849 0.000 106.267 114.617
corpus_tokens  -0.000    0.000 -419.795 0.000  -0.000  -0.000
Group Var       9.034   41.311                               

                            OLS Regression Results                            
Dep. Variable:               test_ppl   R-squared:                       0.998
Model:                            OLS   Adj. R-

AttributeError: 'MixedLMResults' object has no attribute 'ssr'