In [2]:
import pandas as pd
import scipy
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [3]:
blimp_corpus_map = {
    "full": [],
    "pp-mod-subj": ["distractor_agreement_relational_noun"],
    "rel-cl": ["distractor_agreement_relative_clause"],
    "re-irr-sv-agr": [
        "irregular_plural_subject_verb_agreement_1",
        "irregular_plural_subject_verb_agreement_2",
        "regular_plural_subject_verb_agreement_1",
        "regular_plural_subject_verb_agreement_2",
    ],
    "npi-only": ["only_npi_licensor_present", "only_npi_scope"],
    "npi-sent-neg": [
        "sentential_negation_npi_licensor_present",
        "sentential_negation_npi_scope",
    ],
    "npi-sim-ques": ["matrix_question_npi_licensor_present"],
    "superlative-quantifier": [
        "superlative_quantifiers_1",
        "superlative_quantifiers_2",
    ],
    "existential-there-quantifier": ["existential_there_quantifiers_1"],
    "binding-c-command": ["principle_A_c_command"],
    "binding-case": ["principle_A_case_1", "principle_A_case_2"],
    "binding-domain": [
        "principle_A_domain_1",
        "principle_A_domain_2",
        "principle_A_domain_3",
    ],
    "binding-reconstruction": ["principle_A_reconstruction"],
    "passive": ["passive_1", "passive_2"],
    "det-adj-noun": [
        "determiner_noun_agreement_with_adjective_1",
        "determiner_noun_agreement_with_adj_2",
        "determiner_noun_agreement_with_adj_irregular_1",
        "determiner_noun_agreement_with_adj_irregular_2",
    ],
    "det-noun": [
        "determiner_noun_agreement_1",
        "determiner_noun_agreement_2",
        "determiner_noun_agreement_irregular_1",
        "determiner_noun_agreement_irregular_2",
    ],
}
all_filters = set(blimp_corpus_map.keys())
benchmarks_with_filters = set([_ for xs in blimp_corpus_map.values() for _ in xs])
print(all_filters)
print(benchmarks_with_filters)

{'npi-sent-neg', 'npi-only', 'npi-sim-ques', 'existential-there-quantifier', 'pp-mod-subj', 'binding-c-command', 'binding-case', 'rel-cl', 'binding-reconstruction', 'det-noun', 'full', 'det-adj-noun', 'passive', 'binding-domain', 're-irr-sv-agr', 'superlative-quantifier'}
{'superlative_quantifiers_2', 'distractor_agreement_relative_clause', 'principle_A_c_command', 'distractor_agreement_relational_noun', 'regular_plural_subject_verb_agreement_1', 'irregular_plural_subject_verb_agreement_1', 'determiner_noun_agreement_with_adj_irregular_1', 'principle_A_case_2', 'sentential_negation_npi_scope', 'principle_A_domain_3', 'principle_A_case_1', 'determiner_noun_agreement_irregular_2', 'determiner_noun_agreement_with_adj_irregular_2', 'regular_plural_subject_verb_agreement_2', 'determiner_noun_agreement_with_adjective_1', 'matrix_question_npi_licensor_present', 'determiner_noun_agreement_1', 'only_npi_licensor_present', 'determiner_noun_agreement_irregular_1', 'determiner_noun_agreement_2', '

# Read the main data

In [4]:
main_data = pd.read_csv("data/tidy_results.csv", index_col=0)
main_data = main_data.rename(columns = {"blimp_delta_all_seed_avg": "blimp_delta"})
print(main_data.columns)
print(main_data)

filter_target_data = main_data[main_data["filter_target"] == True]
print(filter_target_data)

Index(['corpus', 'arch', 'seed', 'validation_loss', 'validation_ppl',
       'test_loss', 'test_ppl', 'blimp_benchmark', 'blimp_acc',
       'filter_target', 'z_validation_ppl', 'z_validation_loss', 'z_test_ppl',
       'z_test_loss', 'full_same_seed_acc', 'blimp_delta_same_seed',
       'full_all_seed_avg_acc', 'blimp_delta', 'field', 'linguistics_term',
       'corpus_tokens'],
      dtype='object')
        corpus         arch  seed  validation_loss  validation_ppl  test_loss  \
0         full  transformer     0         3.841752       46.607061   3.845174   
1         full  transformer     1         3.840523       46.549825   3.844156   
2         full  transformer     2         3.840050       46.527789   3.843551   
3         full  transformer     3         3.841511       46.595850   3.844735   
4         full  transformer     4         3.842284       46.631870   3.846043   
...        ...          ...   ...              ...             ...        ...   
10715  passive         lstm 

# Basic descriptive stats

In [5]:
# group data by corpus and architecture, take the mean test ppl across seeds
data_by_arch_and_corpus = main_data.groupby(["corpus", "arch"])

In [6]:
def cohen_d(group1: pd.Series, group2: pd.Series) -> float:
    n1, n2 = len(group1), len(group2)
    s1, s2 = group1.std(), group2.std()
    s = ((n1 - 1) * s1 ** 2 + (n2 - 1) * s2 ** 2) / (n1 + n2 - 2)
    return (group1.mean() - group2.mean()) / (s ** 0.5)

## Perplexities

In [7]:
ppls_by_arch = data_by_arch_and_corpus["test_ppl"].mean()
# ppls_by_arch = filter_target_data.groupby(["corpus", "arch"])["test_ppl"].mean()
# get the difference between lstm and transformer for each corpus
ppl_unstacked = ppls_by_arch.unstack(level="arch")

# print the mean ppl for each architecture
print(ppl_unstacked.mean())
print(scipy.stats.ttest_rel(ppl_unstacked["lstm"], ppl_unstacked["transformer"]))
print(cohen_d(ppl_unstacked["lstm"], ppl_unstacked["transformer"]))


arch
lstm           53.557030
transformer    47.132456
dtype: float64
TtestResult(statistic=270.8441367109314, pvalue=4.324417318161816e-29, df=15)
9.14590781117233


Correlation between ppl and corpus tokens by architecture:

In [11]:
print(scipy.stats.pearsonr(main_data[main_data["arch"]=="lstm"]["corpus_tokens"], main_data[main_data["arch"]=="lstm"]["test_ppl"]))
print(scipy.stats.pearsonr(main_data[main_data["arch"]=="transformer"]["corpus_tokens"], main_data[main_data["arch"]=="transformer"]["test_ppl"]))

PearsonRResult(statistic=-0.9694350791008717, pvalue=0.0)
PearsonRResult(statistic=-0.9755968743080783, pvalue=0.0)


## BLiMP Accuracies

## All data

In [8]:
accs_by_arch = data_by_arch_and_corpus["blimp_acc"].mean()
accs_unstacked = accs_by_arch.unstack(level="arch")
print(accs_unstacked.mean())
print(scipy.stats.ttest_rel(accs_unstacked["lstm"], accs_unstacked["transformer"]))
print(cohen_d(accs_unstacked["lstm"], accs_unstacked["transformer"]))

arch
lstm           0.704122
transformer    0.718575
dtype: float64
TtestResult(statistic=-17.379131652852433, pvalue=2.3895394371004496e-11, df=15)
-3.243116580968632


## Filtered target data

In [9]:
filtered_accs_by_arch = filter_target_data.groupby(["corpus", "arch"])["blimp_acc"].mean()
filtered_accs_unstacked = filtered_accs_by_arch.unstack(level="arch")
print(filtered_accs_unstacked.mean())
print(scipy.stats.ttest_rel(filtered_accs_unstacked["lstm"], filtered_accs_unstacked["transformer"]))
print(cohen_d(filtered_accs_unstacked["lstm"], filtered_accs_unstacked["transformer"]))

arch
lstm           0.667328
transformer    0.688174
dtype: float64
TtestResult(statistic=-1.1791799116654738, pvalue=0.25797842647470387, df=14)
-0.11736564402587392


## Regression Analysis

Here's an attempt at a regression that attempts to decipher what factors are and are not responsible for the accuracy deltas.   Let me know what you think and/or what I'm forgetting!

Rough summary: architecture _on its own_ is not significant!  Neither is _test perplexity_!  The only consistent factor is _filter-target_ and all of the interactions with it.  Does this make sense?  I'm also not sure this is the best specification of the predictors for the model.  I did random intercepts for each combination of corpus and benchmark; I found similar things when only doing random intercepts for corpus.

In [24]:
main_data["corpus-and-benchmark"] = main_data["corpus"] + "-" + main_data["blimp_benchmark"]
full_regression = smf.mixedlm("blimp_delta ~ corpus_tokens + test_ppl*filter_target*arch", main_data, groups=main_data["corpus-and-benchmark"])
full_regression_result = full_regression.fit()
print(full_regression_result.summary())


                            Mixed Linear Model Regression Results
Model:                        MixedLM             Dependent Variable:             blimp_delta
No. Observations:             10720               Method:                         REML       
No. Groups:                   16                  Scale:                          0.0014     
Min. group size:              670                 Log-Likelihood:                 19977.0387 
Max. group size:              670                 Converged:                      Yes        
Mean group size:              670.0                                                          
---------------------------------------------------------------------------------------------
                                                   Coef.  Std.Err.   z    P>|z| [0.025 0.975]
---------------------------------------------------------------------------------------------
Intercept                                           0.365    0.438  0.833 0.405 -0.494  

