In [2]:
import sys
# Extend the module search path with the sibling Util folder; the local
# `evaluation` / `preparation` modules imported next presumably live there.
sys.path.append('../Util')

In [3]:
from evaluation import aabcc, sig_props, correlation, lr, perceptron, kmeans_1dim, \
                     score_comparison, run_tests, report, dimensions_report, repeated_dimensions, \
                    kmeans_multi_dim
from preparation import prepare_dataset, read_datasets

In [4]:
from scipy.stats import f_oneway

In [5]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import mean_absolute_error, accuracy_score

In [6]:
import pandas as pd
import numpy as np
from scipy.stats import shapiro
import seaborn as sns
import matplotlib.pyplot as plt

In [7]:
import warnings
# Silence every warning for the rest of the session. This keeps cell output short,
# but it also hides deprecation and numerical warnings from the libraries used below.
warnings.filterwarnings('ignore')

Because the word-embedding (WE) files are large, they are not uploaded to GitHub; they can be downloaded [here](https://drive.google.com/drive/folders/10Ea62GRlq4t7bq-nK9tPtYFu0kbCciey?usp=sharing).

The code below expects a folder "Data" in the root folder containing all the information from the Google Drive.

In [8]:
# Checkpoints compared in this notebook. Each entry pairs the checkpoint name
# with the short label used for data look-up and in every printed table.
models = [
    {'name': checkpoint, 'label': short_label}
    for checkpoint, short_label in (
        ('flaubert/flaubert_small_cased', 'flau_small_c'),
        ('flaubert/flaubert_base_uncased', 'flau_base_u'),
        ('flaubert/flaubert_base_cased', 'flau_base_c'),
        ('flaubert/flaubert_large_cased', 'flau_large_c'),
        ('camembert/camembert-base', 'cam_base'),
        ('xlm-roberta-large', 'xlm_large'),
        ('xlm-roberta-base', 'xlm_base'),
        ('bert-base-multilingual-uncased', 'bert_base_u'),
        ('distilbert-base-multilingual-cased', 'distilbert_base'),
        ('bert-base-multilingual-cased', 'bert_base_c'),
    )
]

In [9]:
# Short labels only, in the same order as `models`.
labels = [entry['label'] for entry in models]

In [10]:
# Load the embedding-plus-features file for every model label from ../Data.
# The result is indexed positionally below, so it is presumably one dataset per label, in order.
we_with_features = read_datasets(path='../Data', model_labels=labels, file_name='all_unique_pos_we.csv')

In [11]:
# Preview the first rows of the last loaded dataset.
we_with_features[-1].head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,764,765,766,767,Number,Gender,Lemma,POS,Tense,Person
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2D,0.370462,-0.543173,0.313777,0.114062,0.105755,0.101263,0.047496,0.941036,-0.635094,0.35125,...,0.262812,-0.034462,0.003319,-0.33907,invariable,feminine,2D,NOUN,,
3D,0.35464,-0.545033,0.300138,0.125182,0.109215,0.103105,0.037388,0.945384,-0.627031,0.362873,...,0.266173,-0.040272,-0.011564,-0.345399,invariable,feminine,3D,NOUN,,
aa,0.350173,-0.535272,0.285707,0.138325,0.116959,0.096526,0.028733,0.941748,-0.610729,0.365452,...,0.266548,-0.035119,-0.017618,-0.364172,invariable,masculine,aa,NOUN,,
abandon,0.348776,-0.534598,0.285159,0.137568,0.115843,0.09603,0.028422,0.942982,-0.611038,0.366652,...,0.266958,-0.034558,-0.017089,-0.363497,singular,masculine,abandon,NOUN,,
abbaye,0.342727,-0.537595,0.289408,0.12858,0.12162,0.101963,0.029359,0.952322,-0.611914,0.365287,...,0.274898,-0.035381,-0.019363,-0.353987,singular,feminine,abbaye,NOUN,,


In [12]:
# There are 6 extra features in addition to the embedding dimensions in the file: number, gender, lemma, pos, tense,
# person
feature_col_count = 6

# Feature to investigate in this notebook
feature = 'Gender'

# Methodology

1. Exclude datapoints with undefined feature values (e.g. Gender = `invariable`)
2. Encode a grammatical feature as binary (e.g. Gender = 0 if masculine and 1 if feminine) (feature vector)
3. Shuffle the data set
4. Separate the dataset into 80% "training" and 20% test data
5. For each dimension in the training dataset, measure whether the dimension values depend on the grammatical feature:
* Using [ANOVA](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.f_oneway.html): The data is split into 2 samples, all dimension values when the grammatical feature is 0 and when it's equal to 1.
* Using [Mutual Information](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.mutual_info_classif.html)
6. For each dimension highlighted during step #5, find the medians (on the training dataset) for 2 subgroups: when the grammatical feature == 0 and when it == 1.
7. For each word in the test dataset, predict the label by computing the MAE between its values and each subgroup's medians from #6, and choosing the subgroup with the smaller MAE.
8. Compute accuracy on the test dataset.

We assume that achieved accuracies can be an efficient way of comparing the quality of grammatical information encoding in the word embeddings.

The experiments for `Gender` and `Number` will be performed for "nouns only", "adjectives only" and "nouns and adjectives" combined.

# Nouns

In [13]:
# We start the experiment with nouns only; `pos` is the part-of-speech filter
# applied to every dataset in the preparation loop.
pos = ['NOUN']

In [14]:
# Per-model containers for the noun experiment; entry i belongs to we_with_features[i].
X_noun_train = []
y_noun_train = []

X_noun_test = []
y_noun_test = []

In [15]:
for embeddings in we_with_features:
    # Keep only rows of the selected POS whose Gender and Number are both defined.
    keep_rows = (
        (embeddings.Gender != 'invariable')
        & (embeddings.Number != 'invariable')
        & (embeddings.POS.isin(pos))
    )
    # prepare_dataset returns, in order: X train, X test, y train, y test.
    train_X, test_X, train_y, test_y = prepare_dataset(
        dataset=embeddings[keep_rows],
        feature_col_count=feature_col_count,
        feature_name=feature,
        encode=True,
        split=True,
    )
    X_noun_train.append(train_X)
    y_noun_train.append(train_y)
    X_noun_test.append(test_X)
    y_noun_test.append(test_y)

### Compute ANOVA

We split each dimension into 2 samples: feminine nouns and masculine nouns. ANOVA test is used to assess if the population means are the same.

If the population means are not the same, we can make an assumption that the gender information affects the distribution of values in the given dimension.

If p-value < 0.01, reject the null hypothesis.

In [16]:
# Significance level for the per-dimension ANOVA test: a dimension is kept when its p-value is below it.
pv_threshold = 0.01

In [17]:
# One list of ANOVA-selected dimension names per model (noun experiment).
anova_dims = []

In [18]:
for idx, _ in enumerate(models):
    train_X = X_noun_train[idx]
    # Positional class masks, computed once per model and reused for every dimension.
    targets = np.asarray(y_noun_train[idx])
    in_class0, in_class1 = targets == 0, targets == 1

    significant_dims = []
    for dim in train_X.columns:
        values = train_X[dim].to_numpy()
        # One-way ANOVA between the two gender groups for this dimension.
        if f_oneway(values[in_class0], values[in_class1]).pvalue < pv_threshold:
            significant_dims.append(dim)
    anova_dims.append(significant_dims)

We can see that a very large number of dimensions appear to be highlighted by the test:

In [19]:
for idx, model in enumerate(models):
    # Embedding width versus the number of dimensions kept by the ANOVA test.
    total_dims = len(X_noun_train[idx].columns)
    anova_count = len(anova_dims[idx])
    print(f'{model["label"]}:\nTotal dimensions {total_dims}\nANOVA dimensions: {anova_count}\n')

flau_small_c:
Total dimensions 512
ANOVA dimensions: 336

flau_base_u:
Total dimensions 768
ANOVA dimensions: 236

flau_base_c:
Total dimensions 768
ANOVA dimensions: 305

flau_large_c:
Total dimensions 1024
ANOVA dimensions: 776

cam_base:
Total dimensions 768
ANOVA dimensions: 149

xlm_large:
Total dimensions 1024
ANOVA dimensions: 491

xlm_base:
Total dimensions 768
ANOVA dimensions: 637

bert_base_u:
Total dimensions 768
ANOVA dimensions: 15

distilbert_base:
Total dimensions 768
ANOVA dimensions: 231

bert_base_c:
Total dimensions 768
ANOVA dimensions: 1



### Compute Mutual Information

If mutual information is 0, we can consider that a given dimension is independent from Gender information.

If MI > 0, we can't consider the dimension completely independent and it could encode the Gender information.

In [20]:
# One list of mutual-information-selected dimension names per model (noun experiment).
mi_dims = []

In [21]:
for i in range(len(models)):
    X_train = X_noun_train[i]
    # Every embedding dimension is continuous, so a single False replaces the per-column list.
    # random_state pins the small noise scikit-learn adds to continuous features, which
    # makes the set of selected dimensions reproducible across runs.
    mi_scores = mutual_info_classif(X_train, y_noun_train[i], discrete_features=False, random_state=0)
    # Select by actual column label rather than by stringified position, so the result
    # always matches X_train's columns whatever their naming or dtype.
    mi_dims.append(list(X_train.columns[mi_scores > 0]))

Overall, a threshold of 0 selects many more dimensions. This could potentially be addressed with a different threshold.

In [22]:
for idx, model in enumerate(models):
    # Side-by-side count of dimensions kept by each test for this model.
    n_total = len(X_noun_train[idx].columns)
    n_anova = len(anova_dims[idx])
    n_mi = len(mi_dims[idx])
    print(f"""{model['label']}:
    Total dimensions {n_total}
    ANOVA dimensions: {n_anova}
    Mutual Information dimension: {n_mi}\n\n""")

flau_small_c:
    Total dimensions 512
    ANOVA dimensions: 336
    Mutual Information dimension: 318


flau_base_u:
    Total dimensions 768
    ANOVA dimensions: 236
    Mutual Information dimension: 431


flau_base_c:
    Total dimensions 768
    ANOVA dimensions: 305
    Mutual Information dimension: 434


flau_large_c:
    Total dimensions 1024
    ANOVA dimensions: 776
    Mutual Information dimension: 725


cam_base:
    Total dimensions 768
    ANOVA dimensions: 149
    Mutual Information dimension: 460


xlm_large:
    Total dimensions 1024
    ANOVA dimensions: 491
    Mutual Information dimension: 631


xlm_base:
    Total dimensions 768
    ANOVA dimensions: 637
    Mutual Information dimension: 425


bert_base_u:
    Total dimensions 768
    ANOVA dimensions: 15
    Mutual Information dimension: 595


distilbert_base:
    Total dimensions 768
    ANOVA dimensions: 231
    Mutual Information dimension: 416


bert_base_c:
    Total dimensions 768
    ANOVA dimensions: 1
   

For now, for each model we select only the dimensions that are potentially dependent on the gender information and were found by both tests.

In [23]:
# Per model: the dimensions selected by both the ANOVA and the mutual-information tests.
mi_annova_dims = [set(anova_dims[idx]) & set(mi_dims[idx]) for idx, _ in enumerate(models)]

Final number of dimensions that we can consider not independent from the gender information for each model:

In [24]:
for idx, model in enumerate(models):
    # Number of dimensions flagged by both tests for this model.
    n_combined = len(mi_annova_dims[idx])
    print(f'{model["label"]}: {n_combined}')

flau_small_c: 219
flau_base_u: 146
flau_base_c: 187
flau_large_c: 601
cam_base: 96
xlm_large: 340
xlm_base: 362
bert_base_u: 12
distilbert_base: 145
bert_base_c: 1


### Compute medians

We can try making predictions using the following dimensions:
- All dimensions of WE (as the baseline)
- Using the dimensions found in the ANOVA test
- Using the dimensions found in the Mutual Information test
- Using the dimensions found in both tests

To make such predictions we will:
- Compute medians of class 1 and class 0 using only selected dimensions
- For each word in the test set, compute the MAE between its values and the medians of each class
- Assign the label of the class with the smallest MAE

In [25]:
# Per-model class medians computed over all dimensions (baseline).
medians_all = []

In [27]:
for idx, _ in enumerate(models):
    train_X, train_y = X_noun_train[idx], y_noun_train[idx]
    # Column-wise medians of the training rows of each class, indexed by dimension name.
    medians_all.append(pd.DataFrame({
        'median_0': train_X[train_y == 0].median(),
        'median_1': train_X[train_y == 1].median(),
    }))

In [28]:
# Per-model class medians restricted to the ANOVA-selected dimensions.
medians_anova = []

In [29]:
for idx, _ in enumerate(models):
    train_X, train_y = X_noun_train[idx], y_noun_train[idx]
    selected = list(anova_dims[idx])
    # Class medians over the ANOVA-selected dimensions only.
    medians_anova.append(pd.DataFrame({
        'median_0': train_X[train_y == 0][selected].median(),
        'median_1': train_X[train_y == 1][selected].median(),
    }))

In [30]:
# Per-model class medians restricted to the mutual-information-selected dimensions.
medians_mi = []

In [32]:
for idx, _ in enumerate(models):
    train_X, train_y = X_noun_train[idx], y_noun_train[idx]
    selected = list(mi_dims[idx])
    # Class medians over the mutual-information-selected dimensions only.
    medians_mi.append(pd.DataFrame({
        'median_0': train_X[train_y == 0][selected].median(),
        'median_1': train_X[train_y == 1][selected].median(),
    }))

In [33]:
# Per-model class medians restricted to the dimensions selected by both tests.
medians_combined = []

In [34]:
for idx, _ in enumerate(models):
    train_X, train_y = X_noun_train[idx], y_noun_train[idx]
    # The intersection is a set; materialise it once so both columns share one ordering.
    selected = list(mi_annova_dims[idx])
    medians_combined.append(pd.DataFrame({
        'median_0': train_X[train_y == 0][selected].median(),
        'median_1': train_X[train_y == 1][selected].median(),
    }))

In [35]:
# Inspect the combined-selection medians of the model at index 1.
medians_combined[1]

Unnamed: 0,median_0,median_1
233,0.491430,0.489383
249,0.452455,0.468181
594,0.531762,0.516457
45,0.477513,0.465895
166,0.487703,0.481313
...,...,...
755,0.490334,0.479075
427,0.557225,0.546022
583,0.484613,0.468212
231,0.475296,0.461325


### Predict label for test set using MAE

In [36]:
# Per-model test-set predictions using all dimensions.
y_preds_all = []

In [38]:
for idx, _ in enumerate(models):
    test_rows = X_noun_test[idx]
    class_medians = medians_all[idx]
    # Mean absolute error between each test word and the medians of each class.
    err_class0 = test_rows.apply(lambda row: mean_absolute_error(class_medians['median_0'], row), axis=1)
    err_class1 = test_rows.apply(lambda row: mean_absolute_error(class_medians['median_1'], row), axis=1)
    # Label 1 only when the word is strictly closer (lower MAE) to the class-1 medians; otherwise 0.
    y_preds_all.append((err_class0 > err_class1).apply(int))

In [41]:
# Per-model test-set predictions using the ANOVA-selected dimensions.
y_preds_anova = []

In [42]:
for idx, _ in enumerate(models):
    test_rows = X_noun_test[idx][list(anova_dims[idx])]
    class_medians = medians_anova[idx]
    # Mean absolute error between each test word and the medians of each class.
    err_class0 = test_rows.apply(lambda row: mean_absolute_error(class_medians['median_0'], row), axis=1)
    err_class1 = test_rows.apply(lambda row: mean_absolute_error(class_medians['median_1'], row), axis=1)
    # Label 1 only when the word is strictly closer (lower MAE) to the class-1 medians; otherwise 0.
    y_preds_anova.append((err_class0 > err_class1).apply(int))

In [43]:
# Per-model test-set predictions using the mutual-information-selected dimensions.
y_preds_mi = []

In [44]:
for idx, _ in enumerate(models):
    test_rows = X_noun_test[idx][list(mi_dims[idx])]
    class_medians = medians_mi[idx]
    # Mean absolute error between each test word and the medians of each class.
    err_class0 = test_rows.apply(lambda row: mean_absolute_error(class_medians['median_0'], row), axis=1)
    err_class1 = test_rows.apply(lambda row: mean_absolute_error(class_medians['median_1'], row), axis=1)
    # Label 1 only when the word is strictly closer (lower MAE) to the class-1 medians; otherwise 0.
    y_preds_mi.append((err_class0 > err_class1).apply(int))

In [45]:
# Per-model test-set predictions using the dimensions selected by both tests.
y_preds_combined = []

In [46]:
for idx, _ in enumerate(models):
    test_rows = X_noun_test[idx][list(mi_annova_dims[idx])]
    class_medians = medians_combined[idx]
    # Mean absolute error between each test word and the medians of each class.
    err_class0 = test_rows.apply(lambda row: mean_absolute_error(class_medians['median_0'], row), axis=1)
    err_class1 = test_rows.apply(lambda row: mean_absolute_error(class_medians['median_1'], row), axis=1)
    # Label 1 only when the word is strictly closer (lower MAE) to the class-1 medians; otherwise 0.
    y_preds_combined.append((err_class0 > err_class1).apply(int))

### Compute accuracy

In [56]:
# Empty results table: one row per model label, one column per dimension-selection strategy.
accs_noun_df = pd.DataFrame(columns=['All dims', 'ANOVA dims', 'MI dims', 'Combined dims'], index=[m['label'] for m in models])

In [57]:
for i in range(len(models)):
    row_label = models[i]['label']
    accs_noun_df.loc[row_label, 'All dims'] = accuracy_score(y_noun_test[i], y_preds_all[i])

    # For the selection-based strategies, decide whether an accuracy exists from the number of
    # SELECTED DIMENSIONS. The length of the prediction object equals the number of test rows,
    # so it stays >= 1 even when no dimension was selected and cannot serve as the guard.
    for column, selected_dims, predictions in (
        ('ANOVA dims', anova_dims[i], y_preds_anova[i]),
        ('MI dims', mi_dims[i], y_preds_mi[i]),
        ('Combined dims', mi_annova_dims[i], y_preds_combined[i]),
    ):
        if len(selected_dims) > 0:
            accs_noun_df.loc[row_label, column] = accuracy_score(y_noun_test[i], predictions)
        else:
            # No dimension selected, so no prediction is possible; record 0 as a placeholder.
            accs_noun_df.loc[row_label, column] = 0

In [58]:
# Test-set accuracy per model and strategy, nouns only.
accs_noun_df

Unnamed: 0,All dims,ANOVA dims,MI dims,Combined dims
flau_small_c,0.729958,0.729958,0.726341,0.727547
flau_base_u,0.580426,0.581395,0.577519,0.58188
flau_base_c,0.600362,0.629898,0.605184,0.62387
flau_large_c,0.901145,0.901748,0.895118,0.895118
cam_base,0.51578,0.520289,0.51578,0.52119
xlm_large,0.523422,0.531568,0.529532,0.531568
xlm_base,0.498982,0.501018,0.498982,0.503055
bert_base_u,0.507429,0.501714,0.507429,0.507429
distilbert_base,0.556962,0.576854,0.564195,0.56962
bert_base_c,0.567812,0.555154,0.567812,0.555154


In [60]:
# Inspect the all-dimension class medians of the model at index 3.
medians_all[3]

Unnamed: 0,median_0,median_1
0,0.429337,0.418281
1,0.564280,0.539976
2,0.481558,0.510774
3,0.445667,0.476402
4,0.535114,0.546352
...,...,...
1019,0.462996,0.460294
1020,0.487724,0.492311
1021,0.538437,0.520282
1022,0.537357,0.617977


# Adjectives

Repeat all steps but for adjectives only.

In [61]:
# Switch the part-of-speech filter to adjectives for the second experiment.
pos = ['ADJ']

In [62]:
# Per-model containers for the adjective experiment; entry i belongs to we_with_features[i].
X_adj_train = []
y_adj_train = []

X_adj_test = []
y_adj_test = []

In [63]:
for embeddings in we_with_features:
    # Keep only rows of the selected POS whose Gender and Number are both defined.
    keep_rows = (
        (embeddings.Gender != 'invariable')
        & (embeddings.Number != 'invariable')
        & (embeddings.POS.isin(pos))
    )
    # prepare_dataset returns, in order: X train, X test, y train, y test.
    train_X, test_X, train_y, test_y = prepare_dataset(
        dataset=embeddings[keep_rows],
        feature_col_count=feature_col_count,
        feature_name=feature,
        encode=True,
        split=True,
    )
    X_adj_train.append(train_X)
    y_adj_train.append(train_y)
    X_adj_test.append(test_X)
    y_adj_test.append(test_y)

In [64]:
# One list of ANOVA-selected dimension names per model (adjective experiment).
anova_dims_adj = []

In [65]:
for idx, _ in enumerate(models):
    train_X = X_adj_train[idx]
    # Positional class masks, computed once per model and reused for every dimension.
    targets = np.asarray(y_adj_train[idx])
    in_class0, in_class1 = targets == 0, targets == 1

    significant_dims = []
    for dim in train_X.columns:
        values = train_X[dim].to_numpy()
        # One-way ANOVA between the two gender groups for this dimension.
        if f_oneway(values[in_class0], values[in_class1]).pvalue < pv_threshold:
            significant_dims.append(dim)
    anova_dims_adj.append(significant_dims)

In [66]:
# One list of mutual-information-selected dimension names per model (adjective experiment).
mi_dims_adj = []

In [67]:
for i in range(len(models)):
    X_train = X_adj_train[i]
    # Every embedding dimension is continuous, so a single False replaces the per-column list.
    # random_state pins the small noise scikit-learn adds to continuous features, which
    # makes the set of selected dimensions reproducible across runs.
    mi_scores = mutual_info_classif(X_train, y_adj_train[i], discrete_features=False, random_state=0)
    # Select by actual column label rather than by stringified position, so the result
    # always matches X_train's columns whatever their naming or dtype.
    mi_dims_adj.append(list(X_train.columns[mi_scores > 0]))

In [68]:
# Per model: the dimensions selected by both the ANOVA and the mutual-information tests (adjectives).
mi_annova_dims_adj = [set(anova_dims_adj[idx]) & set(mi_dims_adj[idx]) for idx, _ in enumerate(models)]

In [69]:
for idx, model in enumerate(models):
    # Number of dimensions flagged by both tests for this model.
    n_combined = len(mi_annova_dims_adj[idx])
    print(f'{model["label"]}: {n_combined}')

flau_small_c: 222
flau_base_u: 99
flau_base_c: 98
flau_large_c: 462
cam_base: 285
xlm_large: 6
xlm_base: 0
bert_base_u: 9
distilbert_base: 62
bert_base_c: 7


### Computing medians

In [70]:
# Per-model class medians computed over all dimensions (adjectives, baseline).
medians_adj_all = []

In [71]:
for idx, _ in enumerate(models):
    train_X, train_y = X_adj_train[idx], y_adj_train[idx]
    # Column-wise medians of the training rows of each class, indexed by dimension name.
    medians_adj_all.append(pd.DataFrame({
        'median_0': train_X[train_y == 0].median(),
        'median_1': train_X[train_y == 1].median(),
    }))

In [72]:
# Per-model class medians restricted to the ANOVA-selected dimensions (adjectives).
medians_adj_anova = []

In [73]:
for idx, _ in enumerate(models):
    train_X, train_y = X_adj_train[idx], y_adj_train[idx]
    selected = list(anova_dims_adj[idx])
    # Class medians over the ANOVA-selected dimensions only.
    medians_adj_anova.append(pd.DataFrame({
        'median_0': train_X[train_y == 0][selected].median(),
        'median_1': train_X[train_y == 1][selected].median(),
    }))

In [74]:
# Per-model class medians restricted to the mutual-information-selected dimensions (adjectives).
medians_adj_mi = []

In [75]:
for idx, _ in enumerate(models):
    train_X, train_y = X_adj_train[idx], y_adj_train[idx]
    selected = list(mi_dims_adj[idx])
    # Class medians over the mutual-information-selected dimensions only.
    medians_adj_mi.append(pd.DataFrame({
        'median_0': train_X[train_y == 0][selected].median(),
        'median_1': train_X[train_y == 1][selected].median(),
    }))

In [76]:
# Per-model class medians restricted to the dimensions selected by both tests (adjectives).
medians_adj_combined = []

In [77]:
for idx, _ in enumerate(models):
    train_X, train_y = X_adj_train[idx], y_adj_train[idx]
    # The intersection is a set; materialise it once so both columns share one ordering.
    selected = list(mi_annova_dims_adj[idx])
    medians_adj_combined.append(pd.DataFrame({
        'median_0': train_X[train_y == 0][selected].median(),
        'median_1': train_X[train_y == 1][selected].median(),
    }))

### Computing predictions

In [79]:
def _nearest_median_labels(X_test, medians, dims=None):
    """Predict a 0/1 label for every row of ``X_test`` from per-class medians.

    Parameters
    ----------
    X_test : test embeddings, one row per word and one column per dimension.
    medians : frame with 'median_0' and 'median_1' columns holding the class medians
        for exactly the dimensions being compared, in the same order.
    dims : dimensions to restrict ``X_test`` to; ``None`` keeps every column.

    Returns
    -------
    Per-row labels: 1 where the MAE to the class-0 medians is strictly greater than
    the MAE to the class-1 medians, otherwise 0 (so ties resolve to 0).
    """
    rows = X_test if dims is None else X_test[list(dims)]
    mae0 = rows.apply(lambda x: mean_absolute_error(medians['median_0'], x), axis=1)
    mae1 = rows.apply(lambda x: mean_absolute_error(medians['median_1'], x), axis=1)
    # If the MAE to class 0 is lower, the comparison is False, which int() turns into label 0.
    return (mae0 > mae1).apply(int)


# Baseline: all dimensions.
y_preds_adj_all = [
    _nearest_median_labels(X_adj_test[i], medians_adj_all[i])
    for i in range(len(models))
]

# Dimensions selected by the ANOVA test.
y_preds_adj_anova = [
    _nearest_median_labels(X_adj_test[i], medians_adj_anova[i], anova_dims_adj[i])
    for i in range(len(models))
]

# Dimensions selected by the mutual-information test.
y_preds_adj_mi = [
    _nearest_median_labels(X_adj_test[i], medians_adj_mi[i], mi_dims_adj[i])
    for i in range(len(models))
]

# Dimensions selected by both tests.
y_preds_adj_combined = [
    _nearest_median_labels(X_adj_test[i], medians_adj_combined[i], mi_annova_dims_adj[i])
    for i in range(len(models))
]

In [89]:
# Empty results table: one row per model label, one column per dimension-selection strategy.
accs_adj_df = pd.DataFrame(columns=['All dims', 'ANOVA dims', 'MI dims', 'Combined dims'], index=[m['label'] for m in models])

In [90]:
for i in range(len(models)):
    row_label = models[i]['label']
    accs_adj_df.loc[row_label, 'All dims'] = accuracy_score(y_adj_test[i], y_preds_adj_all[i])

    # For the selection-based strategies, decide whether an accuracy exists from the number of
    # SELECTED DIMENSIONS. Testing any(predictions) instead would report 0 for a model that
    # legitimately predicts class 0 for every test word.
    for column, selected_dims, predictions in (
        ('ANOVA dims', anova_dims_adj[i], y_preds_adj_anova[i]),
        ('MI dims', mi_dims_adj[i], y_preds_adj_mi[i]),
        ('Combined dims', mi_annova_dims_adj[i], y_preds_adj_combined[i]),
    ):
        if len(selected_dims) > 0:
            accs_adj_df.loc[row_label, column] = accuracy_score(y_adj_test[i], predictions)
        else:
            # No dimension selected, so no prediction is possible; record 0 as a placeholder.
            accs_adj_df.loc[row_label, column] = 0

Here are the accuracies for adjectives only:

In [91]:
# Test-set accuracy per model and strategy, adjectives only.
accs_adj_df

Unnamed: 0,All dims,ANOVA dims,MI dims,Combined dims
flau_small_c,0.830239,0.832891,0.835544,0.830239
flau_base_u,0.42539,0.461024,0.427617,0.469933
flau_base_c,0.607427,0.607427,0.618037,0.618037
flau_large_c,0.909814,0.907162,0.915119,0.907162
cam_base,0.569378,0.569378,0.569378,0.569378
xlm_large,0.487805,0.682927,0.487805,0.658537
xlm_base,0.536585,0.634146,0.536585,0.0
bert_base_u,0.53012,0.53012,0.53012,0.53012
distilbert_base,0.660377,0.716981,0.716981,0.754717
bert_base_c,0.509434,0.471698,0.509434,0.509434


# Adjectives and nouns

In [92]:
# Third experiment: adjectives and nouns pooled together.
pos = ['ADJ', 'NOUN']

# Per-model containers; entry i belongs to we_with_features[i].
X_na_train, y_na_train = [], []
X_na_test, y_na_test = [], []

for embeddings in we_with_features:
    # Keep only rows of the selected POS whose Gender and Number are both defined.
    keep_rows = (
        (embeddings.Gender != 'invariable')
        & (embeddings.Number != 'invariable')
        & (embeddings.POS.isin(pos))
    )
    # prepare_dataset returns, in order: X train, X test, y train, y test.
    train_X, test_X, train_y, test_y = prepare_dataset(
        dataset=embeddings[keep_rows],
        feature_col_count=feature_col_count,
        feature_name=feature,
        encode=True,
        split=True,
    )
    X_na_train.append(train_X)
    y_na_train.append(train_y)
    X_na_test.append(test_X)
    y_na_test.append(test_y)

# One list of ANOVA-selected dimension names per model (adjectives + nouns).
anova_dims_na = []

for i in range(len(models)):
    train_X = X_na_train[i]
    # Positional class masks, computed once per model and reused for every dimension.
    targets = np.asarray(y_na_train[i])
    in_class0, in_class1 = targets == 0, targets == 1

    model_dims = []
    # Iterate over the columns of the pooled adjective+noun training set itself,
    # not over those of the adjective-only set from the previous experiment.
    for dim in train_X.columns:
        values = train_X[dim].to_numpy()
        # One-way ANOVA between the two gender groups for this dimension.
        if f_oneway(values[in_class0], values[in_class1]).pvalue < pv_threshold:
            model_dims.append(dim)
    anova_dims_na.append(model_dims)

# One list of mutual-information-selected dimension names per model (adjectives + nouns).
mi_dims_na = []

for i in range(len(models)):
    X_train = X_na_train[i]
    # Every embedding dimension is continuous, so a single False replaces the per-column list.
    # random_state pins the small noise scikit-learn adds to continuous features, which
    # makes the set of selected dimensions reproducible across runs.
    mi_scores = mutual_info_classif(X_train, y_na_train[i], discrete_features=False, random_state=0)
    # Select by actual column label rather than by stringified position, so the result
    # always matches X_train's columns whatever their naming or dtype.
    mi_dims_na.append(list(X_train.columns[mi_scores > 0]))

# Per model: the dimensions selected by both the ANOVA and the mutual-information tests.
mi_annova_dims_na = [set(anova_dims_na[idx]) & set(mi_dims_na[idx]) for idx, _ in enumerate(models)]

for idx, model in enumerate(models):
    # Number of dimensions flagged by both tests for this model.
    n_combined = len(mi_annova_dims_na[idx])
    print(f'{model["label"]}: {n_combined}')

### Computing medians

# Per-model class medians for each dimension-selection strategy (adjectives + nouns).
medians_na_all = []
medians_na_anova = []
medians_na_mi = []
medians_na_combined = []

# (output list, per-model dimension selection); None means "use every dimension".
median_variants = (
    (medians_na_all, None),
    (medians_na_anova, anova_dims_na),
    (medians_na_mi, mi_dims_na),
    (medians_na_combined, mi_annova_dims_na),
)

for target_list, dims_per_model in median_variants:
    for idx, _ in enumerate(models):
        train_X, train_y = X_na_train[idx], y_na_train[idx]
        rows_class0 = train_X[train_y == 0]
        rows_class1 = train_X[train_y == 1]
        if dims_per_model is not None:
            # Materialise the selection once so both columns share the same ordering.
            selected = list(dims_per_model[idx])
            rows_class0 = rows_class0[selected]
            rows_class1 = rows_class1[selected]
        target_list.append(pd.DataFrame({
            'median_0': rows_class0.median(),
            'median_1': rows_class1.median(),
        }))

### Computing predictions

# NOTE(review): these lists reuse the y_preds_adj_* names from the adjective-only experiment and
# therefore overwrite those predictions. The accuracy cell that follows reads the same names, so
# any rename (e.g. to y_preds_na_*) has to be applied to both places together.
y_preds_adj_all = []
y_preds_adj_anova = []
y_preds_adj_mi = []
y_preds_adj_combined = []

# (output list, per-model medians, per-model dimension selection); None means "use every dimension".
prediction_variants = (
    (y_preds_adj_all, medians_na_all, None),
    (y_preds_adj_anova, medians_na_anova, anova_dims_na),
    (y_preds_adj_mi, medians_na_mi, mi_dims_na),
    (y_preds_adj_combined, medians_na_combined, mi_annova_dims_na),
)

for target_list, medians_per_model, dims_per_model in prediction_variants:
    for idx, _ in enumerate(models):
        test_rows = X_na_test[idx]
        if dims_per_model is not None:
            test_rows = test_rows[list(dims_per_model[idx])]
        class_medians = medians_per_model[idx]
        # Mean absolute error between each test word and the medians of each class.
        err_class0 = test_rows.apply(lambda row: mean_absolute_error(class_medians['median_0'], row), axis=1)
        err_class1 = test_rows.apply(lambda row: mean_absolute_error(class_medians['median_1'], row), axis=1)
        # Label 1 only when the word is strictly closer (lower MAE) to the class-1 medians; otherwise 0.
        target_list.append((err_class0 > err_class1).apply(int))


flau_small_c: 257
flau_base_u: 210
flau_base_c: 201
flau_large_c: 611
cam_base: 264
xlm_large: 280
xlm_base: 15
bert_base_u: 6
distilbert_base: 176
bert_base_c: 1


NameError: name 'y_preds_na_all' is not defined

In [94]:
# Results table: one row per model label, one column per dimension-selection strategy.
accs_na_df = pd.DataFrame(columns=['All dims', 'ANOVA dims', 'MI dims', 'Combined dims'], index=[m['label'] for m in models])

for i in range(len(models)):
    row_label = models[i]['label']
    # The adjective+noun predictions are stored under the y_preds_adj_* names by the previous cell.
    accs_na_df.loc[row_label, 'All dims'] = accuracy_score(y_na_test[i], y_preds_adj_all[i])

    # For the selection-based strategies, decide whether an accuracy exists from the number of
    # SELECTED DIMENSIONS. Testing any(predictions) instead would report 0 for a model that
    # legitimately predicts class 0 for every test word.
    for column, selected_dims, predictions in (
        ('ANOVA dims', anova_dims_na[i], y_preds_adj_anova[i]),
        ('MI dims', mi_dims_na[i], y_preds_adj_mi[i]),
        ('Combined dims', mi_annova_dims_na[i], y_preds_adj_combined[i]),
    ):
        if len(selected_dims) > 0:
            accs_na_df.loc[row_label, column] = accuracy_score(y_na_test[i], predictions)
        else:
            # No dimension selected, so no prediction is possible; record 0 as a placeholder.
            accs_na_df.loc[row_label, column] = 0

In [95]:
# Test-set accuracy per model and strategy, adjectives and nouns combined.
accs_na_df

Unnamed: 0,All dims,ANOVA dims,MI dims,Combined dims
flau_small_c,0.725799,0.728256,0.718919,0.72285
flau_base_u,0.52885,0.541584,0.52885,0.547951
flau_base_c,0.571499,0.575921,0.590172,0.591646
flau_large_c,0.898771,0.896314,0.890418,0.890909
cam_base,0.509105,0.508346,0.509105,0.508346
xlm_large,0.537594,0.554511,0.548872,0.554511
xlm_base,0.571429,0.593985,0.571429,0.590226
bert_base_u,0.504702,0.520376,0.492163,0.547544
distilbert_base,0.556106,0.584158,0.564356,0.592409
bert_base_c,0.508251,0.491749,0.50495,0.491749
