In [1]:
import sys
sys.path.append('../Util')
import pickle
import os

In [2]:
from IPython.display import Image

In [3]:
from evaluation import correlation, lr, perceptron, get_anova_dims, get_mi_dims
from preparation import prepare_dataset, read_datasets

In [4]:
from we import get_we, initiate_model

In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [6]:
from scipy.stats import f_oneway

In [7]:
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler, QuantileTransformer

In [8]:
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, so deprecation notices from the libraries used below stay hidden.
warnings.filterwarnings('ignore')

In [9]:
# (name, label) pairs for every embedding model compared in this notebook.
# The label is the short identifier used as a key/column name further down.
_MODEL_SPECS = (
    ('flaubert/flaubert_small_cased', 'flau_small_c'),
    ('flaubert/flaubert_base_uncased', 'flau_base_u'),
    ('flaubert/flaubert_base_cased', 'flau_base_c'),
    ('flaubert/flaubert_large_cased', 'flau_large_c'),
    ('camembert/camembert-base', 'cam_base'),
    ('xlm-roberta-large', 'xlm_large'),
    ('xlm-roberta-base', 'xlm_base'),
    ('bert-base-multilingual-uncased', 'bert_base_u'),
    ('distilbert-base-multilingual-cased', 'distilbert_base'),
    ('bert-base-multilingual-cased', 'bert_base_c'),
)

# Expand into the list-of-dicts structure accessed via 'name' / 'label'.
models = [{'name': name, 'label': label} for name, label in _MODEL_SPECS]

In [10]:
# Short labels, in the same order as `models`.
labels = [model['label'] for model in models]

In [11]:
# Read the 'all_nouns_we.csv' data under ../Data for the given model labels.
# The result is indexed by position below (presumably one table per label).
we_with_features = read_datasets(path='../Data', model_labels=labels, file_name='all_nouns_we.csv')

In [12]:
# Preview the first rows of the last loaded table.
we_with_features[-1].head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,761,762,763,764,765,766,767,Number,Gender,Lemma
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2D,0.836139,0.551025,0.347197,0.789782,0.393246,0.300031,0.609232,0.913722,0.178287,0.838988,...,0.377143,0.586894,0.674134,0.276217,0.081619,0.154076,0.553278,invariable,feminine,2D
3D,0.828946,0.54988,0.339958,0.797102,0.395346,0.301288,0.603051,0.916539,0.185836,0.844714,...,0.370481,0.584636,0.674752,0.278596,0.078074,0.143366,0.548315,invariable,feminine,3D
a,0.82666,0.556389,0.332618,0.805891,0.399844,0.295803,0.597985,0.914675,0.201301,0.846672,...,0.371326,0.577705,0.673679,0.278649,0.081689,0.13949,0.533415,invariable,masculine,a
aa,0.826916,0.555891,0.332298,0.805754,0.400046,0.296797,0.597758,0.914183,0.201098,0.845985,...,0.372299,0.578594,0.673429,0.278861,0.081218,0.13901,0.533594,invariable,masculine,aa
abandon,0.826281,0.556306,0.332007,0.805256,0.399368,0.296458,0.597567,0.914983,0.200809,0.846577,...,0.37353,0.578808,0.673362,0.279152,0.08156,0.13939,0.534123,singular,masculine,abandon


In [13]:
# There are 3 extra features in addition to the embedding dimensions in the file: number, gender, lemma
feature_col_count = 3

# Feature to investigate in this notebook
feature = 'Number'

# Nouns

In [14]:
# NOTE(review): `pos` is not referenced by any other cell visible in this
# notebook; confirm whether it is still needed.
pos = ['NOUN']

Split each model into train and test:

In [28]:
# Per-model containers for the train/test splits (features X, targets y).
X_noun_train, y_noun_train = [], []
X_noun_test, y_noun_test = [], []

In [29]:
# Rebuild the split containers from scratch so that re-running this cell does
# not append a second copy of every model's split (which would break the
# length-based cells further down).
X_noun_train, y_noun_train = [], []
X_noun_test, y_noun_test = [], []

for we in we_with_features:
    # Drop words whose gender or number is marked 'invariable'.
    variable_nouns = we[(we.Gender != 'invariable') & (we.Number != 'invariable')]

    # With encode_as1='plural' the target is presumably 1 for plural and 0
    # otherwise -- TODO confirm against prepare_dataset.
    xtr, xtst, ytr, ytst = prepare_dataset(dataset=variable_nouns,
                                           feature_col_count=feature_col_count,
                                           feature_name=feature,
                                           normalize=False,
                                           encode=True,
                                           encode_as1='plural',
                                           split=True,
                                           balance=True)
    X_noun_train.append(xtr)
    X_noun_test.append(xtst)

    y_noun_train.append(ytr)
    y_noun_test.append(ytst)

In [30]:
# Number of rows in each model's train and test split.
dataset_sizes = pd.DataFrame(
    {
        'Train size': [len(split) for split in X_noun_train],
        'Test size': [len(split) for split in X_noun_test],
    },
    index=labels,
)
dataset_sizes

Unnamed: 0,Train size,Test size
flau_small_c,6603,1651
flau_base_u,7851,1963
flau_base_c,6603,1651
flau_large_c,6603,1651
cam_base,2915,729
xlm_large,531,133
xlm_base,531,133
bert_base_u,2008,502
distilbert_base,1316,330
bert_base_c,1316,330


## Non-independent dims

ANOVA test with p-value < 0.01:

In [31]:
# Per model: result of get_anova_dims on the training split.
anova_dims = []
for model_idx in range(len(models)):
    anova_dims.append(get_anova_dims(X_noun_train[model_idx], y_noun_train[model_idx]))

In [32]:
# Per model: result of get_mi_dims on the training split.
mi_dims = []
for model_idx in range(len(models)):
    mi_dims.append(get_mi_dims(X_noun_train[model_idx], y_noun_train[model_idx]))

In [33]:
# Per model: dimensions selected by both the ANOVA and the MI test.
non_ind_dims = []
for model_idx in range(len(models)):
    shared_dims = set(anova_dims[model_idx]).intersection(mi_dims[model_idx])
    non_ind_dims.append(list(shared_dims))

Stats about the number of dimensions for each model:

In [34]:
# Empty summary table: one row per model label, one column per selection method.
non_ind_df = pd.DataFrame(columns=['ANOVA', 'MI', 'Total non independent'], index=labels)

In [35]:
# Fill each column with the number of dimensions selected per model.
for column_name, per_model_dims in (
    ('ANOVA', anova_dims),
    ('MI', mi_dims),
    ('Total non independent', non_ind_dims),
):
    non_ind_df[column_name] = [len(selected) for selected in per_model_dims]

In [36]:
# Show the per-model counts of dimensions selected by each test.
non_ind_df

Unnamed: 0,ANOVA,MI,Total non independent
flau_small_c,400,372,313
flau_base_u,373,620,305
flau_base_c,532,540,401
flau_large_c,859,786,694
cam_base,18,396,9
xlm_large,284,646,215
xlm_base,4,363,3
bert_base_u,514,677,458
distilbert_base,311,520,235
bert_base_c,1,529,1


In [37]:
# model label -> {dimension group name -> list of dimensions}
dims = dict()

In [38]:
# Register the baseline dimension groups for every model.
for i in range(len(models)):
    dims[labels[i]] = {
        'All dims': X_noun_train[i].columns,
        'ANOVA': anova_dims[i],
        'MI': mi_dims[i],
        'All non ind': non_ind_dims[i],
    }

## Important dims

We can test different $\alpha$ values: 1%, 5%, 10%, 25%, 50%, 75%.

In [39]:
# Percentages of the embedding dimensions kept as "important"
# (each value is applied as `n_dims * alpha // 100` further down).
alphas = [1, 5, 10, 25, 50, 75]

Train Logistic Regression on train set for each model:

In [40]:
# Per model: result of lr() on the training split.
lr_res = []
for model_idx in range(len(models)):
    lr_res.append(lr(X_noun_train[model_idx], y_noun_train[model_idx]))

Train Perceptron 10 times and get average weights:

In [41]:
# Per model: result of perceptron() on the training split.
perceptron_res = []
for model_idx in range(len(models)):
    perceptron_res.append(perceptron(X_noun_train[model_idx], y_noun_train[model_idx]))

Compute correlation to the number vector:

In [42]:
# Per model: result of correlation() on the training split.
corr_res = []
for model_idx in range(len(models)):
    corr_res.append(correlation(X_noun_train[model_idx], y_noun_train[model_idx]))

In [43]:
def _top_dims(ranking, count):
    """Return the first element of each of the first `count` entries of `ranking`, as strings."""
    return [str(entry[0]) for entry in ranking[:count]]


for i in range(len(models)):
    model_dims = dims[labels[i]]
    total_dims = len(X_noun_test[i].columns)
    for alpha in alphas:
        # alpha is a percentage of the embedding width, rounded down.
        num_imp_dims = total_dims * alpha // 100
        lr_dims = _top_dims(lr_res[i], num_imp_dims)
        perc_dims = _top_dims(perceptron_res[i], num_imp_dims)
        corr_dims = _top_dims(corr_res[i], num_imp_dims)
        model_dims.update({
            f'LR{alpha}': lr_dims,
            f'Perc{alpha}': perc_dims,
            f'Corr{alpha}': corr_dims,
            # Dimensions selected by all three methods at this alpha.
            f'All imp dims{alpha}': list(set(lr_dims).intersection(perc_dims).intersection(corr_dims)),
        })

## Compute medians

In [44]:
# model label -> dimension group -> {'0': medians, '1': medians}
medians = dict()

In [45]:
for i in range(len(models)):
    label = labels[i]
    # Training rows split by the encoded target value (0 or 1).
    rows_class_0 = X_noun_train[i][y_noun_train[i] == 0]
    rows_class_1 = X_noun_train[i][y_noun_train[i] == 1]

    # For every dimension group, keep the per-dimension median of each class.
    medians[label] = {
        dim_group: {
            '0': rows_class_0[dim_list].median(),
            '1': rows_class_1[dim_list].median(),
        }
        for dim_group, dim_list in dims[label].items()
    }


In [46]:
# Size of every dimension group, per model.
dim_lens = {
    model: {dim_group: len(dim_list) for dim_group, dim_list in groups.items()}
    for model, groups in dims.items()
}

In [47]:
# Show the group sizes as a table: rows are dimension groups, columns are models.
pd.DataFrame(dim_lens)

Unnamed: 0,flau_small_c,flau_base_u,flau_base_c,flau_large_c,cam_base,xlm_large,xlm_base,bert_base_u,distilbert_base,bert_base_c
All dims,512,768,768,1024,768,1024,768,768,768,768
ANOVA,400,373,532,859,18,284,4,514,311,1
MI,372,620,540,786,396,646,363,677,520,529
All non ind,313,305,401,694,9,215,3,458,235,1
LR1,5,7,7,10,7,10,7,7,7,7
Perc1,5,7,7,10,7,10,7,7,7,7
Corr1,5,7,7,10,7,10,7,7,7,7
All imp dims1,2,2,1,4,1,2,3,0,0,0
LR5,25,38,38,51,38,51,38,38,38,38
Perc5,25,38,38,51,38,51,38,38,38,38


## Compute predictions

In [48]:
# model label -> dimension group -> predicted labels for the test split
y_preds = dict()

In [49]:
for i in range(len(models)):
    label = labels[i]
    y_preds[label] = {}
    for dim_group, dim_list in dims[label].items():
        test_slice = X_noun_test[i][dim_list]
        group_medians = medians[label][dim_group]
        # Mean absolute error between each test row and the class-0 / class-1 medians.
        mae0 = test_slice.apply(lambda row: mean_absolute_error(group_medians['0'], row), axis=1)
        mae1 = test_slice.apply(lambda row: mean_absolute_error(group_medians['1'], row), axis=1)
        # Predict the class whose medians are closer (lower MAE):
        # mae0 > mae1 -> True -> 1, otherwise False -> 0.
        y_preds[label][dim_group] = (mae0 > mae1).apply(int)
    

## Compute accuracies

In [50]:
# model label -> dimension group -> accuracy on the test split
accs = dict()

In [51]:
for i in range(len(models)):
    label = labels[i]
    accs[label] = {}
    for dim_group, dim_list in dims[label].items():
        if len(dim_list) == 0:
            # No dimensions were selected for this group, so there is nothing
            # to predict from; report 0 as before.
            acc = 0
        else:
            # Score every non-empty group, including the case where all
            # predictions are class 0. The previous `any(y_pred)` check forced
            # the accuracy to 0 in that case.
            acc = accuracy_score(y_noun_test[i], y_preds[label][dim_group])
        accs[label][dim_group] = acc

In [52]:
# Accuracy table: rows are dimension groups, columns are models.
accs_df = pd.DataFrame(data=accs)
accs_df

Unnamed: 0,flau_small_c,flau_base_u,flau_base_c,flau_large_c,cam_base,xlm_large,xlm_base,bert_base_u,distilbert_base,bert_base_c
All dims,0.915203,0.638309,0.612356,0.939431,0.48011,0.533835,0.518797,0.573705,0.669697,0.609091
ANOVA,0.916414,0.640346,0.612962,0.938219,0.510288,0.526316,0.571429,0.573705,0.678788,0.533333
MI,0.912174,0.644931,0.625681,0.939431,0.48011,0.533835,0.518797,0.573705,0.678788,0.606061
All non ind,0.909752,0.638309,0.626287,0.939431,0.499314,0.533835,0.578947,0.571713,0.681818,0.533333
LR1,0.771654,0.577687,0.650515,0.857056,0.541838,0.578947,0.601504,0.579681,0.545455,0.6
Perc1,0.74682,0.583291,0.688674,0.885524,0.51989,0.586466,0.56391,0.573705,0.642424,0.593939
Corr1,0.738946,0.583291,0.683828,0.881284,0.514403,0.593985,0.533835,0.579681,0.630303,0.59697
All imp dims1,0.715324,0.548141,0.649909,0.837674,0.532236,0.593985,0.586466,0.0,0.0,0.0
LR5,0.854634,0.602649,0.677771,0.92974,0.502058,0.578947,0.556391,0.577689,0.645455,0.563636
Perc5,0.866142,0.618951,0.712296,0.947305,0.491084,0.586466,0.511278,0.571713,0.684848,0.542424


We can show the accuracy gain of each dimension group compared to using all dimensions:

In [53]:
# Accuracy of each dimension group minus the model's 'All dims' accuracy.
baseline_acc = accs_df.loc['All dims']
gains_df = accs_df - baseline_acc
gains_df

Unnamed: 0,flau_small_c,flau_base_u,flau_base_c,flau_large_c,cam_base,xlm_large,xlm_base,bert_base_u,distilbert_base,bert_base_c
All dims,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ANOVA,0.001211,0.002038,0.000606,-0.001211,0.030178,-0.007519,0.052632,0.0,0.009091,-0.075758
MI,-0.003028,0.006623,0.013325,0.0,0.0,0.0,0.0,0.0,0.009091,-0.00303
All non ind,-0.005451,0.0,0.013931,0.0,0.019204,0.0,0.06015,-0.001992,0.012121,-0.075758
LR1,-0.143549,-0.060621,0.038159,-0.082374,0.061728,0.045113,0.082707,0.005976,-0.124242,-0.009091
Perc1,-0.168383,-0.055018,0.076317,-0.053907,0.039781,0.052632,0.045113,0.0,-0.027273,-0.015152
Corr1,-0.176257,-0.055018,0.071472,-0.058147,0.034294,0.06015,0.015038,0.005976,-0.039394,-0.012121
All imp dims1,-0.199879,-0.090168,0.037553,-0.101757,0.052126,0.06015,0.067669,-0.573705,-0.669697,-0.609091
LR5,-0.060569,-0.03566,0.065415,-0.009691,0.021948,0.045113,0.037594,0.003984,-0.024242,-0.045455
Perc5,-0.049061,-0.019358,0.099939,0.007874,0.010974,0.052632,-0.007519,-0.001992,0.015152,-0.066667


For each model, here is the best dimension group and the accuracy gain it provides over using all dimensions:

In [54]:
# Dimension group with the largest gain for each model, computed once
# instead of twice per iteration.
best_gain_groups = gains_df.idxmax()
for label in labels:
    # Look up by model label: integer positional access (`series[i]`) on a
    # string-indexed Series is deprecated in recent pandas releases.
    best_group = best_gain_groups[label]
    print(label, best_group, gains_df.loc[best_group, label])

flau_small_c Perc50 0.03149606299212604
flau_base_u Perc25 0.020886398369842074
flau_base_c Perc5 0.09993943064809208
flau_large_c LR25 0.016353725015142317
cam_base LR1 0.06172839506172839
xlm_large Corr1 0.06015037593984962
xlm_base LR1 0.08270676691729317
bert_base_u LR1 0.005976095617529875
distilbert_base LR25 0.051515151515151514
bert_base_c All dims 0.0


In [55]:
# Dimension group with the highest accuracy for each model, computed once
# instead of twice per iteration.
best_acc_groups = accs_df.idxmax()
for label in labels:
    # Look up by model label: integer positional access (`series[i]`) on a
    # string-indexed Series is deprecated in recent pandas releases.
    best_group = best_acc_groups[label]
    print(label, best_group, accs_df.loc[best_group, label])

flau_small_c Perc50 0.9466989703210176
flau_base_u Perc25 0.6591951095262354
flau_base_c Perc5 0.7122955784373107
flau_large_c LR25 0.9557843731072078
cam_base LR1 0.541838134430727
xlm_large Corr1 0.5939849624060151
xlm_base LR1 0.6015037593984962
bert_base_u LR1 0.5796812749003984
distilbert_base LR25 0.7212121212121212
bert_base_c All dims 0.6090909090909091


In [56]:
best_res_dir = '../Data/best_results/'
best_res_path = os.path.join(best_res_dir, 'number_noun.pickle')

# Make sure the results directory exists: previously a missing directory made
# os.listdir() raise before anything could be initialised.
os.makedirs(best_res_dir, exist_ok=True)

if os.path.isfile(best_res_path):
    # Continue accumulating into the results saved by earlier runs.
    # NOTE(review): pickle.load can execute arbitrary code from the file; only
    # load result files produced by this project.
    with open(best_res_path, 'rb') as f:
        best_res = pickle.load(f)
else:
    # First run: start every model with empty result lists.
    best_res = {
        label: {
            'best_dim_set': [],
            'best_dims': [],
            'accs': [],
            'gains': [],
            'medians_0': [],
            'medians_1': [],
        }
        for label in labels
    }

In [57]:
# Best dimension group per model. gains_df is accs_df shifted by a per-model
# constant, so one argmax serves the accuracy, gain and median lookups and
# keeps every field of a record referring to the same group.
best_groups = gains_df.idxmax()

# NOTE(review): every execution appends one more entry per model, so
# re-running this cell records the same run twice.
for label in labels:
    # Look up by model label rather than by deprecated integer position.
    best_group = best_groups[label]
    record = best_res[label]
    record['best_dim_set'].append(best_group)
    record['best_dims'].append(dims[label][best_group])
    record['accs'].append(accs_df.loc[best_group, label])
    record['gains'].append(gains_df.loc[best_group, label])
    record['medians_0'].append(medians[label][best_group]['0'])
    record['medians_1'].append(medians[label][best_group]['1'])

In [58]:
# Write the accumulated results back to disk, overwriting the previous file.
with open('../Data/best_results/number_noun.pickle', 'wb') as f:
    pickle.dump(best_res, f)

# Conclusion

1. mBERT-base-cased achieves its highest accuracy when the whole WE vector is used (no dimension subset improves on it), and the improvement for mBERT-base-uncased is not very significant.
2. Similar to previous experiment results, cased models appear to achieve higher accuracy (mBERT cased, FlauBERT cased).
3. Overall the number in nouns seems to be encoded relatively better than gender throughout all models (they achieve higher accuracy).
4. Once again, smaller models (XLM-base, DistilBERT, FlauBERT-small) achieve comparable or better accuracies than their bigger counterparts.