In [1]:
import sys
sys.path.append('../Util')
import pickle
import os

In [2]:
from IPython.display import Image

In [3]:
from evaluation import correlation, lr, perceptron, get_anova_dims, get_mi_dims
from preparation import prepare_dataset, read_datasets

In [4]:
from we import get_we, initiate_model

In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [6]:
from scipy.stats import f_oneway

In [7]:
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler, QuantileTransformer

In [8]:
import warnings
warnings.filterwarnings('ignore')

In [9]:
models = [
    {
        'name': 'flaubert/flaubert_small_cased',
        'label': 'flau_small_c'
    },
    {
    
        'name': 'flaubert/flaubert_base_uncased', 
        'label': 'flau_base_u'

    },
    {
        'name': 'flaubert/flaubert_base_cased',
        'label': 'flau_base_c'
    },
    {
        'name': 'flaubert/flaubert_large_cased',
        'label': 'flau_large_c'
    },
    {
        'name': 'camembert/camembert-base',
        'label': 'cam_base'
    },
    {
        'name': 'xlm-roberta-large',
        'label': 'xlm_large'
    },
    {
    
        'name': 'xlm-roberta-base', 
        'label': 'xlm_base'

    },
    {
        'name': 'bert-base-multilingual-uncased',
        'label': 'bert_base_u'
    },
    {
        'name': 'distilbert-base-multilingual-cased',
        'label': 'distilbert_base'
    },
    {
        'name': 'bert-base-multilingual-cased',
        'label': 'bert_base_c'
    }
    
]

In [10]:
labels = [m['label'] for m in models]

In [11]:
we_with_features = read_datasets(
                            path = '../Data',
                            model_labels = labels,
                            file_name = 'all_adjs_we.csv'
                    )

In [12]:
we_with_features[-1].head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,761,762,763,764,765,766,767,Gender,Number,Lemma
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
abandonné,0.530145,0.060065,0.007523,0.990865,0.430226,0.009984,0.979131,0.978007,0.169568,0.868192,...,0.03252,0.537425,0.982741,0.068059,0.106907,0.257188,0.76537,masculine,singular,abandonné
absent,0.531874,0.058954,0.008474,0.99232,0.430201,0.009588,0.979643,0.978505,0.169001,0.868324,...,0.030967,0.538838,0.983681,0.068282,0.106521,0.258556,0.765498,masculine,singular,absent
absolue,0.530525,0.059162,0.006882,0.99101,0.430248,0.010904,0.97886,0.97844,0.167912,0.868436,...,0.03162,0.535794,0.983176,0.068435,0.106496,0.259584,0.765379,feminine,singular,absolu
accessible,0.533019,0.059168,0.008158,0.993498,0.430884,0.008257,0.979684,0.97807,0.167863,0.868585,...,0.027722,0.534105,0.983887,0.06777,0.107722,0.258436,0.763149,invariable,singular,accessible
accompagné,0.52827,0.0593,0.005714,0.989006,0.430188,0.010091,0.980515,0.978253,0.171316,0.867668,...,0.035285,0.540979,0.982319,0.070023,0.107055,0.257919,0.768394,masculine,singular,accompagné


In [13]:
# There are 3 extra feautres in addition to embedding dimensions in the file: number, gender, lemma, semantic info
feature_col_count = 4

# Feature to investigate in this notebook
feature = 'Number'

In this notebook we will calculate **InfEnc** metric for word embeddings of different models, in order to compare the quality of information encoding about `Number of Adjectives`.

# Adjectives

In [15]:
pos = ['ADJ']

Split each model into train and test using k_fold cross validation:

In [16]:
X_adj_train = []
y_adj_train = []

X_adj_test = []
y_adj_test = []

In [17]:
n_folds = 5

In [18]:
for we in we_with_features:
    X, y = prepare_dataset(dataset=we[(we.Gender != 'invariable') & (we.Number != 'invariable')],
                                           feature_col_count=feature_col_count,
                                           feature_name=feature,
                                           normalize=False,
                                           encode=True,
                                           encode_as1='plural',
                                           split=False,
                                           balance=True)
    X_trains = []
    y_trains = []
    
    X_tests = []
    y_tests = []
    
    X_folds = np.array_split(X, n_folds)
    y_folds = np.array_split(y, n_folds)
    
    for i in range(n_folds):
        X_trains.append(pd.DataFrame(np.concatenate(X_folds[:i] + X_folds[i+1:])))
        y_trains.append(np.concatenate(y_folds[:i] + y_folds[i+1:]))

        X_folds[i].columns = X_folds[i].columns.map(int)
        X_tests.append(X_folds[i])
        y_tests.append(y_folds[i])
        
    
    X_adj_train.append(X_trains)
    X_adj_test.append(X_tests)
    
    y_adj_train.append(y_trains)
    y_adj_test.append(y_tests)

In [19]:
dataset_sizes = pd.DataFrame(index=labels)
dataset_sizes['1 fold size'] = [len(x[0]) for x in X_adj_test]
dataset_sizes

Unnamed: 0,1 fold size
flau_small_c,968
flau_base_u,1138
flau_base_c,968
flau_large_c,968
cam_base,341
xlm_large,32
xlm_base,32
bert_base_u,198
distilbert_base,89
bert_base_c,89


## Non-independent dims

Calculate non-independendent dimensions for each of _k_ splits.

ANOVA test with p-value < 0.01:

In [20]:
anova_dims = []

In [21]:
for i in range(len(models)):
    anova_dims.append([])
    for j in range(n_folds):
        anova_dims[i].append(get_anova_dims(X_adj_train[i][j], y_adj_train[i][j]))

Get dimensions where MI > 0:

In [22]:
mi_dims = []

In [23]:
for i in range(len(models)):
    mi_dims.append([])
    for j in range(n_folds):
        mi_dims[i].append(get_mi_dims(X_adj_train[i][j], y_adj_train[i][j]))

In [24]:
for i in range(len(models)):
    for j in range(n_folds):
        temp = list(map(lambda x: int(x), mi_dims[i][j]))
        mi_dims[i][j] = temp

Get dimensions that are both found by the ANOVA independency test and MI test:

In [25]:
non_ind_dims = []

In [26]:
for i in range(len(models)):
    non_ind_dims.append([])
    for j in range(n_folds):
        non_ind_dims[i].append(list(set(anova_dims[i][j]).intersection(mi_dims[i][j])))

Stats about the number of dimensions for each model for the 1st split:

In [27]:
non_ind_df = pd.DataFrame(index=labels, columns=['ANOVA', 'MI', 'Total non independent'])

In [28]:
non_ind_df['ANOVA'] = [len(x[0]) for x in anova_dims]
non_ind_df['MI'] = [len(x[0]) for x in mi_dims]
non_ind_df['Total non independent'] = [len(x[0]) for x in non_ind_dims]

In [29]:
non_ind_df

Unnamed: 0,ANOVA,MI,Total non independent
flau_small_c,378,367,298
flau_base_u,304,460,203
flau_base_c,488,505,360
flau_large_c,773,768,629
cam_base,106,412,65
xlm_large,41,505,27
xlm_base,1,219,0
bert_base_u,498,639,417
distilbert_base,115,411,89
bert_base_c,12,542,10


In [30]:
dims = {}

In [31]:
for i in range(len(models)):
    dims[labels[i]] = {}
    dims[labels[i]]['All dims'] = [X_adj_train[i][0].columns]*n_folds
    dims[labels[i]]['ANOVA'] = anova_dims[i]
    dims[labels[i]]['MI'] = mi_dims[i]
    dims[labels[i]]['All non ind'] = non_ind_dims[i]

## Important dims

We can test different $\alpha$ values: 1%, 5%, 10%, 25%, 50%, 75%.

In [32]:
alphas = [1, 5, 10, 25, 50, 75]

Train Logistic Regression on train set for each model:

In [33]:
lr_res = []

In [34]:
for i in range(len(models)):
    lr_res.append([])
    for j in range(n_folds):
        lr_res[i].append(lr(X_adj_train[i][j], y_adj_train[i][j]))

Train Perceptron:

In [35]:
perceptron_res = []

In [36]:
for i in range(len(models)):
    perceptron_res.append([])
    for j in range(n_folds):
        perceptron_res[i].append(perceptron(X_adj_train[i][j], y_adj_train[i][j]))

Compute correlation to the gender vector:

In [37]:
corr_res = []

In [38]:
for i in range(len(models)):
    corr_res.append([])
    for j in range(n_folds):
        corr_res[i].append(correlation(X_adj_train[i][j], y_adj_train[i][j]))

In [39]:
for i in range(len(models)):
    for alpha in alphas:
        dims[labels[i]][f'LR{alpha}'] = []
        dims[labels[i]][f'Perc{alpha}'] = []
        dims[labels[i]][f'Corr{alpha}'] = []
        dims[labels[i]][f'All imp dims{alpha}'] = []
        for j in range(n_folds):
            num_imp_dims = len(X_adj_test[i][0].columns)*alpha//100
            lr_dims = [x[0] for x in lr_res[i][j][:num_imp_dims]]
            perc_dims = [x[0] for x in perceptron_res[i][j][:num_imp_dims]]
            corr_dims = [x[0] for x in corr_res[i][j][:num_imp_dims]]
            dims[labels[i]][f'LR{alpha}'].append(lr_dims)
            dims[labels[i]][f'Perc{alpha}'].append(perc_dims)
            dims[labels[i]][f'Corr{alpha}'].append(corr_dims)
            dims[labels[i]][f'All imp dims{alpha}'].append(list(set(lr_dims).intersection(perc_dims).intersection(corr_dims)))

## Compute medians

For each of the dimension groups above, compute a median vector: the one that should describe the expected values of the dimensions the best.
We compute median vector associated with feature vector values = 0 and another one for feature vector values = 1.

In [41]:
medians = {}

In [42]:
for i in range(len(models)):
    medians[labels[i]] = {}
    for dim_group in dims[labels[i]].keys():
        medians[labels[i]][dim_group] = []
        n = len(dims[labels[i]][dim_group])
        for j in range(n_folds):
            dim_list = dims[labels[i]][dim_group][j] if n == n_folds else list(dims[labels[i]][dim_group])
            # Median of dimensions where feature vector is equal to 0
            median_0 = X_adj_train[i][j][y_adj_train[i][j] == 0][dim_list].median()
            # Median of dimensions where feature vector is equal to 1
            median_1 = X_adj_train[i][j][y_adj_train[i][j] == 1][dim_list].median()

            medians[labels[i]][dim_group].append({
                    '0': median_0,
                    '1': median_1
                })

We can compare the number of dimensions found by each test for each model in the first fold:

In [43]:
dim_lens = {}

for model in dims.keys():
    dim_lens[model] = {}
    for dim_group in dims[model].keys():
        dim_lens[model][dim_group] = len(dims[model][dim_group][0]) if dim_group != 'All dims' else \
                len(dims[model][dim_group])

In [44]:
pd.DataFrame(dim_lens)

Unnamed: 0,flau_small_c,flau_base_u,flau_base_c,flau_large_c,cam_base,xlm_large,xlm_base,bert_base_u,distilbert_base,bert_base_c
All dims,5,5,5,5,5,5,5,5,5,5
ANOVA,378,304,488,773,106,41,1,498,115,12
MI,367,460,505,768,412,505,219,639,411,542
All non ind,298,203,360,629,65,27,0,417,89,10
LR1,5,7,7,10,7,10,7,7,7,7
Perc1,5,7,7,10,7,10,7,7,7,7
Corr1,5,7,7,10,7,10,7,7,7,7
All imp dims1,2,1,0,3,0,0,0,0,1,0
LR5,25,38,38,51,38,51,38,38,38,38
Perc5,25,38,38,51,38,51,38,38,38,38


## Compute predictions

In [45]:
y_preds = {}

In [46]:
for i in range(len(models)):
    y_preds[labels[i]] = {}
    for dim_group in dims[labels[i]].keys():
        y_preds[labels[i]][dim_group] = []
        n = len(dims[labels[i]][dim_group])
        for j in range(n_folds):
            dim_list = dims[labels[i]][dim_group][j] if n == n_folds else dims[labels[i]][dim_group]
            
            # If MSE for sample 0 is lower than for sample 1, the label should be 0. So we need to convert False to 0.
            mae0 = X_adj_test[i][j][dim_list].apply(lambda x: mean_absolute_error(medians[labels[i]][dim_group][j]['0'], x), axis=1)
            mae1 = X_adj_test[i][j][dim_list].apply(lambda x: mean_absolute_error(medians[labels[i]][dim_group][j]['1'], x), axis=1)
            
            y_preds[labels[i]][dim_group].append((mae0 > mae1).apply(int))
    

## Compute accuracies

Using the median vectors computed for each group above, try to predict the value of the feature vector on test.
For this, for any word embedding in test compute distances to median for feature vector values = 0 and for feature vector values = 1. 
If the given word embedding is closer to median of feature vector values = 0, assign 0 as predicted label, 1 otherwise.

In [47]:
accs = {}

In [48]:
for i in range(len(models)):
    accs[labels[i]] = {}
    for dim_group in dims[labels[i]].keys():
        accs[labels[i]][dim_group] = []
        for j in range(n_folds):
            y_true = y_adj_test[i][j]
            y_pred = y_preds[labels[i]][dim_group][j]
            if any(y_pred):
                acc = accuracy_score(y_true, y_pred)
            else:
                acc = 0
            accs[labels[i]][dim_group].append(acc)

In [49]:
# Compute average accuracy between 5 folds
avg_accs = {}

In [50]:
for i in range(len(models)):
    avg_accs[labels[i]] = {}
    for dim_group in dims[labels[i]].keys():
        avg_accs[labels[i]][dim_group] = np.average(accs[labels[i]][dim_group])

In [51]:
accs_df = pd.DataFrame(avg_accs)
accs_df

Unnamed: 0,flau_small_c,flau_base_u,flau_base_c,flau_large_c,cam_base,xlm_large,xlm_base,bert_base_u,distilbert_base,bert_base_c
All dims,0.929959,0.543761,0.636157,0.947934,0.530545,0.576815,0.525605,0.55374,0.62191,0.535981
ANOVA,0.930372,0.564323,0.639669,0.948554,0.533483,0.595766,0.30625,0.554761,0.630975,0.542773
MI,0.930579,0.549033,0.650826,0.946281,0.530545,0.576613,0.55121,0.554756,0.601558,0.535981
All non ind,0.929339,0.573111,0.65186,0.945661,0.534663,0.589315,0.103226,0.554761,0.624157,0.538279
LR1,0.780579,0.560808,0.609504,0.875,0.521754,0.493952,0.609073,0.552725,0.66951,0.542824
Perc1,0.747934,0.58471,0.705785,0.891942,0.527012,0.564516,0.54496,0.556781,0.606078,0.556333
Corr1,0.754752,0.581195,0.708678,0.88905,0.546992,0.563911,0.55121,0.557801,0.637717,0.549591
All imp dims1,0.718388,0.555712,0.141116,0.830579,0.0,0.0,0.348387,0.0,0.371885,0.0
LR5,0.872521,0.568541,0.706405,0.933264,0.524075,0.583266,0.55746,0.55273,0.674132,0.535981
Perc5,0.871281,0.595079,0.729339,0.935537,0.535249,0.576613,0.518952,0.55374,0.628779,0.554086


We can show what is the accuracy gain for each dimension test comparison to using all dimensions:

In [52]:
gains_df = (accs_df - accs_df.loc['All dims'])
gains_df

Unnamed: 0,flau_small_c,flau_base_u,flau_base_c,flau_large_c,cam_base,xlm_large,xlm_base,bert_base_u,distilbert_base,bert_base_c
All dims,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ANOVA,0.000413,0.020562,0.003512,0.00062,0.002938,0.018952,-0.219355,0.00102,0.009065373,0.006793
MI,0.00062,0.005272,0.014669,-0.001653,0.0,-0.000202,0.025605,0.001015,-0.0203524,0.0
All non ind,-0.00062,0.02935,0.015702,-0.002273,0.004118,0.0125,-0.422379,0.00102,0.002247191,0.002298
LR1,-0.14938,0.017047,-0.026653,-0.072934,-0.008791,-0.082863,0.083468,-0.001015,0.04759959,0.006844
Perc1,-0.182025,0.040949,0.069628,-0.055992,-0.003533,-0.012298,0.019355,0.003041,-0.01583248,0.020352
Corr1,-0.175207,0.037434,0.072521,-0.058884,0.016446,-0.012903,0.025605,0.004061,0.01580695,0.013611
All imp dims1,-0.21157,0.011951,-0.495041,-0.117355,-0.530545,-0.576815,-0.177218,-0.55374,-0.2500255,-0.535981
LR5,-0.057438,0.02478,0.070248,-0.014669,-0.006471,0.006452,0.031855,-0.00101,0.05222165,0.0
Perc5,-0.058678,0.051318,0.093182,-0.012397,0.004704,-0.000202,-0.006653,0.0,0.006869254,0.018105


Here are the dimension groups and the accuracy gain this dimension group provides over all dimensions:

In [53]:
for i in range(len(models)):
    print(labels[i], gains_df.idxmax()[i], gains_df.loc[gains_df.idxmax()[i], labels[i]])

flau_small_c LR50 0.02438016528925624
flau_base_u Perc5 0.051318101933216065
flau_base_c Perc10 0.09586776859504142
flau_large_c All imp dims50 0.009504132231404894
cam_base Corr1 0.01644643781266164
xlm_large All imp dims50 0.025604838709677336
xlm_base LR1 0.08346774193548379
bert_base_u All imp dims10 0.005065887299389837
distilbert_base LR10 0.07254851889683345
bert_base_c Perc1 0.020352400408580107


Below, you can find the best achieved accuracy and the dimension group that produced it. This value is `InfEnc`.

In [54]:
for i in range(len(models)):
    print(labels[i], accs_df.idxmax()[i], accs_df.loc[accs_df.idxmax()[i], labels[i]])

flau_small_c LR50 0.9543388429752067
flau_base_u Perc5 0.5950790861159929
flau_base_c Perc10 0.7320247933884297
flau_large_c All imp dims50 0.9574380165289256
cam_base Corr1 0.5469915473520786
xlm_large All imp dims50 0.6024193548387096
xlm_base LR1 0.6090725806451612
bert_base_u All imp dims10 0.5588063374865405
distilbert_base LR10 0.6944586312563841
bert_base_c Perc1 0.5563329928498467


In [58]:
with open('../Data/best_results/number_adj_dims.pickle', 'wb') as f:
    pickle.dump(dims, f)

In [59]:
with open('../Data/best_results/number_adj_accs.pickle', 'wb') as f:
    pickle.dump(accs, f)

In [60]:
with open('../Data/best_results/number_adj_medians.pickle', 'wb') as f:
    pickle.dump(medians, f)

# Conclusion

(Similar to number of nouns)
- FlauBERT small and large show very similar results despite FlauBERT large being 10x larger in terms of paramters
- DistilBERT shows much better results for number than for gender, alongside with XLM