In [1]:
import sys
sys.path.append('../Util')
import pickle
import os

In [2]:
from IPython.display import Image

In [3]:
from evaluation import correlation, lr, perceptron, get_anova_dims, get_mi_dims
from preparation import prepare_dataset, read_datasets

In [4]:
from we import get_we, initiate_model

In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [6]:
from scipy.stats import f_oneway

In [7]:
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler, QuantileTransformer

In [8]:
import warnings
warnings.filterwarnings('ignore')

In [9]:
models = [
    {
        'name': 'flaubert/flaubert_small_cased',
        'label': 'flau_small_c'
    },
    {
    
        'name': 'flaubert/flaubert_base_uncased', 
        'label': 'flau_base_u'

    },
    {
        'name': 'flaubert/flaubert_base_cased',
        'label': 'flau_base_c'
    },
    {
        'name': 'flaubert/flaubert_large_cased',
        'label': 'flau_large_c'
    },
    {
        'name': 'camembert/camembert-base',
        'label': 'cam_base'
    },
    {
        'name': 'xlm-roberta-large',
        'label': 'xlm_large'
    },
    {
    
        'name': 'xlm-roberta-base', 
        'label': 'xlm_base'

    },
    {
        'name': 'bert-base-multilingual-uncased',
        'label': 'bert_base_u'
    },
    {
        'name': 'distilbert-base-multilingual-cased',
        'label': 'distilbert_base'
    },
    {
        'name': 'bert-base-multilingual-cased',
        'label': 'bert_base_c'
    }
    
]

In [10]:
labels = [m['label'] for m in models]

In [11]:
we_with_features = read_datasets(
                            path = '../Data',
                            model_labels = labels,
                            file_name = 'all_unique_pos_we.csv'
                    )

In [12]:
we_with_features[-1].head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,764,765,766,767,Number,Gender,Lemma,POS,Tense,Person
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2D,0.909603,0.551025,0.436937,0.789782,0.311517,0.417476,0.629341,0.913722,0.349834,0.838988,...,0.285154,0.057887,0.030874,0.310204,invariable,feminine,2D,NOUN,,
3D,0.901779,0.54988,0.427826,0.797102,0.3139,0.419226,0.623478,0.916539,0.355807,0.844714,...,0.28761,0.054251,0.018604,0.305226,invariable,feminine,3D,NOUN,,
aa,0.89957,0.555891,0.418186,0.805754,0.319232,0.412976,0.618457,0.914183,0.367882,0.845985,...,0.287883,0.057476,0.013614,0.290463,invariable,masculine,aa,NOUN,,
abandon,0.898879,0.556306,0.417821,0.805256,0.318463,0.412505,0.618276,0.914983,0.367654,0.846577,...,0.288183,0.057827,0.01405,0.290994,singular,masculine,abandon,NOUN,,
abbaye,0.895888,0.554461,0.420659,0.799339,0.322441,0.418141,0.61882,0.921033,0.367005,0.845904,...,0.293985,0.057312,0.012175,0.298473,singular,feminine,abbaye,NOUN,,


In [14]:
# There are 6 extra feautres in addition to embedding dimensions in the file
feature_col_count = 6

# Feature to investigate in this notebook
feature = 'Gender'

In this notebook we will calculate **InfEnc** metric for word embeddings of different models, in order to compare the quality of information encoding about `Gender of Nouns and Adjectives`.


For this we will pick concatenate dimensions of word embedding of nouns and adjectives. Note that words that are both adjectives and nouns were discarded previously.
Feminine words will be assigned 1 in the feature vector, while masculine words will be assigned 0. Invariable words will be discared.

# Nouns & Adjectives

In [15]:
pos = ['NOUN', 'ADJ']

Split each model into train and test using k_fold cross validation:

In [16]:
X_na_train = []
y_na_train = []

X_na_test = []
y_na_test = []

In [17]:
n_folds = 5

In [18]:
for we in we_with_features:
    X, y = prepare_dataset(dataset=we[(we.Gender != 'invariable') & (we.Number != 'invariable') & \
                                      (we.POS.isin(pos))],
                                           feature_col_count=feature_col_count,
                                           feature_name=feature,
                                           normalize=False,
                                           encode=True,
                                           encode_as1='feminine',
                                           split=False,
                                           balance=True)
    X_trains = []
    y_trains = []
    
    X_tests = []
    y_tests = []
    
    X_folds = np.array_split(X, n_folds)
    y_folds = np.array_split(y, n_folds)
    
    for i in range(n_folds):
        X_trains.append(pd.DataFrame(np.concatenate(X_folds[:i] + X_folds[i+1:])))
        y_trains.append(np.concatenate(y_folds[:i] + y_folds[i+1:]))

        X_folds[i].columns = X_folds[i].columns.map(int)
        X_tests.append(X_folds[i])
        y_tests.append(y_folds[i])
        
    
    X_na_train.append(X_trains)
    X_na_test.append(X_tests)
    
    y_na_train.append(y_trains)
    y_na_test.append(y_tests)

In [19]:
dataset_sizes = pd.DataFrame(index=labels)
dataset_sizes['1 fold size'] = [len(x[0]) for x in X_na_test]
dataset_sizes

Unnamed: 0,1 fold size
flau_small_c,1792
flau_base_u,2296
flau_base_c,1792
flau_large_c,1792
cam_base,1207
xlm_large,396
xlm_base,396
bert_base_u,770
distilbert_base,528
bert_base_c,528


## Non-independent dims

Calculate non-independendent dimensions for each of _k_ splits.

ANOVA test with p-value < 0.01:

In [20]:
anova_dims = []

In [21]:
for i in range(len(models)):
    anova_dims.append([])
    for j in range(n_folds):
        anova_dims[i].append(get_anova_dims(X_na_train[i][j], y_na_train[i][j]))

Get dimensions where MI > 0:

In [22]:
mi_dims = []

In [23]:
for i in range(len(models)):
    mi_dims.append([])
    for j in range(n_folds):
        mi_dims[i].append(get_mi_dims(X_na_train[i][j], y_na_train[i][j]))

In [24]:
for i in range(len(models)):
    for j in range(n_folds):
        temp = list(map(lambda x: int(x), mi_dims[i][j]))
        mi_dims[i][j] = temp

Get dimensions that are both found by the ANOVA independency test and MI test:

In [25]:
non_ind_dims = []

In [26]:
for i in range(len(models)):
    non_ind_dims.append([])
    for j in range(n_folds):
        non_ind_dims[i].append(list(set(anova_dims[i][j]).intersection(mi_dims[i][j])))

Stats about the number of dimensions for each model for the 1st split:

In [27]:
non_ind_df = pd.DataFrame(index=labels, columns=['ANOVA', 'MI', 'Total non independent'])

In [28]:
non_ind_df['ANOVA'] = [len(x[0]) for x in anova_dims]
non_ind_df['MI'] = [len(x[0]) for x in mi_dims]
non_ind_df['Total non independent'] = [len(x[0]) for x in non_ind_dims]

In [29]:
non_ind_df

Unnamed: 0,ANOVA,MI,Total non independent
flau_small_c,347,344,253
flau_base_u,223,597,187
flau_base_c,326,411,187
flau_large_c,782,731,609
cam_base,323,540,231
xlm_large,411,633,275
xlm_base,272,406,145
bert_base_u,17,550,13
distilbert_base,239,489,163
bert_base_c,1,461,0


In [30]:
dims = {}

In [31]:
for i in range(len(models)):
    dims[labels[i]] = {}
    dims[labels[i]]['All dims'] = [X_na_train[i][0].columns]*n_folds
    dims[labels[i]]['ANOVA'] = anova_dims[i]
    dims[labels[i]]['MI'] = mi_dims[i]
    dims[labels[i]]['All non ind'] = non_ind_dims[i]

## Important dims

We can test different $\alpha$ values: 1%, 5%, 10%, 25%, 50%, 75%.

In [32]:
alphas = [1, 5, 10, 25, 50, 75]

Train Logistic Regression on train set for each model:

In [33]:
lr_res = []

In [34]:
for i in range(len(models)):
    lr_res.append([])
    for j in range(n_folds):
        lr_res[i].append(lr(X_na_train[i][j], y_na_train[i][j]))

Train Perceptron:

In [35]:
perceptron_res = []

In [36]:
for i in range(len(models)):
    perceptron_res.append([])
    for j in range(n_folds):
        perceptron_res[i].append(perceptron(X_na_train[i][j], y_na_train[i][j]))

Compute correlation to the gender vector:

In [37]:
corr_res = []

In [38]:
for i in range(len(models)):
    corr_res.append([])
    for j in range(n_folds):
        corr_res[i].append(correlation(X_na_train[i][j], y_na_train[i][j]))

In [39]:
for i in range(len(models)):
    for alpha in alphas:
        dims[labels[i]][f'LR{alpha}'] = []
        dims[labels[i]][f'Perc{alpha}'] = []
        dims[labels[i]][f'Corr{alpha}'] = []
        dims[labels[i]][f'All imp dims{alpha}'] = []
        for j in range(n_folds):
            num_imp_dims = len(X_na_test[i][0].columns)*alpha//100
            lr_dims = [x[0] for x in lr_res[i][j][:num_imp_dims]]
            perc_dims = [x[0] for x in perceptron_res[i][j][:num_imp_dims]]
            corr_dims = [x[0] for x in corr_res[i][j][:num_imp_dims]]
            dims[labels[i]][f'LR{alpha}'].append(lr_dims)
            dims[labels[i]][f'Perc{alpha}'].append(perc_dims)
            dims[labels[i]][f'Corr{alpha}'].append(corr_dims)
            dims[labels[i]][f'All imp dims{alpha}'].append(list(set(lr_dims).intersection(perc_dims).intersection(corr_dims)))

## Compute medians

For each of the dimension groups above, compute a median vector: the one that should describe the expected values of the dimensions the best.
We compute median vector associated with feature vector values = 0 and another one for feature vector values = 1.

In [41]:
medians = {}

In [42]:
for i in range(len(models)):
    medians[labels[i]] = {}
    for dim_group in dims[labels[i]].keys():
        medians[labels[i]][dim_group] = []
        n = len(dims[labels[i]][dim_group])
        for j in range(n_folds):
            dim_list = dims[labels[i]][dim_group][j] if n == n_folds else list(dims[labels[i]][dim_group])
            # Median of dimensions where feature vector is equal to 0
            median_0 = X_na_train[i][j][y_na_train[i][j] == 0][dim_list].median()
            # Median of dimensions where feature vector is equal to 1
            median_1 = X_na_train[i][j][y_na_train[i][j] == 1][dim_list].median()

            medians[labels[i]][dim_group].append({
                    '0': median_0,
                    '1': median_1
                })

We can compare the number of dimensions found by each test for each model in the first fold:

In [43]:
dim_lens = {}

for model in dims.keys():
    dim_lens[model] = {}
    for dim_group in dims[model].keys():
        dim_lens[model][dim_group] = len(dims[model][dim_group][0]) if dim_group != 'All dims' else \
                len(dims[model][dim_group])

In [44]:
pd.DataFrame(dim_lens)

Unnamed: 0,flau_small_c,flau_base_u,flau_base_c,flau_large_c,cam_base,xlm_large,xlm_base,bert_base_u,distilbert_base,bert_base_c
All dims,5,5,5,5,5,5,5,5,5,5
ANOVA,347,223,326,782,323,411,272,17,239,1
MI,344,597,411,731,540,633,406,550,489,461
All non ind,253,187,187,609,231,275,145,13,163,0
LR1,5,7,7,10,7,10,7,7,7,7
Perc1,5,7,7,10,7,10,7,7,7,7
Corr1,5,7,7,10,7,10,7,7,7,7
All imp dims1,1,1,0,2,0,0,1,0,0,0
LR5,25,38,38,51,38,51,38,38,38,38
Perc5,25,38,38,51,38,51,38,38,38,38


## Compute predictions

Using the median vectors computed for each group above, try to predict the value of the feature vector on test.
For this, for any word embedding in test compute distances to median for feature vector values = 0 and for feature vector values = 1. 
If the given word embedding is closer to median of feature vector values = 0, assign 0 as predicted label, 1 otherwise.

In [45]:
y_preds = {}

In [46]:
for i in range(len(models)):
    y_preds[labels[i]] = {}
    for dim_group in dims[labels[i]].keys():
        y_preds[labels[i]][dim_group] = []
        n = len(dims[labels[i]][dim_group])
        for j in range(n_folds):
            dim_list = dims[labels[i]][dim_group][j] if n == n_folds else dims[labels[i]][dim_group]
            
            # If MSE for sample 0 is lower than for sample 1, the label should be 0. So we need to convert False to 0.
            mae0 = X_na_test[i][j][dim_list].apply(lambda x: mean_absolute_error(medians[labels[i]][dim_group][j]['0'], x), axis=1)
            mae1 = X_na_test[i][j][dim_list].apply(lambda x: mean_absolute_error(medians[labels[i]][dim_group][j]['1'], x), axis=1)
            
            y_preds[labels[i]][dim_group].append((mae0 > mae1).apply(int))
    

## Compute accuracies

Using the predictions above, compute the accuracies for each dimension group.

In [47]:
accs = {}

In [48]:
for i in range(len(models)):
    accs[labels[i]] = {}
    for dim_group in dims[labels[i]].keys():
        accs[labels[i]][dim_group] = []
        for j in range(n_folds):
            y_true = y_na_test[i][j]
            y_pred = y_preds[labels[i]][dim_group][j]
            if any(y_pred):
                acc = accuracy_score(y_true, y_pred)
            else:
                acc = 0
            accs[labels[i]][dim_group].append(acc)

In [49]:
# Compute average accuracy between 5 folds
avg_accs = {}

In [50]:
for i in range(len(models)):
    avg_accs[labels[i]] = {}
    for dim_group in dims[labels[i]].keys():
        avg_accs[labels[i]][dim_group] = np.average(accs[labels[i]][dim_group])

In [51]:
accs_df = pd.DataFrame(avg_accs)
accs_df

Unnamed: 0,flau_small_c,flau_base_u,flau_base_c,flau_large_c,cam_base,xlm_large,xlm_base,bert_base_u,distilbert_base,bert_base_c
All dims,0.725611,0.550366,0.583388,0.886694,0.534981,0.56428,0.531377,0.500782,0.591797,0.52846
ANOVA,0.725834,0.561084,0.586178,0.886136,0.534816,0.563771,0.526816,0.518466,0.599763,0.206258
MI,0.71947,0.552806,0.587182,0.887476,0.534981,0.565287,0.531881,0.500782,0.584971,0.528458
All non ind,0.721368,0.560822,0.591647,0.887699,0.534816,0.571362,0.526819,0.516904,0.597869,0.10019
LR1,0.615315,0.52309,0.536501,0.671691,0.528679,0.561235,0.537444,0.500526,0.551608,0.528837
Perc1,0.632285,0.540258,0.581379,0.751955,0.53382,0.56579,0.538959,0.499479,0.585733,0.52694
Corr1,0.641326,0.549234,0.59098,0.75988,0.539125,0.579468,0.541488,0.518205,0.579651,0.529221
All imp dims1,0.580375,0.216685,0.0,0.607946,0.0,0.0,0.332383,0.103251,0.113472,0.0
LR5,0.710763,0.539125,0.593879,0.776289,0.535644,0.562754,0.527325,0.498182,0.591421,0.52846
Perc5,0.721254,0.571976,0.631613,0.852423,0.533654,0.585536,0.533902,0.501822,0.603176,0.523148


We can show what is the accuracy gain for each dimension test comparison to using all dimensions:

In [52]:
gains_df = (accs_df - accs_df.loc['All dims'])
gains_df

Unnamed: 0,flau_small_c,flau_base_u,flau_base_c,flau_large_c,cam_base,xlm_large,xlm_base,bert_base_u,distilbert_base,bert_base_c
All dims,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ANOVA,0.000223,0.010718,0.00279,-0.000558,-0.0001658375,-0.000509,-0.004560798,0.01768395,0.007966,-0.3222026
MI,-0.006141,0.00244,0.003794,0.000782,0.0,0.001008,0.0005037719,0.0,-0.006825,-2.156288e-06
All non ind,-0.004243,0.010457,0.008259,0.001005,-0.0001658375,0.007082,-0.004558241,0.01612146,0.006072,-0.4282704
LR1,-0.110296,-0.027276,-0.046887,-0.215003,-0.006302236,-0.003044,0.006066999,-0.0002567004,-0.040189,0.0003773504
Perc1,-0.093326,-0.010108,-0.002009,-0.13474,-0.001161412,0.00151,0.007582151,-0.001303092,-0.006063,-0.001520183
Corr1,-0.084285,-0.001131,0.007592,-0.126815,0.004143189,0.015189,0.01011124,0.01742253,-0.012146,0.0007604508
All imp dims1,-0.145236,-0.333681,-0.583388,-0.278748,-0.5349814,-0.56428,-0.1989937,-0.3975313,-0.478324,-0.5284601
LR5,-0.014848,-0.01124,0.010491,-0.110405,0.0006625255,-0.001525,-0.004051912,-0.002600442,-0.000376,-1.110223e-16
Perc5,-0.004356,0.02161,0.048225,-0.034271,-0.001327524,0.021257,0.002525253,0.001039299,0.011379,-0.005312374


Here are the dimension groups and the accuracy gain this dimension group provides over all dimensions:

In [53]:
for i in range(len(models)):
    print(labels[i], gains_df.idxmax()[i], gains_df.loc[gains_df.idxmax()[i], labels[i]])

flau_small_c Perc25 0.06831908799952136
flau_base_u Perc25 0.03398404348189121
flau_base_c Perc25 0.07032888898859369
flau_large_c Perc50 0.016744810361330575
cam_base Corr1 0.004143189053352425
xlm_large All imp dims25 0.027321314409921915
xlm_base All imp dims5 0.02986063163278363
bert_base_u ANOVA 0.017683954537010438
distilbert_base All imp dims50 0.023517192800874054
bert_base_c Corr1 0.0007604508078888905


Below, you can find the best achieved accuracy and the dimension group that produced it. This value is `InfEnc`.

In [54]:
for i in range(len(models)):
    print(labels[i], accs_df.idxmax()[i], accs_df.loc[accs_df.idxmax()[i], labels[i]])

flau_small_c Perc25 0.7939300296123475
flau_base_u Perc25 0.584349783273743
flau_base_c Perc25 0.6537168107402089
flau_large_c Perc50 0.9034389457206669
cam_base Corr1 0.5391245924478684
xlm_large All imp dims25 0.591600818309679
xlm_base All imp dims5 0.5612376933895922
bert_base_u ANOVA 0.5184662151892321
distilbert_base All imp dims50 0.6153139554942212
bert_base_c Corr1 0.5292205738600424


In [58]:
with open('../Data/best_results/gender_na_dims.pickle', 'wb') as f:
    pickle.dump(dims, f)

In [59]:
with open('../Data/best_results/gender_na_accs.pickle', 'wb') as f:
    pickle.dump(accs, f)

In [60]:
with open('../Data/best_results/gender_na_medians.pickle', 'wb') as f:
    pickle.dump(medians, f)

# Conclusion

(Similar to Gender of Nouns and Gender of Adjectives):
    
- Despite same vocabulary sizes, FlauBERT family models show different results. Bigger models (FlauBERT base) show worse results than the smaller model (FlauBERT small). Therefore, we can assume that a higher number of parameters doesn't guarantee better performance.
- Similarly, DistilBERT performs better than CamemBERT, which has significantly bigger vocabulary. Therefore, we can assume that vocabulary size doesn't have a big affect on the results.
- We can note that cased models (FlauBERT base and mBERT) perform better on the task than their uncased analogs.
- For all models a subset of dimensions and not all dimensions gave the best InfEnc results.