In [1]:
import sys
sys.path.append('../Util')
import pickle
import os

In [2]:
from IPython.display import Image

In [3]:
from evaluation import correlation, lr, perceptron, get_anova_dims, get_mi_dims
from preparation import prepare_dataset, read_datasets

In [4]:
from we import get_we, initiate_model

In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [6]:
from scipy.stats import f_oneway

In [7]:
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler, QuantileTransformer

In [8]:
import warnings
warnings.filterwarnings('ignore')

In [9]:
models = [
    {
        'name': 'flaubert/flaubert_small_cased',
        'label': 'flau_small_c'
    },
    {
    
        'name': 'flaubert/flaubert_base_uncased', 
        'label': 'flau_base_u'

    },
    {
        'name': 'flaubert/flaubert_base_cased',
        'label': 'flau_base_c'
    },
    {
        'name': 'flaubert/flaubert_large_cased',
        'label': 'flau_large_c'
    },
    {
        'name': 'camembert/camembert-base',
        'label': 'cam_base'
    },
    {
        'name': 'xlm-roberta-large',
        'label': 'xlm_large'
    },
    {
    
        'name': 'xlm-roberta-base', 
        'label': 'xlm_base'

    },
    {
        'name': 'bert-base-multilingual-uncased',
        'label': 'bert_base_u'
    },
    {
        'name': 'distilbert-base-multilingual-cased',
        'label': 'distilbert_base'
    },
    {
        'name': 'bert-base-multilingual-cased',
        'label': 'bert_base_c'
    }
    
]

In [10]:
labels = [m['label'] for m in models]

In [11]:
we_with_features = read_datasets(
                            path = '../Data',
                            model_labels = labels,
                            file_name = 'all_nouns_we.csv'
                    )

In [12]:
we_with_features[-1].head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,762,763,764,765,766,767,Number,Gender,Lemma,Semantic
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2D,0.836139,0.551025,0.347197,0.789782,0.393246,0.300031,0.609232,0.913722,0.178287,0.838988,...,0.586894,0.674134,0.276217,0.081619,0.154076,0.553278,invariable,feminine,2D,Other
3D,0.828946,0.54988,0.339958,0.797102,0.395346,0.301288,0.603051,0.916539,0.185836,0.844714,...,0.584636,0.674752,0.278596,0.078074,0.143366,0.548315,invariable,feminine,3D,Other
a,0.82666,0.556389,0.332618,0.805891,0.399844,0.295803,0.597985,0.914675,0.201301,0.846672,...,0.577705,0.673679,0.278649,0.081689,0.13949,0.533415,invariable,masculine,a,Attribute
aa,0.826916,0.555891,0.332298,0.805754,0.400046,0.296797,0.597758,0.914183,0.201098,0.845985,...,0.578594,0.673429,0.278861,0.081218,0.13901,0.533594,invariable,masculine,aa,Other
abandon,0.826281,0.556306,0.332007,0.805256,0.399368,0.296458,0.597567,0.914983,0.200809,0.846577,...,0.578808,0.673362,0.279152,0.08156,0.13939,0.534123,singular,masculine,abandon,Other


In [13]:
# There are 3 extra feautres in addition to embedding dimensions in the file: number, gender, lemma, semantic info
feature_col_count = 4

# Feature to investigate in this notebook
feature = 'Gender'

In this notebook we will calculate **InfEnc** metric for word embeddings of different models, in order to compare the quality of information encoding about `Gender of Nouns`.

Feminine words will be assigned 1 in the feature vector, while masculine words will be assigned 0. Invariable words will be discared.

# Nouns

In [14]:
pos = ['NOUN']

Split each model into train and test using k_fold cross validation:

In [15]:
X_noun_train = []
y_noun_train = []

X_noun_test = []
y_noun_test = []

In [16]:
n_folds = 5

In [17]:
for we in we_with_features:
    X, y = prepare_dataset(dataset=we[(we.Gender != 'invariable') & (we.Number != 'invariable')],
                                           feature_col_count=feature_col_count,
                                           feature_name=feature,
                                           normalize=False,
                                           encode=True,
                                           encode_as1='feminine',
                                           split=False,
                                           balance=True)
    X_trains = []
    y_trains = []
    
    X_tests = []
    y_tests = []
    
    X_folds = np.array_split(X, n_folds)
    y_folds = np.array_split(y, n_folds)
    
    for i in range(n_folds):
        X_trains.append(pd.DataFrame(np.concatenate(X_folds[:i] + X_folds[i+1:])))
        y_trains.append(np.concatenate(y_folds[:i] + y_folds[i+1:]))

        X_folds[i].columns = X_folds[i].columns.map(int)
        X_tests.append(X_folds[i])
        y_tests.append(y_folds[i])
        
    
    X_noun_train.append(X_trains)
    X_noun_test.append(X_tests)
    
    y_noun_train.append(y_trains)
    y_noun_test.append(y_tests)

In [18]:
dataset_sizes = pd.DataFrame(index=labels)
dataset_sizes['1 fold size'] = [len(x[0]) for x in X_noun_test]
dataset_sizes

Unnamed: 0,1 fold size
flau_small_c,2365
flau_base_u,2802
flau_base_c,2365
flau_large_c,2365
cam_base,1575
xlm_large,458
xlm_base,458
bert_base_u,885
distilbert_base,618
bert_base_c,618


Notably, FlauBERT-family vocabulary is the largest out of all other model families.

## Non-independent dims

Calculate non-independendent dimensions for each of _k_ splits.

ANOVA test with p-value < 0.01:

In [19]:
anova_dims = []

In [20]:
for i in range(len(models)):
    anova_dims.append([])
    for j in range(n_folds):
        anova_dims[i].append(get_anova_dims(X_noun_train[i][j], y_noun_train[i][j]))

Get dimensions where MI > 0:

In [21]:
mi_dims = []

In [22]:
for i in range(len(models)):
    mi_dims.append([])
    for j in range(n_folds):
        mi_dims[i].append(get_mi_dims(X_noun_train[i][j], y_noun_train[i][j]))

In [23]:
for i in range(len(models)):
    for j in range(n_folds):
        temp = list(map(lambda x: int(x), mi_dims[i][j]))
        mi_dims[i][j] = temp

Get dimensions that are both found by the ANOVA independency test and MI test:

In [24]:
non_ind_dims = []

In [25]:
for i in range(len(models)):
    non_ind_dims.append([])
    for j in range(n_folds):
        non_ind_dims[i].append(list(set(anova_dims[i][j]).intersection(mi_dims[i][j])))

Stats about the number of dimensions for each model for the 1st split:

In [26]:
non_ind_df = pd.DataFrame(index=labels, columns=['ANOVA', 'MI', 'Total non independent'])

In [27]:
non_ind_df['ANOVA'] = [len(x[0]) for x in anova_dims]
non_ind_df['MI'] = [len(x[0]) for x in mi_dims]
non_ind_df['Total non independent'] = [len(x[0]) for x in non_ind_dims]

In [28]:
non_ind_df

Unnamed: 0,ANOVA,MI,Total non independent
flau_small_c,360,332,256
flau_base_u,259,683,234
flau_base_c,344,422,208
flau_large_c,812,765,646
cam_base,196,447,121
xlm_large,304,565,184
xlm_base,55,431,33
bert_base_u,11,498,8
distilbert_base,255,443,170
bert_base_c,0,468,0


In [106]:
dims = {}

In [107]:
for i in range(len(models)):
    dims[labels[i]] = {}
    dims[labels[i]]['All dims'] = [X_noun_train[i][0].columns]*n_folds
    dims[labels[i]]['ANOVA'] = anova_dims[i]
    dims[labels[i]]['MI'] = mi_dims[i]
    dims[labels[i]]['All non ind'] = non_ind_dims[i]

## Important dims

We can test different $\alpha$ values: 1%, 5%, 10%, 25%, 50%, 75%.

In [31]:
alphas = [1, 5, 10, 25, 50, 75]

Train Logistic Regression on train set for each model:

In [32]:
lr_res = []

In [33]:
for i in range(len(models)):
    lr_res.append([])
    for j in range(n_folds):
        lr_res[i].append(lr(X_noun_train[i][j], y_noun_train[i][j]))

Train Perceptron:

In [34]:
perceptron_res = []

In [35]:
for i in range(len(models)):
    perceptron_res.append([])
    for j in range(n_folds):
        perceptron_res[i].append(perceptron(X_noun_train[i][j], y_noun_train[i][j]))

Compute correlation to the gender vector:

In [36]:
corr_res = []

In [37]:
for i in range(len(models)):
    corr_res.append([])
    for j in range(n_folds):
        corr_res[i].append(correlation(X_noun_train[i][j], y_noun_train[i][j]))

In [108]:
for i in range(len(models)):
    for alpha in alphas:
        dims[labels[i]][f'LR{alpha}'] = []
        dims[labels[i]][f'Perc{alpha}'] = []
        dims[labels[i]][f'Corr{alpha}'] = []
        dims[labels[i]][f'All imp dims{alpha}'] = []
        for j in range(n_folds):
            num_imp_dims = len(X_noun_test[i][0].columns)*alpha//100
            lr_dims = [x[0] for x in lr_res[i][j][:num_imp_dims]]
            perc_dims = [x[0] for x in perceptron_res[i][j][:num_imp_dims]]
            corr_dims = [x[0] for x in corr_res[i][j][:num_imp_dims]]
            dims[labels[i]][f'LR{alpha}'].append(lr_dims)
            dims[labels[i]][f'Perc{alpha}'].append(perc_dims)
            dims[labels[i]][f'Corr{alpha}'].append(corr_dims)
            dims[labels[i]][f'All imp dims{alpha}'].append(list(set(lr_dims).intersection(perc_dims).intersection(corr_dims)))

## Compute medians

For each of the dimension groups above, compute a median vector: the one that should describe the expected values of the dimensions the best.
We compute median vector associated with feature vector values = 0 and another one for feature vector values = 1.

In [40]:
medians = {}

In [41]:
for i in range(len(models)):
    medians[labels[i]] = {}
    for dim_group in dims[labels[i]].keys():
        medians[labels[i]][dim_group] = []
        n = len(dims[labels[i]][dim_group])
        for j in range(n_folds):
            dim_list = dims[labels[i]][dim_group][j] if n == n_folds else list(dims[labels[i]][dim_group])
            # Median of dimensions where feature vector is equal to 0
            median_0 = X_noun_train[i][j][y_noun_train[i][j] == 0][dim_list].median()
            # Median of dimensions where feature vector is equal to 1
            median_1 = X_noun_train[i][j][y_noun_train[i][j] == 1][dim_list].median()

            medians[labels[i]][dim_group].append({
                    '0': median_0,
                    '1': median_1
                })

We can compare the number of dimensions found by each test for each model in the first fold:

In [110]:
dim_lens = {}

for model in dims.keys():
    dim_lens[model] = {}
    for dim_group in dims[model].keys():
        dim_lens[model][dim_group] = len(dims[model][dim_group][0]) if dim_group != 'All dims' else \
                len(dims[model][dim_group])

Below you can see the number of dimensions found by each type of testing for dimension candidates:

In [111]:
pd.DataFrame(dim_lens)

Unnamed: 0,flau_small_c,flau_base_u,flau_base_c,flau_large_c,cam_base,xlm_large,xlm_base,bert_base_u,distilbert_base,bert_base_c
All dims,5,5,5,5,5,5,5,5,5,5
ANOVA,360,259,344,812,196,304,55,11,255,0
MI,332,683,422,765,447,565,431,498,443,468
All non ind,256,234,208,646,121,184,33,8,170,0
LR1,5,7,7,10,7,10,7,7,7,7
Perc1,5,7,7,10,7,10,7,7,7,7
Corr1,5,7,7,10,7,10,7,7,7,7
All imp dims1,1,0,0,0,0,0,1,0,1,0
LR5,25,38,38,51,38,51,38,38,38,38
Perc5,25,38,38,51,38,51,38,38,38,38


## Compute predictions

Using the median vectors computed for each group above, try to predict the value of the feature vector on test.
For this, for any word embedding in test compute distances to median for feature vector values = 0 and for feature vector values = 1. 
If the given word embedding is closer to median of feature vector values = 0, assign 0 as predicted label, 1 otherwise.

In [44]:
y_preds = {}

In [45]:
for i in range(len(models)):
    y_preds[labels[i]] = {}
    for dim_group in dims[labels[i]].keys():
        y_preds[labels[i]][dim_group] = []
        n = len(dims[labels[i]][dim_group])
        for j in range(n_folds):
            dim_list = dims[labels[i]][dim_group][j] if n == n_folds else dims[labels[i]][dim_group]
            
            # If MSE for sample 0 is lower than for sample 1, the label should be 0. So we need to convert False to 0.
            mae0 = X_noun_test[i][j][dim_list].apply(lambda x: mean_absolute_error(medians[labels[i]][dim_group][j]['0'], x), axis=1)
            mae1 = X_noun_test[i][j][dim_list].apply(lambda x: mean_absolute_error(medians[labels[i]][dim_group][j]['1'], x), axis=1)
            
            y_preds[labels[i]][dim_group].append((mae0 > mae1).apply(int))
    

## Compute accuracies

Using the predictions above, compute the accuracies for each dimension group.

In [61]:
accs = {}

In [62]:
for i in range(len(models)):
    accs[labels[i]] = {}
    for dim_group in dims[labels[i]].keys():
        accs[labels[i]][dim_group] = []
        for j in range(n_folds):
            y_true = y_noun_test[i][j]
            y_pred = y_preds[labels[i]][dim_group][j]
            if any(y_pred):
                acc = accuracy_score(y_true, y_pred)
            else:
                acc = 0
            accs[labels[i]][dim_group].append(acc)

In [72]:
# Compute average accuracy between 5 folds
avg_accs = {}

In [76]:
for i in range(len(models)):
    avg_accs[labels[i]] = {}
    for dim_group in dims[labels[i]].keys():
        avg_accs[labels[i]][dim_group] = np.average(accs[labels[i]][dim_group])

In [78]:
accs_df = pd.DataFrame(avg_accs)
accs_df

Unnamed: 0,flau_small_c,flau_base_u,flau_base_c,flau_large_c,cam_base,xlm_large,xlm_base,bert_base_u,distilbert_base,bert_base_c
All dims,0.735877,0.567034,0.632526,0.879398,0.518292,0.542388,0.523601,0.479434,0.560194,0.538835
ANOVA,0.737822,0.578884,0.646058,0.878975,0.519181,0.548951,0.528857,0.490733,0.566019,0.210032
MI,0.736807,0.56789,0.627029,0.879567,0.518292,0.543269,0.523165,0.47966,0.56343,0.538835
All non ind,0.736469,0.576314,0.63481,0.879483,0.520452,0.546766,0.529296,0.491185,0.566019,0.208091
LR1,0.615528,0.522273,0.544654,0.649105,0.515371,0.543708,0.547644,0.482147,0.565696,0.537864
Perc1,0.628891,0.547473,0.573917,0.76497,0.52706,0.561179,0.53977,0.488247,0.563107,0.534951
Corr1,0.639379,0.548758,0.590322,0.760826,0.524009,0.557683,0.551143,0.503617,0.576375,0.544013
All imp dims1,0.235823,0.104605,0.0,0.492061,0.0,0.0,0.113537,0.105989,0.112621,0.0
LR5,0.695789,0.545402,0.589815,0.758543,0.519563,0.544589,0.534107,0.47966,0.583819,0.540777
Perc5,0.727505,0.571388,0.623985,0.846415,0.52528,0.555507,0.53279,0.482146,0.596764,0.539482


We can show what is the accuracy gain for each dimension test comparison to using all dimensions:

In [80]:
gains_df = (accs_df - accs_df.loc['All dims'])
gains_df

Unnamed: 0,flau_small_c,flau_base_u,flau_base_c,flau_large_c,cam_base,xlm_large,xlm_base,bert_base_u,distilbert_base,bert_base_c
All dims,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ANOVA,0.001945,0.011851,0.013532,-0.000423,0.0008892116,0.006564,0.005256419,0.01129892,0.005825,-0.3288026
MI,0.000931,0.000856,-0.005497,0.000169,0.0,0.000881,-0.0004357257,0.0002262443,0.003236,0.0
All non ind,0.000592,0.00928,0.002284,8.5e-05,0.002159698,0.004378,0.005695011,0.0117509,0.005825,-0.3307443
LR1,-0.120348,-0.044761,-0.087872,-0.230293,-0.002920877,0.001321,0.02404327,0.002713143,0.005502,-0.0009708738
Perc1,-0.106985,-0.019561,-0.05861,-0.114428,0.008767552,0.018792,0.01616867,0.008813048,0.002913,-0.003883495
Corr1,-0.096498,-0.018276,-0.042204,-0.118572,0.005716545,0.015295,0.02754245,0.02418335,0.016181,0.005177994
All imp dims1,-0.500054,-0.462428,-0.632526,-0.387337,-0.5182922,-0.542388,-0.4100637,-0.3734453,-0.447573,-0.538835
LR5,-0.040087,-0.021632,-0.042711,-0.120855,0.001270648,0.002202,0.01050615,0.0002259887,0.023625,0.001941748
Perc5,-0.008372,0.004354,-0.008542,-0.032983,0.006987919,0.01312,0.009189416,0.002712376,0.03657,0.0006472492


Here are the dimension groups and the accuracy gain this dimension group provides over all dimensions:

In [81]:
for i in range(len(models)):
    print(labels[i], gains_df.idxmax()[i], gains_df.loc[gains_df.idxmax()[i], labels[i]])

flau_small_c Perc25 0.06875754356217101
flau_base_u Perc10 0.01984597628918594
flau_base_c Perc25 0.03442200305498622
flau_large_c Perc25 0.013616259394797936
cam_base Perc1 0.00876755208648472
xlm_large Perc1 0.01879162565812731
xlm_base Corr1 0.027542449810325498
bert_base_u Corr1 0.024183347393716303
distilbert_base All imp dims25 0.03883495145631077
bert_base_c Corr1 0.00517799352750814


Below, you can find the best achieved accuracy and the dimension group that produced it. This value is `InfEnc`.

In [79]:
for i in range(len(models)):
    print(labels[i], accs_df.idxmax()[i], accs_df.loc[accs_df.idxmax()[i], labels[i]])

flau_small_c Perc25 0.8046342780895962
flau_base_u Perc10 0.5868796221192543
flau_base_c Perc25 0.6669484480026329
flau_large_c Perc25 0.8930140622372944
cam_base Perc1 0.5270598011334987
xlm_large Perc1 0.5611793259629442
xlm_base Corr1 0.5511433021509178
bert_base_u Corr1 0.5036173530689981
distilbert_base All imp dims25 0.5990291262135923
bert_base_c Corr1 0.5440129449838188


In [276]:
with open('../Data/best_results/gender_noun_dims.pickle', 'wb') as f:
    pickle.dump(dims, f)

In [277]:
with open('../Data/best_results/gender_noun_accs.pickle', 'wb') as f:
    pickle.dump(accs, f)

In [278]:
with open('../Data/best_results/gender_noun_medians.pickle', 'wb') as f:
    pickle.dump(medians, f)

# Conclusion

- Despite same vocabulary sizes, FlauBERT family models show different results. Bigger models (FlauBERT base) show worse results than the smaller model (FlauBERT small). Therefore, we can assume that a higher number of parameters doesn't guarantee better performance.
- Similarly, DistilBERT performs better than FlauBERT base uncased, which has significantly bigger vocabulary. Therefore, we can assume that vocabulary size doesn't have a big affect on the results.
- We can note that cased models (FlauBERT base and mBERT) perform better on the task than their uncased analogs.
- For all models a subset of dimensions and not all dimensions gave the best InfEnc results.