In [1]:
import sys
sys.path.append('../Util')
import pickle
import os

In [2]:
from IPython.display import Image

In [3]:
from evaluation import correlation, lr, perceptron, get_anova_dims, get_mi_dims
from preparation import prepare_dataset, read_datasets

In [4]:
from we import get_we, initiate_model

In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [6]:
from scipy.stats import f_oneway

In [7]:
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler, QuantileTransformer

In [8]:
import warnings
# Install a blanket 'ignore' filter: no category or module is given, so every
# warning raised from here on is suppressed (including deprecation warnings).
warnings.filterwarnings('ignore')

In [9]:
# Each entry pairs a pretrained model identifier ('name') with the short
# 'label' that the rest of the notebook uses to refer to that model.
models = [
    {'name': name, 'label': label}
    for name, label in (
        ('flaubert/flaubert_small_cased', 'flau_small_c'),
        ('flaubert/flaubert_base_uncased', 'flau_base_u'),
        ('flaubert/flaubert_base_cased', 'flau_base_c'),
        ('flaubert/flaubert_large_cased', 'flau_large_c'),
        ('camembert/camembert-base', 'cam_base'),
        ('xlm-roberta-large', 'xlm_large'),
        ('xlm-roberta-base', 'xlm_base'),
        ('bert-base-multilingual-uncased', 'bert_base_u'),
        ('distilbert-base-multilingual-cased', 'distilbert_base'),
        ('bert-base-multilingual-cased', 'bert_base_c'),
    )
]

In [10]:
# Short model labels, in the same order as `models`.
labels = [model_spec['label'] for model_spec in models]

In [11]:
# Load 'all_nouns_we.csv' from ../Data for the model labels through the project
# helper `read_datasets`; the result is used below as a list of DataFrames.
we_with_features = read_datasets(
    path='../Data',
    model_labels=labels,
    file_name='all_nouns_we.csv',
)

In [12]:
# Preview the first rows of the last dataset in the list.
we_with_features[-1].head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,761,762,763,764,765,766,767,Number,Gender,Lemma
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2D,0.836139,0.551025,0.347197,0.789782,0.393246,0.300031,0.609232,0.913722,0.178287,0.838988,...,0.377143,0.586894,0.674134,0.276217,0.081619,0.154076,0.553278,invariable,feminine,2D
3D,0.828946,0.54988,0.339958,0.797102,0.395346,0.301288,0.603051,0.916539,0.185836,0.844714,...,0.370481,0.584636,0.674752,0.278596,0.078074,0.143366,0.548315,invariable,feminine,3D
a,0.82666,0.556389,0.332618,0.805891,0.399844,0.295803,0.597985,0.914675,0.201301,0.846672,...,0.371326,0.577705,0.673679,0.278649,0.081689,0.13949,0.533415,invariable,masculine,a
aa,0.826916,0.555891,0.332298,0.805754,0.400046,0.296797,0.597758,0.914183,0.201098,0.845985,...,0.372299,0.578594,0.673429,0.278861,0.081218,0.13901,0.533594,invariable,masculine,aa
abandon,0.826281,0.556306,0.332007,0.805256,0.399368,0.296458,0.597567,0.914983,0.200809,0.846577,...,0.37353,0.578808,0.673362,0.279152,0.08156,0.13939,0.534123,singular,masculine,abandon


In [13]:
# There are 3 extra features in addition to the embedding dimensions in the file: Number, Gender, Lemma
feature_col_count = 3

# Feature to investigate in this notebook
feature = 'Gender'

# Nouns

In [14]:
# Part-of-speech tag(s) covered by this section.
# NOTE(review): `pos` is not referenced anywhere else in the visible notebook - confirm it is still needed.
pos = ['NOUN']

Split each model's dataset into train and test sets:

In [15]:
# Per-model containers for the train/test splits; each is filled with one
# entry per model by the loop in the next cell.
X_noun_train, y_noun_train = [], []
X_noun_test, y_noun_test = [], []

In [16]:
# Start from empty lists every time this cell runs. Without this, re-running the
# cell appends a second copy of every model's split, and anything that pairs the
# lists with `labels` (e.g. the size table below) no longer lines up.
X_noun_train, X_noun_test = [], []
y_noun_train, y_noun_test = [], []

for we in we_with_features:
    # Keep only rows whose Gender and Number are both different from 'invariable'.
    variable_rows = we[(we.Gender != 'invariable') & (we.Number != 'invariable')]

    # Project helper: called here with encoding ('feminine' -> 1), splitting and
    # balancing switched on and normalization switched off.
    xtr, xtst, ytr, ytst = prepare_dataset(dataset=variable_rows,
                                           feature_col_count=feature_col_count,
                                           feature_name=feature,
                                           normalize=False,
                                           encode=True,
                                           encode_as1='feminine',
                                           split=True,
                                           balance=True)
    X_noun_train.append(xtr)
    X_noun_test.append(xtst)

    y_noun_train.append(ytr)
    y_noun_test.append(ytst)

In [17]:
# Number of samples in each model's train and test split, one row per model label.
dataset_sizes = pd.DataFrame(
    {
        'Train size': [len(split) for split in X_noun_train],
        'Test size': [len(split) for split in X_noun_test],
    },
    index=labels,
)
dataset_sizes

Unnamed: 0,Train size,Test size
flau_small_c,9459,2365
flau_base_u,11206,2802
flau_base_c,9459,2365
flau_large_c,9459,2365
cam_base,6297,1575
xlm_large,1830,458
xlm_base,1830,458
bert_base_u,3539,885
distilbert_base,2472,618
bert_base_c,2472,618


## Non-independent dims

ANOVA test with p-value < 0.01:

In [18]:
# One entry per model: dimensions returned by the project helper `get_anova_dims`
# for that model's training split.
anova_dims = [
    get_anova_dims(X_noun_train[model_idx], y_noun_train[model_idx])
    for model_idx, _ in enumerate(models)
]

In [19]:
# One entry per model: dimensions returned by the project helper `get_mi_dims`
# for that model's training split.
mi_dims = [
    get_mi_dims(X_noun_train[model_idx], y_noun_train[model_idx])
    for model_idx, _ in enumerate(models)
]

In [20]:
# Per model: dimensions that appear in both the ANOVA and the MI selection.
non_ind_dims = [
    list(set(anova_selected).intersection(mi_selected))
    for anova_selected, mi_selected in zip(anova_dims, mi_dims)
]

Stats about the number of dimensions for each model:

In [21]:
# Empty summary table (one row per model label); the columns are filled in the next cell.
non_ind_columns = ['ANOVA', 'MI', 'Total non independent']
non_ind_df = pd.DataFrame(columns=non_ind_columns, index=labels)

In [22]:
# Fill each summary column with the number of dimensions selected per model.
for column_name, per_model_dims in (
    ('ANOVA', anova_dims),
    ('MI', mi_dims),
    ('Total non independent', non_ind_dims),
):
    non_ind_df[column_name] = [len(selected) for selected in per_model_dims]

In [23]:
# Show how many dimensions each selection method kept, per model.
non_ind_df

Unnamed: 0,ANOVA,MI,Total non independent
flau_small_c,364,321,249
flau_base_u,276,662,246
flau_base_c,337,415,202
flau_large_c,824,748,637
cam_base,68,418,41
xlm_large,184,508,106
xlm_base,132,436,79
bert_base_u,7,562,5
distilbert_base,203,440,126
bert_base_c,3,445,2


In [24]:
# model label -> {dimension-group name -> dimensions in that group}
dims = dict()

In [25]:
# Register the baseline group (every column of the training split) and the
# ANOVA / MI / intersection groups for each model.
for model_idx, model_label in enumerate(labels):
    dims[model_label] = {
        'All dims': X_noun_train[model_idx].columns,
        'ANOVA': anova_dims[model_idx],
        'MI': mi_dims[model_idx],
        'All non ind': non_ind_dims[model_idx],
    }

## Important dims

We can test different $\alpha$ values: 1%, 5%, 10%, 25%, 50%, 75%.

In [26]:
# Percentages of the embedding dimensions to keep as "important"; each value is
# turned into a count with `len(columns) * alpha // 100` further down.
alphas = [1, 5, 10, 25, 50, 75]

Train Logistic Regression on train set for each model:

In [27]:
# One result per model from the project helper `lr`, fitted on the training split.
# Later cells slice each result from the front and read element [0] of every item as a dimension.
lr_res = [
    lr(X_noun_train[model_idx], y_noun_train[model_idx])
    for model_idx, _ in enumerate(models)
]

Train Perceptron 10 times and get average weights:

In [28]:
# One result per model from the project helper `perceptron`, fitted on the training split.
# Later cells slice each result from the front and read element [0] of every item as a dimension.
perceptron_res = [
    perceptron(X_noun_train[model_idx], y_noun_train[model_idx])
    for model_idx, _ in enumerate(models)
]

Compute correlation to the gender vector:

In [29]:
# One result per model from the project helper `correlation`, computed on the training split.
# Later cells slice each result from the front and read element [0] of every item as a dimension.
corr_res = [
    correlation(X_noun_train[model_idx], y_noun_train[model_idx])
    for model_idx, _ in enumerate(models)
]

In [30]:
# For every model and every alpha, keep the first alpha% entries of each ranking
# (LR, Perceptron, correlation) and also record the dimensions shared by all three.
for model_idx, model_label in enumerate(labels):
    total_dims = len(X_noun_test[model_idx].columns)
    rankings = (
        ('LR', lr_res[model_idx]),
        ('Perc', perceptron_res[model_idx]),
        ('Corr', corr_res[model_idx]),
    )
    for alpha in alphas:
        top_k = total_dims * alpha // 100
        # Element [0] of each ranking item is the dimension; it is stored as a string.
        selected = {
            prefix: [str(entry[0]) for entry in ranking[:top_k]]
            for prefix, ranking in rankings
        }
        for prefix, chosen in selected.items():
            dims[model_label][f'{prefix}{alpha}'] = chosen
        dims[model_label][f'All imp dims{alpha}'] = list(
            set(selected['LR']).intersection(selected['Perc']).intersection(selected['Corr'])
        )

## Compute medians

In [31]:
# model label -> dimension group -> {'0': medians for class 0, '1': medians for class 1}
medians = dict()

In [32]:
# For each model and dimension group, store the per-dimension median of the
# training samples of class 0 and of class 1.
for model_idx, model_label in enumerate(labels):
    X_train = X_noun_train[model_idx]
    y_train = y_noun_train[model_idx]
    medians[model_label] = {
        dim_group: {
            # Median of the dimensions over rows where the feature vector equals 0
            '0': X_train[y_train == 0][dim_list].median(),
            # Median of the dimensions over rows where the feature vector equals 1
            '1': X_train[y_train == 1][dim_list].median(),
        }
        for dim_group, dim_list in dims[model_label].items()
    }


In [33]:
# Size of every dimension group, per model (model label -> group name -> count).
dim_lens = {
    model: {dim_group: len(dim_list) for dim_group, dim_list in groups.items()}
    for model, groups in dims.items()
}

In [34]:
# Tabulate the group sizes: rows are dimension groups, columns are model labels.
pd.DataFrame(dim_lens)

Unnamed: 0,flau_small_c,flau_base_u,flau_base_c,flau_large_c,cam_base,xlm_large,xlm_base,bert_base_u,distilbert_base,bert_base_c
All dims,512,768,768,1024,768,1024,768,768,768,768
ANOVA,364,276,337,824,68,184,132,7,203,3
MI,321,662,415,748,418,508,436,562,440,445
All non ind,249,246,202,637,41,106,79,5,126,2
LR1,5,7,7,10,7,10,7,7,7,7
Perc1,5,7,7,10,7,10,7,7,7,7
Corr1,5,7,7,10,7,10,7,7,7,7
All imp dims1,1,0,0,0,0,0,0,0,0,0
LR5,25,38,38,51,38,51,38,38,38,38
Perc5,25,38,38,51,38,51,38,38,38,38


## Compute predictions

In [35]:
# model label -> dimension group -> predicted labels for the test split
y_preds = dict()

In [36]:
# Nearest-median classification: every test sample gets the class whose
# training medians are closest in mean absolute error (MAE) over the
# dimensions of the group.
for model_idx, model_label in enumerate(labels):
    y_preds[model_label] = {}
    for dim_group, dim_list in dims[model_label].items():
        X_selected = X_noun_test[model_idx][dim_list]
        group_medians = medians[model_label][dim_group]

        # Vectorized row-wise MAE: subtract the per-dimension medians (aligned
        # on column labels), take absolute values and average across columns.
        mae0 = X_selected.sub(group_medians['0'], axis='columns').abs().mean(axis=1)
        mae1 = X_selected.sub(group_medians['1'], axis='columns').abs().mean(axis=1)

        # Predict 1 only when the class-1 medians are strictly closer; ties and
        # empty dimension groups (MAE is NaN) fall back to 0.
        y_preds[model_label][dim_group] = (mae0 > mae1).astype(int)
    

## Compute accuracies

In [37]:
# model label -> dimension group -> accuracy on the test split
accs = dict()

In [38]:
# Accuracy of the nearest-median predictions for every model and dimension group.
for model_idx, model_label in enumerate(labels):
    y_true = y_noun_test[model_idx]
    accs[model_label] = {}
    for dim_group, dim_list in dims[model_label].items():
        if len(dim_list) == 0:
            # No dimensions in this group, so nothing can be predicted; report 0.
            # Testing the group size (rather than `any(y_pred)`) keeps a valid
            # all-zeros prediction from being scored as 0 by mistake.
            acc = 0
        else:
            acc = accuracy_score(y_true, y_preds[model_label][dim_group])
        accs[model_label][dim_group] = acc

In [39]:
# Accuracy table: one row per dimension group, one column per model label.
accs_df = pd.DataFrame(accs)
accs_df

Unnamed: 0,flau_small_c,flau_base_u,flau_base_c,flau_large_c,cam_base,xlm_large,xlm_base,bert_base_u,distilbert_base,bert_base_c
All dims,0.750106,0.566738,0.665116,0.865116,0.533968,0.528384,0.502183,0.494915,0.57767,0.530744
ANOVA,0.750106,0.577445,0.663425,0.867653,0.531429,0.530568,0.524017,0.525424,0.567961,0.540453
MI,0.738689,0.569593,0.661734,0.871882,0.534603,0.537118,0.502183,0.494915,0.57767,0.530744
All non ind,0.734461,0.576374,0.666808,0.869767,0.535238,0.530568,0.521834,0.522034,0.563107,0.530744
LR1,0.618605,0.532834,0.54334,0.659619,0.525079,0.521834,0.543668,0.506215,0.55178,0.537217
Perc1,0.634249,0.559957,0.589006,0.756871,0.537143,0.552402,0.530568,0.505085,0.522654,0.530744
Corr1,0.635095,0.554604,0.618182,0.751374,0.515556,0.539301,0.528384,0.525424,0.538835,0.530744
All imp dims1,0.579281,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LR5,0.696829,0.536046,0.613953,0.767865,0.530159,0.5131,0.539301,0.498305,0.572816,0.535599
Perc5,0.730655,0.587794,0.646934,0.843975,0.537143,0.558952,0.534934,0.496045,0.564725,0.533981


We can show the accuracy gain of each dimension group compared to using all dimensions:

In [40]:
# Accuracy gain of each dimension group relative to the 'All dims' row of the same model.
baseline_accs = accs_df.loc['All dims']
gains_df = accs_df.sub(baseline_accs, axis='columns')
gains_df

Unnamed: 0,flau_small_c,flau_base_u,flau_base_c,flau_large_c,cam_base,xlm_large,xlm_base,bert_base_u,distilbert_base,bert_base_c
All dims,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ANOVA,0.0,0.010707,-0.001691,0.002537,-0.00254,0.002183,0.021834,0.030508,-0.009709,0.009709
MI,-0.011416,0.002855,-0.003383,0.006765,0.000635,0.008734,0.0,0.0,0.0,0.0
All non ind,-0.015645,0.009636,0.001691,0.004651,0.00127,0.002183,0.019651,0.027119,-0.014563,0.0
LR1,-0.131501,-0.033904,-0.121776,-0.205497,-0.008889,-0.00655,0.041485,0.011299,-0.02589,0.006472
Perc1,-0.115856,-0.006781,-0.07611,-0.108245,0.003175,0.024017,0.028384,0.010169,-0.055016,0.0
Corr1,-0.115011,-0.012134,-0.046934,-0.113742,-0.018413,0.010917,0.026201,0.030508,-0.038835,0.0
All imp dims1,-0.170825,-0.566738,-0.665116,-0.865116,-0.533968,-0.528384,-0.502183,-0.494915,-0.57767,-0.530744
LR5,-0.053277,-0.030692,-0.051163,-0.097252,-0.00381,-0.015284,0.037118,0.00339,-0.004854,0.004854
Perc5,-0.01945,0.021056,-0.018182,-0.021142,0.003175,0.030568,0.032751,0.00113,-0.012945,0.003236


For each model, here is the best dimension group and the accuracy gain it provides over using all dimensions:

In [41]:
# Best dimension group per model by accuracy gain. idxmax is computed once and
# looked up by model label; integer keys on a label-indexed Series are
# deprecated as positional lookups in recent pandas.
best_gain_groups = gains_df.idxmax()
for model_label in labels:
    best_group = best_gain_groups[model_label]
    print(model_label, best_group, gains_df.loc[best_group, model_label])

flau_small_c Perc25 0.052854122621564414
flau_base_u Perc25 0.021413276231263323
flau_base_c All imp dims50 0.018604651162790642
flau_large_c Perc25 0.02579281183932347
cam_base Perc10 0.005714285714285672
xlm_large Corr5 0.037117903930131035
xlm_base All imp dims5 0.06986899563318782
bert_base_u ANOVA 0.030508474576271205
distilbert_base LR10 0.017799352750809128
bert_base_c All imp dims25 0.012944983818770184


In [42]:
# Best dimension group per model by raw accuracy. idxmax is computed once and
# looked up by model label; integer keys on a label-indexed Series are
# deprecated as positional lookups in recent pandas.
best_acc_groups = accs_df.idxmax()
for model_label in labels:
    best_group = best_acc_groups[model_label]
    print(model_label, best_group, accs_df.loc[best_group, model_label])

flau_small_c Perc25 0.8029598308668076
flau_base_u Perc25 0.5881513204853676
flau_base_c All imp dims50 0.6837209302325581
flau_large_c Perc25 0.8909090909090909
cam_base Perc10 0.5396825396825397
xlm_large Corr5 0.5655021834061136
xlm_base All imp dims5 0.5720524017467249
bert_base_u ANOVA 0.5254237288135594
distilbert_base LR10 0.5954692556634305
bert_base_c All imp dims25 0.5436893203883495


In [43]:
# Pickle that accumulates the best result of each run of this notebook.
results_dir = '../Data/best_results/'
results_path = os.path.join(results_dir, 'gender_noun.pickle')

# Listing a missing directory raises FileNotFoundError, so make sure it exists;
# the dump at the end of the notebook writes into the same directory.
os.makedirs(results_dir, exist_ok=True)

if os.path.isfile(results_path):
    # NOTE: pickle.load can execute arbitrary code from the file; only load a
    # pickle that was produced by this notebook.
    with open(results_path, 'rb') as f:
        best_res = pickle.load(f)
else:
    # First run: start every model with empty history lists.
    result_keys = ('best_dim_set', 'best_dims', 'accs', 'gains', 'medians_0', 'medians_1')
    best_res = {label: {key: [] for key in result_keys} for label in labels}

In [44]:
# Append this run's best dimension group, its dimensions, accuracy, gain and
# class medians to each model's history. Running this cell again appends
# another entry.
for model_label in labels:
    # Look the best group up once, by label, and use it for every field so the
    # stored dimensions, scores and medians always describe the same group.
    best_group = accs_df[model_label].idxmax()
    history = best_res[model_label]

    history['best_dim_set'].append(best_group)
    history['best_dims'].append(dims[model_label][best_group])
    history['accs'].append(accs_df.loc[best_group, model_label])
    history['gains'].append(gains_df.loc[best_group, model_label])
    history['medians_0'].append(medians[model_label][best_group]['0'])
    history['medians_1'].append(medians[model_label][best_group]['1'])

In [45]:
# Write the accumulated best results back to disk, overwriting the previous pickle.
with open('../Data/best_results/gender_noun.pickle', 'wb') as out_file:
    pickle.dump(best_res, out_file)

# Conclusion

1. For none of the models was the best accuracy achieved using all dimensions, which can be a signal that gender information in French is encoded in only a subset of the dimensions.

2. However, the accuracy improvement for CamemBERT over using all dimensions is negligible (about 0.006).

3. We can note that smaller models (FlauBERT-small and DistilBERT) might be performing on par with or better than their bigger counterparts (FlauBERT-large and mBERT-base).

4. A bigger vocabulary size doesn't seem to be correlated with the observed accuracy: CamemBERT has a bigger vocabulary than DistilBERT, yet it shows a lower accuracy.