In [1]:
import sys
sys.path.append('../Util')
import pickle
import os

In [2]:
from IPython.display import Image

In [3]:
from evaluation import correlation, lr, perceptron, get_anova_dims, get_mi_dims
from preparation import prepare_dataset, read_datasets

In [4]:
from we import get_we, initiate_model

In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [6]:
from scipy.stats import f_oneway

In [7]:
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler, QuantileTransformer

In [8]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings emitted by libraries.
warnings.filterwarnings('ignore')

In [9]:
# Models under study. 'name' is the model identifier (presumably a Hugging Face
# hub id - confirm); 'label' is the short alias used to load the per-model
# datasets and to index the result tables in this notebook.
models = [
    dict(name='flaubert/flaubert_small_cased', label='flau_small_c'),
    dict(name='flaubert/flaubert_base_uncased', label='flau_base_u'),
    dict(name='flaubert/flaubert_base_cased', label='flau_base_c'),
    dict(name='flaubert/flaubert_large_cased', label='flau_large_c'),
    dict(name='camembert/camembert-base', label='cam_base'),
    dict(name='xlm-roberta-large', label='xlm_large'),
    dict(name='xlm-roberta-base', label='xlm_base'),
    dict(name='bert-base-multilingual-uncased', label='bert_base_u'),
    dict(name='distilbert-base-multilingual-cased', label='distilbert_base'),
    dict(name='bert-base-multilingual-cased', label='bert_base_c'),
]

In [10]:
# Short aliases of all models, in the same order as `models`.
labels = [entry['label'] for entry in models]

In [11]:
# Load the adjective embeddings with their features for every model label.
# The result is used below as a sequence of DataFrames, one per label
# (presumably in the order of `labels` - confirm against read_datasets).
we_with_features = read_datasets(path='../Data', model_labels=labels, file_name='all_adjs_we.csv')

In [12]:
# Preview the first rows of the last loaded dataset
# (presumably the one for the last entry of `labels` - confirm against read_datasets).
we_with_features[-1].head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,761,762,763,764,765,766,767,Gender,Number,Lemma
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
abandonné,0.530145,0.060065,0.007523,0.990865,0.430226,0.009984,0.979131,0.978007,0.169568,0.868192,...,0.03252,0.537425,0.982741,0.068059,0.106907,0.257188,0.76537,masculine,singular,abandonné
absent,0.531874,0.058954,0.008474,0.99232,0.430201,0.009588,0.979643,0.978505,0.169001,0.868324,...,0.030967,0.538838,0.983681,0.068282,0.106521,0.258556,0.765498,masculine,singular,absent
absolue,0.530525,0.059162,0.006882,0.99101,0.430248,0.010904,0.97886,0.97844,0.167912,0.868436,...,0.03162,0.535794,0.983176,0.068435,0.106496,0.259584,0.765379,feminine,singular,absolu
accessible,0.533019,0.059168,0.008158,0.993498,0.430884,0.008257,0.979684,0.97807,0.167863,0.868585,...,0.027722,0.534105,0.983887,0.06777,0.107722,0.258436,0.763149,invariable,singular,accessible
accompagné,0.52827,0.0593,0.005714,0.989006,0.430188,0.010091,0.980515,0.978253,0.171316,0.867668,...,0.035285,0.540979,0.982319,0.070023,0.107055,0.257919,0.768394,masculine,singular,accompagné


In [13]:
# There are 3 extra features in addition to the embedding dimensions in the file: number, gender, lemma
feature_col_count = 3

# Feature to investigate in this notebook
feature = 'Gender'

# Adjectives

In [14]:
# Part-of-speech tag(s) covered by this section.
# NOTE(review): `pos` is not referenced by any other cell visible in this notebook.
pos = ['ADJ']

Split each model into train and test:

In [15]:
# Per-model containers for the train/test splits: features (X) and targets (y).
X_adj_train, y_adj_train = [], []
X_adj_test, y_adj_test = [], []

In [16]:
# Start from empty containers so that re-running this cell rebuilds the splits
# instead of appending duplicates behind the previous ones.
X_adj_train, X_adj_test, y_adj_train, y_adj_test = [], [], [], []

for we in we_with_features:
    # Drop rows whose gender or number is 'invariable' before splitting.
    has_gender_and_number = (we.Gender != 'invariable') & (we.Number != 'invariable')
    # prepare_dataset returns four parts, unpacked here as
    # (train X, test X, train y, test y). With encode_as1='feminine' the target
    # is presumably 1 for feminine and 0 otherwise - confirm in preparation.py.
    xtr, xtst, ytr, ytst = prepare_dataset(dataset=we[has_gender_and_number],
                                           feature_col_count=feature_col_count,
                                           feature_name=feature,
                                           normalize=False,
                                           encode=True,
                                           encode_as1='feminine',
                                           split=True,
                                           balance=True)
    X_adj_train.append(xtr)
    X_adj_test.append(xtst)

    y_adj_train.append(ytr)
    y_adj_test.append(ytst)

In [17]:
# Number of samples in each model's train and test split.
dataset_sizes = pd.DataFrame(
    {
        'Train size': [len(split) for split in X_adj_train],
        'Test size': [len(split) for split in X_adj_test],
    },
    index=labels,
)
dataset_sizes

Unnamed: 0,Train size,Test size
flau_small_c,4235,1059
flau_base_u,4950,1238
flau_base_c,4235,1059
flau_large_c,4235,1059
cam_base,2345,587
xlm_large,366,92
xlm_base,366,92
bert_base_u,838,210
distilbert_base,636,160
bert_base_c,636,160


## Non-independent dims

ANOVA test with p-value < 0.01:

In [18]:
# Dimensions selected by get_anova_dims on each model's train split
# (one collection per model, in the order of `models`).
anova_dims = [
    get_anova_dims(X_adj_train[model_idx], y_adj_train[model_idx])
    for model_idx in range(len(models))
]

In [19]:
# Dimensions selected by get_mi_dims on each model's train split
# (one collection per model, in the order of `models`).
mi_dims = [
    get_mi_dims(X_adj_train[model_idx], y_adj_train[model_idx])
    for model_idx in range(len(models))
]

In [20]:
# Dimensions selected by both the ANOVA and the MI criteria, per model.
non_ind_dims = [
    list(set(anova_dims[model_idx]).intersection(mi_dims[model_idx]))
    for model_idx in range(len(models))
]

Stats about the number of dimensions for each model:

In [21]:
# Empty summary table: one row per model label, one column per selection criterion.
non_ind_df = pd.DataFrame(columns=['ANOVA', 'MI', 'Total non independent'], index=labels)

In [22]:
# Fill each column with the number of selected dimensions per model.
non_ind_df['ANOVA'] = list(map(len, anova_dims))
non_ind_df['MI'] = list(map(len, mi_dims))
non_ind_df['Total non independent'] = list(map(len, non_ind_dims))

In [23]:
# Display the per-model counts of dimensions selected by each criterion.
non_ind_df

Unnamed: 0,ANOVA,MI,Total non independent
flau_small_c,369,372,294
flau_base_u,220,410,134
flau_base_c,336,445,221
flau_large_c,797,731,626
cam_base,134,449,85
xlm_large,10,526,7
xlm_base,33,416,21
bert_base_u,2,347,2
distilbert_base,212,462,152
bert_base_c,1,465,1


In [24]:
# Maps model label -> {dimension group name -> collection of dimension labels}.
dims = dict()

In [25]:
# Register the baseline dimension groups of every model:
# all columns, the ANOVA set, the MI set and their intersection.
for model_idx in range(len(models)):
    dims[labels[model_idx]] = {
        'All dims': X_adj_train[model_idx].columns,
        'ANOVA': anova_dims[model_idx],
        'MI': mi_dims[model_idx],
        'All non ind': non_ind_dims[model_idx],
    }

## Important dims

We can test different $\alpha$ values: 1%, 5%, 10%, 25%, 50%, 75%.

In [26]:
# Percentages of the embedding dimensions kept as "important" dimensions;
# the cells below turn each value into a count via len(columns) * alpha // 100.
alphas = [1, 5, 10, 25, 50, 75]

Train Logistic Regression on train set for each model:

In [27]:
# Result of `lr` on each model's train split, in the order of `models`.
lr_res = [
    lr(X_adj_train[model_idx], y_adj_train[model_idx])
    for model_idx in range(len(models))
]

Train Perceptron 10 times and get average weights:

In [28]:
# Result of `perceptron` on each model's train split, in the order of `models`.
perceptron_res = [
    perceptron(X_adj_train[model_idx], y_adj_train[model_idx])
    for model_idx in range(len(models))
]

Compute correlation to the gender vector:

In [29]:
# Result of `correlation` on each model's train split, in the order of `models`.
corr_res = [
    correlation(X_adj_train[model_idx], y_adj_train[model_idx])
    for model_idx in range(len(models))
]

In [30]:
# For every model and every alpha, keep the first alpha% entries of each ranking
# and register them (plus their three-way intersection) as dimension groups.
# NOTE(review): assumes the lr / perceptron / correlation results are sorted by
# decreasing importance and that element [0] of an entry is the dimension - confirm.
for model_idx in range(len(models)):
    model_dims = dims[labels[model_idx]]
    total_dims = len(X_adj_test[model_idx].columns)
    for alpha in alphas:
        top_k = total_dims * alpha // 100
        top_lr = [str(entry[0]) for entry in lr_res[model_idx][:top_k]]
        top_perc = [str(entry[0]) for entry in perceptron_res[model_idx][:top_k]]
        top_corr = [str(entry[0]) for entry in corr_res[model_idx][:top_k]]
        shared = set(top_lr).intersection(top_perc).intersection(top_corr)
        model_dims[f'LR{alpha}'] = top_lr
        model_dims[f'Perc{alpha}'] = top_perc
        model_dims[f'Corr{alpha}'] = top_corr
        model_dims[f'All imp dims{alpha}'] = list(shared)

## Compute medians

In [31]:
# Maps model label -> dimension group -> {'0': medians, '1': medians}.
medians = dict()

In [32]:
# For each model and dimension group, store the per-dimension medians of the
# train samples, separately for target value 0 and target value 1.
for model_idx in range(len(models)):
    model_label = labels[model_idx]
    train_X = X_adj_train[model_idx]
    train_y = y_adj_train[model_idx]
    rows_0 = train_X[train_y == 0]
    rows_1 = train_X[train_y == 1]

    medians[model_label] = {}
    for dim_group in dims[model_label].keys():
        selected = dims[model_label][dim_group]
        medians[model_label][dim_group] = {
            '0': rows_0[selected].median(),
            '1': rows_1[selected].median(),
        }


In [33]:
# Size of every dimension group, keyed by model label and then by group name.
dim_lens = {
    model: {dim_group: len(selected) for dim_group, selected in groups.items()}
    for model, groups in dims.items()
}

In [34]:
# Show the group sizes as a table: one row per dimension group, one column per model.
pd.DataFrame(dim_lens)

Unnamed: 0,flau_small_c,flau_base_u,flau_base_c,flau_large_c,cam_base,xlm_large,xlm_base,bert_base_u,distilbert_base,bert_base_c
All dims,512,768,768,1024,768,1024,768,768,768,768
ANOVA,369,220,336,797,134,10,33,2,212,1
MI,372,410,445,731,449,526,416,347,462,465
All non ind,294,134,221,626,85,7,21,2,152,1
LR1,5,7,7,10,7,10,7,7,7,7
Perc1,5,7,7,10,7,10,7,7,7,7
Corr1,5,7,7,10,7,10,7,7,7,7
All imp dims1,1,0,0,0,0,1,0,0,0,0
LR5,25,38,38,51,38,51,38,38,38,38
Perc5,25,38,38,51,38,51,38,38,38,38


## Compute predictions

In [35]:
# Maps model label -> dimension group -> predicted targets for the test split.
y_preds = dict()

In [36]:
# Nearest-median classifier: a test sample gets the class whose train medians
# are closest (in mean absolute error) over the dimensions of the group.
for i in range(len(models)):
    label = labels[i]
    y_preds[label] = {}
    for dim_group, dim_list in dims[label].items():
        test_subset = X_adj_test[i][dim_list]
        group_medians = medians[label][dim_group]
        # Per-sample MAE to each class' medians, computed for all rows at once
        # (label-aligned on the columns) instead of one sklearn call per row.
        mae0 = test_subset.sub(group_medians['0'], axis=1).abs().mean(axis=1)
        mae1 = test_subset.sub(group_medians['1'], axis=1).abs().mean(axis=1)
        # If the MAE to the class-0 medians is lower than to the class-1 medians,
        # the label should be 0, so the boolean is converted to int (False -> 0).
        # An empty dimension group yields NaN errors and therefore all-zero predictions.
        y_preds[label][dim_group] = (mae0 > mae1).astype(int)
    

## Compute accuracies

In [37]:
# Maps model label -> dimension group -> accuracy on the test split.
accs = dict()

In [38]:
# Accuracy of the predictions of every dimension group on each model's test split.
for i in range(len(models)):
    label = labels[i]
    accs[label] = {}
    for dim_group, dim_list in dims[label].items():
        if len(dim_list) == 0:
            # No dimension in this group (e.g. an empty intersection): there is
            # nothing to classify with, so 0 is kept as the sentinel accuracy.
            acc = 0
        else:
            # Score every non-empty group, including the case where all
            # predictions are 0 (testing `any(y_pred)` would report that valid
            # prediction as accuracy 0).
            acc = accuracy_score(y_adj_test[i], y_preds[label][dim_group])
        accs[label][dim_group] = acc

In [39]:
# Accuracy table: one row per dimension group, one column per model label.
accs_df = pd.DataFrame(accs)
accs_df

Unnamed: 0,flau_small_c,flau_base_u,flau_base_c,flau_large_c,cam_base,xlm_large,xlm_base,bert_base_u,distilbert_base,bert_base_c
All dims,0.932956,0.506462,0.589235,0.915958,0.528109,0.5,0.521739,0.495238,0.58125,0.51875
ANOVA,0.931067,0.517771,0.593957,0.913126,0.528109,0.532609,0.532609,0.5,0.58125,0.5375
MI,0.931067,0.511309,0.593012,0.918791,0.528109,0.576087,0.543478,0.490476,0.58125,0.5125
All non ind,0.932011,0.519386,0.591124,0.916903,0.528109,0.554348,0.554348,0.5,0.58125,0.5375
LR1,0.723324,0.527464,0.539188,0.691218,0.521295,0.586957,0.5,0.480952,0.55625,0.5625
Perc1,0.732767,0.544426,0.594901,0.799811,0.475298,0.565217,0.543478,0.5,0.59375,0.55
Corr1,0.714825,0.528271,0.599622,0.777148,0.51448,0.532609,0.554348,0.490476,0.58125,0.5375
All imp dims1,0.604344,0.0,0.0,0.0,0.0,0.554348,0.0,0.0,0.0,0.0
LR5,0.855524,0.553312,0.580737,0.866856,0.526405,0.543478,0.5,0.495238,0.63125,0.525
Perc5,0.875354,0.550889,0.627951,0.877243,0.531516,0.597826,0.456522,0.5,0.6125,0.525


We can show the accuracy gain of each dimension group compared to using all dimensions:

In [40]:
# Accuracy difference of every dimension group relative to the 'All dims' row of the same model.
baseline_accs = accs_df.loc['All dims']
gains_df = accs_df.sub(baseline_accs, axis='columns')
gains_df

Unnamed: 0,flau_small_c,flau_base_u,flau_base_c,flau_large_c,cam_base,xlm_large,xlm_base,bert_base_u,distilbert_base,bert_base_c
All dims,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ANOVA,-0.001889,0.011309,0.004721,-0.002833,0.0,0.032609,0.01087,0.004762,0.0,0.01875
MI,-0.001889,0.004847,0.003777,0.002833,0.0,0.076087,0.021739,-0.004762,0.0,-0.00625
All non ind,-0.000944,0.012924,0.001889,0.000944,0.0,0.054348,0.032609,0.004762,0.0,0.01875
LR1,-0.209632,0.021002,-0.050047,-0.22474,-0.006814,0.086957,-0.021739,-0.014286,-0.025,0.04375
Perc1,-0.200189,0.037964,0.005666,-0.116147,-0.052811,0.065217,0.021739,0.004762,0.0125,0.03125
Corr1,-0.21813,0.021809,0.010387,-0.13881,-0.013629,0.032609,0.032609,-0.004762,0.0,0.01875
All imp dims1,-0.328612,-0.506462,-0.589235,-0.915958,-0.528109,0.054348,-0.521739,-0.495238,-0.58125,-0.51875
LR5,-0.077432,0.04685,-0.008499,-0.049103,-0.001704,0.043478,-0.021739,0.0,0.05,0.00625
Perc5,-0.057602,0.044426,0.038716,-0.038716,0.003407,0.097826,-0.065217,0.004762,0.03125,0.00625


For each model, here is the best dimension group and the accuracy gain it provides over using all dimensions:

In [41]:
# Print, for every model, the dimension group with the largest gain and that gain.
# The best group is looked up by model label: integer (positional) keys on a
# label-indexed Series are deprecated in pandas.
best_gain_groups = gains_df.idxmax()
for label in labels:
    best_group = best_gain_groups[label]
    print(label, best_group, gains_df.loc[best_group, label])

flau_small_c Perc50 0.016997167138810165
flau_base_u LR5 0.04684975767366717
flau_base_c Perc25 0.06610009442870635
flau_large_c Perc25 0.01510859301227574
cam_base Perc5 0.0034071550255536653
xlm_large Perc5 0.09782608695652173
xlm_base Corr25 0.05434782608695654
bert_base_u All imp dims5 0.01904761904761898
distilbert_base LR5 0.04999999999999993
bert_base_c LR1 0.043749999999999956


In [42]:
# Print, for every model, the dimension group with the highest accuracy and that accuracy.
# The best group is looked up by model label: integer (positional) keys on a
# label-indexed Series are deprecated in pandas.
best_acc_groups = accs_df.idxmax()
for label in labels:
    best_group = best_acc_groups[label]
    print(label, best_group, accs_df.loc[best_group, label])

flau_small_c Perc50 0.9499527856468366
flau_base_u LR5 0.5533117932148627
flau_base_c Perc25 0.6553352219074599
flau_large_c Perc25 0.931067044381492
cam_base Perc5 0.5315161839863713
xlm_large Perc5 0.5978260869565217
xlm_base Corr25 0.5760869565217391
bert_base_u All imp dims5 0.5142857142857142
distilbert_base LR5 0.63125
bert_base_c LR1 0.5625


In [43]:
# Load the results accumulated by previous runs, or start from scratch.
best_res_path = '../Data/best_results/gender_adj.pickle'

if os.path.isfile(best_res_path):
    # NOTE: pickle.load can execute arbitrary code - only load files written by this notebook.
    with open(best_res_path, 'rb') as f:
        best_res = pickle.load(f)
else:
    # Also covers a missing '../Data/best_results/' directory, which os.listdir would crash on.
    best_res = {}

# Make sure every current model has every result list, so that a pickle saved
# with a different set of models does not cause a KeyError further down.
for label in labels:
    label_results = best_res.setdefault(label, {})
    for key in ('best_dim_set', 'best_dims', 'accs', 'gains', 'medians_0', 'medians_1'):
        label_results.setdefault(key, [])

In [44]:
# Append this run's best dimension group of every model to the accumulated results.
# The best group is resolved once, by model label (integer keys on a label-indexed
# Series are deprecated in pandas), and reused for every stored field so that the
# saved dims, accuracy, gain and medians always describe the same group.
best_groups = gains_df.idxmax()
for label in labels:
    best_group = best_groups[label]
    label_results = best_res[label]
    label_results['best_dim_set'].append(best_group)
    label_results['best_dims'].append(dims[label][best_group])
    label_results['accs'].append(accs_df.loc[best_group, label])
    label_results['gains'].append(gains_df.loc[best_group, label])
    label_results['medians_0'].append(medians[label][best_group]['0'])
    label_results['medians_1'].append(medians[label][best_group]['1'])

In [45]:
# Persist the accumulated best results; create the output directory on a first run.
os.makedirs('../Data/best_results', exist_ok=True)
with open('../Data/best_results/gender_adj.pickle', 'wb') as f:
    pickle.dump(best_res, f)

# Conclusion

1. We can note that smaller models (FlauBERT-small and DistilBERT) might be performing on par with or better than their bigger counterparts (FlauBERT-large and mBERT-base).
2. Once again, a bigger vocabulary size doesn't seem to be correlated with the observed accuracy: CamemBERT has a vocabulary 4 times bigger than DistilBERT's, yet shows a lower accuracy.
3. A lot of models (FlauBERT, XLM, DistilBERT) show higher accuracies for adjectives, despite small vocabulary size.