In [2]:
import sys
sys.path.append('../Util')
import pickle
import os

In [3]:
from IPython.display import Image

In [4]:
from evaluation import correlation, lr, perceptron, get_anova_dims, get_mi_dims
from preparation import prepare_dataset, read_datasets

In [5]:
from we import get_we, initiate_model

In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [7]:
from sklearn.metrics import mean_absolute_error, accuracy_score

In [8]:
import warnings
# Suppress every warning for the rest of the session so library warnings do not clutter cell output.
# NOTE(review): this blanket filter also hides potentially useful warnings (e.g. deprecations) -- consider narrowing it.
warnings.filterwarnings('ignore')

In [9]:
# Embedding models compared in this notebook, one dict per model:
#   name  -- model identifier (presumably the pretrained checkpoint name; not used by the cells shown here)
#   label -- short alias, used to index the result tables and dictionaries below
models = [
    dict(name='flaubert/flaubert_small_cased', label='flau_small_c'),
    dict(name='flaubert/flaubert_base_uncased', label='flau_base_u'),
    dict(name='flaubert/flaubert_base_cased', label='flau_base_c'),
    dict(name='flaubert/flaubert_large_cased', label='flau_large_c'),
    dict(name='camembert/camembert-base', label='cam_base'),
    dict(name='xlm-roberta-large', label='xlm_large'),
    dict(name='xlm-roberta-base', label='xlm_base'),
    dict(name='bert-base-multilingual-uncased', label='bert_base_u'),
    dict(name='distilbert-base-multilingual-cased', label='distilbert_base'),
    dict(name='bert-base-multilingual-cased', label='bert_base_c'),
]

In [10]:
# Short aliases of all models, in the same order as `models`
labels = [model_spec['label'] for model_spec in models]

In [11]:
# Load the adjective tables (embedding dimensions plus feature columns) for all model labels from ../Data
we_with_features = read_datasets(path='../Data', model_labels=labels, file_name='all_adjs_we.csv')

In [12]:
# Preview the first rows of the last table in the list
we_with_features[-1].head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,761,762,763,764,765,766,767,Gender,Number,Lemma
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
abandonné,0.530145,0.060065,0.007523,0.990865,0.430226,0.009984,0.979131,0.978007,0.169568,0.868192,...,0.03252,0.537425,0.982741,0.068059,0.106907,0.257188,0.76537,masculine,singular,abandonné
absent,0.531874,0.058954,0.008474,0.99232,0.430201,0.009588,0.979643,0.978505,0.169001,0.868324,...,0.030967,0.538838,0.983681,0.068282,0.106521,0.258556,0.765498,masculine,singular,absent
absolue,0.530525,0.059162,0.006882,0.99101,0.430248,0.010904,0.97886,0.97844,0.167912,0.868436,...,0.03162,0.535794,0.983176,0.068435,0.106496,0.259584,0.765379,feminine,singular,absolu
accessible,0.533019,0.059168,0.008158,0.993498,0.430884,0.008257,0.979684,0.97807,0.167863,0.868585,...,0.027722,0.534105,0.983887,0.06777,0.107722,0.258436,0.763149,invariable,singular,accessible
accompagné,0.52827,0.0593,0.005714,0.989006,0.430188,0.010091,0.980515,0.978253,0.171316,0.867668,...,0.035285,0.540979,0.982319,0.070023,0.107055,0.257919,0.768394,masculine,singular,accompagné


In [13]:
# Feature (column name) investigated in this notebook
feature = 'Number'

# Besides the embedding dimensions, each table ends with 3 extra feature columns: Gender, Number, Lemma
feature_col_count = 3

# Adjectives

In [14]:
# Part-of-speech tag covered by this notebook.
# NOTE(review): `pos` is not referenced by any cell visible here -- confirm it is still needed.
pos = ['ADJ']

Split each model's dataset into train and test sets:

In [32]:
# Containers for the per-model train/test splits (one entry per model)
X_adj_train, y_adj_train = [], []
X_adj_test, y_adj_test = [], []

In [33]:
# Build a balanced train/test split of the `feature` labels for every model.
# The four result lists are reset here so that re-running this cell rebuilds the
# splits instead of appending a second copy of each one.
X_adj_train, X_adj_test = [], []
y_adj_train, y_adj_test = [], []

for we in we_with_features:
    # Keep only adjectives that are not marked 'invariable' for Gender or for Number
    inflected = we[(we.Gender != 'invariable') & (we.Number != 'invariable')]
    # Labels are encoded so that 'plural' becomes 1; embeddings are left un-normalized
    xtr, xtst, ytr, ytst = prepare_dataset(
        dataset=inflected,
        feature_col_count=feature_col_count,
        feature_name=feature,
        normalize=False,
        encode=True,
        encode_as1='plural',
        split=True,
        balance=True,
    )
    X_adj_train.append(xtr)
    X_adj_test.append(xtst)

    y_adj_train.append(ytr)
    y_adj_test.append(ytst)

In [34]:
# Number of samples in each model's train and test split
dataset_sizes = pd.DataFrame(
    {
        'Train size': list(map(len, X_adj_train)),
        'Test size': list(map(len, X_adj_test)),
    },
    index=labels,
)
dataset_sizes

Unnamed: 0,Train size,Test size
flau_small_c,3872,968
flau_base_u,4552,1138
flau_base_c,3872,968
flau_large_c,3872,968
cam_base,1361,341
xlm_large,124,32
xlm_base,124,32
bert_base_u,788,198
distilbert_base,353,89
bert_base_c,353,89


## Non-independent dims

ANOVA test with p-value < 0.01:

In [35]:
# Dimensions returned by the ANOVA helper for each model's train split
# (zipping over `models` keeps exactly one result per model)
anova_dims = [get_anova_dims(X_tr, y_tr) for _model, X_tr, y_tr in zip(models, X_adj_train, y_adj_train)]

In [36]:
# Dimensions returned by the mutual-information helper for each model's train split
# (zipping over `models` keeps exactly one result per model)
mi_dims = [get_mi_dims(X_tr, y_tr) for _model, X_tr, y_tr in zip(models, X_adj_train, y_adj_train)]

In [37]:
# Per model: dimensions selected by both the ANOVA and the MI test
non_ind_dims = [list(set(anova).intersection(mi)) for anova, mi in zip(anova_dims, mi_dims)]

Stats about the number of dimensions for each model:

In [38]:
# Empty summary table: one row per model, one column per selection method
non_ind_df = pd.DataFrame(
    columns=['ANOVA', 'MI', 'Total non independent'],
    index=labels,
)

In [39]:
# Fill the summary with the number of dimensions each method selected per model
non_ind_df['ANOVA'] = list(map(len, anova_dims))
non_ind_df['MI'] = list(map(len, mi_dims))
non_ind_df['Total non independent'] = list(map(len, non_ind_dims))

In [40]:
# Display the per-model counts of selected dimensions
non_ind_df

Unnamed: 0,ANOVA,MI,Total non independent
flau_small_c,380,375,311
flau_base_u,307,448,200
flau_base_c,484,519,356
flau_large_c,777,769,645
cam_base,179,461,106
xlm_large,58,527,47
xlm_base,0,268,0
bert_base_u,467,661,414
distilbert_base,166,516,131
bert_base_c,34,467,26


In [41]:
# model label -> {dimension-group name -> collection of dimension names}
dims = dict()

In [42]:
# Register the baseline dimension groups of every model
for label, X_train, anova, mi, non_ind in zip(labels, X_adj_train, anova_dims, mi_dims, non_ind_dims):
    dims[label] = {
        'All dims': X_train.columns,
        'ANOVA': anova,
        'MI': mi,
        'All non ind': non_ind,
    }

## Important dims

We can test different $\alpha$ values: 1%, 5%, 10%, 25%, 50%, 75%.

In [43]:
# Percentages of the embedding dimensions kept as "important" dimensions in the cells below
alphas = [1, 5, 10, 25, 50, 75]

Train Logistic Regression on train set for each model:

In [44]:
# Run the `lr` helper on every model's train split (zipping over `models` keeps one result per model)
lr_res = [lr(X_tr, y_tr) for _model, X_tr, y_tr in zip(models, X_adj_train, y_adj_train)]

Train Perceptron 10 times and get average weights:

In [45]:
# Run the `perceptron` helper on every model's train split (zipping over `models` keeps one result per model)
perceptron_res = [perceptron(X_tr, y_tr) for _model, X_tr, y_tr in zip(models, X_adj_train, y_adj_train)]

Compute correlation to the number vector:

In [46]:
# Run the `correlation` helper on every model's train split (zipping over `models` keeps one result per model)
corr_res = [correlation(X_tr, y_tr) for _model, X_tr, y_tr in zip(models, X_adj_train, y_adj_train)]

In [47]:
# For every model and every alpha, keep the first alpha% entries of each method's result
# (the results are assumed to be ordered by importance -- TODO confirm against the helpers)
for label, X_test, lr_ranked, perc_ranked, corr_ranked in zip(labels, X_adj_test, lr_res, perceptron_res, corr_res):
    total_dims = len(X_test.columns)
    model_dims = dims[label]
    for alpha in alphas:
        top_k = total_dims * alpha // 100
        # The first element of every entry is the dimension; names are stored as strings
        top_lr = [str(entry[0]) for entry in lr_ranked[:top_k]]
        top_perc = [str(entry[0]) for entry in perc_ranked[:top_k]]
        top_corr = [str(entry[0]) for entry in corr_ranked[:top_k]]

        model_dims[f'LR{alpha}'] = top_lr
        model_dims[f'Perc{alpha}'] = top_perc
        model_dims[f'Corr{alpha}'] = top_corr
        # Dimensions on which all three methods agree
        model_dims[f'All imp dims{alpha}'] = list(set(top_lr).intersection(top_perc, top_corr))

## Compute medians

In [48]:
# model label -> dimension group -> {'0': per-dimension medians, '1': per-dimension medians}
medians = dict()

In [49]:
# Per-dimension medians of the training embeddings, separately for label 0 and label 1
for label, X_train, y_train in zip(labels, X_adj_train, y_adj_train):
    # The row subsets do not depend on the dimension group, so select them once per model
    X_train_0 = X_train[y_train == 0]
    X_train_1 = X_train[y_train == 1]
    medians[label] = {
        dim_group: {
            '0': X_train_0[dim_list].median(),
            '1': X_train_1[dim_list].median(),
        }
        for dim_group, dim_list in dims[label].items()
    }


In [50]:
# Number of dimensions in every dimension group of every model
dim_lens = {
    model: {dim_group: len(dim_list) for dim_group, dim_list in groups.items()}
    for model, groups in dims.items()
}

In [51]:
# Show the group sizes as a table: one row per dimension group, one column per model
pd.DataFrame(dim_lens)

Unnamed: 0,flau_small_c,flau_base_u,flau_base_c,flau_large_c,cam_base,xlm_large,xlm_base,bert_base_u,distilbert_base,bert_base_c
All dims,512,768,768,1024,768,1024,768,768,768,768
ANOVA,380,307,484,777,179,58,0,467,166,34
MI,375,448,519,769,461,527,268,661,516,467
All non ind,311,200,356,645,106,47,0,414,131,26
LR1,5,7,7,10,7,10,7,7,7,7
Perc1,5,7,7,10,7,10,7,7,7,7
Corr1,5,7,7,10,7,10,7,7,7,7
All imp dims1,3,3,1,4,1,0,0,0,0,0
LR5,25,38,38,51,38,51,38,38,38,38
Perc5,25,38,38,51,38,51,38,38,38,38


## Compute predictions

In [52]:
# model label -> dimension group -> predicted labels for the test split
y_preds = dict()

In [53]:
# Nearest-median classification: a test sample gets the label whose training medians
# are closest to it in mean absolute error (MAE) over the dimensions of the group.
for label, X_test in zip(labels, X_adj_test):
    y_preds[label] = {}
    for dim_group, dim_list in dims[label].items():
        if len(dim_list) == 0:
            # No dimensions to compare: there is no distance to compute, so predict 0 everywhere
            y_preds[label][dim_group] = pd.Series(0, index=X_test.index, dtype='int64')
            continue

        # Look everything up once per group instead of once per test row
        test_values = X_test[dim_list].to_numpy()
        median_0 = medians[label][dim_group]['0'].to_numpy()
        median_1 = medians[label][dim_group]['1'].to_numpy()

        # Row-wise MAE to each class's medians, computed for all rows at once
        # (same value as calling mean_absolute_error on every row)
        mae0 = np.abs(test_values - median_0).mean(axis=1)
        mae1 = np.abs(test_values - median_1).mean(axis=1)

        # If the MAE to the class-0 medians is not larger than to the class-1 medians,
        # the label is 0; otherwise it is 1.
        y_preds[label][dim_group] = pd.Series((mae0 > mae1).astype('int64'), index=X_test.index)
    

## Compute accuracies

In [54]:
# model label -> dimension group -> accuracy on the test split
accs = dict()

In [55]:
# Accuracy of every dimension group on each model's test split
for label, y_true in zip(labels, y_adj_test):
    accs[label] = {}
    for dim_group, dim_list in dims[label].items():
        if len(dim_list) == 0:
            # An empty dimension group cannot produce predictions; score it as 0
            acc = 0
        else:
            # Guarding on the group size (rather than on `any(y_pred)`) keeps the real
            # accuracy when a non-empty group happens to predict 0 for every sample.
            acc = accuracy_score(y_true, y_preds[label][dim_group])
        accs[label][dim_group] = acc

In [56]:
# Accuracy table: one row per dimension group, one column per model
accs_df = pd.DataFrame(data=accs)
accs_df

Unnamed: 0,flau_small_c,flau_base_u,flau_base_c,flau_large_c,cam_base,xlm_large,xlm_base,bert_base_u,distilbert_base,bert_base_c
All dims,0.923554,0.536028,0.647727,0.948347,0.565982,0.65625,0.625,0.515152,0.651685,0.494382
ANOVA,0.923554,0.552724,0.650826,0.947314,0.565982,0.625,0.0,0.515152,0.651685,0.651685
MI,0.922521,0.544815,0.667355,0.945248,0.565982,0.625,0.625,0.515152,0.651685,0.494382
All non ind,0.924587,0.560633,0.664256,0.946281,0.565982,0.625,0.0,0.515152,0.640449,0.651685
LR1,0.777893,0.543937,0.659091,0.892562,0.56305,0.46875,0.75,0.510101,0.573034,0.629213
Perc1,0.757231,0.563269,0.713843,0.904959,0.560117,0.59375,0.5,0.510101,0.606742,0.606742
Corr1,0.755165,0.553603,0.721074,0.893595,0.571848,0.65625,0.625,0.505051,0.640449,0.629213
All imp dims1,0.730372,0.557118,0.622934,0.879132,0.557185,0.0,0.0,0.0,0.0,0.0
LR5,0.868802,0.571178,0.684917,0.92562,0.565982,0.625,0.625,0.510101,0.685393,0.595506
Perc5,0.882231,0.584359,0.729339,0.932851,0.565982,0.59375,0.5625,0.510101,0.651685,0.674157


We can show the accuracy gain of each dimension group compared to using all dimensions:

In [57]:
# Accuracy difference of every dimension group relative to the 'All dims' row of the same model
gains_df = accs_df - accs_df.loc['All dims']
gains_df

Unnamed: 0,flau_small_c,flau_base_u,flau_base_c,flau_large_c,cam_base,xlm_large,xlm_base,bert_base_u,distilbert_base,bert_base_c
All dims,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ANOVA,0.0,0.016696,0.003099,-0.001033,0.0,-0.03125,-0.625,0.0,0.0,0.157303
MI,-0.001033,0.008787,0.019628,-0.003099,0.0,-0.03125,0.0,0.0,0.0,0.0
All non ind,0.001033,0.024605,0.016529,-0.002066,0.0,-0.03125,-0.625,0.0,-0.011236,0.157303
LR1,-0.145661,0.007909,0.011364,-0.055785,-0.002933,-0.1875,0.125,-0.005051,-0.078652,0.134831
Perc1,-0.166322,0.027241,0.066116,-0.043388,-0.005865,-0.0625,-0.125,-0.005051,-0.044944,0.11236
Corr1,-0.168388,0.017575,0.073347,-0.054752,0.005865,0.0,0.0,-0.010101,-0.011236,0.134831
All imp dims1,-0.193182,0.02109,-0.024793,-0.069215,-0.008798,-0.65625,-0.625,-0.515152,-0.651685,-0.494382
LR5,-0.054752,0.035149,0.03719,-0.022727,0.0,-0.03125,0.0,-0.005051,0.033708,0.101124
Perc5,-0.041322,0.04833,0.081612,-0.015496,0.0,-0.0625,-0.0625,-0.005051,0.0,0.179775


Here is the best dimension group for each model and the accuracy gain it provides over using all dimensions:

In [58]:
# Dimension group with the largest gain for every model. idxmax() is computed once and
# looked up by model label: integer keys on a label-indexed Series are positional
# lookups, which recent pandas versions deprecate.
best_gain_groups = gains_df.idxmax()
for label in labels:
    best_group = best_gain_groups[label]
    print(label, best_group, gains_df.loc[best_group, label])

flau_small_c Perc25 0.032024793388429784
flau_base_u Perc5 0.048330404217926115
flau_base_c Perc25 0.08884297520661155
flau_large_c LR25 0.010330578512396715
cam_base All imp dims5 0.008797653958944274
xlm_large All imp dims5 0.03125
xlm_base LR1 0.125
bert_base_u All dims 0.0
distilbert_base All imp dims10 0.0786516853932584
bert_base_c Perc5 0.1797752808988764


In [59]:
# Dimension group with the highest accuracy for every model. idxmax() is computed once and
# looked up by model label: integer keys on a label-indexed Series are positional
# lookups, which recent pandas versions deprecate.
best_acc_groups = accs_df.idxmax()
for label in labels:
    best_group = best_acc_groups[label]
    print(label, best_group, accs_df.loc[best_group, label])

flau_small_c Perc25 0.9555785123966942
flau_base_u Perc5 0.5843585237258347
flau_base_c Perc25 0.7365702479338843
flau_large_c LR25 0.9586776859504132
cam_base All imp dims5 0.5747800586510264
xlm_large All imp dims5 0.6875
xlm_base LR1 0.75
bert_base_u All dims 0.5151515151515151
distilbert_base All imp dims10 0.7303370786516854
bert_base_c Perc5 0.6741573033707865


In [60]:
# Best results are accumulated across runs in a pickle file; load it when present,
# otherwise start from an empty structure.
best_res_path = '../Data/best_results/number_adj.pickle'
best_res_keys = ('best_dim_set', 'best_dims', 'accs', 'gains', 'medians_0', 'medians_1')

# os.path.isfile returns False (instead of raising, as os.listdir does) when the
# results directory itself does not exist yet.
if os.path.isfile(best_res_path):
    # NOTE: pickle.load can execute arbitrary code -- only load files written by this notebook.
    with open(best_res_path, 'rb') as f:
        best_res = pickle.load(f)
else:
    best_res = {}

# Make sure every current model has all result lists, including models that were
# added after an existing pickle was written.
for label in labels:
    model_res = best_res.setdefault(label, {})
    for key in best_res_keys:
        model_res.setdefault(key, [])

In [61]:
# Append this run's best dimension group per model to the accumulated results.
# The best group is determined once from the accuracies and used for every field;
# per model the gains differ from the accuracies only by a constant, so this is also
# the group with the largest gain. Lookups are by label, not by position.
best_groups = accs_df.idxmax()
for label in labels:
    best_group = best_groups[label]
    model_res = best_res[label]
    model_res['best_dim_set'].append(best_group)
    model_res['best_dims'].append(dims[label][best_group])
    model_res['accs'].append(accs_df.loc[best_group, label])
    model_res['gains'].append(gains_df.loc[best_group, label])
    model_res['medians_0'].append(medians[label][best_group]['0'])
    model_res['medians_1'].append(medians[label][best_group]['1'])

In [62]:
# Persist the accumulated best results; create the results directory first so the
# write does not fail when it does not exist yet.
best_res_file = '../Data/best_results/number_adj.pickle'
os.makedirs(os.path.dirname(best_res_file), exist_ok=True)
with open(best_res_file, 'wb') as f:
    pickle.dump(best_res, f)

# Conclusion

1. Unlike for nouns, mBERT-base-cased achieved a massive improvement in accuracy (about 18 percentage points) when limiting the number of dimensions. This big jump could be due to the small size of the test set; in comparison, however, mBERT-base-uncased shows accuracy close to random.
2. For CamemBERT, however, the accuracy improvement over all dimensions is insignificant.