In [1]:
import sys
sys.path.append('../Util')
import pickle
import os

In [2]:
from evaluation import correlation, lr, perceptron, get_anova_dims, get_mi_dims
from preparation import prepare_dataset, read_datasets

In [3]:
from we import get_we, initiate_model

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
from scipy.stats import f_oneway

In [6]:
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler, QuantileTransformer

In [7]:
import warnings
# Suppress every warning for the rest of the session; genuine problems
# (deprecations, convergence warnings, ...) will not be shown either.
warnings.filterwarnings('ignore')

In [8]:
# Models under study as (model name, short label) pairs. The short label is
# what the rest of the notebook uses to refer to a model.
_MODEL_SPECS = (
    ('flaubert/flaubert_small_cased', 'flau_small_c'),
    ('flaubert/flaubert_base_uncased', 'flau_base_u'),
    ('flaubert/flaubert_base_cased', 'flau_base_c'),
    ('flaubert/flaubert_large_cased', 'flau_large_c'),
    ('camembert/camembert-base', 'cam_base'),
    ('xlm-roberta-large', 'xlm_large'),
    ('xlm-roberta-base', 'xlm_base'),
    ('bert-base-multilingual-uncased', 'bert_base_u'),
    ('distilbert-base-multilingual-cased', 'distilbert_base'),
    ('bert-base-multilingual-cased', 'bert_base_c'),
)

models = [{'name': name, 'label': label} for name, label in _MODEL_SPECS]

In [9]:
# Short labels, in the same order as `models`.
labels = [spec['label'] for spec in models]

In [11]:
# Load the embedding tables for the given model labels from ../Data.
we_with_features = read_datasets(
    path='../Data',
    model_labels=labels,
    file_name='all_unique_pos_we.csv',
)

In [12]:
# Preview the first rows of the last dataset in the list.
we_with_features[-1].head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,764,765,766,767,Number,Gender,Lemma,POS,Tense,Person
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2D,0.909603,0.551025,0.436937,0.789782,0.311517,0.417476,0.629341,0.913722,0.349834,0.838988,...,0.285154,0.057887,0.030874,0.310204,invariable,feminine,2D,NOUN,,
3D,0.901779,0.54988,0.427826,0.797102,0.3139,0.419226,0.623478,0.916539,0.355807,0.844714,...,0.28761,0.054251,0.018604,0.305226,invariable,feminine,3D,NOUN,,
aa,0.89957,0.555891,0.418186,0.805754,0.319232,0.412976,0.618457,0.914183,0.367882,0.845985,...,0.287883,0.057476,0.013614,0.290463,invariable,masculine,aa,NOUN,,
abandon,0.898879,0.556306,0.417821,0.805256,0.318463,0.412505,0.618276,0.914983,0.367654,0.846577,...,0.288183,0.057827,0.01405,0.290994,singular,masculine,abandon,NOUN,,
abbaye,0.895888,0.554461,0.420659,0.799339,0.322441,0.418141,0.61882,0.921033,0.367005,0.845904,...,0.293985,0.057312,0.012175,0.298473,singular,feminine,abbaye,NOUN,,


In [13]:
# There are 6 extra feature columns in addition to the embedding dimensions in the file:
# Number, Gender, Lemma, POS, Tense, Person
feature_col_count = 6

# Feature to investigate in this notebook
feature = 'Gender'

# Nouns and adjectives

In [14]:
# POS tags kept for this analysis; rows with any other tag are filtered out below.
pos = ['ADJ', 'NOUN']

Split each model's dataset into train and test sets:

In [15]:
# Per-model train/test splits; each list receives one entry per dataset.
X_na_train, y_na_train = [], []
X_na_test, y_na_test = [], []

In [16]:
for embeddings in we_with_features:
    # Drop rows whose Gender or Number is 'invariable' and keep only the
    # POS tags listed in `pos`.
    keep_rows = (
        (embeddings.Gender != 'invariable')
        & (embeddings.Number != 'invariable')
        & (embeddings.POS.isin(pos))
    )
    train_X, test_X, train_y, test_y = prepare_dataset(
        dataset=embeddings[keep_rows],
        feature_col_count=feature_col_count,
        feature_name=feature,
        normalize=False,
        encode=True,
        encode_as1='feminine',
        split=True,
        balance=True,
    )

    X_na_train.append(train_X)
    X_na_test.append(test_X)
    y_na_train.append(train_y)
    y_na_test.append(test_y)

In [17]:
# Number of samples in each model's train and test split.
dataset_sizes = pd.DataFrame(
    {
        'Train size': [len(split) for split in X_na_train],
        'Test size': [len(split) for split in X_na_test],
    },
    index=labels,
)
dataset_sizes

Unnamed: 0,Train size,Test size
flau_small_c,7166,1792
flau_base_u,9180,2296
flau_base_c,7166,1792
flau_large_c,7166,1792
cam_base,4825,1207
xlm_large,1580,396
xlm_base,1580,396
bert_base_u,3076,770
distilbert_base,2108,528
bert_base_c,2108,528


## Non-independent dims

ANOVA test with p-value < 0.01:

In [18]:
# Result of get_anova_dims on each model's training split, in model order.
anova_dims = []
for model_idx in range(len(models)):
    anova_dims.append(get_anova_dims(X_na_train[model_idx], y_na_train[model_idx]))

In [19]:
# Result of get_mi_dims on each model's training split, in model order.
mi_dims = []
for model_idx in range(len(models)):
    mi_dims.append(get_mi_dims(X_na_train[model_idx], y_na_train[model_idx]))

In [20]:
# Dimensions selected by both the ANOVA and the MI step, one list per model.
# The ANOVA ordering is kept (duplicates dropped) instead of going through
# list(set(...)): set iteration order for string labels depends on hash
# randomisation, so the list order would change between kernel restarts.
non_ind_dims = []
for model_idx in range(len(models)):
    mi_selected = set(mi_dims[model_idx])
    non_ind_dims.append(
        [dim for dim in dict.fromkeys(anova_dims[model_idx]) if dim in mi_selected]
    )

Stats about the number of dimensions for each model:

In [21]:
# Empty summary table: one row per model label, one column per selection step.
non_ind_columns = ['ANOVA', 'MI', 'Total non independent']
non_ind_df = pd.DataFrame(columns=non_ind_columns, index=labels)

In [22]:
# Fill each column with the number of dimensions retained per model.
for column, per_model_dims in (
    ('ANOVA', anova_dims),
    ('MI', mi_dims),
    ('Total non independent', non_ind_dims),
):
    non_ind_df[column] = [len(selected) for selected in per_model_dims]

In [23]:
# Display the per-model dimension counts.
non_ind_df

Unnamed: 0,ANOVA,MI,Total non independent
flau_small_c,340,328,240
flau_base_u,252,593,197
flau_base_c,320,420,202
flau_large_c,785,740,621
cam_base,334,524,245
xlm_large,234,574,148
xlm_base,565,477,371
bert_base_u,15,548,11
distilbert_base,203,447,130
bert_base_c,2,462,1


In [24]:
# dims[model label][group name] -> dimension names belonging to that group
dims = {}

In [25]:
for model_idx in range(len(models)):
    # Baseline (every column of the training split) plus the selected groups.
    dims[labels[model_idx]] = {
        'All dims': X_na_train[model_idx].columns,
        'ANOVA': anova_dims[model_idx],
        'MI': mi_dims[model_idx],
        'All non ind': non_ind_dims[model_idx],
    }

## Important dims

We can test different $\alpha$ values: 1%, 5%, 10%, 25%, 50%, 75%.

In [26]:
# Percentages of the total number of dimensions to keep as "important".
alphas = [1, 5, 10, 25, 50, 75]

Train Logistic Regression on train set for each model:

In [27]:
# Result of lr(...) on each model's training split, in model order.
lr_res = []
for model_idx in range(len(models)):
    lr_res.append(lr(X_na_train[model_idx], y_na_train[model_idx]))

Train Perceptron 10 times and get average weights:

In [28]:
# Result of perceptron(...) on each model's training split, in model order.
perceptron_res = []
for model_idx in range(len(models)):
    perceptron_res.append(perceptron(X_na_train[model_idx], y_na_train[model_idx]))

Compute correlation to the gender vector:

In [29]:
# Result of correlation(...) on each model's training split, in model order.
corr_res = []
for model_idx in range(len(models)):
    corr_res.append(correlation(X_na_train[model_idx], y_na_train[model_idx]))

In [30]:
for i in range(len(models)):
    model_dims = dims[labels[i]]
    # The number of columns does not depend on alpha, so compute it once.
    total_dims = len(X_na_test[i].columns)
    for alpha in alphas:
        # alpha percent of the dimensions, rounded down.
        num_imp_dims = total_dims * alpha // 100
        # NOTE(review): assumes each result list is ranked by importance and
        # that x[0] is the dimension id -- confirm in evaluation.py.
        lr_dims = [str(x[0]) for x in lr_res[i][:num_imp_dims]]
        perc_dims = [str(x[0]) for x in perceptron_res[i][:num_imp_dims]]
        corr_dims = [str(x[0]) for x in corr_res[i][:num_imp_dims]]
        model_dims[f'LR{alpha}'] = lr_dims
        model_dims[f'Perc{alpha}'] = perc_dims
        model_dims[f'Corr{alpha}'] = corr_dims
        # Dimensions picked by all three methods. The LR ordering is kept
        # instead of list(set(...)), whose order for strings depends on hash
        # randomisation and would differ between kernel restarts.
        shared_perc_corr = set(perc_dims).intersection(corr_dims)
        model_dims[f'All imp dims{alpha}'] = [
            dim for dim in dict.fromkeys(lr_dims) if dim in shared_perc_corr
        ]

## Compute medians

In [31]:
# medians[model label][group name]['0' | '1'] -> per-dimension training median
# of the rows whose encoded label is 0 / 1
medians = {}

In [32]:
for model_idx in range(len(models)):
    label = labels[model_idx]
    train_X = X_na_train[model_idx]
    train_y = y_na_train[model_idx]
    # Training rows split by encoded feature value (0 vs 1).
    rows_0 = train_X[train_y == 0]
    rows_1 = train_X[train_y == 1]

    medians[label] = {}
    for dim_group in dims[label].keys():
        dim_list = dims[label][dim_group]
        # Per-dimension median of each class, restricted to this group's columns.
        medians[label][dim_group] = {
            '0': rows_0[dim_list].median(),
            '1': rows_1[dim_list].median(),
        }


In [33]:
# Number of dimensions in every group, keyed by model label then group name.
dim_lens = {
    model_label: {group_name: len(members) for group_name, members in groups.items()}
    for model_label, groups in dims.items()
}

In [34]:
# Size of every dimension group (rows) for every model (columns).
pd.DataFrame(dim_lens)

Unnamed: 0,flau_small_c,flau_base_u,flau_base_c,flau_large_c,cam_base,xlm_large,xlm_base,bert_base_u,distilbert_base,bert_base_c
All dims,512,768,768,1024,768,1024,768,768,768,768
ANOVA,340,252,320,785,334,234,565,15,203,2
MI,328,593,420,740,524,574,477,548,447,462
All non ind,240,197,202,621,245,148,371,11,130,1
LR1,5,7,7,10,7,10,7,7,7,7
Perc1,5,7,7,10,7,10,7,7,7,7
Corr1,5,7,7,10,7,10,7,7,7,7
All imp dims1,1,1,0,2,0,0,1,0,0,0
LR5,25,38,38,51,38,51,38,38,38,38
Perc5,25,38,38,51,38,51,38,38,38,38


## Compute predictions

In [35]:
# y_preds[model label][group name] -> predictions for that model's test rows
y_preds = {}

In [36]:
for i in range(len(models)):
    label = labels[i]
    y_preds[label] = {}
    for dim_group in dims[label].keys():
        dim_list = dims[label][dim_group]
        test_subset = X_na_test[i][dim_list]
        # Mean absolute error between every test row and each class median,
        # computed with vectorised pandas arithmetic instead of one
        # sklearn call per row.
        mae0 = (test_subset - medians[label][dim_group]['0']).abs().mean(axis=1)
        mae1 = (test_subset - medians[label][dim_group]['1']).abs().mean(axis=1)
        # Predict 1 when the row is strictly closer (lower MAE) to the class-1
        # median, otherwise 0. An empty group gives NaN distances and
        # therefore all-zero predictions.
        y_preds[label][dim_group] = (mae0 > mae1).astype(int)
    

## Compute accuracies

In [37]:
# accs[model label][group name] -> test accuracy obtained with that group
accs = {}

In [38]:
for i in range(len(models)):
    label = labels[i]
    accs[label] = {}
    y_true = y_na_test[i]
    for dim_group in dims[label].keys():
        if len(dims[label][dim_group]) == 0:
            # Empty group: nothing was actually predicted, so keep the 0
            # placeholder. Testing the group size (rather than any(y_pred))
            # avoids zeroing the score of a non-empty group that happens to
            # predict class 0 for every test row.
            acc = 0
        else:
            acc = accuracy_score(y_true, y_preds[label][dim_group])
        accs[label][dim_group] = acc

In [39]:
# Rows are dimension groups, columns are model labels.
accs_df = pd.DataFrame(accs)
accs_df

Unnamed: 0,flau_small_c,flau_base_u,flau_base_c,flau_large_c,cam_base,xlm_large,xlm_base,bert_base_u,distilbert_base,bert_base_c
All dims,0.720424,0.530923,0.634487,0.872768,0.545153,0.575758,0.540404,0.505195,0.651515,0.558712
ANOVA,0.72154,0.556185,0.637835,0.872768,0.545153,0.583333,0.540404,0.503896,0.655303,0.558712
MI,0.715402,0.543554,0.636719,0.878906,0.545153,0.578283,0.540404,0.496104,0.642045,0.560606
All non ind,0.720424,0.550087,0.639509,0.87779,0.544325,0.588384,0.540404,0.509091,0.642045,0.558712
LR1,0.600446,0.544861,0.546875,0.632254,0.549296,0.555556,0.545455,0.490909,0.530303,0.55303
Perc1,0.626116,0.557491,0.583705,0.762277,0.536868,0.613636,0.520202,0.490909,0.606061,0.537879
Corr1,0.62779,0.533972,0.580915,0.751116,0.545982,0.626263,0.54798,0.528571,0.630682,0.539773
All imp dims1,0.582031,0.537892,0.0,0.607143,0.0,0.0,0.555556,0.0,0.0,0.0
LR5,0.698661,0.534843,0.592076,0.740513,0.540182,0.621212,0.540404,0.51039,0.632576,0.554924
Perc5,0.728795,0.585801,0.613839,0.848772,0.532726,0.608586,0.532828,0.505195,0.657197,0.55303


We can show the accuracy gain of each dimension group compared to using all dimensions:

In [40]:
# Accuracy of every group minus the same model's 'All dims' baseline.
baseline_acc = accs_df.loc['All dims']
gains_df = accs_df.sub(baseline_acc, axis='columns')
gains_df

Unnamed: 0,flau_small_c,flau_base_u,flau_base_c,flau_large_c,cam_base,xlm_large,xlm_base,bert_base_u,distilbert_base,bert_base_c
All dims,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ANOVA,0.001116,0.025261,0.003348,0.0,0.0,0.007576,0.0,-0.001299,0.003788,0.0
MI,-0.005022,0.012631,0.002232,0.006138,0.0,0.002525,0.0,-0.009091,-0.00947,0.001894
All non ind,0.0,0.019164,0.005022,0.005022,-0.000829,0.012626,0.0,0.003896,-0.00947,0.0
LR1,-0.119978,0.013937,-0.087612,-0.240513,0.004143,-0.020202,0.005051,-0.014286,-0.121212,-0.005682
Perc1,-0.094308,0.026568,-0.050781,-0.110491,-0.008285,0.037879,-0.020202,-0.014286,-0.045455,-0.020833
Corr1,-0.092634,0.003049,-0.053571,-0.121652,0.000829,0.050505,0.007576,0.023377,-0.020833,-0.018939
All imp dims1,-0.138393,0.006969,-0.634487,-0.265625,-0.545153,-0.575758,0.015152,-0.505195,-0.651515,-0.558712
LR5,-0.021763,0.00392,-0.042411,-0.132254,-0.004971,0.045455,0.0,0.005195,-0.018939,-0.003788
Perc5,0.008371,0.054878,-0.020647,-0.023996,-0.012428,0.032828,-0.007576,0.0,0.005682,-0.005682


For each model, here is the best-performing dimension group and the accuracy gain it provides over using all dimensions:

In [41]:
# Best dimension group per model, computed once and looked up by label.
# (Indexing the label-indexed idxmax() result with an integer position is
# deprecated in recent pandas releases.)
best_gain_groups = gains_df.idxmax()
for label in labels:
    best_group = best_gain_groups[label]
    print(label, best_group, gains_df.loc[best_group, label])

flau_small_c Perc25 0.0803571428571429
flau_base_u Perc5 0.054878048780487854
flau_base_c Perc25 0.0345982142857143
flau_large_c Perc50 0.0234375
cam_base LR1 0.0041425020712511085
xlm_large Corr1 0.0505050505050505
xlm_base All imp dims1 0.015151515151515138
bert_base_u Corr1 0.023376623376623384
distilbert_base Perc10 0.03598484848484851
bert_base_c Perc25 0.0037878787878787845


In [42]:
# Dimension group with the highest accuracy per model, computed once and
# looked up by label. (Indexing the label-indexed idxmax() result with an
# integer position is deprecated in recent pandas releases.)
best_acc_groups = accs_df.idxmax()
for label in labels:
    best_group = best_acc_groups[label]
    print(label, best_group, accs_df.loc[best_group, label])

flau_small_c Perc25 0.80078125
flau_base_u Perc5 0.585801393728223
flau_base_c Perc25 0.6690848214285714
flau_large_c Perc50 0.8962053571428571
cam_base LR1 0.5492957746478874
xlm_large Corr1 0.6262626262626263
xlm_base All imp dims1 0.5555555555555556
bert_base_u Corr1 0.5285714285714286
distilbert_base Perc10 0.6875
bert_base_c Perc25 0.5625


In [43]:
best_res_path = '../Data/best_results/gender_na.pickle'

# os.path.isfile also copes with a missing best_results directory, where
# os.listdir would raise FileNotFoundError.
if not os.path.isfile(best_res_path):
    # No previous results: start with empty histories for every model.
    best_res = {
        label: {
            'best_dim_set': [],
            'best_dims': [],
            'accs': [],
            'gains': [],
            'medians_0': [],
            'medians_1': [],
        }
        for label in labels
    }
else:
    # NOTE(review): pickle.load can execute arbitrary code; only load files
    # that were produced by this project.
    with open(best_res_path, 'rb') as f:
        best_res = pickle.load(f)

In [44]:
# Append this run's best result per model. Every execution of this cell adds
# one more entry to each history list.
# The best group is resolved once from gains_df and used for every field, so
# the stored dims, accuracy, gain and medians always describe the same group.
# Lookup is by label: integer-position indexing of the label-indexed idxmax()
# result is deprecated in recent pandas releases.
best_groups = gains_df.idxmax()
for label in labels:
    best_group = best_groups[label]
    entry = best_res[label]
    entry['best_dim_set'].append(best_group)
    entry['best_dims'].append(dims[label][best_group])
    entry['accs'].append(accs_df.loc[best_group, label])
    entry['gains'].append(gains_df.loc[best_group, label])
    entry['medians_0'].append(medians[label][best_group]['0'])
    entry['medians_1'].append(medians[label][best_group]['1'])

In [45]:
# Make sure the output directory exists before writing the results.
os.makedirs('../Data/best_results/', exist_ok=True)
with open('../Data/best_results/gender_na.pickle', 'wb') as f:
    pickle.dump(best_res, f)

# Conclusion

1. The results for both nouns and adjectives show the same trends: among the French models, FlauBERT shows the best results; among the multilingual models, XLM large and DistilBERT do.
2. For all models, the accuracy is improved when a subset of dimensions is selected instead of all dimensions.