In [1]:
import sys
sys.path.append('../Util')
import pickle
import os

In [2]:
from IPython.display import Image

In [3]:
from evaluation import correlation, lr, perceptron, get_anova_dims, get_mi_dims
from preparation import prepare_dataset, read_datasets

In [4]:
from we import get_we, initiate_model

In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [6]:
from scipy.stats import f_oneway

In [7]:
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler, QuantileTransformer

In [8]:
import warnings
warnings.filterwarnings('ignore')

In [9]:
models = [
    {
        'name': 'flaubert/flaubert_small_cased',
        'label': 'flau_small_c'
    },
    {
    
        'name': 'flaubert/flaubert_base_uncased', 
        'label': 'flau_base_u'

    },
    {
        'name': 'flaubert/flaubert_base_cased',
        'label': 'flau_base_c'
    },
    {
        'name': 'flaubert/flaubert_large_cased',
        'label': 'flau_large_c'
    },
    {
        'name': 'camembert/camembert-base',
        'label': 'cam_base'
    },
    {
        'name': 'xlm-roberta-large',
        'label': 'xlm_large'
    },
    {
    
        'name': 'xlm-roberta-base', 
        'label': 'xlm_base'

    },
    {
        'name': 'bert-base-multilingual-uncased',
        'label': 'bert_base_u'
    },
    {
        'name': 'distilbert-base-multilingual-cased',
        'label': 'distilbert_base'
    },
    {
        'name': 'bert-base-multilingual-cased',
        'label': 'bert_base_c'
    }
    
]

In [10]:
labels = [m['label'] for m in models]

In [11]:
we_with_features = read_datasets(
                            path = '../Data',
                            model_labels = labels,
                            file_name = 'all_adjs_we.csv'
                    )

In [12]:
we_with_features[-1].head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,761,762,763,764,765,766,767,Gender,Number,Lemma
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
abandonné,0.530145,0.060065,0.007523,0.990865,0.430226,0.009984,0.979131,0.978007,0.169568,0.868192,...,0.03252,0.537425,0.982741,0.068059,0.106907,0.257188,0.76537,masculine,singular,abandonné
absent,0.531874,0.058954,0.008474,0.99232,0.430201,0.009588,0.979643,0.978505,0.169001,0.868324,...,0.030967,0.538838,0.983681,0.068282,0.106521,0.258556,0.765498,masculine,singular,absent
absolue,0.530525,0.059162,0.006882,0.99101,0.430248,0.010904,0.97886,0.97844,0.167912,0.868436,...,0.03162,0.535794,0.983176,0.068435,0.106496,0.259584,0.765379,feminine,singular,absolu
accessible,0.533019,0.059168,0.008158,0.993498,0.430884,0.008257,0.979684,0.97807,0.167863,0.868585,...,0.027722,0.534105,0.983887,0.06777,0.107722,0.258436,0.763149,invariable,singular,accessible
accompagné,0.52827,0.0593,0.005714,0.989006,0.430188,0.010091,0.980515,0.978253,0.171316,0.867668,...,0.035285,0.540979,0.982319,0.070023,0.107055,0.257919,0.768394,masculine,singular,accompagné


In [17]:
# There are 3 extra feautres in addition to embedding dimensions in the file: number, gender, lemma, semantic info
feature_col_count = 3

# Feature to investigate in this notebook
feature = 'Gender'

# Adjectives

In [18]:
pos = ['ADJ']

Split each model into train and test using k_fold cross validation:

In [19]:
X_adj_train = []
y_adj_train = []

X_adj_test = []
y_adj_test = []

In [20]:
n_folds = 5

In [21]:
for we in we_with_features:
    X, y = prepare_dataset(dataset=we[(we.Gender != 'invariable') & (we.Number != 'invariable')],
                                           feature_col_count=feature_col_count,
                                           feature_name=feature,
                                           normalize=False,
                                           encode=True,
                                           encode_as1='feminine',
                                           split=False,
                                           balance=True)
    X_trains = []
    y_trains = []
    
    X_tests = []
    y_tests = []
    
    X_folds = np.array_split(X, n_folds)
    y_folds = np.array_split(y, n_folds)
    
    for i in range(n_folds):
        X_trains.append(pd.DataFrame(np.concatenate(X_folds[:i] + X_folds[i+1:])))
        y_trains.append(np.concatenate(y_folds[:i] + y_folds[i+1:]))

        X_folds[i].columns = X_folds[i].columns.map(int)
        X_tests.append(X_folds[i])
        y_tests.append(y_folds[i])
        
    
    X_adj_train.append(X_trains)
    X_adj_test.append(X_tests)
    
    y_adj_train.append(y_trains)
    y_adj_test.append(y_tests)

In [22]:
dataset_sizes = pd.DataFrame(index=labels)
dataset_sizes['1 fold size'] = [len(x[0]) for x in X_adj_test]
dataset_sizes

Unnamed: 0,1 fold size
flau_small_c,1059
flau_base_u,1238
flau_base_c,1059
flau_large_c,1059
cam_base,587
xlm_large,92
xlm_base,92
bert_base_u,210
distilbert_base,160
bert_base_c,160


## Non-independent dims

Calculate non-independendent dimensions for each of _k_ splits.

ANOVA test with p-value < 0.01:

In [23]:
anova_dims = []

In [24]:
for i in range(len(models)):
    anova_dims.append([])
    for j in range(n_folds):
        anova_dims[i].append(get_anova_dims(X_adj_train[i][j], y_adj_train[i][j]))

Get dimensions where MI > 0:

In [25]:
mi_dims = []

In [26]:
for i in range(len(models)):
    mi_dims.append([])
    for j in range(n_folds):
        mi_dims[i].append(get_mi_dims(X_adj_train[i][j], y_adj_train[i][j]))

In [27]:
for i in range(len(models)):
    for j in range(n_folds):
        temp = list(map(lambda x: int(x), mi_dims[i][j]))
        mi_dims[i][j] = temp

Get dimensions that are both found by the ANOVA independency test and MI test:

In [28]:
non_ind_dims = []

In [29]:
for i in range(len(models)):
    non_ind_dims.append([])
    for j in range(n_folds):
        non_ind_dims[i].append(list(set(anova_dims[i][j]).intersection(mi_dims[i][j])))

Stats about the number of dimensions for each model for the 1st split:

In [30]:
non_ind_df = pd.DataFrame(index=labels, columns=['ANOVA', 'MI', 'Total non independent'])

In [31]:
non_ind_df['ANOVA'] = [len(x[0]) for x in anova_dims]
non_ind_df['MI'] = [len(x[0]) for x in mi_dims]
non_ind_df['Total non independent'] = [len(x[0]) for x in non_ind_dims]

In [32]:
non_ind_df

Unnamed: 0,ANOVA,MI,Total non independent
flau_small_c,372,371,290
flau_base_u,202,418,121
flau_base_c,360,474,247
flau_large_c,791,744,615
cam_base,106,447,63
xlm_large,10,506,6
xlm_base,11,303,6
bert_base_u,7,373,1
distilbert_base,234,522,191
bert_base_c,11,481,11


In [33]:
dims = {}

In [34]:
for i in range(len(models)):
    dims[labels[i]] = {}
    dims[labels[i]]['All dims'] = [X_adj_train[i][0].columns]*n_folds
    dims[labels[i]]['ANOVA'] = anova_dims[i]
    dims[labels[i]]['MI'] = mi_dims[i]
    dims[labels[i]]['All non ind'] = non_ind_dims[i]

## Important dims

We can test different $\alpha$ values: 1%, 5%, 10%, 25%, 50%, 75%.

In [35]:
alphas = [1, 5, 10, 25, 50, 75]

Train Logistic Regression on train set for each model:

In [36]:
lr_res = []

In [37]:
for i in range(len(models)):
    lr_res.append([])
    for j in range(n_folds):
        lr_res[i].append(lr(X_adj_train[i][j], y_adj_train[i][j]))

Train Perceptron:

In [38]:
perceptron_res = []

In [39]:
for i in range(len(models)):
    perceptron_res.append([])
    for j in range(n_folds):
        perceptron_res[i].append(perceptron(X_adj_train[i][j], y_adj_train[i][j]))

Compute correlation to the gender vector:

In [40]:
corr_res = []

In [41]:
for i in range(len(models)):
    corr_res.append([])
    for j in range(n_folds):
        corr_res[i].append(correlation(X_adj_train[i][j], y_adj_train[i][j]))

In [42]:
for i in range(len(models)):
    for alpha in alphas:
        dims[labels[i]][f'LR{alpha}'] = []
        dims[labels[i]][f'Perc{alpha}'] = []
        dims[labels[i]][f'Corr{alpha}'] = []
        dims[labels[i]][f'All imp dims{alpha}'] = []
        for j in range(n_folds):
            num_imp_dims = len(X_adj_test[i][0].columns)*alpha//100
            lr_dims = [x[0] for x in lr_res[i][j][:num_imp_dims]]
            perc_dims = [x[0] for x in perceptron_res[i][j][:num_imp_dims]]
            corr_dims = [x[0] for x in corr_res[i][j][:num_imp_dims]]
            dims[labels[i]][f'LR{alpha}'].append(lr_dims)
            dims[labels[i]][f'Perc{alpha}'].append(perc_dims)
            dims[labels[i]][f'Corr{alpha}'].append(corr_dims)
            dims[labels[i]][f'All imp dims{alpha}'].append(list(set(lr_dims).intersection(perc_dims).intersection(corr_dims)))

## Compute medians

In [43]:
medians = {}

In [44]:
for i in range(len(models)):
    medians[labels[i]] = {}
    for dim_group in dims[labels[i]].keys():
        medians[labels[i]][dim_group] = []
        n = len(dims[labels[i]][dim_group])
        for j in range(n_folds):
            dim_list = dims[labels[i]][dim_group][j] if n == n_folds else list(dims[labels[i]][dim_group])
            # Median of dimensions where feature vector is equal to 0
            median_0 = X_adj_train[i][j][y_adj_train[i][j] == 0][dim_list].median()
            # Median of dimensions where feature vector is equal to 1
            median_1 = X_adj_train[i][j][y_adj_train[i][j] == 1][dim_list].median()

            medians[labels[i]][dim_group].append({
                    '0': median_0,
                    '1': median_1
                })

We can compare the number of dimensions found by each test for each model in the first fold:

In [45]:
dim_lens = {}

for model in dims.keys():
    dim_lens[model] = {}
    for dim_group in dims[model].keys():
        dim_lens[model][dim_group] = len(dims[model][dim_group][0]) if dim_group != 'All dims' else \
                len(dims[model][dim_group])

In [46]:
pd.DataFrame(dim_lens)

Unnamed: 0,flau_small_c,flau_base_u,flau_base_c,flau_large_c,cam_base,xlm_large,xlm_base,bert_base_u,distilbert_base,bert_base_c
All dims,5,5,5,5,5,5,5,5,5,5
ANOVA,372,202,360,791,106,10,11,7,234,11
MI,371,418,474,744,447,506,303,373,522,481
All non ind,290,121,247,615,63,6,6,1,191,11
LR1,5,7,7,10,7,10,7,7,7,7
Perc1,5,7,7,10,7,10,7,7,7,7
Corr1,5,7,7,10,7,10,7,7,7,7
All imp dims1,1,0,1,0,0,0,1,0,0,0
LR5,25,38,38,51,38,51,38,38,38,38
Perc5,25,38,38,51,38,51,38,38,38,38


## Compute predictions

In [47]:
y_preds = {}

In [48]:
for i in range(len(models)):
    y_preds[labels[i]] = {}
    for dim_group in dims[labels[i]].keys():
        y_preds[labels[i]][dim_group] = []
        n = len(dims[labels[i]][dim_group])
        for j in range(n_folds):
            dim_list = dims[labels[i]][dim_group][j] if n == n_folds else dims[labels[i]][dim_group]
            
            # If MSE for sample 0 is lower than for sample 1, the label should be 0. So we need to convert False to 0.
            mae0 = X_adj_test[i][j][dim_list].apply(lambda x: mean_absolute_error(medians[labels[i]][dim_group][j]['0'], x), axis=1)
            mae1 = X_adj_test[i][j][dim_list].apply(lambda x: mean_absolute_error(medians[labels[i]][dim_group][j]['1'], x), axis=1)
            
            y_preds[labels[i]][dim_group].append((mae0 > mae1).apply(int))
    

## Compute accuracies

In [49]:
accs = {}

In [50]:
for i in range(len(models)):
    accs[labels[i]] = {}
    for dim_group in dims[labels[i]].keys():
        accs[labels[i]][dim_group] = []
        for j in range(n_folds):
            y_true = y_adj_test[i][j]
            y_pred = y_preds[labels[i]][dim_group][j]
            if any(y_pred):
                acc = accuracy_score(y_true, y_pred)
            else:
                acc = 0
            accs[labels[i]][dim_group].append(acc)

In [51]:
# Compute average accuracy between 5 folds
avg_accs = {}

In [52]:
for i in range(len(models)):
    avg_accs[labels[i]] = {}
    for dim_group in dims[labels[i]].keys():
        avg_accs[labels[i]][dim_group] = np.average(accs[labels[i]][dim_group])

In [53]:
accs_df = pd.DataFrame(avg_accs)
accs_df

Unnamed: 0,flau_small_c,flau_base_u,flau_base_c,flau_large_c,cam_base,xlm_large,xlm_base,bert_base_u,distilbert_base,bert_base_c
All dims,0.920099,0.545734,0.583862,0.912731,0.523861,0.517296,0.541472,0.519945,0.610503,0.531439
ANOVA,0.920854,0.565448,0.58443,0.912353,0.524201,0.537076,0.550119,0.518988,0.607964,0.575322
MI,0.921609,0.554298,0.589531,0.91103,0.523861,0.534806,0.537124,0.520898,0.611761,0.523884
All non ind,0.922177,0.5724,0.590288,0.910275,0.522498,0.530506,0.569828,0.520893,0.610503,0.565259
LR1,0.697961,0.531197,0.549673,0.696259,0.51568,0.541424,0.554443,0.523764,0.591667,0.506203
Perc1,0.720813,0.558981,0.608421,0.773518,0.522494,0.526135,0.550143,0.512326,0.600417,0.536423
Corr1,0.729884,0.560436,0.615599,0.794673,0.518752,0.537076,0.558863,0.524694,0.602956,0.56283
All imp dims1,0.625803,0.0,0.106138,0.372543,0.098635,0.091304,0.084783,0.0,0.10566,0.0
LR5,0.855118,0.556077,0.59293,0.850774,0.525226,0.554372,0.534974,0.518988,0.663278,0.521329
Perc5,0.880996,0.576439,0.636002,0.893838,0.520794,0.53483,0.556713,0.518988,0.610495,0.515008


We can show what is the accuracy gain for each dimension test comparison to using all dimensions:

In [54]:
gains_df = (accs_df - accs_df.loc['All dims'])
gains_df

Unnamed: 0,flau_small_c,flau_base_u,flau_base_c,flau_large_c,cam_base,xlm_large,xlm_base,bert_base_u,distilbert_base,bert_base_c
All dims,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ANOVA,0.000755,0.019714,0.000568,-0.000378,0.000341,0.01978,0.008648,-0.0009569378,-0.002539,0.043884
MI,0.00151,0.008564,0.005669,-0.0017,0.0,0.017511,-0.004348,0.000952381,0.001258,-0.007555
All non ind,0.002078,0.026665,0.006426,-0.002456,-0.001363,0.013211,0.028356,0.0009478241,0.0,0.033821
LR1,-0.222138,-0.014537,-0.034189,-0.216472,-0.00818,0.024128,0.012972,0.003818638,-0.018836,-0.025236
Perc1,-0.199286,0.013247,0.024559,-0.139212,-0.001366,0.008839,0.008672,-0.007619048,-0.010086,0.004984
Corr1,-0.190215,0.014702,0.031737,-0.118057,-0.005109,0.01978,0.017391,0.004748234,-0.007547,0.031392
All imp dims1,-0.294296,-0.545734,-0.477724,-0.540187,-0.425226,-0.425991,-0.456689,-0.5199453,-0.504843,-0.531439
LR5,-0.064981,0.010343,0.009067,-0.061957,0.001365,0.037076,-0.006498,-0.0009569378,0.052775,-0.01011
Perc5,-0.039103,0.030704,0.05214,-0.018893,-0.003066,0.017535,0.015241,-0.0009569378,-8e-06,-0.016431


Here are the dimension groups and the accuracy gain this dimension group provides over all dimensions:

In [55]:
for i in range(len(models)):
    print(labels[i], gains_df.idxmax()[i], gains_df.loc[gains_df.idxmax()[i], labels[i]])

flau_small_c LR50 0.028712395865129525
flau_base_u Perc25 0.03749064585093698
flau_base_c Perc10 0.061959868692332054
flau_large_c All imp dims50 0.016243165521562575
cam_base All imp dims10 0.003425179224494368
xlm_large LR5 0.03707596751075015
xlm_base All non ind 0.028356426182513217
bert_base_u Corr1 0.00474823422191839
distilbert_base LR10 0.061564465408805
bert_base_c ANOVA 0.0438836477987421


In [56]:
for i in range(len(models)):
    print(labels[i], accs_df.idxmax()[i], accs_df.loc[accs_df.idxmax()[i], labels[i]])

flau_small_c LR50 0.9488114299790615
flau_base_u Perc25 0.5832248273808514
flau_base_c Perc10 0.6458220206315121
flau_large_c All imp dims50 0.9289737259711075
cam_base All imp dims10 0.527285730067271
xlm_large LR5 0.5543717152412805
xlm_base All non ind 0.5698279980888677
bert_base_u Corr1 0.5246935520619731
distilbert_base LR10 0.6720676100628931
bert_base_c ANOVA 0.5753223270440251


For each fold find dimensions with the highest accuracy, and pick dimensions appeared in all 5 folds:

In [57]:
best_dims = {}

In [58]:
dim_cand_accs = {}

In [59]:
for i in range(len(labels)):
    bd = []
    for j in range(n_folds):
        best_dim_group = ''
        best_acc = 0
        for dim_group in accs[labels[i]].keys():
            if accs[labels[i]][dim_group][j] > best_acc:
                best_dim_group = dim_group
                best_acc = accs[labels[i]][dim_group][j]
        bd.extend(dims[labels[i]][best_dim_group][j])
    
    # Find dimensions that appeared within dimensions with the highest accuracy in all 5 folds
    d, c = np.unique(bd, return_counts=True)
    dim_cand = [x[0] for x in zip(d, c) if x[1] >= 5]
    best_dims[labels[i]] = dim_cand
    print(f'{labels[i]}: {len(dim_cand)} dimensions repeated in 5 folds')

    # For each fold build a prediction and calculate accuracy
    cand_accs = []
    for j in range(n_folds):
        med0_cand = X_adj_train[i][j][y_adj_train[i][j] == 0][dim_cand].median()
        med1_cand = X_adj_train[i][j][y_adj_train[i][j] == 1][dim_cand].median()
        
        mae0_cand = X_adj_test[i][j][dim_cand].apply(lambda x: mean_absolute_error(med0_cand, x), axis=1)
        mae1_cand = X_adj_test[i][j][dim_cand].apply(lambda x: mean_absolute_error(med1_cand, x), axis=1)
        
        y_pred_cand = (mae0_cand > mae1_cand).apply(int)
        if any(y_pred_cand):
            cand_accs.append(accuracy_score(y_true=y_adj_test[i][j], y_pred=y_pred_cand))
        else:
            cand_accs.append(0)
    dim_cand_accs[labels[i]] = cand_accs
    print(f'Average accuracy: {np.average(cand_accs)}')

flau_small_c: 175 dimensions repeated in 5 folds
Average accuracy: 0.9507005396181081
flau_base_u: 26 dimensions repeated in 5 folds
Average accuracy: 0.5715898984332044
flau_base_c: 22 dimensions repeated in 5 folds
Average accuracy: 0.6427999450207154
flau_large_c: 134 dimensions repeated in 5 folds
Average accuracy: 0.9261392582437689
cam_base: 1 dimensions repeated in 5 folds
Average accuracy: 0.52488967445971
xlm_large: 0 dimensions repeated in 5 folds
Average accuracy: 0.0
xlm_base: 0 dimensions repeated in 5 folds
Average accuracy: 0.0
bert_base_u: 0 dimensions repeated in 5 folds
Average accuracy: 0.0
distilbert_base: 18 dimensions repeated in 5 folds
Average accuracy: 0.6532232704402515
bert_base_c: 5 dimensions repeated in 5 folds
Average accuracy: 0.6055345911949686


In [60]:
with open('../Data/best_results/gender_adj_dims.pickle', 'wb') as f:
    pickle.dump(dims, f)

In [61]:
with open('../Data/best_results/gender_adj_accs.pickle', 'wb') as f:
    pickle.dump(accs, f)

In [62]:
with open('../Data/best_results/gender_adj_medians.pickle', 'wb') as f:
    pickle.dump(medians, f)

# Conclusion