In [1]:
import sys
sys.path.append('../Util')
import pickle
import os

In [2]:
from IPython.display import Image

In [3]:
from evaluation import correlation, lr, perceptron, get_anova_dims, get_mi_dims
from preparation import prepare_dataset, read_datasets

In [4]:
from we import get_we, initiate_model

In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [6]:
from scipy.stats import f_oneway

In [7]:
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler, QuantileTransformer

In [8]:
import warnings
warnings.filterwarnings('ignore')

In [9]:
models = [
    {
        'name': 'flaubert/flaubert_small_cased',
        'label': 'flau_small_c'
    },
    {
    
        'name': 'flaubert/flaubert_base_uncased', 
        'label': 'flau_base_u'

    },
    {
        'name': 'flaubert/flaubert_base_cased',
        'label': 'flau_base_c'
    },
    {
        'name': 'flaubert/flaubert_large_cased',
        'label': 'flau_large_c'
    },
    {
        'name': 'camembert/camembert-base',
        'label': 'cam_base'
    },
    {
        'name': 'xlm-roberta-large',
        'label': 'xlm_large'
    },
    {
    
        'name': 'xlm-roberta-base', 
        'label': 'xlm_base'

    },
    {
        'name': 'bert-base-multilingual-uncased',
        'label': 'bert_base_u'
    },
    {
        'name': 'distilbert-base-multilingual-cased',
        'label': 'distilbert_base'
    },
    {
        'name': 'bert-base-multilingual-cased',
        'label': 'bert_base_c'
    }
    
]

In [10]:
labels = [m['label'] for m in models]

In [11]:
we_with_features = read_datasets(
                            path = '../Data',
                            model_labels = labels,
                            file_name = 'all_unique_pos_we.csv'
                    )

In [12]:
we_with_features[-1].head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,764,765,766,767,Number,Gender,Lemma,POS,Tense,Person
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2D,0.909603,0.551025,0.436937,0.789782,0.311517,0.417476,0.629341,0.913722,0.349834,0.838988,...,0.285154,0.057887,0.030874,0.310204,invariable,feminine,2D,NOUN,,
3D,0.901779,0.54988,0.427826,0.797102,0.3139,0.419226,0.623478,0.916539,0.355807,0.844714,...,0.28761,0.054251,0.018604,0.305226,invariable,feminine,3D,NOUN,,
aa,0.89957,0.555891,0.418186,0.805754,0.319232,0.412976,0.618457,0.914183,0.367882,0.845985,...,0.287883,0.057476,0.013614,0.290463,invariable,masculine,aa,NOUN,,
abandon,0.898879,0.556306,0.417821,0.805256,0.318463,0.412505,0.618276,0.914983,0.367654,0.846577,...,0.288183,0.057827,0.01405,0.290994,singular,masculine,abandon,NOUN,,
abbaye,0.895888,0.554461,0.420659,0.799339,0.322441,0.418141,0.61882,0.921033,0.367005,0.845904,...,0.293985,0.057312,0.012175,0.298473,singular,feminine,abbaye,NOUN,,


In [13]:
feature_col_count = 6

# Feature to investigate in this notebook
feature = 'POS'

# Verbs

In [14]:
pos = ['VERB']

Split each model into train and test using k_fold cross validation:

In [15]:
X_verb_train = []
y_verb_train = []

X_verb_test = []
y_verb_test = []

In [16]:
n_folds = 5

In [17]:
for we in we_with_features:
    X, y = prepare_dataset(dataset=we[(we.Gender != 'invariable') & (we.Number != 'invariable')],
                                           feature_col_count=feature_col_count,
                                           feature_name=feature,
                                           normalize=False,
                                           encode=True,
                                           encode_as1='VERB',
                                           split=False,
                                           balance=True)
    X_trains = []
    y_trains = []
    
    X_tests = []
    y_tests = []
    
    X_folds = np.array_split(X, n_folds)
    y_folds = np.array_split(y, n_folds)
    
    for i in range(n_folds):
        X_trains.append(pd.DataFrame(np.concatenate(X_folds[:i] + X_folds[i+1:])))
        y_trains.append(np.concatenate(y_folds[:i] + y_folds[i+1:]))

        X_folds[i].columns = X_folds[i].columns.map(int)
        X_tests.append(X_folds[i])
        y_tests.append(y_folds[i])
        
    
    X_verb_train.append(X_trains)
    X_verb_test.append(X_tests)
    
    y_verb_train.append(y_trains)
    y_verb_test.append(y_tests)

In [18]:
dataset_sizes = pd.DataFrame(index=labels)
dataset_sizes['1 fold size'] = [len(x[0]) for x in X_verb_test]
dataset_sizes

Unnamed: 0,1 fold size
flau_small_c,1275
flau_base_u,1518
flau_base_c,1275
flau_large_c,1275
cam_base,779
xlm_large,348
xlm_base,348
bert_base_u,492
distilbert_base,232
bert_base_c,232


## Non-independent dims

Calculate non-independendent dimensions for each of _k_ splits.

ANOVA test with p-value < 0.01:

In [19]:
anova_dims = []

In [20]:
for i in range(len(models)):
    anova_dims.append([])
    for j in range(n_folds):
        anova_dims[i].append(get_anova_dims(X_verb_train[i][j], y_verb_train[i][j]))

Get dimensions where MI > 0:

In [21]:
mi_dims = []

In [22]:
for i in range(len(models)):
    mi_dims.append([])
    for j in range(n_folds):
        mi_dims[i].append(get_mi_dims(X_verb_train[i][j], y_verb_train[i][j]))

In [23]:
for i in range(len(models)):
    for j in range(n_folds):
        temp = list(map(lambda x: int(x), mi_dims[i][j]))
        mi_dims[i][j] = temp

Get dimensions that are both found by the ANOVA independency test and MI test:

In [24]:
non_ind_dims = []

In [25]:
for i in range(len(models)):
    non_ind_dims.append([])
    for j in range(n_folds):
        non_ind_dims[i].append(list(set(anova_dims[i][j]).intersection(mi_dims[i][j])))

Stats about the number of dimensions for each model for the 1st split:

In [26]:
non_ind_df = pd.DataFrame(index=labels, columns=['ANOVA', 'MI', 'Total non independent'])

In [27]:
non_ind_df['ANOVA'] = [len(x[0]) for x in anova_dims]
non_ind_df['MI'] = [len(x[0]) for x in mi_dims]
non_ind_df['Total non independent'] = [len(x[0]) for x in non_ind_dims]

In [28]:
non_ind_df

Unnamed: 0,ANOVA,MI,Total non independent
flau_small_c,435,442,390
flau_base_u,581,766,581
flau_base_c,522,572,410
flau_large_c,817,853,721
cam_base,402,664,356
xlm_large,516,693,382
xlm_base,87,503,60
bert_base_u,467,439,270
distilbert_base,255,519,199
bert_base_c,24,677,23


In [29]:
dims = {}

In [30]:
for i in range(len(models)):
    dims[labels[i]] = {}
    dims[labels[i]]['All dims'] = [X_verb_train[i][0].columns]*n_folds
    dims[labels[i]]['ANOVA'] = anova_dims[i]
    dims[labels[i]]['MI'] = mi_dims[i]
    dims[labels[i]]['All non ind'] = non_ind_dims[i]

## Important dims

We can test different $\alpha$ values: 1%, 5%, 10%, 25%, 50%, 75%.

In [31]:
alphas = [1, 5, 10, 25, 50, 75]

Train Logistic Regression on train set for each model:

In [32]:
lr_res = []

In [33]:
for i in range(len(models)):
    lr_res.append([])
    for j in range(n_folds):
        lr_res[i].append(lr(X_verb_train[i][j], y_verb_train[i][j]))

Train Perceptron:

In [34]:
perceptron_res = []

In [35]:
for i in range(len(models)):
    perceptron_res.append([])
    for j in range(n_folds):
        perceptron_res[i].append(perceptron(X_verb_train[i][j], y_verb_train[i][j]))

Compute correlation to the gender vector:

In [36]:
corr_res = []

In [37]:
for i in range(len(models)):
    corr_res.append([])
    for j in range(n_folds):
        corr_res[i].append(correlation(X_verb_train[i][j], y_verb_train[i][j]))

In [38]:
for i in range(len(models)):
    for alpha in alphas:
        dims[labels[i]][f'LR{alpha}'] = []
        dims[labels[i]][f'Perc{alpha}'] = []
        dims[labels[i]][f'Corr{alpha}'] = []
        dims[labels[i]][f'All imp dims{alpha}'] = []
        for j in range(n_folds):
            num_imp_dims = len(X_verb_test[i][0].columns)*alpha//100
            lr_dims = [x[0] for x in lr_res[i][j][:num_imp_dims]]
            perc_dims = [x[0] for x in perceptron_res[i][j][:num_imp_dims]]
            corr_dims = [x[0] for x in corr_res[i][j][:num_imp_dims]]
            dims[labels[i]][f'LR{alpha}'].append(lr_dims)
            dims[labels[i]][f'Perc{alpha}'].append(perc_dims)
            dims[labels[i]][f'Corr{alpha}'].append(corr_dims)
            dims[labels[i]][f'All imp dims{alpha}'].append(list(set(lr_dims).intersection(perc_dims).intersection(corr_dims)))

In [39]:
dims[labels[0]]['LR1']

[[378, 192, 182, 434, 432],
 [378, 434, 56, 192, 316],
 [378, 192, 56, 434, 299],
 [434, 378, 192, 56, 310],
 [182, 378, 434, 480, 56]]

## Compute medians

In [40]:
medians = {}

In [41]:
for i in range(len(models)):
    medians[labels[i]] = {}
    for dim_group in dims[labels[i]].keys():
        medians[labels[i]][dim_group] = []
        n = len(dims[labels[i]][dim_group])
        for j in range(n_folds):
            dim_list = dims[labels[i]][dim_group][j] if n == n_folds else list(dims[labels[i]][dim_group])
            # Median of dimensions where feature vector is equal to 0
            median_0 = X_verb_train[i][j][y_verb_train[i][j] == 0][dim_list].median()
            # Median of dimensions where feature vector is equal to 1
            median_1 = X_verb_train[i][j][y_verb_train[i][j] == 1][dim_list].median()

            medians[labels[i]][dim_group].append({
                    '0': median_0,
                    '1': median_1
                })

We can compare the number of dimensions found by each test for each model in the first fold:

In [42]:
dim_lens = {}

for model in dims.keys():
    dim_lens[model] = {}
    for dim_group in dims[model].keys():
        dim_lens[model][dim_group] = len(dims[model][dim_group][0]) if dim_group != 'All dims' else \
                len(dims[model][dim_group])

In [43]:
pd.DataFrame(dim_lens)

Unnamed: 0,flau_small_c,flau_base_u,flau_base_c,flau_large_c,cam_base,xlm_large,xlm_base,bert_base_u,distilbert_base,bert_base_c
All dims,5,5,5,5,5,5,5,5,5,5
ANOVA,435,581,522,817,402,516,87,467,255,24
MI,442,766,572,853,664,693,503,439,519,677
All non ind,390,581,410,721,356,382,60,270,199,23
LR1,5,7,7,10,7,10,7,7,7,7
Perc1,5,7,7,10,7,10,7,7,7,7
Corr1,5,7,7,10,7,10,7,7,7,7
All imp dims1,0,0,0,1,0,0,1,0,0,0
LR5,25,38,38,51,38,51,38,38,38,38
Perc5,25,38,38,51,38,51,38,38,38,38


## Compute predictions

In [44]:
y_preds = {}

In [45]:
for i in range(len(models)):
    y_preds[labels[i]] = {}
    for dim_group in dims[labels[i]].keys():
        y_preds[labels[i]][dim_group] = []
        n = len(dims[labels[i]][dim_group])
        for j in range(n_folds):
            dim_list = dims[labels[i]][dim_group][j] if n == n_folds else dims[labels[i]][dim_group]
            
            # If MSE for sample 0 is lower than for sample 1, the label should be 0. So we need to convert False to 0.
            mae0 = X_verb_test[i][j][dim_list].apply(lambda x: mean_absolute_error(medians[labels[i]][dim_group][j]['0'], x), axis=1)
            mae1 = X_verb_test[i][j][dim_list].apply(lambda x: mean_absolute_error(medians[labels[i]][dim_group][j]['1'], x), axis=1)
            
            y_preds[labels[i]][dim_group].append((mae0 > mae1).apply(int))
    

## Compute accuracies

In [46]:
accs = {}

In [47]:
for i in range(len(models)):
    accs[labels[i]] = {}
    for dim_group in dims[labels[i]].keys():
        accs[labels[i]][dim_group] = []
        for j in range(n_folds):
            y_true = y_verb_test[i][j]
            y_pred = y_preds[labels[i]][dim_group][j]
            if any(y_pred):
                acc = accuracy_score(y_true, y_pred)
            else:
                acc = 0
            accs[labels[i]][dim_group].append(acc)

In [48]:
# Compute average accuracy between 5 folds
avg_accs = {}

In [49]:
for i in range(len(models)):
    avg_accs[labels[i]] = {}
    for dim_group in dims[labels[i]].keys():
        avg_accs[labels[i]][dim_group] = np.average(accs[labels[i]][dim_group])

In [50]:
accs_df = pd.DataFrame(avg_accs)
accs_df

Unnamed: 0,flau_small_c,flau_base_u,flau_base_c,flau_large_c,cam_base,xlm_large,xlm_base,bert_base_u,distilbert_base,bert_base_c
All dims,0.913397,0.602792,0.581113,0.905084,0.549825,0.587572,0.499995,0.533374,0.595029,0.596723
ANOVA,0.913554,0.624405,0.58174,0.903829,0.550082,0.591606,0.538585,0.53378,0.626097,0.597593
MI,0.913868,0.603319,0.583152,0.902103,0.549825,0.588724,0.520165,0.532967,0.603661,0.597589
All non ind,0.914025,0.624537,0.584564,0.902574,0.550338,0.589299,0.542047,0.532967,0.626101,0.596727
LR1,0.809382,0.607802,0.613428,0.778632,0.536209,0.571437,0.592754,0.534189,0.61573,0.602784
Perc1,0.840914,0.700709,0.641984,0.842171,0.568319,0.587573,0.578358,0.536219,0.623515,0.601045
Corr1,0.855191,0.694779,0.647474,0.837778,0.568827,0.589286,0.580659,0.527271,0.636453,0.600183
All imp dims1,0.303802,0.0,0.48684,0.393412,0.0,0.0,0.368542,0.210569,0.360528,0.0
LR5,0.909632,0.665789,0.636652,0.87904,0.54674,0.586414,0.565098,0.532561,0.62522,0.595007
Perc5,0.921556,0.709142,0.691089,0.920301,0.576533,0.607155,0.560497,0.532966,0.621776,0.600175


We can show what is the accuracy gain for each dimension test comparison to using all dimensions:

In [51]:
gains_df = (accs_df - accs_df.loc['All dims'])
gains_df

Unnamed: 0,flau_small_c,flau_base_u,flau_base_c,flau_large_c,cam_base,xlm_large,xlm_base,bert_base_u,distilbert_base,bert_base_c
All dims,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ANOVA,0.000157,0.021613,0.000627,-0.001255,0.000257,0.004035,0.03859,0.0004056762,0.031068,0.00087
MI,0.000471,0.000527,0.002039,-0.002981,0.0,0.001153,0.02017,-0.000407332,0.008632,0.000866
All non ind,0.000628,0.021745,0.003451,-0.00251,0.000514,0.001727,0.042052,-0.000407332,0.031072,4e-06
LR1,-0.104015,0.00501,0.032316,-0.126452,-0.013615,-0.016135,0.092759,0.000814664,0.020701,0.006061
Perc1,-0.072483,0.097917,0.060872,-0.062912,0.018494,2e-06,0.078363,0.002845528,0.028486,0.004322
Corr1,-0.058206,0.091987,0.066362,-0.067306,0.019003,0.001714,0.080663,-0.006103356,0.041424,0.003459
All imp dims1,-0.609595,-0.602792,-0.094272,-0.511672,-0.549825,-0.587572,-0.131454,-0.3228048,-0.234501,-0.596723
LR5,-0.003765,0.062997,0.055539,-0.026044,-0.003085,-0.001158,0.065103,-0.0008130081,0.030191,-0.001717
Perc5,0.008159,0.10635,0.109976,0.015217,0.026709,0.019583,0.060502,-0.0004081599,0.026747,0.003452


Here are the dimension groups and the accuracy gain this dimension group provides over all dimensions:

In [52]:
for i in range(len(models)):
    print(labels[i], gains_df.idxmax()[i], gains_df.loc[gains_df.idxmax()[i], labels[i]])

flau_small_c Perc25 0.0219660787391881
flau_base_u Perc10 0.11504946573875519
flau_base_c Perc5 0.10997592883307161
flau_large_c Perc25 0.029495244251546815
cam_base Perc5 0.026708818569717185
xlm_large Perc5 0.019583291927523283
xlm_base LR1 0.09275895193613576
bert_base_u Perc1 0.0028455284552846294
distilbert_base Corr5 0.04404015524705185
bert_base_c LR1 0.006060606060605989


In [53]:
for i in range(len(models)):
    print(labels[i], accs_df.idxmax()[i], accs_df.loc[accs_df.idxmax()[i], labels[i]])

flau_small_c Perc25 0.9353630683042449
flau_base_u Perc10 0.7178413639707383
flau_base_c Perc5 0.69108862006341
flau_large_c Perc25 0.9345788777049281
cam_base Perc5 0.5765334239731248
xlm_large Perc5 0.6071549239789327
xlm_base LR1 0.5927539832389281
bert_base_u Perc1 0.53621942940407
distilbert_base Corr5 0.6390692640692641
bert_base_c LR1 0.6027839976115839


For each fold find dimensions with the highest accuracy, and pick dimensions appeared in all 5 folds:

In [54]:
best_dims = {}

In [55]:
dim_cand_accs = {}

In [56]:
for i in range(len(labels)):
    bd = []
    for j in range(n_folds):
        best_dim_group = ''
        best_acc = 0
        for dim_group in accs[labels[i]].keys():
            if accs[labels[i]][dim_group][j] > best_acc:
                best_dim_group = dim_group
                best_acc = accs[labels[i]][dim_group][j]
        bd.extend(dims[labels[i]][best_dim_group][j])
    
    # Find dimensions that appeared within dimensions with the highest accuracy in all 5 folds
    d, c = np.unique(bd, return_counts=True)
    dim_cand = [x[0] for x in zip(d, c) if x[1] >= 5]
    best_dims[labels[i]] = dim_cand
    print(f'{labels[i]}: {len(dim_cand)} dimensions repeated in 5 folds')

    # For each fold build a prediction and calculate accuracy
    cand_accs = []
    for j in range(n_folds):
        med0_cand = X_verb_train[i][j][y_verb_train[i][j] == 0][dim_cand].median()
        med1_cand = X_verb_train[i][j][y_verb_train[i][j] == 1][dim_cand].median()
        
        mae0_cand = X_verb_test[i][j][dim_cand].apply(lambda x: mean_absolute_error(med0_cand, x), axis=1)
        mae1_cand = X_verb_test[i][j][dim_cand].apply(lambda x: mean_absolute_error(med1_cand, x), axis=1)
        
        y_pred_cand = (mae0_cand > mae1_cand).apply(int)
        if any(y_pred_cand):
            cand_accs.append(accuracy_score(y_true=y_verb_test[i][j], y_pred=y_pred_cand))
        else:
            cand_accs.append(0)
    dim_cand_accs[labels[i]] = cand_accs
    print(f'Average accuracy: {np.average(cand_accs)}')

flau_small_c: 24 dimensions repeated in 5 folds
Average accuracy: 0.919673223135408
flau_base_u: 49 dimensions repeated in 5 folds
Average accuracy: 0.7287812347197289
flau_base_c: 22 dimensions repeated in 5 folds
Average accuracy: 0.7017575030012004
flau_large_c: 73 dimensions repeated in 5 folds
Average accuracy: 0.9317552251669898
cam_base: 0 dimensions repeated in 5 folds
Average accuracy: 0.0
xlm_large: 1 dimensions repeated in 5 folds
Average accuracy: 0.5950462088840306
xlm_base: 1 dimensions repeated in 5 folds
Average accuracy: 0.6175328762131903
bert_base_u: 0 dimensions repeated in 5 folds
Average accuracy: 0.0
distilbert_base: 5 dimensions repeated in 5 folds
Average accuracy: 0.6451074787281684
bert_base_c: 0 dimensions repeated in 5 folds
Average accuracy: 0.0


In [57]:
with open('../Data/best_results/pos_verb_dims.pickle', 'wb') as f:
    pickle.dump(dims, f)

In [58]:
with open('../Data/best_results/pos_verb_accs.pickle', 'wb') as f:
    pickle.dump(accs, f)

In [59]:
with open('../Data/best_results/pos_verb_medians.pickle', 'wb') as f:
    pickle.dump(medians, f)

# Conclusion