In [1]:
import sys
sys.path.append('../Util')
import pickle
import os

In [2]:
from IPython.display import Image

In [3]:
from evaluation import correlation, lr, perceptron, get_anova_dims, get_mi_dims
from preparation import prepare_dataset, read_datasets

In [4]:
from we import get_we, initiate_model

In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [6]:
from scipy.stats import f_oneway

In [7]:
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler, QuantileTransformer

In [8]:
import warnings
warnings.filterwarnings('ignore')

In [9]:
models = [
    {
        'name': 'flaubert/flaubert_small_cased',
        'label': 'flau_small_c'
    },
    {
    
        'name': 'flaubert/flaubert_base_uncased', 
        'label': 'flau_base_u'

    },
    {
        'name': 'flaubert/flaubert_base_cased',
        'label': 'flau_base_c'
    },
    {
        'name': 'flaubert/flaubert_large_cased',
        'label': 'flau_large_c'
    },
    {
        'name': 'camembert/camembert-base',
        'label': 'cam_base'
    },
    {
        'name': 'xlm-roberta-large',
        'label': 'xlm_large'
    },
    {
    
        'name': 'xlm-roberta-base', 
        'label': 'xlm_base'

    },
    {
        'name': 'bert-base-multilingual-uncased',
        'label': 'bert_base_u'
    },
    {
        'name': 'distilbert-base-multilingual-cased',
        'label': 'distilbert_base'
    },
    {
        'name': 'bert-base-multilingual-cased',
        'label': 'bert_base_c'
    }
    
]

In [10]:
labels = [m['label'] for m in models]

In [11]:
we_with_features = read_datasets(
                            path = '../Data',
                            model_labels = labels,
                            file_name = 'all_unique_pos_we.csv'
                    )

In [12]:
we_with_features[-1].head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,764,765,766,767,Number,Gender,Lemma,POS,Tense,Person
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2D,0.909603,0.551025,0.436937,0.789782,0.311517,0.417476,0.629341,0.913722,0.349834,0.838988,...,0.285154,0.057887,0.030874,0.310204,invariable,feminine,2D,NOUN,,
3D,0.901779,0.54988,0.427826,0.797102,0.3139,0.419226,0.623478,0.916539,0.355807,0.844714,...,0.28761,0.054251,0.018604,0.305226,invariable,feminine,3D,NOUN,,
aa,0.89957,0.555891,0.418186,0.805754,0.319232,0.412976,0.618457,0.914183,0.367882,0.845985,...,0.287883,0.057476,0.013614,0.290463,invariable,masculine,aa,NOUN,,
abandon,0.898879,0.556306,0.417821,0.805256,0.318463,0.412505,0.618276,0.914983,0.367654,0.846577,...,0.288183,0.057827,0.01405,0.290994,singular,masculine,abandon,NOUN,,
abbaye,0.895888,0.554461,0.420659,0.799339,0.322441,0.418141,0.61882,0.921033,0.367005,0.845904,...,0.293985,0.057312,0.012175,0.298473,singular,feminine,abbaye,NOUN,,


In [13]:
feature_col_count = 6

# Feature to investigate in this notebook
feature = 'POS'

In this notebook we will calculate **InfEnc** metric for word embeddings of different models, in order to compare the quality of information encoding about `Noun-ness`.

For this we will pick concatenate dimensions of word embedding of nouns, adjectives and verbs. Note that words that are both adjectives and nouns were discarded previously.
Nouns will be assigned 1 in the feature vector, while verbs and adjectives will be assigned 0.

# Nouns

In [14]:
pos = ['NOUN']

Split each model into train and test using k_fold cross validation:

In [15]:
X_noun_train = []
y_noun_train = []

X_noun_test = []
y_noun_test = []

In [16]:
n_folds = 5

In [17]:
for we in we_with_features:
    X, y = prepare_dataset(dataset=we[(we.Gender != 'invariable') & (we.Number != 'invariable')],
                                           feature_col_count=feature_col_count,
                                           feature_name=feature,
                                           normalize=False,
                                           encode=True,
                                           encode_as1='NOUN',
                                           split=False,
                                           balance=True)
    X_trains = []
    y_trains = []
    
    X_tests = []
    y_tests = []
    
    X_folds = np.array_split(X, n_folds)
    y_folds = np.array_split(y, n_folds)
    
    for i in range(n_folds):
        X_trains.append(pd.DataFrame(np.concatenate(X_folds[:i] + X_folds[i+1:])))
        y_trains.append(np.concatenate(y_folds[:i] + y_folds[i+1:]))

        X_folds[i].columns = X_folds[i].columns.map(int)
        X_tests.append(X_folds[i])
        y_tests.append(y_folds[i])
        
    
    X_noun_train.append(X_trains)
    X_noun_test.append(X_tests)
    
    y_noun_train.append(y_trains)
    y_noun_test.append(y_tests)

In [18]:
dataset_sizes = pd.DataFrame(index=labels)
dataset_sizes['1 fold size'] = [len(x[0]) for x in X_noun_test]
dataset_sizes

Unnamed: 0,1 fold size
flau_small_c,2028
flau_base_u,2414
flau_base_c,2028
flau_large_c,2028
cam_base,1196
xlm_large,429
xlm_base,429
bert_base_u,658
distilbert_base,337
bert_base_c,337


## Non-independent dims

Calculate non-independendent dimensions for each of _k_ splits.

ANOVA test with p-value < 0.01:

In [19]:
anova_dims = []

In [20]:
for i in range(len(models)):
    anova_dims.append([])
    for j in range(n_folds):
        anova_dims[i].append(get_anova_dims(X_noun_train[i][j], y_noun_train[i][j]))

Get dimensions where MI > 0:

In [21]:
mi_dims = []

In [22]:
for i in range(len(models)):
    mi_dims.append([])
    for j in range(n_folds):
        mi_dims[i].append(get_mi_dims(X_noun_train[i][j], y_noun_train[i][j]))

In [23]:
for i in range(len(models)):
    for j in range(n_folds):
        temp = list(map(lambda x: int(x), mi_dims[i][j]))
        mi_dims[i][j] = temp

Get dimensions that are both found by the ANOVA independency test and MI test:

In [24]:
non_ind_dims = []

In [25]:
for i in range(len(models)):
    non_ind_dims.append([])
    for j in range(n_folds):
        non_ind_dims[i].append(list(set(anova_dims[i][j]).intersection(mi_dims[i][j])))

Stats about the number of dimensions for each model for the 1st split:

In [26]:
non_ind_df = pd.DataFrame(index=labels, columns=['ANOVA', 'MI', 'Total non independent'])

In [27]:
non_ind_df['ANOVA'] = [len(x[0]) for x in anova_dims]
non_ind_df['MI'] = [len(x[0]) for x in mi_dims]
non_ind_df['Total non independent'] = [len(x[0]) for x in non_ind_dims]

In [28]:
non_ind_df

Unnamed: 0,ANOVA,MI,Total non independent
flau_small_c,438,432,387
flau_base_u,587,768,587
flau_base_c,555,557,431
flau_large_c,854,886,762
cam_base,197,471,129
xlm_large,378,633,263
xlm_base,286,477,202
bert_base_u,310,311,133
distilbert_base,329,455,219
bert_base_c,10,551,6


In [29]:
dims = {}

In [30]:
for i in range(len(models)):
    dims[labels[i]] = {}
    dims[labels[i]]['All dims'] = [X_noun_train[i][0].columns]*n_folds
    dims[labels[i]]['ANOVA'] = anova_dims[i]
    dims[labels[i]]['MI'] = mi_dims[i]
    dims[labels[i]]['All non ind'] = non_ind_dims[i]

## Important dims

We can test different $\alpha$ values: 1%, 5%, 10%, 25%, 50%, 75%.

In [31]:
alphas = [1, 5, 10, 25, 50, 75]

Train Logistic Regression on train set for each model:

In [32]:
lr_res = []

In [33]:
for i in range(len(models)):
    lr_res.append([])
    for j in range(n_folds):
        lr_res[i].append(lr(X_noun_train[i][j], y_noun_train[i][j]))

Train Perceptron:

In [34]:
perceptron_res = []

In [35]:
for i in range(len(models)):
    perceptron_res.append([])
    for j in range(n_folds):
        perceptron_res[i].append(perceptron(X_noun_train[i][j], y_noun_train[i][j]))

Compute correlation to the gender vector:

In [36]:
corr_res = []

In [37]:
for i in range(len(models)):
    corr_res.append([])
    for j in range(n_folds):
        corr_res[i].append(correlation(X_noun_train[i][j], y_noun_train[i][j]))

In [38]:
for i in range(len(models)):
    for alpha in alphas:
        dims[labels[i]][f'LR{alpha}'] = []
        dims[labels[i]][f'Perc{alpha}'] = []
        dims[labels[i]][f'Corr{alpha}'] = []
        dims[labels[i]][f'All imp dims{alpha}'] = []
        for j in range(n_folds):
            num_imp_dims = len(X_noun_test[i][0].columns)*alpha//100
            lr_dims = [x[0] for x in lr_res[i][j][:num_imp_dims]]
            perc_dims = [x[0] for x in perceptron_res[i][j][:num_imp_dims]]
            corr_dims = [x[0] for x in corr_res[i][j][:num_imp_dims]]
            dims[labels[i]][f'LR{alpha}'].append(lr_dims)
            dims[labels[i]][f'Perc{alpha}'].append(perc_dims)
            dims[labels[i]][f'Corr{alpha}'].append(corr_dims)
            dims[labels[i]][f'All imp dims{alpha}'].append(list(set(lr_dims).intersection(perc_dims).intersection(corr_dims)))

## Compute medians

For each of the dimension groups above, compute a median vector: the one that should describe the expected values of the dimensions the best.
We compute median vector associated with feature vector values = 0 and another one for feature vector values = 1.

In [40]:
medians = {}

In [41]:
for i in range(len(models)):
    medians[labels[i]] = {}
    for dim_group in dims[labels[i]].keys():
        medians[labels[i]][dim_group] = []
        n = len(dims[labels[i]][dim_group])
        for j in range(n_folds):
            dim_list = dims[labels[i]][dim_group][j] if n == n_folds else list(dims[labels[i]][dim_group])
            # Median of dimensions where feature vector is equal to 0
            median_0 = X_noun_train[i][j][y_noun_train[i][j] == 0][dim_list].median()
            # Median of dimensions where feature vector is equal to 1
            median_1 = X_noun_train[i][j][y_noun_train[i][j] == 1][dim_list].median()

            medians[labels[i]][dim_group].append({
                    '0': median_0,
                    '1': median_1
                })

We can compare the number of dimensions found by each test for each model in the first fold:

In [42]:
dim_lens = {}

for model in dims.keys():
    dim_lens[model] = {}
    for dim_group in dims[model].keys():
        dim_lens[model][dim_group] = len(dims[model][dim_group][0]) if dim_group != 'All dims' else \
                len(dims[model][dim_group])

In [43]:
pd.DataFrame(dim_lens)

Unnamed: 0,flau_small_c,flau_base_u,flau_base_c,flau_large_c,cam_base,xlm_large,xlm_base,bert_base_u,distilbert_base,bert_base_c
All dims,5,5,5,5,5,5,5,5,5,5
ANOVA,438,587,555,854,197,378,286,310,329,10
MI,432,768,557,886,471,633,477,311,455,551
All non ind,387,587,431,762,129,263,202,133,219,6
LR1,5,7,7,10,7,10,7,7,7,7
Perc1,5,7,7,10,7,10,7,7,7,7
Corr1,5,7,7,10,7,10,7,7,7,7
All imp dims1,0,2,0,0,0,0,0,1,0,0
LR5,25,38,38,51,38,51,38,38,38,38
Perc5,25,38,38,51,38,51,38,38,38,38


## Compute predictions

Using the median vectors computed for each group above, try to predict the value of the feature vector on test.
For this, for any word embedding in test compute distances to median for feature vector values = 0 and for feature vector values = 1. 
If the given word embedding is closer to median of feature vector values = 0, assign 0 as predicted label, 1 otherwise.

In [44]:
y_preds = {}

In [45]:
for i in range(len(models)):
    y_preds[labels[i]] = {}
    for dim_group in dims[labels[i]].keys():
        y_preds[labels[i]][dim_group] = []
        n = len(dims[labels[i]][dim_group])
        for j in range(n_folds):
            dim_list = dims[labels[i]][dim_group][j] if n == n_folds else dims[labels[i]][dim_group]
            
            # If MSE for sample 0 is lower than for sample 1, the label should be 0. So we need to convert False to 0.
            mae0 = X_noun_test[i][j][dim_list].apply(lambda x: mean_absolute_error(medians[labels[i]][dim_group][j]['0'], x), axis=1)
            mae1 = X_noun_test[i][j][dim_list].apply(lambda x: mean_absolute_error(medians[labels[i]][dim_group][j]['1'], x), axis=1)
            
            y_preds[labels[i]][dim_group].append((mae0 > mae1).apply(int))
    

## Compute accuracies

Using the predictions above, compute the accuracies for each dimension group.

In [46]:
accs = {}

In [47]:
for i in range(len(models)):
    accs[labels[i]] = {}
    for dim_group in dims[labels[i]].keys():
        accs[labels[i]][dim_group] = []
        for j in range(n_folds):
            y_true = y_noun_test[i][j]
            y_pred = y_preds[labels[i]][dim_group][j]
            if any(y_pred):
                acc = accuracy_score(y_true, y_pred)
            else:
                acc = 0
            accs[labels[i]][dim_group].append(acc)

In [48]:
# Compute average accuracy between 5 folds
avg_accs = {}

In [49]:
for i in range(len(models)):
    avg_accs[labels[i]] = {}
    for dim_group in dims[labels[i]].keys():
        avg_accs[labels[i]][dim_group] = np.average(accs[labels[i]][dim_group])

In [50]:
accs_df = pd.DataFrame(avg_accs)
accs_df

Unnamed: 0,flau_small_c,flau_base_u,flau_base_c,flau_large_c,cam_base,xlm_large,xlm_base,bert_base_u,distilbert_base,bert_base_c
All dims,0.862004,0.593869,0.569343,0.868317,0.515217,0.566752,0.529872,0.532559,0.605105,0.559397
ANOVA,0.862695,0.607291,0.569935,0.867824,0.521739,0.567219,0.533136,0.53256,0.606295,0.559397
MI,0.861117,0.593869,0.572993,0.868317,0.51689,0.569553,0.52894,0.532255,0.604508,0.558208
All non ind,0.861511,0.607291,0.57388,0.867626,0.526589,0.56862,0.534069,0.53256,0.608674,0.559398
LR1,0.625272,0.610853,0.560764,0.678148,0.528094,0.552741,0.569568,0.529516,0.585525,0.554645
Perc1,0.787533,0.652361,0.611069,0.793153,0.523077,0.560204,0.585449,0.531644,0.622331,0.559395
Corr1,0.782993,0.661723,0.620735,0.815151,0.530602,0.569576,0.579366,0.532863,0.610453,0.561771
All imp dims1,0.0,0.49536,0.0,0.0,0.103177,0.0,0.338052,0.32221,0.0,0.0
LR5,0.814263,0.61657,0.583645,0.841389,0.52408,0.570489,0.55649,0.531951,0.617596,0.556429
Perc5,0.866641,0.689064,0.63987,0.877687,0.542475,0.578889,0.565356,0.532256,0.627088,0.559992


We can show what is the accuracy gain for each dimension test comparison to using all dimensions:

In [51]:
gains_df = (accs_df - accs_df.loc['All dims'])
gains_df

Unnamed: 0,flau_small_c,flau_base_u,flau_base_c,flau_large_c,cam_base,xlm_large,xlm_base,bert_base_u,distilbert_base,bert_base_c
All dims,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ANOVA,0.000691,0.013422,0.000592,-0.000493048,0.006521739,0.000467,0.003264,9.252705e-07,0.00119,1.110223e-16
MI,-0.000887,0.0,0.003649,-9.73057e-08,0.001672241,0.002802,-0.000931,-0.000304414,-0.000597,-0.00118871
All non ind,-0.000493,0.013422,0.004537,-0.0006905786,0.01137124,0.001868,0.004197,4.626353e-07,0.00357,1.766285e-06
LR1,-0.236732,0.016984,-0.00858,-0.1901682,0.01287625,-0.014011,0.039697,-0.003043677,-0.019579,-0.004751307
Perc1,-0.074472,0.058492,0.041726,-0.07516379,0.007859532,-0.006548,0.055577,-0.0009155552,0.017227,-1.766285e-06
Corr1,-0.079012,0.067854,0.051391,-0.05316535,0.01538462,0.002824,0.049495,0.0003034887,0.005348,0.002373887
All imp dims1,-0.862004,-0.098509,-0.569343,-0.8683167,-0.4120401,-0.566752,-0.19182,-0.2103492,-0.605105,-0.5593966
LR5,-0.047742,0.022701,0.014302,-0.02692736,0.008862876,0.003737,0.026618,-0.0006079027,0.012491,-0.002967359
Perc5,0.004636,0.095195,0.070526,0.009370101,0.02725753,0.012138,0.035485,-0.0003030261,0.021983,0.0005952381


Here are the dimension groups and the accuracy gain this dimension group provides over all dimensions:

In [52]:
for i in range(len(models)):
    print(labels[i], gains_df.idxmax()[i], gains_df.loc[gains_df.idxmax()[i], labels[i]])

flau_small_c Perc25 0.031267630576954586
flau_base_u Perc5 0.09519469759734889
flau_base_c Perc5 0.07052629735260374
flau_large_c Perc25 0.03255060626317885
cam_base Perc10 0.028595317725752523
xlm_large Perc5 0.012137550922597717
xlm_base Perc1 0.05557697753959445
bert_base_u Perc10 0.00030533927357001556
distilbert_base Perc10 0.029707149922283405
bert_base_c All imp dims5 0.008324501907588044


Below, you can find the best achieved accuracy and the dimension group that produced it. This value is `InfEnc`.

In [53]:
for i in range(len(models)):
    print(labels[i], accs_df.idxmax()[i], accs_df.loc[accs_df.idxmax()[i], labels[i]])

flau_small_c Perc25 0.8932717972071316
flau_base_u Perc5 0.6890637945318974
flau_base_c Perc5 0.6398696492810568
flau_large_c Perc25 0.9008672857255455
cam_base Perc10 0.5438127090301004
xlm_large Perc5 0.5788891793564691
xlm_base Perc1 0.5854486634860467
bert_base_u Perc10 0.5328646838119294
distilbert_base Perc10 0.6348117140031087
bert_base_c All imp dims5 0.5677211389006642


In [57]:
with open('../Data/best_results/pos_noun_dims.pickle', 'wb') as f:
    pickle.dump(dims, f)

In [58]:
with open('../Data/best_results/pos_noun_accs.pickle', 'wb') as f:
    pickle.dump(accs, f)

In [59]:
with open('../Data/best_results/pos_noun_medians.pickle', 'wb') as f:
    pickle.dump(medians, f)

# Conclusion

- Unlike for gender and number information, FlauBERT base uncased performs better than the cased model in InfEnc metric for noun-ness
- Similarly, to gender and noun FlauBERT small perfrorms comparably to FlauBERT large despite being 10x smaller