In [1]:
import sys
sys.path.append('../Util')

In [2]:
from evaluation import aabcc, sig_props, correlation, lr, perceptron, kmeans_1dim, \
                     score_comparison, run_tests, report, dimensions_report, repeated_dimensions, \
                    kmeans_multi_dim
from preparation import prepare_dataset, read_datasets

In [3]:
from scipy.stats import f_oneway

In [4]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import mean_absolute_error, accuracy_score

In [5]:
import pandas as pd
import numpy as np
from scipy.stats import shapiro
import seaborn as sns
import matplotlib.pyplot as plt

In [6]:
import warnings
warnings.filterwarnings('ignore')

Due to big size of WE files they are not uploaded to Github, but can instead be downloaded [here](https://drive.google.com/drive/folders/10Ea62GRlq4t7bq-nK9tPtYFu0kbCciey?usp=sharing).

The code below expects a folder "Data" in the root folder containing all the information from the Google Drive.

In [8]:
models = [
    {
        'name': 'flaubert/flaubert_small_cased',
        'label': 'flau_small_c'
    },
    {
    
        'name': 'flaubert/flaubert_base_uncased', 
        'label': 'flau_base_u'

    },
    {
        'name': 'flaubert/flaubert_base_cased',
        'label': 'flau_base_c'
    },
    {
        'name': 'flaubert/flaubert_large_cased',
        'label': 'flau_large_c'
    },
    {
        'name': 'camembert/camembert-base',
        'label': 'cam_base'
    },
    {
        'name': 'xlm-roberta-large',
        'label': 'xlm_large'
    },
    {
    
        'name': 'xlm-roberta-base', 
        'label': 'xlm_base'

    },
    {
        'name': 'bert-base-multilingual-uncased',
        'label': 'bert_base_u'
    },
    {
        'name': 'distilbert-base-multilingual-cased',
        'label': 'distilbert_base'
    },
    {
        'name': 'bert-base-multilingual-cased',
        'label': 'bert_base_c'
    }
    
]

In [9]:
labels = [m['label'] for m in models]

In [10]:
we_with_features = read_datasets(
                            path = '../Data',
                            model_labels = labels,
                            file_name = 'all_unique_pos_we.csv'
                    )

In [11]:
we_with_features[0].head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,508,509,510,511,Number,Gender,Lemma,POS,Tense,Person
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2D,5.531172,-8.963815,1.55832,3.14355,-5.372142,-0.174002,-1.124767,5.729996,-2.367389,4.247167,...,-2.374313,-7.161043,2.704918,-4.613951,invariable,feminine,2D,NOUN,,
3D,3.969331,-6.442656,1.451928,3.447791,-4.224664,-1.029557,-3.664733,4.911453,0.223902,5.621365,...,-1.463473,-10.008975,2.00587,-2.951385,invariable,feminine,3D,NOUN,,
aa,5.617864,-6.741737,2.519838,-3.914263,2.801907,-1.182259,4.43567,-1.600746,-0.582458,1.409745,...,-0.108588,0.809533,-10.274058,2.984729,invariable,masculine,aa,NOUN,,
aba,4.914313,-6.923126,-3.848757,5.110574,-2.516107,-4.938292,2.373581,-2.75659,2.567556,2.412183,...,-0.979258,-2.605051,-7.204095,-4.154819,singular,masculine,aba,NOUN,,
abaisse,4.652038,-4.028066,0.8832,4.782077,-2.294614,-3.894452,-0.810279,-0.713935,4.81991,4.09015,...,-7.542219,-4.662947,-0.546076,-1.836028,singular,feminine,abaisse,NOUN,,


In [12]:
# There are 6 extra feautres in addition to embedding dimensions in the file: number, gender, lemma, pos, tense,
# person
feature_col_count = 6

# Feature to investigate in this notebook
feature = 'Number'

# Methodology

1. Exclude datapoints with undefined feature values (e.g. Gender = `invariable`)
2. Encode a grammatical feature as binary (e.g. Gender = 0 if masculine and 1 if feminine) (feature vector)
3. Shuffle the data set
4. Separate the dataset into 80% "training" and 20% test data
5. For each dimension in the test dataset measure if the dimension values are dependent on the grammatic feature
* Using [ANOVA](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.f_oneway.html): The data is split into 2 samples, all dimension values when the grammatical feature is 0 and when it's equal to 1.
* Using [Mutual Information](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.mutual_info_classif.html)
6. For each dimension highlighted during step #5 find medians for 2 subgroups: when the grammatical feature == 0 and when it == 1.
7. For each word in the test dataset, find the predicted label using MSE using medians of the dimensions from #6.
8. Compute accuracy on the training dataset.

We assume that achieved accuracies can be an efficient way of comparing the quality of grammatical information encoding in the word embeddings.

The experiments for `Gender` and `Number` will be performed for "nouns only", "adjectives only" and "nouns and adjectives" combined.

# Nouns

In [13]:
# We start the experiment with nouns only
pos = ['NOUN']

In [14]:
X_noun_train = []
y_noun_train = []

X_noun_test = []
y_noun_test = []

In [15]:
for we in we_with_features:
    xtr, xtst, ytr, ytst = prepare_dataset(dataset=we[(we.Number != 'invariable') & (we.POS.isin(pos))],
                                          feature_col_count=feature_col_count,
                                          feature_name=feature,
                                          encode=True,
                                          split=True)
    X_noun_train.append(xtr)
    X_noun_test.append(xtst)
    
    y_noun_train.append(ytr)
    y_noun_test.append(ytst)

### Compute ANOVA

We split each dimension into 2 samples: feminine nouns and masculine nouns. ANOVA test is used to assess if the population means are the same.

If the population means are not the same, we can make an assumption that the gender information affects the distribution of values in the given dimension.

If p-value < 0.001, reject the null hypothesis.

In [16]:
pv_threshold = 0.01

In [17]:
anova_dims = []

In [18]:
for i in range(len(models)):
    model_dims = []
    for dim in X_noun_train[i].columns:
        sample1 = [x[0] for x in zip(X_noun_train[i][dim], y_noun_train[i]) if x[1] == 0]
        sample2 = [x[0] for x in zip(X_noun_train[i][dim], y_noun_train[i]) if x[1] == 1]
        if f_oneway(sample1, sample2).pvalue < pv_threshold:
            model_dims.append(dim)
    anova_dims.append(model_dims)

We can see that a very large amount of dimensions appear to be highlighted by the test:

In [19]:
for i in range(len(models)):
    print(f'{models[i]["label"]}:\nTotal dimensions {len(X_noun_train[i].columns)}\nANOVA dimensions: {len(anova_dims[i])}\n')

flau_small_c:
Total dimensions 512
ANOVA dimensions: 389

flau_base_u:
Total dimensions 768
ANOVA dimensions: 347

flau_base_c:
Total dimensions 768
ANOVA dimensions: 495

flau_large_c:
Total dimensions 1024
ANOVA dimensions: 860

cam_base:
Total dimensions 768
ANOVA dimensions: 15

xlm_large:
Total dimensions 1024
ANOVA dimensions: 255

xlm_base:
Total dimensions 768
ANOVA dimensions: 131

bert_base_u:
Total dimensions 768
ANOVA dimensions: 634

distilbert_base:
Total dimensions 768
ANOVA dimensions: 329

bert_base_c:
Total dimensions 768
ANOVA dimensions: 0



### Compute Mutual Information

If mutual information is 0, we can consider that a given dimension is independent from Number information.

If MI > 0, we can't consider the dimension completely independent and it could encode the Number information.

In [21]:
mi_dims = []

In [22]:
for i in range(len(models)):
    res = mutual_info_classif(X_noun_train[i], y_noun_train[i], discrete_features=[False]*len(X_noun_train[i].columns))
    non_indep_dims = [str(x[0]) for x in np.argwhere(res > 0)]
    mi_dims.append(non_indep_dims)

Overall, threshold of 0 finds much more dimensions. This could be potentially addressed with a different threshold.

In [23]:
for i in range(len(models)):
    print(f"""{models[i]['label']}:
    Total dimensions {len(X_noun_train[i].columns)}
    ANOVA dimensions: {len(anova_dims[i])}
    Mutual Information dimension: {len(mi_dims[i])}\n\n""")

flau_small_c:
    Total dimensions 512
    ANOVA dimensions: 389
    Mutual Information dimension: 382


flau_base_u:
    Total dimensions 768
    ANOVA dimensions: 347
    Mutual Information dimension: 474


flau_base_c:
    Total dimensions 768
    ANOVA dimensions: 495
    Mutual Information dimension: 531


flau_large_c:
    Total dimensions 1024
    ANOVA dimensions: 860
    Mutual Information dimension: 798


cam_base:
    Total dimensions 768
    ANOVA dimensions: 15
    Mutual Information dimension: 385


xlm_large:
    Total dimensions 1024
    ANOVA dimensions: 255
    Mutual Information dimension: 626


xlm_base:
    Total dimensions 768
    ANOVA dimensions: 131
    Mutual Information dimension: 393


bert_base_u:
    Total dimensions 768
    ANOVA dimensions: 634
    Mutual Information dimension: 723


distilbert_base:
    Total dimensions 768
    ANOVA dimensions: 329
    Mutual Information dimension: 541


bert_base_c:
    Total dimensions 768
    ANOVA dimensions: 0
   

For now, for each model we select only dimensions that are potentially dependent on the gender information and found by the both tests.

In [28]:
mi_annova_dims = [set(anova_dims[i]).intersection(mi_dims[i]) for i in range(len(models))]

Final number of dimensions that we can consider not independent from the gender information for each model:

In [29]:
for i in range(len(models)):
    print(f'{models[i]["label"]}: {len(mi_annova_dims[i])}')

flau_small_c: 303
flau_base_u: 170
flau_base_c: 340
flau_large_c: 695
cam_base: 1


Note that for FlauBERT large the number of highlighted dimensions is more than half of them.

### Compute medians

In [30]:
medians_mi = []
medians_annova = []
m

In [31]:
for i in range(len(models)):
    medians_df = pd.DataFrame(columns=['median_0', 'median_1'])
    medians_df['median_0'] = X_noun_train[i][y_noun_train[i] == 0][list(mi_annova_dims[i])].median()
    medians_df['median_1'] = X_noun_train[i][y_noun_train[i] == 1][list(mi_annova_dims[i])].median()
    medians.append(medians_df)

In [32]:
medians[1]

Unnamed: 0,median_0,median_1
151,0.456081,0.465711
193,0.493716,0.487428
247,0.489034,0.472311
275,0.548405,0.532761
276,0.561518,0.548537
...,...,...
97,0.552575,0.536434
637,0.526278,0.508561
700,0.489403,0.493247
535,0.490570,0.494852


### Predict label for test set using MSE

In [33]:
y_preds = []

In [34]:
for i in range(len(models)):
    mse0 = X_noun_test[i][list(mi_annova_dims[i])].apply(lambda x: mean_absolute_error(medians[i]['median_0'], x), axis=1)
    mse1 = X_noun_test[i][list(mi_annova_dims[i])].apply(lambda x: mean_absolute_error(medians[i]['median_1'], x), axis=1)
    # If MSE for sample 0 is lower than for sample 1, the label should be 0. So we need to convert False to 0.
    y_pred = (mse0 > mse1).apply(int)
    y_preds.append(y_pred)

### Compute accuracy

In [35]:
for i in range(len(models)):
    print(f'{models[i]["label"]}: Accuracy {accuracy_score(y_noun_test[i], y_preds[i])}')

flau_small_c: Accuracy 0.9165680473372781
flau_base_u: Accuracy 0.6323040380047505
flau_base_c: Accuracy 0.6408284023668639
flau_large_c: Accuracy 0.936094674556213
cam_base: Accuracy 0.5414462081128748


# Adjectives

Repeat all steps but for adjectives only.

In [36]:
pos = ['ADJ']

In [37]:
X_adj_train = []
y_adj_train = []

X_adj_test = []
y_adj_test = []

In [38]:
for we in we_with_features:
    xtr, xtst, ytr, ytst = prepare_dataset(dataset=we[(we.Gender != 'invariable') & (we.POS.isin(pos))],
                                          feature_col_count=feature_col_count,
                                          feature_name=feature,
                                          encode=True,
                                          split=True)
    X_adj_train.append(xtr)
    X_adj_test.append(xtst)
    
    y_adj_train.append(ytr)
    y_adj_test.append(ytst)

In [39]:
anova_dims_adj = []

In [40]:
for i in range(len(models)):
    model_dims = []
    for dim in X_adj_train[i].columns:
        sample1 = [x[0] for x in zip(X_adj_train[i][dim], y_adj_train[i]) if x[1] == 0]
        sample2 = [x[0] for x in zip(X_adj_train[i][dim], y_adj_train[i]) if x[1] == 1]
        if f_oneway(sample1, sample2).pvalue < pv_threshold:
            model_dims.append(dim)
    anova_dims_adj.append(model_dims)

In [41]:
mi_dims_adj = []

In [42]:
for i in range(len(models)):
    res = mutual_info_classif(X_adj_train[i], y_adj_train[i], discrete_features=[False]*len(X_adj_train[i].columns))
    non_indep_dims = [str(x[0]) for x in np.argwhere(res > 0)]
    mi_dims_adj.append(non_indep_dims)

In [43]:
mi_annova_dims_adj = [set(anova_dims_adj[i]).intersection(mi_dims_adj[i]) for i in range(len(models))]

In [44]:
medians_adj = []

In [45]:
for i in range(len(models)):
    medians_df = pd.DataFrame(columns=['median_0', 'median_1'])
    medians_df['median_0'] = X_adj_train[i][y_adj_train[i] == 0][list(mi_annova_dims_adj[i])].median()
    medians_df['median_1'] = X_adj_train[i][y_adj_train[i] == 1][list(mi_annova_dims_adj[i])].median()
    medians_adj.append(medians_df)

In [46]:
y_preds_adj = []

In [47]:
for i in range(len(models)):
    mse0 = X_adj_test[i][list(mi_annova_dims_adj[i])].apply(lambda x: mean_absolute_error(medians_adj[i]['median_0'], x), axis=1)
    mse1 = X_adj_test[i][list(mi_annova_dims_adj[i])].apply(lambda x: mean_absolute_error(medians_adj[i]['median_1'], x), axis=1)
    # If MSE for sample 0 is lower than for sample 1, the label should be 0. So we need to convert False to 0.
    y_pred = (mse0 > mse1).apply(int)
    y_preds_adj.append(y_pred)

In [48]:
accs_adj = []

Here are the accuracies for adjectives only:

In [50]:
for i in range(len(models)):
    print(f'{models[i]["label"]}: Accuracy {accuracy_score(y_adj_test[i], y_preds_adj[i])}')

flau_small_c: Accuracy 0.40816326530612246
flau_base_u: Accuracy 0.2890792291220557
flau_base_c: Accuracy 0.288265306122449
flau_large_c: Accuracy 0.4107142857142857


ValueError: at least one array or dtype is required

# Adjectives and nouns

In [51]:
pos = ['ADJ', 'NOUN']

X_na_train = []
y_na_train = []

X_na_test = []
y_na_test = []

for we in we_with_features:
    xtr, xtst, ytr, ytst = prepare_dataset(dataset=we[(we.Gender != 'invariable') & (we.POS.isin(pos))],
                                          feature_col_count=feature_col_count,
                                          feature_name=feature,
                                          encode=True,
                                          split=True)
    X_na_train.append(xtr)
    X_na_test.append(xtst)
    
    y_na_train.append(ytr)
    y_na_test.append(ytst)

anova_dims_na = []

for i in range(len(models)):
    model_dims = []
    for dim in X_na_train[i].columns:
        sample1 = [x[0] for x in zip(X_na_train[i][dim], y_na_train[i]) if x[1] == 0]
        sample2 = [x[0] for x in zip(X_na_train[i][dim], y_na_train[i]) if x[1] == 1]
        if f_oneway(sample1, sample2).pvalue < pv_threshold:
            model_dims.append(dim)
    anova_dims_na.append(model_dims)

mi_dims_na = []

for i in range(len(models)):
    res = mutual_info_classif(X_na_train[i], y_na_train[i], discrete_features=[False]*len(X_na_train[i].columns))
    non_indep_dims = [str(x[0]) for x in np.argwhere(res > 0)]
    mi_dims_na.append(non_indep_dims)

mi_annova_dims_na = [set(anova_dims_na[i]).intersection(mi_dims_na[i]) for i in range(len(models))]

medians_na = []

for i in range(len(models)):
    medians_df = pd.DataFrame(columns=['median_0', 'median_1'])
    medians_df['median_0'] = X_na_train[i][y_na_train[i] == 0][list(mi_annova_dims_na[i])].median()
    medians_df['median_1'] = X_na_train[i][y_na_train[i] == 1][list(mi_annova_dims_na[i])].median()
    medians_na.append(medians_df)

y_preds_na = []

for i in range(len(models)):
    mse0 = X_na_test[i][list(mi_annova_dims_na[i])].apply(lambda x: mean_absolute_error(medians_na[i]['median_0'], x), axis=1)
    mse1 = X_na_test[i][list(mi_annova_dims_na[i])].apply(lambda x: mean_absolute_error(medians_na[i]['median_1'], x), axis=1)
    # If MSE for sample 0 is lower than for sample 1, the label should be 0. So we need to convert False to 0.
    y_pred = (mse0 > mse1).apply(int)
    y_preds_na.append(y_pred)

accs_na = []

for i in range(len(models)):
    print(f'{models[i]["label"]}: Accuracy {accuracy_score(y_na_test[i], y_preds_na[i])}')

flau_small_c: Accuracy 0.29818857408267535
flau_base_u: Accuracy 0.23401869158878505
flau_base_c: Accuracy 0.215513237343242
flau_large_c: Accuracy 0.3111936832326986
cam_base: Accuracy 0.10642857142857143


# Conclusion

We can see that the accuracy is much higher for classifying nouns than adjectives.

For some reason, the accuracy for predictions on adjectives is below random choice.

Once again FlauBERT small despite much smaller size is showing comparable accuracy with FlauBERT large.