In [1]:
import sys
sys.path.append('../Util')

In [2]:
from preparation import prepare_dataset, read_datasets

In [3]:
from scipy.stats import f_oneway

In [4]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import mean_absolute_error, accuracy_score

In [5]:
import pandas as pd
import numpy as np
from scipy.stats import shapiro
import seaborn as sns
import matplotlib.pyplot as plt

In [6]:
import warnings
warnings.filterwarnings('ignore')

Because of their large size, the word-embedding (WE) files are not uploaded to GitHub; they can instead be downloaded [here](https://drive.google.com/drive/folders/10Ea62GRlq4t7bq-nK9tPtYFu0kbCciey?usp=sharing).

The code below expects a folder "Data" in the root folder containing all the information from the Google Drive.

In [7]:
# Models under comparison. Each entry pairs a model identifier ('name') with a
# short 'label'; the labels are what the rest of the notebook uses for loading
# the per-model data and for printing results.
models = [
    {'name': name, 'label': label}
    for name, label in (
        ('flaubert/flaubert_small_cased', 'flau_small_c'),
        ('flaubert/flaubert_base_uncased', 'flau_base_u'),
        ('flaubert/flaubert_base_cased', 'flau_base_c'),
        ('flaubert/flaubert_large_cased', 'flau_large_c'),
        ('camembert/camembert-base', 'cam_base'),
    )
]

In [8]:
# Short labels, in the same order as `models`; passed to read_datasets as `model_labels`.
labels = [spec['label'] for spec in models]

In [9]:
# Load the word-embedding tables for the labels above from ../Data.
# NOTE(review): the result is indexed and iterated per model below, so it is
# presumably a list of DataFrames in the same order as `labels` — confirm in preparation.py.
we_with_features = read_datasets(
    path='../Data',
    model_labels=labels,
    file_name='all_unique_pos_we.csv',
)

We want to encode POS information as binary for 3 experiments:
- For the noun POS information test: nouns encoded as 1, adjectives and verbs as 0
- For the verb POS information test: verbs encoded as 1, adjectives and nouns as 0
- For the adjective POS test: adjectives encoded as 1, verbs and nouns as 0

In [16]:
# Preview of the binary encoding on the first loaded table: 1 where POS is 'ADJ', 0 otherwise.
we_with_features[0]['POS'].eq('ADJ').apply(int)

Word
2D           0
3D           0
aa           0
aba          0
abaisse      0
            ..
évidentes    1
évolutif     1
évolutive    1
évoquée      1
évoquées     1
Name: POS, Length: 14883, dtype: int64

In [17]:
# Add one binary indicator column per POS tag to every loaded table
# (1 when the word's POS equals the tag, 0 otherwise).
# The comparison is cast with a vectorised astype instead of a per-element apply(int);
# 'int64' keeps the same dtype that apply(int) produced.
for df in we_with_features:
    for column, pos_tag in (('is_noun', 'NOUN'), ('is_verb', 'VERB'), ('is_adj', 'ADJ')):
        df[column] = (df['POS'] == pos_tag).astype('int64')

In [18]:
# Show the first five rows of the first table to check the three new indicator columns.
we_with_features[0].head(n=5)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,511,Number,Gender,Lemma,POS,Tense,Person,is_noun,is_verb,is_adj
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2D,5.531172,-8.963815,1.55832,3.14355,-5.372142,-0.174002,-1.124767,5.729996,-2.367389,4.247167,...,-4.613951,invariable,feminine,2D,NOUN,,,1,0,0
3D,3.969331,-6.442656,1.451928,3.447791,-4.224664,-1.029557,-3.664733,4.911453,0.223902,5.621365,...,-2.951385,invariable,feminine,3D,NOUN,,,1,0,0
aa,5.617864,-6.741737,2.519838,-3.914263,2.801907,-1.182259,4.43567,-1.600746,-0.582458,1.409745,...,2.984729,invariable,masculine,aa,NOUN,,,1,0,0
aba,4.914313,-6.923126,-3.848757,5.110574,-2.516107,-4.938292,2.373581,-2.75659,2.567556,2.412183,...,-4.154819,singular,masculine,aba,NOUN,,,1,0,0
abaisse,4.652038,-4.028066,0.8832,4.782077,-2.294614,-3.894452,-0.810279,-0.713935,4.81991,4.09015,...,-1.836028,singular,feminine,abaisse,NOUN,,,1,0,0


In [23]:
# Besides the embedding dimensions, each table carries 9 extra feature columns:
# the six read from the file (Number, Gender, Lemma, POS, Tense, Person) and the
# three binary indicators added above (is_noun, is_verb, is_adj).
feature_col_count = len((
    'Number', 'Gender', 'Lemma', 'POS', 'Tense', 'Person',
    'is_noun', 'is_verb', 'is_adj',
))

# Methodology

1. Exclude datapoints with undefined feature values (e.g. Gender = `invariable`)
2. Encode a grammatical feature as binary (e.g. Gender = 0 if masculine and 1 if feminine) (feature vector)
3. Shuffle the data set
4. Separate the dataset into 80% "training" and 20% test data
5. For each dimension in the training dataset, measure whether the dimension values depend on the grammatical feature:
* Using [ANOVA](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.f_oneway.html): The data is split into 2 samples, all dimension values when the grammatical feature is 0 and when it's equal to 1.
* Using [Mutual Information](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.mutual_info_classif.html)
6. For each dimension highlighted during step #5 find medians for 2 subgroups: when the grammatical feature == 0 and when it == 1.
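7. For each word in the test dataset, predict the label by computing the mean absolute error (MAE) between the word's values on the selected dimensions and each of the two median vectors from #6; the label with the lower error is predicted.
8. Compute accuracy on the test dataset.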

We assume that achieved accuracies can be an efficient way of comparing the quality of grammatical information encoding in the word embeddings.

In this notebook the grammatical feature is the part of speech (POS): the experiment is run three times, once for each of the binary features `is_noun`, `is_adj` and `is_verb`.

# Nouns

In [24]:
# Feature to investigate in this notebook
# Binary target column studied in this section (1 = noun, 0 = adjective or verb).
feature = "is_noun"

In [25]:
# Per-model containers for the noun experiment: one entry per model will be appended to each.
X_noun_train, y_noun_train = [], []
X_noun_test, y_noun_test = [], []

In [26]:
# Build the train/test split of every model for the current `feature`.
# The containers are (re)created here so that re-running this cell alone does not
# append a second copy of every split.
X_noun_train, X_noun_test = [], []
y_noun_train, y_noun_test = [], []

for we in we_with_features:
    # With split=True, prepare_dataset returns four objects, unpacked here as
    # train features, test features, train labels, test labels.
    # NOTE(review): shuffling/seeding happens inside prepare_dataset — confirm it is reproducible.
    xtr, xtst, ytr, ytst = prepare_dataset(dataset=we,
                                          feature_col_count=feature_col_count,
                                          feature_name=feature,
                                          split=True)
    X_noun_train.append(xtr)
    X_noun_test.append(xtst)

    y_noun_train.append(ytr)
    y_noun_test.append(ytst)

### Compute ANOVA

We split the values of each dimension into 2 samples: words for which the feature is 1 (here, nouns) and words for which it is 0 (here, adjectives and verbs). The ANOVA test is used to assess whether the two population means are the same.

If the population means are not the same, we can assume that the POS information affects the distribution of values in the given dimension.

If p-value < 0.001, reject the null hypothesis.

In [27]:
# Significance level for the ANOVA test: dimensions with p-value below it are kept.
pv_threshold = 1e-3

In [28]:
# One list of ANOVA-selected dimension names per model will be appended here.
anova_dims = list()

In [29]:
# For every model, keep the dimensions whose one-way ANOVA p-value (label 0 vs label 1)
# is below pv_threshold.
# The result list is (re)created here so that re-running this cell does not duplicate entries.
anova_dims = []

for i in range(len(models)):
    # Positional boolean masks for the two groups, computed once per model
    # instead of rebuilding two Python lists for every dimension.
    y_values = np.asarray(y_noun_train[i])
    is_zero = y_values == 0
    is_one = y_values == 1

    model_dims = []
    for dim in X_noun_train[i].columns:
        values = X_noun_train[i][dim].to_numpy()
        # A NaN p-value compares False and therefore leaves the dimension out.
        if f_oneway(values[is_zero], values[is_one]).pvalue < pv_threshold:
            model_dims.append(dim)
    anova_dims.append(model_dims)

We can see that a very large number of dimensions are highlighted by the test:

In [30]:
# Per model: total number of embedding dimensions vs. number retained by ANOVA.
for i, spec in enumerate(models):
    total_dims = len(X_noun_train[i].columns)
    kept_dims = len(anova_dims[i])
    print(f'{spec["label"]}:\nTotal dimensions {total_dims}\nANOVA dimensions: {kept_dims}\n')

flau_small_c:
Total dimensions 512
ANOVA dimensions: 432

flau_base_u:
Total dimensions 768
ANOVA dimensions: 552

flau_base_c:
Total dimensions 768
ANOVA dimensions: 503

flau_large_c:
Total dimensions 1024
ANOVA dimensions: 835

cam_base:
Total dimensions 768
ANOVA dimensions: 100



### Compute Mutual Information

If mutual information is 0, we can consider that a given dimension is independent from POS information.

If MI > 0, we can't consider the dimension completely independent and it could encode the POS information.

In [31]:
# One list of mutual-information-selected dimension names per model will be appended here.
mi_dims = list()

In [32]:
# For every model, keep the dimensions whose estimated mutual information with the label is > 0.
# The result list is (re)created here so that re-running this cell does not duplicate entries.
mi_dims = []

for i in range(len(models)):
    # All features are continuous (discrete_features=False). The estimator adds random noise
    # to continuous features, so random_state is fixed to make the selection reproducible.
    mi_scores = mutual_info_classif(X_noun_train[i], y_noun_train[i],
                                    discrete_features=False, random_state=0)
    # Take the actual column labels of the selected dimensions rather than str(position),
    # so the names stay correct even if column labels ever differ from their positions.
    non_indep_dims = X_noun_train[i].columns[mi_scores > 0].tolist()
    mi_dims.append(non_indep_dims)

Overall, a threshold of 0 selects more dimensions than ANOVA for most models. This could potentially be addressed with a stricter (higher) threshold.

In [33]:
# Per model: total dimensions, ANOVA-selected dimensions and MI-selected dimensions.
for i, spec in enumerate(models):
    report_lines = [
        f"{spec['label']}:",
        f"    Total dimensions {len(X_noun_train[i].columns)}",
        f"    ANOVA dimensions: {len(anova_dims[i])}",
        f"    Mutual Information dimension: {len(mi_dims[i])}\n\n",
    ]
    print("\n".join(report_lines))

flau_small_c:
    Total dimensions 512
    ANOVA dimensions: 432
    Mutual Information dimension: 427


flau_base_u:
    Total dimensions 768
    ANOVA dimensions: 552
    Mutual Information dimension: 768


flau_base_c:
    Total dimensions 768
    ANOVA dimensions: 503
    Mutual Information dimension: 559


flau_large_c:
    Total dimensions 1024
    ANOVA dimensions: 835
    Mutual Information dimension: 872


cam_base:
    Total dimensions 768
    ANOVA dimensions: 100
    Mutual Information dimension: 432




For now, for each model we select only the dimensions that are potentially dependent on the POS information and are found by both tests.

In [34]:
# Per model: the set of dimensions selected by both ANOVA and mutual information.
mi_annova_dims = [set(anova_dims[k]) & set(mi_dims[k]) for k in range(len(models))]

Final number of dimensions that we consider not independent of the POS information for each model:

In [35]:
# Per model: how many dimensions were kept by both tests.
for i, spec in enumerate(models):
    n_selected = len(mi_annova_dims[i])
    print(f'{spec["label"]}: {n_selected}')

flau_small_c: 384
flau_base_u: 552
flau_base_c: 394
flau_large_c: 752
cam_base: 61


### Compute medians

In [36]:
# One DataFrame of per-dimension medians per model will be appended here.
medians = list()

In [37]:
# For every model, compute the training-set median of each selected dimension separately
# for the words labelled 0 ('median_0') and the words labelled 1 ('median_1').
# The result list is (re)created here so that re-running this cell does not duplicate entries.
medians = []

for i in range(len(models)):
    # Restrict to the selected dimensions first, so the row filters below copy only
    # those columns instead of the full-width frame.
    X_selected = X_noun_train[i][list(mi_annova_dims[i])]
    medians_df = pd.DataFrame({
        'median_0': X_selected[y_noun_train[i] == 0].median(),
        'median_1': X_selected[y_noun_train[i] == 1].median(),
    })
    medians.append(medians_df)

In [77]:
# Display the per-dimension medians (median_0 / median_1) computed for the first model in `models`.
medians[0]

Unnamed: 0,median_0,median_1
310,0.600646,0.512569
178,0.496572,0.451101
450,0.525901,0.509614
305,0.584902,0.493335
109,0.534789,0.552056
...,...,...
488,0.450956,0.477892
478,0.549268,0.590075
283,0.505178,0.529084
209,0.389550,0.432775


### Predict label for test set using MAE

In [39]:
# One Series of predicted labels per model will be appended here.
y_preds = list()

In [40]:
# Predict the label of every test word as the class whose median vector is closer
# in mean absolute error (MAE) over the selected dimensions.
# The result list is (re)created here so that re-running this cell does not duplicate entries.
y_preds = []

for i in range(len(models)):
    X_selected = X_noun_test[i][list(mi_annova_dims[i])]
    # Vectorised MAE of each row against a median vector. sub(..., axis='columns') matches
    # the medians to the columns by dimension label, so the result does not depend on the
    # iteration order of the selected-dimension set.
    mae0 = X_selected.sub(medians[i]['median_0'], axis='columns').abs().mean(axis=1)
    mae1 = X_selected.sub(medians[i]['median_1'], axis='columns').abs().mean(axis=1)
    # If the MAE to the class-0 medians is lower than to the class-1 medians the label is 0,
    # so the boolean (mae0 > mae1) converted to int is the predicted label.
    y_pred = (mae0 > mae1).astype('int64')
    y_preds.append(y_pred)

### Compute accuracy

In [41]:
# Test-set accuracy of the median-based classifier, per model.
for i, spec in enumerate(models):
    accuracy = accuracy_score(y_noun_test[i], y_preds[i])
    print(f'{spec["label"]}: Accuracy {accuracy}')

flau_small_c: Accuracy 0.8558951965065502
flau_base_u: Accuracy 0.6657549234135668
flau_base_c: Accuracy 0.5754114880752436
flau_large_c: Accuracy 0.8538797447094391
cam_base: Accuracy 0.4769071094966269


# Adjectives

Repeat all the steps with `is_adj` as the feature (adjectives encoded as 1, nouns and verbs as 0).

In [57]:
# Binary target column studied in this section (1 = adjective, 0 = noun or verb).
feature = "is_adj"

In [58]:
# Per-model containers for the adjective experiment: one entry per model will be appended to each.
X_adj_train, y_adj_train = [], []
X_adj_test, y_adj_test = [], []

In [59]:
# Build the train/test split of every model for the current `feature`.
# The containers are (re)created here so that re-running this cell alone does not
# append a second copy of every split.
X_adj_train, X_adj_test = [], []
y_adj_train, y_adj_test = [], []

for we in we_with_features:
    # With split=True, prepare_dataset returns four objects, unpacked here as
    # train features, test features, train labels, test labels.
    # NOTE(review): shuffling/seeding happens inside prepare_dataset — confirm it is reproducible.
    xtr, xtst, ytr, ytst = prepare_dataset(dataset=we,
                                          feature_col_count=feature_col_count,
                                          feature_name=feature,
                                          split=True)
    X_adj_train.append(xtr)
    X_adj_test.append(xtst)

    y_adj_train.append(ytr)
    y_adj_test.append(ytst)

In [60]:
# One list of ANOVA-selected dimension names per model will be appended here.
anova_dims_adj = list()

In [61]:
# For every model, keep the dimensions whose one-way ANOVA p-value (label 0 vs label 1)
# is below pv_threshold.
# The result list is (re)created here so that re-running this cell does not duplicate entries.
anova_dims_adj = []

for i in range(len(models)):
    # Positional boolean masks for the two groups, computed once per model
    # instead of rebuilding two Python lists for every dimension.
    y_values = np.asarray(y_adj_train[i])
    is_zero = y_values == 0
    is_one = y_values == 1

    model_dims = []
    for dim in X_adj_train[i].columns:
        values = X_adj_train[i][dim].to_numpy()
        # A NaN p-value compares False and therefore leaves the dimension out.
        if f_oneway(values[is_zero], values[is_one]).pvalue < pv_threshold:
            model_dims.append(dim)
    anova_dims_adj.append(model_dims)

In [62]:
# One list of mutual-information-selected dimension names per model will be appended here.
mi_dims_adj = list()

In [63]:
# For every model, keep the dimensions whose estimated mutual information with the label is > 0.
# The result list is (re)created here so that re-running this cell does not duplicate entries.
mi_dims_adj = []

for i in range(len(models)):
    # All features are continuous (discrete_features=False). The estimator adds random noise
    # to continuous features, so random_state is fixed to make the selection reproducible.
    mi_scores = mutual_info_classif(X_adj_train[i], y_adj_train[i],
                                    discrete_features=False, random_state=0)
    # Take the actual column labels of the selected dimensions rather than str(position),
    # so the names stay correct even if column labels ever differ from their positions.
    non_indep_dims = X_adj_train[i].columns[mi_scores > 0].tolist()
    mi_dims_adj.append(non_indep_dims)

In [64]:
# Per model: the set of dimensions selected by both ANOVA and mutual information.
mi_annova_dims_adj = [set(anova_dims_adj[k]) & set(mi_dims_adj[k]) for k in range(len(models))]

In [65]:
# One DataFrame of per-dimension medians per model will be appended here.
medians_adj = list()

In [66]:
# For every model, compute the training-set median of each selected dimension separately
# for the words labelled 0 ('median_0') and the words labelled 1 ('median_1').
# The result list is (re)created here so that re-running this cell does not duplicate entries.
medians_adj = []

for i in range(len(models)):
    # Restrict to the selected dimensions first, so the row filters below copy only
    # those columns instead of the full-width frame.
    X_selected = X_adj_train[i][list(mi_annova_dims_adj[i])]
    medians_df = pd.DataFrame({
        'median_0': X_selected[y_adj_train[i] == 0].median(),
        'median_1': X_selected[y_adj_train[i] == 1].median(),
    })
    medians_adj.append(medians_df)

In [67]:
# One Series of predicted labels per model will be appended here.
y_preds_adj = list()

In [68]:
# Predict the label of every test word as the class whose median vector is closer
# in mean absolute error (MAE) over the selected dimensions.
# The result list is (re)created here so that re-running this cell does not duplicate entries.
y_preds_adj = []

for i in range(len(models)):
    X_selected = X_adj_test[i][list(mi_annova_dims_adj[i])]
    # Vectorised MAE of each row against a median vector. sub(..., axis='columns') matches
    # the medians to the columns by dimension label, so the result does not depend on the
    # iteration order of the selected-dimension set.
    mae0 = X_selected.sub(medians_adj[i]['median_0'], axis='columns').abs().mean(axis=1)
    mae1 = X_selected.sub(medians_adj[i]['median_1'], axis='columns').abs().mean(axis=1)
    # If the MAE to the class-0 medians is lower than to the class-1 medians the label is 0,
    # so the boolean (mae0 > mae1) converted to int is the predicted label.
    y_pred = (mae0 > mae1).astype('int64')
    y_preds_adj.append(y_pred)

In [69]:
# Per-model test accuracies for the adjective experiment (filled in below).
accs_adj = list()

Here are the accuracies for the adjective-vs-rest classification:

In [70]:
# Test-set accuracy of the median-based classifier for each model, in `models` order.
accs_adj = []
for k in range(len(models)):
    accs_adj.append(accuracy_score(y_adj_test[k], y_preds_adj[k]))

In [71]:
# Test-set accuracy of the median-based classifier, per model.
for i, spec in enumerate(models):
    accuracy = accuracy_score(y_adj_test[i], y_preds_adj[i])
    print(f'{spec["label"]}: Accuracy {accuracy}')

flau_small_c: Accuracy 0.8642929123278468
flau_base_u: Accuracy 0.7169037199124726
flau_base_c: Accuracy 0.611353711790393
flau_large_c: Accuracy 0.8545515619751428
cam_base: Accuracy 0.6875973015049299


# Verbs

In [73]:
# Full pipeline for the verb experiment (1 = verb, 0 = noun or adjective).
# Every container is created inside this cell, so the cell can be re-run on its own.
feature = 'is_verb'

# --- Train/test splits, one entry per model ---
X_v_train, X_v_test = [], []
y_v_train, y_v_test = [], []

for we in we_with_features:
    # With split=True, prepare_dataset returns four objects, unpacked here as
    # train features, test features, train labels, test labels.
    # NOTE(review): shuffling/seeding happens inside prepare_dataset — confirm it is reproducible.
    xtr, xtst, ytr, ytst = prepare_dataset(dataset=we,
                                          feature_col_count=feature_col_count,
                                          feature_name=feature,
                                          split=True)
    X_v_train.append(xtr)
    X_v_test.append(xtst)

    y_v_train.append(ytr)
    y_v_test.append(ytst)

# --- ANOVA: keep dimensions whose p-value (label 0 vs label 1) is below pv_threshold ---
anova_dims_v = []

for i in range(len(models)):
    # Positional boolean masks for the two groups, computed once per model
    # instead of rebuilding two Python lists for every dimension.
    y_values = np.asarray(y_v_train[i])
    is_zero = y_values == 0
    is_one = y_values == 1

    model_dims = []
    for dim in X_v_train[i].columns:
        values = X_v_train[i][dim].to_numpy()
        # A NaN p-value compares False and therefore leaves the dimension out.
        if f_oneway(values[is_zero], values[is_one]).pvalue < pv_threshold:
            model_dims.append(dim)
    anova_dims_v.append(model_dims)

# --- Mutual information: keep dimensions with estimated MI > 0 ---
mi_dims_v = []

for i in range(len(models)):
    # All features are continuous (discrete_features=False). The estimator adds random noise
    # to continuous features, so random_state is fixed to make the selection reproducible.
    mi_scores = mutual_info_classif(X_v_train[i], y_v_train[i],
                                    discrete_features=False, random_state=0)
    # Take the actual column labels rather than str(position), so the names stay correct
    # even if column labels ever differ from their positions.
    non_indep_dims = X_v_train[i].columns[mi_scores > 0].tolist()
    mi_dims_v.append(non_indep_dims)

# Dimensions selected by both tests, per model.
mi_annova_dims_v = [set(anova_dims_v[i]).intersection(mi_dims_v[i]) for i in range(len(models))]

# --- Per-class medians of the selected dimensions on the training set ---
medians_v = []

for i in range(len(models)):
    # Restrict to the selected dimensions first, so the row filters copy only those columns.
    X_selected = X_v_train[i][list(mi_annova_dims_v[i])]
    medians_df = pd.DataFrame({
        'median_0': X_selected[y_v_train[i] == 0].median(),
        'median_1': X_selected[y_v_train[i] == 1].median(),
    })
    medians_v.append(medians_df)

# --- Prediction: class whose median vector has the lower mean absolute error (MAE) ---
y_preds_v = []

for i in range(len(models)):
    X_selected = X_v_test[i][list(mi_annova_dims_v[i])]
    # sub(..., axis='columns') matches the medians to the columns by dimension label.
    mae0 = X_selected.sub(medians_v[i]['median_0'], axis='columns').abs().mean(axis=1)
    mae1 = X_selected.sub(medians_v[i]['median_1'], axis='columns').abs().mean(axis=1)
    # If the MAE to the class-0 medians is lower than to the class-1 medians the label is 0,
    # so the boolean (mae0 > mae1) converted to int is the predicted label.
    y_pred = (mae0 > mae1).astype('int64')
    y_preds_v.append(y_pred)

# --- Accuracy on the test set (accs_v was previously created but never filled) ---
accs_v = [accuracy_score(y_v_test[i], y_preds_v[i]) for i in range(len(models))]

for i in range(len(models)):
    print(f'{models[i]["label"]}: Accuracy {accs_v[i]}')

flau_small_c: Accuracy 0.9089687604971448
flau_base_u: Accuracy 0.7568380743982495
flau_base_c: Accuracy 0.5945582801477998
flau_large_c: Accuracy 0.8878065166274773
cam_base: Accuracy 0.4193046185781007


# Conclusion

With this framework we can compare the quality of grammatical information encoding.

From the results we can see that:
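* FlauBERT small (cased) and FlauBERT large (cased) appear to encode the POS information best (accuracy ≈ 0.85–0.91 on all three tasks); among the base models, the uncased variant (≈ 0.67–0.76) scores higher than the cased one (≈ 0.58–0.61)
* FlauBERT large achieves high accuracy (≈ 0.85–0.89) in separating nouns, adjectives and verbs from the other two classes based only on the values of the selected embedding dimensions
* Despite its much smaller size, FlauBERT small achieves classification accuracy comparable to FlauBERT large on all three tasks (and slightly higher for verbs)
* CamemBERT classification results are close to random for nouns and verbs (≈ 0.48 and ≈ 0.42); only the adjective task is clearly above 0.5 (≈ 0.69)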