In [118]:
import torch
from transformers import FlaubertModel, FlaubertTokenizer

import numpy as np
import pandas as pd 

# Choose among ['flaubert/flaubert_small_cased', 'flaubert/flaubert_base_uncased', 
#               'flaubert/flaubert_base_cased', 'flaubert/flaubert_large_cased']
modelname = 'flaubert/flaubert_small_cased' 

In [5]:
# Load the pretrained model named by `modelname`. Because output_loading_info=True is
# passed, the call is unpacked into the model and a second value (`log`) holding the
# loading information.
flaubert, log = FlaubertModel.from_pretrained(modelname, output_loading_info=True)
# Load the tokenizer for the same checkpoint name; do_lowercase=False is passed,
# consistent with the "cased" checkpoint selected in `modelname`.
flaubert_tokenizer = FlaubertTokenizer.from_pretrained(modelname, do_lowercase=False)

Downloading config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/208M [00:00<?, ?B/s]

Some weights of the model checkpoint at flaubert/flaubert_small_cased were not used when initializing FlaubertModel: ['pred_layer.proj.weight', 'pred_layer.proj.bias']
- This IS expected if you are initializing FlaubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing FlaubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading vocab.json:   0%|          | 0.00/1.49M [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/875k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

## Function to retrieve word embeddings (WE)

In [107]:
def get_we(model, tokenizer, word):
    """Return the model's last-layer output for a single word as a flat NumPy array.

    Parameters
    ----------
    model : callable
        Model that accepts a tensor of token ids of shape (1, 1) and returns a
        sequence whose first element is the last-layer tensor.
    tokenizer : object
        Tokenizer exposing ``encode(word)`` that returns a list of token ids.
    word : str
        The word to embed.

    Returns
    -------
    numpy.ndarray
        The last-layer output for the word's token, flattened to one dimension.

    Notes
    -----
    Only the token at position 1 of the encoded sequence is used, so a word that
    the tokenizer splits into several sub-word tokens is represented by its first
    sub-word only.
    """
    # The tokenizer output has the form [0, <word_id>, 1], where 0 and 1 are the
    # ids of the special start and end tokens; the word's own id is therefore the
    # second element.
    # Use the `tokenizer` argument rather than a notebook-level global so the
    # function works with whatever tokenizer the caller passes in.
    word_id = tokenizer.encode(word)[1]

    # The model expects a batch of sequences, hence the (1, 1)-shaped tensor.
    token_ids = torch.tensor([[word_id]])

    # Inference only: skip gradient tracking. Use the `model` argument, not a global.
    with torch.no_grad():
        last_layer = model(token_ids)[0]

    # The output tensor is 3-dimensional; flatten it to a plain vector.
    return last_layer.detach().numpy().flatten()

## Compare WE of word pairs

We will create 2 pairs of embeddings for singular and plural nouns: one pair uses the `-s` inflection, the other uses the `au` -> `aux` inflection.

In [145]:
# Embed the four nouns (singular/plural of each pair) with the same model and tokenizer.
pomme_we, pommes_we, oiseau_we, oiseaux_we = (
    get_we(flaubert, flaubert_tokenizer, noun)
    for noun in ('pomme', 'pommes', 'oiseau', 'oiseaux')
)

We will now store the WE in a Pandas DataFrame.

In [146]:
# Build the frame in one step: one row per embedding dimension, one column per word.
# The number of rows is taken from the embedding itself instead of a hard-coded 512,
# so the cell keeps working when a model with a different hidden size is selected.
df = pd.DataFrame({
    'dim': range(len(pomme_we)),
    'pomme': pomme_we,
    'pommes': pommes_we,
    'oiseau': oiseau_we,
    'oiseaux': oiseaux_we,
})
df

Unnamed: 0,dim,pomme,pommes,oiseau,oiseaux
0,0,-0.789583,2.870298,8.625494,4.051764
1,1,0.173690,2.873069,2.405889,-0.354589
2,2,2.101598,-0.319898,-1.324769,-2.143469
3,3,1.687404,3.300869,0.028446,-1.254749
4,4,1.308981,2.539638,3.579112,5.393857
...,...,...,...,...,...
507,507,-3.022486,-4.887702,-0.263529,2.995455
508,508,-2.054853,-1.295427,-2.647445,-3.688531
509,509,-7.838132,-5.978479,-0.322017,-3.423790
510,510,2.746323,1.209072,4.533642,5.523484


We will now create a new column calculating difference of each dimension for both pairs.

In [147]:
# Per-dimension absolute difference between the singular and plural form of each noun.
df['p_diff'] = (df['pomme'] - df['pommes']).abs()
df['o_diff'] = (df['oiseau'] - df['oiseaux']).abs()

In [148]:
# Display the frame, which now also carries the `p_diff` and `o_diff` columns.
df

Unnamed: 0,dim,pomme,pommes,oiseau,oiseaux,p_diff,o_diff
0,0,-0.789583,2.870298,8.625494,4.051764,3.659881,4.573730
1,1,0.173690,2.873069,2.405889,-0.354589,2.699379,2.760478
2,2,2.101598,-0.319898,-1.324769,-2.143469,2.421496,0.818700
3,3,1.687404,3.300869,0.028446,-1.254749,1.613466,1.283195
4,4,1.308981,2.539638,3.579112,5.393857,1.230657,1.814745
...,...,...,...,...,...,...,...
507,507,-3.022486,-4.887702,-0.263529,2.995455,1.865216,3.258984
508,508,-2.054853,-1.295427,-2.647445,-3.688531,0.759426,1.041085
509,509,-7.838132,-5.978479,-0.322017,-3.423790,1.859653,3.101774
510,510,2.746323,1.209072,4.533642,5.523484,1.537251,0.989842


## Dimensions with closest and furthest values between pairs

We will now sort the dataframe first by `p_diff` and then by `o_diff` to find the closest and furthest dimensions for both pairs. We will analyze the 26 closest and the 26 furthest dimensions (roughly 5% of all dimensions each) for both pairs.

In [176]:
# Rows ordered by ascending `p_diff`: dimensions where pomme/pommes agree most come first.
df.sort_values('p_diff')

Unnamed: 0,dim,pomme,pommes,oiseau,oiseaux,p_diff,o_diff
481,481,-1.149791,-1.139267,-1.872658,-4.241241,0.010524,2.368583
188,188,2.357234,2.368219,2.926783,3.745159,0.010984,0.818375
202,202,4.795053,4.782814,1.274314,2.393198,0.012240,1.118885
171,171,3.845696,3.828545,6.157597,3.620735,0.017151,2.536861
176,176,4.805288,4.822771,4.384145,0.634487,0.017483,3.749658
...,...,...,...,...,...,...,...
319,319,6.953439,0.874482,4.557711,3.035436,6.078958,1.522274
263,263,-2.911643,-9.205585,-5.997969,-1.286199,6.293943,4.711770
467,467,-4.666477,1.949474,5.987657,5.078243,6.615952,0.909414
333,333,0.123309,-6.584016,0.215944,-0.595359,6.707325,0.811303


In [177]:
# The 26 rows with the smallest `p_diff`, i.e. the "closest" dimensions for pomme/pommes.
df.sort_values('p_diff').head(26)

Unnamed: 0,dim,pomme,pommes,oiseau,oiseaux,p_diff,o_diff
481,481,-1.149791,-1.139267,-1.872658,-4.241241,0.010524,2.368583
188,188,2.357234,2.368219,2.926783,3.745159,0.010984,0.818375
202,202,4.795053,4.782814,1.274314,2.393198,0.01224,1.118885
171,171,3.845696,3.828545,6.157597,3.620735,0.017151,2.536861
176,176,4.805288,4.822771,4.384145,0.634487,0.017483,3.749658
137,137,2.840835,2.868752,3.049018,4.329151,0.027917,1.280133
443,443,-4.009586,-4.043835,-2.234268,0.200704,0.034248,2.434972
56,56,-0.397858,-0.354643,-0.771506,0.8807,0.043215,1.652206
412,412,4.602641,4.556083,0.262771,1.080966,0.046558,0.818195
149,149,3.450408,3.498649,2.508493,5.90979,0.048241,3.401297


In [178]:
# Sort once by `p_diff`, then take the 26 dimensions at each end of the ordering.
dims_by_p_diff = df.sort_values('p_diff')['dim']
closest_dim_p = dims_by_p_diff[:26]
furthest_dim_p = dims_by_p_diff[-26:]

In [179]:
# Sort once by `o_diff`, then take the 26 dimensions at each end of the ordering.
dims_by_o_diff = df.sort_values('o_diff')['dim']
closest_dim_o = dims_by_o_diff[:26]
furthest_dim_o = dims_by_o_diff[-26:]

Now we can check if there is any intersection between the closest dimensions of `pomme` and `oiseau`.

In [180]:
# Dimensions that are among the closest for both the oiseau and the pomme pair.
set(closest_dim_o) & set(closest_dim_p)

{91, 111}

In [181]:
# Dimensions that are among the furthest for both the oiseau and the pomme pair.
set(furthest_dim_o) & set(furthest_dim_p)

{272, 371}

We have only two intersections for the closest dimensions: dimensions `91` and `111`. We can hypothesize that one of them represents `Noun` or `Gender`.

As for the furthest values, we have `272` and `371`. We can hypothesize that one of these dimensions is responsible for the multiplicity (singular vs. plural), or that the difference is due to the words ending in different characters.

## Dimensions with the closest and furthest values for pair of singular nouns

Now we can compare the difference in values between 2 singular nouns - `pomme` and `oiseau`.

If `91` or `111` represent `Noun`, we should expect that embeddings of `pomme` and `oiseau` have the values of these dimensions close.

If `91` or `111` represent `Gender`, we should expect to see these dimensions in the list of the furthest ones.

If `272` or `371` represent multiplicity, we should expect to see this dimension in the closest list, since both nouns are singular.

If `272` or `371` represent the last character of the word representation, we can expect them to be in the list of the furthest dimensions since `pomme` ends with `e` and `oiseau` with `u`.

In [188]:
# Absolute per-dimension gap between the two singular nouns, sorted ascending;
# the first/last 26 index labels are the closest/furthest dimensions.
singular_gap_sorted = (df['pomme'] - df['oiseau']).abs().sort_values()
closest_dim_sing = singular_gap_sorted.index[:26]
furthest_dim_sing = singular_gap_sorted.index[-26:]

In [192]:
# Which of the candidate dimensions are among the closest for the two nouns?
set(closest_dim_sing) & {91, 111, 272, 371}

set()

In [193]:
# Which of the candidate dimensions are among the furthest for the two nouns?
set(furthest_dim_sing) & {91, 111, 272, 371}

{371}

We have dimension `371` in the furthest list, potentially signalling that this dimension represents the word end and doesn't carry grammatical information.

We can as well look at how close the values are in all of these dimensions:

In [234]:
# Empty frame with one column per candidate dimension; the values are filled in by the next cell.
dim_df = pd.DataFrame(columns=['d91', 'd111', 'd272', 'd371'])

In [235]:
# Compute the pomme/oiseau gap once and copy the four candidate dimensions
# into a one-row frame.
singular_gap = (df['pomme'] - df['oiseau']).abs()
for dim_label in (91, 111, 272, 371):
    dim_df[f'd{dim_label}'] = [singular_gap.at[dim_label]]
dim_df

Unnamed: 0,d91,d111,d272,d371
0,2.495064,1.544467,2.325571,31.315603


We can see that the values of `d371` are the furthest apart by far, while the differences in the other dimensions are of similar magnitude.

## Dimensions with the closest and furthest values for pair of plural nouns

Now we can compare the difference in values between 2 plural nouns - `pommes` and `oiseaux`.

If `91` or `111` represent `Noun`, we should expect that embeddings of `pommes` and `oiseaux` have the values of these dimensions close.

If `91` or `111` represent `Gender`, we should expect to see these dimensions in the list of the furthest ones.

If `272` or `371` represent multiplicity, we should expect to see this dimension in the closest list, since both nouns are plural.

If `272` or `371` represent the last character of the word representation, we can expect them to be in the list of the furthest dimensions since `pommes` ends with `s` and `oiseaux` with `x`.

In [194]:
# Same selection as for the singular nouns, now on the plural pair. The result is
# stored under the existing `*_sing` names because the following cells read those names.
plural_gap_sorted = (df['pommes'] - df['oiseaux']).abs().sort_values()
closest_dim_sing = plural_gap_sorted.index[:26]
furthest_dim_sing = plural_gap_sorted.index[-26:]

In [195]:
# Candidate dimensions that appear among the closest ones.
set(closest_dim_sing) & {91, 111, 272, 371}

set()

In [196]:
# Candidate dimensions that appear among the furthest ones.
set(furthest_dim_sing) & {91, 111, 272, 371}

{272}

Now we have only one intersection: `272` is among the dimensions with the furthest values for both nouns. It could also be related to word endings.

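However, we can see that, for the plural pair, the difference in dimension `272` is much larger than the one in dimension `371`: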

In [203]:
# Label-based lookup does not depend on row order, so the gap is computed once
# and read directly at the two dimensions of interest.
plural_gap = (df['pommes'] - df['oiseaux']).abs()
plural_gap.at[272], plural_gap.at[371]

9.246571

In [236]:
# One-row frame with the pommes/oiseaux gap at each candidate dimension.
dim_df2 = pd.DataFrame(columns=['d91', 'd111', 'd272', 'd371'])
plural_gap_by_dim = (df['pommes'] - df['oiseaux']).abs()
for dim_label in (91, 111, 272, 371):
    dim_df2[f'd{dim_label}'] = [plural_gap_by_dim.at[dim_label]]
dim_df2

Unnamed: 0,d91,d111,d272,d371
0,2.50982,1.513092,9.246571,3.353041


The values of `d272` are the farthest, but nowhere near as far as the values of `d371` in the experiment above.

## Experiment with same gender and same inflection type

Now we can check if we have any different results if we use nouns of the same gender and the same inflection scheme:
`-s`. We can work with `pomme/s` and `femme/s`.

In [237]:
# Embed the singular and plural form of the second `-s` noun.
femme_we, femmes_we = (
    get_we(flaubert, flaubert_tokenizer, noun) for noun in ('femme', 'femmes')
)

We can add the new pair to the dataframe:

In [238]:
# Append one column per new embedding, then display the extended frame.
for column_name, vector in (('femme', femme_we), ('femmes', femmes_we)):
    df[column_name] = vector
df

Unnamed: 0,dim,pomme,pommes,oiseau,oiseaux,p_diff,o_diff,femme,femmes
0,0,-0.789583,2.870298,8.625494,4.051764,3.659881,4.573730,1.250606,1.972973
1,1,0.173690,2.873069,2.405889,-0.354589,2.699379,2.760478,-0.609186,-0.691200
2,2,2.101598,-0.319898,-1.324769,-2.143469,2.421496,0.818700,2.759072,4.054496
3,3,1.687404,3.300869,0.028446,-1.254749,1.613466,1.283195,-2.650006,-1.391883
4,4,1.308981,2.539638,3.579112,5.393857,1.230657,1.814745,2.289926,3.119220
...,...,...,...,...,...,...,...,...,...
507,507,-3.022486,-4.887702,-0.263529,2.995455,1.865216,3.258984,-7.861737,-4.760276
508,508,-2.054853,-1.295427,-2.647445,-3.688531,0.759426,1.041085,-4.726769,-4.160974
509,509,-7.838132,-5.978479,-0.322017,-3.423790,1.859653,3.101774,-0.187649,0.520152
510,510,2.746323,1.209072,4.533642,5.523484,1.537251,0.989842,-0.666923,-3.463926


Now same as above we add a column for differences between values of dimensions of `femme` and `femmes`.

In [239]:
# Per-dimension absolute difference between femme and femmes.
df['f_diff'] = (df['femme'] - df['femmes']).abs()

In [240]:
# Sort once by `f_diff`, then take the 26 dimensions at each end of the ordering.
dims_by_f_diff = df.sort_values('f_diff')['dim']
closest_dim_f = dims_by_f_diff[:26]
furthest_dim_f = dims_by_f_diff[-26:]

Now we can compare closest dimensions of `femme` and `femmes` and those of `pomme` and `pommes`.

In [241]:
# Closest dimensions shared by the femme and pomme pairs.
set(closest_dim_f) & set(closest_dim_p)

set()

In [246]:
# Furthest dimensions shared by the femme and pomme pairs.
set(furthest_dim_f) & set(furthest_dim_p)

set()

There is no overlap in closest and furthest dimensions  `¯\_(ツ)_/¯`

In [247]:
# Closest dimensions shared by the femme and oiseau pairs.
set(closest_dim_f) & set(closest_dim_o)

{148, 350}

In [248]:
# Furthest dimensions shared by the femme and oiseau pairs.
set(furthest_dim_f) & set(furthest_dim_o)

{209, 328}

If we compare `femme/s` with `oiseau/x` the overlap in closest and furthest dimensions is different from the ones in the experiment above `¯\_(ツ)_/¯`