In [1]:
import pandas as pd

In [2]:
import sys
sys.path.append('../Util')

In [3]:
from we import initiate_model, get_we, create_we_df

In [4]:
import warnings
warnings.filterwarnings('ignore')

In [6]:
models = [
    {
        'name': 'flaubert/flaubert_small_cased',
        'label': 'flau_small_c'
    },
    {
    
        'name': 'flaubert/flaubert_base_uncased', 
        'label': 'flau_base_u'

    },
    {
        'name': 'flaubert/flaubert_base_cased',
        'label': 'flau_base_c'
    },
    {
        'name': 'flaubert/flaubert_large_cased',
        'label': 'flau_large_c'
    },
    {
        'name': 'camembert/camembert-base',
        'label': 'cam_base'
    },
    {
        'name': 'xlm-roberta-large',
        'label': 'xlm_large'
    },
    {
    
        'name': 'xlm-roberta-base', 
        'label': 'xlm_base'

    },
    {
        'name': 'bert-base-multilingual-uncased',
        'label': 'bert_base_u'
    },
    {
        'name': 'distilbert-base-multilingual-cased',
        'label': 'distilbert_base'
    },
    {
        'name': 'bert-base-multilingual-cased',
        'label': 'bert_base_c'
    }
    
]

For all PoS the comparison will be down with the set of the word associated with the PoS from Morphalou.

For each model we'll show the percentage of the words that is tokenized as one word and the percentage of words tokenized in multiple parts.

# Nouns

In [6]:
all_nouns = pd.read_csv('../Data/Morphalou/all_nouns_v2.csv', index_col = 0)

In [7]:
all_nouns = all_nouns[(all_nouns.Number != 'invariable') & (all_nouns.Gender != 'invariable')]

In [8]:
all_nouns

Unnamed: 0_level_0,Number,Gender,Lemma
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a b c,singular,masculine,a b c
a demi-mot,singular,masculine,a demi-mot
a-mi-la,singular,masculine,a-mi-la
aabam,singular,masculine,aabam
aalénien,singular,masculine,aalénien
...,...,...,...
œuvre,singular,feminine,œuvre
œuvres,plural,feminine,œuvre
œuvre-testament,singular,feminine,œuvre-testament
œuvrette,singular,feminine,œuvrette


We can see that there are quite a lot of words inside Morphalou categorized as nouns, which are compositions of several words. We can find only non-complex words among Morphalou nouns:

In [9]:
all_nouns['len'] = [len(x.split()) for x in all_nouns.index.str.replace('-', ' ')]

In [10]:
one_word_nouns = all_nouns[all_nouns.len == 1]

In [11]:
one_word_nouns

Unnamed: 0_level_0,Number,Gender,Lemma,len
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
aabam,singular,masculine,aabam,1
aalénien,singular,masculine,aalénien,1
aaléniens,plural,masculine,aalénien,1
aba,singular,masculine,aba,1
abaca,singular,masculine,abaca,1
...,...,...,...,...
œufriers,plural,masculine,œufrier,1
œuvre,singular,feminine,œuvre,1
œuvres,plural,feminine,œuvre,1
œuvrette,singular,feminine,œuvrette,1


In [12]:
noun_lens = []

In [13]:
for i in range(len(models)):
    nouns_df = pd.read_csv(f'../Data/{models[i]["label"]}/all_nouns_we.csv')
    noun_lens.append(len(nouns_df))

In [14]:
noun_size_comp = pd.DataFrame(index=[m['label'] for m in models])

In [15]:
noun_size_comp['Noun vocab size'] = noun_lens
noun_size_comp['Percentage of Morphalou vocab'] = noun_size_comp['Noun vocab size'] * 100 / len(one_word_nouns)

In [16]:
noun_size_comp.sort_values(by='Noun vocab size', ascending=False)

Unnamed: 0,Noun vocab size,Percentage of Morphalou vocab
flau_base_u,17272,10.612205
flau_small_c,14091,8.657745
flau_base_c,14091,8.657745
flau_large_c,14091,8.657745
cam_base,9894,6.079039
bert_base_u,6982,4.289857
distilbert_base,4494,2.761189
bert_base_c,4494,2.761189
xlm_large,3982,2.446607
xlm_base,3982,2.446607


We can select 20 random nouns from Morphalou vocab to see how they are tokenized differently by different models:

In [17]:
tokenizations = []

In [18]:
word_sample = one_word_nouns.index.to_series().sample(n=20)

In [19]:
for m in models:
    m_tokenizations = []
    model, tokenizer = initiate_model(m['name'])
    for w in word_sample:
        tokenization = [tokenizer.decode(x) for x in tokenizer.encode(w)]
        m_tokenizations.append(tokenization)
    tokenizations.append(m_tokenizations)

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequ

In [21]:
def check_not_stop_tokens(s):
    return (s != '<s>') and (s != '</s>') and (s != '[CLS]') and (s != '[SEP]')

In [27]:
for j in range(len(word_sample)):
    print(f'\nWord: {word_sample[j]}')
    for i in range(len(models)):
        print(f'\tTokenization by {models[i]["name"]}: {list(filter(check_not_stop_tokens, tokenizations[i][j]))}')


Word: crémoirs
	Tokenization by flaubert/flaubert_small_cased: ['cré', 'moi', 'rs']
	Tokenization by flaubert/flaubert_base_uncased: ['cré', 'moi', 'rs']
	Tokenization by flaubert/flaubert_base_cased: ['cré', 'moi', 'rs']
	Tokenization by flaubert/flaubert_large_cased: ['cré', 'moi', 'rs']
	Tokenization by camembert/camembert-base: ['cré', 'm', 'oirs']
	Tokenization by xlm-roberta-large: ['', 'cré', 'mo', 'ir', 's']
	Tokenization by xlm-roberta-base: ['', 'cré', 'mo', 'ir', 's']
	Tokenization by bert-base-multilingual-uncased: ['cr', '##em', '##oir', '##s']
	Tokenization by distilbert-base-multilingual-cased: ['c', '##ré', '##mo', '##irs']
	Tokenization by bert-base-multilingual-cased: ['c', '##ré', '##mo', '##irs']

Word: lasciveté
	Tokenization by flaubert/flaubert_small_cased: ['las', 'ci', 'veté']
	Tokenization by flaubert/flaubert_base_uncased: ['las', 'ci', 'veté']
	Tokenization by flaubert/flaubert_base_cased: ['las', 'ci', 'veté']
	Tokenization by flaubert/flaubert_large_cased

# Verbs 

In [28]:
all_verbs = pd.read_csv('../Data/Morphalou/all_verbs_v2.csv', index_col = 0)

In [29]:
all_verbs

Unnamed: 0_level_0,Number,Tense,Person,Lemma
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
abaissa,singular,simplePast,thirdPerson,abaisser
abaissai,singular,simplePast,firstPerson,abaisser
abaissaient,plural,imperfect,thirdPerson,abaisser
abaissais,singular,imperfect,secondPerson,abaisser
abaissait,singular,imperfect,thirdPerson,abaisser
...,...,...,...,...
œuvreront,plural,future,thirdPerson,œuvrer
œuvrâmes,plural,simplePast,firstPerson,œuvrer
œuvrâtes,plural,simplePast,secondPerson,œuvrer
œuvrèrent,plural,simplePast,thirdPerson,œuvrer


In [30]:
verb_lens = []

In [31]:
for i in range(len(models)):
    verbs_df = pd.read_csv(f'../Data/{models[i]["label"]}/all_verbs_we.csv')
    verb_lens.append(len(verbs_df))

In [32]:
verb_size_comp = pd.DataFrame(index=[m['label'] for m in models])

In [51]:
verb_size_comp['Verb vocab size'] = verb_lens
verb_size_comp['Percentage of Morphalou vocab'] = verb_size_comp['Verb vocab size'] * 100 / len(all_verbs)

In [61]:
verb_size_comp.sort_values(by='Verb vocab size', ascending=False)

Unnamed: 0,Verb vocab size,Percentage of Morphalou vocab
flau_base_u,6377,1.99117
flau_small_c,5425,1.693915
flau_base_c,5425,1.693915
flau_large_c,5425,1.693915
cam_base,3852,1.202758
bert_base_u,2353,0.734706
xlm_large,1233,0.384995
xlm_base,1233,0.384995
distilbert_base,925,0.288824
bert_base_c,925,0.288824


In [36]:
verb_sample = all_verbs.index.to_series().sample(n=20)

In [37]:
verb_tokenizations = []

In [38]:
for m in models:
    m_tokenizations = []
    model, tokenizer = initiate_model(m['name'])
    for w in verb_sample:
        tokenization = [tokenizer.decode(x) for x in tokenizer.encode(w)]
        m_tokenizations.append(tokenization)
    verb_tokenizations.append(m_tokenizations)

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequ

In [39]:
for j in range(len(verb_sample)):
    print(f'\nWord: {verb_sample[j]}')
    for i in range(len(models)):
        print(f'\tTokenization by {models[i]["name"]}: {list(filter(check_not_stop_tokens, verb_tokenizations[i][j]))}')


Word: déparchâtes
	Tokenization by flaubert/flaubert_small_cased: ['dé', 'par', 'châ', 'tes']
	Tokenization by flaubert/flaubert_base_uncased: ['dépar', 'châ', 'tes']
	Tokenization by flaubert/flaubert_base_cased: ['dé', 'par', 'châ', 'tes']
	Tokenization by flaubert/flaubert_large_cased: ['dé', 'par', 'châ', 'tes']
	Tokenization by camembert/camembert-base: ['dép', 'arch', 'ât', 'es']
	Tokenization by xlm-roberta-large: ['dép', 'arch', 'ât', 'es']
	Tokenization by xlm-roberta-base: ['dép', 'arch', 'ât', 'es']
	Tokenization by bert-base-multilingual-uncased: ['de', '##par', '##cha', '##tes']
	Tokenization by distilbert-base-multilingual-cased: ['dé', '##par', '##ch', '##ât', '##es']
	Tokenization by bert-base-multilingual-cased: ['dé', '##par', '##ch', '##ât', '##es']

Word: dépréciés
	Tokenization by flaubert/flaubert_small_cased: ['dépréci', 'és']
	Tokenization by flaubert/flaubert_base_uncased: ['dépréci', 'és']
	Tokenization by flaubert/flaubert_base_cased: ['dépréci', 'és']
	Toke

# Adjectives

In [40]:
all_adjs = pd.read_csv('../Data/Morphalou/all_adjs_v2.csv', index_col = 0)

In [43]:
all_adjs = all_adjs[(all_adjs.Gender != 'invariable') & (all_adjs.Number != 'invariable')]

In [44]:
all_adjs

Unnamed: 0_level_0,Gender,Number,Lemma
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a-humain,masculine,singular,a-humain
a-raciste,masculine,singular,a-raciste
aalénien,masculine,singular,aalénien
aaléniens,masculine,plural,aalénien
aalénienne,feminine,singular,aalénien
...,...,...,...
œstroprogestatives,feminine,plural,œstroprogestatif
œuvé,masculine,singular,œuvé
œuvés,masculine,plural,œuvé
œuvée,feminine,singular,œuvé


Similarly to nouns, we can exclude multi-word adjectives:

In [45]:
all_adjs['len'] = [len(x.split()) for x in all_adjs.index.str.replace('-', ' ')]

In [46]:
one_word_adjs = all_adjs[all_adjs.len == 1]

In [48]:
adj_lens = []

In [49]:
for i in range(len(models)):
    adjs_df = pd.read_csv(f'../Data/{models[i]["label"]}/all_adjs_we.csv')
    adj_lens.append(len(adjs_df))

In [50]:
adj_size_comp = pd.DataFrame(index=[m['label'] for m in models])

In [54]:
adj_size_comp['Adj vocab size'] = adj_lens
adj_size_comp['Percentage of Morphalou vocab'] = adj_size_comp['Adj vocab size'] * 100 / len(one_word_adjs)

In [60]:
adj_size_comp.sort_values(by='Adj vocab size', ascending=False)

Unnamed: 0,Adj vocab size,Percentage of Morphalou vocab
flau_base_u,10362,15.867814
flau_small_c,8797,13.471257
flau_base_c,8797,13.471257
flau_large_c,8797,13.471257
cam_base,6346,9.717926
bert_base_u,2907,4.451625
distilbert_base,1610,2.465468
bert_base_c,1610,2.465468
xlm_large,1387,2.123978
xlm_base,1387,2.123978


In [62]:
adj_sample = one_word_adjs.index.to_series().sample(n=20)

In [63]:
adj_tokenizations = []

In [64]:
for m in models:
    m_tokenizations = []
    model, tokenizer = initiate_model(m['name'])
    for w in adj_sample:
        tokenization = [tokenizer.decode(x) for x in tokenizer.encode(w)]
        m_tokenizations.append(tokenization)
    adj_tokenizations.append(m_tokenizations)

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequ

In [65]:
for j in range(len(adj_sample)):
    print(f'\nWord: {adj_sample[j]}')
    for i in range(len(models)):
        print(f'\tTokenization by {models[i]["name"]}: {list(filter(check_not_stop_tokens, adj_tokenizations[i][j]))}')


Word: chicaneurs
	Tokenization by flaubert/flaubert_small_cased: ['chi', 'can', 'eurs']
	Tokenization by flaubert/flaubert_base_uncased: ['chi', 'can', 'eurs']
	Tokenization by flaubert/flaubert_base_cased: ['chi', 'can', 'eurs']
	Tokenization by flaubert/flaubert_large_cased: ['chi', 'can', 'eurs']
	Tokenization by camembert/camembert-base: ['chic', 'an', 'eurs']
	Tokenization by xlm-roberta-large: ['chica', 'n', 'eurs']
	Tokenization by xlm-roberta-base: ['chica', 'n', 'eurs']
	Tokenization by bert-base-multilingual-uncased: ['chica', '##neur', '##s']
	Tokenization by distilbert-base-multilingual-cased: ['chica', '##neur', '##s']
	Tokenization by bert-base-multilingual-cased: ['chica', '##neur', '##s']

Word: ressuyé
	Tokenization by flaubert/flaubert_small_cased: ['res', 'su', 'yé']
	Tokenization by flaubert/flaubert_base_uncased: ['res', 'su', 'yé']
	Tokenization by flaubert/flaubert_base_cased: ['res', 'su', 'yé']
	Tokenization by flaubert/flaubert_large_cased: ['res', 'su', 'yé'

# Shared vocabulary for all models

As we can see above, the vocabulary sizes vary among different models. Here we want to have a look into the vocabulary shared by models. For this, we will create 2 groups of models: trained on French corpus only and multi-lingual (trained on multiple languages). We will find the common vocabulary for French corpus models and all models, including French and multi-lingual.

In [18]:
french_model_ind = list(range(5))

### Shared nouns

In [35]:
shared_vocab_fr = set()

In [36]:
for i in french_model_ind:
    nouns_df = pd.read_csv(f'../Data/{models[i]["label"]}/all_nouns_we.csv', index_col=0)
    if not shared_fr_vocab:
        shared_vocab_fr.update(nouns_df.index)
    else:
        shared_vocab_fr = shared_fr_vocab.intersection(nouns_df.index)

In [37]:
print(f'Size of shared vocabulary of French models: {len(shared_vocab_fr)}')

Size of shared vocabulary of French models: 9068


Below you can find a random sample of 30 nouns from the shared French vocabulary:

In [38]:
pd.Series(list(shared_vocab_fr)).sample(n=30)

4487           croate
2157            roule
4739           tuerie
3885      réservation
5521            visée
3272           carnet
4255         standard
7232            appui
7690         canicule
2652       opérateurs
8888      psychologie
5168            garni
8533             fret
8868         magasins
6542            égaré
5462           nickel
8999        anecdotes
7499             doré
3180            types
8921       kilomètres
7209    organisateurs
734       capitaliste
8189           vivant
5298        rapproché
7792            corse
6286            lundi
1015          entière
6724        chapitres
7888            water
6997             noix
dtype: object

In [46]:
shared_vocab_all = set()

In [47]:
for i in range(len(models)):
    nouns_df = pd.read_csv(f'../Data/{models[i]["label"]}/all_nouns_we.csv', index_col=0)
    if not shared_vocab_all:
        shared_vocab_all.update(nouns_df.index)
    else:
        shared_vocab_all = shared_vocab_all.intersection(nouns_df.index)

In [50]:
print(f'Size of shared vocabulary of all models: {len(shared_vocab_all)}')

Size of shared vocabulary of all models: 1980


Here is the sample of 30 nouns from the shared vocabulary among all models:

In [51]:
pd.Series(list(shared_vocab_all)).sample(n=30)

685        élément
1078        mentor
1388           etc
80         matière
47          profil
1363      distance
309      documents
1199        guerre
456        conflit
1976         ligne
1855          logo
840            ben
1188         sites
485        licence
1631        impact
1235         basse
1489     Cambridge
871           bras
947       batterie
1376     populaire
1476       culture
616       matériel
1231       patient
1125          More
1468       package
1223           ver
320     résistance
468             TV
1769         crime
1776      variante
dtype: object

### Shared verbs

In [45]:
shared_verbs_fr = set()

In [52]:
for i in french_model_ind:
    verbs_df = pd.read_csv(f'../Data/{models[i]["label"]}/all_verbs_we.csv', index_col=0)
    if not shared_verbs_fr:
        shared_verbs_fr.update(verbs_df.index)
    else:
        shared_verbs_fr = shared_verbs_fr.intersection(verbs_df.index)

In [54]:
print(f'Number of shared verbs by French models: {len(shared_verbs_fr)}')

Number of shared verbs by French models: 3469


In [55]:
pd.Series(list(shared_verbs_fr)).sample(n=30)

1840        parlait
2209      combinant
2742         épousé
199         rajouté
812            rasé
2450          suffi
2282      favoriser
513         fournir
118         pourrai
204         chauffé
1146       révélant
1550        offerts
439      entraînant
2782       apparait
3428       aspirant
3461        enterré
2726       renversé
1062        désirer
1569        serpent
2544         tissus
2843      distribué
1411    sophistiqué
83         comptant
1883       demeurer
345         expulsé
1766        discuté
398          croire
2757        soutenu
3325      défendent
1410     récompensé
dtype: object

In [56]:
shared_verbs_all = set()

In [57]:
for i in range(len(models)):
    verbs_df = pd.read_csv(f'../Data/{models[i]["label"]}/all_verbs_we.csv', index_col=0)
    if not shared_verbs_all:
        shared_verbs_all.update(verbs_df.index)
    else:
        shared_verbs_all = shared_verbs_all.intersection(verbs_df.index)

In [59]:
print(f'Number of shared verbs by all models: {len(shared_verbs_all)}')

Number of shared verbs by all models: 343


In [60]:
pd.Series(list(shared_verbs_all)).sample(n=30)

258         mater
142          veut
63            put
157        arriva
118          aura
196            vu
154      parcours
289      designer
111         lancé
87         agenda
73      rejoindre
208          voit
101       réalisé
322       demanda
32      convertir
33          faire
260    développer
318       contenu
84          carré
7          appelé
94       observer
286        envoyé
238        combat
146    intervenir
161        porter
263       exercer
48          débat
259       avaient
120        partir
1           lança
dtype: object

### Shared adjectives

In [61]:
shared_adj_fr = set()

for i in french_model_ind:
    adj_df = pd.read_csv(f'../Data/{models[i]["label"]}/all_adjs_we.csv', index_col=0)
    if not shared_adj_fr:
        shared_adj_fr.update(adj_df.index)
    else:
        shared_adj_fr = shared_adj_fr.intersection(adj_df.index)

In [62]:
print(f'Number of shared adjectives by French models: {len(shared_adj_fr)}')

Number of shared adjectives by French models: 5651


In [63]:
pd.Series(list(shared_adj_fr)).sample(n=30)

3613            mural
5180        affichant
5489             pris
2076      coéquipiers
971            poussé
4464        agressive
3739        débutants
1108         entrevue
673          modifiés
18      indépendantes
3449       dramatique
5547     spécialisées
3374          réservé
3196           carrée
2843          notable
2735            venus
2371         digitale
3976         secondes
3735          traduit
3470      qualitative
4624      déterminant
700             promu
2132          passive
4201        préalable
5234           promis
688         contenant
2211      esthétiques
4336     impressionné
760           adaptée
2635             brut
dtype: object

In [64]:
shared_adj_all = set()

for i in range(len(models)):
    adj_df = pd.read_csv(f'../Data/{models[i]["label"]}/all_adjs_we.csv', index_col=0)
    if not shared_adj_all:
        shared_adj_all.update(adj_df.index)
    else:
        shared_adj_all = shared_adj_all.intersection(adj_df.index)

In [65]:
print(f'Number of shared adjectives by all models: {len(shared_adj_all)}')

Number of shared adjectives by all models: 794


In [66]:
pd.Series(list(shared_adj_all)).sample(n=30)

88           droits
675          expert
755      nombreuses
511          maître
595          active
475          député
218           sobre
24              pro
141           chair
348           final
203           frais
103       spectacle
726          absent
760          établi
532        militant
543    représentant
290         endroit
747             off
739          propre
699             bon
317          triste
776        probable
463           suivi
752         diffuse
65           perché
412     importantes
254          invité
337     commerciale
244             nul
356       véritable
dtype: object