In [1]:
import sys
# Put the sibling ../Util folder on the import path.
# NOTE(review): presumably this is where the `we` helper module imported in the next cell lives — confirm.
sys.path.append('../Util')

In [2]:
from we import initiate_model, get_we, create_we_df

In [3]:
import pandas as pd
import numpy as np
import re
import os

# Load models

The code will load a Transformer model from the Hugging Face library.
It is expected that the model can be loaded using the `AutoModelForMaskedLM` class of the Hugging Face library and that its tokenizer can be initiated using `AutoTokenizer`.

This can be confirmed on [the model's page](https://huggingface.co/flaubert/flaubert_base_uncased) in the Hugging Face library.

In the code below we will be loading the following models:
- FlauBERT with 512 dimensions (trained on cased data)
- FlauBERT with 768 dimensions (trained on uncased data)
- FlauBERT with 768 dimensions (trained on cased data)
- FlauBERT with 1024 dimensions (trained on cased data)
- CamemBERT with 768 dimensions

In [4]:
# Models to compare.
# 'label' is an optional short alias for the (often long) model name and is used to reference the model in the reports.
# When an entry has no 'label', its full 'name' is used in the reports instead.
# When 'label' is given, it must not be an empty string.

models = [
    {'name': 'flaubert/flaubert_small_cased', 'label': 'flau_small_c'},
    {'name': 'flaubert/flaubert_base_uncased', 'label': 'flau_base_u'},
    {'name': 'flaubert/flaubert_base_cased', 'label': 'flau_base_c'},
    {'name': 'flaubert/flaubert_large_cased', 'label': 'flau_large_c'},
    {'name': 'camembert/camembert-base', 'label': 'cam_base'},
]

# Build dataset

In our case we build the dataset by getting all nouns, verbs and adjectives from Morphalou together with their grammatical information.
Then we attempt to obtain a word embedding for each of the words. If it exists (meaning that tokenizer tokenizes the word as one token), it's added to the dataset with the following information:
- Word 
- PoS
- Grammatical information if present
- All dimensions of WE

If a word carries two conflicting grammatical characteristics at the same time (for example, it is both masculine and feminine), it is excluded from the dataset.

In [31]:
# Morphalou verb list; the first CSV column is used as the index.
all_verbs = pd.read_csv('../Data/Morphalou/all_verbs_v2.csv', index_col=0)

In [32]:
# Morphalou noun list; the first CSV column is used as the index.
all_nouns = pd.read_csv('../Data/Morphalou/all_nouns_v2.csv', index_col=0)

In [33]:
# Morphalou adjective list; the first CSV column is used as the index.
all_adjs = pd.read_csv('../Data/Morphalou/all_adjs_v2.csv', index_col=0)

Build the dataset of WE with features for each model in the list above for Verbs, Nouns and Adjectives.

Runtime per model is ~1-1.5 hours.

In [35]:
# Word lists to embed, paired with the PoS name used in the log lines and output file names.
pos_word_lists = (('nouns', all_nouns), ('verbs', all_verbs), ('adjs', all_adjs))

for model in models:
    # Get the label of the model if it's present, otherwise get its name.
    # Any slashes are removed so the result can be used as a folder name.
    model_label = re.sub('/', '', model.get('label', model['name']))
    file_path = f'../Data/{model_label}'
    # Create the storage folder; exist_ok makes this a no-op when it is already there
    # and avoids the check-then-create race of a separate os.path.exists() test.
    os.makedirs(file_path, exist_ok=True)

    print(f'Initiating model {model["name"]}:')
    we_model, we_tokenizer = initiate_model(model['name'])
    print('Done')

    # Same three steps for every PoS: build the WE dataframe, report the target file, write it.
    for pos, word_list in pos_word_lists:
        out_file = f'{file_path}/all_{pos}_we.csv'
        print(f'\nGenerating WE for {pos}:')
        we_df = create_we_df(we_model, we_tokenizer, word_list, progress=True)
        print(f'\n{pos.capitalize()} WE are being stored in the file: {out_file}')
        we_df.to_csv(out_file)

    print('============================')

Initiating model flaubert/flaubert_small_cased:
Done

Generating WE for nouns:
....................................................................................................................................................................................
Nouns WE are being stored in the file: ../Data/flau_small_c/all_nouns_we.csv

Generating WE for verbs:
.................................................................................................................................................................................................................................................................................................................................
Verbs WE are being stored in the file: ../Data/flau_small_c/all_verbs_we.csv

Generating WE for adjs:
..................................................................................................
Adjs WE are being stored in the file: ../Data/flau_small_c/all_adjs_we.csv
Initiating model flaubert/flaubert_base

# NB !!

Some of the files are too big to be uploaded to Github, all the files can be found in the Google Drive [here](https://drive.google.com/drive/folders/10Ea62GRlq4t7bq-nK9tPtYFu0kbCciey?usp=drive_link).

# Corpora sizes

Below you will find the sizes of WE datasets for each PoS and for each model: it represents how many unique wordforms have a WE.

In [36]:
# One row per model: how many wordforms have a WE in each per-PoS file.
sizes = []

for model in models:
    # Same folder resolution as in the generation cell: optional label, slashes removed.
    model_label = re.sub('/', '', model.get('label', model['name']))
    file_path = f'../Data/{model_label}'

    row = {'Model': model_label}
    for column, pos in (('Nouns', 'nouns'), ('Verbs', 'verbs'), ('Adjs', 'adjs')):
        # Only the number of rows is needed, so parse just the first column
        # instead of every embedding dimension of these large files.
        row[column] = len(pd.read_csv(f'{file_path}/all_{pos}_we.csv', usecols=[0]))
    sizes.append(row)

sizes_df = pd.DataFrame(sizes)

sizes_df

Unnamed: 0,Model,Nouns,Verbs,Adjs
0,flau_small_c,14091,5425,8797
1,flau_base_u,17272,6377,10362
2,flau_base_c,14091,5425,8797
3,flau_large_c,14091,5425,8797
4,cam_base,9894,3852,6346


# A look into the datasets

As we can notice from the sizes table above, the number of nouns, verbs and adjectives recognized by cased FlauBERT models stays the same for any model size.

We can see if these are the same words.

In [8]:
# WE datasets of the three cased FlauBERT models (s = small, b = base, l = large),
# one frame per PoS; the first CSV column is used as the index.
# The large-model frames are read from the flau_large_c folder, so the comparisons
# below really compare three different models.
f_s_adj = pd.read_csv('../Data/flau_small_c/all_adjs_we.csv', index_col=0)
f_b_adj = pd.read_csv('../Data/flau_base_c/all_adjs_we.csv', index_col=0)
f_l_adj = pd.read_csv('../Data/flau_large_c/all_adjs_we.csv', index_col=0)

f_s_noun = pd.read_csv('../Data/flau_small_c/all_nouns_we.csv', index_col=0)
f_b_noun = pd.read_csv('../Data/flau_base_c/all_nouns_we.csv', index_col=0)
f_l_noun = pd.read_csv('../Data/flau_large_c/all_nouns_we.csv', index_col=0)

f_s_verb = pd.read_csv('../Data/flau_small_c/all_verbs_we.csv', index_col=0)
f_b_verb = pd.read_csv('../Data/flau_base_c/all_verbs_we.csv', index_col=0)
f_l_verb = pd.read_csv('../Data/flau_large_c/all_verbs_we.csv', index_col=0)

Checking if the words (the index column of the dataset) are the same for all adjectives:

In [38]:
# Index.equals checks elements and their order, and returns False (rather than raising)
# when the two indexes have different lengths.
f_s_adj.index.equals(f_b_adj.index), f_s_adj.index.equals(f_l_adj.index)

(True, True)

And for nouns:

In [39]:
# Index.equals checks elements and their order, and returns False (rather than raising)
# when the two indexes have different lengths.
f_s_noun.index.equals(f_b_noun.index), f_s_noun.index.equals(f_l_noun.index)

(True, True)

And for verbs:

In [40]:
# Index.equals checks elements and their order, and returns False (rather than raising)
# when the two indexes have different lengths.
f_s_verb.index.equals(f_b_verb.index), f_s_verb.index.equals(f_l_verb.index)

(True, True)

_______________
We can also have a look at which words the models trained on cased data don't recognize but the model trained on uncased data does:

In [6]:
# WE datasets of the uncased FlauBERT base model, one frame per PoS (first CSV column as index).
f_bu_adj, f_bu_noun, f_bu_verb = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../Data/flau_base_u/all_adjs_we.csv',
        '../Data/flau_base_u/all_nouns_we.csv',
        '../Data/flau_base_u/all_verbs_we.csv',
    )
)

For adjectives:

In [9]:
# Adjectives present in the uncased base dataset but missing from the cased base one.
set(f_bu_adj.index).difference(f_b_adj.index)

{'éprouvant',
 'franco',
 'bronzant',
 'fleuries',
 'fleuriste',
 'respirant',
 'défendues',
 'maltais',
 'charpentier',
 'ténébreux',
 'articulaires',
 'orgueilleux',
 'ancrés',
 'vikings',
 'sulfureux',
 'arrachés',
 'référencé',
 'marquante',
 'gazier',
 'bobo',
 'jouissant',
 'intestins',
 'imparfaite',
 'chaussé',
 'automates',
 'enfantin',
 'joyeuses',
 'curieuses',
 'thaïlandais',
 'intemporelle',
 'réflecteur',
 'versante',
 'dénommés',
 'résidentielles',
 'dominical',
 'saphir',
 'armateurs',
 'pastoral',
 'désespérés',
 'planes',
 'scout',
 'itinérant',
 'baissée',
 'bruni',
 'haïtiens',
 'amé',
 'xénophobes',
 'allégé',
 'mérités',
 'alpins',
 'jacobins',
 'malicieux',
 'investies',
 'moelleuse',
 'érudits',
 'épars',
 'nulles',
 'imposants',
 'ferreux',
 'conductrices',
 'calculatrice',
 'comptabilisées',
 'effrayés',
 'résolues',
 'corrompue',
 'dissident',
 'optimum',
 'gallo',
 'inondé',
 'probants',
 'fabien',
 'mouillés',
 'racial',
 'assimilée',
 'ardue',
 'tibétains'

We can see relatively less frequent adjectives being recognized as one token by FlauBERT base uncased vs the cased model. We can compare the tokenization of several of the adjectives by the models:

In [11]:
# Positional lookups into `models`: index 1 is the 'flau_base_u' (uncased) entry and
# index 2 the 'flau_base_c' (cased) entry; keep these in sync if that list is reordered.
f_bu_model, f_bu_tokenizer = initiate_model(models[1]['name'])
f_b_model, f_b_tokenizer = initiate_model(models[2]['name'])

Tokenization of `marquante` by the cased model:

In [14]:
# Decode every token id separately to show how the cased tokenizer segments the word.
[f_b_tokenizer.decode(token_id) for token_id in f_b_tokenizer.encode('marquante')]

['<s>', 'marqu', 'ante', '</s>']

Tokenization of `marquante` by the uncased model:

In [17]:
# Decode every token id separately to show how the uncased tokenizer segments the word.
[f_bu_tokenizer.decode(token_id) for token_id in f_bu_tokenizer.encode('marquante')]

['<s>', 'marquante', '</s>']

Tokenization of `imparfaite` by the cased model:

In [16]:
# Decode every token id separately to show how the cased tokenizer segments the word.
[f_b_tokenizer.decode(token_id) for token_id in f_b_tokenizer.encode('imparfaite')]

['<s>', 'im', 'parfaite', '</s>']

Tokenization of `imparfaite` by the uncased model:

In [18]:
# Decode every token id separately to show how the uncased tokenizer segments the word.
[f_bu_tokenizer.decode(token_id) for token_id in f_bu_tokenizer.encode('imparfaite')]

['<s>', 'imparfaite', '</s>']

We can repeat the same for nouns. We can notice among them some potentially foreign words like `irish`, `trust`, `skate` and others recognized as one token by the uncased model but not by the cased one.

We can have a look at different tokenization of nouns in the examples below.

In [43]:
# Nouns present in the uncased base dataset but missing from the cased base one.
set(f_bu_noun.index).difference(f_b_noun.index)

{'Saint-Père',
 'nanas',
 'bruges',
 'postérieurs',
 'riot',
 'brides',
 'rétribution',
 'dealer',
 'gade',
 'Lombard',
 'assimilées',
 'thaïlandais',
 'fracturation',
 'déclassement',
 'trust',
 'rica',
 'pistons',
 'épiscopal',
 'autrichiens',
 'psychotropes',
 'poirier',
 'skate',
 'devin',
 'théoriciens',
 'catéchisme',
 'oraison',
 'fragrance',
 'bouddha',
 'atmosphères',
 'bambins',
 'hamburger',
 'saint-germain',
 'proof',
 'irish',
 'obédience',
 'malhonnête',
 'oxford',
 'totem',
 'status',
 'friche',
 'politicienne',
 'confesse',
 'collet',
 'convoitises',
 'saint-esprit',
 'Yougoslave',
 'rift',
 'gates',
 'truffes',
 'Tutsi',
 'Indienne',
 'pitre',
 'synode',
 'celtes',
 'balistiques',
 'flottants',
 'baptiste',
 'cadastre',
 'vulgarité',
 'liberty',
 'opérées',
 'roquefort',
 'monochrome',
 'jasmin',
 'redondance',
 'gaule',
 'commandeur',
 'dissémination',
 'infortune',
 'Béninois',
 'volante',
 'anode',
 'rush',
 'jacobs',
 'tarot',
 'paf',
 'battement',
 'parapente',
 '

Tokenization of `déclassement` by the FlauBERT cased model:

In [20]:
# Decode every token id separately to show how the cased tokenizer segments the word.
[f_b_tokenizer.decode(token_id) for token_id in f_b_tokenizer.encode('déclassement')]

['<s>', 'décl', 'assement', '</s>']

Tokenization of `déclassement` by the uncased model:

In [21]:
# Decode every token id separately to show how the uncased tokenizer segments the word.
[f_bu_tokenizer.decode(token_id) for token_id in f_bu_tokenizer.encode('déclassement')]

['<s>', 'déclassement', '</s>']

Tokenization of `postérieurs` by the cased model:

In [22]:
# Decode every token id separately to show how the cased tokenizer segments the word.
[f_b_tokenizer.decode(token_id) for token_id in f_b_tokenizer.encode('postérieurs')]

['<s>', 'post', 'érieurs', '</s>']

Tokenization of `postérieurs` by the uncased model:

In [23]:
# Decode every token id separately to show how the uncased tokenizer segments the word.
[f_bu_tokenizer.decode(token_id) for token_id in f_bu_tokenizer.encode('postérieurs')]

['<s>', 'postérieurs', '</s>']



And now we can check the differences between the models for verbs:

In [44]:
# Verbs present in the uncased base dataset but missing from the cased base one.
set(f_bu_verb.index).difference(f_b_verb.index)

{'abandonna',
 'aboutira',
 'abritait',
 'abroger',
 'abrogé',
 'absorbent',
 'abstraits',
 'abîmé',
 'accablé',
 'accidenté',
 'accompagnaient',
 'accompagneront',
 'accomplissent',
 'accorda',
 'accordera',
 'accrochent',
 'accroitre',
 'accrédité',
 'accrédités',
 'accueillit',
 'accédant',
 'acharnés',
 'acheminé',
 'acheminés',
 'acheva',
 'acquit',
 'actionné',
 'activant',
 'adjoindre',
 'adorait',
 'adossé',
 'affichera',
 'agacé',
 'agendas',
 'agitant',
 'agréer',
 'aidait',
 'ajouterai',
 'alarmant',
 'allient',
 'allongés',
 'allumés',
 'allégé',
 'alternent',
 'altéré',
 'amenait',
 'amender',
 'amorti',
 'amputé',
 'amusait',
 'améliorera',
 'analysent',
 'ancrés',
 'annonçaient',
 'anéanti',
 'apercevant',
 'aperçoivent',
 'appartiendra',
 'appliquerons',
 'apposer',
 'apposé',
 'apprises',
 'approuvant',
 'appréciait',
 'appréciera',
 'appréhendé',
 'aras',
 'argumenter',
 'arpenter',
 'arrachés',
 'arrangeant',
 'arriverez',
 'articulent',
 'articulés',
 'aréna',
 'ass

Tokenization of `absorbent` (first by the cased model, then by the uncased model):

In [24]:
# Decode every token id separately to show how the cased tokenizer segments the word.
[f_b_tokenizer.decode(token_id) for token_id in f_b_tokenizer.encode('absorbent')]

['<s>', 'absorb', 'ent', '</s>']

In [25]:
# Decode every token id separately to show how the uncased tokenizer segments the word.
[f_bu_tokenizer.decode(token_id) for token_id in f_bu_tokenizer.encode('absorbent')]

['<s>', 'absorbent', '</s>']

Tokenization of `accorda`:

In [29]:
# Decode every token id separately to show how the cased tokenizer segments the word.
[f_b_tokenizer.decode(token_id) for token_id in f_b_tokenizer.encode('accorda')]

['<s>', 'accor', 'da', '</s>']

In [30]:
# Decode every token id separately to show how the uncased tokenizer segments the word.
[f_bu_tokenizer.decode(token_id) for token_id in f_bu_tokenizer.encode('accorda')]

['<s>', 'accorda', '</s>']

Tokenization of `accrédité`:

In [32]:
# Decode every token id separately to show how the cased tokenizer segments the word.
[f_b_tokenizer.decode(token_id) for token_id in f_b_tokenizer.encode('accrédité')]

['<s>', 'ac', 'crédité', '</s>']

In [33]:
# Decode every token id separately to show how the uncased tokenizer segments the word.
[f_bu_tokenizer.decode(token_id) for token_id in f_bu_tokenizer.encode('accrédité')]

['<s>', 'accrédité', '</s>']

______________________
We can also notice that the FlauBERT models (cased and uncased) recognize more words than the CamemBERT model.

We can have a look at what words FlauBERT base-cased model recognizes that CamemBERT doesn't:

In [5]:
# WE datasets of the CamemBERT base model, one frame per PoS (first CSV column as index).
c_adj, c_noun, c_verb = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../Data/cam_base/all_adjs_we.csv',
        '../Data/cam_base/all_nouns_we.csv',
        '../Data/cam_base/all_verbs_we.csv',
    )
)

Below is the list of adjectives recognized as one token by FlauBERT base but recognized as several tokens by CamemBERT base:

In [46]:
# Adjectives present in the cased FlauBERT base dataset but missing from the CamemBERT one.
set(f_b_adj.index).difference(c_adj.index)

{'idéologiques',
 'oubliées',
 'économistes',
 'poursuivies',
 'brunes',
 'traducteurs',
 'tunisiens',
 'dures',
 'dix-sept',
 'anglo-saxons',
 'arithmétique',
 'surveillée',
 'multipliée',
 'migrateurs',
 'inventée',
 'niçois',
 'avides',
 'antérieures',
 'cutanées',
 'dix-huitième',
 'moratoire',
 'postés',
 'solidaires',
 'paritaire',
 'réalistes',
 'socio-économiques',
 'estimés',
 'médiévales',
 'nets',
 'australiens',
 'avenant',
 'échangées',
 'prenante',
 'gore',
 'rapportées',
 'rapportée',
 'constitutifs',
 'tristes',
 'révisées',
 'tueurs',
 'acceptables',
 'méditerranéens',
 'jardiniers',
 'mobilières',
 'affluents',
 'chouettes',
 'prescrites',
 'quantitatives',
 'bibliques',
 'signalés',
 'volatile',
 'patriotes',
 'suivies',
 'épuisée',
 'controversée',
 'autobiographique',
 'réalisateurs',
 'gastronomiques',
 'trompée',
 'dirigées',
 'abandonnées',
 'cycliques',
 'polémiques',
 'fortifiée',
 'intergouvernementales',
 'pareils',
 'espérée',
 'communiqués',
 'détachée',
 

Similarly as for cased and uncased models we can compare how CamemBERT tokenizes some of the adjectives from the list:

In [34]:
# Positional lookup: the last entry of `models` is the CamemBERT base model ('cam_base');
# keep this in sync if that list is reordered or extended.
cam_model, cam_tokenizer = initiate_model(models[-1]['name'])

In [35]:
# Decode every token id separately to show how the CamemBERT tokenizer segments the word.
[cam_tokenizer.decode(token_id) for token_id in cam_tokenizer.encode('dures')]

['<s>', 'dure', 's', '</s>']

In [36]:
# Decode every token id separately to show how the CamemBERT tokenizer segments the word.
[cam_tokenizer.decode(token_id) for token_id in cam_tokenizer.encode('brunes')]

['<s>', 'brune', 's', '</s>']

In [37]:
# Decode every token id separately to show how the CamemBERT tokenizer segments the word.
[cam_tokenizer.decode(token_id) for token_id in cam_tokenizer.encode('niçois')]

['<s>', 'ni', 'çois', '</s>']

In [38]:
# Decode every token id separately to show how the CamemBERT tokenizer segments the word.
[cam_tokenizer.decode(token_id) for token_id in cam_tokenizer.encode('inventée')]

['<s>', 'inventé', 'e', '</s>']

In [39]:
# Decode every token id separately to show how the CamemBERT tokenizer segments the word.
[cam_tokenizer.decode(token_id) for token_id in cam_tokenizer.encode('cutanées')]

['<s>', 'cutanée', 's', '</s>']

We can repeat the comparison of the FlauBERT base and CamemBERT base models for nouns. Below is the list of nouns recognized as one token by FlauBERT only.

In [47]:
# Nouns present in the cased FlauBERT base dataset but missing from the CamemBERT one.
set(f_b_noun.index).difference(c_noun.index)

{'noyaux',
 'disparitions',
 'humilité',
 'substrats',
 'mite',
 'rhinocéros',
 'dures',
 'amnistie',
 'anglo-saxons',
 'improvisation',
 'rétroaction',
 'élans',
 'NDLR',
 'amplitude',
 'idoles',
 'révisions',
 'Danois',
 'moratoire',
 'porcs',
 'non-discrimination',
 'réalistes',
 'potage',
 'sir',
 'nets',
 'étain',
 'tueurs',
 'chouettes',
 'canards',
 'bannières',
 'gaieté',
 'montages',
 'projecteurs',
 'corollaire',
 'IUT',
 'MM.',
 'cheikh',
 'polémiques',
 'anticipation',
 'entrepôts',
 'ais',
 'lady',
 'vendanges',
 'communiqués',
 'détachée',
 'filiales',
 'individualité',
 'high-tech',
 'édifices',
 'subtilités',
 'monologue',
 'rationalisation',
 'porte-parole',
 'professionnalisme',
 'interception',
 'populistes',
 'rotor',
 'blancheur',
 'plaines',
 'èche',
 'nef',
 'bordé',
 'jihadiste',
 'manoeuvres',
 'tam',
 'marginalisation',
 'dinde',
 'finalisation',
 'pollutions',
 'rainures',
 'artefacts',
 'interconnexion',
 'intimidation',
 'embarquement',
 'sieurs',
 'tifs',


CamemBERT tokenization of nouns:

In [41]:
# Decode every token id separately to show how the CamemBERT tokenizer segments the word.
[cam_tokenizer.decode(token_id) for token_id in cam_tokenizer.encode('humilité')]

['<s>', '', 'humilité', '</s>']

In [42]:
# Decode every token id separately to show how the CamemBERT tokenizer segments the word.
[cam_tokenizer.decode(token_id) for token_id in cam_tokenizer.encode('rétroaction')]

['<s>', 'rétro', 'action', '</s>']

In [43]:
# Decode every token id separately to show how the CamemBERT tokenizer segments the word.
[cam_tokenizer.decode(token_id) for token_id in cam_tokenizer.encode('gaieté')]

['<s>', 'ga', 'ie', 'té', '</s>']

In [44]:
# Decode every token id separately to show how the CamemBERT tokenizer segments the word.
[cam_tokenizer.decode(token_id) for token_id in cam_tokenizer.encode('polémiques')]

['<s>', 'polémique', 's', '</s>']

We can repeat the same experiment for verbs (below is the list of verbs recognized as one token by FlauBERT only):

In [48]:
# Verbs present in the cased FlauBERT base dataset but missing from the CamemBERT one.
set(f_b_verb.index).difference(c_verb.index)

{'revoit',
 'rencontrait',
 'visait',
 'couronner',
 'utilisaient',
 'affichait',
 'aimaient',
 'approuver',
 'cachant',
 'approchait',
 'conférer',
 'rayer',
 'étudient',
 'marquera',
 'paraîtra',
 'adressent',
 'appliquent',
 'postés',
 'interrompt',
 'desservi',
 'alourdir',
 'estimés',
 'occupera',
 'approprier',
 'avenant',
 'statué',
 'luttent',
 'choir',
 'partait',
 'garnir',
 'signalés',
 'collent',
 'espéraient',
 'cessait',
 'démolir',
 'éleva',
 'assurera',
 'occupa',
 'couter',
 'gagnait',
 'ressusciter',
 'briguer',
 'embarquer',
 'ressourcer',
 'reviendront',
 'communiqués',
 'abritent',
 'projeter',
 'obtenant',
 'demeura',
 'bénéficierez',
 'porta',
 'détaillés',
 'denté',
 'poivrer',
 'dispenser',
 'réservant',
 'orant',
 'conservera',
 'nécessitera',
 'tiller',
 'transférés',
 'dissiper',
 'tisés',
 'engagent',
 'redouter',
 'observent',
 'recourt',
 'remets',
 'déchiffrer',
 'courait',
 'procédant',
 'cessa',
 'forçant',
 'prêtant',
 'entraînent',
 'exécutés',
 'jet

In [45]:
# Decode every token id separately to show how the CamemBERT tokenizer segments the word.
[cam_tokenizer.decode(token_id) for token_id in cam_tokenizer.encode('approuver')]

['<s>', '', 'approuver', '</s>']

In [46]:
# Decode every token id separately to show how the CamemBERT tokenizer segments the word.
[cam_tokenizer.decode(token_id) for token_id in cam_tokenizer.encode('rencontrait')]

['<s>', 'rencontr', 'ait', '</s>']

In [47]:
# Decode every token id separately to show how the CamemBERT tokenizer segments the word.
[cam_tokenizer.decode(token_id) for token_id in cam_tokenizer.encode('luttent')]

['<s>', 'lutte', 'nt', '</s>']

In [48]:
# Decode every token id separately to show how the CamemBERT tokenizer segments the word.
[cam_tokenizer.decode(token_id) for token_id in cam_tokenizer.encode('collent')]

['<s>', 'colle', 'nt', '</s>']

In [49]:
# Decode every token id separately to show how the CamemBERT tokenizer segments the word.
[cam_tokenizer.decode(token_id) for token_id in cam_tokenizer.encode('démolir')]

['<s>', 'démoli', 'r', '</s>']

# Creating unique WE dataset

Some words can be interpreted as several parts of speech, so to avoid ambiguity we will exclude these words and only keep words that are uniquely interpreted as one PoS.

In [49]:
# Per-PoS file name part paired with the tag written to the POS column.
pos_tags = (('nouns', 'NOUN'), ('verbs', 'VERB'), ('adjs', 'ADJ'))

for model in models:
    # Resolve the folder the same way as the cells that wrote the per-PoS files:
    # the label is optional, so fall back to the model name with slashes removed.
    model_label = re.sub('/', '', model.get('label', model['name']))
    file_path = f'../Data/{model_label}'

    # Stack nouns, verbs and adjectives, tagging each row with the PoS of the file it came from.
    combined_df = pd.concat([
        pd.read_csv(f'{file_path}/all_{pos}_we.csv', index_col=0).assign(POS=tag)
        for pos, tag in pos_tags
    ])

    # keep=False marks every occurrence of a word that appears more than once,
    # so only words occurring exactly once in the combined data survive the filter.
    combined_df = combined_df[~combined_df.index.duplicated(keep=False)]

    combined_df.to_csv(f'{file_path}/all_unique_pos_we.csv')