In [1]:
import sys
sys.path.append('../Util')

In [2]:
from we import initiate_model, get_we, create_we_df

In [3]:
import pandas as pd
import numpy as np
import re
import os

# Load models

The code loads a Transformer model from the Hugging Face library.
The model is expected to be loadable with the `AutoModelForMaskedLM` class of the Hugging Face library, and its tokenizer with `AutoTokenizer`.

This can be confirmed on the model's page in the Hugging Face library (see, for example, [this model's page](https://huggingface.co/flaubert/flaubert_base_uncased)).

In [10]:
# Models to compare. Each entry holds:
#   'name'  - the model identifier used to load the model;
#   'label' - an optional short alias used in the reports, since names can be long.
# When 'label' is absent, the full name is used in the reports instead.
# When 'label' is present, it must not be an empty string.

models = [
    {'name': 'xlm-roberta-large', 'label': 'xlm_large'},
    {'name': 'xlm-roberta-base', 'label': 'xlm_base'},
    {'name': 'bert-base-multilingual-uncased', 'label': 'bert_base_u'},
    {'name': 'distilbert-base-multilingual-cased', 'label': 'distilbert_base'},
    {'name': 'bert-base-multilingual-cased', 'label': 'bert_base_c'},
]

# Build dataset

In our case, we build the dataset by taking all nouns, verbs and adjectives from Morphalou together with their grammatical information.
We then attempt to obtain a word embedding (WE) for each word. If one exists (meaning that the tokenizer encodes the word as a single token), the word is added to the dataset with the following information:
- Word 
- PoS
- Grammatical information if present
- All dimensions of WE

Any word that has two values of the same grammatical characteristic at once (for example, a word that is both masculine and feminine) was excluded from the dataset.

In [5]:
# Verb entries from the Morphalou export; the first CSV column becomes the index.
all_verbs = pd.read_csv(
    '../Data/Morphalou/all_verbs_v2.csv',
    index_col=0,
)

In [6]:
# Noun entries from the Morphalou export; the first CSV column becomes the index.
all_nouns = pd.read_csv(
    '../Data/Morphalou/all_nouns_v2.csv',
    index_col=0,
)

In [7]:
# Adjective entries from the Morphalou export; the first CSV column becomes the index.
all_adjs = pd.read_csv(
    '../Data/Morphalou/all_adjs_v2.csv',
    index_col=0,
)

Build the dataset of WE with features for each model in the list above for Verbs, Nouns and Adjectives.

Runtime per model is ~1-1.5 hours.

In [8]:
# Source word list for every part of speech. The key is reused in the progress
# messages and in the output file name (all_<key>_we.csv).
pos_sources = {'nouns': all_nouns, 'verbs': all_verbs, 'adjs': all_adjs}

for model in models:
    # Folder name: the model's label if present, otherwise its name.
    # Slashes are removed so the value can be used as a single folder name.
    model_label = re.sub('/', '', model.get('label', model['name']))
    file_path = f'../Data/{model_label}'
    # exist_ok=True creates the folder only when needed, without a separate
    # existence check, so the cell can be re-run safely.
    os.makedirs(file_path, exist_ok=True)

    print(f'Initiating model {model["name"]}:')
    m, t = initiate_model(model['name'])
    print('Done')

    # Same steps for each part of speech: build the WE frame, then store it as CSV.
    for pos, words_df in pos_sources.items():
        print(f'\nGenerating WE for {pos}:')
        we_df = create_we_df(m, t, words_df, progress=True)
        out_file = f'{file_path}/all_{pos}_we.csv'
        print(f'\n{pos.capitalize()} WE are being stored in the file: {out_file}')
        we_df.to_csv(out_file)

    print('============================')

Initiating model xlm-roberta-base:
Done

Generating WE for nouns:
....................................................................................................................................................................................
Nouns WE are being stored in the file: ../Data/xlm_base/all_nouns_we.csv

Generating WE for verbs:
.................................................................................................................................................................................................................................................................................................................................
Verbs WE are being stored in the file: ../Data/xlm_base/all_verbs_we.csv

Generating WE for adjs:
..................................................................................................
Adjs WE are being stored in the file: ../Data/xlm_base/all_adjs_we.csv
Initiating model bert-base-multilingual-uncased:
huggingface/tok

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/672M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Done

Generating WE for nouns:
....................................................................................................................................................................................
Nouns WE are being stored in the file: ../Data/bert_base_u/all_nouns_we.csv

Generating WE for verbs:
.................................................................................................................................................................................................................................................................................................................................
Verbs WE are being stored in the file: ../Data/bert_base_u/all_verbs_we.csv

Generating WE for adjs:
..................................................................................................
Adjs WE are being stored in the file: ../Data/bert_base_u/all_adjs_we.csv
Initiating model distilbert-base-multilingual-cased:


Downloading tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Done

Generating WE for nouns:
....................................................................................................................................................................................
Nouns WE are being stored in the file: ../Data/distilbert_base/all_nouns_we.csv

Generating WE for verbs:
.................................................................................................................................................................................................................................................................................................................................
Verbs WE are being stored in the file: ../Data/distilbert_base/all_verbs_we.csv

Generating WE for adjs:
..................................................................................................
Adjs WE are being stored in the file: ../Data/distilbert_base/all_adjs_we.csv
Initiating model bert-base-multilingual-cased:


Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Done

Generating WE for nouns:
....................................................................................................................................................................................
Nouns WE are being stored in the file: ../Data/bert_base_c/all_nouns_we.csv

Generating WE for verbs:
.................................................................................................................................................................................................................................................................................................................................
Verbs WE are being stored in the file: ../Data/bert_base_c/all_verbs_we.csv

Generating WE for adjs:
..................................................................................................
Adjs WE are being stored in the file: ../Data/bert_base_c/all_adjs_we.csv
Initiating model microsoft/mdeberta-v3-base":


HFValidationError: Repo id must use alphanumeric chars or '-', '_', '.', '--' and '..' are forbidden, '-' and '.' cannot start or end the name, max length is 96: 'microsoft/mdeberta-v3-base"'.

# NB !!

Some of the files are too big to be uploaded to GitHub; all the files can be found in the Google Drive [here](https://drive.google.com/drive/folders/10Ea62GRlq4t7bq-nK9tPtYFu0kbCciey?usp=drive_link).

# Corpora sizes

Below you will find the sizes of the WE datasets for each PoS and for each model: each size represents how many unique wordforms have a WE.

In [11]:
# One row per model: the number of rows in each stored WE file.
sizes = []

for model_cfg in models:
    # Folder name: label if present, otherwise the name, with slashes removed.
    model_label = re.sub('/', '', model_cfg.get('label', model_cfg['name']))
    file_path = f'../Data/{model_label}'

    nouns_we = pd.read_csv(f'{file_path}/all_nouns_we.csv', index_col=0)
    verbs_we = pd.read_csv(f'{file_path}/all_verbs_we.csv', index_col=0)
    adjs_we = pd.read_csv(f'{file_path}/all_adjs_we.csv', index_col=0)

    size_row = {
        'Model': model_label,
        'Nouns': len(nouns_we),
        'Verbs': len(verbs_we),
        'Adjs': len(adjs_we),
    }
    sizes.append(size_row)

sizes_df = pd.DataFrame(sizes)

sizes_df

Unnamed: 0,Model,Nouns,Verbs,Adjs
0,xlm_large,3982,1233,1387
1,xlm_base,3982,1233,1387
2,bert_base_u,6982,2353,2907
3,distilbert_base,4494,925,1610
4,bert_base_c,4494,925,1610


# A look into the datasets

In [12]:
# Pick the three checkpoints by their position in `models`:
# index 1 -> XLM-RoBERTa base, last entry -> multilingual BERT cased,
# index 2 -> multilingual BERT uncased.
xlm_name = models[1]['name']
bert_cased_name = models[-1]['name']
bert_uncased_name = models[2]['name']

xlm_model, xlm_tokenizer = initiate_model(xlm_name)
bc_model, bc_tokenizer = initiate_model(bert_cased_name)
bu_model, bu_tokenizer = initiate_model(bert_uncased_name)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequ

In [14]:
# Labels taken from positions 1, -1 and 2 of `models`, in that order.
tested_models = [models[position]['label'] for position in (1, -1, 2)]

In [15]:
# Sample French wordforms used to compare how each tokenizer splits a word.
test_words = [
    'marquante',
    'imparfaite',
    'déclassement',
    'postérieurs',
    'absorbent',
    'accorda',
    'accrédité',
    'humilité',
    'dures',
    'brune',
    'reçoit',
    'rétroaction',
    'niçois',
    'inventée',
    'cutanées',
    'gaieté',
    'polémiques',
    'approuver',
    'rencontrait',
    'luttent',
    'collent',
    'démolir',
]

In [18]:
def _token_pieces(tokenizer, word):
    """Encode `word` with `tokenizer` and decode each resulting id on its own."""
    return [tokenizer.decode(token_id) for token_id in tokenizer.encode(word)]


# Show, for every sample word, the pieces produced by each of the three tokenizers.
for word in test_words:
    xlm_pieces = _token_pieces(xlm_tokenizer, word)
    bc_pieces = _token_pieces(bc_tokenizer, word)
    bu_pieces = _token_pieces(bu_tokenizer, word)
    print(f"""Word: {word}
    XLM tokenization: {xlm_pieces}
    BERT-cased tokenization: {bc_pieces}
    BERT-uncased tokenization: {bu_pieces}\n\n""")

Word: marquante
    XLM tokenization: ['<s>', 'mar', 'quant', 'e', '</s>']
    BERT-cased tokenization: ['[CLS]', 'mar', '##quant', '##e', '[SEP]']
    BERT-uncased tokenization: ['[CLS]', 'mar', '##quant', '##e', '[SEP]']


Word: imparfaite
    XLM tokenization: ['<s>', 'impar', 'fa', 'ite', '</s>']
    BERT-cased tokenization: ['[CLS]', 'im', '##par', '##fait', '##e', '[SEP]']
    BERT-uncased tokenization: ['[CLS]', 'imp', '##ar', '##fait', '##e', '[SEP]']


Word: déclassement
    XLM tokenization: ['<s>', 'dé', 'class', 'ement', '</s>']
    BERT-cased tokenization: ['[CLS]', 'dé', '##cla', '##ssement', '[SEP]']
    BERT-uncased tokenization: ['[CLS]', 'dec', '##lasse', '##ment', '[SEP]']


Word: postérieurs
    XLM tokenization: ['<s>', 'post', 'érie', 'urs', '</s>']
    BERT-cased tokenization: ['[CLS]', 'post', '##érieur', '##s', '[SEP]']
    BERT-uncased tokenization: ['[CLS]', 'poster', '##ieu', '##rs', '[SEP]']


Word: absorbent
    XLM tokenization: ['<s>', 'absorb', 'ent', '

In [23]:
# models[-3] is the third entry from the end of `models`.
# NOTE(review): with the five-entry list this is the same entry as models[2],
# which is already loaded as bu_model / bu_tokenizer - confirm the reload is needed.
selected_name = models[-3]['name']
model, tokenizer = initiate_model(selected_name)

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Creating unique WE dataset

Some words can be interpreted as several parts of speech, so to avoid ambiguity we exclude these words and keep only the words that are uniquely interpreted as one PoS.

In [19]:
for model_cfg in models:
    # Resolve the data folder the same way as the cell that wrote the WE files:
    # label if present, otherwise the name, with slashes removed. Indexing
    # ['label'] directly would raise KeyError for a model without a label.
    model_label = re.sub('/', '', model_cfg.get('label', model_cfg['name']))
    model_dir = f'../Data/{model_label}'

    # Load the per-PoS WE files and tag every row with its part of speech.
    nouns_df = pd.read_csv(f'{model_dir}/all_nouns_we.csv', index_col=0)
    nouns_df['POS'] = 'NOUN'

    verbs_df = pd.read_csv(f'{model_dir}/all_verbs_we.csv', index_col=0)
    verbs_df['POS'] = 'VERB'

    adjs_df = pd.read_csv(f'{model_dir}/all_adjs_we.csv', index_col=0)
    adjs_df['POS'] = 'ADJ'

    combined_df = pd.concat([nouns_df, verbs_df, adjs_df])

    # keep=False marks every occurrence of a repeated index value, so only
    # index values that occur exactly once in the combined frame are kept.
    is_repeated = combined_df.index.duplicated(keep=False)
    combined_df = combined_df[~is_repeated]

    combined_df.to_csv(f'{model_dir}/all_unique_pos_we.csv')