In [1]:
import sys
sys.path.append('../Util')

In [2]:
from we import initiate_model, get_we, create_we_df

In [3]:
import pandas as pd
import numpy as np
import re
import os

# Load models

The code will load a Transformer model from Hugging Face library.
It is expected that the model can be loaded using `AutoModelForMaskedLM` function of the Hugging Face library and its tokenizer can be initiated using `AutoTokenizer`.

This can be confirmed on [the model's page](https://huggingface.co/flaubert/flaubert_base_uncased) in the Hugging Face library.

In [4]:
# List of all models to compare
# Since model names can be quite long, label can be used to reference it in the reports. 
# If label is not present in the model object, its full name will be used in the reports instead.
# If label is present in the object, it can't be an empty string.

models = [
    {
        'name': 'flaubert/flaubert_small_cased',
        'label': 'flau_small_c'
    },
    
        'name': 'flaubert/flaubert_base_uncased', 
        'label': 'flau_base_u'

    },
    {
        'name': 'flaubert/flaubert_base_cased',
        'label': 'flau_base_c'
    },
    {
        'name': 'flaubert/flaubert_large_cased',
        'label': 'flau_large_c'
    },
    {
        'name': 'camembert-base',
        'label': 'cam_base'
    }
    
]

# Build dataset

In our case we build dataset by getting all nouns, verbs and adverbs from Morphalou and their grammatical information.
Then we attempt to obtain a word embedding for each of the words. If it exists (meaning that tokenizer tokenizes the word as one token), it's added to the dataset with the following information:
- Word 
- PoS
- Grammatical information if present
- All dimensions of WE

If there is any word that is both masculine and feminine, or other 2 grammatical characteristics at the same time, it was excluded from the dataset. 

In [5]:
all_verbs = pd.read_csv('../Data/Morphalou/all_verbs_v2.csv', index_col = 0)

In [6]:
all_nouns = pd.read_csv('../Data/Morphalou/all_nouns_v2.csv', index_col = 0)

In [7]:
all_adjs = pd.read_csv('../Data/Morphalou/all_adjs_v2.csv', index_col = 0)

Build the dataset of WE with features for each model in the list above for Verbs, Nouns and Adjectives.

Runtime per model is ~1-1.5 hour.

In [9]:
for model in models[]:
    # Get the label of the model if it's present, otherwise get its name.
    # Any slashes in the names are replaced with empty strings to work with file saving.
    model_label = re.sub('/', '', model.get('label', model['name']))
    file_path = f'../Data/{model_label}'
    # Create the folder for file storage if it's not created yet.
    if not os.path.exists(file_path):
        os.makedirs(file_path)

    
    print(f'Initiating model {model["name"]}:')
    m, t = initiate_model(model['name'])
    print('Done')
    
    print('\nGenerating WE for nouns:')
    nouns_we_df = create_we_df(m, t, all_nouns, progress=True)
    print(f'\nNouns WE are being stored in the file: {file_path}/all_nouns_we.csv')
    nouns_we_df.to_csv(f'{file_path}/all_nouns_we.csv')

    print('\nGenerating WE for verbs:')
    verbs_we_df = create_we_df(m, t, all_verbs, progress=True)
    print(f'\nVerbs WE are being stored in the file: {file_path}/all_verbs_we.csv')
    verbs_we_df.to_csv(f'{file_path}/all_verbs_we.csv')
    
    print('\nGenerating WE for adjs:')
    adjs_we_df = create_we_df(m, t, all_adjs, progress=True)
    print(f'\nAdjs WE are being stored in the file: {file_path}/all_adjs_we.csv')
    adjs_we_df.to_csv(f'{file_path}/all_adjs_we.csv')
    
    print('============================')

Initiating model flaubert/flaubert_base_uncased:
Done

Generating WE for nouns:
.........................................................................................................................................................................................
Nouns WE are being stored in the file: ../Data/flau_base_u/all_nouns_we.csv

Generating WE for verbs:
.................................................................................................................................................................................................................................................................................................................................
Verbs WE are being stored in the file: ../Data/flau_base_u/all_verbs_we.csv

Generating WE for adjs:
...................................................................................................
Adjs WE are being stored in the file: ../Data/flau_base_u/all_adjs_we.csv
Initiating model flaubert/flaubert_

# Corpora sizes

Below you will find the sizes of WE datasets for each PoS and for each model: it represents how many unique wordforms have a WE.

In [14]:
sizes = []

for m in models:
    model_label = re.sub('/', '', m.get('label', m['name']))
    file_path = f'../Data/{model_label}'
    
    noun_size = len(pd.read_csv(f'{file_path}/all_nouns_we.csv', index_col=0))
    verb_size = len(pd.read_csv(f'{file_path}/all_verbs_we.csv', index_col=0))
    adj_size = len(pd.read_csv(f'{file_path}/all_adjs_we.csv', index_col=0))
    e
    sizes.append({
        'Model': model_label,
        'Nouns': noun_size,
        'Verbs': verb_size,
        'Adjs': adj_size
    })
    
sizes_df = pd.DataFrame(sizes)

sizes_df

Unnamed: 0,Model,Nouns,Verbs,Adjs
0,flau_small_c,15183,5425,8881
1,flau_base_u,18489,6377,10460
2,flau_base_c,15183,5425,8881
3,flau_large_c,15183,5425,8881
4,cam_base,10594,3852,6400
