In [49]:
from regex import regex
from transformers import DistilBertTokenizer, DistilBertForMaskedLM, pipeline
import json
import os
from torch import nn

In [50]:
# There is no uncased multilang distilbert
MULTILANG_DISTILBERT_CHECKPOINT = "distilbert-base-multilingual-cased"

RUSSIAN_DISTILBERT_CHECKPOINT = "distilbert-base-russian-cased"

PATH_TO_NEW_MODEL = "../../../../data/ml/distilbert_russian/model"
PATH_TO_NEW_TOKENIZER = "../../../../data/ml/distilbert_russian/tokenizer"

In [51]:
multilang_tokenizer = DistilBertTokenizer.from_pretrained(MULTILANG_DISTILBERT_CHECKPOINT)
multilang_vocab = list(multilang_tokenizer.vocab.keys())
print(f"Initial distilbert vocab size: {len(multilang_vocab)}")
print(f"{multilang_tokenizer.bos_token}")

Using bos_token, but it is not set yet.


Initial distilbert vocab size: 119547
None


In [52]:
REGEXP_FOR_SPECIAL_TOKENS = "\[.*\]"
REGEXP_FOR_UNUSED_TOKENS = "\[unused\d+\]"
REGEXP_FOR_RUSSIAN_WORDPIECE = "#*[аАбБвВгГдДеЕёЁжЖзЗиИйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩъЪыЫьЬэЭюЮяЯ]*"
REGEXP_FOR_FULL_HASHTAG_PIECE = "#+"
REGEXP_FOR_PUNCTUATION = "\p{Punct}"

In [53]:
russian_vocab = [token for token in multilang_vocab if (regex.fullmatch("|".join([REGEXP_FOR_SPECIAL_TOKENS, REGEXP_FOR_UNUSED_TOKENS, REGEXP_FOR_RUSSIAN_WORDPIECE, REGEXP_FOR_FULL_HASHTAG_PIECE, REGEXP_FOR_PUNCTUATION]), token))]
russian_vocab

['[PAD]',
 '[unused1]',
 '[unused2]',
 '[unused3]',
 '[unused4]',
 '[unused5]',
 '[unused6]',
 '[unused7]',
 '[unused8]',
 '[unused9]',
 '[unused10]',
 '[unused11]',
 '[unused12]',
 '[unused13]',
 '[unused14]',
 '[unused15]',
 '[unused16]',
 '[unused17]',
 '[unused18]',
 '[unused19]',
 '[unused20]',
 '[unused21]',
 '[unused22]',
 '[unused23]',
 '[unused24]',
 '[unused25]',
 '[unused26]',
 '[unused27]',
 '[unused28]',
 '[unused29]',
 '[unused30]',
 '[unused31]',
 '[unused32]',
 '[unused33]',
 '[unused34]',
 '[unused35]',
 '[unused36]',
 '[unused37]',
 '[unused38]',
 '[unused39]',
 '[unused40]',
 '[unused41]',
 '[unused42]',
 '[unused43]',
 '[unused44]',
 '[unused45]',
 '[unused46]',
 '[unused47]',
 '[unused48]',
 '[unused49]',
 '[unused50]',
 '[unused51]',
 '[unused52]',
 '[unused53]',
 '[unused54]',
 '[unused55]',
 '[unused56]',
 '[unused57]',
 '[unused58]',
 '[unused59]',
 '[unused60]',
 '[unused61]',
 '[unused62]',
 '[unused63]',
 '[unused64]',
 '[unused65]',
 '[unused66]',
 '[unused

In [54]:
russian_num_tokens = len(russian_vocab)

In [55]:
multilang_model = DistilBertForMaskedLM.from_pretrained(MULTILANG_DISTILBERT_CHECKPOINT)
print(f"Multilang distilbert model has {multilang_model.num_parameters()} parameters")

Multilang distilbert model has 135445755 parameters


In [56]:
# Code partially borrowed from https://github.com/Geotrend-research/smaller-transformers/blob/main/notebooks/select_mBERT_vocabularies.ipynb

# Get old embeddings from model
multilang_embeddings = multilang_model.get_input_embeddings()
multilang_num_tokens,multilang_embedding_dim = multilang_embeddings.weight.size()

In [57]:
multilang_num_tokens, multilang_embedding_dim

(119547, 768)

In [58]:
# Build new embeddings
new_embeddings = nn.Embedding(russian_num_tokens, multilang_embedding_dim)
new_embeddings.to(multilang_embeddings.weight.device)

Embedding(11417, 768)

In [59]:
# Copy weights for similar tokens and drop others
i = 0
j = 0
for token in multilang_vocab:
    if token in russian_vocab:
        new_embeddings.weight.data[i, :] = multilang_embeddings.weight.data[j, :]
        i += 1
    j += 1

multilang_model.set_input_embeddings(new_embeddings)

print(multilang_model.get_input_embeddings())

# Update base model and current model config
multilang_model.config.vocab_size = russian_num_tokens
multilang_model.vocab_size = russian_num_tokens

# Tie weights
multilang_model.tie_weights()

print(multilang_model.get_input_embeddings())

Embedding(11417, 768)
Embedding(11417, 768)


In [60]:
# Save new model
multilang_model.save_pretrained(PATH_TO_NEW_MODEL)
print(PATH_TO_NEW_MODEL, " - ", " num_parameters : ", multilang_model.num_parameters())
print(PATH_TO_NEW_MODEL, " - ", " num_tokens : ", len(russian_vocab))

../../../../data/ml/distilbert_russian/model  -   num_parameters :  52293785
../../../../data/ml/distilbert_russian/model  -   num_tokens :  11417


In [61]:
# Save vocab
with open(os.path.join(PATH_TO_NEW_TOKENIZER, 'vocab.txt'), 'w+') as fw:
    for token in russian_vocab:
        fw.write(token + '\n')

# Save tokenizer config
with open(os.path.join(PATH_TO_NEW_TOKENIZER, 'tokenizer_config.json'), 'w+') as fw:
    json.dump({"do_lower_case": False, "model_max_length": 512}, fw)

In [62]:
russian_tokenizer = DistilBertTokenizer.from_pretrained(PATH_TO_NEW_TOKENIZER)
russian_model = DistilBertForMaskedLM.from_pretrained(PATH_TO_NEW_MODEL)

In [63]:
russian_model.num_parameters()

52293785

In [64]:
russian_model.get_input_embeddings()

Embedding(11417, 768, padding_idx=0)

In [65]:
russian_tokenizer

PreTrainedTokenizer(name_or_path='../../../../data/ml/distilbert_russian/tokenizer', vocab_size=11417, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [66]:
# Save and reopen model and tokenizer
russian_tokenizer.save_pretrained(PATH_TO_NEW_TOKENIZER)
russian_model.save_pretrained(PATH_TO_NEW_MODEL)

In [67]:
russian_tokenizer = DistilBertTokenizer.from_pretrained(PATH_TO_NEW_TOKENIZER)
russian_model = DistilBertForMaskedLM.from_pretrained(PATH_TO_NEW_MODEL)

# Fast test on example

In [68]:
text = "Я люблю [MASK] Россию."

In [71]:
multilang_model = DistilBertForMaskedLM.from_pretrained(MULTILANG_DISTILBERT_CHECKPOINT)
multilang_tokenizer = DistilBertTokenizer.from_pretrained(MULTILANG_DISTILBERT_CHECKPOINT)
multilang_encoded_input = multilang_tokenizer(text, return_tensors='pt')
print(multilang_encoded_input)
multilang_output_original = multilang_model(**multilang_encoded_input)
print(multilang_output_original)

{'input_ids': tensor([[  101,   540,   552, 10593, 61394, 10593,   103, 89043,   119,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
MaskedLMOutput(loss=None, logits=tensor([[[-10.0335, -10.0463, -10.0795,  ...,  -9.7731,  -9.5484,  -9.6199],
         [-11.8299, -11.8983, -12.1971,  ..., -10.9337, -10.1853, -10.4831],
         [-13.4928, -13.5642, -13.2392,  ..., -11.6357, -11.8119, -11.6546],
         ...,
         [-10.4157, -10.7096,  -9.9264,  ...,  -9.3829,  -9.3726,  -8.8452],
         [-13.7720, -13.6973, -13.0738,  ..., -12.4249, -11.4396, -12.2219],
         [-12.2253, -11.8086, -11.6672,  ..., -10.8050, -10.0424, -10.2295]]],
       grad_fn=<AddBackward0>), hidden_states=None, attentions=None)


In [72]:
russian_encoded_input = russian_tokenizer(text, return_tensors='pt')
print(russian_encoded_input)
russian_output_original = russian_model(**russian_encoded_input)
print(russian_output_original)

{'input_ids': tensor([[ 101,  166,  178,  349, 5592,  349,  103, 8750,  115,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
MaskedLMOutput(loss=None, logits=tensor([[[-10.0335, -10.0463, -10.0795,  ...,  -3.3749,  -2.9937,  -3.2878],
         [-11.8299, -11.8983, -12.1971,  ...,  -0.6611,   0.1207,   3.0330],
         [-13.4928, -13.5642, -13.2392,  ...,   1.4186,  -0.5307,   1.6961],
         ...,
         [-10.4157, -10.7096,  -9.9264,  ...,   0.4886,   1.7815,   2.3896],
         [-13.7720, -13.6973, -13.0738,  ...,  -0.1794,   1.1187,   1.2068],
         [-12.2253, -11.8086, -11.6672,  ...,   0.2578,   1.5360,   1.5543]]],
       grad_fn=<AddBackward0>), hidden_states=None, attentions=None)


In [74]:
pipe = pipeline(task="fill-mask", model=multilang_model, tokenizer=multilang_tokenizer)
output_ = pipe(text)
for i in range(len(output_)):
    print(output_[i]['token_str'], output_[i]['score'])

с в о ю 0.1130324974656105
э т у 0.07151170819997787
# # ю 0.048914585262537
# # с ь 0.04644572734832764
в 0.023181308060884476


In [75]:
pipe = pipeline(task="fill-mask", model=russian_model, tokenizer=russian_tokenizer)
output_ = pipe(text)
for i in range(len(output_)):
    print(output_[i]['token_str'], output_[i]['score'])

с в о ю 0.10373394191265106
э т у 0.0814199224114418
в 0.05354791134595871
# # с ь 0.04612283781170845
э т о 0.025950049981474876


In [76]:
geo_model = DistilBertForMaskedLM.from_pretrained("Geotrend/distilbert-base-ru-cased")
geo_tokenizer = DistilBertTokenizer.from_pretrained("Geotrend/distilbert-base-ru-cased")

In [77]:
pipe = pipeline(task="fill-mask", model=geo_model, tokenizer=geo_tokenizer)
output_ = pipe(text)
for i in range(len(output_)):
    print(output_[i]['token_str'], output_[i]['score'])

с в о ю 0.11730142682790756
э т у 0.08307341486215591
# # с ь 0.04491831362247467
# # ю 0.03141210600733757
я 0.027354605495929718


Here we have three distil models (multilang, own and from Geotrend). We need to pick most suitable one.
So let's see perplexity metrics on the anamnesis dataset and decide which one is winner