In [1]:
from regex import regex
from transformers import DebertaV2Tokenizer, DebertaV2TokenizerFast, DebertaV2ForMaskedLM, pipeline
import json
import os
from torch import nn

In [2]:
MULTILANG_DISTILBERT_CHECKPOINT = "microsoft/mdeberta-v3-base"

RUSSIAN_DISTILBERT_CHECKPOINT = "mdeberta-russian"

PATH_TO_NEW_MODEL = "../../../../data/ml/mdeberta-russian/model"
PATH_TO_NEW_TOKENIZER = "../../../../data/ml/mdeberta-russian/tokenizer"

In [3]:
multilang_tokenizer = DebertaV2Tokenizer.from_pretrained(MULTILANG_DISTILBERT_CHECKPOINT)
multilang_vocab = list(multilang_tokenizer.vocab.keys())
print(f"Initial vocab size: {len(multilang_vocab)}")
print(f"{multilang_tokenizer.bos_token}")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Initial vocab size: 250101
[CLS]


In [4]:
multilang_tokenizer

PreTrainedTokenizer(name_or_path='microsoft/mdeberta-v3-base', vocab_size=250101, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [5]:
REGEXP_FOR_SPECIAL_TOKENS = "\[.*\]"
REGEXP_FOR_UNUSED_TOKENS = "\[unused\d+\]"
REGEXP_FOR_RUSSIAN_WORDPIECE = "#*[аАбБвВгГдДеЕёЁжЖзЗиИйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩъЪыЫьЬэЭюЮяЯ]*"
REGEXP_FOR_FULL_HASHTAG_PIECE = "#+"
REGEXP_FOR_PUNCTUATION = "\p{Punct}"
REGEXP_FOR_DIGITS = "#*\d+#*"

In [6]:
russian_vocab = [token for token in multilang_vocab if (regex.fullmatch("|".join([REGEXP_FOR_SPECIAL_TOKENS, REGEXP_FOR_UNUSED_TOKENS, REGEXP_FOR_RUSSIAN_WORDPIECE, REGEXP_FOR_FULL_HASHTAG_PIECE, REGEXP_FOR_PUNCTUATION, REGEXP_FOR_DIGITS]), token))]
russian_vocab

['[PAD]',
 '[CLS]',
 '[SEP]',
 '[UNK]',
 '.',
 ',',
 '-',
 ':',
 ')',
 '/',
 "'",
 'и',
 '_',
 '?',
 '、',
 '’',
 ';',
 '。',
 'а',
 '!',
 '"',
 '(',
 'е',
 '3',
 '2',
 '،',
 '1',
 'у',
 '”',
 '।',
 'я',
 '4',
 'о',
 '5',
 'ы',
 '»',
 ']',
 '8',
 '6',
 '0',
 '7',
 '9',
 'м',
 '\\',
 'й',
 '&',
 'на',
 'т',
 '“',
 '{',
 'х',
 'но',
 '・',
 'ом',
 'ки',
 '10',
 'ни',
 'ка',
 'ов',
 'н',
 '#',
 'для',
 'ю',
 '[',
 'их',
 'ой',
 '‘',
 'то',
 'ь',
 '】',
 '」',
 '%',
 'ого',
 '*',
 'р',
 'д',
 'г',
 'та',
 'л',
 '「',
 'ите',
 'да',
 'с',
 'Р',
 'не',
 'ку',
 'ж',
 'п',
 '11',
 'ий',
 'ей',
 'га',
 'к',
 'ш',
 'С',
 '@',
 'ть',
 '12',
 'им',
 'же',
 '20',
 'б',
 'ли',
 '·',
 'ен',
 'ата',
 '88',
 'ных',
 'от',
 '30',
 'ем',
 'ах',
 'те',
 '‚',
 'ные',
 'з',
 'в',
 '–',
 'ня',
 '003',
 'ить',
 '15',
 'ам',
 'ет',
 '}',
 'ок',
 'ти',
 'ны',
 'ат',
 'ного',
 '【',
 'ия',
 'ла',
 'ая',
 '16',
 '18',
 '24',
 'ами',
 'ч',
 '13',
 'ие',
 '00',
 'ной',
 'ке',
 'ный',
 '22',
 'них',
 'ый',
 '14',
 '17',
 '

In [7]:
russian_num_tokens = len(russian_vocab)

In [8]:
russian_num_tokens

22807

In [9]:
multilang_model = DebertaV2ForMaskedLM.from_pretrained(MULTILANG_DISTILBERT_CHECKPOINT)
print(f"Multilang model has {multilang_model.num_parameters()} parameters")

Some weights of the model checkpoint at microsoft/mdeberta-v3-base were not used when initializing DebertaV2ForMaskedLM: ['deberta.embeddings.position_embeddings._weight', 'mask_predictions.classifier.weight', 'mask_predictions.classifier.bias', 'deberta.embeddings.word_embeddings._weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'deberta.embeddings.position_embeddings.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.dense.bias']
- This IS expected if you are initializing DebertaV2ForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForMaskedLM from the checkpoint of a 

Multilang model has 279061880 parameters


In [10]:
# Code partially borrowed from https://github.com/Geotrend-research/smaller-transformers/blob/main/notebooks/select_mBERT_vocabularies.ipynb

# Get old embeddings from model
multilang_embeddings = multilang_model.get_input_embeddings()
multilang_num_tokens,multilang_embedding_dim = multilang_embeddings.weight.size()

In [11]:
multilang_num_tokens, multilang_embedding_dim

(251000, 768)

In [12]:
# Build new embeddings
new_embeddings = nn.Embedding(russian_num_tokens, multilang_embedding_dim)
new_embeddings.to(multilang_embeddings.weight.device)

Embedding(22807, 768)

In [13]:
# Copy weights for similar tokens and drop others
i = 0
j = 0
for token in multilang_vocab:
    if token in russian_vocab:
        new_embeddings.weight.data[i, :] = multilang_embeddings.weight.data[j, :]
        i += 1
    j += 1

multilang_model.set_input_embeddings(new_embeddings)

print(multilang_model.get_input_embeddings())

# Update base model and current model config
multilang_model.config.vocab_size = russian_num_tokens
multilang_model.vocab_size = russian_num_tokens

# Tie weights
multilang_model.tie_weights()

print(multilang_model.get_input_embeddings())

Embedding(22807, 768)
Embedding(22807, 768)


In [14]:
# Save new model
multilang_model.save_pretrained(PATH_TO_NEW_MODEL)
print(PATH_TO_NEW_MODEL, " - ", " num_parameters : ", multilang_model.num_parameters())
print(PATH_TO_NEW_MODEL, " - ", " num_tokens : ", len(russian_vocab))

../../../../data/ml/mdeberta-russian/model  -   num_parameters :  103581463
../../../../data/ml/mdeberta-russian/model  -   num_tokens :  22807


In [15]:
set_russian_vocab = set(russian_vocab)
new_vocad = dict(filter(lambda pair: pair[0] in set_russian_vocab, multilang_tokenizer.vocab.items()))

In [16]:
new_vocad

{'[PAD]': 0,
 '[CLS]': 1,
 '[SEP]': 2,
 '[UNK]': 3,
 '.': 261,
 ',': 262,
 '-': 265,
 ':': 268,
 ')': 272,
 '/': 276,
 "'": 278,
 'и': 280,
 '_': 291,
 '?': 292,
 '、': 293,
 '’': 294,
 ';': 297,
 '。': 307,
 'а': 309,
 '!': 310,
 '"': 312,
 '(': 313,
 'е': 325,
 '3': 329,
 '2': 339,
 '،': 344,
 '1': 354,
 'у': 355,
 '”': 366,
 '।': 379,
 'я': 397,
 '4': 411,
 'о': 412,
 '5': 429,
 'ы': 434,
 '»': 437,
 ']': 440,
 '8': 450,
 '6': 452,
 '0': 461,
 '7': 488,
 '9': 496,
 'м': 508,
 '\\': 541,
 'й': 544,
 '&': 547,
 'на': 559,
 'т': 588,
 '“': 592,
 '{': 597,
 'х': 606,
 'но': 617,
 '・': 627,
 'ом': 638,
 'ки': 658,
 '10': 661,
 'ни': 669,
 'ка': 680,
 'ов': 686,
 'н': 687,
 '#': 718,
 'для': 736,
 'ю': 749,
 '[': 766,
 'их': 778,
 'ой': 818,
 '‘': 830,
 'то': 833,
 'ь': 834,
 '】': 837,
 '」': 880,
 '%': 882,
 'ого': 894,
 '*': 895,
 'р': 898,
 'д': 903,
 'г': 910,
 'та': 919,
 'л': 921,
 '「': 940,
 'ите': 945,
 'да': 946,
 'с': 947,
 'Р': 955,
 'не': 958,
 'ку': 990,
 'ж': 991,
 'п': 996,
 '

# Adjust sequence piece tokenizer 

In [None]:
! wget https://raw.githubusercontent.com/google/sentencepiece/master/src/sentencepiece_model.proto

In [None]:
! protoc --python_out=. sentencepiece_model.proto

In [13]:
multilang_tokenizer

PreTrainedTokenizerFast(name_or_path='microsoft/mdeberta-v3-base', vocab_size=250101, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [17]:
multilang_tokenizer.save_pretrained(PATH_TO_NEW_TOKENIZER)

('../../../../data/ml/mdeberta-russian/tokenizer/tokenizer_config.json',
 '../../../../data/ml/mdeberta-russian/tokenizer/special_tokens_map.json',
 '../../../../data/ml/mdeberta-russian/tokenizer/spm.model',
 '../../../../data/ml/mdeberta-russian/tokenizer/added_tokens.json')

In [18]:
import sentencepiece_model_pb2 as spmp

In [19]:
smp = multilang_tokenizer._tokenizer.spm.serialized_model_proto()

In [20]:
smp

In [22]:
m = spmp.ModelProto()
m.ParseFromString(smp)

4305025

In [23]:
m

In [24]:
# Disable splitting unknown chars to utf bytes, because we know all tokens what we need
m.trainer_spec.byte_fallback = False

In [32]:
print(m.pieces[:300])

[piece: "[PAD]"
score: 0.0
type: CONTROL
, piece: "[CLS]"
score: 0.0
type: CONTROL
, piece: "[SEP]"
score: 0.0
type: CONTROL
, piece: "[UNK]"
score: 0.0
type: UNKNOWN
, piece: "<0x00>"
score: 0.0
type: BYTE
, piece: "<0x01>"
score: 0.0
type: BYTE
, piece: "<0x02>"
score: 0.0
type: BYTE
, piece: "<0x03>"
score: 0.0
type: BYTE
, piece: "<0x04>"
score: 0.0
type: BYTE
, piece: "<0x05>"
score: 0.0
type: BYTE
, piece: "<0x06>"
score: 0.0
type: BYTE
, piece: "<0x07>"
score: 0.0
type: BYTE
, piece: "<0x08>"
score: 0.0
type: BYTE
, piece: "<0x09>"
score: 0.0
type: BYTE
, piece: "<0x0A>"
score: 0.0
type: BYTE
, piece: "<0x0B>"
score: 0.0
type: BYTE
, piece: "<0x0C>"
score: 0.0
type: BYTE
, piece: "<0x0D>"
score: 0.0
type: BYTE
, piece: "<0x0E>"
score: 0.0
type: BYTE
, piece: "<0x0F>"
score: 0.0
type: BYTE
, piece: "<0x10>"
score: 0.0
type: BYTE
, piece: "<0x11>"
score: 0.0
type: BYTE
, piece: "<0x12>"
score: 0.0
type: BYTE
, piece: "<0x13>"
score: 0.0
type: BYTE
, piece: "<0x14>"
score: 0.0
type

In [33]:
print('the loaded model has pieces:', len(m.pieces))
new_pieces = [m.pieces[idx] for idx in new_vocad.values()]
print('the new pieces:', len(new_pieces))

the loaded model has pieces: 250101
the new pieces: 22807


In [34]:
# replace the content of the first pieces
for i, p in enumerate(new_pieces):
    m.pieces[i].piece = p.piece
    m.pieces[i].score = p.score
    m.pieces[i].type = p.type

In [35]:
# drop the remaining pieces
n = len(new_pieces)
for i in range(len(m.pieces) - n):
    m.pieces.pop(len(m.pieces) - 1)
print(len(m.pieces))

22807


In [36]:
# save result tokenizer model
with open(PATH_TO_NEW_TOKENIZER+'/spm.model', 'wb') as f:
    f.write(m.SerializeToString())

In [24]:
# Save vocab
with open(os.path.join(PATH_TO_NEW_TOKENIZER, 'vocab.txt'), 'w+') as fw:
    for token in russian_vocab:
        fw.write(token + '\n')

# Save tokenizer config
with open(os.path.join(PATH_TO_NEW_TOKENIZER, 'tokenizer_config.json'), 'w+') as fw:
    json.dump({"do_lower_case": False, "model_max_length": 512}, fw)

In [37]:
t = DebertaV2Tokenizer(vocab_file=PATH_TO_NEW_TOKENIZER + '/spm.model')
t

PreTrainedTokenizer(name_or_path='', vocab_size=22807, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [38]:
t.mask_token_id

3

In [39]:
russian_tokenizer = DebertaV2Tokenizer.from_pretrained(PATH_TO_NEW_TOKENIZER)
russian_model = DebertaV2ForMaskedLM.from_pretrained(PATH_TO_NEW_MODEL)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [40]:
russian_model.num_parameters()

103581463

In [41]:
russian_model.get_input_embeddings()

Embedding(22807, 768, padding_idx=0)

In [51]:
russian_tokenizer

PreTrainedTokenizer(name_or_path='../../../../data/ml/mdeberta-russian/tokenizer', vocab_size=22807, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [42]:
# Save and reopen model and tokenizer
russian_tokenizer.save_pretrained(PATH_TO_NEW_TOKENIZER)
russian_model.save_pretrained(PATH_TO_NEW_MODEL)

In [6]:
russian_tokenizer = DebertaV2TokenizerFast.from_pretrained(PATH_TO_NEW_TOKENIZER)
russian_model = DebertaV2ForMaskedLM.from_pretrained(PATH_TO_NEW_MODEL)

# Fast test on example

In [3]:
text = "Я люблю [MASK] Россию."

In [10]:
multilang_model = DebertaV2ForMaskedLM.from_pretrained(MULTILANG_DISTILBERT_CHECKPOINT)
multilang_tokenizer = DebertaV2TokenizerFast.from_pretrained(MULTILANG_DISTILBERT_CHECKPOINT)
multilang_encoded_input = multilang_tokenizer(text, return_tensors='pt')
print(multilang_encoded_input)
multilang_output_original = multilang_model(**multilang_encoded_input)
print(multilang_output_original)

Some weights of the model checkpoint at microsoft/mdeberta-v3-base were not used when initializing DebertaV2ForMaskedLM: ['mask_predictions.LayerNorm.bias', 'deberta.embeddings.position_embeddings._weight', 'deberta.embeddings.word_embeddings._weight', 'mask_predictions.dense.bias', 'deberta.embeddings.position_embeddings.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.weight', 'mask_predictions.classifier.bias']
- This IS expected if you are initializing DebertaV2ForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForMaskedLM from the checkpoint of a 

{'input_ids': tensor([[     1,   2554,   5510,   8333, 250101,  95666,    749,    261,      2]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}
MaskedLMOutput(loss=None, logits=tensor([[[-0.3833,  2.4777,  2.4006,  ..., -0.3270, -0.2702, -0.2534],
         [-0.1899,  0.6699,  0.6800,  ..., -0.0352,  0.0773, -0.3111],
         [-0.3125, -1.1108, -1.1087,  ..., -0.1949,  0.0143, -0.3542],
         ...,
         [ 0.5432,  1.4716,  1.4770,  ...,  0.5933,  0.5920,  0.5140],
         [-0.5995, -0.4898, -0.4728,  ..., -0.6239, -0.4977, -0.9380],
         [ 0.3003,  0.8728,  0.8526,  ...,  0.1912,  0.2536,  0.3930]]],
       grad_fn=<AddBackward0>), hidden_states=None, attentions=None)


In [11]:
russian_encoded_input = russian_tokenizer(text, return_tensors='pt')
print(russian_encoded_input)
russian_output_original = russian_model(**russian_encoded_input)
print(russian_output_original)

{'input_ids': tensor([[    1,     3,   422,     3,  2463,   570, 22807,     3,  7862, 17005,
             4,     2]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
MaskedLMOutput(loss=None, logits=tensor([[[ 0.3538,  0.4494,  0.4151,  ...,  0.5232,  0.5618,  0.2090],
         [ 0.1261,  0.1887,  0.1003,  ...,  0.2501,  0.3241, -0.0214],
         [ 0.5795,  0.5976,  0.6952,  ...,  0.6437,  0.7327,  0.4548],
         ...,
         [ 0.0966,  0.0562,  0.0822,  ...,  0.1305,  0.2235,  0.0555],
         [ 0.0272,  0.0501, -0.0039,  ...,  0.1392,  0.1625,  0.0808],
         [-0.0016, -0.0388, -0.0380,  ...,  0.0592,  0.1128, -0.0171]]],
       grad_fn=<AddBackward0>), hidden_states=None, attentions=None)


In [12]:
pipe = pipeline(task="fill-mask", model=multilang_model, tokenizer=multilang_tokenizer)
output_ = pipe(text)
for i in range(len(output_)):
    print(output_[i]['token_str'], output_[i]['score'])

льных 0.056287918239831924
łuch 0.038719743490219116
hleb 0.03622915968298912
ebal 0.03287938982248306
haudiere 0.02623041532933712


In [None]:
pipe = pipeline(task="fill-mask", model=russian_model, tokenizer=russian_tokenizer)
output_ = pipe(text)
for i in range(len(output_)):
    print(output_[i]['token_str'], output_[i]['score'])

In [15]:
pipe = pipeline(task="fill-mask", model=russian_model, tokenizer=russian_tokenizer)
output_ = pipe("Молоко это [MASK].")
for i in range(len(output_)):
    print(output_[i]['token_str'], output_[i]['score'])

Обществ 0.09146923571825027
мело 0.06979215145111084
поверх 0.06024843081831932
уман 0.05661516264081001
джи 0.05083320662379265


In [16]:
pipe = pipeline(task="fill-mask", model=multilang_model, tokenizer=multilang_tokenizer)
output_ = pipe("Молоко это [MASK].")
for i in range(len(output_)):
    print(output_[i]['token_str'], output_[i]['score'])

screv 0.11209029704332352
neem 0.03488057851791382
riadi 0.031111158430576324
:0, 0.02865675836801529
ueber 0.021498851478099823


In [43]:
russian_tokenizer.save_pretrained("../../../../data/ml/mdeberta-russian/")

('../../../../data/ml/mdeberta-russian/tokenizer_config.json',
 '../../../../data/ml/mdeberta-russian/special_tokens_map.json',
 '../../../../data/ml/mdeberta-russian/spm.model',
 '../../../../data/ml/mdeberta-russian/added_tokens.json')