In [238]:
import math
import numpy as np

from transformers import DataCollatorForLanguageModeling, DataCollatorForWholeWordMask
from transformers import AutoTokenizer, AutoModel, AdamW, AutoModelForMaskedLM
from functools import reduce

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

from MLM.utils import MLMDataIteration

from omegaconf import DictConfig, open_dict, OmegaConf
import hydra

In [9]:
cfg = OmegaConf.load('/Users/ddmitriev/PycharmProjects/MIPTMasterThesis/config/mlm_config.yaml')
cfg

{'MLMData': {'path_data': 'mc4', 'language': 'ru', 'n_samples': 100}, 'faiss': {'embedding_model_name': 'distiluse-base-multilingual-cased-v1', 'dimensions': 512}}

In [187]:
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-multilingual-cased')

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.87M [00:00<?, ?B/s]

In [188]:
class TestMLMDataset(Dataset):
    def __init__(self, cfg, tokenizer):
        self.cfg = cfg
        self.data = MLMDataIteration(cfg)
        self.dataset = [i for i in self.data]
        self.tokenizer = tokenizer
    
    def __len__(self):
        return self.cfg.MLMData.n_samples
    
    def __getitem__(self, idx):
        return self.tokenizer(self.dataset[idx])
    

In [189]:
test = TestMLMDataset(cfg, tokenizer)

In [190]:
test[0]

{'input_ids': [101, 100, 100142, 15356, 14741, 12577, 10336, 39298, 69189, 543, 67155, 10205, 17257, 45597, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [191]:
cfg

{'MLMData': {'path_data': 'mc4', 'language': 'ru', 'n_samples': 100}, 'faiss': {'embedding_model_name': 'distiluse-base-multilingual-cased-v1', 'dimensions': 512}}

In [192]:
data_collator = DataCollatorForWholeWordMask(tokenizer=tokenizer, mlm=True, mlm_probability=0.2)
test_iter = DataLoader(test, batch_size=5, collate_fn=data_collator)

In [193]:
for i in test_iter:
    pass

In [194]:
a = next(iter(test_iter))
a

{'input_ids': tensor([[   101,    100, 100142,  15356,  14741,  12577,  10336,    103,  69189,
             543,  67155,  10205,    103,    103,    102,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0],
         [   101, 100142,  15356,  14741,  12577,  10336,  39298,  69189,    103,
           67155,  10205,    103,    103,    102,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0],
         [   101,    103,    521,  49223,  60229,  10179,    131,    103,    524,
           10191,  10823,  57772,  12942,  38108,    543,  67155,  10205,    103,
             103,    103,    543, 110191,    107,    119,    102],
         [   101,  54247,  80311,  10191, 110216,    103,  80765,  10541,  73276,
             103,  67155,  10205,    102,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0],
         [   101,    103,  33852,    103,  39298,  69189,  9774

In [77]:
a['input_ids']

tensor([[   11,    10, 13183,  2142,    13,  1392,   396,  2949,    13,   800,
          2378,   277,   167,    13,   300,  2548,    13,    12]])

In [79]:
test[0]['input_ids']

[11,
 10,
 13183,
 2142,
 2009,
 1392,
 396,
 2949,
 1176,
 800,
 2378,
 277,
 167,
 9516,
 300,
 2548,
 6855,
 12]

In [88]:
tokenizer.decode(a['input_ids'][0])

'[CLS] [UNK] Купить [MASK]weed Or [MASK]ic Mask в [MASK]е раз [MASK] [SEP]'

In [89]:
tokenizer.decode(test[0]['input_ids'])

'[CLS] [UNK] Купить Seaweed Organic Mask в Новосибирске развод [SEP]'

In [122]:
for idx, i in enumerate(test_iter):
    print(idx)
    print(tokenizer.decode(i['input_ids'][0]))
    print(tokenizer.decode(test[idx]['input_ids']))
    print(i['input_ids'][0])
    print(test[idx]['input_ids'])
    print()

0
[CLS] [UNK] Купить Seaweed Organic [MASK] [MASK] в Новосибирске развод [SEP]
[CLS] [UNK] Купить Seaweed Organic Mask в Новосибирске развод [SEP]
tensor([   11,    10, 13183,  2142,  2009,  1392,   396,  2949,  1176,   800,
           13,    13,   167,  9516,   300,  2548,  6855,    12])
[11, 10, 13183, 2142, 2009, 1392, 396, 2949, 1176, 800, 2378, 277, 167, 9516, 300, 2548, 6855, 12]

1
[CLS] Куп [MASK] Seaweed Organic Mask ESO Новосибирске развод [SEP]
[CLS] Купить Seaweed Organic Mask в Новосибирске развод [SEP]
tensor([   11, 13183,    13,  2009,  1392,   396,  2949,  1176,   800,  2378,
          277,  6877,  9516,   300,  2548,  6855,    12])
[11, 13183, 2142, 2009, 1392, 396, 2949, 1176, 800, 2378, 277, 167, 9516, 300, 2548, 6855, 12]

2
[CLS] [MASK] Малышева [MASK] [MASK] Пигментация лица в Новосибирске осталась в [MASK] " [MASK] [SEP]
[CLS] Елена Малышева : " Пигментация лица в Новосибирске осталась в прошлом ". [SEP]
tensor([   11,    13,   147,  7337,  8728,   281,    13,  

In [111]:
tokenizer.mask_token_id

13

In [195]:
model = AutoModel.from_pretrained('distilbert-base-multilingual-cased')

Downloading:   0%|          | 0.00/517M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [196]:
class MaskedLanguageModel(nn.Module):
    def __init__(self, transformer, hidden, vocab_size):
        """
        :param hidden: output size of BERT model
        :param vocab_size: total vocab size
        """
        super().__init__()
        self.transformer = transformer
        self.linear = nn.Linear(hidden, vocab_size)
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, x):
        transformer_output = self.transformer(x).pooler_out
        return self.softmax(self.linear(x))

In [213]:
a = next(iter(test_iter))
a

{'input_ids': tensor([[   101,    100, 100142,  15356,    103,    103,    103,  39298,  69189,
             543,  67155,  10205,  17257,  45597,    102,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0],
         [   101, 100142,  15356,  14741,    103,    103,  39298,  69189,    543,
           67155,  10205,  17257,  45597,    102,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0],
         [   101,  44027,    521,  49223,  60229,  10179,    131,    107,    103,
             103,    103,    103,    103,  38108,    543,  67155,  10205,  85854,
           61782,  11833,    543, 110191,    107,    119,    102],
         [   101,  54247,  80311,  10191,  10122,  23285,  80765,  10541,  73276,
             543,    103,    103,    102,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0],
         [   101,  14741,  12577,  10336,    103,  69189,  9774

In [198]:
tr_out = model(a['input_ids'])

In [208]:
tr_out[0].shape

torch.Size([5, 25, 768])

In [205]:
linear = nn.Linear(768, tokenizer.vocab_size)

In [206]:
softmax = nn.LogSoftmax(dim=-1)

In [209]:
l_r = linear(tr_out[0])

In [210]:
s_r = softmax(l_r)

In [268]:
s_r.shape

torch.Size([5, 25, 119547])

In [219]:
criterion = nn.NLLLoss(ignore_index=-100)

In [220]:
loss = criterion(s_r.transpose(1, 2), a["labels"])

In [221]:
loss

tensor(11.7162, grad_fn=<NllLoss2DBackward0>)

In [233]:
a['input_ids'][0]

tensor([   101,    100, 100142,  15356,    103,    103,    103,  39298,  69189,
           543,  67155,  10205,  17257,  45597,    102,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0])

In [269]:
tokenizer.decode(torch.argmax(s_r[0], -1))

'က šalies cup拨ذا掬 студентов niektórych peaks skogarצוותקרן corrida 剛 cup quyền quyền quyền quyền quyền quyền quyền quyền quyền quyền'

In [237]:
a["labels"][0]

tensor([ -100,  -100,  -100,  -100, 14741, 12577, 10336,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100])

In [234]:
tokenizer.convert_ids_to_tokens(100)

'[UNK]'

In [250]:
tokenizer.decode(test[0]['input_ids'])

'[CLS] [UNK] Купить Seaweed Organic Mask в Новосибирске развод [SEP]'

In [267]:
tokenizer.decode(a['input_ids'][1])

'[CLS] Купить Sea [MASK] [MASK] Organic Mask в Новосибирске развод [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [239]:
model_cool = AutoModelForMaskedLM.from_pretrained('distilbert-base-multilingual-cased')

In [251]:
inputs = tokenizer("Купить [MASK] [MASK] [MASK] Organic Mask в Новосибирске развод.", return_tensors="pt")
labels = tokenizer("Купить Seaweed Organic Mask в Новосибирске развод.", return_tensors="pt")["input_ids"]

In [253]:
outputs = model_cool(**inputs, labels=labels)

In [255]:
outputs.loss

tensor(4.0544, grad_fn=<NllLossBackward0>)

In [254]:
logits = outputs.logits

In [258]:
result = torch.argmax(logits, -1)

In [266]:
tokenizer.decode(result[0])

'. Купить новую игру компании Organic Mask в Новосибирске развод. О'

In [263]:
labels

tensor([[   101, 100142,  15356,  14741,  12577,  10336,  39298,  69189,    543,
          67155,  10205,  17257,  45597,    119,    102]])

In [264]:
inputs

{'input_ids': tensor([[   101, 100142,  15356,    103,    103,    103,  39298,  69189,    543,
          67155,  10205,  17257,  45597,    119,    102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}