In [1]:
from tokenizers import ByteLevelBPETokenizer
from transformers import RobertaConfig
from transformers import RobertaTokenizer
from transformers import AdamW
from transformers import pipeline
from tqdm import tqdm
import torch
import torch.cuda
from transformers import RobertaForMaskedLM
torch.cuda.empty_cache()

In [2]:
corpus_path = "../../data/rustaveli/v1/vef_full.txt"
corpus_text = open(corpus_path, "r").read()

corpus_max_len = max(map(lambda v: len(v), corpus_text.split('.')))
corpus_vocab_size = len(set(' '.join(corpus_text.split('.')).split(' ')))

print(f"Maximum sentence length: {corpus_max_len}")
print(f"Vocab Size: {corpus_vocab_size}")

Maximum sentence length: 1211
Vocab Size: 18808


In [None]:
tokenizer = ByteLevelBPETokenizer(
    add_prefix_space=True
)

tokenizer.train(
    files=[corpus_path], 
    vocab_size=20_000, 
    min_frequency=1,
    special_tokens=['<s>', '<pad>', '</s>', '<unk>', '<mask>'])

tokenizer.save_model('shotabert')

In [4]:
tokenizer = RobertaTokenizer.from_pretrained('shotabert', max_len=512)

In [5]:
lines = corpus_text.split('.')
lines[:2]

['რომელმან შექმნა სამყარო ძალითა მით ძლიერითა, ზეგარდმო არსნი სულითა ყვნა ზეცით მონაბერითა, ჩვენ, კაცთა, მოგვცა ქვეყანა, გვაქვს უთვალავი ფერითა, და მისგან არს ყოვლი ხელმწიფე სახითა მის მიერითა',
 ' ჰე, ღმერთო ერთო, შენ შეჰქმენ სახე ყოვლისა ტანისა, შენ დამიფარე, ძლევა მეც დათრგუნვად მე სატანისა, მომეც მიჯნურთა სურვილი, სიკვდიდმდე გასატანისა, და ცოდვათა შესუბუქება, მუნ თანა წასატანისა']

In [6]:
batch = tokenizer(lines, max_length=512, padding='max_length', truncation=False)
len(batch.input_ids)

1913

In [7]:
labels = torch.tensor(batch.input_ids)
mask = torch.tensor(batch.attention_mask)

input_ids = labels.detach().clone()
rand = torch.rand(input_ids.shape)
mask_arr = (rand < .15) * (input_ids != 0) * (input_ids != 1) * (input_ids != 2)
for i in range(input_ids.shape[0]):
    selection = torch.flatten(mask_arr[i].nonzero()).tolist()
    input_ids[i, selection] = 3  # our custom [MASK] token == 3


encodings = {'input_ids': input_ids, 'attention_mask': mask, 'labels': labels}

input_ids.shape

torch.Size([1913, 512])

In [8]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return self.encodings['input_ids'].shape[0]

    def __getitem__(self, i):
        return {key: tensor[i] for key, tensor in self.encodings.items()}

dataset = Dataset(encodings)
loader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=True)

In [9]:

config = RobertaConfig(
    vocab_size=tokenizer.vocab_size,  # we align this to the tokenizer vocab_size
    max_position_embeddings=514,
    hidden_size=768,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1
)

In [10]:
torch.cuda.empty_cache()

model = RobertaForMaskedLM(config)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)
print("Nice.")

Nice.


In [11]:
model.train()
optim = AdamW(model.parameters(), lr=1e-4)



In [None]:
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter()

epochs = 200
step = 0

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(loader, leave=True)
    for batch in loop:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask,
                        labels=labels)
        loss = outputs.loss
        writer.add_scalar("Loss/train", loss, step)
        loss.backward()
        optim.step()
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())
        step += 1 

In [13]:
model.save_pretrained('./shotabert') 

In [15]:
fill = pipeline('fill-mask', model='shotabert', tokenizer='shotabert')

2022-06-19 19:03:35.461252: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-19 19:03:35.487374: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory
2022-06-19 19:03:35.487392: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1850] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2022-06-19 19:03:35.487758: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN

In [24]:
fill('იყო <mask> როსტევან?')

[{'score': 0.2885054349899292,
  'token': 4,
  'token_str': '<mask>',
  'sequence': 'იყო როსტევან?'},
 {'score': 0.0005480332183651626,
  'token': 0,
  'token_str': '<s>',
  'sequence': 'იყო როსტევან?'},
 {'score': 0.00030780278029851615,
  'token': 1218,
  'token_str': ' პირველ',
  'sequence': 'იყო პირველ როსტევან?'},
 {'score': 0.0003003068850375712,
  'token': 3633,
  'token_str': ' დიდისა',
  'sequence': 'იყო დიდისა როსტევან?'},
 {'score': 0.00026761656044982374,
  'token': 7996,
  'token_str': 'ვლილობა',
  'sequence': 'იყოვლილობა როსტევან?'}]

In [None]:
fill(f'თბილისი {fill.tokenizer.mask_token} დედაქალაქია')
