## Install the Data

In [None]:
%pip install datasets

In [4]:
import datasets

In [5]:
all_ds = datasets.list_datasets()
len(all_ds)

8802

In [6]:
dataset = datasets.load_dataset('oscar', 'unshuffled_deduplicated_az')

Downloading builder script:   0%|          | 0.00/5.58k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/359k [00:00<?, ?B/s]

Downloading and preparing dataset oscar/unshuffled_deduplicated_az (download: 497.57 MiB, generated: 1.42 GiB, post-processed: Unknown size, total: 1.91 GiB) to /root/.cache/huggingface/datasets/oscar/unshuffled_deduplicated_az/1.0.0/84838bd49d2295f62008383b05620571535451d84545037bb94d6f3501651df2...


Downloading data:   0%|          | 0.00/81.0 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/522M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/626796 [00:00<?, ? examples/s]

Dataset oscar downloaded and prepared to /root/.cache/huggingface/datasets/oscar/unshuffled_deduplicated_az/1.0.0/84838bd49d2295f62008383b05620571535451d84545037bb94d6f3501651df2. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'text'],
        num_rows: 626796
    })
})

In [8]:
dataset['train'][0]

{'id': 0,
 'text': 'AZTV-Artıq 7 ildir ki, Abşeron rayonu dotasiya almadan bütün xərclərini yerli daxilolmalar hesabına maliyyələşdirir.\nDünən, 10:49 | Azərbaycanda Konstitusiya GünüdürDünən, 10:46 | Yapon tədqiqatçılar Parkinson xəstəliyinin müalicəsi istiqamətində mühüm uğur qazanıblarDünən, 10:44 | Azərbaycan İstanbul Beynəlxalq Kitab Sərgisində təmsil olunur10-11-2018, 19:56 | İradə Gülməmmədova şahmat üzrə üçqat Avropa çempionu ilə görüşdü- FOTO10-11-2018, 11:18 | Süni intellektli ilk bankomat istifadəyə verilib10-11-2018, 11:16 | Çikaqoda Dövlət Bayrağı Günü münasibətilə tədbir keçirilib10-11-2018, 11:12 | Azərbaycan ilə Türkiyə arasında QHT-lərlə bağlı memorandum imzalanıb9-11-2018, 19:05 | Abşeronun yeniyetmə cüdoçuları zona mərhələsinin qalibi adını qazanıblar9-11-2018, 16:44 | Ceyranbatan 1 nömrəli məktəbdə Bayraq Günü tədbiri-Video9-11-2018, 15:25 | Xırdalanın 23 saylı bağçasında Dövlət Bayrağı Günü qeyd olunub (Foto+Video)9-11-2018, 15:09 | Xırdalan şəhər 3 saylı uşaq bağç

In [9]:
from tqdm.auto import tqdm  # for our loading bar

text_data = []
file_count = 0

for sample in tqdm(dataset['train']):
    # remove newline characters from each sample as we need to use exclusively as seperators
    sample = sample['text'].replace('\n', '')
    text_data.append(sample)
    if len(text_data) == 5_000:
        # once we hit the 5K mark, save to file
        with open(f'text_{file_count}.txt', 'w', encoding='utf-8') as fp:
            fp.write('\n'.join(text_data))
        text_data = []
        file_count += 1
# after saving in 5K chunks, we will have ~3808 leftover samples, we save those now too
with open(f'text_{file_count}.txt', 'w', encoding='utf-8') as fp:
    fp.write('\n'.join(text_data))


  0%|          | 0/626796 [00:00<?, ?it/s]

## Build a Custom Transformer Tokenizer

In [10]:
from pathlib import Path
import os

In [None]:
paths = [str(x) for x in Path('').glob('**/*.txt')]
paths

In [12]:
%pip install tokenizers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tokenizers
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 8.1 MB/s 
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.12.1


In [13]:
from tokenizers import ByteLevelBPETokenizer

In [14]:
tokenizer = ByteLevelBPETokenizer()

In [15]:
tokenizer.train(files=paths, vocab_size=30_522, min_frequency=2,
                    special_tokens=['<s>', '<pad>', '</s>', '<unk>', '<mask>'])

In [16]:
os.mkdir('bert_dayi')

tokenizer.save_model('bert_dayi')

['bert_dayi/vocab.json', 'bert_dayi/merges.txt']

#### Using the tokenizer

In [17]:
%pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.2-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 7.5 MB/s 
Installing collected packages: transformers
Successfully installed transformers-4.21.2


In [18]:
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained('bert_dayi') # load the tokenizer

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'RobertaTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'RobertaTokenizerFast'.


In [19]:
lorem_ipsum = (
    "AZTV-Artıq 7 ildir ki, Abşeron rayonu dotasiya almadan bütün"
    " xərclərini yerli daxilolmalar hesabına maliyyələşdirir"
)

In [20]:
tokenizer(lorem_ipsum, max_length=512, padding='max_length', truncation=True)

{'input_ids': [0, 2444, 8612, 17, 8890, 1085, 5236, 330, 16, 4915, 2657, 795, 24617, 23590, 789, 18876, 2192, 14777, 1787, 3639, 4942, 7829, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

## Building MLM Training Input Pipeline

In [21]:
with open('text_0.txt', 'r', encoding='utf-8') as fp:
    lines = fp.read().split('\n')

In [22]:
batch = tokenizer(lines, max_length=512, padding='max_length', truncation=True)
len(batch)

2

Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [23]:
import torch

def mlm(tensor):
    rand = torch.rand(tensor.shape)
    mask_arr = rand < 0.15 * (tensor > 2)
    for i in range(tensor.shape[0]):
        selection = torch.flatten(mask_arr[i].nonzero())
        tensor[i, selection] = 4
    return tensor

In [24]:
from pathlib import Path

paths = [str(x) for x in Path('').glob('*.txt')]
paths[:5]

['text_78.txt', 'text_1.txt', 'text_113.txt', 'text_18.txt', 'text_94.txt']

In [25]:
from tqdm.auto import tqdm

input_ids = []
mask = []
labels = []

for path in tqdm(paths):
    with open(path, 'r', encoding='utf-8') as fp:
        lines = fp.read().split('\n')
    sample = tokenizer(lines, max_length=512, padding='max_length', truncation=True, return_tensors='pt')
    # input_ids.append(batch['input_ids'])
    # mask.append(batch['attention_mask'])
    labels.append(sample.input_ids)
    mask.append(sample.attention_mask)
    input_ids.append(mlm(sample.input_ids.detach().clone()))

  0%|          | 0/126 [00:00<?, ?it/s]

In [26]:
input_ids = torch.cat(input_ids)
mask = torch.cat(mask)
labels = torch.cat(labels)

In [27]:
input_ids[0][:10]

tensor([    0,  2035, 18836,   721, 12775,  1463,   448,  2613, 28538,  2450])

In [28]:
encodings = {
    'input_ids': input_ids,
    'attention_mask': mask,
    'labels': labels
}

In [29]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encoding):
        self.encoding = encoding
    def __len__(self):
        return self.encoding['input_ids'].shape[0]
    def __getitem__(self, idx):
        return {key: tensor[idx] for key, tensor in self.encoding.items()}

In [30]:
dataset = Dataset(encodings)

In [31]:
dataloader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)

## Training and Testing Azerbaijani BERT

In [32]:
from transformers import RobertaConfig
config =  RobertaConfig(
    vocab_size=tokenizer.vocab_size,
    max_position_embeddings=514,
    hidden_size=768,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

In [33]:
from transformers import RobertaForMaskedLM

In [34]:
model = RobertaForMaskedLM(config)

In [35]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cpu')

In [None]:
model

In [37]:
from transformers import AdamW

# activate training mode
model.train()
# initialize optimizer
optim = AdamW(model.parameters(), lr=1e-4)



In [None]:
epochs = 2

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(dataloader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # process
        outputs = model(input_ids, attention_mask=attention_mask,
                        labels=labels)
        # extract loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

  0%|          | 0/39175 [00:00<?, ?it/s]