## Install the Data

In [2]:
%pip install datasets

Collecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
     -------------------------------------- 365.7/365.7 KB 4.6 MB/s eta 0:00:00
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.9.0-py3-none-any.whl (120 kB)
     -------------------------------------- 120.5/120.5 KB 6.9 MB/s eta 0:00:00
Collecting fsspec[http]>=2021.11.1
  Downloading fsspec-2022.7.1-py3-none-any.whl (141 kB)
     ---------------------------------------- 141.2/141.2 KB ? eta 0:00:00
Collecting xxhash
  Downloading xxhash-3.0.0-cp38-cp38-win_amd64.whl (29 kB)
Collecting multiprocess
  Downloading multiprocess-0.70.13-py38-none-any.whl (131 kB)
     -------------------------------------- 131.4/131.4 KB 7.6 MB/s eta 0:00:00
Collecting pyarrow>=6.0.0
  Downloading pyarrow-9.0.0-cp38-cp38-win_amd64.whl (19.6 MB)
     --------------------------------------- 19.6/19.6 MB 12.4 MB/s eta 0:00:00
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp38-cp38-win_amd64.whl (555 kB)
     

You should consider upgrading via the 'C:\Users\elsha\anaconda3\python.exe -m pip install --upgrade pip' command.


In [1]:
import datasets

In [2]:
all_ds = datasets.list_datasets()
len(all_ds)

8763

In [3]:
dataset = datasets.load_dataset('oscar', 'unshuffled_deduplicated_la')

Reusing dataset oscar (C:\Users\elsha\.cache\huggingface\datasets\oscar\unshuffled_deduplicated_la\1.0.0\84838bd49d2295f62008383b05620571535451d84545037bb94d6f3501651df2)
100%|██████████| 1/1 [00:00<00:00, 13.51it/s]


In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'text'],
        num_rows: 18808
    })
})

In [5]:
dataset['train'][0]

{'id': 0,
 'text': 'Hæ sunt generationes Noë: Noë vir justus atque perfectus fuit in generationibus suis; cum Deo ambulavit.\nEcce ego adducam aquas diluvii super terram, ut interficiam omnem carnem, in qua spiritus vitæ est subter cælum: universa quæ in terra sunt, consumentur.\nTolles igitur tecum ex omnibus escis, quæ mandi possunt, et comportabis apud te: et erunt tam tibi, quam illis in cibum.'}

In [6]:
from tqdm.auto import tqdm  # for our loading bar

text_data = []
file_count = 0

for sample in tqdm(dataset['train']):
    # remove newline characters from each sample as we need to use exclusively as seperators
    sample = sample['text'].replace('\n', '')
    text_data.append(sample)
    if len(text_data) == 5_000:
        # once we hit the 5K mark, save to file
        with open(f'text_{file_count}.txt', 'w', encoding='utf-8') as fp:
            fp.write('\n'.join(text_data))
        text_data = []
        file_count += 1
# after saving in 5K chunks, we will have ~3808 leftover samples, we save those now too
with open(f'text_{file_count}.txt', 'w', encoding='utf-8') as fp:
    fp.write('\n'.join(text_data))


100%|██████████| 18808/18808 [00:06<00:00, 2941.51it/s]


## Build a Custom Transformer Tokenizer

In [7]:
from pathlib import Path
import os

In [8]:
paths = [str(x) for x in Path('').glob('**/*.txt')]
paths

['text_0.txt', 'text_1.txt', 'text_2.txt', 'text_3.txt', 'bertius\\merges.txt']

In [19]:
%pip install tokenizers

Collecting tokenizers
  Downloading tokenizers-0.12.1-cp38-cp38-win_amd64.whl (3.3 MB)
     ---------------------------------------- 3.3/3.3 MB 10.0 MB/s eta 0:00:00
Installing collected packages: tokenizers
Successfully installed tokenizers-0.12.1
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\elsha\anaconda3\python.exe -m pip install --upgrade pip' command.


In [9]:
from tokenizers import ByteLevelBPETokenizer

In [10]:
tokenizer = ByteLevelBPETokenizer()

In [11]:
tokenizer.train(files=paths, vocab_size=30_522, min_frequency=2,
                    special_tokens=['<s>', '<pad>', '</s>', '<unk>', '<mask>'])

In [12]:
os.mkdir('bertius')

tokenizer.save_model('bertius')

FileExistsError: [WinError 183] Cannot create a file when that file already exists: 'bertius'

#### Using the tokenizer

In [25]:
%pip install transformers

Collecting transformers
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
     ---------------------------------------- 4.7/4.7 MB 9.9 MB/s eta 0:00:00
Installing collected packages: transformers
Successfully installed transformers-4.21.1
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\elsha\anaconda3\python.exe -m pip install --upgrade pip' command.


In [13]:
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained('bertius') # load the tokenizer

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'RobertaTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'RobertaTokenizerFast'.


In [14]:
lorem_ipsum = (
    "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor "
    "incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud "
    "exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute "
    "irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla "
    "pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia "
    "deserunt mollit anim id est laborum."
)

In [15]:
tokenizer(lorem_ipsum, max_length=512, padding='max_length', truncation=True)

{'input_ids': [0, 3587, 653, 1601, 461, 1788, 16, 2618, 3714, 3088, 16, 398, 3702, 13754, 3727, 16099, 330, 2219, 290, 1914, 1547, 1650, 18, 1376, 412, 320, 10178, 1931, 16, 632, 13322, 23666, 2438, 6332, 9089, 691, 330, 20864, 350, 507, 7542, 10803, 18, 15644, 380, 73, 2920, 2650, 1601, 285, 2068, 285, 1604, 1256, 361, 17171, 1914, 2514, 2074, 1089, 2524, 18, 15442, 28801, 909, 24536, 30305, 312, 20856, 16, 338, 285, 2527, 366, 2573, 3045, 17797, 581, 462, 297, 3562, 18, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

## Building MLM Training Input Pipeline

In [17]:
with open('text_0.txt', 'r', encoding='utf-8') as fp:
    lines = fp.read().split('\n')

In [18]:
batch = tokenizer(lines, max_length=512, padding='max_length', truncation=True)
len(batch)

2

Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [22]:
import torch

def mlm(tensor):
    rand = torch.rand(tensor.shape)
    mask_arr = rand < 0.15 * (tensor > 2)
    for i in range(tensor.shape[0]):
        selection = torch.flatten(mask_arr[i].nonzero())
        tensor[i, selection] = 4
    return tensor

In [23]:
from pathlib import Path

paths = [str(x) for x in Path('').glob('*.txt')]
paths[:5]

['text_0.txt', 'text_1.txt', 'text_2.txt', 'text_3.txt']

In [28]:
from tqdm.auto import tqdm

input_ids = []
mask = []
labels = []

for path in tqdm(paths):
    with open(path, 'r', encoding='utf-8') as fp:
        lines = fp.read().split('\n')
    sample = tokenizer(lines, max_length=512, padding='max_length', truncation=True, return_tensors='pt')
    # input_ids.append(batch['input_ids'])
    # mask.append(batch['attention_mask'])
    labels.append(sample.input_ids)
    mask.append(sample.attention_mask)
    input_ids.append(mlm(sample.input_ids.detach().clone()))

100%|██████████| 4/4 [00:15<00:00,  3.80s/it]


In [33]:
input_ids = torch.cat(input_ids)
mask = torch.cat(mask)
labels = torch.cat(labels)

In [34]:
input_ids[0][:10]

tensor([    0,    44,   837,   338,  7598, 21561,    30, 21561,   610, 14601])

In [35]:
encodings = {
    'input_ids': input_ids,
    'attention_mask': mask,
    'labels': labels
}

In [30]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encoding):
        self.encoding = encoding
    def __len__(self):
        return self.encoding['input_ids'].shape[0]
    def __getitem__(self, idx):
        return {key: tensor[idx] for key, tensor in self.encoding.items()}

In [36]:
dataset = Dataset(encodings)

In [37]:
dataloader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)

## Training and Testing Latin BERT

In [38]:
from transformers import RobertaConfig
config =  RobertaConfig(
    vocab_size=tokenizer.vocab_size,
    max_position_embeddings=514,
    hidden_size=768,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

In [39]:
from transformers import RobertaForMaskedLM

In [40]:
model = RobertaForMaskedLM(config)

In [42]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cpu')

In [43]:
model

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNor