The pre-processing code was obtained from: https://github.com/jamescalam/transformers/blob/main/course/training/04_mlm_training_Trainer.ipynb

The code was then amended to run on a TPU and to split the data before pre-processing it, using a function that wraps the code above, so that very large amounts of data can be handled.

The post-processing code was written from scratch, inspired by https://huggingface.co/learn/nlp-course/chapter3/3?fw=pt


## Requirements

In [2]:
import transformers
from transformers import BertTokenizer, BertForMaskedLM, Trainer, TrainingArguments
import torch
from tqdm import tqdm
import os
import torch

## Logging

In [None]:
# Log this session in to Weights & Biases from the shell; the bracketed text
# is a placeholder for the API key.
# NOTE(review): avoid committing a real key inside the notebook.
! wandb login [API KEY GOES HERE]

In [None]:
import wandb
# Open the Weights & Biases run used for logging, in the 'fyp' project and
# with resuming enabled.
wandb.init(
    project='fyp',
    resume=True,
)

## Setting the seeds

In [3]:
# Single fixed seed reused below (it is also passed to TrainingArguments) so
# that repeated runs draw the same random numbers.
seed = 1043423
torch.manual_seed(seed)  # seed PyTorch's global random number generator
transformers.set_seed(seed)  # seed via the transformers helper

## Loading the corpus

In [None]:
# adaptation of the LineByLineTextDataset from:
# https://github.com/huggingface/transformers/blob/main/src/transformers/data/datasets/language_modeling.py#

class DatasetĠurnalistiku(torch.utils.data.Dataset):
    """Line-by-line dataset built from a UTF-8 text file.

    Every line of the file that is neither empty nor whitespace-only becomes
    one example. All examples are tokenised eagerly in the constructor with
    ``tokenizer.encode`` (special tokens added, truncated and padded to
    ``max_length``) and kept in memory as ``{'input_ids': LongTensor}``
    dictionaries.

    Args:
        file_path: Path of the corpus file; it is read as UTF-8.
        tokenizer: Object exposing an ``encode`` method that accepts the
            keyword arguments used below (presumably a Hugging Face
            tokenizer).
        max_length: Length every example is truncated/padded to. Defaults to
            512, the value that used to be hard-coded.
    """

    def __init__(self, file_path, tokenizer, max_length=512):
        super().__init__()
        self.tokenizer = tokenizer
        # Read through a context manager so the file handle is always closed,
        # instead of leaving it to the garbage collector.
        with open(file_path, 'r', encoding='utf-8') as corpus_file:
            raw_lines = corpus_file.read().splitlines()
        # Drop empty and whitespace-only lines; trim the ones that are kept.
        self.lines = [line.strip() for line in raw_lines if len(line) > 0 and not line.isspace()]
        print(f"Dataset has {len(self.lines)} lines")
        self.examples = []
        for line in tqdm(self.lines):
            example = tokenizer.encode(
                line,
                add_special_tokens=True,
                truncation=True,
                max_length=max_length,
                padding='max_length',
            )
            self.examples.append({'input_ids': torch.tensor(example, dtype=torch.long)})

    def __len__(self):
        """Return the number of tokenised examples."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return the ``{'input_ids': tensor}`` dictionary stored at index ``i``."""
        return self.examples[i]

In [None]:
# Load the tokenizer published under 'MLRS/BERTu', then tokenise the whole
# corpus file with it.
tokenizer = BertTokenizer.from_pretrained('MLRS/BERTu')
dataset = DatasetĠurnalistiku(
    file_path="Korpus_Ġurnalistiku.txt",
    tokenizer=tokenizer,
)

In [None]:
# Randomly partition the dataset: 85% of the examples (rounded down) go to
# training and the remainder to evaluation.
print(f"Splitting {len(dataset)} examples into 85-15% train-test")
train_size = int(0.85 * len(dataset))
test_size = len(dataset) - train_size
train_split, test_split = torch.utils.data.random_split(
    dataset,
    lengths=[train_size, test_size],
)

## Utilities

In [None]:
def get_last_checkpoint(folder_path):
    """Return the path of the highest-numbered ``checkpoint-<step>`` directory.

    Only directories whose name is exactly ``checkpoint-`` followed by decimal
    digits are considered. Other entries (for example ``checkpoint-tmp`` or a
    plain file called ``checkpoint-5``) are ignored rather than raising
    ``ValueError`` or being returned as a checkpoint.

    Args:
        folder_path: Directory to scan; it must exist.

    Returns:
        ``os.path.join(folder_path, name)`` for the entry with the largest
        step number, or ``None`` when there is no matching directory.
    """
    prefix = "checkpoint-"
    step_by_name = {}
    for name in os.listdir(folder_path):
        if not name.startswith(prefix):
            continue
        step_text = name[len(prefix):]
        # isdecimal() guarantees that int() below cannot fail.
        if step_text.isdecimal() and os.path.isdir(os.path.join(folder_path, name)):
            step_by_name[name] = int(step_text)

    if not step_by_name:
        return None

    # Compare numerically so that checkpoint-10 beats checkpoint-2.
    last_checkpoint = max(step_by_name, key=step_by_name.get)
    return os.path.join(folder_path, last_checkpoint)

## Intermediate pre-training

In [None]:
# Choose the CUDA device when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the 'MLRS/BERTu' masked-language-model weights and move them to the
# chosen device.
model = BertForMaskedLM.from_pretrained('MLRS/BERTu').to(device)

In [None]:
# Display the name of CUDA device 0. On a machine without CUDA, show "cpu"
# instead of raising; the notebook already falls back to the CPU when
# selecting the device.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu"

In [None]:
from transformers import DataCollatorForLanguageModeling

# Batch collator with masked-language-modelling enabled, a masking
# probability of 0.15 and PyTorch tensors as output.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=0.15,
    mlm=True,
    return_tensors="pt",
)

In [None]:
# Training configuration: checkpoints go to 'checkpoints', evaluation runs
# once per epoch, and the best model is reloaded when training finishes.
args = TrainingArguments(
    output_dir='checkpoints',
    evaluation_strategy="epoch",
    # load_best_model_at_end requires the save strategy to match the
    # evaluation strategy; the default step-based saving is rejected.
    save_strategy="epoch",
    per_device_train_batch_size=16,
    num_train_epochs=3,
    seed=seed,
    load_best_model_at_end=True
)

In [None]:
# Bundle the model, the training configuration, both data splits and the
# masking collator into one Trainer.
trainer = Trainer(
    args=args,
    model=model,
    data_collator=data_collator,
    train_dataset=train_split,
    eval_dataset=test_split,
)

In [None]:
# Resume from the newest checkpoint when the checkpoint directory has content,
# otherwise train from the start. The isdir() guard keeps os.listdir() from
# raising FileNotFoundError if the directory does not exist yet.
checkpoint_dir = "checkpoints"
if os.path.isdir(checkpoint_dir) and os.listdir(checkpoint_dir):
    trainer.train(resume_from_checkpoint=get_last_checkpoint(checkpoint_dir))
else:
    trainer.train()

## Save the model

In [None]:
# Write the trained model into the "BERTu Ġurnalistiku" directory.
trainer.save_model(output_dir="BERTu Ġurnalistiku")

In [None]:
# Store the tokenizer files in the same directory as the model.
tokenizer.save_pretrained(save_directory='BERTu Ġurnalistiku')