This code was inspired by: <br>
https://huggingface.co/blog/how-to-train <br>
https://huggingface.co/blog/pretraining-bert <br>
https://www.kaggle.com/code/arnabs007/pretrain-a-bert-language-model-from-scratch/notebook <br>
https://towardsdatascience.com/fine-tuning-pretrained-nlp-models-with-huggingfaces-trainer-6326a4456e7b

## Requirements

In [None]:
from datasets import *
from transformers import *
from tokenizers import *
import os
import json
import torch
from tqdm import tqdm
import wandb

In [None]:
# Authenticate with Weights & Biases without pasting the API key into the
# notebook. wandb.login() is expected to pick the key up from the environment
# (WANDB_API_KEY) / stored credentials or to prompt for it — confirm against
# the installed wandb version.
wandb.login()

## Setting the seed

In [None]:
# Single seed shared by this notebook: it is reused below for the
# train/test split and passed to TrainingArguments.
seed = 264806
torch.manual_seed(seed) # seeds torch's global RNG
set_seed(seed) # seeding helper, presumably transformers.set_seed via the star import

## Train a tokenizer

In [None]:
# Load the reference "bert-base-uncased" tokenizer and record its settings.
# bert_vocab is reused below as the target vocabulary size when training
# the new tokenizer.
bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
bert_st = bert_tokenizer.all_special_tokens
bert_vocab = bert_tokenizer.vocab_size
bert_max = bert_tokenizer.model_max_length

In [None]:
# Show the special tokens, vocabulary size and maximum length recorded from
# the reference tokenizer.
print(
    f"""
    BERT's tokenizer has:
    - Special Tokens -> {bert_st}
    - Vocab Size -> {bert_vocab}
    - Max length -> {bert_max}
    """
)

In [None]:
# Train a new tokenizer on the corpus, but only when no tokenizer has been
# saved under tokenizer_path yet (re-running the cell is then a no-op).
tokenizer_path = "BertTokenizer_from_Scratch"
if not os.path.exists(tokenizer_path):
    # Read the corpus inside a context manager so the file handle is closed
    # as soon as the text is in memory.
    with open('FINAL_CORPUS_SEED_264806.txt', 'r', encoding='utf-8') as corpus_file:
        raw_lines = corpus_file.read().splitlines()
    # Keep only lines with real content, stripped of surrounding whitespace.
    lines = [line.strip() for line in raw_lines if (len(line) > 0 and not line.isspace())]

    def batch_iterator(batch_size=10000):
        """Yield the corpus lines in consecutive slices of ``batch_size`` lines."""
        for i in tqdm(range(0, len(lines), batch_size)):
            yield lines[i : i + batch_size]

    # Start from the bert-base-uncased fast tokenizer and retrain it on the
    # corpus, asking for the same vocabulary size as the reference tokenizer.
    base_tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
    new_tokenizer = base_tokenizer.train_new_from_iterator(text_iterator=batch_iterator(), vocab_size=bert_vocab)
    new_tokenizer.save_pretrained(tokenizer_path)

## Load the corpus

In [None]:
# adaptation of the LineByLineTextDataset from:
# https://github.com/huggingface/transformers/blob/main/src/transformers/data/datasets/language_modeling.py
class DownsizedDataset(torch.utils.data.Dataset):
    """Line-by-line text dataset that tokenizes the whole file eagerly.

    Every non-blank line of ``file_path`` becomes one example of the form
    ``{'input_ids': LongTensor}``, truncated and padded to
    ``tokenizer.model_max_length``.

    Args:
        file_path: Path to a UTF-8 text file with one example per line.
        tokenizer: Object exposing ``encode(...)`` and ``model_max_length``
            (a Hugging Face tokenizer in this notebook).
    """

    def __init__(self, file_path, tokenizer):
        super().__init__()
        self.tokenizer = tokenizer
        # Read inside a context manager so the file handle is released
        # immediately instead of lingering until garbage collection.
        with open(file_path, 'r', encoding='utf-8') as corpus_file:
            raw_lines = corpus_file.read().splitlines()
        # Drop empty / whitespace-only lines and trim the rest.
        self.lines = [line.strip() for line in raw_lines if (len(line) > 0 and not line.isspace())]
        print(f"Dataset has {len(self.lines)} lines")

        max_length = self.tokenizer.model_max_length
        self.examples = []
        for line in tqdm(self.lines):
            # NOTE(review): every example is padded to max_length here and no
            # attention mask is stored; presumably the collator then cannot
            # tell padding from real tokens — confirm, and consider dropping
            # padding='max_length' so batches are padded dynamically.
            example = self.tokenizer.encode(
                line,
                add_special_tokens=True,
                truncation=True,
                max_length=max_length,
                padding='max_length',
            )
            self.examples.append({'input_ids': torch.tensor(example, dtype=torch.long)})

    def __len__(self):
        """Return the number of tokenized examples."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return the ``{'input_ids': tensor}`` dict stored at index ``i``."""
        return self.examples[i]

In [None]:
# Load the tokenizer stored under tokenizer_path as a fast BERT tokenizer;
# this is the tokenizer used for the dataset, the collator and the export.
tokenizer_path = "BertTokenizer_from_Scratch"
tokenizer = BertTokenizerFast.from_pretrained(tokenizer_path)

In [None]:
# Build the dataset from the corpus file; DownsizedDataset encodes every
# line inside its constructor, so this cell does all the tokenization.
dataset = DownsizedDataset("FINAL_CORPUS_SEED_264806.txt", tokenizer)

In [None]:
# 85/15 split of the dataset; the global torch RNG is re-seeded right before
# random_split so the split does not depend on earlier random draws.
n_examples = len(dataset)
print(f"Splitting {n_examples} examples into 85-15% train-test")
torch.manual_seed(seed)
train_size = int(n_examples * 0.85)
test_size = n_examples - train_size
train_split, test_split = torch.utils.data.random_split(
    dataset,
    [train_size, test_size],
)

In [None]:
# Report the sizes of the two subsets (test_split serves as the validation set).
print(f"Training has {len(train_split)} examples \nValidation has {len(test_split)} examples")

In [None]:
# Display the first training example as the cell output (sanity check).
train_split[0]

## Utilities

In [None]:
def get_last_checkpoint(folder_path):
    """Return the path of the highest-numbered ``checkpoint-<step>`` directory.

    Args:
        folder_path: Directory expected to contain ``checkpoint-<step>``
            sub-directories.

    Returns:
        ``os.path.join(folder_path, name)`` for the checkpoint with the
        largest step number, or ``None`` when ``folder_path`` does not exist
        or contains no usable checkpoint directory.
    """

    def _checkpoint_step(name):
        """Return the integer step encoded in ``name`` or None if it has none."""
        if not name.startswith("checkpoint-"):
            return None
        try:
            return int(name.split("-")[1])
        except ValueError:
            # e.g. "checkpoint-tmp": not a numbered checkpoint, ignore it.
            return None

    try:
        entries = os.listdir(folder_path)
    except FileNotFoundError:
        # Nothing has been written yet, so there is nothing to resume from.
        return None

    steps = {}
    for name in entries:
        step = _checkpoint_step(name)
        # Only real directories count; stray files are skipped.
        if step is not None and os.path.isdir(os.path.join(folder_path, name)):
            steps[name] = step

    if not steps:
        return None

    # max() finds the newest checkpoint without sorting the whole list.
    last_checkpoint = max(steps, key=steps.get)
    return os.path.join(folder_path, last_checkpoint)

## Training

In [None]:
# Collator used by the Trainer to assemble batches: configured for masked
# language modelling (mlm=True) with mlm_probability=0.15, returning
# PyTorch tensors.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
    return_tensors="pt"
)

In [None]:
# Prefer the GPU when one is visible; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# torch.cuda.get_device_name only accepts CUDA devices, so querying it with
# the CPU fallback would raise; report the fallback explicitly instead.
if device.type == "cuda":
    print(torch.cuda.get_device_name(device))
else:
    print("No CUDA device available; using CPU")

In [None]:
# Build a BERT masked-LM from a default BertConfig (constructed from the
# config, not from pretrained weights) and move it to the selected device.
# NOTE(review): the model keeps BertConfig's default vocabulary size —
# confirm it is at least as large as the trained tokenizer's vocabulary.
configuration = BertConfig()
model = BertForMaskedLM(config=configuration)
model = model.to(device)

In [None]:
# Display the model configuration as the cell output.
configuration

In [None]:
# Training configuration: 3 epochs, batch size 16 per device for both
# training and evaluation, evaluation once per epoch, a checkpoint every
# 10000 steps under "checkpoints", metrics reported to Weights & Biases.
# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` — confirm against the installed
# version.
training_args = TrainingArguments(
    output_dir="checkpoints",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="steps",
    save_steps=10000,
    seed=seed,
    report_to='wandb',
    logging_strategy='steps',
    run_name="Downsized BERT"
)

In [None]:
# Wire model, arguments, collator and the two dataset splits into a Trainer;
# test_split is used as the evaluation set.
trainer = Trainer(
    model = model,
    args=training_args,
    data_collator=data_collator,
    eval_dataset=test_split,
    train_dataset=train_split,
)

In [None]:
# Resume from the newest checkpoint when the output directory already holds
# something; otherwise start a fresh run. The isdir guard covers the case
# where "checkpoints" has not been created yet (os.listdir would raise
# FileNotFoundError on a missing directory).
if os.path.isdir("checkpoints") and os.listdir("checkpoints"):
    trainer.train(resume_from_checkpoint=get_last_checkpoint("checkpoints"))
else:
    trainer.train()

## Save the model

In [None]:
# Print every evaluation loss recorded in the trainer state that was saved
# with the newest checkpoint.
last_checkpoint = get_last_checkpoint("checkpoints")
print("VALIDATION LOSSES")
with open(f"{last_checkpoint}/trainer_state.json", 'r') as f:
    trainer_state = json.load(f)
# Only log entries produced by an evaluation pass carry 'eval_loss'.
eval_entries = [entry for entry in trainer_state['log_history'] if 'eval_loss' in entry]
for e in eval_entries:
    print(f"Epoch {e['epoch']} -> {e['eval_loss']}")

In [None]:
# Export the chosen checkpoint together with the tokenizer into model_path.
# NOTE: the string passed to from_pretrained is a placeholder and must be
# replaced by the path of the checkpoint to export (e.g. the one with the
# lowest validation loss printed above) before running this cell.
model_path = "Downsized_BERT"
best_checkpoint_model = BertForMaskedLM.from_pretrained('last checkpoint path goes here')
best_checkpoint_model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

## Test with pipeline

In [None]:
# Reload the exported model and tokenizer from model_path and wrap them in a
# fill-mask pipeline. Note that `model` and `tokenizer` are rebound here.
model = BertForMaskedLM.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)
fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)

In [None]:
# Smoke test: ask the model to fill the [MASK] token in a sample sentence.
fill_mask("The [MASK] was cloudy yesterday, but today it's rainy.")