In [None]:
# Import necessary libraries and packages
import os
import requests
from zipfile import ZipFile

# Install required Python packages using pip
!pip install transformers datasets tokenizers

# Define the URL for downloading the Cornell Movie Dialogs Corpus
data_url = "http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip"

# Define a function to download and extract the dataset
def download_and_extract_data(url, target_dir):
    """Download a ZIP archive and extract it into ``target_dir``.

    The archive is saved temporarily as ``cornell_movie_dialogs_corpus.zip``
    inside ``target_dir`` and is deleted again once extraction has finished
    (or failed).

    :param url: HTTP(S) address of the ZIP archive.
    :param target_dir: Directory to extract into; created if missing.
    :raises requests.HTTPError: If the server answers with an error status.
    :raises zipfile.BadZipFile: If the downloaded file is not a valid ZIP.
    """
    # exist_ok avoids the check-then-create race and makes re-runs harmless
    os.makedirs(target_dir, exist_ok=True)

    # Define the file path for saving the downloaded ZIP file
    zip_file_path = os.path.join(target_dir, "cornell_movie_dialogs_corpus.zip")

    # Stream the download to disk instead of holding the whole archive in memory.
    # The timeout keeps a stalled connection from hanging the notebook forever.
    with requests.get(url, stream=True, timeout=60) as response:
        # Fail loudly on 4xx/5xx instead of writing an error page to disk
        response.raise_for_status()
        with open(zip_file_path, "wb") as zip_file:
            for chunk in response.iter_content(chunk_size=1 << 20):
                zip_file.write(chunk)

    try:
        # Extract the downloaded ZIP file
        with ZipFile(zip_file_path, "r") as zip_ref:
            zip_ref.extractall(target_dir)
    finally:
        # Remove the downloaded ZIP file even if extraction failed
        os.remove(zip_file_path)

# Specify the target directory for storing the dataset
target_directory = "./datasets"

# Download and extract the Cornell Movie Dialogs Corpus
download_and_extract_data(data_url, target_directory)

# Move the necessary dataset files up into the datasets directory.
# os.replace overwrites an existing destination on every platform, so this
# cell can be re-run after a previous successful run.
extracted_directory = os.path.join(target_directory, "cornell movie-dialogs corpus")
for corpus_file in ("movie_conversations.txt", "movie_lines.txt"):
    os.replace(
        os.path.join(extracted_directory, corpus_file),
        os.path.join(target_directory, corpus_file),
    )


Collecting transformers
  Downloading transformers-4.32.1-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m58.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
Collecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux

In [None]:
# Necessary Libraries
import ast
import itertools
import math
import os
import random
import re
from pathlib import Path

import numpy as np
import torch
import torch.nn.functional as F
import tqdm
import transformers, datasets
from tokenizers import BertWordPieceTokenizer
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer

In [None]:
# Define the maximum length (in whitespace-separated words) for text sequences
MAX_LEN = 64

# Define file paths for the movie conversation and lines data
corpus_movie_conv = './datasets/movie_conversations.txt'
corpus_movie_lines = './datasets/movie_lines.txt'

# Field separator used on every record of both corpus files
FIELD_SEP = " +++$+++ "

# Read the conversation and lines data into memory
with open(corpus_movie_conv, 'r', encoding='iso-8859-1') as conv_file:
    conversation_data = conv_file.readlines()
with open(corpus_movie_lines, 'r', encoding='iso-8859-1') as lines_file:
    lines_data = lines_file.readlines()

# Map each line ID (first field) to its utterance text (last field)
lines_dict = {}
for line in lines_data:
    line_parts = line.split(FIELD_SEP)
    lines_dict[line_parts[0]] = line_parts[-1].strip()

# Initialize a list to store question-answer pairs
pairs = []

# Iterate through conversations to create question-answer pairs
for conversation in conversation_data:
    # Skip blank records rather than failing on them
    if not conversation.strip():
        continue

    # The last field is a list literal of line IDs. literal_eval parses it
    # without executing arbitrary code, unlike eval.
    conversation_ids = ast.literal_eval(conversation.split(FIELD_SEP)[-1].strip())

    # Pair every utterance with the one that follows it in the conversation
    for first_id, second_id in zip(conversation_ids, conversation_ids[1:]):
        first_line = lines_dict[first_id].strip()
        second_line = lines_dict[second_id].strip()

        # Truncate each side to MAX_LEN words before storing the pair
        pairs.append([
            ' '.join(first_line.split()[:MAX_LEN]),
            ' '.join(second_line.split()[:MAX_LEN]),
        ])

# Example: Print a sample question-answer pair
print(pairs[30])


["Then that's all you had to say.", 'But']


In [None]:
# Create a directory to store the data (exist_ok so re-running the cell works)
os.makedirs('./data', exist_ok=True)

# Initialize lists to store text data and a count for the saved files
text_data = []
file_count = 0

# Iterate through the text samples and save them to files
for sample in tqdm.tqdm([x[0] for x in pairs]):
    text_data.append(sample)

    # Check if we have accumulated 10,000 samples, then save to a file
    if len(text_data) == 10000:
        with open(f'./data/text_{file_count}.txt', 'w', encoding='utf-8') as file:
            file.write('\n'.join(text_data))
        text_data = []  # Reset the text_data list
        file_count += 1

# Flush the final partial batch. Without this, the samples left over after the
# last full batch of 10,000 are silently dropped from the tokenizer corpus.
if text_data:
    with open(f'./data/text_{file_count}.txt', 'w', encoding='utf-8') as file:
        file.write('\n'.join(text_data))
    text_data = []
    file_count += 1

# Get a list of file paths for the saved text files
file_paths = [str(file_path) for file_path in Path('./data').glob('**/*.txt')]

# Print the total number of saved files (includes the final partial file)
print(len(file_paths))


100%|██████████| 221616/221616 [00:00<00:00, 1535818.35it/s]

22





In [None]:
# Initialize a BertWordPieceTokenizer with specified settings
tokenizer = BertWordPieceTokenizer(
    clean_text=True,              # Clean text
    handle_chinese_chars=False,   # Do not handle Chinese characters specially
    strip_accents=False,         # Do not strip accents
    lowercase=True               # Convert text to lowercase
)

# Train the tokenizer using the specified parameters
tokenizer.train(
    files=file_paths,                 # List of files containing text data
    vocab_size=30_000,           # Vocabulary size
    min_frequency=5,             # Minimum frequency for a word to be included in the vocabulary
    limit_alphabet=1000,         # Limit the alphabet size
    wordpieces_prefix='##',      # Prefix for wordpieces
    # [PAD] is listed first; the rest of the notebook relies on its ID being 0
    # (padding_idx=0, ignore_index=0, and the `> 0` attention mask).
    special_tokens=['[PAD]', '[CLS]', '[SEP]', '[MASK]', '[UNK]']  # Special tokens
)

# Create a directory to save the trained tokenizer.
# exist_ok lets this cell be re-run without raising FileExistsError.
os.makedirs('./bert-it-1', exist_ok=True)

# Save the trained tokenizer to the specified directory
tokenizer.save_model('./bert-it-1', 'bert-it')

# Load the tokenizer from the saved model
tokenizer = BertTokenizer.from_pretrained('./bert-it-1/bert-it-vocab.txt', local_files_only=True)

# Tokenize a sample text and print token IDs and tokens
token_ids = tokenizer('I like surfboarding!')['input_ids']
print(token_ids)
print(tokenizer.convert_ids_to_tokens(token_ids))


[1, 48, 250, 4033, 3588, 154, 5, 2]
['[CLS]', 'i', 'like', 'surf', '##board', '##ing', '!', '[SEP]']




In [None]:
class BERTDataset(Dataset):
    """Dataset producing masked-LM / next-sentence training examples.

    Each element of ``data_pair`` is a ``[first_sentence, second_sentence]``
    pair of strings. ``tokenizer`` must be callable on a string (returning a
    mapping with ``'input_ids'``) and expose a ``vocab`` mapping containing
    the ``[PAD]``, ``[CLS]``, ``[SEP]`` and ``[MASK]`` tokens.
    """

    def __init__(self, data_pair, tokenizer, seq_len=64):
        self.lines = data_pair
        self.corpus_lines = len(data_pair)
        self.seq_len = seq_len
        self.tokenizer = tokenizer

    def __len__(self):
        return self.corpus_lines

    def __getitem__(self, item):
        """Build one example as a dict of tensors.

        Keys: ``bert_input`` (token IDs), ``bert_label`` (original IDs at the
        selected positions, 0 elsewhere), ``segment_label`` (1 for the first
        sentence, 2 for the second, padded with the [PAD] ID) and ``is_next``.
        The three sequences are truncated/padded to ``seq_len``.
        """
        # 1) Sentence pair, either the true continuation or a random one
        first, second, is_next_label = self.get_sent(item)

        # 2) Corrupt both sentences and collect the prediction targets
        first_ids, first_targets = self.random_word(first)
        second_ids, second_targets = self.random_word(second)

        vocab = self.tokenizer.vocab
        pad_id = vocab['[PAD]']
        cls_id = vocab['[CLS]']
        sep_id = vocab['[SEP]']

        # 3) Frame with [CLS]/[SEP]; the frame positions get [PAD] as target
        first_ids = [cls_id, *first_ids, sep_id]
        second_ids = [*second_ids, sep_id]
        first_targets = [pad_id, *first_targets, pad_id]
        second_targets = [*second_targets, pad_id]

        # 4) Concatenate, cut to seq_len, then right-pad all three sequences
        limit = self.seq_len
        segment_label = ([1] * len(first_ids) + [2] * len(second_ids))[:limit]
        bert_input = (first_ids + second_ids)[:limit]
        bert_label = (first_targets + second_targets)[:limit]

        filler = [pad_id] * (limit - len(bert_input))
        bert_input = bert_input + filler
        bert_label = bert_label + filler
        segment_label = segment_label + filler

        example = {
            "bert_input": bert_input,
            "bert_label": bert_label,
            "segment_label": segment_label,
            "is_next": is_next_label,
        }
        return {name: torch.tensor(values) for name, values in example.items()}

    def random_word(self, sentence):
        """Corrupt ``sentence`` for masked-LM training.

        Every whitespace-separated word is selected with probability 0.15.
        A selected word has all of its word pieces replaced by [MASK]
        (80%), by random vocabulary IDs (10%) or left as they are (10%).

        Returns ``(ids, labels)`` of equal length: ``labels`` holds the
        original piece IDs for selected words and 0 for all other pieces.
        """
        corrupted = []
        targets = []

        for word in sentence.split():
            draw = random.random()

            # Tokenize the single word and drop the surrounding [CLS]/[SEP]
            piece_ids = self.tokenizer(word)['input_ids'][1:-1]

            if draw >= 0.15:
                # Not selected: keep the pieces, nothing to predict
                corrupted.extend(piece_ids)
                targets.extend(0 for _ in piece_ids)
                continue

            # Rescale the draw to [0, 1) to choose the corruption type
            draw /= 0.15
            if draw < 0.8:
                corrupted.extend(self.tokenizer.vocab['[MASK]'] for _ in piece_ids)
            elif draw < 0.9:
                corrupted.extend(
                    random.randrange(len(self.tokenizer.vocab)) for _ in piece_ids
                )
            else:
                corrupted.extend(piece_ids)

            targets.extend(piece_ids)

        assert len(corrupted) == len(targets)
        return corrupted, targets

    def get_sent(self, index):
        '''Return (first, second, label): the real pair with label 1, or a random second sentence with label 0.'''
        first, second = self.get_corpus_line(index)

        if random.random() > 0.5:
            return first, second, 1
        return first, self.get_random_line(), 0

    def get_corpus_line(self, item):
        '''Return both sentences of the pair stored at position ``item``.'''
        pair = self.lines[item]
        return pair[0], pair[1]

    def get_random_line(self):
        '''Return the second sentence of a randomly chosen pair.'''
        chosen = random.randrange(len(self.lines))
        return self.lines[chosen][1]


In [None]:
# Create a BERTDataset instance for training data
# Wrap the sentence pairs in the pre-training dataset
train_data = BERTDataset(data_pair=pairs, tokenizer=tokenizer, seq_len=MAX_LEN)

# Batch and shuffle the examples
train_loader = DataLoader(dataset=train_data, batch_size=32, shuffle=True, pin_memory=True)

# Pull one batch; later demo cells reuse it as `sample_data`
sample_data = next(iter(train_loader))
print(f"Batch Size: {sample_data['bert_input'].size()}")

# Fetch a single randomly chosen example straight from the dataset
random_index = random.randrange(len(train_data))
result = train_data[random_index]

# Display the example
result


Batch Size: torch.Size([32, 64])


{'bert_input': tensor([    1, 19137,  1056,     3,   408,  1083,    17,     2,   335,    16,
           179,   182,    11,    58,   243, 11307,     3,     3,     3,     2,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0]),
 'bert_label': tensor([  0,  48,   0, 266,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0, 253, 162,  34,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0]),
 'segment_label': tensor([1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 

In [None]:
class PositionalEmbedding(torch.nn.Module):
    """Fixed (non-trainable) sinusoidal positional encodings.

    Column pair ``(2k, 2k + 1)`` of position ``pos`` holds
    ``sin(pos / 10000^(2k / d_model))`` and ``cos(pos / 10000^(2k / d_model))``.

    :param d_model: Width of each encoding vector (odd widths are supported).
    :param max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, max_len=128):
        super().__init__()

        positions = torch.arange(max_len, dtype=torch.float).unsqueeze(1)

        # One frequency per (sin, cos) pair. The column index already steps by
        # two, so the exponent is index / d_model and is shared by both columns.
        inv_freq = torch.exp(
            torch.arange(0, d_model, 2, dtype=torch.float) * (-math.log(10000.0) / d_model)
        )
        angles = positions * inv_freq

        pe = torch.zeros(max_len, d_model, dtype=torch.float)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one cos column fewer than sin columns
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # A buffer follows the module through .to(device) / .cuda().
        # persistent=False keeps it out of state_dict, as the plain attribute was.
        # The leading dimension of 1 broadcasts over the batch.
        self.register_buffer("pe", pe.unsqueeze(0), persistent=False)

    def forward(self, x):
        """Return encodings of shape ``(1, seq, d_model)`` for input ``x``.

        ``seq`` is ``x.size(1)`` capped at ``max_len``, so sequences shorter
        than ``max_len`` broadcast correctly. Inputs with fewer than two
        dimensions get the full table.
        """
        if x.dim() < 2:
            return self.pe
        return self.pe[:, : x.size(1)]

class BERTEmbedding(torch.nn.Module):
    """Input embedding: token + positional + segment embeddings, then dropout.

    :param vocab_size: Number of rows in the token embedding table.
    :param embed_size: Width of every embedding vector.
    :param seq_len: Number of positions handed to the positional embedding.
    :param dropout: Dropout probability applied to the summed embedding.
    """

    def __init__(self, vocab_size, embed_size, seq_len=64, dropout=0.1):
        super().__init__()
        self.embed_size = embed_size

        # Index 0 is the padding row in both lookup tables
        self.token = torch.nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embed_size, padding_idx=0
        )
        # Three segment IDs: 0 (padding), 1 and 2
        self.segment = torch.nn.Embedding(
            num_embeddings=3, embedding_dim=embed_size, padding_idx=0
        )
        self.position = PositionalEmbedding(d_model=embed_size, max_len=seq_len)
        self.dropout = torch.nn.Dropout(p=dropout)

    def forward(self, sequence, segment_label):
        """Embed ``sequence`` (token IDs) using ``segment_label`` (segment IDs)."""
        token_part = self.token(sequence)
        position_part = self.position(sequence)
        segment_part = self.segment(segment_label)
        return self.dropout(token_part + position_part + segment_part)

# Smoke-test the embedding layer on the sample batch
embed_layer = BERTEmbedding(len(tokenizer.vocab), 768, seq_len=MAX_LEN)
embed_result = embed_layer(
    sequence=sample_data['bert_input'],
    segment_label=sample_data['segment_label'],
)
print(embed_result.shape)


torch.Size([32, 64, 768])


In [None]:
class MultiHeadedAttention(torch.nn.Module):
    """Multi-head scaled dot-product attention.

    :param heads: Number of attention heads; must divide ``d_model``.
    :param d_model: Width of the input and output features.
    :param dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, heads, d_model, dropout=0.1):
        super().__init__()

        assert d_model % heads == 0
        self.d_k = d_model // heads
        self.heads = heads
        self.dropout = torch.nn.Dropout(dropout)

        self.query = torch.nn.Linear(d_model, d_model)
        self.key = torch.nn.Linear(d_model, d_model)
        self.value = torch.nn.Linear(d_model, d_model)
        self.output_linear = torch.nn.Linear(d_model, d_model)

    def _split_heads(self, projected):
        """Reshape (batch, seq, d_model) into (batch, heads, seq, d_k)."""
        batch = projected.shape[0]
        return projected.view(batch, -1, self.heads, self.d_k).transpose(1, 2)

    def forward(self, query, key, value, mask):
        """Attend from ``query`` over ``key``/``value``.

        Positions where ``mask == 0`` receive a score of -1e9 before the
        softmax. ``mask`` must broadcast against the score tensor of shape
        (batch, heads, query_len, key_len).
        """
        q = self._split_heads(self.query(query))
        k = self._split_heads(self.key(key))
        v = self._split_heads(self.value(value))

        # Scaled dot product between every query and key position
        scale = math.sqrt(q.size(-1))
        scores = torch.matmul(q, k.transpose(-2, -1)) / scale
        scores = scores.masked_fill(mask == 0, -1e9)

        weights = self.dropout(F.softmax(scores, dim=-1))
        context = torch.matmul(weights, v)

        # Merge the heads back into a single feature dimension
        batch = context.shape[0]
        merged = context.transpose(1, 2).contiguous().view(batch, -1, self.heads * self.d_k)

        return self.output_linear(merged)

class FeedForward(torch.nn.Module):
    """Position-wise feed-forward network: Linear -> GELU -> Dropout -> Linear.

    :param d_model: Width of the input and output features.
    :param middle_dim: Width of the hidden layer.
    :param dropout: Dropout probability applied after the activation.
    """

    def __init__(self, d_model, middle_dim=2048, dropout=0.1):
        super().__init__()

        self.fc1 = torch.nn.Linear(in_features=d_model, out_features=middle_dim)
        self.fc2 = torch.nn.Linear(in_features=middle_dim, out_features=d_model)
        self.dropout = torch.nn.Dropout(dropout)
        self.activation = torch.nn.GELU()

    def forward(self, x):
        """Apply the two-layer network to the last dimension of ``x``."""
        hidden = self.fc1(x)
        hidden = self.activation(hidden)
        hidden = self.dropout(hidden)
        return self.fc2(hidden)

class EncoderLayer(torch.nn.Module):
    """One transformer encoder block: self-attention and feed-forward sublayers.

    Each sublayer is followed by dropout, a residual connection and layer
    normalization.

    :param d_model: Width of the hidden representation.
    :param heads: Number of attention heads.
    :param feed_forward_hidden: Hidden width of the feed-forward sublayer.
    :param dropout: Dropout probability, used for the residual branches and
        forwarded to the attention and feed-forward sublayers.
    """

    def __init__(self, d_model=768, heads=12, feed_forward_hidden=768 * 4, dropout=0.1):
        super(EncoderLayer, self).__init__()
        # NOTE(review): a single LayerNorm instance is shared by both sublayers.
        # Kept as-is so the parameter set / state_dict layout does not change.
        self.layernorm = torch.nn.LayerNorm(d_model)
        # Forward `dropout` so a non-default rate applies to the whole block,
        # not only to the residual branches.
        self.self_multihead = MultiHeadedAttention(heads, d_model, dropout=dropout)
        self.feed_forward = FeedForward(d_model, middle_dim=feed_forward_hidden, dropout=dropout)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, embeddings, mask):
        """Encode ``embeddings``; ``mask`` is passed through to the attention layer."""
        # Self-attention sublayer: query, key and value are all `embeddings`
        interacted = self.dropout(self.self_multihead(embeddings, embeddings, embeddings, mask))
        interacted = self.layernorm(interacted + embeddings)

        # Feed-forward sublayer with its own residual connection
        feed_forward_out = self.dropout(self.feed_forward(interacted))
        encoded = self.layernorm(feed_forward_out + interacted)
        return encoded

# Smoke-test one encoder block on the embedded sample batch
sample_input = sample_data['bert_input']

# Non-padding positions (ID > 0), expanded to (batch, 1, seq, seq)
mask = (sample_input > 0).unsqueeze(1).repeat(1, sample_input.size(1), 1).unsqueeze(1)

transformer_block = EncoderLayer()
transformer_result = transformer_block(embed_result, mask)
print(transformer_result.shape)


torch.Size([32, 64, 768])


In [None]:
class BERT(torch.nn.Module):
    """BERT encoder: input embedding followed by a stack of encoder layers.

    :param vocab_size: Size of the token vocabulary.
    :param d_model: Hidden width of the model.
    :param n_layers: Number of encoder layers.
    :param heads: Number of attention heads per layer.
    :param dropout: Dropout probability for the embedding and every encoder layer.
    :param seq_len: Number of positions the positional embedding is built for.
    """

    def __init__(self, vocab_size, d_model=768, n_layers=12, heads=12, dropout=0.1, seq_len=64):
        super().__init__()
        self.d_model = d_model
        self.n_layers = n_layers
        self.heads = heads
        self.feed_forward_hidden = d_model * 4
        # seq_len and dropout are forwarded so they are configurable from here
        self.embedding = BERTEmbedding(
            vocab_size=vocab_size, embed_size=d_model, seq_len=seq_len, dropout=dropout
        )
        self.encoder_blocks = torch.nn.ModuleList(
            [EncoderLayer(d_model, heads, self.feed_forward_hidden, dropout) for _ in range(n_layers)]
        )

    def forward(self, x, segment_info):
        """Encode token IDs ``x`` with segment IDs ``segment_info``.

        Returns the output of the last encoder layer.
        """
        # Attention mask: True where the token ID is > 0 (non-padding),
        # expanded to (batch, 1, seq, seq)
        mask = (x > 0).unsqueeze(1).repeat(1, x.size(1), 1).unsqueeze(1)
        x = self.embedding(x, segment_info)
        for encoder in self.encoder_blocks:
            # Call the module (not .forward) so registered hooks run
            x = encoder(x, mask)
        return x

class NextSentencePrediction(torch.nn.Module):
    """Two-class classification head that reads the first sequence position.

    :param hidden: Width of the encoder output.
    """

    def __init__(self, hidden):
        super().__init__()
        self.linear = torch.nn.Linear(in_features=hidden, out_features=2)
        self.softmax = torch.nn.LogSoftmax(dim=-1)

    def forward(self, x):
        """Return log-probabilities of shape (batch, 2) from ``x[:, 0]``."""
        first_position = x[:, 0]
        logits = self.linear(first_position)
        return self.softmax(logits)

class MaskedLanguageModel(torch.nn.Module):
    """Per-position vocabulary classification head.

    :param hidden: Width of the encoder output.
    :param vocab_size: Number of output classes.
    """

    def __init__(self, hidden, vocab_size):
        super().__init__()
        self.linear = torch.nn.Linear(in_features=hidden, out_features=vocab_size)
        self.softmax = torch.nn.LogSoftmax(dim=-1)

    def forward(self, x):
        """Return log-probabilities over the vocabulary for every position of ``x``."""
        logits = self.linear(x)
        return self.softmax(logits)

class BERTLM(torch.nn.Module):
    """BERT encoder with next-sentence and masked-LM heads on top.

    :param bert: Encoder to wrap; its ``d_model`` sets the width of both heads.
    :param vocab_size: Output size of the masked-LM head.
    """

    def __init__(self, bert: BERT, vocab_size):
        super().__init__()
        self.bert = bert
        hidden = self.bert.d_model
        self.next_sentence = NextSentencePrediction(hidden)
        self.mask_lm = MaskedLanguageModel(hidden, vocab_size)

    def forward(self, x, segment_label):
        """Return ``(next_sentence_output, mask_lm_output)`` for the batch."""
        encoded = self.bert(x, segment_label)
        next_sentence_output = self.next_sentence(encoded)
        mask_lm_output = self.mask_lm(encoded)
        return next_sentence_output, mask_lm_output

# Smoke-test the bare encoder on the sample batch
bert_model = BERT(len(tokenizer.vocab))
bert_result = bert_model(sample_data['bert_input'], sample_data['segment_label'])
print(bert_result.size())

# Smoke-test the encoder together with both pre-training heads
bert_lm = BERTLM(bert_model, len(tokenizer.vocab))
final_result = bert_lm(sample_data['bert_input'], sample_data['segment_label'])
next_sentence_out, mask_lm_out = final_result
print(next_sentence_out.size(), mask_lm_out.size())


torch.Size([32, 64, 768])
torch.Size([32, 2]) torch.Size([32, 64, 21159])


In [None]:
class ScheduledOptim():
    """Optimizer wrapper that sets the learning rate before every step.

    The rate is ``d_model^-0.5 * min(step^-0.5, step * n_warmup_steps^-1.5)``,
    where ``step`` counts calls to ``step_and_update_lr``.
    """

    def __init__(self, optimizer, d_model, n_warmup_steps):
        """
        :param optimizer: Wrapped optimizer; must expose ``param_groups``,
            ``step()`` and ``zero_grad()``.
        :param d_model: Hidden size of the model; sets the base rate.
        :param n_warmup_steps: Step count used in the warm-up term.
        """
        self._optimizer = optimizer
        self.n_warmup_steps = n_warmup_steps
        self.n_current_steps = 0
        self.init_lr = np.power(d_model, -0.5)

    def step_and_update_lr(self):
        """Write the new learning rate, then step the wrapped optimizer."""
        self._update_learning_rate()
        self._optimizer.step()

    def zero_grad(self):
        """Clear the gradients held by the wrapped optimizer."""
        self._optimizer.zero_grad()

    def _get_lr_scale(self):
        """Return the smaller of the decay term and the warm-up term."""
        decay_term = np.power(self.n_current_steps, -0.5)
        warmup_term = np.power(self.n_warmup_steps, -1.5) * self.n_current_steps
        return np.min([decay_term, warmup_term])

    def _update_learning_rate(self):
        """Advance the step counter and store the new rate in every param group."""
        self.n_current_steps += 1
        new_lr = self.init_lr * self._get_lr_scale()

        for group in self._optimizer.param_groups:
            group['lr'] = new_lr


In [None]:
# Defining the class to train the model
class BERTTrainer:
    """Runs pre-training epochs (next-sentence + masked-LM) for a BERTLM model."""

    def __init__(
        self,
        model,
        train_dataloader,
        test_dataloader=None,
        lr=1e-4,
        weight_decay=0.01,
        betas=(0.9, 0.999),
        warmup_steps=10000,
        log_freq=10,
        device='cuda',
    ):
        """
        BERT Trainer class for training BERT models.

        :param model: Model to train; must expose ``bert.d_model`` and return
            ``(next_sentence_output, mask_lm_output)`` log-probabilities.
        :param train_dataloader: DataLoader for the training data.
        :param test_dataloader: DataLoader for the test data (optional).
        :param lr: Initial learning rate handed to Adam. ScheduledOptim
            overwrites the rate of every param group on each step.
        :param weight_decay: Weight decay for regularization.
        :param betas: Betas for the Adam optimizer.
        :param warmup_steps: Number of warm-up steps for learning rate scheduling.
        :param log_freq: Log a status line every ``log_freq`` batches.
        :param device: Device for training ('cuda' or 'cpu').
        """

        self.device = device
        # Batches are moved to `device` in `iteration`, so the model has to
        # live there too; otherwise any non-CPU device fails on the forward pass.
        self.model = model.to(device)
        self.train_data = train_dataloader
        self.test_data = test_dataloader

        # Setting the Adam optimizer with hyper-parameters
        self.optim = Adam(self.model.parameters(), lr=lr, betas=betas, weight_decay=weight_decay)
        self.optim_schedule = ScheduledOptim(self.optim, self.model.bert.d_model, n_warmup_steps=warmup_steps)

        # Masked-LM loss: label 0 marks positions that are not prediction
        # targets (and padding), so those positions are ignored.
        self.criterion = torch.nn.NLLLoss(ignore_index=0)
        # Next-sentence loss: 0 is a real class here ("not next"), so it must
        # NOT be ignored. Reusing `criterion` would drop every negative pair.
        self.next_criterion = torch.nn.NLLLoss()
        self.log_freq = log_freq
        print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()]))

    def train(self, epoch):
        """Run one training epoch over the training loader."""
        self.iteration(epoch, self.train_data)

    def test(self, epoch):
        """Run one evaluation epoch over the test loader (no parameter updates)."""
        self.iteration(epoch, self.test_data, train=False)

    def iteration(self, epoch, data_loader, train=True):
        """Run one pass over ``data_loader`` and print running/final metrics.

        :param epoch: Epoch index, used for logging only.
        :param data_loader: Iterable of batches (dicts of tensors with keys
            ``bert_input``, ``bert_label``, ``segment_label``, ``is_next``).
        :param train: When True, back-propagate and step the optimizer;
            when False, evaluate with dropout off and gradients disabled.
        :raises ValueError: If ``data_loader`` is None.
        """
        if data_loader is None:
            raise ValueError("No data loader was provided for this mode.")

        avg_loss = 0.0
        total_correct = 0
        total_element = 0

        mode = "train" if train else "test"

        # Dropout is active only while training
        self.model.train(train)

        # Progress bar
        data_iter = tqdm.tqdm(
            enumerate(data_loader),
            desc="EP_%s:%d" % (mode, epoch),
            total=len(data_loader),
            bar_format="{l_bar}{r_bar}"
        )

        # No autograd graph is needed when evaluating
        with torch.set_grad_enabled(train):
            for i, data in data_iter:
                # Move batch data to the specified device (GPU or CPU)
                data = {key: value.to(self.device) for key, value in data.items()}

                # Forward pass: Next sentence prediction and masked language model prediction
                next_sent_output, mask_lm_output = self.model(data["bert_input"], data["segment_label"])

                # NLL loss for the is_next classification (both classes count)
                next_loss = self.next_criterion(next_sent_output, data["is_next"])

                # NLL loss for the masked tokens; NLLLoss expects the class
                # dimension second, hence the transpose
                mask_loss = self.criterion(mask_lm_output.transpose(1, 2), data["bert_label"])

                # Add next_loss and mask_loss as described in the pre-training procedure
                loss = next_loss + mask_loss

                # Backward and optimization (only in training mode)
                if train:
                    self.optim_schedule.zero_grad()
                    loss.backward()
                    self.optim_schedule.step_and_update_lr()

                # Calculate next sentence prediction accuracy
                correct = next_sent_output.argmax(dim=-1).eq(data["is_next"]).sum().item()
                avg_loss += loss.item()
                total_correct += correct
                total_element += data["is_next"].nelement()

                post_fix = {
                    "epoch": epoch,
                    "iter": i,
                    "avg_loss": avg_loss / (i + 1),
                    "avg_acc": total_correct / total_element * 100,
                    "loss": loss.item()
                }

                if i % self.log_freq == 0:
                    data_iter.write(str(post_fix))

        # Guard the final averages against an empty loader
        epoch_avg_loss = avg_loss / max(len(data_loader), 1)
        epoch_acc = total_correct * 100.0 / max(total_element, 1)
        print(
            f"EP{epoch}, {mode}: \
            avg_loss={epoch_avg_loss}, \
            total_acc={epoch_acc}"
        )

# Test example: build everything from scratch and train for a couple of epochs
train_data = BERTDataset(data_pair=pairs, tokenizer=tokenizer, seq_len=MAX_LEN)
train_loader = DataLoader(dataset=train_data, batch_size=32, shuffle=True, pin_memory=True)

bert_model = BERT(len(tokenizer.vocab))
bert_lm = BERTLM(bert_model, len(tokenizer.vocab))
bert_trainer = BERTTrainer(model=bert_lm, train_dataloader=train_loader, device='cpu')

epochs = 2
for epoch in range(epochs):
    bert_trainer.train(epoch)
