In [1]:
file= "/content/Hitchhiker's-Guide-to-the-Galaxy,-The.txt"

In [2]:
def process_file(file_path):
  paragraphs = []
  with open(file_path, 'r') as file:
          for line in file:
            if len(line.strip()) < 2:
              continue
            #line = "[START] " + line.strip() + " [END]"
            paragraphs.append(line.strip())
  return paragraphs

content = process_file(file)
print(len(content))

4688


In [3]:
content[11]

'3rd Revised Draft'

In [4]:
import torch
import torch.nn as nn
import random
from transformers import BertTokenizer, BertForMaskedLM

In [10]:
class TransformerLM(nn.Module):
    def __init__(self, model_name="bert-base-uncased", mask_token="[MASK]"):
        super(TransformerLM, self).__init__()
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertForMaskedLM.from_pretrained(model_name)

        # Add [MASK] token to tokenizer and vocabulary if not already present
        if mask_token not in self.tokenizer.get_vocab():
            self.tokenizer.add_special_tokens({"additional_special_tokens": [mask_token]})
            self.model.resize_token_embeddings(len(self.tokenizer))

        self.mask_token_id = self.tokenizer.convert_tokens_to_ids(mask_token)

    def mask_tokens(self, input_ids, mask_probability=0.2):
        """
        Masks tokens in the input with the given probability and stores the original tokens.

        Args:
            input_ids (torch.Tensor): Input IDs tensor.
            mask_probability (float): Probability of masking each token.

        Returns:
            masked_input_ids (torch.Tensor): Input IDs with tokens replaced by [MASK].
            masked_indices (list): Indices of masked tokens.
            correct_labels (torch.Tensor): The original token IDs for the masked positions.
            masked_sentence (list): List of tokens with [MASK] inserted.
        """
        masked_input_ids = input_ids.clone()

        # Exclude special tokens like [CLS] and [SEP] from being masked
        mask_candidates = (input_ids != self.tokenizer.cls_token_id) & (input_ids != self.tokenizer.sep_token_id)
        mask_indices = torch.nonzero(mask_candidates, as_tuple=True)[1].tolist()

        # Choose 20% of the tokens to mask, or 1 token if there are fewer than 5 tokens
        num_tokens_to_mask = max(1, int(len(mask_indices) * mask_probability))
        selected_indices = random.sample(mask_indices, num_tokens_to_mask)

        correct_labels = masked_input_ids.clone()

        # Mask the selected tokens and build the masked sentence list
        masked_sentence = self.tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
        for idx in selected_indices:
            masked_input_ids[0, idx] = self.mask_token_id  # Replace with [MASK]
            masked_sentence[idx] = self.tokenizer.mask_token  # Insert [MASK] in the sentence

        return masked_input_ids, selected_indices, correct_labels[0, selected_indices], masked_sentence

    def forward(self, input_paragraph):
        """
        Performs a forward pass on the model.

        Args:
            input_paragraph (str): The input paragraph as a string.

        Returns:
            loss (torch.Tensor): Computed loss for the masked tokens.
            predictions (torch.Tensor): Model predictions for the masked tokens.
            predicted_tokens (list): List of predicted tokens for the masked positions.
            masked_sentence (list): Masked sentence with [MASK] token inserted.
        """
        # Tokenize input paragraph
        tokenized = self.tokenizer(input_paragraph, return_tensors="pt")
        input_ids = tokenized.input_ids

        # Mask tokens
        masked_input_ids, masked_indices, correct_labels, masked_sentence = self.mask_tokens(input_ids)

        # Forward pass
        outputs = self.model(masked_input_ids)
        logits = outputs.logits

        # Extract logits corresponding to masked tokens
        masked_logits = logits[0, masked_indices]

        # Compute loss
        loss_fn = nn.CrossEntropyLoss()
        loss = loss_fn(masked_logits, correct_labels)

        # Get predicted token IDs for masked positions
        predicted_indices = torch.argmax(masked_logits, dim=-1).tolist()

        # Convert predicted token IDs to tokens
        predicted_tokens = self.tokenizer.convert_ids_to_tokens(predicted_indices)

        return loss, predicted_tokens, masked_sentence


In [11]:

# Initialize model
transformer_lm = TransformerLM()

# Example paragraph
paragraph = "dolphins leap over and interact with the opening titles."

# Forward pass
loss, predicted_tokens, masked_sentence = transformer_lm(paragraph)

print(f"Original Sentence: {paragraph.split()}")
print(f"Masked Sentence: {masked_sentence}")
print(f"Predicted Tokens for [MASK]: {predicted_tokens}")
print(f"Loss: {loss.item()}")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Original Sentence: ['dolphins', 'leap', 'over', 'and', 'interact', 'with', 'the', 'opening', 'titles.']
Masked Sentence: ['[CLS]', 'dolphins', 'leap', 'over', 'and', '[MASK]', 'with', 'the', '[MASK]', 'titles', '.', '[SEP]']
Predicted Tokens for [MASK]: ['two', 'play']
Loss: 7.281673431396484
