<a href="https://colab.research.google.com/github/7-4-7/Bert-Scratch/blob/main/Bert_Scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Phase - 1

- Train BERT with a masked-language-modelling objective: predict the masked tokens in a text prompt

In [2]:
import pandas as pd

# Relative parquet path of each split inside the Hugging Face dataset repo.
splits = dict(train='data/train.parquet', test='data/eval.parquet')

# Only the training split is read into `df` here.
df = pd.read_parquet(
    f"hf://datasets/Gustavosta/Stable-Diffusion-Prompts/{splits['train']}"
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
df

Unnamed: 0,Prompt
0,realistic car 3 d render sci - fi car and sci ...
1,a comic potrait of a female necromamcer with b...
2,"steampunk market interior, colorful, 3 d scene..."
3,"“A portrait of a cyborg in a golden suit, D&D ..."
4,A full portrait of a beautiful post apocalypti...
...,...
73713,ismail inceoglu epic oil on canvas painting of...
73714,eating crayons and being reborn in the loving ...
73715,"ilya kuvshinov with long hair, sky blue hair, ..."
73716,cyberpunk woman with green hair wearing futuri...


## Importing Key Libraries

In [49]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split

from transformers import BertTokenizer # Tokenizer


import numpy as np
import pandas as pd

torch.__version__

'2.6.0+cu124'

In [5]:
# Relative parquet path of each split inside the Hugging Face dataset repo.
splits = {'train': 'data/train.parquet', 'test': 'data/eval.parquet'}

# Read both splits (train first, then test) into their own frames.
df_train, df_test = (
    pd.read_parquet("hf://datasets/Gustavosta/Stable-Diffusion-Prompts/" + splits[split_name])
    for split_name in ("train", "test")
)

In [6]:
len(df_train),len(df_test)

(73718, 8192)

In [8]:
text = df_train['Prompt'][0]

In [None]:
df_train['Prompt'].apply(len)

Unnamed: 0,Prompt
0,431
1,336
2,182
3,109
4,244
...,...
73713,397
73714,258
73715,251
73716,198


## Setting up custom dataset class

In [20]:
# Keep the first 20,000 prompts as a plain Python list.
prompts = df['Prompt'].iloc[:20_000].tolist()
len(prompts)

20000

In [24]:
prompts[6]

'ilya kuvshinov with long sky blue hair, gold eyes, professional digital painting, concept art, unreal engine 5, 8 k, cinematic, wlop, bubbles, tendrils in the background, art by greg rutkowski, pixiv art, junji ito, yoshitaka amano'

In [43]:
class MLMPrompts(Dataset):
  """Custom dataset that pre-tokenizes prompts and applies BERT-style MLM masking.

  Tokenization and masking happen once, at construction time. Every item is a
  dict of three 1-D tensors of length ``seq_len``:

  * ``masked_ids`` - token ids after masking / random replacement
  * ``target_ids`` - original id at the positions selected for prediction,
    ``-100`` everywhere else
  * ``attn_mask``  - the tokenizer's attention mask
  """

  def __init__(self, df, seq_len, stop_idx : int = 20_000):
    """Build the masked examples.

    Args:
      df: DataFrame with a 'Prompt' column.
      seq_len: Length every prompt is padded / truncated to.
      stop_idx: Only the first ``stop_idx`` prompts are used.
    """

    self.prompts = df['Prompt'].to_list()[:stop_idx]
    self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    self.seq_len = seq_len
    self.tokenized_prompts = self._get_data()

  def _get_data(self):
    """Return one dict {'masked_ids', 'target_ids', 'attn_mask'} per prompt."""

    pairs = []
    # Loop over all the prompts
    for prompt in self.prompts:

      # Tokenize, pad and truncate to seq_len (special tokens added by default)
      encoded = self.tokenizer(prompt,
                               padding = "max_length",
                               truncation = True,
                               max_length = self.seq_len,
                               return_tensors = 'pt')

      # With return_tensors='pt' a single prompt comes back with a leading
      # batch axis. Drop it so the DataLoader collates items to
      # (batch, seq_len) instead of (batch, 1, seq_len).
      token_ids = encoded.input_ids[0]
      attn_mask = encoded.attention_mask[0]

      # Apply Mask
      masked_ids, target_ids = self._apply_mask(token_ids)

      # Store masked data and correct outputs
      pairs.append({
          'masked_ids' : masked_ids,
          'target_ids' : target_ids,
          'attn_mask'  : attn_mask,
      })
    return pairs

  def _apply_mask(self, token_ids):
    """Select ~15% of non-special tokens and corrupt them with the 80/10/10 rule.

    Args:
      token_ids: Integer tensor of token ids (any shape). It is not modified.

    Returns:
      (masked_ids, labels), both with the shape of ``token_ids``. ``labels``
      holds the original id at selected positions and -100 elsewhere.
    """

    labels = token_ids.clone()
    # Corrupt a copy so the caller's tensor is left untouched
    masked_ids = token_ids.clone()

    # Each token has 15% chance of being selected
    p_matrix = torch.full(labels.shape, 0.15)

    # Avoid masking special tokens: one tokenizer call for the whole sequence
    # instead of one call per token
    special_token_mask = torch.tensor(
        self.tokenizer.get_special_tokens_mask(
            token_ids.reshape(-1).tolist(),
            already_has_special_tokens = True),
        dtype = torch.bool,
    ).reshape(labels.shape)
    p_matrix.masked_fill_(special_token_mask, value = 0.0)

    # Get selected positions
    masked_indices = torch.bernoulli(p_matrix).bool()

    # Labels only keep the ids of the selected tokens
    labels[~masked_indices] = -100

    # 80% of the selected tokens -> [MASK]
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    masked_ids[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)

    # Half of the remaining 20% (i.e. 10% overall) -> random vocabulary id
    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
    random_words = torch.randint(len(self.tokenizer), labels.shape, dtype = torch.long)
    masked_ids[indices_random] = random_words[indices_random]

    # The last 10% of the selected tokens keep their original id
    return masked_ids, labels

  def __len__(self):
    """Number of prepared examples."""
    return len(self.tokenized_prompts)

  def __getitem__(self, idx):
    """Return the pre-computed example dict at position ``idx``."""
    return self.tokenized_prompts[idx]

In [None]:
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer

class MLMPrompts(Dataset):
    """Map-style dataset of pre-masked prompts for masked-language-model training.

    Prompts are tokenized and masked once, when the dataset is built; indexing
    just returns the stored example.
    """

    def __init__(self, df, seq_len, stop_idx: int = 20_000):
        """
        Args:
            df: Pandas DataFrame holding the text in a 'Prompt' column
            seq_len: Length each prompt is padded / truncated to
            stop_idx: Only the first `stop_idx` prompts are kept
        """
        self.seq_len = seq_len
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.prompts = df['Prompt'].to_list()[:stop_idx]
        self.tokenized_prompts = self._get_data()

    def _get_data(self):
        """Build the example dicts {masked_ids, target_ids, attn_mask}, one per prompt."""
        return [self._encode_prompt(prompt) for prompt in self.prompts]

    def _encode_prompt(self, prompt):
        """Tokenize a single prompt and mask it; returns the example dict."""
        enc = self.tokenizer(
            prompt,
            padding="max_length",
            truncation=True,
            max_length=self.seq_len,
            return_tensors='pt',
        )

        # Index 0 strips the batch axis -> tensors of shape (seq_len,)
        ids = enc.input_ids[0]
        attention = enc.attention_mask[0]

        corrupted, targets = self._apply_mask(ids)
        return {
            'masked_ids': corrupted,
            'target_ids': targets,
            'attn_mask': attention,
        }

    def _apply_mask(self, token_ids):
        """
        BERT-style corruption: pick ~15% of the non-special positions, then
        apply the 80/10/10 rule to them.

        token_ids: Tensor of shape (seq_len,); it is modified in place and
            returned as the first element.
        Returns:
            masked_ids: the (in-place corrupted) `token_ids` tensor
            target_ids: original id at the picked positions, -100 elsewhere
        """
        labels = token_ids.clone()
        shape = labels.shape

        # Selection probability: 0.15 everywhere, 0 on special tokens
        select_prob = torch.full(shape, 0.15)
        is_special = torch.tensor(
            self.tokenizer.get_special_tokens_mask(
                token_ids.tolist(),
                already_has_special_tokens=True,
            ),
            dtype=torch.bool,
        )
        select_prob.masked_fill_(is_special, value=0.0)

        # Draw the positions to predict; all other labels are ignored (-100)
        selected = torch.bernoulli(select_prob).bool()
        labels.masked_fill_(~selected, -100)

        # 80% of the picked positions become the [MASK] token
        use_mask_token = selected & torch.bernoulli(torch.full(shape, 0.8)).bool()
        mask_id = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)
        token_ids.masked_fill_(use_mask_token, mask_id)

        # Half of what is left (10% overall) becomes a random vocabulary id
        coin = torch.bernoulli(torch.full(shape, 0.5)).bool()
        use_random = selected & ~use_mask_token & coin
        replacements = torch.randint(len(self.tokenizer), shape, dtype=torch.long)
        token_ids[use_random] = replacements[use_random]

        # The final 10% keep their original id
        return token_ids, labels

    def __len__(self):
        """Number of stored examples."""
        return len(self.tokenized_prompts)

    def __getitem__(self, idx):
        """Example dict stored at position `idx`."""
        return self.tokenized_prompts[idx]


In [44]:
# Small dataset (default stop_idx) for inspecting individual examples.
ds = MLMPrompts(df=df, seq_len=128)

In [48]:
# Look at one prepared example (subscript syntax dispatches to __getitem__).
ds[5]

{'masked_ids': tensor([  101,  3376,  6652, 10000,  3617,  4169,  1010,  2396,  2011,  2396,
           103,  2213,  1998,  6754, 21766,  2102, 15449,  1010,  2632,   103,
          3366,  2172,  2050,  1010,   103,  5620, 10085,  2666,   103,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

## Initializing Dataloader

In [51]:
seq_len = 128      # length every prompt is padded / truncated to by MLMPrompts
stop_idx = 40_000  # only the first `stop_idx` prompts of the frame are used

batch_size = 16    # examples per DataLoader batch

In [52]:
# Tokenize + mask the first `stop_idx` prompts once, up front.
dataset = MLMPrompts(
    df = df,
    seq_len = seq_len,
    stop_idx = stop_idx,
)

# 80-20 split
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

# Seed the split so the same examples land in train / test on every run;
# without a generator the membership changes each time the cell is executed.
train_ds, test_ds = random_split(
    dataset,
    [train_size, test_size],
    generator = torch.Generator().manual_seed(42),
)

# Training batches are reshuffled every epoch.
train_loader = DataLoader(
    dataset=train_ds,
    batch_size = batch_size,
    shuffle = True,
)

# Evaluation order stays fixed.
test_loader = DataLoader(
    dataset = test_ds,
    batch_size = batch_size,
    shuffle = False,
)

In [53]:
len(train_loader), len(test_loader)

(2000, 500)

In [55]:
2000*16 + 500*16


40000

In [59]:
next(iter(train_loader))['masked_ids'].shape

torch.Size([16, 128])

In [31]:
import numpy as np

# Create the tokenizer in this cell: it was previously only assigned in a
# later cell, so this cell raised NameError on a fresh Restart & Run All.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Token count of each of the first 30,000 prompts, then its distribution,
# used to judge how much text a given seq_len truncates.
lengths = [len(tokenizer.encode(t)) for t in df['Prompt'][:30_000]]
np.percentile(lengths, [50, 75, 90, 95, 99])


array([ 61.,  76.,  87., 100., 138.])

In [38]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [15]:
# `tokenzier` was a misspelling of `tokenizer` and raised NameError.
tokens = tokenizer.tokenize(text)

In [17]:
len(tokens)

104

In [12]:
# Map the word-piece strings to vocabulary ids
# (`tokenzier` was a misspelling of `tokenizer` and raised NameError).
token_ids = tokenizer.convert_tokens_to_ids(tokens)
token_ids

[3793]

In [None]:
sentence = 'At future I want to qualify GATE DA 2026 and in long term I want to get in Amazon or Microsoft as applied scientict or principle scientist'

In [None]:
# Batch-encode the first 30,000 prompts in a single tokenizer call
# (`tokenzier` was a misspelling of `tokenizer` and raised NameError).
encodings = tokenizer(
    df["Prompt"].tolist()[:30_000],
    truncation=True,
    padding=True,
    max_length=64,
    return_tensors="pt"
)


In [None]:
encodings.token_type_ids.shape

torch.Size([30000, 64])