In [6]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/wikipedia-sentences/wikisent2.txt


# Loading dataset

In [7]:
# Load the raw lines from the .txt file
def load_wikipedia_sentences(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        sentences = f.read().splitlines()
    return sentences

In [24]:
# Load the dataset
file_path = '/kaggle/input/wikipedia-sentences/wikisent2.txt'
sentences = load_wikipedia_sentences(file_path)

# Preview a few examples
print("Total sentences:", len(sentences))
print("Example sentences:")
for i in range(10):
    print(f"{i+1}: {sentences[i]}")
type(sentences)

Total sentences: 7871825
Example sentences:
1: 0.000123, which corresponds to a distance of 705 Mly, or 216 Mpc.
2: 000webhost is a free web hosting service, operated by Hostinger.
3: 0010x0010 is a Dutch-born audiovisual artist, currently living in Los Angeles.
4: 0-0-1-3 is an alcohol abuse prevention program developed in 2004 at Francis E. Warren Air Force Base based on research by the National Institute on Alcohol Abuse and Alcoholism regarding binge drinking in college students.
5: 0.01 is the debut studio album of H3llb3nt, released on February 20, 1996 by Fifth Colvmn Records.
6: 001 of 3 February 1997, which was signed between the Government of the Republic of Rwanda, and FAPADER.
7: 003230 is a South Korean food manufacturer.
8: 0.04%Gas molecules in soil are in continuous thermal motion according to the kinetic theory of gasses, there is also collision between molecules - a random walk.
9: 0.04% of the votes were invalid.
10: 005.1999.06 is the fifth studio album by the South

list

In [25]:
def preprocess_sentences(sentences):
    processed = [s.strip().lower() for s in sentences if len(s.strip()) > 0]
    return processed

# Preprocess sentences
sentences = preprocess_sentences(sentences[:100000])

# Tokenization

In [26]:
# hugging face tokenizer
!pip install -q tokenizers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [27]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders, processors
from tokenizers.normalizers import NFD, Lowercase, StripAccents, Sequence

# Save sentences to a file, required for training
with open("train_sentences.txt", "w", encoding="utf-8") as f:
    for s in sentences:
        f.write(s.strip() + "\n")

# Initialize a tokenizer with a WordPiece model
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

# Normalization: lowercase, remove accents
tokenizer.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])

# Pre-tokenizer: basic whitespace splitting
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Trainer: WordPiece trainer
trainer = trainers.WordPieceTrainer(
    vocab_size=30_000,
    min_frequency=2,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
)

# Train tokenizer
tokenizer.train(["train_sentences.txt"], trainer)

# Optional: save tokenizer for later use
tokenizer.save("custom_wordpiece_tokenizer.json")







In [28]:
# Reload tokenizer
tokenizer = Tokenizer.from_file("custom_wordpiece_tokenizer.json")

In [29]:
# Encode a sample
sample = "the curious fox jumped over the lazy dog"
output = tokenizer.encode(sample)

print("Tokens:", output.tokens)
print("Token IDs:", output.ids)

Tokens: ['the', 'cur', '##ious', 'fox', 'jump', '##ed', 'over', 'the', 'laz', '##y', 'dog']
Token IDs: [139, 2564, 999, 4839, 10135, 148, 669, 139, 12613, 94, 4524]


# Masking

In [30]:
import torch
from torch.utils.data import Dataset

MASK_PROB = 0.15        # 15% masking
MAX_LEN = 64            # Fixed sequence length
MASK_TOKEN_ID = tokenizer.token_to_id("[MASK]")
PAD_TOKEN_ID = tokenizer.token_to_id("[PAD]")
CLS_TOKEN_ID = tokenizer.token_to_id("[CLS]")
SEP_TOKEN_ID = tokenizer.token_to_id("[SEP]")

In [31]:
import random

def mask_input(input_ids):
    labels = [-100] * len(input_ids)  # default ignore index

    for i in range(1, len(input_ids) - 1):  # avoid masking [CLS] or [SEP]
        if random.random() < MASK_PROB:
            labels[i] = input_ids[i]

            # 80% replace with [MASK]
            if random.random() < 0.8:
                input_ids[i] = MASK_TOKEN_ID
            # 10% replace with random token
            elif random.random() < 0.5:
                input_ids[i] = random.randint(0, tokenizer.get_vocab_size() - 1)
            # 10% leave unchanged
    return input_ids, labels


In [32]:
class MLMDataset(Dataset):
    def __init__(self, sentences, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.sentences = sentences

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        text = self.sentences[idx]
        tokens = self.tokenizer.encode(text)

        input_ids = [CLS_TOKEN_ID] + tokens.ids[:self.max_len-2] + [SEP_TOKEN_ID]
        input_ids += [PAD_TOKEN_ID] * (self.max_len - len(input_ids))

        attention_mask = [1 if id != PAD_TOKEN_ID else 0 for id in input_ids]

        masked_input_ids, labels = mask_input(input_ids.copy())

        return {
            'input_ids': torch.tensor(masked_input_ids),
            'attention_mask': torch.tensor(attention_mask),
            'labels': torch.tensor(labels)
        }


In [33]:
from torch.utils.data import DataLoader

# You can split sentences into train/val later
dataset = MLMDataset(sentences, tokenizer, MAX_LEN)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

# Preview one batch
batch = next(iter(dataloader))
print("Input IDs:", batch['input_ids'].shape)
print("Labels:", batch['labels'].shape)


Input IDs: torch.Size([64, 64])
Labels: torch.Size([64, 64])


# model making

In [34]:
import torch.nn as nn

class TransformerEmbeddings(nn.Module):
    def __init__(self, vocab_size, embed_dim, max_len):
        super().__init__()
        self.token_embed = nn.Embedding(vocab_size, embed_dim)
        self.position_embed = nn.Embedding(max_len, embed_dim)
        self.layer_norm = nn.LayerNorm(embed_dim)

    def forward(self, input_ids):
        seq_len = input_ids.size(1)
        positions = torch.arange(seq_len, device=input_ids.device).unsqueeze(0)
        x = self.token_embed(input_ids) + self.position_embed(positions)
        return self.layer_norm(x)

In [35]:
class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, heads, ff_hidden_dim, dropout=0.1):
        super().__init__()
        self.attn = nn.MultiheadAttention(embed_dim, heads, dropout=dropout, batch_first=True)
        self.attn_norm = nn.LayerNorm(embed_dim)

        self.ff = nn.Sequential(
            nn.Linear(embed_dim, ff_hidden_dim),
            nn.GELU(),
            nn.Linear(ff_hidden_dim, embed_dim)
        )
        self.ff_norm = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, attention_mask):
        # x: (B, S, E)
        # attention_mask: (B, S) with 1 for real token, 0 for pad
        key_padding_mask = (attention_mask == 0)  # shape (B, S), True = pad
        attn_output, _ = self.attn(
            x, x, x,
            key_padding_mask=key_padding_mask  # shape: (B, S)
        )
    
        x = self.attn_norm(x + self.dropout(attn_output))
        ff_output = self.ff(x)
        x = self.ff_norm(x + self.dropout(ff_output))
        return x



In [36]:
class MLMTransformer(nn.Module):
    def __init__(self, vocab_size, max_len, embed_dim=128, heads=4, depth=4, ff_dim=512):
        super().__init__()
        self.embedding = TransformerEmbeddings(vocab_size, embed_dim, max_len)
        self.encoder = nn.ModuleList([
            TransformerBlock(embed_dim, heads, ff_dim) for _ in range(depth)
        ])
        self.mlm_head = nn.Linear(embed_dim, vocab_size)

    def forward(self, input_ids, attention_mask):
        x = self.embedding(input_ids)

        # Convert attention mask to bool where 0 = True (masked), 1 = False
        attn_mask = (attention_mask == 0)

        for block in self.encoder:
            x = block(x, attn_mask)

        logits = self.mlm_head(x)
        return logits


# Training

In [37]:
import torch.optim as optim
from tqdm import tqdm

# Check device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [38]:
# Get vocab size from tokenizer
vocab_size = tokenizer.get_vocab_size()

# Instantiate model
model = MLMTransformer(vocab_size=vocab_size, max_len=MAX_LEN).to(device)

# Loss: ignore index -100 where labels are not masked
criterion = nn.CrossEntropyLoss(ignore_index=-100)

# Optimizer
optimizer = optim.AdamW(model.parameters(), lr=5e-4)

In [None]:
EPOCHS = 3

model.train()

for epoch in range(EPOCHS):
    loop = tqdm(dataloader, leave=True)
    total_loss = 0

    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        optimizer.zero_grad()

        logits = model(input_ids, attention_mask)

        # Reshape for loss: [batch*seq_len, vocab]
        loss = criterion(logits.view(-1, vocab_size), labels.view(-1))

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        loop.set_description(f"Epoch {epoch+1}")
        loop.set_postfix(loss=loss.item())

    print(f"Epoch {epoch+1} Loss: {total_loss/len(dataloader):.4f}")


Epoch 1:  43%|████▎     | 677/1563 [02:46<03:31,  4.20it/s, loss=nan]

In [None]:
torch.save(model.state_dict(), "mlm_transformer.pt")

In [None]:
tokenizer.save("mlm_tokenizer.json")