In [139]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/wikipedia-sentences/wikisent2.txt


# Loading dataset

In [140]:
# Load the raw lines from the .txt file
def load_wikipedia_sentences(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        sentences = f.read().splitlines()
    return sentences

In [141]:
# Load the dataset
file_path = '/kaggle/input/wikipedia-sentences/wikisent2.txt'
sentences = load_wikipedia_sentences(file_path)

# Preview a few examples
print("Total sentences:", len(sentences))
print("Example sentences:")
for i in range(10):
    print(f"{i+1}: {sentences[i]}")
type(sentences)

Total sentences: 7871825
Example sentences:
1: 0.000123, which corresponds to a distance of 705 Mly, or 216 Mpc.
2: 000webhost is a free web hosting service, operated by Hostinger.
3: 0010x0010 is a Dutch-born audiovisual artist, currently living in Los Angeles.
4: 0-0-1-3 is an alcohol abuse prevention program developed in 2004 at Francis E. Warren Air Force Base based on research by the National Institute on Alcohol Abuse and Alcoholism regarding binge drinking in college students.
5: 0.01 is the debut studio album of H3llb3nt, released on February 20, 1996 by Fifth Colvmn Records.
6: 001 of 3 February 1997, which was signed between the Government of the Republic of Rwanda, and FAPADER.
7: 003230 is a South Korean food manufacturer.
8: 0.04%Gas molecules in soil are in continuous thermal motion according to the kinetic theory of gasses, there is also collision between molecules - a random walk.
9: 0.04% of the votes were invalid.
10: 005.1999.06 is the fifth studio album by the South

list

In [142]:
def preprocess_sentences(sentences):
    processed = [s.strip().lower() for s in sentences if len(s.strip()) > 0]
    return processed

# Preprocess sentences
sentences = preprocess_sentences(sentences[:300000])

# Tokenization

In [143]:
# hugging face tokenizer
!pip install -q tokenizers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [144]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders, processors
from tokenizers.normalizers import NFD, Lowercase, StripAccents, Sequence

# Save sentences to a file, required for training
with open("train_sentences.txt", "w", encoding="utf-8") as f:
    for s in sentences:
        f.write(s.strip() + "\n")

# Initialize a tokenizer with a WordPiece model
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

# Normalization: lowercase, remove accents
tokenizer.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])

# Pre-tokenizer: basic whitespace splitting
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Trainer: WordPiece trainer
trainer = trainers.WordPieceTrainer(
    vocab_size=10_000,
    min_frequency=2,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
)

# Train tokenizer
tokenizer.train(["train_sentences.txt"], trainer)

# Optional: save tokenizer for later use
tokenizer.save("custom_wordpiece_tokenizer.json")







In [145]:
# Reload tokenizer
tokenizer = Tokenizer.from_file("custom_wordpiece_tokenizer.json")

In [146]:
# Encode a sample
sample = "the curious fox jumped over the lazy dog"
output = tokenizer.encode(sample)

print("Tokens:", output.tokens)
print("Token IDs:", output.ids)

Tokens: ['the', 'cur', '##ious', 'fox', 'jump', '##ed', 'over', 'the', 'la', '##zy', 'dog']
Token IDs: [142, 1573, 643, 4549, 5389, 146, 611, 142, 1138, 3379, 3351]


# Masking

In [149]:
import torch
from torch.utils.data import Dataset

MASK_PROB = 0.15        # 15% masking
MAX_LEN = 32           # Fixed sequence length
MASK_TOKEN_ID = tokenizer.token_to_id("[MASK]")
PAD_TOKEN_ID = tokenizer.token_to_id("[PAD]")
CLS_TOKEN_ID = tokenizer.token_to_id("[CLS]")
SEP_TOKEN_ID = tokenizer.token_to_id("[SEP]")

In [150]:
import random
def mask_input(input_ids, tokenizer):
    MASK_TOKEN_ID = tokenizer.token_to_id("[MASK]")
    vocab_size = tokenizer.get_vocab_size()
    
    labels = [-100] * len(input_ids)
    
    for i in range(1, len(input_ids) - 1):  # avoid masking [CLS] or [SEP]
        if input_ids[i] == tokenizer.token_to_id("[PAD]"):  # don't mask padding
            continue
            
        if random.random() < 0.15:  # 15% masking probability
            labels[i] = input_ids[i]
            
            rand_val = random.random()
            if rand_val < 0.8:  # 80% replace with [MASK]
                input_ids[i] = MASK_TOKEN_ID
            elif rand_val < 0.9:  # 10% replace with random token
                # Ensure we don't use special tokens for random replacement
                input_ids[i] = random.randint(5, vocab_size - 1)  # Skip special tokens
            # 10% leave unchanged
    
    return input_ids, labels

In [151]:
class MLMDataset(Dataset):
    def __init__(self, sentences, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.sentences = sentences
        
        # Cache token IDs for special tokens
        self.CLS_TOKEN_ID = tokenizer.token_to_id("[CLS]")
        self.SEP_TOKEN_ID = tokenizer.token_to_id("[SEP]")
        self.PAD_TOKEN_ID = tokenizer.token_to_id("[PAD]")

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        text = self.sentences[idx]
        tokens = self.tokenizer.encode(text)

        # Build input_ids with proper truncation
        input_ids = [self.CLS_TOKEN_ID] + tokens.ids[:self.max_len-2] + [self.SEP_TOKEN_ID]
        
        # Pad to max_len
        padding_length = self.max_len - len(input_ids)
        input_ids += [self.PAD_TOKEN_ID] * padding_length

        # Create attention mask (1 for real tokens, 0 for padding)
        attention_mask = [1 if id != self.PAD_TOKEN_ID else 0 for id in input_ids]

        # Apply masking
        masked_input_ids, labels = mask_input(input_ids.copy(), self.tokenizer)

        return {
            'input_ids': torch.tensor(masked_input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'labels': torch.tensor(labels, dtype=torch.long)
        }


In [152]:
from torch.utils.data import DataLoader

# You can split sentences into train/val later
dataset = MLMDataset(sentences, tokenizer, MAX_LEN)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Preview one batch
batch = next(iter(dataloader))
print("Input IDs:", batch['input_ids'].shape)
print("Labels:", batch['labels'].shape)
batch['input_ids'][0], batch['labels'][0], tokenizer.token_to_id("[MASK]")
len(dataset)

Input IDs: torch.Size([32, 32])
Labels: torch.Size([32, 32])


300000

# model making

In [153]:
import torch.nn as nn

class TransformerEmbeddings(nn.Module):
    def __init__(self, vocab_size, embed_dim, max_len):
        super().__init__()
        self.token_embed = nn.Embedding(vocab_size, embed_dim)
        self.position_embed = nn.Embedding(max_len, embed_dim)
        self.layer_norm = nn.LayerNorm(embed_dim)

    def forward(self, input_ids):
        seq_len = input_ids.size(1)
        positions = torch.arange(seq_len, device=input_ids.device).unsqueeze(0)
        x = self.token_embed(input_ids) + self.position_embed(positions)
        return self.layer_norm(x)

In [154]:
class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, heads, ff_hidden_dim, dropout=0.1):
        super().__init__()
        self.attn = nn.MultiheadAttention(embed_dim, heads, dropout=dropout, batch_first=True)
        self.attn_norm = nn.LayerNorm(embed_dim)

        self.ff = nn.Sequential(
            nn.Linear(embed_dim, ff_hidden_dim),
            nn.GELU(),
            nn.Linear(ff_hidden_dim, embed_dim)
        )
        self.ff_norm = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, attention_mask):
        # attention_mask: (B, S) with 1 for real token, 0 for pad
        # key_padding_mask needs True for positions to ignore
        key_padding_mask = (attention_mask == 0)  # (B, S) - True for padding
        
        attn_output, _ = self.attn(
            x, x, x,
            key_padding_mask=key_padding_mask
        )
        
        x = self.attn_norm(x + self.dropout(attn_output))
        ff_output = self.ff(x)
        x = self.ff_norm(x + self.dropout(ff_output))
        return x

In [155]:
class MLMTransformer(nn.Module):
    def __init__(self, vocab_size, max_len, embed_dim=128, heads=4, depth=4, ff_dim=512):
        super().__init__()
        self.embedding = TransformerEmbeddings(vocab_size, embed_dim, max_len)
        self.encoder = nn.ModuleList([
            TransformerBlock(embed_dim, heads, ff_dim) for _ in range(depth)
        ])
        self.mlm_head = nn.Linear(embed_dim, vocab_size)

    def forward(self, input_ids, attention_mask):
        x = self.embedding(input_ids)

        # Pass attention_mask directly to each block
        for block in self.encoder:
            x = block(x, attention_mask)  # Fixed: pass attention_mask, not inverted

        logits = self.mlm_head(x)
        return logits

# Training

In [156]:
import torch.optim as optim
from tqdm import tqdm

# Check device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [158]:
# Get vocab size from tokenizer
vocab_size = tokenizer.get_vocab_size()

# Instantiate model
model = MLMTransformer(vocab_size=vocab_size, max_len=MAX_LEN).to(device)

# Loss: ignore index -100 where labels are not masked
criterion = nn.CrossEntropyLoss(ignore_index=-100)

# Optimizer
optimizer = optim.AdamW(model.parameters(), lr=5e-4)

In [None]:
def train_model(model, dataloader, tokenizer, device, epochs=3):
    vocab_size = tokenizer.get_vocab_size()
    
    # Lower learning rate to prevent instability
    optimizer = optim.AdamW(model.parameters(), lr=1e-6)
    criterion = nn.CrossEntropyLoss(ignore_index=-100)
    
    # Add learning rate scheduler
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs * len(dataloader))
    
    model.train()
    
    for epoch in range(epochs):
        total_loss = 0
        num_batches = 0
        
        for batch_idx, batch in enumerate(dataloader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            optimizer.zero_grad()
            
            # Forward pass
            logits = model(input_ids, attention_mask)
            
            # Calculate loss
            loss = criterion(logits.view(-1, vocab_size), labels.view(-1))
            
            # Check for NaN
            if torch.isnan(loss):
                print(f"NaN detected at batch {batch_idx}")
                print(f"Input IDs range: {input_ids.min()}-{input_ids.max()}")
                print(f"Logits stats: min={logits.min()}, max={logits.max()}, std={logits.std()}")
                break
            
            # Backward pass with gradient clipping
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            
            optimizer.step()
            scheduler.step()
            
            total_loss += loss.item()
            num_batches += 1
            
            if batch_idx % 100 == 0:
                print(f"Epoch {epoch+1}, Batch {batch_idx}, Loss: {loss.item():.4f}")
        
        avg_loss = total_loss / num_batches if num_batches > 0 else float('inf')
        print(f"Epoch {epoch+1} Average Loss: {avg_loss:.4f}")

# 6. Initialize model with proper weight initialization
def init_weights(m):
    if isinstance(m, nn.Linear):
        torch.nn.init.xavier_uniform_(m.weight)
        if m.bias is not None:
            torch.nn.init.zeros_(m.bias)
    elif isinstance(m, nn.Embedding):
        torch.nn.init.normal_(m.weight, mean=0, std=0.02)

# Usage example:
model = MLMTransformer(vocab_size=vocab_size, max_len=MAX_LEN)
model.apply(init_weights)  # Apply proper weight initialization
model = model.to(device)
train_model(model, dataloader, tokenizer, device)

Epoch 1, Batch 0, Loss: 9.2313
Epoch 1, Batch 100, Loss: 9.2263
Epoch 1, Batch 200, Loss: 9.2306
Epoch 1, Batch 300, Loss: 9.1782
Epoch 1, Batch 400, Loss: 9.2021
Epoch 1, Batch 500, Loss: 9.1682
Epoch 1, Batch 600, Loss: 9.2036
Epoch 1, Batch 700, Loss: 9.1574
Epoch 1, Batch 800, Loss: 9.1645
Epoch 1, Batch 900, Loss: 9.1567
Epoch 1, Batch 1000, Loss: 9.1302
Epoch 1, Batch 1100, Loss: 9.0971
Epoch 1, Batch 1200, Loss: 9.0894
Epoch 1, Batch 1300, Loss: 9.1136
Epoch 1, Batch 1400, Loss: 9.0638
Epoch 1, Batch 1500, Loss: 9.0776
Epoch 1, Batch 1600, Loss: 9.0796
Epoch 1, Batch 1700, Loss: 9.0140
Epoch 1, Batch 1800, Loss: 9.0639
Epoch 1, Batch 1900, Loss: 8.9859
Epoch 1, Batch 2000, Loss: 9.0543
Epoch 1, Batch 2100, Loss: 8.9986
Epoch 1, Batch 2200, Loss: 8.9959
Epoch 1, Batch 2300, Loss: 9.0315
Epoch 1, Batch 2400, Loss: 9.0117
Epoch 1, Batch 2500, Loss: 9.0133
Epoch 1, Batch 2600, Loss: 8.9877
Epoch 1, Batch 2700, Loss: 8.9709
Epoch 1, Batch 2800, Loss: 8.9862
Epoch 1, Batch 2900, Loss:

In [135]:
torch.save(model.state_dict(), "mlm_transformer.pt")

In [136]:
tokenizer.save("mlm_tokenizer.json")

In [137]:
import torch
import torch.nn.functional as F
from tokenizers import Tokenizer
import numpy as np

class MLMPredictor:
    def __init__(self, model_path, tokenizer_path, device='cuda'):
        self.device = torch.device(device if torch.cuda.is_available() else 'cpu')
        
        # Load tokenizer
        self.tokenizer = Tokenizer.from_file(tokenizer_path)
        self.vocab_size = self.tokenizer.get_vocab_size()
        
        # Get special token IDs
        self.MASK_TOKEN_ID = self.tokenizer.token_to_id("[MASK]")
        self.PAD_TOKEN_ID = self.tokenizer.token_to_id("[PAD]")
        self.CLS_TOKEN_ID = self.tokenizer.token_to_id("[CLS]")
        self.SEP_TOKEN_ID = self.tokenizer.token_to_id("[SEP]")
        print(self.MASK_TOKEN_ID)
        # Load model
        self.model = MLMTransformer(
            vocab_size=self.vocab_size, 
            max_len=32,  # Make sure this matches your training config
            embed_dim=128,
            heads=4,
            depth=4,
            ff_dim=512
        )
        self.model.load_state_dict(torch.load(model_path, map_location=self.device))
        self.model.to(self.device)
        self.model.eval()
        
        print(f"Model loaded on {self.device}")

    def encode_with_mask(self, text):
        """
        Tokenize text and manually replace [MASK] with correct token ID.
        """
        tokens = []
        for word in text.lower().split():
            if word == '[mask]':
                tokens.append(self.MASK_TOKEN_ID)
            else:
                encoded = self.tokenizer.encode(word)
                tokens.extend(encoded.ids)
        
        return [self.CLS_TOKEN_ID] + tokens[:62] + [self.SEP_TOKEN_ID]

    
    def predict_masked_tokens(self, text, top_k=5):
        """
        Predict masked tokens in the input text.
        Text should contain [MASK] tokens where predictions are needed.
        """
        # Tokenize input
        tokens = self.tokenizer.encode(text.lower())
        input_ids = self.encode_with_mask(text)
        
        # Pad to max length
        padding_length = 32 - len(input_ids)
        input_ids += [self.PAD_TOKEN_ID] * padding_length
        
        # Create attention mask
        attention_mask = [1 if id != self.PAD_TOKEN_ID else 0 for id in input_ids]
        
        # Convert to tensors
        input_ids = torch.tensor([input_ids], dtype=torch.long).to(self.device)
        attention_mask = torch.tensor([attention_mask], dtype=torch.long).to(self.device)
        
        # Get predictions
        with torch.no_grad():
            logits = self.model(input_ids, attention_mask)
        
        # Find mask positions
        mask_positions = (input_ids[0] == self.MASK_TOKEN_ID).nonzero(as_tuple=True)[0]
        
        predictions = []
        for pos in mask_positions:
            # Get logits for this position
            token_logits = logits[0, pos, :]
            
            # Get top-k predictions
            top_k_logits, top_k_indices = torch.topk(token_logits, top_k)
            top_k_probs = F.softmax(top_k_logits, dim=-1)
            
            # Convert to tokens
            predicted_tokens = []
            for i, (idx, prob) in enumerate(zip(top_k_indices, top_k_probs)):
                token = self.tokenizer.decode([idx.item()])
                predicted_tokens.append({
                    'token': token,
                    'probability': prob.item(),
                    'rank': i + 1
                })
            
            predictions.append({
                'position': pos.item(),
                'predictions': predicted_tokens
            })
        
        return predictions
    
    def fill_mask(self, text, use_top_prediction=True):
        """
        Fill [MASK] tokens in text with predictions.
        If use_top_prediction=True, uses the highest probability token.
        Otherwise returns multiple options.
        """
        predictions = self.predict_masked_tokens(text)
        
        if use_top_prediction:
            # Replace each [MASK] with top prediction
            result_text = text
            for pred in predictions:
                top_token = pred['predictions'][0]['token']
                result_text = result_text.replace('[MASK]', top_token, 1)
            return result_text
        else:
            return predictions
    
    def interactive_prediction(self):
        """
        Interactive mode for testing predictions
        """
        print("MLM Transformer Interactive Prediction")
        print("Enter text with [MASK] tokens, or 'quit' to exit")
        print("Example: 'The cat sat on the [MASK]'")
        print("-" * 50)
        
        while True:
            text = input("\nEnter masked text: ").strip()
            
            if text.lower() in ['quit', 'exit', 'q']:
                break
                
            if '[MASK]' not in text:
                print("Please include at least one [MASK] token in your text.")
                continue
            
            try:
                # Show filled text
                filled_text = self.fill_mask(text, use_top_prediction=True)
                print(f"\nFilled text: {filled_text}")
                
                # Show detailed predictions
                predictions = self.predict_masked_tokens(text, top_k=3)
                
                print(f"\nDetailed predictions:")
                for i, pred in enumerate(predictions):
                    print(f"  [MASK] #{i+1}:")
                    for p in pred['predictions']:
                        print(f"    {p['rank']}. '{p['token']}' (prob: {p['probability']:.4f})")
                        
            except Exception as e:
                print(f"Error during prediction: {e}")

# Example usage functions
def test_predictions(predictor):
    """
    Test the model with some example sentences
    """
    test_sentences = [
        "The cat sat on the [MASK]",
        "I love to eat [MASK] for breakfast",
        "The [MASK] is shining brightly today",
        "She went to the [MASK] to buy groceries",
        "The dog was [MASK] in the park",
        "Python is a programming [MASK]",
        "The book was very [MASK] to read"
    ]
    
    print("Testing MLM Predictions")
    print("=" * 50)
    
    for sentence in test_sentences:
        print(f"\nOriginal: {sentence}")
        
        # Get filled sentence
        filled = predictor.fill_mask(sentence)
        print(f"Filled:   {filled}")
        
        # Get top 3 predictions for each mask
        predictions = predictor.predict_masked_tokens(sentence, top_k=3)
        for pred in predictions:
            print("  Top predictions:")
            for p in pred['predictions']:
                print(f"    {p['rank']}. '{p['token']}' ({p['probability']:.3f})")

def evaluate_model(predictor, test_sentences_with_answers):
    """
    Evaluate model performance on known answers
    """
    correct_predictions = 0
    total_predictions = 0
    
    for original_sentence, masked_sentence in test_sentences_with_answers:
        # Get prediction
        filled = predictor.fill_mask(masked_sentence)
        
        # Simple accuracy check (this is basic - you might want more sophisticated evaluation)
        if filled.lower() == original_sentence.lower():
            correct_predictions += 1
        total_predictions += 1
        
        print(f"Original: {original_sentence}")
        print(f"Masked:   {masked_sentence}")
        print(f"Predicted: {filled}")
        print(f"Correct: {'✓' if filled.lower() == original_sentence.lower() else '✗'}")
        print("-" * 30)
    
    accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
    print(f"\nOverall Accuracy: {accuracy:.2%} ({correct_predictions}/{total_predictions})")

# Usage example:

# Load your trained model
predictor = MLMPredictor(
    model_path="mlm_transformer.pt",
    tokenizer_path="mlm_tokenizer.json"
)

# Test with examples
test_predictions(predictor)

# Start interactive mode
predictor.interactive_prediction()

# Or make single predictions
result = predictor.fill_mask("The weather is [MASK] today")
print(result)

# Get detailed predictions
predictions = predictor.predict_masked_tokens("I like to [MASK] books", top_k=5)
for pred in predictions:
    print(pred)


4
Model loaded on cuda
Testing MLM Predictions

Original: The cat sat on the [MASK]
Filled:   The cat sat on the .
  Top predictions:
    1. '.' (0.617)
    2. ',' (0.200)
    3. 'the' (0.183)

Original: I love to eat [MASK] for breakfast
Filled:   I love to eat . for breakfast
  Top predictions:
    1. '.' (0.785)
    2. 'the' (0.117)
    3. 'a' (0.099)

Original: The [MASK] is shining brightly today
Filled:   The . is shining brightly today
  Top predictions:
    1. '.' (0.788)
    2. 'the' (0.113)
    3. 'a' (0.100)

Original: She went to the [MASK] to buy groceries
Filled:   She went to the . to buy groceries
  Top predictions:
    1. '.' (0.688)
    2. 'the' (0.167)
    3. 'a' (0.146)

Original: The dog was [MASK] in the park
Filled:   The dog was . in the park
  Top predictions:
    1. '.' (0.678)
    2. 'the' (0.175)
    3. ',' (0.147)

Original: Python is a programming [MASK]
Filled:   Python is a programming .
  Top predictions:
    1. '.' (0.830)
    2. 'the' (0.093)
    3. '


Enter masked text:  The cat sat on the [MASK]



Filled text: The cat sat on the .

Detailed predictions:
  [MASK] #1:
    1. '.' (prob: 0.6168)
    2. ',' (prob: 0.2000)
    3. 'the' (prob: 0.1832)


KeyboardInterrupt: Interrupted by user