In [110]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy

In [111]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Queries, keys and values are each projected by a learned linear layer,
    split into ``num_heads`` heads of width ``d_k = d_model // num_heads``,
    attended per head, then concatenated and projected by ``W_o``.

    Args:
        d_model: Width of the input and output feature dimension.
        num_heads: Number of attention heads; must divide ``d_model`` evenly.

    Raises:
        AssertionError: If ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()

        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Return ``softmax(Q K^T / sqrt(d_k)) V`` with optional masking.

        Args:
            Q, K, V: Per-head tensors as produced by ``split_head``.
            mask: Optional tensor broadcastable to the score matrix; positions
                where ``mask == 0`` are excluded from attention.

        Returns:
            The attention-weighted combination of ``V``.
        """
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)

        if mask is not None:
            # -1e9 cannot be represented in float16, so clamp the fill value
            # to the most negative finite number of the score dtype. For
            # float32/float64/bfloat16 this is still exactly -1e9.
            fill_value = max(-1e9, torch.finfo(attn_scores.dtype).min)
            attn_scores = attn_scores.masked_fill(mask == 0, fill_value)

        attn_probs = torch.softmax(attn_scores, dim=-1)
        return torch.matmul(attn_probs, V)

    def split_head(self, x):
        """Reshape ``(batch, seq, d_model)`` to ``(batch, num_heads, seq, d_k)``."""
        batch_size, seq_length, _ = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)

    def combine_head(self, x):
        """Reshape ``(batch, num_heads, seq, d_k)`` back to ``(batch, seq, d_model)``."""
        batch_size, _, seq_length, _ = x.size()
        # transpose() returns a non-contiguous view; view() needs contiguous memory.
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)

    def forward(self, Q, K, V, mask=None):
        """Project the inputs, attend per head, and merge the heads.

        Args:
            Q, K, V: Tensors of shape ``(batch, seq, d_model)``; ``K`` and ``V``
                may have a different ``seq`` length than ``Q``.
            mask: Optional mask forwarded to ``scaled_dot_product_attention``.

        Returns:
            Tensor of shape ``(batch, seq_of_Q, d_model)``.
        """
        Q = self.split_head(self.W_q(Q))
        K = self.split_head(self.W_k(K))
        V = self.split_head(self.W_v(V))
        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)

        output = self.W_o(self.combine_head(attn_output))

        return output

In [112]:
class PositionWiseFeedForward(nn.Module):
    """Two-layer feed-forward network applied along the last dimension.

    Args:
        d_model: Width of the input and output features.
        d_ff: Width of the hidden layer between the two linear maps.
    """

    def __init__(self, d_model, d_ff):
        super().__init__()

        # Expand to the hidden width, then project back to the model width.
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return ``fc2(relu(fc1(x)))``."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)

In [113]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to a sequence of embeddings.

    A ``(1, max_seq_length, d_model)`` table is precomputed and registered as
    the buffer ``pe``. Even feature columns hold sines and odd feature columns
    hold cosines of ``position * 10000^(-2i / d_model)``.

    Args:
        d_model: Feature width of the embeddings (even or odd).
        max_seq_length: Number of positions to precompute.
    """

    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()

        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000) / d_model))

        # One column of angles per sine column, i.e. ceil(d_model / 2) columns.
        angles = position * div_term

        pe[:, 0::2] = torch.sin(angles)  # even feature indices
        # With an odd d_model there is one fewer cosine column than sine
        # columns, so trim the angles to the number of odd feature indices.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])  # odd feature indices

        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Add the encodings for the first ``x.size(1)`` positions to ``x``.

        Args:
            x: Tensor of shape ``(batch, seq, d_model)`` with
                ``seq <= max_seq_length``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        return x + self.pe[:, :x.size(1)]

In [114]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention then feed-forward, each followed by
    dropout, a residual connection and layer normalisation.

    Args:
        d_model: Feature width of the layer's input and output.
        num_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward sub-layer.
        dropout: Dropout probability applied to each sub-layer's output.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()

        # Sub-layers
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        # One LayerNorm per residual connection
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Apply the block to ``x``; ``mask`` is passed to self-attention."""
        # Self-attention sub-layer with residual connection.
        attended = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attended))

        # Feed-forward sub-layer with residual connection.
        transformed = self.feed_forward(x)
        return self.norm2(x + self.dropout(transformed))

In [115]:
class DecoderLayer(nn.Module):
    """One decoder block: self-attention, cross-attention over the encoder
    output, then feed-forward. Each sub-layer is followed by dropout, a
    residual connection and layer normalisation.

    Args:
        d_model: Feature width of the layer's input and output.
        num_heads: Number of attention heads in both attention sub-layers.
        d_ff: Hidden width of the feed-forward sub-layer.
        dropout: Dropout probability applied to each sub-layer's output.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()

        # Sub-layers
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        # One LayerNorm per residual connection
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        """Apply the block.

        Args:
            x: Decoder-side input.
            enc_output: Encoder output used as keys and values in cross-attention.
            src_mask: Mask passed to cross-attention.
            tgt_mask: Mask passed to self-attention.
        """
        # Self-attention over the decoder input.
        self_attended = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(self_attended))

        # Cross-attention: decoder queries, encoder keys/values.
        cross_attended = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(cross_attended))

        # Feed-forward sub-layer.
        transformed = self.feed_forward(x)
        return self.norm3(x + self.dropout(transformed))

In [116]:
class Transformer(nn.Module):
    """Encoder-decoder Transformer over integer token ids.

    Token id ``0`` is treated as padding when the attention masks are built.

    Args:
        src_vocab_size: Number of rows in the encoder embedding table.
        tgt_vocab_size: Number of rows in the decoder embedding table and
            width of the output logits.
        d_model: Feature width used throughout the model.
        num_heads: Attention heads per attention sub-layer.
        num_layers: Number of encoder layers and of decoder layers.
        d_ff: Hidden width of the feed-forward sub-layers.
        max_seq_length: Number of positions in the positional-encoding table.
        dropout: Dropout probability.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super(Transformer, self).__init__()

        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        """Build the boolean attention masks for a batch.

        Args:
            src: Source token ids, shape ``(batch, src_len)``.
            tgt: Target token ids, shape ``(batch, tgt_len)``.

        Returns:
            ``(src_mask, tgt_mask)`` where ``src_mask`` has shape
            ``(batch, 1, 1, src_len)`` and is ``True`` at non-padding source
            positions, and ``tgt_mask`` has shape
            ``(batch, 1, tgt_len, tgt_len)`` and is ``True`` where a
            non-padding query position may attend to a key position at or
            before itself.
        """
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)
        seq_length = tgt.size(1)
        # Lower-triangular (causal) mask, created on the same device as the
        # inputs so that the `&` below also works when the batch is on a GPU.
        nopeak_mask = torch.ones(1, seq_length, seq_length, device=tgt.device).tril().bool()
        tgt_mask = tgt_mask & nopeak_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        """Return next-token logits of shape ``(batch, tgt_len, tgt_vocab_size)``.

        Args:
            src: Source token ids, shape ``(batch, src_len)``.
            tgt: Decoder input token ids, shape ``(batch, tgt_len)``.
        """
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        output = self.fc(dec_output)
        return output

In [117]:
# Model hyperparameters used by the model-construction cell further down.
# The two vocabulary sizes are placeholders: the Transformer below is built
# with `vocab_size` computed from the dataset, not with these values.
src_vocab_size = 5000
tgt_vocab_size = 5000
d_model = 128          # feature width
num_heads = 4          # attention heads (d_model must be divisible by this)
num_layers = 2         # encoder layers and decoder layers
d_ff = 512             # feed-forward hidden width
max_seq_length = 100   # reassigned by the data-preparation cell
dropout = 0.1

# Echo the configuration, one "name: value" line per setting.
print("Model configuration parameters set:")
print(
    f"d_model: {d_model}",
    f"num_heads: {num_heads}",
    f"num_layers: {num_layers}",
    f"d_ff: {d_ff}",
    f"max_seq_length: {max_seq_length}",
    f"dropout: {dropout}",
    sep="\n",
)

Model configuration parameters set:
d_model: 128
num_heads: 4
num_layers: 2
d_ff: 512
max_seq_length: 100
dropout: 0.1


In [118]:
# Updated code for torchtext 0.6.0
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
from torchtext.data.utils import get_tokenizer
from collections import Counter

# Load dataset and tokenizer
# `dataset` is the "wikitext-2-raw-v1" configuration of the "wikitext" dataset;
# `tokenizer` is torchtext's "basic_english" tokenizer, called on one string at a time.
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
tokenizer = get_tokenizer("basic_english")

# Build vocab manually for torchtext 0.6.0
def build_vocab_from_iterator_old(iterator, specials=None):
    """Build a token -> index vocabulary for older torchtext versions.

    Special tokens receive the lowest indices in the order given; all other
    tokens follow in order of decreasing frequency (ties keep first-seen
    order). Indices are always contiguous from 0, even when ``specials``
    contains duplicates or a special token also occurs in the data.

    Args:
        iterator: Iterable of token lists.
        specials: Special tokens to place first. Defaults to
            ``["<pad>", "<unk>"]``.

    Returns:
        dict mapping each token to a unique integer index in
        ``range(len(result))``.
    """
    if specials is None:
        specials = ["<pad>", "<unk>"]

    # Count incrementally instead of first materialising every token in one list.
    counter = Counter()
    for tokens in iterator:
        counter.update(tokens)

    vocab_dict = {}

    # Special tokens first. setdefault() leaves an already-present token
    # untouched, so a duplicate special does not consume an index.
    for token in specials:
        vocab_dict.setdefault(token, len(vocab_dict))

    # Regular tokens, most frequent first; tokens already present are skipped.
    for token, _ in counter.most_common():
        vocab_dict.setdefault(token, len(vocab_dict))

    return vocab_dict

# Build vocabulary
def yield_tokens(data_iter):
    """Lazily yield the tokenized ``'text'`` field of each example in ``data_iter``.

    Uses the module-level ``tokenizer``.
    """
    yield from (tokenizer(record['text']) for record in data_iter)

# Build the token -> index mapping from the training split only.
# "<pad>" and "<unk>" are listed first, so they receive indices 0 and 1.
train_data = dataset['train']
vocab_dict = build_vocab_from_iterator_old(yield_tokens(train_data), specials=["<pad>", "<unk>"])

# Create a simple vocab class
class SimpleVocab:
    """Read-only token -> index lookup with an ``"<unk>"`` fallback.

    Args:
        vocab_dict: Mapping from token to integer index.
    """

    def __init__(self, vocab_dict):
        self.vocab_dict = vocab_dict
        self.unk_token = "<unk>"

    def __getitem__(self, token):
        """Return the index of ``token``, or the ``"<unk>"`` index if it is unknown.

        Raises:
            KeyError: If ``token`` is unknown and the vocabulary has no
                ``"<unk>"`` entry.
        """
        # Resolve the fallback only when it is needed, so that known tokens
        # can be looked up even if the vocabulary has no "<unk>" entry.
        try:
            return self.vocab_dict[token]
        except KeyError:
            return self.vocab_dict[self.unk_token]

    def __len__(self):
        """Return the number of entries in the vocabulary."""
        return len(self.vocab_dict)

from itertools import islice

vocab = SimpleVocab(vocab_dict)
pad_idx = vocab["<pad>"]
vocab_size = len(vocab)

print(f"Vocabulary size: {vocab_size}")
print(f"Pad index: {pad_idx}")

# Set parameters
batch_size = 64
max_seq_length = 32  # NOTE: replaces the value set in the configuration cell
num_tokens = batch_size * max_seq_length * 10  # ensure enough tokens

# Encode the training text as one long stream of token ids and keep only the
# first `num_tokens` of them. islice stops consuming the generator once the
# quota is reached, so the rest of the corpus is never tokenized.
tokens = list(islice(
    (vocab[token] for example in train_data for token in tokenizer(example['text'])),
    num_tokens,
))

# Create non-overlapping input chunks of size (seq_len + 1). The range bound
# guarantees every chunk is full length, so the tensors below are rectangular.
chunks = [tokens[i:i+max_seq_length+1] for i in range(0, len(tokens) - max_seq_length, max_seq_length+1)]

# Keep only the first `batch_size` chunks for now
chunks = chunks[:batch_size]

# Split into src and tgt: tgt is src shifted left by one token
src_data = torch.tensor([chunk[:-1] for chunk in chunks])  # (batch_size, seq_length)
tgt_data = torch.tensor([chunk[1:] for chunk in chunks])   # (batch_size, seq_length)

print("Source shape:", src_data.shape)
print("Target shape:", tgt_data.shape)

Vocabulary size: 66059
Pad index: 0
Source shape: torch.Size([64, 32])
Target shape: torch.Size([64, 32])
Source shape: torch.Size([64, 32])
Target shape: torch.Size([64, 32])


In [119]:
# Model setup: build the Transformer, the loss and the optimizer.
# Source and target share one vocabulary, so both sizes are `vocab_size`.
transformer = Transformer(
    src_vocab_size=vocab_size,
    tgt_vocab_size=vocab_size,
    d_model=d_model,
    num_heads=num_heads,
    num_layers=num_layers,
    d_ff=d_ff,
    max_seq_length=max_seq_length,
    dropout=dropout,
)

print(
    f"✅ Transformer created with vocab size: {vocab_size}",
    f"📊 Model Parameters: {sum(param.numel() for param in transformer.parameters()):,}",
    sep="\n",
)

# Training setup: targets equal to the padding index are ignored by the loss.
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
optimizer = optim.Adam(
    transformer.parameters(),
    lr=1e-4,
    betas=(0.9, 0.98),
    eps=1e-9,
)

print(
    "✅ Training setup completed (criterion + optimizer)",
    "🎯 Ready for training! Use the progress bar training cell below.",
    "⬇️  Run the 'ULTIMATE SOLUTION' cell for clean training!",
    sep="\n",
)

✅ Transformer created with vocab size: 66059
📊 Model Parameters: 26,358,411
✅ Training setup completed (criterion + optimizer)
🎯 Ready for training! Use the progress bar training cell below.
⬇️  Run the 'ULTIMATE SOLUTION' cell for clean training!


In [120]:
# CLEAN RESTART - Run this before training to avoid duplicates
import gc
import torch

# Clear training lock if set
if 'training_in_progress' in locals():
    training_in_progress = False
    print("🔓 Cleared training lock")

# Clear any previous training state
if 'epoch' in locals():
    del epoch
if 'loss' in locals():
    print(f"Previous loss was: {loss.item():.4f}")
    # Drop the stale loss as well: later cells treat the mere existence of
    # `loss` as "training has completed", so leaving it behind makes them
    # report results from an earlier run.
    del loss

# Force garbage collection and release cached GPU memory when CUDA is in use
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

print("🧹 Cleared previous training state")
print("✅ Ready for clean training run")
print("⬇️  Now run the training cell below")

🔓 Cleared training lock
Previous loss was: 10.3521
🧹 Cleared previous training state
✅ Ready for clean training run
⬇️  Now run the training cell below
🧹 Cleared previous training state
✅ Ready for clean training run
⬇️  Now run the training cell below


In [None]:
# ULTIMATE SOLUTION - Progress Bar (NO PRINT STATEMENTS AT ALL!)
import time
from tqdm import tqdm
import torch

# Stop any existing training
if 'training_in_progress' in locals():
    training_in_progress = False

print("🎯 ULTIMATE SOLUTION: Using progress bar instead of print statements!")
print("📊 This will show clean progress without any duplicates!")

# Each "epoch" is one optimisation step on the single batch (src_data, tgt_data).
num_epochs = 1500
postfix_every = 20  # refresh the loss shown on the progress bar every N epochs

start_time = time.time()

# Nothing inside the loop switches the model to eval mode, so set train mode once.
transformer.train()

# Use tqdm progress bar instead of print statements
with tqdm(total=num_epochs, desc="Training Progress", unit="epoch", dynamic_ncols=True) as pbar:

    for epoch in range(num_epochs):
        optimizer.zero_grad()

        # Forward pass with teacher forcing: the decoder sees tgt_data[:, :-1]
        # and is trained to predict tgt_data[:, 1:].
        # NOTE(review): src_data is the same token stream shifted by one, and
        # the encoder has no causal mask, so most target tokens are visible to
        # the model through the encoder. Confirm this copy-style task is
        # intended before reading the loss as language-modelling quality.
        output = transformer(src_data, tgt_data[:, :-1])
        output = output.reshape(-1, vocab_size)
        target = tgt_data[:, 1:].reshape(-1)

        # Backward pass
        loss = criterion(output, target)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(transformer.parameters(), max_norm=1.0)
        optimizer.step()

        # Update progress bar (no print statements!)
        if epoch % postfix_every == 0:
            pbar.set_postfix({"Current": f"{loss.item():.4f}"})
        pbar.update(1)

# Final results (only these print statements)
training_time = time.time() - start_time
print(f"\n🎉 TRAINING COMPLETED SUCCESSFULLY!")
print(f"✅ Final Loss: {loss.item():.4f}")
print(f"⏱️ Training Time: {training_time:.2f} seconds")
print(f"🎯 NO DUPLICATES with progress bar approach!")

# Quick evaluation: greedy predictions on the training batch, dropout disabled
transformer.eval()
with torch.no_grad():
    out = transformer(src_data, tgt_data[:, :-1])
    predicted = torch.argmax(out, dim=-1)
    print("✅ Evaluation completed! Run analysis cell for detailed results.")

🎯 ULTIMATE SOLUTION: Using progress bar instead of print statements!
📊 This will show clean progress without any duplicates!


ImportError: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

In [None]:
# Analysis of Training Results - Run only after training completes
# Both names are checked by existence: referencing `transformer` directly in
# the guard would raise NameError when the model has not been created yet.
if 'loss' in locals() and 'transformer' in locals():
    print("=== TRAINING ANALYSIS ===")
    print(f"Final Loss: {loss.item():.4f}")
    print(f"Vocabulary Size: {vocab_size}")
    print(f"Model Parameters: {sum(p.numel() for p in transformer.parameters()):,}")

    # Calculate accuracy
    transformer.eval()
    with torch.no_grad():
        correct_predictions = 0
        total_predictions = 0

        for i in range(min(10, src_data.size(0))):  # Check first 10 sequences
            output = transformer(src_data[i:i+1], tgt_data[i:i+1, :-1])
            predicted = torch.argmax(output, dim=-1)
            target = tgt_data[i:i+1, 1:]

            correct = (predicted == target).sum().item()
            total = target.numel()

            correct_predictions += correct
            total_predictions += total

            if i == 0:  # Show detailed comparison for first sequence
                print(f"\n=== SEQUENCE {i+1} DETAILED COMPARISON ===")
                pred_list = predicted[0].tolist()
                target_list = target[0].tolist()

                print("Position | Predicted | Target | Match")
                print("-" * 35)
                for j, (p, t) in enumerate(zip(pred_list, target_list)):
                    match = "✓" if p == t else "✗"
                    print(f"{j:8} | {p:9} | {t:6} | {match}")

    # Guard against an empty dataset, where no predictions were counted.
    accuracy = correct_predictions / total_predictions * 100 if total_predictions else 0.0
    print(f"\n=== OVERALL METRICS ===")
    print(f"Token-level Accuracy: {accuracy:.2f}%")
    print(f"Correct Predictions: {correct_predictions:,} / {total_predictions:,}")

    # Perplexity calculation: exp of the last recorded training loss (`loss`),
    # not of a loss recomputed in eval mode.
    perplexity = torch.exp(loss).item()
    print(f"Perplexity: {perplexity:.2f}")

    print(f"\n=== TRAINING RECOMMENDATIONS ===")
    if loss.item() > 3.0:
        print("🔴 High loss - More training needed")
    elif loss.item() > 1.5:
        print("🟡 Moderate loss - Training progressing well")
    else:
        print("🟢 Low loss - Good convergence")

    if accuracy < 30:
        print("🔴 Low accuracy - Consider training longer or adjusting hyperparameters")
    elif accuracy < 60:
        print("🟡 Moderate accuracy - Training is progressing")
    else:
        print("🟢 Good accuracy for this task complexity")
else:
    print("⚠️  Training not completed yet. Run the training cell first.")

=== TRAINING ANALYSIS ===
Final Loss: 3.0343
Vocabulary Size: 66059
Model Parameters: 26,358,411

=== SEQUENCE 1 DETAILED COMPARISON ===
Position | Predicted | Target | Match
-----------------------------------
       0 |      3876 |   3876 | ✓
       1 |       882 |    882 | ✓
       2 |        10 |     10 | ✓
       3 |        10 |  18563 | ✗
       4 |      3813 |     84 | ✗
       5 |      3813 |   3813 | ✓
       6 |      3876 |     88 | ✗
       7 |         4 |  20923 | ✗
       8 |      3876 |   3876 | ✓
       9 |       882 |     22 | ✗
      10 |         3 |    781 | ✗
      11 |         6 |  24291 | ✗
      12 |         3 |      3 | ✓
      13 |      3813 |   5982 | ✗
      14 |         4 |      4 | ✓
      15 |        10 |   3813 | ✗
      16 |      3876 |      5 | ✗
      17 |         2 |      2 | ✓
      18 |        68 |   5038 | ✗
      19 |      3876 |     88 | ✗
      20 |         4 |     21 | ✗
      21 |         3 |      3 | ✓
      22 |      3813 |   1834 | ✗
      2

In [None]:
# QUICK TRAINING PROGRESS CHECK (Run this to see current status)
if 'loss' in locals():
    print(f"🎯 Current Loss: {loss.item():.4f}")
    # Report the model mode only when the model exists; a leftover `loss`
    # without a model would otherwise raise NameError here.
    if 'transformer' in locals():
        print(f"📊 Model is {'training' if transformer.training else 'in eval mode'}")
    print(f"📈 To check full results, run the analysis cell after training completes")
else:
    print("❌ No training completed yet. Run the training cell first!")

# Check if model exists
if 'transformer' in locals():
    param_count = sum(p.numel() for p in transformer.parameters())
    print(f"✅ Model loaded with {param_count:,} parameters")
else:
    print("❌ Model not loaded. Run the model creation cells first!")