In [1]:
# Mount Google Drive at /content/gdrive (Colab-only helper) so files stored
# there can be read through the local filesystem by the cells below.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [18]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn import functional as F
import numpy as np
from collections import Counter
import os
import math

# TOKENIZER

In [19]:
class SimpleTokenizer:
    """Lowercasing whitespace tokenizer with a frequency-capped vocabulary.

    Ids 0-3 are reserved for ``<PAD>``, ``<UNK>``, ``<BOS>`` and ``<EOS>``.
    Corpus words receive ids from 4 upwards, most frequent first, and no id
    is ever assigned at or beyond ``vocab_size``.
    """

    # Ids that ``decode`` leaves out of its output: <PAD>, <BOS>, <EOS>.
    _SKIPPED_IDS = (0, 2, 3)

    def __init__(self, vocab_size=5000):
        """Create a tokenizer holding only the four special tokens.

        Args:
            vocab_size: Upper bound on the number of ids, special tokens included.
        """
        self.vocab_size = vocab_size
        self.word2idx = {"<PAD>": 0, "<UNK>": 1, "<BOS>": 2, "<EOS>": 3}
        self.idx2word = {v: k for k, v in self.word2idx.items()}
        self.next_idx = 4

    def build_vocab(self, texts):
        """Add the most frequent words of ``texts`` to the vocabulary.

        Safe to call more than once: words that already have an id keep it,
        and new words are only added while ids below ``vocab_size`` remain.

        Args:
            texts: Iterable of strings; each is lowercased and split on whitespace.
        """
        counter = Counter()
        for text in texts:
            counter.update(text.lower().split())

        for word, _ in counter.most_common():
            if self.next_idx >= self.vocab_size:
                # Vocabulary is full; any further id would exceed vocab_size.
                break
            if word in self.word2idx:
                # Keep the id handed out by an earlier call.
                continue
            self.word2idx[word] = self.next_idx
            self.idx2word[self.next_idx] = word
            self.next_idx += 1

    def encode(self, text):
        """Return the id list for ``text``; unknown words map to 1 (``<UNK>``)."""
        tokens = text.lower().split()
        return [self.word2idx.get(t, 1) for t in tokens]

    def decode(self, tokens):
        """Join ids back into a space-separated string.

        ``<PAD>``, ``<BOS>`` and ``<EOS>`` ids are dropped; ids without a
        vocabulary entry are rendered as ``<UNK>``.
        """
        return ' '.join(
            self.idx2word.get(t, "<UNK>") for t in tokens if t not in self._SKIPPED_IDS
        )


# DATASET

In [12]:
class WikiDataset(Dataset):
    """Next-token prediction dataset built from overlapping token windows.

    Every text is encoded with ``tokenizer`` and cut into windows of up to
    ``seq_length`` ids, advancing by a quarter of ``seq_length`` each time.
    Items are ``(input, target)`` tensor pairs where the target is the input
    shifted left by one position; short windows are right-padded with 0.
    """

    def __init__(self, texts, tokenizer, seq_length=128, max_samples=None):
        """Tokenize ``texts`` and collect the training windows.

        Args:
            texts: A list of strings, or a path to a text file read line by line.
            tokenizer: Object exposing ``encode(str) -> list of ids``.
            seq_length: Window length before the input/target shift.
            max_samples: If truthy, only the first ``max_samples`` texts are used.
        """
        self.tokenizer = tokenizer
        self.seq_length = seq_length
        self.sequences = []

        # A string argument is treated as a file path rather than a single text.
        if isinstance(texts, str):
            with open(texts, 'r', encoding='utf-8', errors='ignore') as handle:
                texts = handle.readlines()

        if max_samples:
            texts = texts[:max_samples]

        stride = max(1, self.seq_length // 4)
        for line in texts:
            ids = tokenizer.encode(line.strip())
            if len(ids) <= 5:
                # Too short to be worth windowing.
                continue
            windows = (
                ids[start:start + self.seq_length]
                for start in range(0, len(ids) - 1, stride)
            )
            # Only windows with at least 10 ids are kept.
            self.sequences.extend(window for window in windows if len(window) >= 10)

    def __len__(self):
        """Number of stored windows."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return the ``idx``-th window as ``(x, y)`` long tensors of length ``seq_length - 1``."""
        window = self.sequences[idx]
        missing = self.seq_length - len(window)

        if missing > 0:
            # Right-pad with the <PAD> id (0) up to seq_length.
            padded = window + [0] * missing
        else:
            padded = window[:self.seq_length]

        # Target is the input shifted by one token.
        inputs = torch.tensor(padded[:-1], dtype=torch.long)
        targets = torch.tensor(padded[1:], dtype=torch.long)
        return inputs, targets


In [20]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to a batch of embeddings.

    The table is precomputed for ``max_seq_len`` positions and stored as a
    non-trainable buffer ``pe`` of shape ``(1, max_seq_len, d_model)``.
    Even columns hold sines and odd columns hold cosines.
    """

    def __init__(self, d_model, max_seq_len=2048):
        """Precompute the encoding table.

        Args:
            d_model: Embedding width; may be even or odd.
            max_seq_len: Number of positions to precompute.
        """
        super().__init__()
        pe = torch.zeros(max_seq_len, d_model)
        position = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer cosine column than sine
        # columns, so trim div_term to match the odd-index slice.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

        Args:
            x: Tensor whose dimension 1 is the sequence axis and whose last
               dimension is ``d_model``.
        """
        return x + self.pe[:, :x.size(1), :]

# TRANSFORMER BLOCKS

In [21]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Without an explicit mask the layer is causal: each position attends only
    to itself and earlier positions.
    """

    def __init__(self, d_model, n_heads, dropout=0.1):
        """Create the projections.

        Args:
            d_model: Model width; must be divisible by ``n_heads``.
            n_heads: Number of attention heads.
            dropout: Dropout probability used on the attention weights and on
                the projected output.
        """
        super().__init__()
        assert d_model % n_heads == 0

        self.d_model = d_model
        self.n_heads = n_heads
        self.head_dim = d_model // n_heads
        self.scale = math.sqrt(self.head_dim)

        self.q_proj = nn.Linear(d_model, d_model)
        self.k_proj = nn.Linear(d_model, d_model)
        self.v_proj = nn.Linear(d_model, d_model)
        self.out_proj = nn.Linear(d_model, d_model)

        self.dropout = nn.Dropout(dropout)
        self.attn_dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape ``(batch, seq_len, d_model)``.
            mask: Optional mask where ``True`` (non-zero) marks positions that
                must NOT be attended to, i.e. the same convention as the
                built-in causal mask. Accepted shapes are ``(seq_len, seq_len)``,
                ``(batch, seq_len, seq_len)``, or anything already broadcastable
                to ``(batch, n_heads, seq_len, seq_len)``. ``None`` selects the
                causal mask.

        Returns:
            Tensor of shape ``(batch, seq_len, d_model)``.
        """
        batch_size, seq_len, _ = x.shape

        # Project, then split the model dimension into (n_heads, head_dim).
        q = self.q_proj(x).view(batch_size, seq_len, self.n_heads, self.head_dim).transpose(1, 2)
        k = self.k_proj(x).view(batch_size, seq_len, self.n_heads, self.head_dim).transpose(1, 2)
        v = self.v_proj(x).view(batch_size, seq_len, self.n_heads, self.head_dim).transpose(1, 2)

        scores = torch.matmul(q, k.transpose(-2, -1)) / self.scale

        if mask is None:
            # Causal mask (for decoder): block everything above the diagonal.
            mask = torch.triu(torch.ones(seq_len, seq_len, device=x.device), diagonal=1).bool()
        else:
            mask = mask.to(device=x.device, dtype=torch.bool)

        # Insert the axes needed to broadcast over batch and/or heads.
        if mask.dim() == 2:
            mask = mask.unsqueeze(0).unsqueeze(0)
        elif mask.dim() == 3:
            mask = mask.unsqueeze(1)

        # Applied for BOTH the default and a caller-supplied mask; a supplied
        # mask used to be silently ignored.
        scores = scores.masked_fill(mask, float('-inf'))

        attn = F.softmax(scores, dim=-1)
        attn = self.attn_dropout(attn)

        # Merge the heads back into the model dimension.
        out = torch.matmul(attn, v)
        out = out.transpose(1, 2).contiguous()
        out = out.view(batch_size, seq_len, self.d_model)
        out = self.out_proj(out)
        out = self.dropout(out)

        return out

class FeedForward(nn.Module):
    """Two-layer MLP applied to the last dimension: Linear -> GELU -> Dropout -> Linear."""

    def __init__(self, d_model, d_ff, dropout=0.1):
        """
        Args:
            d_model: Input and output width.
            d_ff: Width of the hidden layer.
            dropout: Dropout probability applied after the activation.
        """
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Expand to ``d_ff``, activate, drop out, and project back to ``d_model``."""
        hidden = F.gelu(self.linear1(x))
        hidden = self.dropout(hidden)
        return self.linear2(hidden)

class DecoderBlock(nn.Module):
    """Pre-norm decoder layer: self-attention then feed-forward, each with a residual connection."""

    def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
        """
        Args:
            d_model: Model width.
            n_heads: Number of attention heads.
            d_ff: Hidden width of the feed-forward sub-layer.
            dropout: Dropout probability shared by the sub-layers and the residual branches.
        """
        super().__init__()
        self.attn = MultiHeadAttention(d_model, n_heads, dropout)
        self.ff = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Run both sub-layers; each sees a LayerNorm-ed input and is added back onto ``x``."""
        sublayers = (
            (self.norm1, self.attn),  # self-attention branch
            (self.norm2, self.ff),    # feed-forward branch
        )
        for norm, sublayer in sublayers:
            # Pre-normalization: normalize, transform, drop out, then add the residual.
            x = x + self.dropout(sublayer(norm(x)))
        return x


# DECODER TRANSFORMER

In [22]:
class DecoderTransformer(nn.Module):
    """Decoder-only transformer language model.

    Token ids are embedded, scaled by ``sqrt(d_model)``, given positional
    encodings, passed through ``n_layers`` ``DecoderBlock`` layers, normalized
    and projected to ``vocab_size`` logits per position.
    """

    def __init__(self, vocab_size, d_model=384, n_heads=6, n_layers=4,
                 d_ff=1536, max_seq_len=512, dropout=0.1):
        """
        Args:
            vocab_size: Number of token ids (embedding rows and output logits).
            d_model: Model width.
            n_heads: Attention heads per layer.
            n_layers: Number of decoder blocks.
            d_ff: Hidden width of each feed-forward sub-layer.
            max_seq_len: Number of positions covered by the positional table.
            dropout: Dropout probability used throughout the model.
        """
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_seq_len)

        blocks = [DecoderBlock(d_model, n_heads, d_ff, dropout) for _ in range(n_layers)]
        self.decoder_layers = nn.ModuleList(blocks)

        self.norm = nn.LayerNorm(d_model)
        self.output_proj = nn.Linear(d_model, vocab_size)
        self.dropout = nn.Dropout(dropout)

        self.d_model = d_model
        self._init_weights()

    def _init_weights(self):
        """Re-initialize every parameter with more than one dimension using Xavier-uniform."""
        for param in self.parameters():
            if param.dim() <= 1:
                # Biases and LayerNorm vectors keep their default initialization.
                continue
            nn.init.xavier_uniform_(param)

    def forward(self, x):
        """Map a ``(batch, seq_len)`` tensor of token ids to ``(batch, seq_len, vocab_size)`` logits."""
        # Embed, scale, add positions, then apply input dropout.
        scaled = self.embedding(x) * math.sqrt(self.d_model)
        hidden = self.dropout(self.pos_encoding(scaled))

        for block in self.decoder_layers:
            hidden = block(hidden)

        # Final normalization followed by the vocabulary projection.
        return self.output_proj(self.norm(hidden))


# TRAINING

In [23]:
class GradualWarmupScheduler(optim.lr_scheduler.LambdaLR):
    """LambdaLR schedule with a linear warmup followed by a linear decay to zero."""

    def __init__(self, optimizer, warmup_steps, total_steps):
        """
        Args:
            optimizer: Optimizer whose learning rate is scaled.
            warmup_steps: Steps over which the multiplier rises from 0 to 1.
            total_steps: Step at which the multiplier reaches 0.
        """
        # Both attributes must exist before LambdaLR.__init__, which evaluates
        # lr_lambda straight away.
        self.warmup_steps = warmup_steps
        self.total_steps = total_steps
        super().__init__(optimizer, self.lr_lambda)

    def lr_lambda(self, step):
        """Return the learning-rate multiplier for ``step``."""
        warmup = self.warmup_steps
        if step >= warmup:
            # Linear decay from 1 at the end of warmup to 0 at total_steps, floored at 0.
            remaining = float(self.total_steps - step)
            decay_span = float(max(1, self.total_steps - warmup))
            return max(0.0, remaining / decay_span)
        # Linear ramp-up during warmup.
        return float(step) / float(max(1, warmup))

def train_model(model, train_loader, val_loader, device, epochs=15, base_lr=1e-3,
                patience=3, checkpoint_path="/content/best_model.pt"):
    """Train ``model`` with AdamW, warmup/decay scheduling and early stopping.

    Each epoch runs one pass over ``train_loader`` (gradient norm clipped to
    1.0, scheduler stepped per batch) followed by a validation pass. Whenever
    the mean validation loss improves, the weights are written to
    ``checkpoint_path``. Training stops early after ``patience`` consecutive
    epochs without improvement.

    The best checkpoint is restored before returning whether or not early
    stopping fired, so the returned model always carries the weights with the
    lowest validation loss seen.

    Args:
        model: Module mapping a batch of token ids to per-position logits.
        train_loader: Iterable of ``(x, y)`` batches with a ``len``.
        val_loader: Iterable of ``(x, y)`` batches with a ``len``.
        device: Device the batches are moved to.
        epochs: Maximum number of epochs.
        base_lr: Peak learning rate reached at the end of warmup.
        patience: Epochs without validation improvement tolerated before stopping.
        checkpoint_path: File used to store the best weights.

    Returns:
        The same ``model`` object, with the best weights loaded if any
        checkpoint was written.

    Raises:
        ValueError: If either loader yields no batches.
    """
    if len(train_loader) == 0 or len(val_loader) == 0:
        # Otherwise the per-epoch averages below divide by zero.
        raise ValueError("train_loader and val_loader must each yield at least one batch")

    optimizer = optim.AdamW(model.parameters(), lr=base_lr, weight_decay=0.01, betas=(0.9, 0.98))

    total_steps = len(train_loader) * epochs
    warmup_steps = len(train_loader)  # 1 epoch warmup
    scheduler = GradualWarmupScheduler(optimizer, warmup_steps, total_steps)

    # Targets equal to 0 (<PAD>) do not contribute to the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=0, label_smoothing=0.1)

    best_val_loss = float('inf')
    patience_counter = 0
    checkpoint_saved = False

    for epoch in range(epochs):
        # Training
        model.train()
        train_loss = 0
        for batch_idx, (x, y) in enumerate(train_loader):
            x, y = x.to(device), y.to(device)

            optimizer.zero_grad()
            logits = model(x)
            # Flatten (batch, seq, vocab) -> (batch*seq, vocab) for the loss.
            loss = criterion(logits.view(-1, logits.shape[-1]), y.view(-1))
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            train_loss += loss.item()

            if (batch_idx + 1) % 50 == 0:
                print(f"Epoch {epoch+1}/{epochs}, Batch {batch_idx+1}/{len(train_loader)}, Loss: {loss.item():.4f}, LR: {optimizer.param_groups[0]['lr']:.6f}")

        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for x, y in val_loader:
                x, y = x.to(device), y.to(device)
                logits = model(x)
                loss = criterion(logits.view(-1, logits.shape[-1]), y.view(-1))
                val_loss += loss.item()

        avg_train_loss = train_loss / len(train_loader)
        avg_val_loss = val_loss / len(val_loader)

        print(f"\nEpoch {epoch+1}/{epochs} - Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}\n")

        # Early stopping
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_saved = True
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping at epoch {epoch+1}")
                break

    # Restore the best weights on every exit path, not only on early stop.
    # Skipped when nothing was saved (e.g. epochs == 0 or the loss never
    # compared below infinity) so a stale file is never loaded.
    if checkpoint_saved:
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))

    return model

# GENERATION

In [24]:
def generate(model, tokenizer, prompt, max_len=100, device='cpu', temperature=0.9, top_k=50,
             max_context=127):
    """Sample a continuation of ``prompt`` with temperature and top-k sampling.

    The prompt is encoded and prefixed with the ``<BOS>`` id (2). Up to
    ``max_len`` tokens are then sampled one at a time, stopping early when the
    ``<EOS>`` id (3) is drawn.

    Args:
        model: Callable module mapping ``(1, seq)`` token ids to ``(1, seq, vocab)`` logits.
        tokenizer: Object exposing ``encode(str)`` and ``decode(list of ids)``.
        prompt: Text to continue.
        max_len: Maximum number of tokens to generate.
        device: Device the input ids are moved to.
        temperature: Logit divisor; ``0`` selects greedy (argmax) decoding.
        top_k: Number of highest-scoring candidates to sample from; values
            larger than the vocabulary are clamped to the vocabulary size.
        max_context: Number of most recent tokens fed to the model per step.

    Returns:
        Whatever ``tokenizer.decode`` returns for the prompt plus the generated
        tokens, with the leading ``<BOS>`` removed.
    """
    model.eval()
    bos_id, eos_id = 2, 3

    # <BOS> goes FIRST so that the tokens[1:] slice at the end removes it and
    # not the first prompt word.
    # NOTE(review): WikiDataset in this file never inserts <BOS>/<EOS>, so the
    # model may not have seen these ids during training - confirm intent.
    tokens = [bos_id] + tokenizer.encode(prompt)

    with torch.no_grad():
        for _ in range(max_len):
            x = torch.tensor([tokens[-max_context:]], dtype=torch.long).to(device)
            logits = model(x)[0, -1, :]  # scores for the next token only

            if temperature == 0:
                # Greedy decoding; avoids dividing the logits by zero.
                next_token = torch.argmax(logits).item()
            else:
                # Top-k sampling; torch.topk raises if k exceeds the vocabulary.
                k = min(top_k, logits.size(-1))
                top_k_logits, top_k_indices = torch.topk(logits / temperature, k)
                probs = F.softmax(top_k_logits, dim=-1)
                next_token = top_k_indices[torch.multinomial(probs, 1)].item()

            if next_token == eos_id:  # EOS token
                break

            tokens.append(next_token)

    return tokenizer.decode(tokens[1:])

# MAIN

In [25]:
if __name__ == "__main__":
    # End-to-end run: load text, build the vocabulary, train the decoder,
    # print a few samples and save the final weights.

    # Configuration - OPTIMIZED
    SEED = 42
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    VOCAB_SIZE = 5000
    D_MODEL = 384  # Reduced for faster training
    N_HEADS = 6    # Reduced
    N_LAYERS = 4   # Reduced
    D_FF = 1536    # Proportional to d_model
    BATCH_SIZE = 64  # Increased for better gradient estimates
    EPOCHS = 15
    SEQ_LENGTH = 128
    BASE_LR = 2e-3  # Higher initial learning rate
    MAX_LINES_PER_FILE = 10000  # More samples for better training

    # Seed the RNGs so weight init, batch shuffling and sampling are
    # repeatable between runs (as far as the backend allows).
    torch.manual_seed(SEED)
    np.random.seed(SEED)

    print(f"Using device: {DEVICE}")
    print(f"Model config: d_model={D_MODEL}, n_heads={N_HEADS}, n_layers={N_LAYERS}")

    # Load data
    data_path = "/content/gdrive/MyDrive/practical_data/assignment4"

    # Read Wikipedia sentences
    all_texts = []
    print("Loading data...")
    # os.listdir returns entries in arbitrary order; sort so the corpus order,
    # and therefore the train/val split below, is stable across runs.
    for file in sorted(os.listdir(data_path)):
        if file.endswith('.txt'):
            with open(os.path.join(data_path, file), 'r', encoding='utf-8', errors='ignore') as f:
                all_texts.extend(f.readlines()[:MAX_LINES_PER_FILE])

    print(f"Loaded {len(all_texts)} texts")

    # Build tokenizer
    print("Building tokenizer...")
    tokenizer = SimpleTokenizer(VOCAB_SIZE)
    tokenizer.build_vocab(all_texts)
    print(f"Vocabulary size: {len(tokenizer.word2idx)}")

    # Split data: first 80% of the lines for training, the rest for validation.
    split_idx = int(0.8 * len(all_texts))
    train_texts = all_texts[:split_idx]
    val_texts = all_texts[split_idx:]

    # Create datasets
    print("Creating datasets...")
    train_dataset = WikiDataset(train_texts, tokenizer, SEQ_LENGTH)
    val_dataset = WikiDataset(val_texts, tokenizer, SEQ_LENGTH)

    print(f"Train sequences: {len(train_dataset)}")
    print(f"Val sequences: {len(val_dataset)}")

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, num_workers=0)

    # Model
    print("Building model...")
    model = DecoderTransformer(VOCAB_SIZE, D_MODEL, N_HEADS, N_LAYERS, D_FF).to(DEVICE)
    print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")

    # Train
    print("Training...")
    model = train_model(model, train_loader, val_loader, DEVICE, EPOCHS, BASE_LR)

    # Generate
    print("\nGenerating text:")
    prompts = ["the quick brown", "artificial intelligence", "deep learning"]
    for prompt in prompts:
        generated = generate(model, tokenizer, prompt, device=DEVICE)
        print(f"Prompt: '{prompt}' -> '{generated}'")

    # Save model
    torch.save(model.state_dict(), "/content/final_model.pt")
    print("\nModel saved to /content/final_model.pt")

Using device: cuda
Model config: d_model=384, n_heads=6, n_layers=4
Loading data...
Loaded 10000 texts
Building tokenizer...
Vocabulary size: 5000
Creating datasets...
Train sequences: 7113
Val sequences: 1957
Building model...
Model parameters: 10,943,624
Training...
Epoch 1/15, Batch 50/112, Loss: 6.0709, LR: 0.000893
Epoch 1/15, Batch 100/112, Loss: 4.7779, LR: 0.001786

Epoch 1/15 - Train Loss: 5.9635, Val Loss: 5.4205

Epoch 2/15, Batch 50/112, Loss: 4.2700, LR: 0.001936
Epoch 2/15, Batch 100/112, Loss: 3.6386, LR: 0.001872

Epoch 2/15 - Train Loss: 4.2559, Val Loss: 4.9565

Epoch 3/15, Batch 50/112, Loss: 3.8001, LR: 0.001793
Epoch 3/15, Batch 100/112, Loss: 4.1239, LR: 0.001730

Epoch 3/15 - Train Loss: 3.7471, Val Loss: 4.8485

Epoch 4/15, Batch 50/112, Loss: 3.6247, LR: 0.001651
Epoch 4/15, Batch 100/112, Loss: 3.7202, LR: 0.001587

Epoch 4/15 - Train Loss: 3.4381, Val Loss: 4.7857

Epoch 5/15, Batch 50/112, Loss: 3.3562, LR: 0.001508
Epoch 5/15, Batch 100/112, Loss: 3.4348, L