In [15]:
#Importing Necessary Libraries

import os
import json
import random
import math
import re
import numpy as np
import sentencepiece as spm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import pandas as pd
from collections import Counter

In [16]:
# URDU TEXT NORMALIZATION

def normalize_urdu_text(text):
    
    # Remove diacritics (Unicode range U+064B to U+065F and U+0670)
    text = re.sub(r'[\u064B-\u065F\u0670]', '', text)
    
    # Standardize Alef forms
    text = re.sub(r'[آأإٱ]', 'ا', text)
    
    # Standardize Yeh forms
    text = re.sub(r'ے', 'ی', text)
    text = re.sub(r'ۓ', 'ی', text)
    
    # Remove extra whitespace
    text = ' '.join(text.split())
    
    return text.strip()

In [17]:
# DATA PREPROCESSING

def preprocess_urdu_dataset(data_path, output_dir="preprocessed"):
    
    os.makedirs(output_dir, exist_ok=True)
    
    print("Loading dataset")
    df = pd.read_csv(data_path, sep="\t")
    sentences = df["sentence"].dropna().astype(str).tolist()
    
    # Normalize all sentences
    print("Normalizing Urdu text")
    sentences = [normalize_urdu_text(s) for s in sentences]
    sentences = [s for s in sentences if len(s.strip()) > 3]
    print(f"✅ Total sentences after normalization: {len(sentences)}")
    
    # Create conversation pairs
    print("Creating conversation pairs")
    pairs = []
    for i in range(len(sentences) - 1):
        src = sentences[i]
        tgt = sentences[i + 1]
        if len(src.split()) > 1 and len(tgt.split()) > 1:
            pairs.append({"src": src, "tgt": tgt})
    
    print(f"Created {len(pairs)} conversation pairs")
    
    # Save all text for tokenizer training
    corpus_path = os.path.join(output_dir, "corpus.txt")
    with open(corpus_path, "w", encoding="utf-8") as f:
        for text in sentences:
            f.write(text + "\n")
    
    print(f"Saved corpus to {corpus_path}")
    return pairs

def train_tokenizer(corpus_path, output_prefix, vocab_size=16000):
    """Train SentencePiece tokenizer for Urdu"""
    print(f"Training SentencePiece tokenizer (vocab_size={vocab_size})...")
    spm.SentencePieceTrainer.Train(
        f"--input={corpus_path} "
        f"--model_prefix={output_prefix} "
        f"--vocab_size={vocab_size} "
        f"--model_type=bpe "
        f"--character_coverage=1.0 "
        f"--pad_id=0 --unk_id=1 --bos_id=2 --eos_id=3"
    )
    print(f"Tokenizer trained: {output_prefix}.model")

def prepare_dataset(pairs, sp_model, output_dir):
    
    print("Loading tokenizer...")
    sp = spm.SentencePieceProcessor(model_file=sp_model)
    
    # Tokenize pairs
    print("Tokenizing all pairs...")
    data = []
    for pair in tqdm(pairs, desc="Tokenizing"):
        src_ids = sp.encode(pair["src"], out_type=int)
        tgt_ids = sp.encode(pair["tgt"], out_type=int)
        data.append({
            "src": pair["src"],
            "tgt": pair["tgt"],
            "src_ids": src_ids,
            "tgt_ids": tgt_ids
        })
    
    # Shuffle and split (80/10/10)
    random.shuffle(data)
    n = len(data)
    train_idx = int(0.8 * n)
    val_idx = int(0.9 * n)
    
    splits = {
        "train": data[:train_idx],
        "val": data[train_idx:val_idx],
        "test": data[val_idx:]
    }
    
    print(f"\n📊 Dataset Split:")
    for split_name, split_data in splits.items():
        path = os.path.join(output_dir, f"{split_name}.jsonl")
        with open(path, "w", encoding="utf-8") as f:
            for item in split_data:
                f.write(json.dumps(item, ensure_ascii=False) + "\n")
        print(f"  - {split_name}: {len(split_data)} samples ({len(split_data)/n*100:.1f}%)")
    
    return splits

In [18]:
# DATASET CLASS

class ConversationDataset(Dataset):
    """PyTorch Dataset for conversation pairs"""
    def __init__(self, jsonl_path, max_len=64):
        self.data = []
        with open(jsonl_path, "r", encoding="utf-8") as f:
            for line in f:
                self.data.append(json.loads(line))
        self.max_len = max_len
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        item = self.data[idx]
        # Add BOS and EOS tokens
        src_ids = [2] + item["src_ids"][:self.max_len-2] + [3]
        tgt_ids = [2] + item["tgt_ids"][:self.max_len-2] + [3]
        
        # Pad sequences
        src_len = len(src_ids)
        tgt_len = len(tgt_ids)
        
        src_ids += [0] * (self.max_len - src_len)
        tgt_ids += [0] * (self.max_len - tgt_len)
        
        return {
            "src": torch.tensor(src_ids[:self.max_len], dtype=torch.long),
            "tgt": torch.tensor(tgt_ids[:self.max_len], dtype=torch.long),
            "src_len": min(src_len, self.max_len),
            "tgt_len": min(tgt_len, self.max_len),
            "src_text": item["src"],
            "tgt_text": item["tgt"]
        }

In [19]:
# TRANSFORMER COMPONENTS

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=512, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * 
                            (-math.log(10000.0) / d_model))
        
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
    
    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)
        
        self.dropout = nn.Dropout(dropout)
    
    def split_heads(self, x):
        batch_size = x.size(0)
        x = x.view(batch_size, -1, self.num_heads, self.d_k)
        return x.transpose(1, 2)
    
    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)
        
        # Linear projections
        Q = self.split_heads(self.W_q(query))
        K = self.split_heads(self.W_k(key))
        V = self.split_heads(self.W_v(value))
        
        # Scaled dot-product attention
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        
        attn_weights = F.softmax(scores, dim=-1)
        attn_weights = self.dropout(attn_weights)
        
        context = torch.matmul(attn_weights, V)
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        
        return self.W_o(context)

class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        return self.linear2(self.dropout(F.relu(self.linear1(x))))

class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.feed_forward = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x, mask=None):
        # Self-attention with residual connection
        attn_out = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_out))
        
        # Feed-forward with residual connection
        ff_out = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_out))
        
        return x

class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.cross_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.feed_forward = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x, enc_output, src_mask=None, tgt_mask=None):
        # Self-attention with residual connection
        attn_out = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(attn_out))
        
        # Cross-attention with encoder output
        attn_out = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(attn_out))
        
        # Feed-forward with residual connection
        ff_out = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_out))
        
        return x

In [20]:
# FULL TRANSFORMER MODEL

class TransformerSeq2Seq(nn.Module):
    """
    Complete Transformer Encoder-Decoder Model (REQUIREMENT)
    Built from scratch without pre-trained models
    """
    def __init__(self, vocab_size, d_model=256, num_heads=4, 
                 num_encoder_layers=2, num_decoder_layers=2, 
                 d_ff=512, dropout=0.1, max_len=512):
        super().__init__()
        
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=0)
        self.pos_encoding = PositionalEncoding(d_model, max_len, dropout)
        
        # Encoder stack
        self.encoder_layers = nn.ModuleList([
            EncoderLayer(d_model, num_heads, d_ff, dropout)
            for _ in range(num_encoder_layers)
        ])
        
        # Decoder stack
        self.decoder_layers = nn.ModuleList([
            DecoderLayer(d_model, num_heads, d_ff, dropout)
            for _ in range(num_decoder_layers)
        ])
        
        self.output_layer = nn.Linear(d_model, vocab_size)
        self.dropout = nn.Dropout(dropout)
        
    def create_padding_mask(self, seq):
        """Create mask for padding tokens"""
        return (seq != 0).unsqueeze(1).unsqueeze(2)
    
    def create_look_ahead_mask(self, size):
        """Create causal mask for decoder"""
        mask = torch.triu(torch.ones(size, size), diagonal=1).bool()
        return ~mask
    
    def encode(self, src):
        """Encoder: Process full input context"""
        src_mask = self.create_padding_mask(src)
        x = self.embedding(src) * math.sqrt(self.d_model)
        x = self.pos_encoding(x)
        
        for layer in self.encoder_layers:
            x = layer(x, src_mask)
        
        return x
    
    def decode(self, tgt, enc_output, src_mask=None):
        """Decoder: Generate response token-by-token"""
        tgt_mask = self.create_padding_mask(tgt)
        look_ahead_mask = self.create_look_ahead_mask(tgt.size(1)).to(tgt.device)
        combined_mask = tgt_mask & look_ahead_mask.unsqueeze(0)
        
        x = self.embedding(tgt) * math.sqrt(self.d_model)
        x = self.pos_encoding(x)
        
        for layer in self.decoder_layers:
            x = layer(x, enc_output, src_mask, combined_mask)
        
        return self.output_layer(x)
    
    def forward(self, src, tgt):
        """Forward pass for training"""
        src_mask = self.create_padding_mask(src)
        enc_output = self.encode(src)
        output = self.decode(tgt[:, :-1], enc_output, src_mask)
        return output

In [21]:
# TRAINING WITH TEACHER FORCING

def train_model(model, train_loader, val_loader, device, epochs=20, lr=3e-4, 
                teacher_forcing_ratio=0.5):
    
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.98), eps=1e-9)
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    
    best_val_loss = float('inf')
    training_history = []
    
    print(f"🚀 Starting Training (Teacher Forcing Ratio: {teacher_forcing_ratio})")
    
    for epoch in range(epochs):
        # Training
        model.train()
        train_loss = 0
        
        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")
        for batch in pbar:
            src = batch["src"].to(device)
            tgt = batch["tgt"].to(device)
            
            optimizer.zero_grad()
            
            # Teacher forcing
            output = model(src, tgt)
            
            # Calculate loss
            loss = criterion(output.reshape(-1, output.size(-1)), 
                           tgt[:, 1:].reshape(-1))
            
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            
            train_loss += loss.item()
            pbar.set_postfix({'loss': f"{loss.item():.4f}"})
        
        avg_train_loss = train_loss / len(train_loader)
        
        # ===== VALIDATION PHASE =====
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                src = batch["src"].to(device)
                tgt = batch["tgt"].to(device)
                output = model(src, tgt)
                loss = criterion(output.reshape(-1, output.size(-1)), 
                               tgt[:, 1:].reshape(-1))
                val_loss += loss.item()
        
        avg_val_loss = val_loss / len(val_loader)
        perplexity = math.exp(avg_val_loss)
        
        # Log metrics
        metrics = {
            'epoch': epoch + 1,
            'train_loss': avg_train_loss,
            'val_loss': avg_val_loss,
            'perplexity': perplexity
        }
        training_history.append(metrics)
        
        print(f"\n📊 Epoch {epoch+1} Results:")
        print(f"   Train Loss: {avg_train_loss:.4f}")
        print(f"   Val Loss: {avg_val_loss:.4f}")
        print(f"   Perplexity: {perplexity:.2f}")
        
        # Save best model
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_loss': avg_val_loss,
                'perplexity': perplexity,
                'training_history': training_history
            }, 'best_model.pth')
            print(f"   ✅ Model saved! (Best Val Loss: {avg_val_loss:.4f})")
        print()
    
    return training_history

In [22]:
# INFERENCE FUNCTIONS

def greedy_decode(model, src, max_len, device, bos_id=2, eos_id=3):
   
    model.eval()
    src = src.to(device)
    
    with torch.no_grad():
        enc_output = model.encode(src)
        src_mask = model.create_padding_mask(src)
        
        tgt = torch.tensor([[bos_id]], dtype=torch.long, device=device)
        
        for _ in range(max_len):
            output = model.decode(tgt, enc_output, src_mask)
            next_token = output[:, -1, :].argmax(dim=-1, keepdim=True)
            tgt = torch.cat([tgt, next_token], dim=1)
            
            if next_token.item() == eos_id:
                break
        
        return tgt[0].cpu().tolist()

def beam_search_decode(model, src, beam_size, max_len, device, bos_id=2, eos_id=3):
    
    model.eval()
    src = src.to(device)
    
    with torch.no_grad():
        enc_output = model.encode(src)
        src_mask = model.create_padding_mask(src)
        
        # Initialize beam with BOS token
        beams = [(torch.tensor([[bos_id]], dtype=torch.long, device=device), 0.0)]
        completed = []
        
        for _ in range(max_len):
            candidates = []
            
            for seq, score in beams:
                if seq[0, -1].item() == eos_id:
                    completed.append((seq, score))
                    continue
                
                output = model.decode(seq, enc_output, src_mask)
                log_probs = F.log_softmax(output[:, -1, :], dim=-1)
                top_probs, top_indices = log_probs.topk(beam_size)
                
                for prob, idx in zip(top_probs[0], top_indices[0]):
                    new_seq = torch.cat([seq, idx.unsqueeze(0).unsqueeze(0)], dim=1)
                    new_score = score + prob.item()
                    candidates.append((new_seq, new_score))
            
            if not candidates:
                break
            
            # Select top beam_size candidates
            beams = sorted(candidates, key=lambda x: x[1], reverse=True)[:beam_size]
            
            if len(completed) >= beam_size:
                break
        
        # Return best completed sequence
        if completed:
            best_seq = max(completed, key=lambda x: x[1])[0]
        else:
            best_seq = beams[0][0]
        
        return best_seq[0].cpu().tolist()

def generate_response(model, sp, input_text, max_len=64, device='cpu', 
                     strategy='greedy', beam_size=3):
    
    # Normalize input
    input_text = normalize_urdu_text(input_text)
    
    # Encode input
    src_ids = [2] + sp.encode(input_text, out_type=int) + [3]
    src_tensor = torch.tensor([src_ids], dtype=torch.long)
    
    # Generate output
    if strategy == 'beam':
        output_ids = beam_search_decode(model, src_tensor, beam_size, max_len, device)
    else:
        output_ids = greedy_decode(model, src_tensor, max_len, device)
    
    # Decode output
    response = sp.decode([id for id in output_ids if id not in [0, 2, 3]])
    
    return response

In [23]:
# EVALUATION METRICS

def calculate_bleu_score(references, hypotheses):
    
    from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
    
    smooth = SmoothingFunction()
    scores = []
    
    for ref, hyp in zip(references, hypotheses):
        ref_tokens = [ref.split()]
        hyp_tokens = hyp.split()
        
        try:
            score = sentence_bleu(ref_tokens, hyp_tokens, 
                                smoothing_function=smooth.method1)
            scores.append(score)
        except:
            scores.append(0.0)
    
    return np.mean(scores) if scores else 0.0

def calculate_rouge_l(references, hypotheses):
    
    try:
        from rouge_score import rouge_scorer
        scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=False)
        
        scores = []
        for ref, hyp in zip(references, hypotheses):
            score = scorer.score(ref, hyp)
            scores.append(score['rougeL'].fmeasure)
        
        return np.mean(scores) if scores else 0.0
    except ImportError:
        print("⚠️ rouge-score not installed. Install: pip install rouge-score")
        return 0.0

def calculate_chrf(references, hypotheses):
    
    def char_ngrams(text, n):
        return [text[i:i+n] for i in range(len(text)-n+1)]
    
    scores = []
    for ref, hyp in zip(references, hypotheses):
        ref_chars = set(char_ngrams(ref, 3))
        hyp_chars = set(char_ngrams(hyp, 3))
        
        if len(hyp_chars) == 0:
            scores.append(0.0)
            continue
        
        precision = len(ref_chars & hyp_chars) / len(hyp_chars)
        recall = len(ref_chars & hyp_chars) / len(ref_chars) if len(ref_chars) > 0 else 0
        
        f_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        scores.append(f_score)
    
    return np.mean(scores) if scores else 0.0

def evaluate_model(model, test_loader, sp, device, num_samples=100):
    
    print("EVALUATING MODEL")
    
    model.eval()
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    
    references = []
    hypotheses = []
    total_loss = 0
    
    with torch.no_grad():
        for i, batch in enumerate(tqdm(test_loader, desc="Evaluating")):
            if i >= num_samples // test_loader.batch_size:
                break
            
            src = batch["src"].to(device)
            tgt = batch["tgt"].to(device)
            
            # Calculate loss for perplexity
            output = model(src, tgt)
            loss = criterion(output.reshape(-1, output.size(-1)), 
                           tgt[:, 1:].reshape(-1))
            total_loss += loss.item()
            
            # Generate predictions for BLEU/ROUGE/chrF
            for j in range(min(4, src.size(0))):  # Evaluate 4 per batch
                src_tensor = src[j:j+1]
                pred_ids = greedy_decode(model, src_tensor, 64, device)
                pred_text = sp.decode([id for id in pred_ids if id not in [0, 2, 3]])
                
                ref_text = batch["tgt_text"][j]
                
                references.append(ref_text)
                hypotheses.append(pred_text)
    
    # Calculate metrics
    perplexity = math.exp(total_loss / len(test_loader))
    bleu = calculate_bleu_score(references, hypotheses)
    rouge_l = calculate_rouge_l(references, hypotheses)
    chrf = calculate_chrf(references, hypotheses)
    
    metrics = {
        'perplexity': perplexity,
        'bleu': bleu,
        'rouge_l': rouge_l,
        'chrf': chrf
    }
    
    print(f"\n{'='*60}")
    print("📊 EVALUATION RESULTS")
    print(f"{'='*60}")
    print(f"  Perplexity: {perplexity:.2f}")
    print(f"  BLEU Score: {bleu:.4f}")
    print(f"  ROUGE-L: {rouge_l:.4f}")
    print(f"  chrF: {chrf:.4f}")
    print(f"{'='*60}\n")
    
    # Show qualitative examples
    print("🔍 QUALITATIVE EXAMPLES:\n")
    for i in range(min(5, len(references))):
        print(f"Input: {references[i][:50]}...")
        print(f"Predicted: {hypotheses[i]}")
        print(f"Reference: {references[i]}")
        print("-" * 60)
    
    # Save evaluation results
    with open('evaluation_results.json', 'w', encoding='utf-8') as f:
        json.dump({
            'metrics': metrics,
            'examples': [{'ref': r, 'hyp': h} for r, h in zip(references[:10], hypotheses[:10])]
        }, f, ensure_ascii=False, indent=2)
    
    print("✅ Evaluation results saved to 'evaluation_results.json'\n")
    
    return metrics


In [24]:
# HUMAN EVALUATION FRAMEWORK

def conduct_human_evaluation(model, sp, device, test_samples=20):

    evaluation_data = []
    
    test_inputs = [
        "آپ کیسے ہیں؟",
        "آج موسم کیسا ہے؟",
        "پاکستان کا دارالحکومت کیا ہے؟",
        "میں آپ کی مدد کیسے کر سکتا ہوں؟",
        "شکریہ"
    ]
    
    for i, input_text in enumerate(test_inputs[:test_samples], 1):
        print(f"\n{'='*60}")
        print(f"Sample {i}/{min(test_samples, len(test_inputs))}")
        print(f"{'='*60}")
        
        response = generate_response(model, sp, input_text, device=device)
        
        print(f"\nInput: {input_text}")
        print(f"Generated Response: {response}")
        print()
        
        # In production, collect these from human evaluators
        # For now, create a template
        evaluation_data.append({
            'input': input_text,
            'response': response,
            'fluency': None,  # 1-5 score
            'relevance': None,  # 1-5 score
            'adequacy': None  # 1-5 score
        })
    
    # Save template for human evaluation
    with open('human_evaluation_template.json', 'w', encoding='utf-8') as f:
        json.dump(evaluation_data, f, ensure_ascii=False, indent=2)
    
    print("\n✅ Human evaluation template saved to 'human_evaluation_template.json'")
    print("   Please fill in the fluency, relevance, and adequacy scores (1-5)")
    
    return evaluation_data


In [25]:
# MAIN EXECUTION

def main():
    # CONFIGURATION
    CONFIG = {
        'data_path': "/kaggle/input/urdu-dataset-20000/final_main_dataset.tsv",
        'output_dir': "preprocessed",
        'vocab_size': 16000,
        'd_model': 256,
        'num_heads': 4,
        'num_encoder_layers': 2,
        'num_decoder_layers': 2,
        'd_ff': 512,
        'dropout': 0.2,
        'batch_size': 32,
        'epochs': 20,
        'lr': 3e-4,
        'max_len': 64,
        'device': "cuda" if torch.cuda.is_available() else "cpu"
    }
    
    for key, value in CONFIG.items():
        print(f"  {key}: {value}")
    
    print(f"\n  Device: {CONFIG['device']}")
    if CONFIG['device'] == 'cuda':
        print(f"  GPU: {torch.cuda.get_device_name(0)}")
    print()
    
    # PREPROCESSING
    print("STEP 1: DATA PREPROCESSING")
    
    pairs = preprocess_urdu_dataset(CONFIG['data_path'], CONFIG['output_dir'])
    
    # TOKENIZER TRAINING
    print("STEP 2: TOKENIZER TRAINING")
    
    train_tokenizer(
        corpus_path=os.path.join(CONFIG['output_dir'], "corpus.txt"),
        output_prefix=os.path.join(CONFIG['output_dir'], "sp_urdu"),
        vocab_size=CONFIG['vocab_size']
    )
    
    # Dataset Prepare
    print("STEP 3: DATASET PREPARATION")
    
    sp_model_path = os.path.join(CONFIG['output_dir'], "sp_urdu.model")
    prepare_dataset(pairs, sp_model_path, CONFIG['output_dir'])
    
    # Data loading
    print("STEP 4: CREATING DATALOADERS")
    
    train_ds = ConversationDataset(
        os.path.join(CONFIG['output_dir'], "train.jsonl"),
        max_len=CONFIG['max_len']
    )
    val_ds = ConversationDataset(
        os.path.join(CONFIG['output_dir'], "val.jsonl"),
        max_len=CONFIG['max_len']
    )
    test_ds = ConversationDataset(
        os.path.join(CONFIG['output_dir'], "test.jsonl"),
        max_len=CONFIG['max_len']
    )
    
    train_loader = DataLoader(train_ds, batch_size=CONFIG['batch_size'], shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=CONFIG['batch_size'])
    test_loader = DataLoader(test_ds, batch_size=CONFIG['batch_size'])
    
    print(f"  Training batches: {len(train_loader)}")
    print(f"  Validation batches: {len(val_loader)}")
    print(f"  Test batches: {len(test_loader)}")
    
    # Model Initialize
    print(f"\n{'='*60}")
    print("STEP 5: MODEL INITIALIZATION")
    print(f"{'='*60}\n")
    
    sp = spm.SentencePieceProcessor(model_file=sp_model_path)
    vocab_size = sp.get_piece_size()
    
    model = TransformerSeq2Seq(
        vocab_size=vocab_size,
        d_model=CONFIG['d_model'],
        num_heads=CONFIG['num_heads'],
        num_encoder_layers=CONFIG['num_encoder_layers'],
        num_decoder_layers=CONFIG['num_decoder_layers'],
        d_ff=CONFIG['d_ff'],
        dropout=CONFIG['dropout'],
        max_len=512
    )
    
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    
    print(f"  Total Parameters: {total_params:,}")
    print(f"  Trainable Parameters: {trainable_params:,}")
    print(f"  Model Size: {total_params * 4 / 1024 / 1024:.2f} MB (float32)")
    
    # Training
    print("STEP 6: MODEL TRAINING")
    
    training_history = train_model(
        model, 
        train_loader, 
        val_loader, 
        CONFIG['device'],
        epochs=CONFIG['epochs'],
        lr=CONFIG['lr'],
        teacher_forcing_ratio=0.5
    )
    
    # Save training history
    with open('training_history.json', 'w') as f:
        json.dump(training_history, f, indent=2)
    
    # LOAD BEST MODEL
    print("STEP 7: LOADING BEST MODEL")
    
    checkpoint = torch.load('best_model.pth', map_location=CONFIG['device'])
    model.load_state_dict(checkpoint['model_state_dict'])
    print(f" Loaded best model from epoch {checkpoint['epoch']+1}")
    print(f"   Validation Loss: {checkpoint['val_loss']:.4f}")
    print(f"   Perplexity: {checkpoint['perplexity']:.2f}")
    
    # Evaluation
    print("STEP 8: MODEL EVALUATION")
    
    metrics = evaluate_model(model, test_loader, sp, CONFIG['device'], num_samples=100)
    
    # Testing
    print("STEP 9: INTERACTIVE TESTING")
    
    test_inputs = [
        "آپ کا نام کیا ہے؟",
        "آج موسم کیسا ہے؟",
        "پاکستان کے بارے میں بتائیں",
        "شکریہ",
        "خدا حافظ",
        "میں آپ کی مدد کیسے کر سکتا ہوں؟"
    ]
    
    print("Testing with sample inputs:\n")
    for inp in test_inputs:
        print(f"{'='*60}")
        print(f"Input: {inp}")
        
        # Greedy decoding
        greedy_response = generate_response(
            model, sp, inp, 
            device=CONFIG['device'], 
            strategy='greedy'
        )
        print(f"Greedy Response: {greedy_response}")
        
        # Beam search decoding
        beam_response = generate_response(
            model, sp, inp, 
            device=CONFIG['device'], 
            strategy='beam',
            beam_size=3
        )
        print(f"Beam Search Response: {beam_response}")
        print()
    
    # HUMAN EVALUATION SETUP
    print("STEP 10: HUMAN EVALUATION SETUP")
    conduct_human_evaluation(model, sp, CONFIG['device'], test_samples=5)
    
    # Save Model
    print("SAVING COMPLETE MODEL")
    
    torch.save({
        'config': CONFIG,
        'model_state_dict': model.state_dict(),
        'vocab_size': vocab_size,
        'training_history': training_history,
        'evaluation_metrics': metrics
    }, 'complete_model.pth')
    
    # Summary

    print("\n Final Metrics:")
    print(f"   - Perplexity: {metrics['perplexity']:.2f}")
    print(f"   - BLEU: {metrics['bleu']:.4f}")
    print(f"   - ROUGE-L: {metrics['rouge_l']:.4f}")
    print(f"   - chrF: {metrics['chrf']:.4f}")

In [26]:
# RUN MAIN

if __name__ == "__main__":
    # Set random seeds for reproducibility
    random.seed(42)
    np.random.seed(42)
    torch.manual_seed(42)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(42)
    
    main()

  data_path: /kaggle/input/urdu-dataset-20000/final_main_dataset.tsv
  output_dir: preprocessed
  vocab_size: 16000
  d_model: 256
  num_heads: 4
  num_encoder_layers: 2
  num_decoder_layers: 2
  d_ff: 512
  dropout: 0.2
  batch_size: 32
  epochs: 20
  lr: 0.0003
  max_len: 64
  device: cuda

  Device: cuda
  GPU: Tesla P100-PCIE-16GB

STEP 1: DATA PREPROCESSING
Loading dataset
Normalizing Urdu text
✅ Total sentences after normalization: 19979
Creating conversation pairs
Created 19310 conversation pairs
Saved corpus to preprocessed/corpus.txt
STEP 2: TOKENIZER TRAINING
Training SentencePiece tokenizer (vocab_size=16000)...
Tokenizer trained: preprocessed/sp_urdu.model
STEP 3: DATASET PREPARATION
Loading tokenizer...
Tokenizing all pairs...


Tokenizing: 100%|██████████| 19310/19310 [00:00<00:00, 20736.32it/s]



📊 Dataset Split:
  - train: 15448 samples (80.0%)
  - val: 1931 samples (10.0%)
  - test: 1931 samples (10.0%)
STEP 4: CREATING DATALOADERS
  Training batches: 483
  Validation batches: 61
  Test batches: 61

STEP 5: MODEL INITIALIZATION

  Total Parameters: 10,843,776
  Trainable Parameters: 10,843,776
  Model Size: 41.37 MB (float32)
STEP 6: MODEL TRAINING
🚀 Starting Training (Teacher Forcing Ratio: 0.5)


Epoch 1/20: 100%|██████████| 483/483 [00:13<00:00, 35.42it/s, loss=6.2857]



📊 Epoch 1 Results:
   Train Loss: 6.5703
   Val Loss: 6.1026
   Perplexity: 447.01
   ✅ Model saved! (Best Val Loss: 6.1026)



Epoch 2/20: 100%|██████████| 483/483 [00:13<00:00, 35.42it/s, loss=6.1280]



📊 Epoch 2 Results:
   Train Loss: 5.8215
   Val Loss: 5.7691
   Perplexity: 320.26
   ✅ Model saved! (Best Val Loss: 5.7691)



Epoch 3/20: 100%|██████████| 483/483 [00:13<00:00, 35.46it/s, loss=5.5364]



📊 Epoch 3 Results:
   Train Loss: 5.4313
   Val Loss: 5.5939
   Perplexity: 268.79
   ✅ Model saved! (Best Val Loss: 5.5939)



Epoch 4/20: 100%|██████████| 483/483 [00:13<00:00, 35.40it/s, loss=5.4471]



📊 Epoch 4 Results:
   Train Loss: 5.1359
   Val Loss: 5.3975
   Perplexity: 220.85
   ✅ Model saved! (Best Val Loss: 5.3975)



Epoch 5/20: 100%|██████████| 483/483 [00:13<00:00, 35.22it/s, loss=5.0505]



📊 Epoch 5 Results:
   Train Loss: 4.8960
   Val Loss: 5.2642
   Perplexity: 193.28
   ✅ Model saved! (Best Val Loss: 5.2642)



Epoch 6/20: 100%|██████████| 483/483 [00:13<00:00, 35.29it/s, loss=4.5092]



📊 Epoch 6 Results:
   Train Loss: 4.6900
   Val Loss: 5.1516
   Perplexity: 172.70
   ✅ Model saved! (Best Val Loss: 5.1516)



Epoch 7/20: 100%|██████████| 483/483 [00:13<00:00, 35.36it/s, loss=4.5522]



📊 Epoch 7 Results:
   Train Loss: 4.5079
   Val Loss: 5.0341
   Perplexity: 153.56
   ✅ Model saved! (Best Val Loss: 5.0341)



Epoch 8/20: 100%|██████████| 483/483 [00:13<00:00, 35.54it/s, loss=4.6771]



📊 Epoch 8 Results:
   Train Loss: 4.3460
   Val Loss: 4.9486
   Perplexity: 140.97
   ✅ Model saved! (Best Val Loss: 4.9486)



Epoch 9/20: 100%|██████████| 483/483 [00:13<00:00, 35.55it/s, loss=4.1521]



📊 Epoch 9 Results:
   Train Loss: 4.1985
   Val Loss: 4.8633
   Perplexity: 129.45
   ✅ Model saved! (Best Val Loss: 4.8633)



Epoch 10/20: 100%|██████████| 483/483 [00:13<00:00, 35.54it/s, loss=4.1079]



📊 Epoch 10 Results:
   Train Loss: 4.0631
   Val Loss: 4.7906
   Perplexity: 120.37
   ✅ Model saved! (Best Val Loss: 4.7906)



Epoch 11/20: 100%|██████████| 483/483 [00:13<00:00, 35.57it/s, loss=4.0965]



📊 Epoch 11 Results:
   Train Loss: 3.9391
   Val Loss: 4.7011
   Perplexity: 110.07
   ✅ Model saved! (Best Val Loss: 4.7011)



Epoch 12/20: 100%|██████████| 483/483 [00:13<00:00, 35.62it/s, loss=3.8738]



📊 Epoch 12 Results:
   Train Loss: 3.8222
   Val Loss: 4.6635
   Perplexity: 106.00
   ✅ Model saved! (Best Val Loss: 4.6635)



Epoch 13/20: 100%|██████████| 483/483 [00:13<00:00, 35.35it/s, loss=3.9045]



📊 Epoch 13 Results:
   Train Loss: 3.7116
   Val Loss: 4.6236
   Perplexity: 101.86
   ✅ Model saved! (Best Val Loss: 4.6236)



Epoch 14/20: 100%|██████████| 483/483 [00:13<00:00, 34.95it/s, loss=3.8084]



📊 Epoch 14 Results:
   Train Loss: 3.6157
   Val Loss: 4.5562
   Perplexity: 95.22
   ✅ Model saved! (Best Val Loss: 4.5562)



Epoch 15/20: 100%|██████████| 483/483 [00:13<00:00, 35.38it/s, loss=3.5970]



📊 Epoch 15 Results:
   Train Loss: 3.5123
   Val Loss: 4.4819
   Perplexity: 88.40
   ✅ Model saved! (Best Val Loss: 4.4819)



Epoch 16/20: 100%|██████████| 483/483 [00:13<00:00, 35.25it/s, loss=3.3389]



📊 Epoch 16 Results:
   Train Loss: 3.4214
   Val Loss: 4.4707
   Perplexity: 87.42
   ✅ Model saved! (Best Val Loss: 4.4707)



Epoch 17/20: 100%|██████████| 483/483 [00:13<00:00, 35.52it/s, loss=3.3299]



📊 Epoch 17 Results:
   Train Loss: 3.3380
   Val Loss: 4.4177
   Perplexity: 82.91
   ✅ Model saved! (Best Val Loss: 4.4177)



Epoch 18/20: 100%|██████████| 483/483 [00:13<00:00, 35.42it/s, loss=3.2765]



📊 Epoch 18 Results:
   Train Loss: 3.2546
   Val Loss: 4.3806
   Perplexity: 79.88
   ✅ Model saved! (Best Val Loss: 4.3806)



Epoch 19/20: 100%|██████████| 483/483 [00:13<00:00, 35.43it/s, loss=3.4271]



📊 Epoch 19 Results:
   Train Loss: 3.1836
   Val Loss: 4.3564
   Perplexity: 77.97
   ✅ Model saved! (Best Val Loss: 4.3564)



Epoch 20/20: 100%|██████████| 483/483 [00:13<00:00, 35.47it/s, loss=3.3447]



📊 Epoch 20 Results:
   Train Loss: 3.1029
   Val Loss: 4.3094
   Perplexity: 74.40
   ✅ Model saved! (Best Val Loss: 4.3094)

STEP 7: LOADING BEST MODEL
 Loaded best model from epoch 20
   Validation Loss: 4.3094
   Perplexity: 74.40
STEP 8: MODEL EVALUATION
EVALUATING MODEL


Evaluating:   5%|▍         | 3/61 [00:00<00:07,  8.05it/s]


⚠️ rouge-score not installed. Install: pip install rouge-score

📊 EVALUATION RESULTS
  Perplexity: 1.24
  BLEU Score: 0.0053
  ROUGE-L: 0.0000
  chrF: 0.0448

🔍 QUALITATIVE EXAMPLES:

Input: کالی کافی اور سگاروں پہ گزارا کرتی۔...
Predicted: اس کی خلاف ورزی پر غور کیا ہی؟
Reference: کالی کافی اور سگاروں پہ گزارا کرتی۔
------------------------------------------------------------
Input: کی سن پیدائش کی سب سی ہٹ فلم...
Predicted: اس سی زیادہ کیا جا رہا ہی؟
Reference: کی سن پیدائش کی سب سی ہٹ فلم
------------------------------------------------------------
Input: جب سیاست دانوں کی ساتھ عوام کا عملی تعلق ہو گا۔...
Predicted: میں نی اپنی طرف دیکھ لیا
Reference: جب سیاست دانوں کی ساتھ عوام کا عملی تعلق ہو گا۔
------------------------------------------------------------
Input: ہماری تاریخ میں اس کی بی شمار مثالیں مو جود ہیں۔...
Predicted: یہ کوئی نہیں کہا جائی گا کہ فلاں ادمی دائرۂ اسلام سی خارج کررہی ہیں
Reference: ہماری تاریخ میں اس کی بی شمار مثالیں مو جود ہیں۔
------------------------------