In [38]:
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import random
from torch.utils.data import Dataset, DataLoader
import sentencepiece as sp
from tqdm import tqdm
import math
import numpy as np
import torch.nn.functional as F
import torch.optim as optim

In [39]:
!pip install rouge_score sacrebleu torch



In [40]:
from sacrebleu.metrics import BLEU, CHRF
from rouge_score import rouge_scorer

In [41]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [42]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [43]:
tokenizer = sp.SentencePieceProcessor()
tokenizer.Load('/content/drive/MyDrive/NLP_ASS2/TEST/bpe_tokenizer.model')


True

In [44]:
VOCAB_SIZE = tokenizer.get_piece_size()
PAD_ID = tokenizer.piece_to_id('<pad>')
UNK_ID = tokenizer.piece_to_id('<unk>')
SOS_ID = tokenizer.piece_to_id('<s>')
EOS_ID = tokenizer.piece_to_id('</s>')
print(f"Vocabulary size: {VOCAB_SIZE}")
print(f"PAD ID: {PAD_ID}")


Vocabulary size: 768
PAD ID: 0


In [45]:

with open('/content/drive/MyDrive/NLP_ASS2/TEST/sentences_cleaned.txt', 'r', encoding='utf-8') as f:
        data = [line.strip() for line in f if line.strip()]


In [46]:
# Split dataset: 80% train, 10% validation, 10% test
print("Splitting dataset...")
random.shuffle(data)
train_size = int(0.8 * len(data))

train_groups = data[:train_size]
val_groups = data[train_size:]

print(f"Train: {len(train_groups)}, Val: {len(val_groups)}")



Splitting dataset...
Train: 8197, Val: 2050


In [47]:

class UrduChatbotDataset(Dataset):
    """Dataset for Urdu chatbot with teacher forcing"""

    def __init__(self, sentence_groups, max_len=50):
        self.sentence_groups = sentence_groups
        self.max_len = max_len

    def __len__(self):
        return len(self.sentence_groups)

    def __getitem__(self, idx):
        # Split based on word count (2/5th for input, 3/5th for target)
        group = self.sentence_groups[idx]
        words = group.split()

        # Calculate split point based on word count
        total_words = len(words)
        split_point = max(1, total_words * 2 // 5)  # At least 1 word for input

        input_text = ' '.join(words[:split_point])
        target_text = ' '.join(words[split_point:])

        # Tokenize input and target
        input_tokens = tokenizer.Encode(input_text)
        target_tokens = tokenizer.Encode(target_text)
        # Truncate and pad
        input_ids = input_tokens[:self.max_len] + [PAD_ID] * (self.max_len - len(input_tokens))
        target_ids = target_tokens[:self.max_len] + [PAD_ID] * (self.max_len - len(target_tokens))

        # Teacher forcing: decoder input is [START] + target[:-1], decoder target is target
        decoder_input_ids = [SOS_ID] + target_ids[:-1]
        decoder_target_ids = target_ids

        return {
            'encoder_input': torch.tensor(input_ids, dtype=torch.long),
            'decoder_input': torch.tensor(decoder_input_ids, dtype=torch.long),
            'decoder_target': torch.tensor(decoder_target_ids, dtype=torch.long)
        }


In [48]:
train_dataset = UrduChatbotDataset(train_groups)
val_dataset = UrduChatbotDataset(val_groups)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)


# --------------------TRANSFORMER CODE--------------------------

In [49]:

class PositionalEncoding(nn.Module):
    """Positional encoding using sine and cosine"""
    def __init__(self, d_model, max_len=5000, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)

class MultiHeadAttention(nn.Module):
    """Multi-Head Attention mechanism"""
    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()
        assert d_model % num_heads == 0
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def split_heads(self, x):
        batch_size, seq_len, d_model = x.size()
        x = x.view(batch_size, seq_len, self.num_heads, self.d_k)
        return x.transpose(1, 2)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        attention_weights = F.softmax(scores, dim=-1)
        attention_weights = self.dropout(attention_weights)
        output = torch.matmul(attention_weights, V)
        return output, attention_weights

    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)
        Q = self.split_heads(self.W_q(query))
        K = self.split_heads(self.W_k(key))
        V = self.split_heads(self.W_v(value))
        attn_output, _ = self.scaled_dot_product_attention(Q, K, V, mask)
        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(batch_size, -1, self.d_model)
        output = self.W_o(attn_output)
        return output

class FeedForward(nn.Module):
    """Position-wise Feed-Forward Network"""
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = F.relu(self.linear1(x))
        x = self.dropout(x)
        x = self.linear2(x)
        return x

class EncoderLayer(nn.Module):
    """Single Transformer Encoder Layer"""
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.self_attention = MultiHeadAttention(d_model, num_heads, dropout)
        self.feed_forward = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        attn_output = self.self_attention(x, x, x, mask)
        x = self.norm1(x + self.dropout1(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout2(ff_output))
        return x

class DecoderLayer(nn.Module):
    """Single Transformer Decoder Layer"""
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.self_attention = MultiHeadAttention(d_model, num_heads, dropout)
        self.cross_attention = MultiHeadAttention(d_model, num_heads, dropout)
        self.feed_forward = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

    def forward(self, x, encoder_output, src_mask=None, tgt_mask=None):
        self_attn_output = self.self_attention(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout1(self_attn_output))
        cross_attn_output = self.cross_attention(x, encoder_output, encoder_output, src_mask)
        x = self.norm2(x + self.dropout2(cross_attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout3(ff_output))
        return x

class Transformer(nn.Module):
    """Complete Transformer Encoder-Decoder Model"""
    def __init__(self, vocab_size, d_model=256, num_heads=2, d_ff=1024,
                 num_encoder_layers=2, num_decoder_layers=2, max_len=512,
                 dropout=0.1, pad_idx=0):
        super().__init__()
        self.d_model = d_model
        self.pad_idx = pad_idx

        self.encoder_embedding = nn.Embedding(vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(vocab_size, d_model)
        self.encoder_pos_encoding = PositionalEncoding(d_model, max_len, dropout)
        self.decoder_pos_encoding = PositionalEncoding(d_model, max_len, dropout)

        self.encoder_layers = nn.ModuleList([
            EncoderLayer(d_model, num_heads, d_ff, dropout)
            for _ in range(num_encoder_layers)
        ])

        self.decoder_layers = nn.ModuleList([
            DecoderLayer(d_model, num_heads, d_ff, dropout)
            for _ in range(num_decoder_layers)
        ])

        self.output_projection = nn.Linear(d_model, vocab_size)
        self._init_parameters()

    def _init_parameters(self):
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def make_src_mask(self, src):
        src_mask = (src != self.pad_idx).unsqueeze(1).unsqueeze(2)
        return src_mask

    def make_tgt_mask(self, tgt):
        batch_size, tgt_len = tgt.size()
        tgt_pad_mask = (tgt != self.pad_idx).unsqueeze(1).unsqueeze(2)
        tgt_sub_mask = torch.tril(torch.ones((tgt_len, tgt_len), device=tgt.device)).bool()
        tgt_mask = tgt_pad_mask & tgt_sub_mask
        return tgt_mask

    def forward(self, src, tgt):
        src_mask = self.make_src_mask(src)
        tgt_mask = self.make_tgt_mask(tgt)

        # Encoder
        x = self.encoder_embedding(src) * math.sqrt(self.d_model)
        x = self.encoder_pos_encoding(x)
        for layer in self.encoder_layers:
            x = layer(x, src_mask)
        encoder_output = x

        # Decoder
        x = self.decoder_embedding(tgt) * math.sqrt(self.d_model)
        x = self.decoder_pos_encoding(x)
        for layer in self.decoder_layers:
            x = layer(x, encoder_output, src_mask, tgt_mask)

        output = self.output_projection(x)
        return output


# --------------------TRAINING-TEST CODE--------------------------

In [50]:
def train_epoch(model, train_loader, criterion, optimizer):
    """Train for one epoch with teacher forcing"""
    model.train()
    epoch_loss = 0
    pbar = tqdm(train_loader, desc="Training")

    for batch in pbar:
        encoder_input = batch['encoder_input'].to(device)
        decoder_input = batch['decoder_input'].to(device)
        decoder_target = batch['decoder_target'].to(device)

        optimizer.zero_grad()
        output = model(encoder_input, decoder_input)
        output = output.reshape(-1, output.size(-1))
        target = decoder_target.reshape(-1)
        loss = criterion(output, target)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        epoch_loss += loss.item()
        pbar.set_postfix({'Train loss': f'{loss.item():.4f}'})

    return epoch_loss / len(train_loader)

In [51]:
def calculate_bleu(predictions, references):
    """Calculate BLEU score"""
    bleu = BLEU()
    score = bleu.corpus_score(predictions, [references])
    return score.score

def calculate_rouge_l(predictions, references):
    """Calculate ROUGE-L score"""
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=False)
    scores = []
    for pred, ref in zip(predictions, references):
        score = scorer.score(ref, pred)
        scores.append(score['rougeL'].fmeasure)
    return np.mean(scores)

def calculate_chrf(predictions, references):
    """Calculate chrF score"""
    chrf = CHRF()
    score = chrf.corpus_score(predictions, [references])
    return score.score

def calculate_perplexity(loss):
    """Calculate perplexity from loss"""
    return math.exp(min(loss, 100))

In [52]:
def evaluate(model, dataloader, criterian):
    model.eval()
    total_loss = 0
    predictions = []
    references = []

    with torch.no_grad():
        progress_bar = tqdm(dataloader, desc="Evaluating (no teacher forcing)")
        for batch in progress_bar:
            encoder_input = batch['encoder_input'].to(device)
            decoder_input = batch['decoder_input'].to(device)
            decoder_target = batch['decoder_target'].to(device)

            output = model(encoder_input, decoder_input)
            loss = criterian(output.reshape(-1, output.size(-1)), decoder_target.reshape(-1))

            total_loss += loss.item()
            progress_bar.set_postfix({'val loss': loss.item()})

            # Decode predictions and references for metrics
            pred_ids = output.argmax(dim=-1).cpu().tolist()
            tgt_ids = decoder_target.cpu().tolist()
            for pred, ref in zip(pred_ids, tgt_ids):
                pred_text = tokenizer.decode(pred)
                ref_text = tokenizer.decode(ref)
                predictions.append(pred_text)
                references.append(ref_text)

    avg_loss = total_loss / len(dataloader)

    bleu_score = calculate_bleu(predictions, references)
    rouge_score = calculate_rouge_l(predictions, references)
    chrf_score = calculate_chrf(predictions, references)
    perplexity = calculate_perplexity(avg_loss)

    print("\nEvaluation Results:")
    print(f"  Val Loss:   {avg_loss:.4f}")
    print(f"  BLEU:       {bleu_score:.4f}")
    print(f"  ROUGE-L:    {rouge_score:.4f}")
    print(f"  chrF:       {chrf_score:.4f}")
    print(f"  Perplexity: {perplexity:.4f}")

    return {
        'loss': avg_loss,
        'bleu': bleu_score,
        'rouge_l': rouge_score,
        'chrf': chrf_score,
        'perplexity': perplexity,
        'predictions': predictions[:10],
        'references': references[:10]
    }

In [70]:

def generate_text(model, input_text, max_length=30, temperature=0.8, device='cpu'):
    """Generate text continuation using beam search"""
    model.eval()

    with torch.no_grad():
        token_ids = tokenizer.encode(input_text)
        encoder_ids = token_ids[:50] + [PAD_ID] * (50 - len(token_ids))
        encoder_input = torch.tensor([encoder_ids], dtype=torch.long).to(device)
        decoder_input = torch.tensor([[SOS_ID]], dtype=torch.long).to(device)

        generated_tokens = []
        for _ in range(max_length):
            output = model(encoder_input, decoder_input)
            next_token_logits = output[0, -1, :] / temperature
            probs = F.softmax(next_token_logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)

            if next_token.item() == EOS_ID:
                break

            generated_tokens.append(next_token.item())
            decoder_input = torch.cat([decoder_input, next_token.unsqueeze(0)], dim=1)

        output_tokens = []
        for idx in generated_tokens:
            if idx not in [PAD_ID,SOS_ID,EOS_ID]:
                token = tokenizer.id_to_piece(idx)
                output_tokens.append(token)

        text = ''.join(output_tokens).replace('▁', ' ')
        return text.strip()


In [71]:

model = Transformer(
    vocab_size=768,
    d_model=512,
    num_heads=2,
    d_ff=1024,
    num_encoder_layers=2,
    num_decoder_layers=2,
    max_len=50,
    dropout=0.1,
    pad_idx=PAD_ID
).to(device)


print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")


Model parameters: 11,694,848


In [55]:
criterian=nn.CrossEntropyLoss(ignore_index=PAD_ID)
optimizer =torch.optim.Adam(model.parameters(),lr=1e-4)
#scheduler =torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,patience=3,factor=0.5)

In [56]:
output=None
best_bleu = 0


In [57]:
NUM_EPOCHS=30
for epoch in range(NUM_EPOCHS):
        print(f"\nEpoch {epoch + 1}/{NUM_EPOCHS}")
        #teacher_forcing=max(0,teacher_forcing-0.1)

        loss =train_epoch(model,train_loader,criterian,optimizer)
        #scheduler.step(loss)
        output = evaluate(model, val_loader,criterian)


        if output['bleu'] > best_bleu:
            best_bleu = output['bleu']
            torch.save({
                'model_state_dict': model.state_dict(),
            }, 'best_model.pt')
            print(f"\n  ✓ New best model saved! BLEU: {best_bleu:.2f}")




Epoch 1/30


Training: 100%|██████████| 257/257 [00:15<00:00, 16.76it/s, Train loss=5.5494]
Evaluating (no teacher forcing): 100%|██████████| 65/65 [00:01<00:00, 61.12it/s, val loss=5.97]



Evaluation Results:
  Val Loss:   5.6342
  BLEU:       0.0062
  ROUGE-L:    0.0000
  chrF:       4.5318
  Perplexity: 279.8299

  ✓ New best model saved! BLEU: 0.01

Epoch 2/30


Training: 100%|██████████| 257/257 [00:12<00:00, 20.92it/s, Train loss=5.1574]
Evaluating (no teacher forcing): 100%|██████████| 65/65 [00:01<00:00, 40.71it/s, val loss=5.6]



Evaluation Results:
  Val Loss:   5.0428
  BLEU:       0.0063
  ROUGE-L:    0.0000
  chrF:       5.7832
  Perplexity: 154.9019

  ✓ New best model saved! BLEU: 0.01

Epoch 3/30


Training: 100%|██████████| 257/257 [00:11<00:00, 23.28it/s, Train loss=4.9907]
Evaluating (no teacher forcing): 100%|██████████| 65/65 [00:01<00:00, 55.12it/s, val loss=5.54]



Evaluation Results:
  Val Loss:   4.8019
  BLEU:       0.0148
  ROUGE-L:    0.0000
  chrF:       6.8308
  Perplexity: 121.7471

  ✓ New best model saved! BLEU: 0.01

Epoch 4/30


Training: 100%|██████████| 257/257 [00:11<00:00, 23.14it/s, Train loss=4.1929]
Evaluating (no teacher forcing): 100%|██████████| 65/65 [00:01<00:00, 55.75it/s, val loss=5.1]



Evaluation Results:
  Val Loss:   4.6578
  BLEU:       0.0194
  ROUGE-L:    0.0000
  chrF:       7.2050
  Perplexity: 105.4067

  ✓ New best model saved! BLEU: 0.02

Epoch 5/30


Training: 100%|██████████| 257/257 [00:10<00:00, 23.56it/s, Train loss=4.3769]
Evaluating (no teacher forcing): 100%|██████████| 65/65 [00:01<00:00, 39.22it/s, val loss=5.15]



Evaluation Results:
  Val Loss:   4.5835
  BLEU:       0.0267
  ROUGE-L:    0.0000
  chrF:       8.1727
  Perplexity: 97.8527

  ✓ New best model saved! BLEU: 0.03

Epoch 6/30


Training: 100%|██████████| 257/257 [00:10<00:00, 24.68it/s, Train loss=3.6282]
Evaluating (no teacher forcing): 100%|██████████| 65/65 [00:01<00:00, 56.94it/s, val loss=5.11]



Evaluation Results:
  Val Loss:   4.5402
  BLEU:       0.0224
  ROUGE-L:    0.0000
  chrF:       8.4670
  Perplexity: 93.7114

Epoch 7/30


Training: 100%|██████████| 257/257 [00:10<00:00, 24.00it/s, Train loss=4.3411]
Evaluating (no teacher forcing): 100%|██████████| 65/65 [00:01<00:00, 56.37it/s, val loss=5.07]



Evaluation Results:
  Val Loss:   4.5090
  BLEU:       0.0273
  ROUGE-L:    0.0000
  chrF:       8.3609
  Perplexity: 90.8295

  ✓ New best model saved! BLEU: 0.03

Epoch 8/30


Training: 100%|██████████| 257/257 [00:10<00:00, 24.07it/s, Train loss=3.5479]
Evaluating (no teacher forcing): 100%|██████████| 65/65 [00:01<00:00, 55.90it/s, val loss=4.85]



Evaluation Results:
  Val Loss:   4.4906
  BLEU:       0.0284
  ROUGE-L:    0.0000
  chrF:       8.4936
  Perplexity: 89.1730

  ✓ New best model saved! BLEU: 0.03

Epoch 9/30


Training: 100%|██████████| 257/257 [00:10<00:00, 23.60it/s, Train loss=3.7320]
Evaluating (no teacher forcing): 100%|██████████| 65/65 [00:01<00:00, 53.94it/s, val loss=4.76]



Evaluation Results:
  Val Loss:   4.4726
  BLEU:       0.0305
  ROUGE-L:    0.0000
  chrF:       8.4665
  Perplexity: 87.5871

  ✓ New best model saved! BLEU: 0.03

Epoch 10/30


Training: 100%|██████████| 257/257 [00:11<00:00, 23.15it/s, Train loss=3.7056]
Evaluating (no teacher forcing): 100%|██████████| 65/65 [00:01<00:00, 55.02it/s, val loss=4.9]



Evaluation Results:
  Val Loss:   4.4841
  BLEU:       0.0410
  ROUGE-L:    0.0000
  chrF:       8.5080
  Perplexity: 88.5933

  ✓ New best model saved! BLEU: 0.04

Epoch 11/30


Training: 100%|██████████| 257/257 [00:10<00:00, 24.30it/s, Train loss=3.7916]
Evaluating (no teacher forcing): 100%|██████████| 65/65 [00:01<00:00, 55.74it/s, val loss=4.88]



Evaluation Results:
  Val Loss:   4.5004
  BLEU:       0.0344
  ROUGE-L:    0.0000
  chrF:       8.3853
  Perplexity: 90.0504

Epoch 12/30


Training: 100%|██████████| 257/257 [00:10<00:00, 24.65it/s, Train loss=2.9426]
Evaluating (no teacher forcing): 100%|██████████| 65/65 [00:01<00:00, 55.81it/s, val loss=4.94]



Evaluation Results:
  Val Loss:   4.5058
  BLEU:       0.0341
  ROUGE-L:    0.0000
  chrF:       8.6475
  Perplexity: 90.5445

Epoch 13/30


Training: 100%|██████████| 257/257 [00:10<00:00, 23.39it/s, Train loss=3.3101]
Evaluating (no teacher forcing): 100%|██████████| 65/65 [00:01<00:00, 56.37it/s, val loss=5]



Evaluation Results:
  Val Loss:   4.5230
  BLEU:       0.0259
  ROUGE-L:    0.0000
  chrF:       8.2935
  Perplexity: 92.1159

Epoch 14/30


Training: 100%|██████████| 257/257 [00:12<00:00, 21.26it/s, Train loss=3.4180]
Evaluating (no teacher forcing): 100%|██████████| 65/65 [00:01<00:00, 56.55it/s, val loss=4.78]



Evaluation Results:
  Val Loss:   4.5373
  BLEU:       0.0324
  ROUGE-L:    0.0000
  chrF:       8.6119
  Perplexity: 93.4396

Epoch 15/30


Training: 100%|██████████| 257/257 [00:11<00:00, 23.25it/s, Train loss=3.0368]
Evaluating (no teacher forcing): 100%|██████████| 65/65 [00:01<00:00, 55.30it/s, val loss=4.75]



Evaluation Results:
  Val Loss:   4.5590
  BLEU:       0.0368
  ROUGE-L:    0.0000
  chrF:       8.7821
  Perplexity: 95.4863

Epoch 16/30


Training: 100%|██████████| 257/257 [00:10<00:00, 24.45it/s, Train loss=2.9470]
Evaluating (no teacher forcing): 100%|██████████| 65/65 [00:01<00:00, 56.52it/s, val loss=4.96]



Evaluation Results:
  Val Loss:   4.5956
  BLEU:       0.0340
  ROUGE-L:    0.0000
  chrF:       8.6529
  Perplexity: 99.0483

Epoch 17/30


Training: 100%|██████████| 257/257 [00:10<00:00, 24.50it/s, Train loss=3.1337]
Evaluating (no teacher forcing): 100%|██████████| 65/65 [00:01<00:00, 56.28it/s, val loss=4.94]



Evaluation Results:
  Val Loss:   4.6367
  BLEU:       0.0367
  ROUGE-L:    0.0000
  chrF:       8.7548
  Perplexity: 103.2007

Epoch 18/30


Training: 100%|██████████| 257/257 [00:10<00:00, 24.55it/s, Train loss=2.3492]
Evaluating (no teacher forcing): 100%|██████████| 65/65 [00:01<00:00, 55.54it/s, val loss=4.91]



Evaluation Results:
  Val Loss:   4.6558
  BLEU:       0.0374
  ROUGE-L:    0.0000
  chrF:       8.7341
  Perplexity: 105.1897

Epoch 19/30


Training: 100%|██████████| 257/257 [00:10<00:00, 24.61it/s, Train loss=2.7102]
Evaluating (no teacher forcing): 100%|██████████| 65/65 [00:01<00:00, 57.47it/s, val loss=5.11]



Evaluation Results:
  Val Loss:   4.7087
  BLEU:       0.0473
  ROUGE-L:    0.0000
  chrF:       8.6619
  Perplexity: 110.9067

  ✓ New best model saved! BLEU: 0.05

Epoch 20/30


Training: 100%|██████████| 257/257 [00:10<00:00, 24.55it/s, Train loss=2.4269]
Evaluating (no teacher forcing): 100%|██████████| 65/65 [00:01<00:00, 55.74it/s, val loss=5.3]



Evaluation Results:
  Val Loss:   4.7549
  BLEU:       0.0432
  ROUGE-L:    0.0000
  chrF:       8.6021
  Perplexity: 116.1550

Epoch 21/30


Training: 100%|██████████| 257/257 [00:10<00:00, 24.54it/s, Train loss=2.9418]
Evaluating (no teacher forcing): 100%|██████████| 65/65 [00:01<00:00, 59.16it/s, val loss=5.03]



Evaluation Results:
  Val Loss:   4.7887
  BLEU:       0.0478
  ROUGE-L:    0.0000
  chrF:       8.8925
  Perplexity: 120.1429

  ✓ New best model saved! BLEU: 0.05

Epoch 22/30


Training: 100%|██████████| 257/257 [00:10<00:00, 24.02it/s, Train loss=2.4626]
Evaluating (no teacher forcing): 100%|██████████| 65/65 [00:01<00:00, 61.14it/s, val loss=5.32]



Evaluation Results:
  Val Loss:   4.8267
  BLEU:       0.0523
  ROUGE-L:    0.0000
  chrF:       8.7062
  Perplexity: 124.8017

  ✓ New best model saved! BLEU: 0.05

Epoch 23/30


Training: 100%|██████████| 257/257 [00:10<00:00, 23.74it/s, Train loss=2.4467]
Evaluating (no teacher forcing): 100%|██████████| 65/65 [00:01<00:00, 60.09it/s, val loss=5]



Evaluation Results:
  Val Loss:   4.8591
  BLEU:       0.0783
  ROUGE-L:    0.0000
  chrF:       8.8810
  Perplexity: 128.9071

  ✓ New best model saved! BLEU: 0.08

Epoch 24/30


Training: 100%|██████████| 257/257 [00:10<00:00, 24.45it/s, Train loss=2.3209]
Evaluating (no teacher forcing): 100%|██████████| 65/65 [00:01<00:00, 61.68it/s, val loss=5.51]



Evaluation Results:
  Val Loss:   4.9235
  BLEU:       0.0842
  ROUGE-L:    0.0000
  chrF:       8.7398
  Perplexity: 137.4854

  ✓ New best model saved! BLEU: 0.08

Epoch 25/30


Training: 100%|██████████| 257/257 [00:10<00:00, 24.41it/s, Train loss=1.8674]
Evaluating (no teacher forcing): 100%|██████████| 65/65 [00:01<00:00, 62.37it/s, val loss=5.36]



Evaluation Results:
  Val Loss:   4.9682
  BLEU:       0.0945
  ROUGE-L:    0.0000
  chrF:       8.7450
  Perplexity: 143.7674

  ✓ New best model saved! BLEU: 0.09

Epoch 26/30


Training: 100%|██████████| 257/257 [00:10<00:00, 24.54it/s, Train loss=2.1724]
Evaluating (no teacher forcing): 100%|██████████| 65/65 [00:01<00:00, 62.50it/s, val loss=5.5]



Evaluation Results:
  Val Loss:   5.0183
  BLEU:       0.0935
  ROUGE-L:    0.0000
  chrF:       8.7836
  Perplexity: 151.1550

Epoch 27/30


Training: 100%|██████████| 257/257 [00:10<00:00, 24.51it/s, Train loss=2.5280]
Evaluating (no teacher forcing): 100%|██████████| 65/65 [00:01<00:00, 62.39it/s, val loss=5.69]



Evaluation Results:
  Val Loss:   5.0722
  BLEU:       0.0971
  ROUGE-L:    0.0000
  chrF:       8.8030
  Perplexity: 159.5248

  ✓ New best model saved! BLEU: 0.10

Epoch 28/30


Training: 100%|██████████| 257/257 [00:10<00:00, 24.28it/s, Train loss=1.3407]
Evaluating (no teacher forcing): 100%|██████████| 65/65 [00:01<00:00, 61.93it/s, val loss=5.84]



Evaluation Results:
  Val Loss:   5.1351
  BLEU:       0.1084
  ROUGE-L:    0.0000
  chrF:       8.9288
  Perplexity: 169.8768

  ✓ New best model saved! BLEU: 0.11

Epoch 29/30


Training: 100%|██████████| 257/257 [00:10<00:00, 24.33it/s, Train loss=2.3528]
Evaluating (no teacher forcing): 100%|██████████| 65/65 [00:01<00:00, 62.05it/s, val loss=5.89]



Evaluation Results:
  Val Loss:   5.1955
  BLEU:       0.0975
  ROUGE-L:    0.0000
  chrF:       8.8381
  Perplexity: 180.4644

Epoch 30/30


Training: 100%|██████████| 257/257 [00:10<00:00, 24.35it/s, Train loss=1.9651]
Evaluating (no teacher forcing): 100%|██████████| 65/65 [00:01<00:00, 62.53it/s, val loss=5.78]



Evaluation Results:
  Val Loss:   5.2524
  BLEU:       0.1037
  ROUGE-L:    0.0000
  chrF:       8.8631
  Perplexity: 191.0191


In [58]:
print("Examples:")
for i in range(10):

    print(f"\n  Example {i+1}:")
    print(f"    Prediction: {output['predictions'][i]}")
    print(f"    Reference:  {output['references'][i]}")

Examples:

  Example 1:
    Prediction: کو قد طرح گرداں کی جیسے خون گرداں م جیسےٹٹیایایایایایایایایایایایایایایایا کییایایایایایا ہےیایایایایایایایایا
    Reference:  میں اسی طرح گرداں ہے جیسے خون گرداں ہے

  Example 2:
    Prediction: کی نہیںنےےگی اور نہیں ہے رہیلی ہے ہے ہے ہے ہے ہے ہے ہے ہے رہی رہی رہی ہے ہے ہے ہے ہے ہے ہے ہے رہی رہی رہی رہی رہی رہی رہی رہی رہی رہی رہی رہی رہی رہی رہی رہی رہی رہی رہی رہی
    Reference:  کو سنجیدگی سے کون لے؟

  Example 3:
    Prediction: کسی گئیانہانےا کے ہی ہوتا ہے ہے ہے ہے ہے ہے ہے ہے ہے ہے ہے رہی رہی کرنا ہے ہے ہے ہے ہے ہے ہے ہے ہے ہے ہے ہے ہے ہے ہے ہے ہے ہے ہے ہے ہے ہے ہے ہے ہے ہے ہے ہے
    Reference:  ہو روز جزا ایسے نہیں ہوتا

  Example 4:
    Prediction: تصلہ میں تلاف تھےدقرالہ ہیں ہیں ہیں تھے تھے تھے تھے تھےاتات تھے تھے تھے تھے تھے ہیں ہیں ہیں تھے تھےہہاتات تھے تھے تھے تھے تھے تھے تھے تھے تھے تھے تھے تھےاتات
    Reference:  حملے کیخلاف جلوس نکالیں

  Example 5:
    Prediction: کرسادھ ہوانول کارار ہیںتےے گے ہیں ہیں ہیں ہیں ہیں ہیں ہیں ہیں ہیں 

In [59]:
adsad

NameError: name 'adsad' is not defined

In [76]:
test_inputs = ["کبھی کبھار ہی", "پاکستان کے", "دنیا بھر میں"]

for input_text in test_inputs:
    generated = generate_text(model, input_text, max_length=20, device=device)
    print(f"Input:  {input_text}")
    print(f"Output: {generated}")
    print()

Input:  کبھی کبھار ہی
Output: ہو جاتا ہوں یا یا جلاو زدہ ہو جاتا ہوں گے؟؟اتا ہوں؟چ

Input:  پاکستان کے
Output: مطلب چندے لائن ڈیوسہ برابر ہوا ہے؟را ہوگ

Input:  دنیا بھر میں
Output: اپنی انکار کرپیشن کیس کا ہے؟ٹرول جاری ہیں؟ پہلے ہوں



In [None]:
save_path = "/content/drive/MyDrive/NLP_ASS2/TEST/model_state.pth"
# Save model
torch.save({
    'model_state_dict': model.state_dict(),
}, save_path)
print("Model saved as 'model_state.pth'")



In [72]:
model.load_state_dict(torch.load('best_model.pt')['model_state_dict'])
output = evaluate(model, val_loader,criterian)

Evaluating (no teacher forcing): 100%|██████████| 65/65 [00:01<00:00, 58.29it/s, val loss=5.84]



Evaluation Results:
  Val Loss:   5.1351
  BLEU:       0.1084
  ROUGE-L:    0.0000
  chrF:       8.9288
  Perplexity: 169.8768


In [None]:
test_inputs = ["کبھی کبھار ہی", "پاکستان کے", "دنیا بھر میں"]

for input_text in test_inputs:
    generated = generate_text(model, tokenizer, input_text, max_length=20, device=device)
    print(f"Input:  {input_text}")
    print(f"Output: {generated}")
    print()

In [77]:
save_path = "/content/drive/MyDrive/NLP_ASS2/TEST/model_state.pth"
# Save model
torch.save({
    'model_state_dict': model.state_dict(),
}, save_path)
print("Model saved as 'model_state.pth'")



Model saved as 'model_state.pth'
