In [None]:
from google.colab import drive
drive.mount ('/content/drive')

Mounted at /content/drive


In [None]:
import os
# Set environment variable for CUDA memory fragmentation mitigation
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import pandas as pd
import re
import nltk
import string
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
from collections import Counter
from torch.utils.data import Dataset, DataLoader
!pip install rouge-score

# Download required NLTK data
nltk.download('punkt')

#############################################
# 1. Load Preprocessed Data and Build Vocabulary
#############################################

# Load CSV files (ensure these paths are correct)
train_df = pd.read_csv('/content/train_processed.csv')
val_df   = pd.read_csv('/content/val_processed.csv')
test_df  = pd.read_csv('/content/test_processed.csv')

# Build a vocabulary from the training set (from both text and title)
min_freq = 0.001 * len(train_df)
counter = Counter()

def tokenize(text):
    # Data is preprocessed (punctuation removed, etc.)
    return text.split()

for text in train_df['text']:
    counter.update(tokenize(text))
for title in train_df['title']:
    counter.update(tokenize(title))

# Initialize vocabulary with special tokens
vocab = {'<pad>': 0, '<bos>': 1, '<eos>': 2, '<unk>': 3}
for token, freq in counter.items():
    if freq >= min_freq:
        vocab[token] = len(vocab)

# Inverse vocabulary mapping for decoding
inv_vocab = {idx: token for token, idx in vocab.items()}
vocab_size = len(vocab)
print("Vocabulary Size:", vocab_size)

def text_to_sequence(text, vocab):
    tokens = tokenize(text)
    return [vocab.get(token, vocab['<unk>']) for token in tokens]

def sequence_to_text(seq, inv_vocab):
    tokens = [inv_vocab.get(idx, '<unk>') for idx in seq]
    return " ".join(tokens)

#############################################
# 2. Create Dataset and DataLoader
#############################################

class TitleDataset(Dataset):
    def __init__(self, df, vocab):
        self.df = df
        self.vocab = vocab

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        input_seq = text_to_sequence(row['text'], self.vocab)
        target_seq = text_to_sequence(row['title'], self.vocab)
        # Add <bos> and <eos> tokens to target sequence
        target_seq = [self.vocab['<bos>']] + target_seq + [self.vocab['<eos>']]
        return torch.tensor(input_seq, dtype=torch.long), torch.tensor(target_seq, dtype=torch.long)

def collate_fn(batch):
    inputs, targets = zip(*batch)
    input_lengths = [len(seq) for seq in inputs]
    target_lengths = [len(seq) for seq in targets]
    max_input = max(input_lengths)
    max_target = max(target_lengths)
    padded_inputs = [F.pad(seq, (0, max_input - len(seq)), value=vocab['<pad>']) for seq in inputs]
    padded_targets = [F.pad(seq, (0, max_target - len(seq)), value=vocab['<pad>']) for seq in targets]
    return torch.stack(padded_inputs), torch.stack(padded_targets)

# Use a smaller batch size to save memory
train_dataset = TitleDataset(train_df, vocab)
val_dataset   = TitleDataset(val_df, vocab)
test_dataset  = TitleDataset(test_df, vocab)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)
val_loader   = DataLoader(val_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)
test_loader  = DataLoader(test_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)

#############################################
# 3. Define Model Components
#############################################

# --- Basic EncoderRNN (bidirectional GRU) ---
class EncoderRNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, dropout=0.2):
        super(EncoderRNN, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.gru = nn.GRU(embed_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_seq):
        embedded = self.dropout(self.embedding(input_seq))
        outputs, hidden = self.gru(embedded)
        # Concatenate forward and backward hidden states (assumes one layer)
        hidden_cat = torch.cat((hidden[0:1], hidden[1:2]), dim=2)
        return outputs, hidden_cat

    # Setter for loading pre-trained embeddings (e.g., GloVe)
    def load_embeddings(self, embeddings):
        self.embedding.weight.data.copy_(embeddings)
        # Optionally freeze embeddings:
        # self.embedding.weight.requires_grad = False

# --- Basic DecoderRNN (unidirectional GRU) ---
class DecoderRNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, dropout=0.2):
        super(DecoderRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.gru = nn.GRU(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_token, hidden):
        input_token = input_token.unsqueeze(1)
        embedded = self.dropout(self.embedding(input_token))
        output, hidden = self.gru(embedded, hidden)
        output = self.fc(output.squeeze(1))
        output = F.log_softmax(output, dim=1)
        return output, hidden

# --- Hierarchical Encoder (HierEncoderRNN) ---
class HierEncoderRNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, word_hidden_dim, sent_hidden_dim, dropout=0.2):
        super(HierEncoderRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.word_gru = nn.GRU(embed_dim, word_hidden_dim, bidirectional=True, batch_first=True)
        self.sent_gru = nn.GRU(word_hidden_dim*2, sent_hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_seq):
        embedded = self.dropout(self.embedding(input_seq))
        word_outputs, _ = self.word_gru(embedded)
        # For simplicity, treat the entire article as one sentence by averaging word outputs
        sent_embedding = torch.mean(word_outputs, dim=1, keepdim=True)
        sent_outputs, sent_hidden = self.sent_gru(sent_embedding)
        return sent_outputs, sent_hidden

# --- Decoder with 2 GRUs (Decoder2RNN) ---
class Decoder2RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, dropout=0.2):
        super(Decoder2RNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.gru1 = nn.GRU(embed_dim, hidden_dim, batch_first=True)
        self.gru2 = nn.GRU(hidden_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_token, hidden):
        input_token = input_token.unsqueeze(1)
        embedded = self.dropout(self.embedding(input_token))
        out1, hidden1 = self.gru1(embedded, hidden)
        out2, hidden2 = self.gru2(out1, hidden)
        output = self.fc(out2.squeeze(1))
        output = F.log_softmax(output, dim=1)
        return output, hidden2

# --- Seq2seq Model (selectable encoder/decoder) ---
class Seq2seqRNN(nn.Module):
    def __init__(self, encoder, decoder, bos_token_idx, eos_token_idx, max_new_tokens=20):
        super(Seq2seqRNN, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.bos_token_idx = bos_token_idx
        self.eos_token_idx = eos_token_idx
        self.max_new_tokens = max_new_tokens

    def forward(self, input_seq, target_seq=None, teacher_forcing_ratio=0.5, use_beam_search=False, beam_width=3):
        batch_size = input_seq.size(0)
        encoder_outputs, hidden = self.encoder(input_seq)
        decoder_input = torch.full((batch_size,), self.bos_token_idx, dtype=torch.long, device=input_seq.device)
        outputs = []
        if target_seq is not None:
            target_length = target_seq.size(1)
            for t in range(target_length):
                decoder_output, hidden = self.decoder(decoder_input, hidden)
                outputs.append(decoder_output.unsqueeze(1))
                if torch.rand(1).item() < teacher_forcing_ratio:
                    decoder_input = target_seq[:, t]
                else:
                    decoder_input = decoder_output.argmax(dim=1)
            outputs = torch.cat(outputs, dim=1)
        else:
            if not use_beam_search:
                generated_tokens = []
                for _ in range(self.max_new_tokens):
                    decoder_output, hidden = self.decoder(decoder_input, hidden)
                    top1 = decoder_output.argmax(dim=1)
                    generated_tokens.append(top1.unsqueeze(1))
                    decoder_input = top1
                    if (top1 == self.eos_token_idx).all():
                        break
                outputs = torch.cat(generated_tokens, dim=1)
            else:
                outputs = self.beam_search_decoding(decoder_input, hidden, beam_width)
        return outputs

    def beam_search_decoding(self, decoder_input, hidden, beam_width):
        batch_size = decoder_input.size(0)
        all_outputs = []
        for i in range(batch_size):
            beams = [([self.bos_token_idx], 0.0, hidden[:, i:i+1, :])]
            for _ in range(self.max_new_tokens):
                new_beams = []
                for seq, score, hidden_i in beams:
                    last_token = torch.tensor([seq[-1]], device=hidden.device)
                    decoder_output, hidden_new = self.decoder(last_token, hidden_i)
                    topv, topi = decoder_output.topk(beam_width)
                    for k in range(beam_width):
                        token = topi[0][k].item()
                        new_seq = seq + [token]
                        new_score = score + topv[0][k].item()
                        new_beams.append((new_seq, new_score, hidden_new))
                beams = sorted(new_beams, key=lambda x: x[1], reverse=True)[:beam_width]
                if all(seq[-1] == self.eos_token_idx for seq, _, _ in beams):
                    break
            best_seq = beams[0][0]
            all_outputs.append(torch.tensor(best_seq[1:], device=hidden.device))
        max_len = max(seq.size(0) for seq in all_outputs)
        padded = []
        for seq in all_outputs:
            if seq.size(0) < max_len:
                pad = torch.full((max_len - seq.size(0),), self.eos_token_idx, device=seq.device)
                seq = torch.cat([seq, pad])
            padded.append(seq.unsqueeze(0))
        return torch.cat(padded, dim=0)

#############################################
# 4. Instantiate Model and (Optionally) Load GloVe
#############################################

embed_dim   = 300
hidden_dim  = 300
bos_token_idx = vocab['<bos>']
eos_token_idx = vocab['<eos>']

# You can choose your variant:
# Basic:
# encoder = EncoderRNN(vocab_size, embed_dim, hidden_dim)
# decoder = DecoderRNN(vocab_size, embed_dim, hidden_dim)

# Improved (hierarchical encoder and 2-GRU decoder)
encoder = HierEncoderRNN(vocab_size, embed_dim, word_hidden_dim=hidden_dim, sent_hidden_dim=hidden_dim)
decoder = Decoder2RNN(vocab_size, embed_dim, hidden_dim)

model = Seq2seqRNN(encoder, decoder, bos_token_idx, eos_token_idx, max_new_tokens=20)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Optionally load GloVe embeddings
def load_glove_embeddings(glove_path, vocab, embed_dim):
    embeddings = torch.randn(len(vocab), embed_dim)
    with open(glove_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split()
            token = parts[0]
            if token in vocab:
                vec = torch.tensor([float(x) for x in parts[1:]], dtype=torch.float)
                embeddings[vocab[token]] = vec
    return embeddings

# Uncomment if you have a GloVe file:
# glove_path = '/content/glove.6B.300d.txt'
# glove_embeddings = load_glove_embeddings(glove_path, vocab, embed_dim)
# encoder.load_embeddings(glove_embeddings)

#############################################
# 5. Training and Evaluation Functions (using Mixed Precision)
#############################################

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.NLLLoss(ignore_index=vocab['<pad>'])
scaler = torch.cuda.amp.GradScaler()  # For mixed precision training

def train(model, dataloader, optimizer, criterion, device):
    model.train()
    epoch_loss = 0
    for input_seq, target_seq in dataloader:
        input_seq, target_seq = input_seq.to(device), target_seq.to(device)
        optimizer.zero_grad()
        with torch.cuda.amp.autocast():
            output = model(input_seq, target_seq, teacher_forcing_ratio=0.5)
            output_dim = output.size(-1)
            output = output.view(-1, output_dim)
            target_seq = target_seq.view(-1)
            loss = criterion(output, target_seq)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        epoch_loss += loss.item()
    torch.cuda.empty_cache()  # clear cache after each epoch
    return epoch_loss / len(dataloader)

def evaluate(model, dataloader, criterion, device):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for input_seq, target_seq in dataloader:
            input_seq, target_seq = input_seq.to(device), target_seq.to(device)
            output = model(input_seq, target_seq, teacher_forcing_ratio=0)
            output_dim = output.size(-1)
            output = output.view(-1, output_dim)
            target_seq = target_seq.view(-1)
            loss = criterion(output, target_seq)
            epoch_loss += loss.item()
    return epoch_loss / len(dataloader)

num_epochs = 10
for epoch in range(num_epochs):
    start_time = time.time()
    train_loss = train(model, train_loader, optimizer, criterion, device)
    val_loss = evaluate(model, val_loader, criterion, device)
    end_time = time.time()
    print(f"Epoch {epoch+1:02}: Train Loss {train_loss:.3f} | Val Loss {val_loss:.3f} | Time {end_time-start_time:.2f}s")

#############################################
# 6. Inference and ROUGE Evaluation
#############################################

from rouge_score import rouge_scorer

def generate_title(model, input_seq, device, use_beam_search=False):
    model.eval()
    input_seq = input_seq.to(device)
    with torch.no_grad():
        generated_seq = model(input_seq.unsqueeze(0), target_seq=None, use_beam_search=use_beam_search)
    generated_seq = generated_seq.squeeze(0).tolist()
    if eos_token_idx in generated_seq:
        generated_seq = generated_seq[:generated_seq.index(eos_token_idx)]
    return sequence_to_text(generated_seq, inv_vocab)

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

def evaluate_rouge(model, dataloader, device):
    model.eval()
    rouge1_total, rouge2_total, rougeL_total = 0, 0, 0
    count = 0
    with torch.no_grad():
        for input_seq, target_seq in dataloader:
            input_seq = input_seq.to(device)
            for i in range(input_seq.size(0)):
                gt_seq = target_seq[i].tolist()
                gt_seq = [x for x in gt_seq if x not in [vocab['<bos>'], vocab['<eos>'], vocab['<pad>']]]
                gt_title = sequence_to_text(gt_seq, inv_vocab)
                pred_title = generate_title(model, input_seq[i], device, use_beam_search=False)
                scores = scorer.score(gt_title, pred_title)
                rouge1_total += scores['rouge1'].fmeasure
                rouge2_total += scores['rouge2'].fmeasure
                rougeL_total += scores['rougeL'].fmeasure
                count += 1
    return rouge1_total/count, rouge2_total/count, rougeL_total/count

r1, r2, rL = evaluate_rouge(model, test_loader, device)
print(f"ROUGE-1: {r1:.4f} | ROUGE-2: {r2:.4f} | ROUGE-L: {rL:.4f}")
print("\n--- Test‑Set GT vs Pred ---")
model.eval()
with torch.no_grad():
    for idx, (input_seq, target_seq) in enumerate(test_loader, 1):
        input_seq = input_seq.to(device)
        pred = generate_title(model, input_seq.squeeze(0), device, use_beam_search=False)

        # Recover GT title string
        gt_idxs = [i for i in target_seq.squeeze(0).tolist()
                   if i not in (vocab['<bos>'], vocab['<eos>'], vocab['<pad>'])]
        gt = sequence_to_text(gt_idxs, inv_vocab)

        print(f"Example {idx:03d}\n  GT : {gt}\n  Pred: {pred}\n")



Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=e0cafff4cadfe992ae9630b1f58d1c2ecc14ad56d7242cf4396d89bbd20a7208
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Vocabulary Size: 54473


  scaler = torch.cuda.amp.GradScaler()  # For mixed precision training
  with torch.cuda.amp.autocast():


Epoch 01: Train Loss 1.833 | Val Loss 1.729 | Time 477.03s
Epoch 02: Train Loss 1.629 | Val Loss 1.696 | Time 496.12s
Epoch 03: Train Loss 1.353 | Val Loss 1.358 | Time 481.90s
Epoch 04: Train Loss 1.112 | Val Loss 1.174 | Time 476.16s
Epoch 05: Train Loss 0.981 | Val Loss 1.141 | Time 480.35s
Epoch 06: Train Loss 0.860 | Val Loss 1.029 | Time 481.64s
Epoch 07: Train Loss 0.756 | Val Loss 1.012 | Time 480.40s
Epoch 08: Train Loss 0.669 | Val Loss 0.953 | Time 477.56s
Epoch 09: Train Loss 0.592 | Val Loss 0.940 | Time 482.45s
Epoch 10: Train Loss 0.539 | Val Loss 0.958 | Time 481.69s
ROUGE-1: 0.6387 | ROUGE-2: 0.3545 | ROUGE-L: 0.6387

--- Test‑Set GT vs Pred ---
Example 001
  GT : <unk>
  Pred: <bos> <unk> <unk>

Example 002
  GT : <unk> High <unk> <unk>
  Pred: <bos> <unk> High School

Example 003
  GT : Minnesota <unk> <unk>
  Pred: <bos> West <unk> offense

Example 004
  GT : List of people from Louisiana
  Pred: <bos> List of people from <unk>

Example 005
  GT : <unk>
  Pred: <bos

In [None]:

import os
# Mitigate CUDA fragmentation
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import nltk
nltk.download('punkt')
!pip install rouge-score
from rouge_score import rouge_scorer
import os
# Set environment variable for CUDA memory fragmentation mitigation

import pandas as pd
import re
import nltk
import string
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
from collections import Counter
from torch.utils.data import Dataset, DataLoader
!pip install rouge-score

# Download required NLTK data
nltk.download('punkt')
# ------------------------
# 1. Runtime Configuration
# ------------------------
use_glove     = input("Use GloVe embeddings? (y/n): ").strip().lower().startswith('y')
glove_path = None
if use_glove:
    glove_path = input("→ Path to your GloVe file (e.g. /content/glove.6B.300d.txt): ").strip()
    if not os.path.isfile(glove_path):
        raise FileNotFoundError(f"GloVe file not found at '{glove_path}'")
    # Infer embedding dimension from the first line
    with open(glove_path, 'r', encoding='utf-8') as f:
        first = f.readline().split()
        inferred_dim = len(first) - 1
    print(f"Detected GloVe embedding dimension: {inferred_dim}")
else:
    inferred_dim = 300  # default if not using GloVe

use_hier      = input("Use Hierarchical Encoder? (y/n): ").strip().lower().startswith('y')
use_decoder2  = input("Use 2‑GRU Decoder? (y/n): ").strip().lower().startswith('y')
use_beam      = input("Use Beam Search? (y/n): ").strip().lower().startswith('y')
beam_width    = int(input("Beam width (e.g. 3): ").strip() or 3)

print(f"\nConfig → GloVe: {use_glove}, Hier: {use_hier}, Decoder2: {use_decoder2}, Beam: {use_beam}, k={beam_width}\n")

# ------------------------
# 2. Load Data & Build Vocab
# ------------------------
train_df = pd.read_csv('/content/train_processed (1).csv')
val_df   = pd.read_csv('/content/val_processed (1).csv')
test_df  = pd.read_csv('/content/test_processed (1).csv')

def tokenize(txt): return txt.split()

min_freq = 0.001 * len(train_df)
ctr = Counter()
for col in ['text','title']:
    for s in train_df[col]:
        ctr.update(tokenize(s))

vocab = {'<pad>':0,'<bos>':1,'<eos>':2,'<unk>':3}
for tok,f in ctr.items():
    if f >= min_freq:
        vocab[tok] = len(vocab)
inv_vocab = {i:t for t,i in vocab.items()}
vocab_size = len(vocab)
print("Vocab size:", vocab_size)

def text2seq(s):
    return [vocab.get(t, vocab['<unk>']) for t in tokenize(s)]
def seq2text(seq):
    return " ".join(inv_vocab.get(i,'<unk>') for i in seq)

# ------------------------
# 3. Dataset & Dataloader
# ------------------------
class TitleDS(Dataset):
    def __init__(self,df): self.df=df
    def __len__(self): return len(self.df)
    def __getitem__(self,i):
        row=self.df.iloc[i]
        inp=text2seq(row['text'])
        tgt=[vocab['<bos>']] + text2seq(row['title']) + [vocab['<eos>']]
        return torch.tensor(inp), torch.tensor(tgt)

def collate_fn(batch):
    ins,ts = zip(*batch)
    Mi, Mt = max(len(x) for x in ins), max(len(x) for x in ts)
    ins = torch.stack([F.pad(x,(0,Mi-len(x)),value=vocab['<pad>']) for x in ins])
    ts  = torch.stack([F.pad(x,(0,Mt-len(x)),value=vocab['<pad>']) for x in ts])
    return ins, ts

train_loader = DataLoader(TitleDS(train_df), batch_size=8, shuffle=True, collate_fn=collate_fn)
val_loader   = DataLoader(TitleDS(val_df),   batch_size=8, shuffle=False,collate_fn=collate_fn)
test_loader  = DataLoader(TitleDS(test_df),  batch_size=1, shuffle=False,collate_fn=collate_fn)

# ------------------------
# 4. Model Components
# ------------------------
class EncoderRNN(nn.Module):
    def __init__(self,vs,ed,hd):
        super().__init__()
        self.emb=nn.Embedding(vs,ed)
        self.gru=nn.GRU(ed,hd,batch_first=True)
    def forward(self,x):
        return self.gru(self.emb(x))
    def load_embeddings(self,weights):
        self.emb.weight.data.copy_(weights)

class HierEncoderRNN(nn.Module):
    def __init__(self,vs,ed,wh,sh):
        super().__init__()
        self.emb=nn.Embedding(vs,ed)
        self.wgru=nn.GRU(ed,wh,batch_first=True)
        self.sgru=nn.GRU(wh,sh,batch_first=True)
    def forward(self,x):
        wo,wh = self.wgru(self.emb(x))
        se = wo.mean(dim=1,keepdim=True)
        return self.sgru(se)
    def load_embeddings(self,weights):
        self.emb.weight.data.copy_(weights)

class DecoderRNN(nn.Module):
    def __init__(self,vs,ed,hd):
        super().__init__()
        self.emb=nn.Embedding(vs,ed)
        self.gru=nn.GRU(ed,hd,batch_first=True)
        self.fc = nn.Linear(hd,vs)
    def forward(self,tk,h):
        o,h = self.gru(self.emb(tk.unsqueeze(1)),h)
        return F.log_softmax(self.fc(o.squeeze(1)),dim=1), h

class Decoder2RNN(nn.Module):
    def __init__(self,vs,ed,hd):
        super().__init__()
        self.emb=nn.Embedding(vs,ed)
        self.g1 = nn.GRU(ed,hd,batch_first=True)
        self.g2 = nn.GRU(hd,hd,batch_first=True)
        self.fc = nn.Linear(hd,vs)
    def forward(self,tk,h):
        o1,h1 = self.g1(self.emb(tk.unsqueeze(1)),h)
        o2,h2 = self.g2(o1,h1)
        return F.log_softmax(self.fc(o2.squeeze(1)),dim=1), h2

class Seq2seqRNN(nn.Module):
    def __init__(self,enc,dec,bos,eos,mx=20):
        super().__init__()
        self.enc, self.dec = enc, dec
        self.bos, self.eos, self.mx = bos, eos, mx

    def forward(self,src,tgt=None,tf=0.5,beam=False,k=3):
        bs = src.size(0)
        eo,hidden = self.enc(src)
        inp = src.new_full((bs,), self.bos)
        outputs=[]
        if tgt is not None:
            for t in range(tgt.size(1)):
                out,hidden = self.dec(inp,hidden)
                outputs.append(out.unsqueeze(1))
                inp = tgt[:,t] if torch.rand(1).item()<tf else out.argmax(1)
            return torch.cat(outputs,dim=1)
        # inference
        if not beam:
            seqs=[]
            for _ in range(self.mx):
                out,hidden = self.dec(inp,hidden)
                top1 = out.argmax(1)
                seqs.append(top1.unsqueeze(1))
                inp = top1
                if (top1==self.eos).all(): break
            return torch.cat(seqs,dim=1)
        # beam search
        all_out=[]
        for i in range(bs):
            beams=[([self.bos],0.0,hidden[:,i:i+1,:])]
            for _ in range(self.mx):
                nb=[]
                for seq,sc,hi in beams:
                    last = src.new_tensor([seq[-1]])
                    out,hn = self.dec(last,hi)
                    vals,inds = out.topk(k)
                    for j in range(k):
                        nb.append((seq+[inds[0,j].item()], sc+vals[0,j].item(), hn))
                beams = sorted(nb, key=lambda x:x[1], reverse=True)[:k]
                if all(s[-1]==self.eos for s,_,_ in beams): break
            best = beams[0][0][1:]
            all_out.append(torch.tensor(best,device=src.device))
        ml = max(o.size(0) for o in all_out)
        return torch.stack([F.pad(o,(0,ml-o.size(0)),value=self.eos) for o in all_out],dim=0)

# ------------------------
# 5. Instantiate Model
# ------------------------
embed_dim = inferred_dim
hid_dim   = 300
bos, eos  = vocab['<bos>'], vocab['<eos>']

# Encoder choice
encoder = HierEncoderRNN(vocab_size, embed_dim, hid_dim, hid_dim) if use_hier \
          else EncoderRNN(vocab_size, embed_dim, hid_dim)

# Load GloVe if requested
if use_glove:
    print("→ Loading GloVe embeddings from", glove_path)
    def load_glove(path, vs, ed):
        E = torch.randn(vs, ed)
        with open(path,'r',encoding='utf-8') as f:
            for line in f:
                parts = line.split()
                w, vec = parts[0], torch.tensor(list(map(float,parts[1:])))
                idx = vocab.get(w)
                if idx is not None:
                    E[idx] = vec
        return E
    encoder.load_embeddings(load_glove(glove_path, vocab_size, embed_dim))

# Decoder choice
decoder = Decoder2RNN(vocab_size, embed_dim, hid_dim) if use_decoder2 \
          else DecoderRNN(vocab_size, embed_dim, hid_dim)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model  = Seq2seqRNN(encoder, decoder, bos, eos, mx=20).to(device)

# ------------------------
# 6. Training & Eval
# ------------------------
opt    = torch.optim.Adam(model.parameters(), lr=1e-3)
crit   = nn.NLLLoss(ignore_index=vocab['<pad>'])
scaler = torch.cuda.amp.GradScaler()

def train_epoch():
    model.train(); total=0
    for src,tgt in train_loader:
        src,tgt = src.to(device), tgt.to(device)
        opt.zero_grad()
        with torch.cuda.amp.autocast():
            out = model(src,tgt,tf=0.5,beam=False)
            loss = crit(out.view(-1,out.size(-1)), tgt.view(-1))
        scaler.scale(loss).backward()
        scaler.step(opt); scaler.update()
        total += loss.item()
    return total/len(train_loader)

def eval_epoch():
    model.eval(); total=0
    with torch.no_grad():
        for src,tgt in val_loader:
            src,tgt = src.to(device), tgt.to(device)
            out = model(src,tgt,tf=0,beam=False)
            total += crit(out.view(-1,out.size(-1)), tgt.view(-1)).item()
    return total/len(val_loader)

for ep in range(1,6):
    t0 = time.time()
    tr = train_epoch(); va = eval_epoch()
    print(f"Epoch {ep}: train {tr:.3f} | val {va:.3f} | {time.time()-t0:.1f}s")

# ------------------------
# 7. ROUGE Evaluation
# ------------------------
scorer = rouge_scorer.RougeScorer(['rouge1','rouge2','rougeL'], use_stemmer=True)
r1=r2=rL=cnt=0
model.eval()
with torch.no_grad():
    for src,tgt in test_loader:
        src = src.to(device)
        pred_seq = model(src, tgt=None, beam=use_beam, k=beam_width).squeeze(0).tolist()
        if eos in pred_seq: pred_seq = pred_seq[:pred_seq.index(eos)]
        pred_txt = seq2text(pred_seq)
        tgt_seq = [i for i in tgt.squeeze(0).tolist() if i not in (bos,eos,vocab['<pad>'])]
        tgt_txt  = seq2text(tgt_seq)
        scr = scorer.score(tgt_txt, pred_txt)
        r1 += scr['rouge1'].fmeasure
        r2 += scr['rouge2'].fmeasure
        rL += scr['rougeL'].fmeasure
        cnt += 1

print(f"\n→ ROUGE on test ({'beam' if use_beam else 'greedy'}):")
print(f"   ROUGE‑1: {r1/cnt:.4f}")
print(f"   ROUGE‑2: {r2/cnt:.4f}")
print(f"   ROUGE‑L: {rL/cnt:.4f}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=308a9e708342dddbbac3c835cf6d89623f7780efb835bdf24f1d45c0f83cc831
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Use GloVe embeddings? (y/n): n
Use Hierarchical Encoder? (y/n): y
Use 2‑GRU Decoder? (y/n): n
Use Beam Search? (y/n): n
Beam width (e.g. 3): 3

Config → GloVe: False, Hier: True, Decoder2: False, Beam: False, k=3

Vocab size: 54473


  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():


Epoch 1: train 1.767 | val 1.539 | 273.1s
Epoch 2: train 1.215 | val 1.179 | 272.8s
Epoch 3: train 0.906 | val 0.962 | 261.7s
Epoch 4: train 0.686 | val 0.919 | 262.9s
Epoch 5: train 0.542 | val 0.894 | 260.3s

→ ROUGE on test (greedy):
   ROUGE‑1: 0.6577
   ROUGE‑2: 0.3828
   ROUGE‑L: 0.6510


In [None]:
# import os
# # Mitigate CUDA fragmentation
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# import pandas as pd
# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# import time
# from collections import Counter
# from torch.utils.data import Dataset, DataLoader
# import nltk
# import pandas as pd
# import re
# import nltk
# import string
# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# import time
# from collections import Counter
# from torch.utils.data import Dataset, DataLoader
# !pip install rouge-score
# nltk.download('punkt')
# from rouge_score import rouge_scorer

# # ------------------------
# # 1. Runtime Configuration
# # ------------------------
# use_glove     = input("Use GloVe embeddings? (y/n): ").strip().lower().startswith('y')
# use_hier      = input("Use Hierarchical Encoder? (y/n): ").strip().lower().startswith('y')
# use_decoder2  = input("Use 2‑GRU Decoder? (y/n): ").strip().lower().startswith('y')
# use_beam      = input("Use Beam Search? (y/n): ").strip().lower().startswith('y')
# beam_width    = int(input("Beam width (e.g. 3): ").strip() or 3)

# print(f"\nConfig → GloVe: {use_glove}, Hier: {use_hier}, Decoder2: {use_decoder2}, Beam: {use_beam}, k={beam_width}\n")

# # ------------------------
# # 2. Load Data & Build Vocab
# # ------------------------
# train_df = pd.read_csv('/content/train_processed.csv')
# val_df   = pd.read_csv('/content/val_processed.csv')
# test_df  = pd.read_csv('/content/test_processed.csv')

# def tokenize(txt): return txt.split()

# # Build freq counter
# min_freq = 0.001 * len(train_df)
# ctr = Counter()
# for col in ['text','title']:
#     for s in train_df[col]:
#         ctr.update(tokenize(s))

# # Build vocab
# vocab = {'<pad>':0,'<bos>':1,'<eos>':2,'<unk>':3}
# for tok,f in ctr.items():
#     if f>=min_freq:
#         vocab[tok] = len(vocab)
# inv_vocab = {i:t for t,i in vocab.items()}
# vocab_size = len(vocab)
# print("Vocab size:",vocab_size)

# def text2seq(s):
#     return [vocab.get(t,4) for t in tokenize(s)]
# def seq2text(seq):
#     return " ".join(inv_vocab.get(i,'<unk>') for i in seq)

# # ------------------------
# # 3. Dataset & Dataloader
# # ------------------------
# class TitleDS(Dataset):
#     def __init__(self,df):
#         self.df=df
#     def __len__(self): return len(self.df)
#     def __getitem__(self,i):
#         row=self.df.iloc[i]
#         inp=text2seq(row['text'])
#         tgt=[vocab['<bos>']]+text2seq(row['title'])+[vocab['<eos>']]
#         return torch.tensor(inp), torch.tensor(tgt)

# def collate_fn(batch):
#     ins,ts = zip(*batch)
#     li=[len(x) for x in ins]; lt=[len(x) for x in ts]
#     Mi, Mt = max(li), max(lt)
#     ins = torch.stack([F.pad(x,(0,Mi-len(x)),value=0) for x in ins])
#     ts  = torch.stack([F.pad(x,(0,Mt-len(x)),value=0) for x in ts])
#     return ins, ts

# train_loader = DataLoader(TitleDS(train_df), batch_size=8, shuffle=True, collate_fn=collate_fn)
# val_loader   = DataLoader(TitleDS(val_df),   batch_size=8, shuffle=False,collate_fn=collate_fn)
# test_loader  = DataLoader(TitleDS(test_df),  batch_size=1, shuffle=False,collate_fn=collate_fn)

# # ------------------------
# # 4. Model Components
# # ------------------------
# class EncoderRNN(nn.Module):
#     def __init__(self,vs,ed,hd,drop=0.2):
#         super().__init__()
#         self.emb=nn.Embedding(vs,ed)
#         self.gru=nn.GRU(ed,hd,batch_first=True)
#     def forward(self,x):
#         e=self.emb(x)
#         o,h=self.gru(e)
#         return o,h
#     def load_embeddings(self,weights):
#         self.emb.weight.data.copy_(weights)

# class HierEncoderRNN(nn.Module):
#     def __init__(self,vs,ed,wh,sh,drop=0.2):
#         super().__init__()
#         self.emb=nn.Embedding(vs,ed)
#         self.wgru=nn.GRU(ed,wh,batch_first=True)
#         self.sgru=nn.GRU(wh,sh,batch_first=True)
#     def forward(self,x):
#         e=self.emb(x)
#         wo,wh=self.wgru(e)
#         # average over time → sentence embedding
#         se=wo.mean(dim=1,keepdim=True)
#         so,sh=self.sgru(se)
#         return so,sh
#     def load_embeddings(self,weights):
#         self.emb.weight.data.copy_(weights)

# class DecoderRNN(nn.Module):
#     def __init__(self,vs,ed,hd,drop=0.2):
#         super().__init__()
#         self.emb=nn.Embedding(vs,ed)
#         self.gru=nn.GRU(ed,hd,batch_first=True)
#         self.fc=nn.Linear(hd,vs)
#     def forward(self,token,hidden):
#         e=self.emb(token.unsqueeze(1))
#         o,h=self.gru(e,hidden)
#         return F.log_softmax(self.fc(o.squeeze(1)),dim=1), h

# class Decoder2RNN(nn.Module):
#     def __init__(self,vs,ed,hd,drop=0.2):
#         super().__init__()
#         self.emb=nn.Embedding(vs,ed)
#         self.g1=nn.GRU(ed,hd,batch_first=True)
#         self.g2=nn.GRU(hd,hd,batch_first=True)
#         self.fc=nn.Linear(hd,vs)
#     def forward(self,token,hidden):
#         e=self.emb(token.unsqueeze(1))
#         o1,h1=self.g1(e,hidden)
#         o2,h2=self.g2(o1,h1)
#         return F.log_softmax(self.fc(o2.squeeze(1)),dim=1), h2

# class Seq2seqRNN(nn.Module):
#     def __init__(self,enc,dec,bos,eos,mx=20):
#         super().__init__()
#         self.enc, self.dec = enc, dec
#         self.bos, self.eos, self.mx = bos,eos,mx

#     def forward(self,src,tgt=None,tf=0.5,beam=False,k=3):
#         bs=src.size(0)
#         eo,hidden = self.enc(src)
#         inp = src.new_full((bs,),self.bos)
#         outputs=[]
#         if tgt is not None:
#             L=tgt.size(1)
#             for t in range(L):
#                 out,hidden=self.dec(inp,hidden)
#                 outputs.append(out.unsqueeze(1))
#                 inp = tgt[:,t] if torch.rand(1).item()<tf else out.argmax(1)
#             return torch.cat(outputs,dim=1)
#         # inference
#         if not beam:
#             seqs=[]
#             for _ in range(self.mx):
#                 out,hidden=self.dec(inp,hidden)
#                 top1=out.argmax(1)
#                 seqs.append(top1.unsqueeze(1))
#                 inp=top1
#                 if (top1==self.eos).all(): break
#             return torch.cat(seqs,dim=1)
#         # beam search
#         all_out=[]
#         for i in range(bs):
#             beams=[([self.bos],0.0,hidden[:,i:i+1,:])]
#             for _ in range(self.mx):
#                 nb=[]
#                 for seq,sc,hi in beams:
#                     last=src.new_tensor([seq[-1]])
#                     out,hn=self.dec(last,hi)
#                     vals,inds=out.topk(k)
#                     for j in range(k):
#                         nb.append((seq+[inds[0,j].item()], sc+vals[0,j].item(), hn))
#                 beams=sorted(nb,key=lambda x:x[1],reverse=True)[:k]
#                 if all(s[-1]==self.eos for s,_,_ in beams): break
#             best=beams[0][0][1:]  # drop BOS
#             all_out.append(torch.tensor(best,device=src.device))
#         # pad
#         ml=max(o.size(0) for o in all_out)
#         padded=[F.pad(o,(0,ml-o.size(0)),value=self.eos) for o in all_out]
#         return torch.stack(padded,dim=0)

# # ------------------------
# # 5. Instantiate Model
# # ------------------------
# embed_dim, hid_dim = 300, 300
# bos, eos = vocab['<bos>'], vocab['<eos>']

# # choose encoder
# if use_hier:
#     encoder = HierEncoderRNN(vocab_size, embed_dim, hid_dim, hid_dim)
# else:
#     encoder = EncoderRNN(vocab_size, embed_dim, hid_dim)

# # load GloVe if requested
# if use_glove:
#     print("→ Loading GloVe embeddings...")
#     def load_glove(path):
#         E = torch.randn(vocab_size, embed_dim)
#         with open(path,'r',encoding='utf-8') as f:
#             for l in f:
#                 p=l.split()
#                 w,vec=p[0],torch.tensor(list(map(float,p[1:])))
#                 if w in vocab: E[vocab[w]] = vec
#         return E
#     embw = load_glove(config.glove_path if 'config' in globals() else '/content/glove.6B.300d.txt')
#     encoder.load_embeddings(embw)

# # choose decoder
# decoder = Decoder2RNN(vocab_size, embed_dim, hid_dim) if use_decoder2 else DecoderRNN(vocab_size, embed_dim, hid_dim)

# model = Seq2seqRNN(encoder, decoder, bos, eos, mx=20).to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

# # ------------------------
# # 6. Training & Eval
# # ------------------------
# opt = torch.optim.Adam(model.parameters(), lr=1e-3)
# crit=nn.NLLLoss(ignore_index=0)
# scaler=torch.cuda.amp.GradScaler()

# def train_epoch():
#     model.train(); total=0
#     for src,tgt in train_loader:
#         src,tgt=src.to(model.dec.emb.weight.device),tgt.to(model.dec.emb.weight.device)
#         opt.zero_grad()
#         with torch.cuda.amp.autocast():
#             out=model(src,tgt,tf=0.5,beam=False)
#             loss=crit(out.view(-1,out.size(-1)),tgt.view(-1))
#         scaler.scale(loss).backward()
#         scaler.step(opt); scaler.update()
#         total+=loss.item()
#     return total/len(train_loader)

# def eval_epoch():
#     model.eval(); total=0
#     with torch.no_grad():
#         for src,tgt in val_loader:
#             src,tgt=src.to(model.dec.emb.weight.device),tgt.to(model.dec.emb.weight.device)
#             out=model(src,tgt,tf=0,beam=False)
#             total+=crit(out.view(-1,out.size(-1)),tgt.view(-1)).item()
#     return total/len(val_loader)

# for ep in range(1,6):
#     t0=time.time()
#     tr=train_epoch(); va=eval_epoch()
#     print(f"Epoch {ep}: train {tr:.3f} | val {va:.3f} | {time.time()-t0:.1f}s")

# # ------------------------
# # 7. ROUGE Evaluation
# # ------------------------
# scorer = rouge_scorer.RougeScorer(['rouge1','rouge2','rougeL'],use_stemmer=True)
# r1=r2=rL=cnt=0
# model.eval()
# with torch.no_grad():
#     for src,tgt in test_loader:
#         src=src.to(model.dec.emb.weight.device)
#         pred_seq = model(src, tgt=None, beam=use_beam, k=beam_width).squeeze(0).tolist()
#         # cut at EOS
#         if eos in pred_seq: pred_seq=pred_seq[:pred_seq.index(eos)]
#         pred_txt = seq2text(pred_seq)
#         tgt_seq = tgt.squeeze(0).tolist()
#         tgt_seq=[i for i in tgt_seq if i not in (bos,eos,0)]
#         tgt_txt=seq2text(tgt_seq)
#         scr = scorer.score(tgt_txt,pred_txt)
#         r1+=scr['rouge1'].fmeasure
#         r2+=scr['rouge2'].fmeasure
#         rL+=scr['rougeL'].fmeasure
#         cnt+=1

# print(f"\n→ ROUGE on test ({'beam' if use_beam else 'greedy'}):")
# print(f"   ROUGE‑1: {r1/cnt:.4f}")
# print(f"   ROUGE‑2: {r2/cnt:.4f}")
# print(f"   ROUGE‑L: {rL/cnt:.4f}")
import os
# Mitigate CUDA fragmentation
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import nltk
nltk.download('punkt')
!pip install rouge-score
from rouge_score import rouge_scorer
import os
# Set environment variable for CUDA memory fragmentation mitigation

import pandas as pd
import re
import nltk
import string
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
from collections import Counter
from torch.utils.data import Dataset, DataLoader
!pip install rouge-score

# Download required NLTK data
nltk.download('punkt')
# ------------------------
# 1. Runtime Configuration
# ------------------------
use_glove     = input("Use GloVe embeddings? (y/n): ").strip().lower().startswith('y')
glove_path = None
if use_glove:
    glove_path = input("→ Path to your GloVe file (e.g. /content/glove.6B.300d.txt): ").strip()
    if not os.path.isfile(glove_path):
        raise FileNotFoundError(f"GloVe file not found at '{glove_path}'")
    # Infer embedding dimension from the first line
    with open(glove_path, 'r', encoding='utf-8') as f:
        first = f.readline().split()
        inferred_dim = len(first) - 1
    print(f"Detected GloVe embedding dimension: {inferred_dim}")
else:
    inferred_dim = 300  # default if not using GloVe

use_hier      = input("Use Hierarchical Encoder? (y/n): ").strip().lower().startswith('y')
use_decoder2  = input("Use 2‑GRU Decoder? (y/n): ").strip().lower().startswith('y')
use_beam      = input("Use Beam Search? (y/n): ").strip().lower().startswith('y')
beam_width    = int(input("Beam width (e.g. 3): ").strip() or 3)

print(f"\nConfig → GloVe: {use_glove}, Hier: {use_hier}, Decoder2: {use_decoder2}, Beam: {use_beam}, k={beam_width}\n")

# ------------------------
# 2. Load Data & Build Vocab
# ------------------------
train_df = pd.read_csv('/content/train_processed (1).csv')
val_df   = pd.read_csv('/content/val_processed (1).csv')
test_df  = pd.read_csv('/content/test_processed (1).csv')

def tokenize(txt): return txt.split()

min_freq = 0.001 * len(train_df)
ctr = Counter()
for col in ['text','title']:
    for s in train_df[col]:
        ctr.update(tokenize(s))

vocab = {'<pad>':0,'<bos>':1,'<eos>':2,'<unk>':3}
for tok,f in ctr.items():
    if f >= min_freq:
        vocab[tok] = len(vocab)
inv_vocab = {i:t for t,i in vocab.items()}
vocab_size = len(vocab)
print("Vocab size:", vocab_size)

def text2seq(s):
    return [vocab.get(t, vocab['<unk>']) for t in tokenize(s)]
def seq2text(seq):
    return " ".join(inv_vocab.get(i,'<unk>') for i in seq)

# ------------------------
# 3. Dataset & Dataloader
# ------------------------
class TitleDS(Dataset):
    def __init__(self,df): self.df=df
    def __len__(self): return len(self.df)
    def __getitem__(self,i):
        row=self.df.iloc[i]
        inp=text2seq(row['text'])
        tgt=[vocab['<bos>']] + text2seq(row['title']) + [vocab['<eos>']]
        return torch.tensor(inp), torch.tensor(tgt)

def collate_fn(batch):
    ins,ts = zip(*batch)
    Mi, Mt = max(len(x) for x in ins), max(len(x) for x in ts)
    ins = torch.stack([F.pad(x,(0,Mi-len(x)),value=vocab['<pad>']) for x in ins])
    ts  = torch.stack([F.pad(x,(0,Mt-len(x)),value=vocab['<pad>']) for x in ts])
    return ins, ts

train_loader = DataLoader(TitleDS(train_df), batch_size=8, shuffle=True, collate_fn=collate_fn)
val_loader   = DataLoader(TitleDS(val_df),   batch_size=8, shuffle=False,collate_fn=collate_fn)
test_loader  = DataLoader(TitleDS(test_df),  batch_size=1, shuffle=False,collate_fn=collate_fn)

# ------------------------
# 4. Model Components
# ------------------------
class EncoderRNN(nn.Module):
    def __init__(self,vs,ed,hd):
        super().__init__()
        self.emb=nn.Embedding(vs,ed)
        self.gru=nn.GRU(ed,hd,batch_first=True)
    def forward(self,x):
        return self.gru(self.emb(x))
    def load_embeddings(self,weights):
        self.emb.weight.data.copy_(weights)

class HierEncoderRNN(nn.Module):
    def __init__(self,vs,ed,wh,sh):
        super().__init__()
        self.emb=nn.Embedding(vs,ed)
        self.wgru=nn.GRU(ed,wh,batch_first=True)
        self.sgru=nn.GRU(wh,sh,batch_first=True)
    def forward(self,x):
        wo,wh = self.wgru(self.emb(x))
        se = wo.mean(dim=1,keepdim=True)
        return self.sgru(se)
    def load_embeddings(self,weights):
        self.emb.weight.data.copy_(weights)

class DecoderRNN(nn.Module):
    def __init__(self,vs,ed,hd):
        super().__init__()
        self.emb=nn.Embedding(vs,ed)
        self.gru=nn.GRU(ed,hd,batch_first=True)
        self.fc = nn.Linear(hd,vs)
    def forward(self,tk,h):
        o,h = self.gru(self.emb(tk.unsqueeze(1)),h)
        return F.log_softmax(self.fc(o.squeeze(1)),dim=1), h

class Decoder2RNN(nn.Module):
    def __init__(self,vs,ed,hd):
        super().__init__()
        self.emb=nn.Embedding(vs,ed)
        self.g1 = nn.GRU(ed,hd,batch_first=True)
        self.g2 = nn.GRU(hd,hd,batch_first=True)
        self.fc = nn.Linear(hd,vs)
    def forward(self,tk,h):
        o1,h1 = self.g1(self.emb(tk.unsqueeze(1)),h)
        o2,h2 = self.g2(o1,h1)
        return F.log_softmax(self.fc(o2.squeeze(1)),dim=1), h2

class Seq2seqRNN(nn.Module):
    def __init__(self,enc,dec,bos,eos,mx=20):
        super().__init__()
        self.enc, self.dec = enc, dec
        self.bos, self.eos, self.mx = bos, eos, mx

    def forward(self,src,tgt=None,tf=0.5,beam=False,k=3):
        bs = src.size(0)
        eo,hidden = self.enc(src)
        inp = src.new_full((bs,), self.bos)
        outputs=[]
        if tgt is not None:
            for t in range(tgt.size(1)):
                out,hidden = self.dec(inp,hidden)
                outputs.append(out.unsqueeze(1))
                inp = tgt[:,t] if torch.rand(1).item()<tf else out.argmax(1)
            return torch.cat(outputs,dim=1)
        # inference
        if not beam:
            seqs=[]
            for _ in range(self.mx):
                out,hidden = self.dec(inp,hidden)
                top1 = out.argmax(1)
                seqs.append(top1.unsqueeze(1))
                inp = top1
                if (top1==self.eos).all(): break
            return torch.cat(seqs,dim=1)
        # beam search
        all_out=[]
        for i in range(bs):
            beams=[([self.bos],0.0,hidden[:,i:i+1,:])]
            for _ in range(self.mx):
                nb=[]
                for seq,sc,hi in beams:
                    last = src.new_tensor([seq[-1]])
                    out,hn = self.dec(last,hi)
                    vals,inds = out.topk(k)
                    for j in range(k):
                        nb.append((seq+[inds[0,j].item()], sc+vals[0,j].item(), hn))
                beams = sorted(nb, key=lambda x:x[1], reverse=True)[:k]
                if all(s[-1]==self.eos for s,_,_ in beams): break
            best = beams[0][0][1:]
            all_out.append(torch.tensor(best,device=src.device))
        ml = max(o.size(0) for o in all_out)
        return torch.stack([F.pad(o,(0,ml-o.size(0)),value=self.eos) for o in all_out],dim=0)

# ------------------------
# 5. Instantiate Model
# ------------------------
embed_dim = inferred_dim
hid_dim   = 300
bos, eos  = vocab['<bos>'], vocab['<eos>']

# Encoder choice
encoder = HierEncoderRNN(vocab_size, embed_dim, hid_dim, hid_dim) if use_hier \
          else EncoderRNN(vocab_size, embed_dim, hid_dim)

# Load GloVe if requested
if use_glove:
    print("→ Loading GloVe embeddings from", glove_path)
    def load_glove(path, vs, ed):
        E = torch.randn(vs, ed)
        with open(path,'r',encoding='utf-8') as f:
            for line in f:
                parts = line.split()
                w, vec = parts[0], torch.tensor(list(map(float,parts[1:])))
                idx = vocab.get(w)
                if idx is not None:
                    E[idx] = vec
        return E
    encoder.load_embeddings(load_glove(glove_path, vocab_size, embed_dim))

# Decoder choice
decoder = Decoder2RNN(vocab_size, embed_dim, hid_dim) if use_decoder2 \
          else DecoderRNN(vocab_size, embed_dim, hid_dim)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model  = Seq2seqRNN(encoder, decoder, bos, eos, mx=20).to(device)

# ------------------------
# 6. Training & Eval
# ------------------------
opt    = torch.optim.Adam(model.parameters(), lr=1e-3)
crit   = nn.NLLLoss(ignore_index=vocab['<pad>'])
scaler = torch.cuda.amp.GradScaler()

def train_epoch():
    model.train(); total=0
    for src,tgt in train_loader:
        src,tgt = src.to(device), tgt.to(device)
        opt.zero_grad()
        with torch.cuda.amp.autocast():
            out = model(src,tgt,tf=0.5,beam=False)
            loss = crit(out.view(-1,out.size(-1)), tgt.view(-1))
        scaler.scale(loss).backward()
        scaler.step(opt); scaler.update()
        total += loss.item()
    return total/len(train_loader)

def eval_epoch():
    model.eval(); total=0
    with torch.no_grad():
        for src,tgt in val_loader:
            src,tgt = src.to(device), tgt.to(device)
            out = model(src,tgt,tf=0,beam=False)
            total += crit(out.view(-1,out.size(-1)), tgt.view(-1)).item()
    return total/len(val_loader)

for ep in range(1,6):
    t0 = time.time()
    tr = train_epoch(); va = eval_epoch()
    print(f"Epoch {ep}: train {tr:.3f} | val {va:.3f} | {time.time()-t0:.1f}s")

# ------------------------
# 7. ROUGE Evaluation
# ------------------------
scorer = rouge_scorer.RougeScorer(['rouge1','rouge2','rougeL'], use_stemmer=True)
r1=r2=rL=cnt=0
model.eval()
with torch.no_grad():
    for src,tgt in test_loader:
        src = src.to(device)
        pred_seq = model(src, tgt=None, beam=use_beam, k=beam_width).squeeze(0).tolist()
        if eos in pred_seq: pred_seq = pred_seq[:pred_seq.index(eos)]
        pred_txt = seq2text(pred_seq)
        tgt_seq = [i for i in tgt.squeeze(0).tolist() if i not in (bos,eos,vocab['<pad>'])]
        tgt_txt  = seq2text(tgt_seq)
        scr = scorer.score(tgt_txt, pred_txt)
        r1 += scr['rouge1'].fmeasure
        r2 += scr['rouge2'].fmeasure
        rL += scr['rougeL'].fmeasure
        cnt += 1

print(f"\n→ ROUGE on test ({'beam' if use_beam else 'greedy'}):")
print(f"   ROUGE‑1: {r1/cnt:.4f}")
print(f"   ROUGE‑2: {r2/cnt:.4f}")
print(f"   ROUGE‑L: {rL/cnt:.4f}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Use GloVe embeddings? (y/n): n
Use Hierarchical Encoder? (y/n): n
Use 2‑GRU Decoder? (y/n): y
Use Beam Search? (y/n): n
Beam width (e.g. 3): 3

Config → GloVe: False, Hier: False, Decoder2: True, Beam: False, k=3

Vocab size: 54473


  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():


Epoch 1: train 1.673 | val 1.355 | 274.0s
Epoch 2: train 1.100 | val 1.085 | 270.3s
Epoch 3: train 0.818 | val 0.931 | 270.2s
Epoch 4: train 0.604 | val 0.886 | 271.5s
Epoch 5: train 0.463 | val 0.847 | 271.8s

→ ROUGE on test (greedy):
   ROUGE‑1: 0.6688
   ROUGE‑2: 0.3833
   ROUGE‑L: 0.6630


In [None]:
# import os
# # Mitigate CUDA fragmentation
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# import pandas as pd
# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# import time
# from collections import Counter
# from torch.utils.data import Dataset, DataLoader
# import nltk
# import pandas as pd
# import re
# import nltk
# import string
# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# import time
# from collections import Counter
# from torch.utils.data import Dataset, DataLoader
# !pip install rouge-score
# nltk.download('punkt')
# from rouge_score import rouge_scorer

# # ------------------------
# # 1. Runtime Configuration
# # ------------------------
# use_glove     = input("Use GloVe embeddings? (y/n): ").strip().lower().startswith('y')
# use_hier      = input("Use Hierarchical Encoder? (y/n): ").strip().lower().startswith('y')
# use_decoder2  = input("Use 2‑GRU Decoder? (y/n): ").strip().lower().startswith('y')
# use_beam      = input("Use Beam Search? (y/n): ").strip().lower().startswith('y')
# beam_width    = int(input("Beam width (e.g. 3): ").strip() or 3)

# print(f"\nConfig → GloVe: {use_glove}, Hier: {use_hier}, Decoder2: {use_decoder2}, Beam: {use_beam}, k={beam_width}\n")

# # ------------------------
# # 2. Load Data & Build Vocab
# # ------------------------
# train_df = pd.read_csv('/content/train_processed.csv')
# val_df   = pd.read_csv('/content/val_processed.csv')
# test_df  = pd.read_csv('/content/test_processed.csv')

# def tokenize(txt): return txt.split()

# # Build freq counter
# min_freq = 0.001 * len(train_df)
# ctr = Counter()
# for col in ['text','title']:
#     for s in train_df[col]:
#         ctr.update(tokenize(s))

# # Build vocab
# vocab = {'<pad>':0,'<bos>':1,'<eos>':2,'<unk>':3}
# for tok,f in ctr.items():
#     if f>=min_freq:
#         vocab[tok] = len(vocab)
# inv_vocab = {i:t for t,i in vocab.items()}
# vocab_size = len(vocab)
# print("Vocab size:",vocab_size)

# def text2seq(s):
#     return [vocab.get(t,4) for t in tokenize(s)]
# def seq2text(seq):
#     return " ".join(inv_vocab.get(i,'<unk>') for i in seq)

# # ------------------------
# # 3. Dataset & Dataloader
# # ------------------------
# class TitleDS(Dataset):
#     def __init__(self,df):
#         self.df=df
#     def __len__(self): return len(self.df)
#     def __getitem__(self,i):
#         row=self.df.iloc[i]
#         inp=text2seq(row['text'])
#         tgt=[vocab['<bos>']]+text2seq(row['title'])+[vocab['<eos>']]
#         return torch.tensor(inp), torch.tensor(tgt)

# def collate_fn(batch):
#     ins,ts = zip(*batch)
#     li=[len(x) for x in ins]; lt=[len(x) for x in ts]
#     Mi, Mt = max(li), max(lt)
#     ins = torch.stack([F.pad(x,(0,Mi-len(x)),value=0) for x in ins])
#     ts  = torch.stack([F.pad(x,(0,Mt-len(x)),value=0) for x in ts])
#     return ins, ts

# train_loader = DataLoader(TitleDS(train_df), batch_size=8, shuffle=True, collate_fn=collate_fn)
# val_loader   = DataLoader(TitleDS(val_df),   batch_size=8, shuffle=False,collate_fn=collate_fn)
# test_loader  = DataLoader(TitleDS(test_df),  batch_size=1, shuffle=False,collate_fn=collate_fn)

# # ------------------------
# # 4. Model Components
# # ------------------------
# class EncoderRNN(nn.Module):
#     def __init__(self,vs,ed,hd,drop=0.2):
#         super().__init__()
#         self.emb=nn.Embedding(vs,ed)
#         self.gru=nn.GRU(ed,hd,batch_first=True)
#     def forward(self,x):
#         e=self.emb(x)
#         o,h=self.gru(e)
#         return o,h
#     def load_embeddings(self,weights):
#         self.emb.weight.data.copy_(weights)

# class HierEncoderRNN(nn.Module):
#     def __init__(self,vs,ed,wh,sh,drop=0.2):
#         super().__init__()
#         self.emb=nn.Embedding(vs,ed)
#         self.wgru=nn.GRU(ed,wh,batch_first=True)
#         self.sgru=nn.GRU(wh,sh,batch_first=True)
#     def forward(self,x):
#         e=self.emb(x)
#         wo,wh=self.wgru(e)
#         # average over time → sentence embedding
#         se=wo.mean(dim=1,keepdim=True)
#         so,sh=self.sgru(se)
#         return so,sh
#     def load_embeddings(self,weights):
#         self.emb.weight.data.copy_(weights)

# class DecoderRNN(nn.Module):
#     def __init__(self,vs,ed,hd,drop=0.2):
#         super().__init__()
#         self.emb=nn.Embedding(vs,ed)
#         self.gru=nn.GRU(ed,hd,batch_first=True)
#         self.fc=nn.Linear(hd,vs)
#     def forward(self,token,hidden):
#         e=self.emb(token.unsqueeze(1))
#         o,h=self.gru(e,hidden)
#         return F.log_softmax(self.fc(o.squeeze(1)),dim=1), h

# class Decoder2RNN(nn.Module):
#     def __init__(self,vs,ed,hd,drop=0.2):
#         super().__init__()
#         self.emb=nn.Embedding(vs,ed)
#         self.g1=nn.GRU(ed,hd,batch_first=True)
#         self.g2=nn.GRU(hd,hd,batch_first=True)
#         self.fc=nn.Linear(hd,vs)
#     def forward(self,token,hidden):
#         e=self.emb(token.unsqueeze(1))
#         o1,h1=self.g1(e,hidden)
#         o2,h2=self.g2(o1,h1)
#         return F.log_softmax(self.fc(o2.squeeze(1)),dim=1), h2

# class Seq2seqRNN(nn.Module):
#     def __init__(self,enc,dec,bos,eos,mx=20):
#         super().__init__()
#         self.enc, self.dec = enc, dec
#         self.bos, self.eos, self.mx = bos,eos,mx

#     def forward(self,src,tgt=None,tf=0.5,beam=False,k=3):
#         bs=src.size(0)
#         eo,hidden = self.enc(src)
#         inp = src.new_full((bs,),self.bos)
#         outputs=[]
#         if tgt is not None:
#             L=tgt.size(1)
#             for t in range(L):
#                 out,hidden=self.dec(inp,hidden)
#                 outputs.append(out.unsqueeze(1))
#                 inp = tgt[:,t] if torch.rand(1).item()<tf else out.argmax(1)
#             return torch.cat(outputs,dim=1)
#         # inference
#         if not beam:
#             seqs=[]
#             for _ in range(self.mx):
#                 out,hidden=self.dec(inp,hidden)
#                 top1=out.argmax(1)
#                 seqs.append(top1.unsqueeze(1))
#                 inp=top1
#                 if (top1==self.eos).all(): break
#             return torch.cat(seqs,dim=1)
#         # beam search
#         all_out=[]
#         for i in range(bs):
#             beams=[([self.bos],0.0,hidden[:,i:i+1,:])]
#             for _ in range(self.mx):
#                 nb=[]
#                 for seq,sc,hi in beams:
#                     last=src.new_tensor([seq[-1]])
#                     out,hn=self.dec(last,hi)
#                     vals,inds=out.topk(k)
#                     for j in range(k):
#                         nb.append((seq+[inds[0,j].item()], sc+vals[0,j].item(), hn))
#                 beams=sorted(nb,key=lambda x:x[1],reverse=True)[:k]
#                 if all(s[-1]==self.eos for s,_,_ in beams): break
#             best=beams[0][0][1:]  # drop BOS
#             all_out.append(torch.tensor(best,device=src.device))
#         # pad
#         ml=max(o.size(0) for o in all_out)
#         padded=[F.pad(o,(0,ml-o.size(0)),value=self.eos) for o in all_out]
#         return torch.stack(padded,dim=0)

# # ------------------------
# # 5. Instantiate Model
# # ------------------------
# embed_dim, hid_dim = 300, 300
# bos, eos = vocab['<bos>'], vocab['<eos>']

# # choose encoder
# if use_hier:
#     encoder = HierEncoderRNN(vocab_size, embed_dim, hid_dim, hid_dim)
# else:
#     encoder = EncoderRNN(vocab_size, embed_dim, hid_dim)

# # load GloVe if requested
# if use_glove:
#     print("→ Loading GloVe embeddings...")
#     def load_glove(path):
#         E = torch.randn(vocab_size, embed_dim)
#         with open(path,'r',encoding='utf-8') as f:
#             for l in f:
#                 p=l.split()
#                 w,vec=p[0],torch.tensor(list(map(float,p[1:])))
#                 if w in vocab: E[vocab[w]] = vec
#         return E
#     embw = load_glove(config.glove_path if 'config' in globals() else '/content/glove.6B.300d.txt')
#     encoder.load_embeddings(embw)

# # choose decoder
# decoder = Decoder2RNN(vocab_size, embed_dim, hid_dim) if use_decoder2 else DecoderRNN(vocab_size, embed_dim, hid_dim)

# model = Seq2seqRNN(encoder, decoder, bos, eos, mx=20).to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

# # ------------------------
# # 6. Training & Eval
# # ------------------------
# opt = torch.optim.Adam(model.parameters(), lr=1e-3)
# crit=nn.NLLLoss(ignore_index=0)
# scaler=torch.cuda.amp.GradScaler()

# def train_epoch():
#     model.train(); total=0
#     for src,tgt in train_loader:
#         src,tgt=src.to(model.dec.emb.weight.device),tgt.to(model.dec.emb.weight.device)
#         opt.zero_grad()
#         with torch.cuda.amp.autocast():
#             out=model(src,tgt,tf=0.5,beam=False)
#             loss=crit(out.view(-1,out.size(-1)),tgt.view(-1))
#         scaler.scale(loss).backward()
#         scaler.step(opt); scaler.update()
#         total+=loss.item()
#     return total/len(train_loader)

# def eval_epoch():
#     model.eval(); total=0
#     with torch.no_grad():
#         for src,tgt in val_loader:
#             src,tgt=src.to(model.dec.emb.weight.device),tgt.to(model.dec.emb.weight.device)
#             out=model(src,tgt,tf=0,beam=False)
#             total+=crit(out.view(-1,out.size(-1)),tgt.view(-1)).item()
#     return total/len(val_loader)

# for ep in range(1,6):
#     t0=time.time()
#     tr=train_epoch(); va=eval_epoch()
#     print(f"Epoch {ep}: train {tr:.3f} | val {va:.3f} | {time.time()-t0:.1f}s")

# # ------------------------
# # 7. ROUGE Evaluation
# # ------------------------
# scorer = rouge_scorer.RougeScorer(['rouge1','rouge2','rougeL'],use_stemmer=True)
# r1=r2=rL=cnt=0
# model.eval()
# with torch.no_grad():
#     for src,tgt in test_loader:
#         src=src.to(model.dec.emb.weight.device)
#         pred_seq = model(src, tgt=None, beam=use_beam, k=beam_width).squeeze(0).tolist()
#         # cut at EOS
#         if eos in pred_seq: pred_seq=pred_seq[:pred_seq.index(eos)]
#         pred_txt = seq2text(pred_seq)
#         tgt_seq = tgt.squeeze(0).tolist()
#         tgt_seq=[i for i in tgt_seq if i not in (bos,eos,0)]
#         tgt_txt=seq2text(tgt_seq)
#         scr = scorer.score(tgt_txt,pred_txt)
#         r1+=scr['rouge1'].fmeasure
#         r2+=scr['rouge2'].fmeasure
#         rL+=scr['rougeL'].fmeasure
#         cnt+=1

# print(f"\n→ ROUGE on test ({'beam' if use_beam else 'greedy'}):")
# print(f"   ROUGE‑1: {r1/cnt:.4f}")
# print(f"   ROUGE‑2: {r2/cnt:.4f}")
# print(f"   ROUGE‑L: {rL/cnt:.4f}")
import os
# Mitigate CUDA fragmentation
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import nltk
nltk.download('punkt')
!pip install rouge-score
from rouge_score import rouge_scorer
import os
# Set environment variable for CUDA memory fragmentation mitigation

import pandas as pd
import re
import nltk
import string
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
from collections import Counter
from torch.utils.data import Dataset, DataLoader
!pip install rouge-score

# Download required NLTK data
nltk.download('punkt')
# ------------------------
# 1. Runtime Configuration
# ------------------------
use_glove     = input("Use GloVe embeddings? (y/n): ").strip().lower().startswith('y')
glove_path = None
if use_glove:
    glove_path = input("→ Path to your GloVe file (e.g. /content/glove.6B.300d.txt): ").strip()
    if not os.path.isfile(glove_path):
        raise FileNotFoundError(f"GloVe file not found at '{glove_path}'")
    # Infer embedding dimension from the first line
    with open(glove_path, 'r', encoding='utf-8') as f:
        first = f.readline().split()
        inferred_dim = len(first) - 1
    print(f"Detected GloVe embedding dimension: {inferred_dim}")
else:
    inferred_dim = 300  # default if not using GloVe

use_hier      = input("Use Hierarchical Encoder? (y/n): ").strip().lower().startswith('y')
use_decoder2  = input("Use 2‑GRU Decoder? (y/n): ").strip().lower().startswith('y')
use_beam      = input("Use Beam Search? (y/n): ").strip().lower().startswith('y')
beam_width    = int(input("Beam width (e.g. 3): ").strip() or 3)

print(f"\nConfig → GloVe: {use_glove}, Hier: {use_hier}, Decoder2: {use_decoder2}, Beam: {use_beam}, k={beam_width}\n")

# ------------------------
# 2. Load Data & Build Vocab
# ------------------------
train_df = pd.read_csv('/content/train_processed (1).csv')
val_df   = pd.read_csv('/content/val_processed (1).csv')
test_df  = pd.read_csv('/content/test_processed (1).csv')

def tokenize(txt): return txt.split()

min_freq = 0.001 * len(train_df)
ctr = Counter()
for col in ['text','title']:
    for s in train_df[col]:
        ctr.update(tokenize(s))

vocab = {'<pad>':0,'<bos>':1,'<eos>':2,'<unk>':3}
for tok,f in ctr.items():
    if f >= min_freq:
        vocab[tok] = len(vocab)
inv_vocab = {i:t for t,i in vocab.items()}
vocab_size = len(vocab)
print("Vocab size:", vocab_size)

def text2seq(s):
    return [vocab.get(t, vocab['<unk>']) for t in tokenize(s)]
def seq2text(seq):
    return " ".join(inv_vocab.get(i,'<unk>') for i in seq)

# ------------------------
# 3. Dataset & Dataloader
# ------------------------
class TitleDS(Dataset):
    def __init__(self,df): self.df=df
    def __len__(self): return len(self.df)
    def __getitem__(self,i):
        row=self.df.iloc[i]
        inp=text2seq(row['text'])
        tgt=[vocab['<bos>']] + text2seq(row['title']) + [vocab['<eos>']]
        return torch.tensor(inp), torch.tensor(tgt)

def collate_fn(batch):
    ins,ts = zip(*batch)
    Mi, Mt = max(len(x) for x in ins), max(len(x) for x in ts)
    ins = torch.stack([F.pad(x,(0,Mi-len(x)),value=vocab['<pad>']) for x in ins])
    ts  = torch.stack([F.pad(x,(0,Mt-len(x)),value=vocab['<pad>']) for x in ts])
    return ins, ts

train_loader = DataLoader(TitleDS(train_df), batch_size=8, shuffle=True, collate_fn=collate_fn)
val_loader   = DataLoader(TitleDS(val_df),   batch_size=8, shuffle=False,collate_fn=collate_fn)
test_loader  = DataLoader(TitleDS(test_df),  batch_size=1, shuffle=False,collate_fn=collate_fn)

# ------------------------
# 4. Model Components
# ------------------------
class EncoderRNN(nn.Module):
    def __init__(self,vs,ed,hd):
        super().__init__()
        self.emb=nn.Embedding(vs,ed)
        self.gru=nn.GRU(ed,hd,batch_first=True)
    def forward(self,x):
        return self.gru(self.emb(x))
    def load_embeddings(self,weights):
        self.emb.weight.data.copy_(weights)

class HierEncoderRNN(nn.Module):
    def __init__(self,vs,ed,wh,sh):
        super().__init__()
        self.emb=nn.Embedding(vs,ed)
        self.wgru=nn.GRU(ed,wh,batch_first=True)
        self.sgru=nn.GRU(wh,sh,batch_first=True)
    def forward(self,x):
        wo,wh = self.wgru(self.emb(x))
        se = wo.mean(dim=1,keepdim=True)
        return self.sgru(se)
    def load_embeddings(self,weights):
        self.emb.weight.data.copy_(weights)

class DecoderRNN(nn.Module):
    def __init__(self,vs,ed,hd):
        super().__init__()
        self.emb=nn.Embedding(vs,ed)
        self.gru=nn.GRU(ed,hd,batch_first=True)
        self.fc = nn.Linear(hd,vs)
    def forward(self,tk,h):
        o,h = self.gru(self.emb(tk.unsqueeze(1)),h)
        return F.log_softmax(self.fc(o.squeeze(1)),dim=1), h

class Decoder2RNN(nn.Module):
    def __init__(self,vs,ed,hd):
        super().__init__()
        self.emb=nn.Embedding(vs,ed)
        self.g1 = nn.GRU(ed,hd,batch_first=True)
        self.g2 = nn.GRU(hd,hd,batch_first=True)
        self.fc = nn.Linear(hd,vs)
    def forward(self,tk,h):
        o1,h1 = self.g1(self.emb(tk.unsqueeze(1)),h)
        o2,h2 = self.g2(o1,h1)
        return F.log_softmax(self.fc(o2.squeeze(1)),dim=1), h2

class Seq2seqRNN(nn.Module):
    def __init__(self,enc,dec,bos,eos,mx=20):
        super().__init__()
        self.enc, self.dec = enc, dec
        self.bos, self.eos, self.mx = bos, eos, mx

    def forward(self,src,tgt=None,tf=0.5,beam=False,k=3):
        bs = src.size(0)
        eo,hidden = self.enc(src)
        inp = src.new_full((bs,), self.bos)
        outputs=[]
        if tgt is not None:
            for t in range(tgt.size(1)):
                out,hidden = self.dec(inp,hidden)
                outputs.append(out.unsqueeze(1))
                inp = tgt[:,t] if torch.rand(1).item()<tf else out.argmax(1)
            return torch.cat(outputs,dim=1)
        # inference
        if not beam:
            seqs=[]
            for _ in range(self.mx):
                out,hidden = self.dec(inp,hidden)
                top1 = out.argmax(1)
                seqs.append(top1.unsqueeze(1))
                inp = top1
                if (top1==self.eos).all(): break
            return torch.cat(seqs,dim=1)
        # beam search
        all_out=[]
        for i in range(bs):
            beams=[([self.bos],0.0,hidden[:,i:i+1,:])]
            for _ in range(self.mx):
                nb=[]
                for seq,sc,hi in beams:
                    last = src.new_tensor([seq[-1]])
                    out,hn = self.dec(last,hi)
                    vals,inds = out.topk(k)
                    for j in range(k):
                        nb.append((seq+[inds[0,j].item()], sc+vals[0,j].item(), hn))
                beams = sorted(nb, key=lambda x:x[1], reverse=True)[:k]
                if all(s[-1]==self.eos for s,_,_ in beams): break
            best = beams[0][0][1:]
            all_out.append(torch.tensor(best,device=src.device))
        ml = max(o.size(0) for o in all_out)
        return torch.stack([F.pad(o,(0,ml-o.size(0)),value=self.eos) for o in all_out],dim=0)

# ------------------------
# 5. Instantiate Model
# ------------------------
embed_dim = inferred_dim
hid_dim   = 300
bos, eos  = vocab['<bos>'], vocab['<eos>']

# Encoder choice
encoder = HierEncoderRNN(vocab_size, embed_dim, hid_dim, hid_dim) if use_hier \
          else EncoderRNN(vocab_size, embed_dim, hid_dim)

# Load GloVe if requested
if use_glove:
    print("→ Loading GloVe embeddings from", glove_path)
    def load_glove(path, vs, ed):
        E = torch.randn(vs, ed)
        with open(path,'r',encoding='utf-8') as f:
            for line in f:
                parts = line.split()
                w, vec = parts[0], torch.tensor(list(map(float,parts[1:])))
                idx = vocab.get(w)
                if idx is not None:
                    E[idx] = vec
        return E
    encoder.load_embeddings(load_glove(glove_path, vocab_size, embed_dim))

# Decoder choice
decoder = Decoder2RNN(vocab_size, embed_dim, hid_dim) if use_decoder2 \
          else DecoderRNN(vocab_size, embed_dim, hid_dim)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model  = Seq2seqRNN(encoder, decoder, bos, eos, mx=20).to(device)

# ------------------------
# 6. Training & Eval
# ------------------------
opt    = torch.optim.Adam(model.parameters(), lr=1e-3)
crit   = nn.NLLLoss(ignore_index=vocab['<pad>'])
scaler = torch.cuda.amp.GradScaler()

def train_epoch():
    model.train(); total=0
    for src,tgt in train_loader:
        src,tgt = src.to(device), tgt.to(device)
        opt.zero_grad()
        with torch.cuda.amp.autocast():
            out = model(src,tgt,tf=0.5,beam=False)
            loss = crit(out.view(-1,out.size(-1)), tgt.view(-1))
        scaler.scale(loss).backward()
        scaler.step(opt); scaler.update()
        total += loss.item()
    return total/len(train_loader)

def eval_epoch():
    model.eval(); total=0
    with torch.no_grad():
        for src,tgt in val_loader:
            src,tgt = src.to(device), tgt.to(device)
            out = model(src,tgt,tf=0,beam=False)
            total += crit(out.view(-1,out.size(-1)), tgt.view(-1)).item()
    return total/len(val_loader)

for ep in range(1,6):
    t0 = time.time()
    tr = train_epoch(); va = eval_epoch()
    print(f"Epoch {ep}: train {tr:.3f} | val {va:.3f} | {time.time()-t0:.1f}s")

# ------------------------
# 7. ROUGE Evaluation
# ------------------------
scorer = rouge_scorer.RougeScorer(['rouge1','rouge2','rougeL'], use_stemmer=True)
r1=r2=rL=cnt=0
model.eval()
with torch.no_grad():
    for src,tgt in test_loader:
        src = src.to(device)
        pred_seq = model(src, tgt=None, beam=use_beam, k=beam_width).squeeze(0).tolist()
        if eos in pred_seq: pred_seq = pred_seq[:pred_seq.index(eos)]
        pred_txt = seq2text(pred_seq)
        tgt_seq = [i for i in tgt.squeeze(0).tolist() if i not in (bos,eos,vocab['<pad>'])]
        tgt_txt  = seq2text(tgt_seq)
        scr = scorer.score(tgt_txt, pred_txt)
        r1 += scr['rouge1'].fmeasure
        r2 += scr['rouge2'].fmeasure
        rL += scr['rougeL'].fmeasure
        cnt += 1

print(f"\n→ ROUGE on test ({'beam' if use_beam else 'greedy'}):")
print(f"   ROUGE‑1: {r1/cnt:.4f}")
print(f"   ROUGE‑2: {r2/cnt:.4f}")
print(f"   ROUGE‑L: {rL/cnt:.4f}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Use GloVe embeddings? (y/n): n
Use Hierarchical Encoder? (y/n): n
Use 2‑GRU Decoder? (y/n): n
Use Beam Search? (y/n): y
Beam width (e.g. 3): 2

Config → GloVe: False, Hier: False, Decoder2: False, Beam: True, k=2

Vocab size: 54473


  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():


Epoch 1: train 1.530 | val 1.102 | 263.4s
Epoch 2: train 0.839 | val 0.872 | 260.3s
Epoch 3: train 0.600 | val 0.819 | 263.4s
Epoch 4: train 0.438 | val 0.821 | 262.5s
