In [None]:
import json
from collections import Counter
from tqdm.notebook import tqdm
from google.colab import drive

## Vocabulary building

- Download the data file (`data.json`) from [here](https://drive.google.com/file/d/1oD6R-JW4muQ38VG3HNBDW4Z31Bx5f4B3/view?usp=sharing) and place it in the root of your Google Drive.

In [None]:
# Reserved vocabulary entries; each token's id is its position in this tuple.
SPECIAL_TOKENS = {
    token: token_id
    for token_id, token in enumerate((
        "<pad>",  # Padding
        "<sos>",  # Start of sequence
        "<eos>",  # End of sequence
        "<unk>",  # Unknown word
    ))
}


def build_vocab(data, min_freq_ratio=0):
    """Build a token -> id vocabulary based on document frequency.

    A token is kept when it occurs (in the text or the title) in at least
    ``max(1, int(min_freq_ratio * len(data)))`` articles. Kept tokens receive
    consecutive ids starting right after the reserved ``SPECIAL_TOKENS`` ids,
    so the largest id is always ``len(vocab) - 1``.

    Args:
        data: sized iterable of dicts with ``'text'`` and ``'title'`` strings;
            tokens are obtained with ``str.split()``.
        min_freq_ratio: fraction of articles a token must appear in to be kept.

    Returns:
        dict mapping token -> id, special tokens first.
    """
    # Count how many articles each token appears in
    token_document_count = Counter()
    total_articles = len(data)

    print(f"Total articles: {total_articles}")

    for article in tqdm(data, desc='processing articles...'):
        # set() ensures each token is counted only once per article,
        # even if it appears multiple times in the text or title.
        unique_tokens = set(article['text'].split()) | set(article['title'].split())
        token_document_count.update(unique_tokens)

    # Calculate minimum document frequency threshold based on percentage
    min_document_count = max(1, int(min_freq_ratio * total_articles))
    print(f"Minimum document count: {min_document_count} (appears in {min_freq_ratio*100:.1f}% of articles)")

    # Filter first, then number the survivors. Numbering before filtering
    # leaves gaps in the id space, which can push ids past len(vocab).
    # Tokens that spell a special token are skipped so they cannot overwrite
    # a reserved id.
    kept_tokens = [
        word
        for word, count in tqdm(
            token_document_count.items(),
            desc='creating vocabulary'
        )
        if count >= min_document_count and word not in SPECIAL_TOKENS
    ]

    vocab = dict(SPECIAL_TOKENS)
    first_free_id = len(SPECIAL_TOKENS)
    for offset, word in enumerate(kept_tokens):
        vocab[word] = first_free_id + offset

    print(f"Final vocabulary size: {len(vocab)}")

    return vocab



# Mount Google Drive and read the dataset JSON. `file_path`, `data` and
# `vocab_src` are module-level names that later cells reuse.
drive.mount('/content/drive')
file_path = '/content/drive/My Drive/data.json'
with open(file_path, "r") as f:
    data = json.load(f)

# Use only training data for vocabulary
training_data = data['training_data']

# Build vocabulary using only training data with 1% minimum document frequency
vocab_src = build_vocab(training_data, min_freq_ratio=0.01)

def re_index_vocab(vocab):
    """Return a new dict that numbers the keys of ``vocab`` 0..n-1 in iteration order."""
    return {token: position for position, token in enumerate(vocab)}

# Renumber ids so they are consecutive in key order.
vocab_src = re_index_vocab(vocab_src)

# Source and target share one vocabulary; the target is an independent shallow copy.
vocab_tgt = vocab_src.copy()

# Save vocabularies
with open("vocab_src.json", "w") as f:
    json.dump(vocab_src, f, indent=4)

with open("vocab_tgt.json", "w") as f:
    json.dump(vocab_tgt, f, indent=4)

print("Vocabularies created and saved successfully!")




Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Total articles: 13379
Minimum document count: 133 (appears in 1.0% of articles)
Final vocabulary size: 8603
Vocabularies created and saved successfully!


## Hier Encoder, Decoder, Beam Search

In [None]:
import torch
# Builds a CUDA device object so the cell displays it; the result is not assigned to a name.
torch.device('cuda')

device(type='cuda')

device(type='cuda')

In [None]:
!pip install rouge_score



In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import json
import random
from rouge_score import rouge_scorer
import numpy as np
from tqdm.notebook import tqdm

# Reserved vocabulary entries shared by the dataset, the collate function and the loss.
SPECIAL_TOKENS = {
    "<pad>": 0,  # Padding
    "<sos>": 1,  # Start of sequence
    "<eos>": 2,  # End of sequence
    "<unk>": 3   # Unknown word
}


# Device configuration: use the GPU when one is present, otherwise fall back
# to the CPU (same check as the later inference cell).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Hyperparameters
EMB_DIM = 300 # 512
HID_DIM = 300
BATCH_SIZE = 64
LEARNING_RATE = 0.001
N_EPOCHS = 50
MAX_LEN = 10 # cap on title words and on decoding steps; can be increased
MAX_LEN_SRC = 500  # cap on article words fed to the encoder
TEACHER_FORCING_RATIO = 0.5

# Text processing
# def clean_text(text):
#     """Basic text cleaning"""
#     if not isinstance(text, str):
#         return ""
#     text = text.lower().strip()
#     text = re.sub(r'[^a-z0-9\\s]', '', text)  # Keep alphanumeric
#     return ' '.join(text.split())


# Dataset
class HeadlineDataset(Dataset):
    """Article/title pairs encoded as id tensors for the seq2seq model.

    Both sequences are wrapped as ``<sos> ... <eos>``. The training loop drops
    position 0 of the target as the ``<sos>`` slot and the decoder stops on
    ``<eos>``, so the target needs both markers just like the source.

    Args:
        data: sequence of dicts with ``'text'`` and ``'title'`` strings.
        vocab_src: token -> id mapping, used for both source and target.
        max_len_src: source length cap; defaults to the module-level ``MAX_LEN_SRC``.
        max_len_tgt: target length cap; defaults to the module-level ``MAX_LEN``.
    """

    def __init__(self, data, vocab_src, max_len_src=None, max_len_tgt=None):
        self.data = data
        self.vocab_src = vocab_src
        self.vocab_tgt = vocab_src
        self.max_len_src = MAX_LEN_SRC if max_len_src is None else max_len_src
        self.max_len_tgt = MAX_LEN if max_len_tgt is None else max_len_tgt

    def __len__(self):
        return len(self.data)

    @staticmethod
    def _encode(sentence, vocab, max_len):
        """Encode ``sentence`` as ``<sos>`` + at most ``max_len - 1`` word ids + ``<eos>``."""
        unk_id = vocab['<unk>']
        vocab_size = len(vocab)

        ids = [vocab['<sos>']]
        ids += [vocab.get(word, unk_id) for word in sentence.split()[:max(max_len - 1, 0)]]
        ids.append(vocab['<eos>'])

        # Ids at or beyond the vocabulary size would index outside the
        # embedding table, so they are mapped to <unk>.
        ids = [tok if tok < vocab_size else unk_id for tok in ids]
        return torch.tensor(ids, dtype=torch.long)

    def __getitem__(self, idx):
        """Return ``(src, tgt)`` as 1-D ``torch.long`` tensors for item ``idx``."""
        item = self.data[idx]
        src = self._encode(item['text'], self.vocab_src, self.max_len_src)
        tgt = self._encode(item['title'], self.vocab_tgt, self.max_len_tgt)
        return src, tgt



def collate_fn(batch):
    """Pad a list of ``(src, tgt)`` tensor pairs into two batch-first tensors.

    Shorter sequences are right-padded with the ``<pad>`` id.
    """
    pad_id = SPECIAL_TOKENS['<pad>']
    sources, targets = map(list, zip(*batch))
    return (
        pad_sequence(sources, batch_first=True, padding_value=pad_id),
        pad_sequence(targets, batch_first=True, padding_value=pad_id),
    )

def check_dataset_indices(dataset, vocab_size):
    """Check that every id in ``dataset`` lies in ``[0, vocab_size)``.

    Args:
        dataset: indexable, sized collection yielding ``(src, tgt)`` tensors.
        vocab_size: number of rows in the embedding table.

    Returns:
        True when all samples are valid; False (after printing the index of
        the first offender) otherwise.
    """
    for i in range(len(dataset)):
        for seq in dataset[i]:
            # An empty tensor has no max/min; it cannot hold an invalid id.
            if seq.numel() == 0:
                continue
            if seq.max() >= vocab_size or seq.min() < 0:
                print(f"Invalid indices in sample {i}")
                return False
    return True

# Model Architecture
class Encoder(nn.Module):
    """Bidirectional GRU encoder.

    ``forward`` returns the per-step GRU outputs together with a single
    hidden state for the decoder. That state is the final forward and backward
    GRU states concatenated, projected back to ``hid_dim`` and passed through
    ``tanh``.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, dropout_rate= 0.5):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.dropout = nn.Dropout(dropout_rate)
        self.rnn = nn.GRU(emb_dim, hid_dim, batch_first=True, bidirectional=True)
        # Maps the concatenated directions (2 * hid_dim) down to hid_dim.
        self.fc = nn.Linear(hid_dim * 2, hid_dim)

    def forward(self, src):
        token_vectors = self.embedding(src)
        outputs, final_states = self.rnn(self.dropout(token_vectors))

        # final_states[0] is the forward direction, final_states[1] the backward one.
        both_directions = torch.cat([final_states[0], final_states[1]], dim=1)
        decoder_state = self.fc(both_directions).tanh()

        # Add the leading layer axis the decoder GRU expects.
        return outputs, decoder_state[None]

class Decoder(nn.Module):
    """Single-step GRU decoder.

    Args:
        output_dim: size of the target vocabulary.
        emb_dim: embedding width.
        hid_dim: GRU hidden width.
        ff_dim: width of the optional feed-forward block. That block is
            currently disabled, so the value is unused; the parameter is kept
            so existing call sites stay valid.
        dropout_rate: dropout applied to the input embeddings.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, ff_dim=512, dropout_rate=0.5):
        super().__init__()

        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.dropout = nn.Dropout(dropout_rate)
        self.rnn = nn.GRU(emb_dim, hid_dim, batch_first=True)

        # Optional FFNN between the GRU and the output layer (disabled):
        # self.ffnn = nn.Sequential(
        #     nn.Linear(hid_dim, ff_dim),
        #     nn.ReLU(),
        #     nn.Dropout(dropout_rate),
        #     nn.Linear(ff_dim, hid_dim),
        #     nn.ReLU()
        # )

        # Final projection to vocabulary logits
        self.fc_out = nn.Linear(hid_dim, output_dim)

    def forward(self, input_token, hidden):
        """Run one decoding step.

        Args:
            input_token: id tensor; the caller in this file passes shape [batch_size, 1].
            hidden: GRU state returned by the encoder or by the previous step.

        Returns:
            ``(prediction, hidden)``: vocabulary logits with the length-1 step
            axis squeezed out, and the updated GRU state.
        """
        embedded = self.dropout(self.embedding(input_token))
        output, hidden = self.rnn(embedded, hidden)

        # With the FFNN enabled this would be:
        # prediction = self.fc_out(self.ffnn(output).squeeze(1))
        prediction = self.fc_out(output.squeeze(1))

        return prediction, hidden



class Seq2Seq(nn.Module):
    """Encoder/decoder wrapper with optional teacher forcing.

    With ``tgt`` given, decoding runs for ``tgt.shape[1]`` steps. Without it,
    decoding runs for at most ``max_len`` steps and stops early once every
    sequence in the batch has just produced ``<eos>``.
    """

    def __init__(self, encoder, decoder, device, vocab_src, max_len=50):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.max_len = max_len
        self.vocabulary = vocab_src

    def forward(self, src, tgt=None, teacher_forcing_ratio=1):
        inference = tgt is None
        n_rows = src.shape[0]
        n_steps = self.max_len if inference else tgt.shape[1]
        n_classes = self.decoder.fc_out.out_features

        # Slot 0 is never written, because the loop below starts at step 1.
        outputs = torch.zeros(n_rows, n_steps, n_classes).to(self.device)

        # Only the final encoder state is used; the decoder has no attention input.
        _, hidden = self.encoder(src)

        # Every sequence starts from <sos>; transposed to shape [batch_size, 1].
        input_token = torch.tensor([[self.vocabulary["<sos>"]] * n_rows], device=self.device).T

        for step in range(1, n_steps):
            prediction, hidden = self.decoder(input_token, hidden)
            outputs[:, step, :] = prediction

            # The coin is flipped on every step, including during inference.
            feed_truth = random.random() < teacher_forcing_ratio

            if feed_truth and not inference:
                # Teacher forcing: the ground-truth token becomes the next input.
                input_token = tgt[:, step].unsqueeze(1)
                continue

            # Otherwise feed back the model's own best guess.
            input_token = prediction.argmax(1).unsqueeze(1)

            if inference and (input_token == self.vocabulary["<eos>"]).all():
                break

        return outputs


def evaluate_rouge(model, dataset, vocab_src, vocab_tgt, num_examples=8):
    """Greedy-decode up to 100 samples and report ROUGE F-measures.

    Args:
        model: module with a ``device`` attribute, callable as
            ``model(src, tgt=None, teacher_forcing_ratio=0)``.
        dataset: indexable collection yielding ``(src, tgt)`` id tensors.
        vocab_src: unused here; kept for call-site compatibility.
        vocab_tgt: token -> id mapping used to turn ids back into words.
        num_examples: number of decoded examples to print.

    Returns:
        dict with the mean ``rouge1``/``rouge2``/``rougeL`` F-measure
        (0.0 for each when the dataset is empty).

    The model is switched to eval mode and left there.
    """
    model.eval()
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    idx2word_tgt = {v: k for k, v in vocab_tgt.items()}

    scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
    examples = []

    # Ids that never appear in the decoded text
    special_tokens = {vocab_tgt["<pad>"], vocab_tgt["<sos>"], vocab_tgt["<eos>"], vocab_tgt["<unk>"]}

    with torch.no_grad():
        for i in range(min(100, len(dataset))):  # Evaluate on max 100 examples
            src, tgt = dataset[i]
            src = src.unsqueeze(0).to(model.device)

            # Generate output with the model (no target provided for inference)
            outputs = model(src, tgt=None, teacher_forcing_ratio=0)

            # batch_size is 1, so take index 0; argmax over the vocabulary axis
            output_tokens = outputs[0].argmax(dim=1).cpu().tolist()

            # Convert ids to words, filtering out special tokens
            pred = ' '.join(
                idx2word_tgt[idx] for idx in output_tokens
                if idx not in special_tokens
            )
            true = ' '.join(
                idx2word_tgt[idx] for idx in tgt.tolist()
                if idx not in special_tokens
            )

            # Calculate ROUGE scores
            rouge_scores = scorer.score(true, pred)
            for key in scores:
                scores[key].append(rouge_scores[key].fmeasure)

            # Save examples
            if len(examples) < num_examples:
                examples.append((pred, true, rouge_scores))

    # Print examples
    for i, (pred, true, rouge) in enumerate(examples):
        print(f"\nExample {i+1}:")
        print(f"Predicted: {pred}")
        print(f"True: {true}")
        print(f"ROUGE-1: {rouge['rouge1'].fmeasure:.3f}")
        print(f"ROUGE-2: {rouge['rouge2'].fmeasure:.3f}")
        print(f"ROUGE-L: {rouge['rougeL'].fmeasure:.3f}")

    # Average scores; an empty dataset yields 0.0 rather than NaN
    avg_scores = {k: (np.mean(v) if v else 0.0) for k, v in scores.items()}
    print("\nAverage ROUGE Scores:")
    print(f"ROUGE-1: {avg_scores['rouge1']:.3f}")
    print(f"ROUGE-2: {avg_scores['rouge2']:.3f}")
    print(f"ROUGE-L: {avg_scores['rougeL']:.3f}")

    return avg_scores


def train_model(data, vocab_src):
    """Train the GRU seq2seq headline generator and evaluate it with ROUGE.

    Args:
        data: dict with ``'training_data'``, ``'validation_data'`` and
            ``'test_data'`` lists of ``{'text', 'title'}`` dicts.
        vocab_src: token -> id mapping shared by encoder and decoder.

    Returns:
        ``(model, val_scores, test_scores)``. ``val_scores`` is the result of
        the last validation run, or ``None`` when no validation ran (fewer
        than 5 epochs).

    Side effects:
        Writes ``best_headline_generator.pth`` whenever validation ROUGE-L
        improves, and ``final_headline_generator.pth`` at the end.
    """
    print(f"Vocab size: {len(vocab_src)}")

    # Create datasets
    train_data = HeadlineDataset(data['training_data'], vocab_src)
    val_data = HeadlineDataset(data['validation_data'], vocab_src)
    test_data = HeadlineDataset(data['test_data'], vocab_src)

    # Fail before training starts if any id would fall outside the embedding table.
    assert check_dataset_indices(train_data, len(vocab_src)), "Invalid indices found in training data"

    # Create dataloaders
    train_loader = DataLoader(
        train_data,
        batch_size=BATCH_SIZE,
        shuffle=True,
        collate_fn=collate_fn,
        pin_memory=True,
    )

    # Initialize model
    encoder = Encoder(len(vocab_src), EMB_DIM, HID_DIM)
    decoder = Decoder(len(vocab_src), EMB_DIM, HID_DIM)
    model = Seq2Seq(encoder, decoder, device, vocab_src, max_len=MAX_LEN).to(device)

    # Optimizer and loss; padded positions do not contribute to the loss
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    criterion = nn.CrossEntropyLoss(ignore_index=SPECIAL_TOKENS['<pad>'])

    # Training loop
    best_val_score = 0
    val_scores = None  # stays None when no validation round runs
    for epoch in range(N_EPOCHS):
        model.train()
        epoch_loss = 0

        for src, tgt in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            src, tgt = src.to(device), tgt.to(device)

            optimizer.zero_grad()

            # Pass tgt for teacher forcing
            output = model(src, tgt, teacher_forcing_ratio=TEACHER_FORCING_RATIO)

            # output: [batch_size, tgt_len, vocab_size]
            # Position 0 is never filled by Seq2Seq.forward (its loop starts
            # at step 1), so it is dropped together with the first target token.
            output_dim = output.shape[-1]
            output = output[:, 1:].reshape(-1, output_dim)
            tgt = tgt[:, 1:].reshape(-1)

            loss = criterion(output, tgt)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            epoch_loss += loss.item()

        avg_loss = epoch_loss/len(train_loader)
        print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}")

        # Evaluate every 5 epochs
        if (epoch+1) % 5 == 0:
            print("\nValidation Evaluation:")
            val_scores = evaluate_rouge(model, val_data, vocab_src, vocab_src)

            # Save the best model
            if val_scores['rougeL'] > best_val_score:
                best_val_score = val_scores['rougeL']
                torch.save({
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': avg_loss,
                    'vocab': vocab_src
                }, 'best_headline_generator.pth')
                print(f"New best model saved with ROUGE-L: {best_val_score:.3f}")

    # Final evaluation
    print("\nTest Evaluation:")
    test_scores = evaluate_rouge(model, test_data, vocab_src, vocab_src)

    # Save final model
    torch.save({
        'model_state_dict': model.state_dict(),
        'vocab': vocab_src,
        'config': {
            'emb_dim': EMB_DIM,
            'hid_dim': HID_DIM,
            'max_len': MAX_LEN,
            'max_len_src' : MAX_LEN_SRC
        }
    }, 'final_headline_generator.pth')

    return model, val_scores, test_scores

# Entry point for the training cell. `file_path` and `vocab_src` are not
# defined in this cell; they come from the vocabulary-building cell above,
# which therefore has to run first.
if __name__ == "__main__":
    # Load data - Make sure this matches your actual data loading code

    with open(file_path, "r") as f:
        data = json.load(f)

    model, val_scores, test_scores = train_model(data, vocab_src)

Using device: cuda
Vocab size: 8603
Epoch 1, Loss: 3.8689
Epoch 2, Loss: 2.9240
Epoch 3, Loss: 2.5205
Epoch 4, Loss: 2.2017
Epoch 5, Loss: 1.9351

Validation Evaluation:

Example 1:
Predicted: john jones
True: john wilson
ROUGE-1: 0.500
ROUGE-2: 0.000
ROUGE-L: 0.500

Example 2:
Predicted: 
True: sofia
ROUGE-1: 0.000
ROUGE-2: 0.000
ROUGE-L: 0.000

Example 3:
Predicted: 
True: 
ROUGE-1: 0.000
ROUGE-2: 0.000
ROUGE-L: 0.000

Example 4:
Predicted: tim
True: jeff american football
ROUGE-1: 0.000
ROUGE-2: 0.000
ROUGE-L: 0.000

Example 5:
Predicted: 
True: 
ROUGE-1: 0.000
ROUGE-2: 0.000
ROUGE-L: 0.000

Example 6:
Predicted: mount township new jersey
True: mount laurel new jersey
ROUGE-1: 0.750
ROUGE-2: 0.333
ROUGE-L: 0.750

Example 7:
Predicted: 
True: tyrone
ROUGE-1: 0.000
ROUGE-2: 0.000
ROUGE-L: 0.000

Example 8:
Predicted: ray smith
True: raymond washington
ROUGE-1: 0.000
ROUGE-2: 0.000
ROUGE-L: 0.000

Average ROUGE Scores:
ROUGE-1: 0.270
ROUGE-2: 0.055
ROUGE-L: 0.270
New best model saved w

In [None]:
# Sanity check: the embedding tables are sized with len(vocab_src), so the
# largest id must be strictly smaller than the vocabulary size.
print("Max index in vocab:", max(vocab_src.values()))
print("Vocab size:", len(vocab_src))

In [None]:
import os
# Debugging aid for locating failing CUDA kernels.
# NOTE(review): this cell runs after the training cell has already used CUDA;
# presumably the variable only takes effect when set before CUDA is
# initialised. Confirm, and move this to the top of the notebook if so.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [None]:
# Spot-check the first five training samples: print the encoded title and
# assert that all ids fit inside the vocabulary.
train_data = HeadlineDataset(data['training_data'], vocab_src)
for i in range(5):
    src, tgt = train_data[i]
    print(tgt)
    assert src.max() < len(vocab_src), f"Invalid src index in sample {i}"
    assert tgt.max() < len(vocab_src), f"Invalid tgt index in sample {i}"

## Transformer Fine-tuning


In [None]:
import json
import pandas as pd
import numpy as np
from tqdm import tqdm

def create_datasets(train_df, test_df, random_seed=42, val_size=500):
    """Split the training frame into train/validation and wrap the test frame.

    No text processing is applied. Validation rows are picked by row
    *position*, so the split does not depend on how ``train_df`` is indexed.

    Args:
        train_df: DataFrame with ``'text'`` and ``'title'`` columns.
        test_df: DataFrame with ``'text'`` and ``'title'`` columns.
        random_seed: seed passed to ``np.random.seed`` before shuffling.
        val_size: number of training rows moved to the validation set.

    Returns:
        ``(training_data, test_data, validation_data)``, each a list of
        ``{'text', 'title'}`` dicts. Note the order: test comes second.
    """
    print("Creating datasets...")

    # Create a random permutation for validation split
    np.random.seed(random_seed)
    shuffled = np.random.permutation(len(train_df))
    validation_positions = set(shuffled[:val_size].tolist())

    training_data = []
    validation_data = []
    test_data = []

    # Extract training and validation data
    print("Loading training data...")
    train_rows = zip(train_df['text'], train_df['title'])
    for position, (text, title) in enumerate(
        tqdm(train_rows, total=len(train_df), desc="Extracting training data")
    ):
        item = {'text': text, 'title': title}
        if position in validation_positions:
            validation_data.append(item)
        else:
            training_data.append(item)

    # Extract test data
    print("Loading test data...")
    test_rows = zip(test_df['text'], test_df['title'])
    for text, title in tqdm(test_rows, total=len(test_df), desc="Extracting test data"):
        test_data.append({'text': text, 'title': title})

    return training_data, test_data, validation_data

def main():
    """Read the train/test CSVs from Drive, split them and save one JSON file.

    Writes ``processed_data.json`` in the working directory with the keys
    ``'training_data'``, ``'test_data'`` and ``'validation_data'``. Returns
    ``None``; if either CSV is missing, an explanation is printed and nothing
    is written.
    """
    train_csv = '/content/drive/My Drive/train.csv'
    test_csv = '/content/drive/My Drive/test.csv'

    # Load data
    print("Loading CSV files...")
    try:
        train_df = pd.read_csv(train_csv)
        test_df = pd.read_csv(test_csv)
    except FileNotFoundError as e:
        print(f"Error: {e}")
        # Name the paths that were actually tried, not the current directory.
        print(f"Please ensure {train_csv} and {test_csv} exist (is Google Drive mounted?).")
        return

    # Create datasets
    training_data, test_data, validation_data = create_datasets(train_df, test_df)

    # Save results
    data_dict = {
        'training_data': training_data,
        'test_data': test_data,
        'validation_data': validation_data,
    }

    output_path = 'processed_data.json'
    print(f"Saving results to {output_path}...")
    with open(output_path, 'w') as f:
        json.dump(data_dict, f, indent=4)

    print(f"Processing complete! Data saved to {output_path}")
    print(f"Processed {len(training_data)} training items, {len(test_data)} test items, and {len(validation_data)} validation items")

if __name__ == '__main__':
    main()


Loading CSV files...
Creating datasets...
Loading training data...
Loading test data...
Saving results to processed_data.json...
Processing complete! Data saved to processed_data.json
Processed 13379 training items, 100 test items, and 500 validation items


In [None]:
!pip install datasets



In [None]:
import torch
import numpy as np
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)
from datasets import Dataset
from rouge_score import rouge_scorer
from tqdm import tqdm

# 1. Load pretrained model
# Note: `model` rebinds the name that held the GRU model returned by
# train_model() in the earlier training cell.
model_name = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# 2. Prepare datasets
def convert_to_hf_dataset(data, tokenizer, max_input_length=512, max_target_length=64):
    """Tokenize article/title pairs into a ``datasets.Dataset``.

    Args:
        data: list of dicts with ``'text'`` and ``'title'`` strings.
        tokenizer: tokenizer callable on a list of strings; the result must
            expose ``input_ids`` and ``attention_mask``.
        max_input_length: pad/truncate length for the prefixed articles.
        max_target_length: pad/truncate length for the titles.

    Returns:
        Dataset with ``input_ids``, ``attention_mask`` and ``labels`` columns.
        Pad positions in ``labels`` are replaced by -100.
    """
    texts = [item['text'] for item in data]
    titles = [item['title'] for item in data]

    # Prefix each article with the task instruction string
    inputs = ["summarize: " + text for text in texts]

    input_encodings = tokenizer(
        inputs,
        max_length=max_input_length,
        truncation=True,
        padding="max_length",
    )
    target_encodings = tokenizer(
        titles,
        max_length=max_target_length,
        truncation=True,
        padding="max_length",
    )

    # -100 marks label positions to be excluded from the loss
    # (compute_metrics maps them back to the pad id before decoding).
    pad_id = tokenizer.pad_token_id
    labels = [
        [(tok if tok != pad_id else -100) for tok in sequence]
        for sequence in target_encodings.input_ids
    ]

    return Dataset.from_dict({
        "input_ids": input_encodings.input_ids,
        "attention_mask": input_encodings.attention_mask,
        "labels": labels,
    })

# Create datasets
# `data` is the dict loaded from data.json in an earlier cell.
train_dataset = convert_to_hf_dataset(data['training_data'], tokenizer)
val_dataset = convert_to_hf_dataset(data['validation_data'], tokenizer)
test_dataset = convert_to_hf_dataset(data['test_data'], tokenizer)

# Create data collator
# NOTE(review): convert_to_hf_dataset already pads everything to max_length,
# so the collator presumably has no padding left to do. Confirm.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# 3. Define evaluation metrics
def compute_metrics(eval_pred):
    """Compute mean ROUGE F-measures for a ``Seq2SeqTrainer`` evaluation.

    Args:
        eval_pred: ``(predictions, labels)`` arrays of token ids. A tuple of
            prediction arrays is also accepted; its first element is used.

    Returns:
        dict with the mean ``rouge1``, ``rouge2`` and ``rougeL`` F-measure
        (0.0 for each when there are no predictions).
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is not a valid token id, so map it back to the pad id before
    # decoding. Labels always carry it; predictions can too when generated
    # sequences of different lengths are gathered.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    scores = {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0}
    for pred, label in zip(decoded_preds, decoded_labels):
        results = scorer.score(label, pred)
        for metric in scores:
            scores[metric] += results[metric].fmeasure

    count = len(decoded_preds)
    return {k: (v / count if count else 0.0) for k, v in scores.items()}

# 4. Define training arguments
# Evaluation runs once per epoch; predict_with_generate makes evaluation call
# generate(), so compute_metrics receives token ids.
# NOTE(review): `evaluation_strategy` is presumably renamed to `eval_strategy`
# in newer transformers releases. Confirm against the installed version.
training_args = Seq2SeqTrainingArguments(
    output_dir="./t5-headline-generator",
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    predict_with_generate=True,
    fp16=True if torch.cuda.is_available() else False,
    logging_steps=100,
    push_to_hub=False,
)

# 5. Create trainer and train
# NOTE(review): the recorded output shows a warning pointing at this
# constructor call, presumably a deprecation of the `tokenizer=` argument.
# Confirm against the installed transformers version.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Start training
trainer.train()

# 6. Generate headlines and evaluate
def generate_and_evaluate(model, tokenizer, test_data, use_beam_search=False, beam_width=4):
    """Generate a headline per test item and score the results with ROUGE.

    Args:
        model: seq2seq model exposing ``eval()``, ``device`` and ``generate()``.
        tokenizer: matching tokenizer.
        test_data: list of dicts with ``'text'`` and ``'title'`` strings.
        use_beam_search: decode with ``beam_width`` beams and early stopping
            instead of a single beam.
        beam_width: number of beams when ``use_beam_search`` is True.

    Returns:
        ``(predictions, avg_scores)``: the decoded headlines and the mean
        ``rouge1``/``rouge2``/``rougeL`` F-measures (0.0 for each when
        ``test_data`` is empty).
    """
    model.eval()
    batch_size = 16
    num_samples = len(test_data)
    predictions = []
    references = [item['title'] for item in test_data]

    # The two decoding modes differ only in these keyword arguments.
    generation_kwargs = {"max_length": 64, "num_beams": 1}
    if use_beam_search:
        generation_kwargs.update(num_beams=beam_width, early_stopping=True)

    for i in tqdm(range(0, num_samples, batch_size)):
        batch_data = test_data[i:i + batch_size]
        input_texts = ["summarize: " + item['text'] for item in batch_data]
        inputs = tokenizer(
            input_texts,
            max_length=512,
            truncation=True,
            padding=True,
            return_tensors="pt",
        ).to(model.device)

        outputs = model.generate(**inputs, **generation_kwargs)

        decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        predictions.extend(decoded_outputs)

    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0}

    for pred, ref in zip(predictions, references):
        results = scorer.score(ref, pred)
        for metric in scores:
            scores[metric] += results[metric].fmeasure

    count = len(predictions)
    avg_scores = {k: (v / count if count else 0.0) for k, v in scores.items()}

    print(f"\n{'Beam Search' if use_beam_search else 'Greedy'} Results:")
    for k, v in avg_scores.items():
        print(f"{k}: {v:.4f}")

    return predictions, avg_scores

# Generate with greedy search
greedy_preds, greedy_scores = generate_and_evaluate(model, tokenizer, data['test_data'], use_beam_search=False)

# Generate with beam search
beam_preds, beam_scores = generate_and_evaluate(model, tokenizer, data['test_data'], use_beam_search=True, beam_width=4)

# Compare results side by side. The leading "\n" prints a blank line before
# the table, as in the recorded output.
print("\nResults Comparison:")
print(f"{'Metric':<10} {'Greedy':<10} {'Beam Search':<10}")
print("-" * 30)
for metric in ['rouge1', 'rouge2', 'rougeL']:
    print(f"{metric:<10} {greedy_scores[metric]:.4f}      {beam_scores[metric]:.4f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  trainer = Seq2SeqTrainer(



Greedy Results:
rouge1: 0.9511
rouge2: 0.8493
rougeL: 0.9494

Beam Search Results:
rouge1: 0.9577
rouge2: 0.8622
rougeL: 0.9561

Results Comparison:
Metric     Greedy     Beam Search
------------------------------
rouge1     0.9511      0.9577
rouge2     0.8493      0.8622
rougeL     0.9494      0.9561


## C2


In [None]:
!pip install rouge_score
!pip install huggingface_hub[hf_xet]
!pip install hf_xet


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=e9c74e8bb83afbb59b4481a77ce4d06383cd6e3537c3689c8903db35be52aace
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
Collecting hf-xet>=0.1.4 (from huggingface_hub[hf_xet])
  Downloading hf_xet-1.0.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (494 bytes)
Downloading hf_xet-1.0.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (53.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.8/53.8 MB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pandas as pd
import json
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from rouge_score import rouge_scorer
import numpy as np
import time

# First, check if CUDA is available
# `device` is a module-level name used by the functions below.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

if device.type == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"CUDA Version: {torch.version.cuda}")
    # The value printed is the device's total memory (total_memory), converted to GiB.
    print(f"Available GPU memory: {torch.cuda.get_device_properties(0).total_memory / (1024**3):.2f} GB")

# Step 1: Convert CSV to JSON
def convert_csv_to_json(csv_path, json_path):
    """Rewrite the CSV at ``csv_path`` as JSON Lines (one record per line).

    Returns ``json_path`` so the call can be chained.
    """
    pd.read_csv(csv_path).to_json(json_path, orient='records', lines=True)
    print(f"Converted {csv_path} to {json_path}")
    return json_path

# Step 2: Load articles from JSON
def load_articles_from_json(json_path):
    """Load aligned article/title lists from a JSON Lines file.

    The article and title field names are detected from the first record.
    Blank lines are ignored. Records that lack either field, or hold
    ``null`` in one of them, are skipped as a pair so that ``articles[i]``
    always belongs to ``reference_titles[i]``.

    Args:
        json_path: path to a file with one JSON object per line.

    Returns:
        ``(articles, reference_titles)``, two lists of equal length.

    Raises:
        ValueError: if the file holds no records, or no known article/title
            field is present in the first record.
    """
    with open(json_path, 'r') as file:
        json_data = [json.loads(line) for line in file if line.strip()]

    if not json_data:
        raise ValueError(f"No records found in {json_path}")

    # Identify article and title fields
    possible_article_fields = ['article', 'body', 'content', 'text']
    possible_title_fields = ['title', 'headline', 'header']

    article_field = next((f for f in possible_article_fields if f in json_data[0]), None)
    title_field = next((f for f in possible_title_fields if f in json_data[0]), None)

    if not article_field or not title_field:
        raise ValueError(f"Could not find article or title fields. Available: {list(json_data[0].keys())}")

    articles = []
    reference_titles = []
    for item in json_data:
        article = item.get(article_field)
        title = item.get(title_field)
        # Filtering the two lists independently would shift one against the other.
        if article is None or title is None:
            continue
        articles.append(article)
        reference_titles.append(title)

    return articles, reference_titles

# Step 3: Generate titles using LLMs
def generate_titles(articles, model, tokenizer, prompt_prefix, device):
    """Generate one title per article with beam search (5 beams).

    Args:
        articles: sequence of article strings.
        model: seq2seq model exposing ``generate(**inputs, ...)``.
        tokenizer: matching tokenizer, also used to decode the output.
        prompt_prefix: instruction text prepended to every article.
        device: device the tokenized inputs are moved to; it should match
            the model's device.

    Returns:
        list of generated titles, in the order of ``articles``.
    """
    generated_titles = []
    start_time = time.time()

    for i, article in enumerate(articles):
        if i % 10 == 0 and i > 0:
            print(f"  Processed {i}/{len(articles)} articles...")

        prompt = prompt_prefix + article
        inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
        # Move inputs to the same device as the model
        inputs = {k: v.to(device) for k, v in inputs.items()}

        outputs = model.generate(**inputs, max_length=50, num_beams=5, early_stopping=True)
        generated_title = tokenizer.decode(outputs[0], skip_special_tokens=True)
        generated_titles.append(generated_title)

    elapsed_time = time.time() - start_time
    print(f"  Generation completed in {elapsed_time:.2f} seconds for {len(articles)} articles")
    # No per-article average can be computed for an empty input.
    if len(articles) > 0:
        print(f"  Average time per article: {elapsed_time/len(articles):.2f} seconds")

    return generated_titles

# Step 4: Calculate ROUGE scores
def calculate_rouge_scores(reference_titles, generated_titles):
    """Average ROUGE-1/2/L F-measures over paired reference and generated titles.

    Args:
        reference_titles: ground-truth titles.
        generated_titles: model outputs, in the same order.

    Returns:
        dict with ``'rouge1'``, ``'rouge2'``, ``'rougeL'`` (each 0 when there
        are no pairs) and ``'average'``, the mean of those three values.
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)

    # F-measures collected per metric
    per_metric = {name: [] for name in metric_names}
    for ref_title, gen_title in zip(reference_titles, generated_titles):
        scores = scorer.score(ref_title, gen_title)
        for name in metric_names:
            per_metric[name].append(scores[name].fmeasure)

    # Calculate average ROUGE scores
    averages = {
        name: (sum(values) / len(values) if values else 0)
        for name, values in per_metric.items()
    }

    return {
        'rouge1': averages['rouge1'],
        'rouge2': averages['rouge2'],
        'rougeL': averages['rougeL'],
        'average': (averages['rouge1'] + averages['rouge2'] + averages['rougeL']) / 3,
    }

# Helper shared by the per-run report and the final summary
def _print_rouge_scores(scores):
    """Print the ROUGE-1, ROUGE-2, ROUGE-L and average values of ``scores``.

    Args:
        scores: Mapping with numeric values under the keys ``'rouge1'``,
            ``'rouge2'``, ``'rougeL'`` and ``'average'``.
    """
    print(f"    ROUGE-1: {scores['rouge1']:.4f}")
    print(f"    ROUGE-2: {scores['rouge2']:.4f}")
    print(f"    ROUGE-L: {scores['rougeL']:.4f}")
    print(f"    Average: {scores['average']:.4f}")


# Main function to execute all steps
def main():
    """Run the title-generation evaluation for every model and prompt.

    Steps:
      1. Convert the test CSV to JSON via ``convert_csv_to_json``.
      2. Load articles and reference titles via ``load_articles_from_json``.
      3. For each model in ``model_names`` and each prompt in
         ``prompt_variations``, generate titles with ``generate_titles`` and
         score them with ``calculate_rouge_scores``.
      4. Print per-model scores and the per-prompt average across models.

    Relies on the module-level ``device`` (a ``torch.device``) for model
    placement. Any exception raised along the way is caught and reported
    (message plus traceback) instead of propagating, so the function always
    returns ``None``.
    """
    import traceback  # local import: only needed for error reporting here

    # Paths for the Google Colab environment: the CSV lives on the mounted
    # Drive, the intermediate JSON is written to the Colab VM's local disk.
    csv_path = '/content/drive/MyDrive/test.csv'
    json_path = '/content/test.json'

    # Models to use - both base and large as per task requirements
    model_names = ['google/flan-t5-base', 'google/flan-t5-large']

    # Define prompt variations for title generation
    prompt_variations = [
        "Give a title for the following article: ",
        "Based on this given text create an appropriate title : "
    ]

    metric_keys = ('rouge1', 'rouge2', 'rougeL', 'average')

    try:
        # Step 1: Convert CSV to JSON (the helper's return value is used as
        # the JSON path from here on)
        json_path = convert_csv_to_json(csv_path, json_path)

        # Step 2: Load articles and reference titles from JSON
        articles, reference_titles = load_articles_from_json(json_path)
        print(f"Loaded {len(articles)} articles with reference titles")

        # results[model_name][prompt] -> score dict from calculate_rouge_scores
        results = {}

        for model_name in model_names:
            print(f"\nProcessing model: {model_name}")
            # Load model and move it to the selected device (GPU if available)
            model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
            model = model.to(device)
            tokenizer = AutoTokenizer.from_pretrained(model_name)

            # Print memory usage after loading the model
            if device.type == 'cuda':
                print(f"  GPU memory allocated: {torch.cuda.memory_allocated() / (1024**2):.2f} MB")
                print(f"  GPU memory reserved: {torch.cuda.memory_reserved() / (1024**2):.2f} MB")

            model_results = {}

            for prompt in prompt_variations:
                print(f"  Using prompt: \"{prompt}\"")

                # Step 3: Generate titles with this prompt
                generated_titles = generate_titles(articles, model, tokenizer, prompt, device)

                # Step 4: Calculate ROUGE scores for this prompt
                scores = calculate_rouge_scores(reference_titles, generated_titles)
                model_results[prompt] = scores

                _print_rouge_scores(scores)

            results[model_name] = model_results

            # Drop the model reference and release cached GPU memory before
            # the next (larger) model is loaded
            if device.type == 'cuda':
                del model
                torch.cuda.empty_cache()
                print(f"  GPU memory freed up")

        # Print final summary of ROUGE scores for each prompt
        print("\n===== FINAL ROUGE SCORES BY PROMPT =====")

        model_count = len(model_names)
        for prompt in prompt_variations:
            print(f"\nPrompt: \"{prompt}\"")

            for name in model_names:
                print(f"  {name}:")
                _print_rouge_scores(results[name][prompt])

            # Mean of each metric across models for this prompt
            averaged = {
                key: sum(results[name][prompt][key] for name in model_names) / model_count
                for key in metric_keys
            }
            print(f"  AVERAGE ACROSS MODELS:")
            _print_rouge_scores(averaged)

    except Exception as e:
        # Keep the run from crashing the cell, but do not hide where the
        # failure happened: report the message and the full traceback.
        print(f"Error: {str(e)}")
        traceback.print_exc()

# Entry point: run the full evaluation pipeline only when this code is
# executed as the main module, not when it is imported from elsewhere.
if __name__ == "__main__":
    main()

Using device: cuda
GPU: Tesla T4
CUDA Version: 12.4
Available GPU memory: 14.74 GB
Converted /content/drive/MyDrive/test.csv to /content/test.json
Loaded 100 articles with reference titles

Processing model: google/flan-t5-base
  GPU memory allocated: 947.43 MB
  GPU memory reserved: 1014.00 MB
  Using prompt: "Give a title for the following article: "
  Processed 10/100 articles...
  Processed 20/100 articles...
  Processed 30/100 articles...
  Processed 40/100 articles...
  Processed 50/100 articles...
  Processed 60/100 articles...
  Processed 70/100 articles...
  Processed 80/100 articles...
  Processed 90/100 articles...
  Generation completed in 25.35 seconds for 100 articles
  Average time per article: 0.25 seconds
    ROUGE-1: 0.8556
    ROUGE-2: 0.6665
    ROUGE-L: 0.8556
    Average: 0.7926
  Using prompt: "Based on this given text create an appropriate title : "
  Processed 10/100 articles...
  Processed 20/100 articles...
  Processed 30/100 articles...
  Processed 40/100 ar

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so files
# stored on Drive can be read through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
