In [None]:
# --- Install and Fixes ---
!pip install numpy==1.26.4 --quiet
!pip install -q --upgrade transformers datasets accelerate evaluate rouge_score
import os
os.kill(os.getpid(), 9)  # Restart runtime to apply numpy fix

In [None]:
# --- Imports ---
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from torch.optim import AdamW
import pandas as pd
import json
from accelerate import Accelerator
import numpy as np

In [None]:
# --- Load Raw Data ---
# Upper bound on the number of lines read from each file.
MAX_EXAMPLES = 500

# Read at most MAX_EXAMPLES lines per file, stripped of surrounding whitespace.
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale.
# NOTE(review): assumes both files are UTF-8 encoded — confirm.
with open("train.src.cleaned", "r", encoding="utf-8") as f:
    docs = [line.strip() for _, line in zip(range(MAX_EXAMPLES), f)]

with open("train.tgt", "r", encoding="utf-8") as f:
    summaries = [line.strip() for _, line in zip(range(MAX_EXAMPLES), f)]

# Report how many lines were actually read from each file.
print("Docs:", len(docs))
print("Summaries:", len(summaries))

In [None]:
# --- Clean + Create DataFrame ---
# Trim both lists to the shorter length so every document has a summary
# at the same position.
min_len = min(len(docs), len(summaries))
docs, summaries = docs[:min_len], summaries[:min_len]

# One row per (document, summary) pair, with surrounding whitespace removed.
df = pd.DataFrame(
    {
        "document": list(map(str.strip, docs)),
        "summary": list(map(str.strip, summaries)),
    }
)

# Preview the first rows.
df.head()

In [None]:
# --- Convert to Dataset ---
# Build the Hugging Face Dataset from the DataFrame columns only.
# preserve_index=False keeps the pandas index out of the dataset, so a
# non-default index can never show up as an extra column.
dataset = Dataset.from_pandas(df, preserve_index=False)

In [None]:
# --- Tokenizer ---
# AutoTokenizer is already imported in the notebook's imports cell, so it is
# not re-imported here.
# Name of the pretrained checkpoint whose tokenizer is loaded.
MODEL_CHECKPOINT = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

In [None]:
# --- Preprocessing Function ---
def preprocess_function(
    examples,
    max_input_length=512,
    max_target_length=128,
    ignore_pad_token_for_loss=False,
):
    """Tokenize a batch of documents and summaries for seq2seq training.

    Uses the module-level ``tokenizer``.

    Args:
        examples: Batch mapping with a "document" and a "summary" entry, each
            an iterable of strings.
        max_input_length: Length the prefixed documents are truncated and
            padded to.
        max_target_length: Length the summaries are truncated and padded to.
        ignore_pad_token_for_loss: When True, padding positions in the labels
            are replaced with -100. Defaults to False, which leaves the
            tokenizer's pad ids in the labels.

    Returns:
        The tokenizer output for the documents with an added "labels" entry
        holding the summary token ids.
    """
    # Prepend the task prefix to every document.
    inputs = ["summarize: " + doc for doc in examples["document"]]
    model_inputs = tokenizer(
        inputs, max_length=max_input_length, truncation=True, padding="max_length"
    )

    # `text_target=` is the supported way to tokenize targets; it replaces the
    # deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples["summary"],
        max_length=max_target_length,
        truncation=True,
        padding="max_length",
    )
    label_ids = labels["input_ids"]

    if ignore_pad_token_for_loss:
        # -100 is the index that PyTorch's cross-entropy loss skips by default.
        pad_id = tokenizer.pad_token_id
        label_ids = [
            [-100 if token == pad_id else token for token in seq]
            for seq in label_ids
        ]

    model_inputs["labels"] = label_ids
    return model_inputs

In [None]:
# --- Apply Preprocessing ---
# Tokenize the dataset, handing preprocess_function batches of rows
# rather than single examples.
tokenized_dataset = dataset.map(
    function=preprocess_function,
    batched=True,
)

In [None]:
# --- Save Sample Tokenized Subset ---
# Maximum number of rows kept in the saved sample.
SAMPLE_SIZE = 500

# Clamp to the dataset length so the selection stays in range when fewer than
# SAMPLE_SIZE rows were tokenized.
sample_size = min(SAMPLE_SIZE, len(tokenized_dataset))
small_dataset = tokenized_dataset.select(range(sample_size))

# Return the listed columns as torch tensors when the dataset is indexed.
small_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
small_dataset.save_to_disk("tokenized_sample_dataset")