In [1]:
!pip install transformers datasets evaluate rouge_score --quiet

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cesium 0.12.4 requires numpy<3.0,>=2.0, but you have numpy 1.26.4 which is incompatible.
bigframes 1.42.0 requires rich<14,>=12.4.4, but you have rich 14.0.0 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.9.0.13 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cudnn-cu12==9.1.0.70; platform_system == "Linux" and platfor

In [2]:
import os
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from torch.optim import AdamW
from transformers import T5ForConditionalGeneration, T5TokenizerFast, get_linear_schedule_with_warmup
import pandas as pd
from tqdm.auto import tqdm

# Configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MODEL_NAME = "t5-base"
MAX_INPUT_LENGTH = 512
MAX_TARGET_LENGTH = 150
TRAIN_BATCH_SIZE = 4
EVAL_BATCH_SIZE = 4
NUM_EPOCHS = 3
LEARNING_RATE = 3e-5
WEIGHT_DECAY = 0.01
GRADIENT_ACCUMULATION_STEPS = 4
WARMUP_STEPS = 500
OUTPUT_DIR = "./t5_summarizer"

data_path = "/kaggle/input/ccdv-arxiv-summarization-dataset/train.csv"

os.makedirs(OUTPUT_DIR, exist_ok=True)

tokenizer = T5TokenizerFast.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)

class ArxivCSV(Dataset):
    def __init__(self, df: pd.DataFrame):
        self.articles = df["article"].tolist()
        self.abstracts = df["abstract"].tolist()

    def __len__(self):
        return len(self.articles)

    def __getitem__(self, idx):
        source = str(self.articles[idx])
        target = str(self.abstracts[idx])
        inputs = tokenizer(
            source,
            max_length=MAX_INPUT_LENGTH,
            truncation=True,
            padding="max_length"
        )
        labels = tokenizer(
            target,
            max_length=MAX_TARGET_LENGTH,
            truncation=True,
            padding="max_length"
        )
        input_ids = torch.tensor(inputs.input_ids)
        attention_mask = torch.tensor(inputs.attention_mask)
        labels_ids = torch.tensor(labels.input_ids)
        labels_ids[labels_ids == tokenizer.pad_token_id] = -100
        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels_ids}

# Read CSV and split
df = pd.read_csv(data_path)
train_df = df.iloc[:20000].reset_index(drop=True)
val_df = df.iloc[20000:25000].reset_index(drop=True)

dataset_train = ArxivCSV(train_df)
dataset_val = ArxivCSV(val_df)

train_loader = DataLoader(dataset_train, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dataset_val, batch_size=EVAL_BATCH_SIZE)

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
total_steps = len(train_loader) // GRADIENT_ACCUMULATION_STEPS * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=total_steps)
scaler = torch.cuda.amp.GradScaler()

def evaluate():
    model.eval()
    total_loss = 0.0
    eval_loader = tqdm(val_loader, desc="Validating", leave=False)
    with torch.no_grad():
        for batch in eval_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            total_loss += outputs.loss.item()
    avg_loss = total_loss / len(val_loader)
    print(f"Validation Loss: {avg_loss:.4f}")
    return avg_loss

def train():
    model.train()
    for epoch in range(1, NUM_EPOCHS + 1):
        running_loss = 0.0
        epoch_loader = tqdm(train_loader, desc=f"Epoch {epoch}/{NUM_EPOCHS}", leave=False)
        for step, batch in enumerate(epoch_loader, start=1):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            with torch.cuda.amp.autocast():
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss / GRADIENT_ACCUMULATION_STEPS

            scaler.scale(loss).backward()
            running_loss += loss.item()

            if step % GRADIENT_ACCUMULATION_STEPS == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()

                epoch_loader.set_postfix({"loss": f"{running_loss/GRADIENT_ACCUMULATION_STEPS:.4f}"})
                running_loss = 0.0

        # Save checkpoint
        ckpt_path = os.path.join(OUTPUT_DIR, f"checkpoint_epoch_{epoch}.pt")
        torch.save(model.state_dict(), ckpt_path)
        print(f"Saved checkpoint: {ckpt_path}")

        # Run validation after each epoch
        evaluate()

if __name__ == "__main__":
    train()


2025-07-14 15:47:17.839343: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752508038.041844      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752508038.103341      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  scaler = torch.cuda.amp.GradScaler()


Epoch 1/3:   0%|          | 0/5000 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast():
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Saved checkpoint: ./t5_summarizer/checkpoint_epoch_1.pt


Validating:   0%|          | 0/1250 [00:00<?, ?it/s]

Validation Loss: 2.6422


Epoch 2/3:   0%|          | 0/5000 [00:00<?, ?it/s]

Saved checkpoint: ./t5_summarizer/checkpoint_epoch_2.pt


Validating:   0%|          | 0/1250 [00:00<?, ?it/s]

Validation Loss: 2.5500


Epoch 3/3:   0%|          | 0/5000 [00:00<?, ?it/s]

Saved checkpoint: ./t5_summarizer/checkpoint_epoch_3.pt


Validating:   0%|          | 0/1250 [00:00<?, ?it/s]

Validation Loss: 2.5326


In [None]:
import os
import torch
from transformers import T5ForConditionalGeneration, T5TokenizerFast
from evaluate import load as load_metric
import pandas as pd
from tqdm.auto import tqdm

# Config
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MODEL_NAME = "t5-base"
CHECKPOINT_PATH = "./t5_summarizer/checkpoint_epoch_3.pt"
DATA_PATH = "/kaggle/input/ccdv-arxiv-summarization-dataset/train.csv"
MAX_INPUT_LENGTH = 512
MAX_TARGET_LENGTH = 150
BATCH_SIZE = 4

# Load tokenizer and model
print("Loading tokenizer and model...")
tokenizer = T5TokenizerFast.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
model.load_state_dict(torch.load(CHECKPOINT_PATH, map_location=DEVICE))
model.to(DEVICE)
model.eval()

# Load validation data
print("Loading validation data...")
df = pd.read_csv(DATA_PATH)
val_df = df.iloc[20000:20500].reset_index(drop=True)

class ArxivValDataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.articles = df["article"].tolist()
        self.abstracts = df["abstract"].tolist()

    def __len__(self):
        return len(self.articles)

    def __getitem__(self, idx):
        source = str(self.articles[idx])
        target = str(self.abstracts[idx])
        inputs = tokenizer(
            source,
            max_length=MAX_INPUT_LENGTH,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )
        return {
            "input_ids": inputs.input_ids.squeeze(0),
            "attention_mask": inputs.attention_mask.squeeze(0),
            "target": target
        }

val_dataset = ArxivValDataset(val_df)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH_SIZE)

# Load ROUGE
print("Loading ROUGE metric...")
rouge = load_metric("rouge")


# Generate summaries and compute ROUGE
print("Evaluating...")
preds = []
refs = []

for batch in tqdm(val_loader):
    input_ids = batch["input_ids"].to(DEVICE)
    attention_mask = batch["attention_mask"].to(DEVICE)

    with torch.no_grad():
        summaries = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=MAX_TARGET_LENGTH,
            num_beams=4,
            early_stopping=True
        )

    decoded_preds = tokenizer.batch_decode(summaries, skip_special_tokens=True)
    decoded_refs = [ref for ref in batch["target"]]

    preds.extend(decoded_preds)
    refs.extend(decoded_refs)

# Compute ROUGE
print("\nComputing ROUGE scores...")
rouge_output = rouge.compute(predictions=preds, references=refs, use_stemmer=True)

for key in ["rouge1", "rouge2", "rougeL"]:
    score = rouge_output[key]
    print(f"{key.upper()} F1: {score:.4f}")


# Sample Input and Summary
print("\n🔍 Sample Result:")
sample_input = val_df.iloc[0]["article"][:1000]  # Shortened for readability
input_tokens = tokenizer.encode(sample_input, return_tensors="pt", max_length=MAX_INPUT_LENGTH, truncation=True).to(DEVICE)
summary_ids = model.generate(input_tokens, max_length=MAX_TARGET_LENGTH)
sample_output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print("\n📝 Input (truncated):\n", sample_input[:500], "...\n")
print("📌 Model Summary:\n", sample_output)
print("🎯 Reference Summary:\n", val_df.iloc[0]["abstract"])


In [3]:
import torch
from transformers import T5ForConditionalGeneration, T5TokenizerFast

# === Config ===
MODEL_NAME = "t5-base"
CHECKPOINT_PATH = "./t5_summarizer/checkpoint_epoch_3.pt"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# === Load tokenizer and model ===
tokenizer = T5TokenizerFast.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
model.load_state_dict(torch.load(CHECKPOINT_PATH, map_location=DEVICE))
model.to(DEVICE)
model.eval()

# === Input Text (sample) ===
input_text = """
Large language models (LLMs) have demonstrated impressive capabilities in natural language understanding and generation. However, their deployment in real-world applications is limited by issues such as high inference latency, large memory requirements, and the risk of generating factually incorrect or toxic content. In this work, we introduce a two-stage approach combining knowledge distillation with safety-aligned reinforcement learning to produce compact, safe, and efficient LLMs. We evaluate our models on a suite of benchmark tasks including summarization, question answering, and factual correctness, and show that our method significantly reduces model size and latency while maintaining competitive accuracy and safety metrics.
"""

# === Add 'summarize:' prefix ===
prefixed_text = "summarize: " + input_text.strip()

# === Tokenize input ===
inputs = tokenizer(
    prefixed_text,
    return_tensors="pt",
    truncation=True,
    padding="max_length",
    max_length=512
).to(DEVICE)

# === Generate Summary ===
summary_ids = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=256,                   # Better than 150 for full thought
    num_beams=4,                      # Beam search for quality
    repetition_penalty=2.0,           # Avoid repeated phrases
    no_repeat_ngram_size=4,           # Block repeated n-grams
    early_stopping=True
)

# === Decode and Post-process Summary ===
summary = tokenizer.decode(
    summary_ids[0],
    skip_special_tokens=True,
    clean_up_tokenization_spaces=True
)

# Optional: Capitalize the first letter
summary = summary[0].upper() + summary[1:]

print("Summary:\n", summary)


Summary:
 Large language models (LLMs) have demonstrated impressive capabilities in natural language understanding. however, their deployment in real-world applications is limited by issues such as high inference latency and large memory requirements. we introduce a two-stage approach combining knowledge distillation with safety-aligned reinforcement learning to produce compact, safe, and efficient LLMs.


In [4]:
from transformers import T5ForConditionalGeneration, T5TokenizerFast

# Load model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("t5-base")
model.load_state_dict(torch.load("./t5_summarizer/checkpoint_epoch_3.pt"))

tokenizer = T5TokenizerFast.from_pretrained("t5-base")

# Save both into a folder
model.save_pretrained("my_local_t5_model")
tokenizer.save_pretrained("my_local_t5_model")


('my_local_t5_model/tokenizer_config.json',
 'my_local_t5_model/special_tokens_map.json',
 'my_local_t5_model/spiece.model',
 'my_local_t5_model/added_tokens.json',
 'my_local_t5_model/tokenizer.json')

In [None]:
import shutil
shutil.make_archive("my_local_t5_model", 'zip', "my_local_t5_model")


In [5]:
# Move the zip file to the output directory so it appears in the Files tab
!cp my_local_t5_model.zip /kaggle/working/
from transformers import T5ForConditionalGeneration, T5TokenizerFast
import shutil

# Save model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("t5-base")
model.load_state_dict(torch.load("./t5_summarizer/checkpoint_epoch_3.pt"))
tokenizer = T5TokenizerFast.from_pretrained("t5-base")
model.save_pretrained("my_local_t5_model")
tokenizer.save_pretrained("my_local_t5_model")

# Zip and move to working
shutil.make_archive("my_local_t5_model", 'zip', "my_local_t5_model")
!cp my_local_t5_model.zip /kaggle/working/


cp: cannot stat 'my_local_t5_model.zip': No such file or directory


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


cp: 'my_local_t5_model.zip' and '/kaggle/working/my_local_t5_model.zip' are the same file


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
import os
import json
from transformers import T5ForConditionalGeneration, T5TokenizerFast

model = T5ForConditionalGeneration.from_pretrained("t5-base")
model.load_state_dict(torch.load("./t5_summarizer/checkpoint_epoch_3.pt"))
tokenizer = T5TokenizerFast.from_pretrained("t5-base")

SAVE_DIR = "/kaggle/working/my_local_t5_model"
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

os.environ["KAGGLE_USERNAME"] = "Your_kaggle_username_here"  # Replace with your actual Kaggle username
os.environ["KAGGLE_KEY"] = "Your_kaggle_key_here"  # Replace with your actual Kaggle key download from kaggle account

dataset_metadata = {
    "title": "My T5 Summarizer Model",
    "id": "kanishk2223/my-t5-model", 
    "licenses": [{"name": "CC0-1.0"}]
}

with open("/kaggle/working/dataset-metadata.json", "w") as f:
    json.dump(dataset_metadata, f)

!kaggle datasets create -p /kaggle/working/ -u


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Skipping folder: my_local_t5_model; use '--dir-mode' to upload folders
Skipping folder: t5_summarizer; use '--dir-mode' to upload folders
Skipping folder: .virtual_documents; use '--dir-mode' to upload folders
Starting upload for file my_local_t5_model.zip
100%|█████████████████████████████████████████| 782M/782M [00:04<00:00, 167MB/s]
Upload successful: my_local_t5_model.zip (782MB)
Your public Dataset is being created. Please check progress at https://www.kaggle.com/datasets/kanishk2223/my-t5-model
