In [1]:
!pip install transformers datasets peft accelerate evaluate rouge-score

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda

In [2]:
from datasets import load_dataset

# Load reddit_tifu and take 1000 samples
dataset = load_dataset("ctr4si/reddit_tifu", "long")["train"].shuffle(seed=42).select(range(1000))

# Split into train (90%) and eval (10%)
dataset = dataset.train_test_split(test_size=0.2)
train_data = dataset["train"]
eval_data = dataset["test"]

print(f"Train samples: {len(train_data)}, Eval samples: {len(eval_data)}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

reddit_tifu.py:   0%|          | 0.00/4.55k [00:00<?, ?B/s]

The repository for ctr4si/reddit_tifu contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/ctr4si/reddit_tifu.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


tifu_all_tokenized_and_filtered.json.gz:   0%|          | 0.00/142M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/42139 [00:00<?, ? examples/s]

Train samples: 800, Eval samples: 200


In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "google/pegasus-xsum"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.52M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/259 [00:00<?, ?B/s]

In [4]:
print(model)

PegasusForConditionalGeneration(
  (model): PegasusModel(
    (shared): Embedding(96103, 1024, padding_idx=0)
    (encoder): PegasusEncoder(
      (embed_tokens): Embedding(96103, 1024, padding_idx=0)
      (embed_positions): PegasusSinusoidalPositionalEmbedding(512, 1024)
      (layers): ModuleList(
        (0-15): 16 x PegasusEncoderLayer(
          (self_attn): PegasusAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_nor

In [5]:
from peft import LoraConfig, get_peft_model, TaskType

peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    r=8,                  # Rank
    lora_alpha=32,        # Scaling factor
    lora_dropout=0.1,     # Dropout
    target_modules=["q_proj", "v_proj"]  # Target Pegasus layers
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()  # Check: ~1.3M trainable params

trainable params: 1,572,864 || all params: 571,321,344 || trainable%: 0.2753


In [6]:
def preprocess_function(examples):
    # Tokenize posts (input) and TL;DRs (labels)
    inputs = tokenizer(
        examples["documents"],
        max_length=512,
        truncation=True,
        padding="max_length"
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["tldr"],
            max_length=128,
            truncation=True,
            padding="max_length"
        )
    inputs["labels"] = labels["input_ids"]
    return inputs

# Apply preprocessing
tokenized_train = train_data.map(preprocess_function, batched=True)
tokenized_eval = eval_data.map(preprocess_function, batched=True)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]



Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [15]:
from transformers import (
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    TrainerCallback
)
import torch
import numpy as np
import evaluate
import os

# Enable expandable memory segments to reduce fragmentation
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Load ROUGE metric
rouge = evaluate.load("rouge")

# Define metric computation for ROUGE
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    return rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True
    )

# Aggressively clear GPU cache before training
torch.cuda.empty_cache()

# Reduce memory footprint during evaluation
class MemorySaverCallback(TrainerCallback):
    def on_evaluate(self, args, state, control, **kwargs):
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

# Use dynamic padding
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Reduce evaluation set size to avoid memory spikes
tokenized_eval = tokenized_eval.select(range(50))  # Use a smaller sample if needed

# Training arguments with memory optimizations
training_args = Seq2SeqTrainingArguments(
    output_dir="./pegasus-reddit-lora",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=1,
    num_train_epochs=5,
    logging_dir="./logs",
    save_strategy="no",                 # Disable model saving during training
    eval_strategy="no",                 # Skip evaluation during training
    report_to="none",
    fp16=True,                          # Mixed precision
    gradient_accumulation_steps=4,      # Simulate larger batch size
    generation_max_length=64,           # Limit generation length to reduce memory
    predict_with_generate=True
)

# Trainer setup
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,  # Still required for decoding
    data_collator=data_collator,
    callbacks=[MemorySaverCallback()],
    compute_metrics=compute_metrics
)

# Start training
trainer.train()
# Saving the model after training so that i can use it later
trainer.model.save_pretrained("./pegasus-reddit-lora")
tokenizer.save_pretrained("./pegasus-reddit-lora")
# Evaluate after training
evaluation_results = trainer.evaluate()
print("Evaluation results:", evaluation_results)


  trainer = Seq2SeqTrainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
500,7.6984


Evaluation results: {'eval_loss': 6.95993709564209, 'eval_rouge1': 0.2560719034817054, 'eval_rouge2': 0.0788356115856148, 'eval_rougeL': 0.2015019149515525, 'eval_rougeLsum': 0.20188410831159265, 'eval_runtime': 43.1892, 'eval_samples_per_second': 1.158, 'eval_steps_per_second': 1.158, 'epoch': 5.0}


In [16]:
# 1. Preproces the base model first

from datasets import load_dataset

# Load 1000 samples from reddit_tifu (long posts)
dataset = load_dataset("ctr4si/reddit_tifu", "long")["train"].shuffle(seed=42).select(range(1000))

# Split into train/validation/test (e.g., 80/10/10)
split = dataset.train_test_split(test_size=0.2)
val_test = split["test"].train_test_split(test_size=0.5)
train_data = split["train"]
val_data = val_test["train"]
test_data = val_test["test"]



In [17]:
# Evaluate the base model
from transformers import pipeline
from rouge_score import rouge_scorer
import warnings
warnings.filterwarnings("ignore")

# Initialize base model
base_model = pipeline("summarization", model="facebook/bart-large-cnn", device=0)  # Use GPU if available

# Evaluate on test set
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'])
base_rouge_scores = []

for example in test_data:
    base_summary = base_model(example["tldr"], max_length=130)[0]["summary_text"]
    rouge_score = scorer.score(example["tldr"], base_summary)  # Compare with gold "tldr"
    base_rouge_scores.append(rouge_score)

# Calculate averages
avg_base_rouge1 = sum(s["rouge1"].fmeasure for s in base_rouge_scores) / len(base_rouge_scores)
avg_base_rougeL = sum(s["rougeL"].fmeasure for s in base_rouge_scores) / len(base_rouge_scores)
print(f"Base Model ROUGE-1: {avg_base_rouge1:.3f}, ROUGE-L: {avg_base_rougeL:.3f}")

Device set to use cuda:0
Your max_length is set to 130, but your input_length is only 24. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=12)
Your max_length is set to 130, but your input_length is only 25. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=12)
Your max_length is set to 130, but your input_length is only 44. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=22)
Your max_length is set to 130, but your input_length is only 47. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('

Base Model ROUGE-1: 0.492, ROUGE-L: 0.471


In [18]:


from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load fine-tuned model
model = AutoModelForSeq2SeqLM.from_pretrained("./pegasus-reddit-lora").to("cuda")
tokenizer = AutoTokenizer.from_pretrained("./pegasus-reddit-lora")

# Compute ROUGE scores
finetuned_rouge_scores = []
for example in test_data:
    inputs = tokenizer(example["tldr"], return_tensors="pt", truncation=True).to("cuda")
    outputs = model.generate(**inputs, max_length=130)
    finetuned_summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    rouge_score = scorer.score(example["tldr"], finetuned_summary)
    finetuned_rouge_scores.append(rouge_score)

# Compare with base model
avg_finetuned_rouge1 = sum(s["rouge1"].fmeasure for s in finetuned_rouge_scores) / len(finetuned_rouge_scores)
print(f"Fine-tuned ROUGE-1: {avg_finetuned_rouge1:.3f} (vs Base: {avg_base_rouge1:.3f})")


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fine-tuned ROUGE-1: 0.574 (vs Base: 0.492)
