In [1]:
!pip install -q datasets==3.2.0 bitsandbytes==0.45.1 accelerate==1.2.1 evaluate==0.4.3 \
    transformers==4.47.1 torch==2.5.1+cu124 numpy==1.26.4 peft==0.14.0 rouge_score==0.1.2

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.7/69.7 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m73.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m57.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m41.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00

In [2]:
import os
import torch
import numpy as np
import evaluate
from datasets import load_dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq,
    BitsAndBytesConfig
)
from peft import (
    LoraConfig,
    get_peft_model,
    TaskType,
    prepare_model_for_kbit_training
)

In [3]:
# Set random seed for reproducibility
torch.manual_seed(42)
np.random.seed(42)

In [4]:
# Configurations
MODEL_NAME = "google/flan-t5-small"  # Using flan-t5 which is better for summarization
SUMMARY_PREFIX = "summarize: "
MAX_INPUT_LENGTH = 256  # Increased for summarization
MAX_TARGET_LENGTH = 128
BATCH_SIZE = 8  # Reduced batch size due to longer sequences
NUM_TRAIN_EPOCHS = 3
LEARNING_RATE = 2e-4

In [5]:
# Load dataset
dataset = load_dataset("billsum", split="train[:1000]")

dataset = dataset.train_test_split(test_size=0.1, seed=42)

dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.27k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/91.8M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/15.8M [00:00<?, ?B/s]

ca_test-00000-of-00001.parquet:   0%|          | 0.00/6.12M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/18949 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3269 [00:00<?, ? examples/s]

Generating ca_test split:   0%|          | 0/1237 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 900
    })
    test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 100
    })
})

In [6]:
# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [7]:
def preprocess_function(examples):
    """
    Preprocess the summarization dataset by tokenizing inputs and targets.

    Args:
        examples (dict): Batch of examples containing text and summaries

    Returns:
        dict: Tokenized model inputs
    """
    inputs = [SUMMARY_PREFIX + text for text in examples["text"]]

    model_inputs = tokenizer(
        inputs,
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
    )

    labels = tokenizer(
        text_target=examples["summary"],
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
    )

    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

In [8]:
# Tokenize dataset
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names
)

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [9]:
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=MODEL_NAME,
    padding=True
)

In [10]:
# Load evaluation metric
rouge_metric = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """
    Compute ROUGE scores for summarization evaluation.

    Args:
        eval_pred (EvalPrediction): Model predictions and labels

    Returns:
        dict: Evaluation metrics including ROUGE scores
    """
    predictions, labels = eval_pred
    logits = predictions[0]

    # Decode predictions into readable text
    decoded_preds = tokenizer.batch_decode(
        logits.argmax(-1),
        skip_special_tokens=True
    )

    # Replace -100 in labels with pad token
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(
        labels,
        skip_special_tokens=True
    )

    # Compute ROUGE scores
    result = rouge_metric.compute(
        predictions = decoded_preds,
        references = decoded_labels,
        use_stemmer = True # map each token to its root form (stem)
    )

    # Add generation length metric
    prediction_lens = [
        np.count_nonzero(pred) for pred in logits.argmax(-1)
    ]

    result["gen_len"] = np.mean(prediction_lens)

    # Round all metrics to 4 decimal places
    result = {k: round(v * 100, 4) for k, v in result.items()}

    return result

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [11]:
# Quantization configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

In [12]:
# Load base model with quantization
base_model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto"
)

# Prepare model for k-bit training
model = prepare_model_for_kbit_training(base_model)

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [13]:
# Find target modules for LoRA
target_modules = set()
for name, module in base_model.named_modules():
    if isinstance(module, torch.nn.Linear):
        module_name = name.split('.')[-1]
        target_modules.add(module_name)
target_modules = list(target_modules)
target_modules

['k', 'v', 'q', 'wi_1', 'wi_0', 'lm_head', 'o', 'wo']

In [14]:
# Configure LoRA
lora_config = LoraConfig(
    r=8,  # Increased rank for summarization
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,
    target_modules=["q", "v"]  # Target attention layers
)

# Get PEFT model
peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()

trainable params: 344,064 || all params: 77,305,216 || trainable%: 0.4451


In [15]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./summarization_results",
    overwrite_output_dir=True,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",  # Using ROUGE-1 as primary metric
    greater_is_better=True, # when comparing the best models, True = higher metric value is better (e.g., accuracy). False = lower is better (e.g., loss).
    bf16=True,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True, # saves memory by recomputing some activations during the backward pass instead of storing all.
)




In [16]:
# Initialize Trainer
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    processing_class=tokenizer,
)

# Train the model
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mabdulrahman-ahmed20072[0m ([33mabdulrahman-ahmed20072-none[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,2.688623,54.9372,27.1955,46.0335,46.044,12800.0
2,No log,2.565884,55.3349,28.3454,46.8051,46.7503,12800.0


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


TrainOutput(global_step=84, training_loss=12.2684326171875, metrics={'train_runtime': 158.1347, 'train_samples_per_second': 17.074, 'train_steps_per_second': 0.531, 'total_flos': 246022869614592.0, 'train_loss': 12.2684326171875, 'epoch': 2.920353982300885})

In [17]:
# Inference function
def generate_summary(text, max_length=MAX_TARGET_LENGTH):
    """
    Generate a summary for the input text.

    Args:
        text (str): Input text to summarize
        max_length (int): Maximum length of generated summary

    Returns:
        str: Generated summary
    """
    inputs = tokenizer(
        SUMMARY_PREFIX + text,
        return_tensors="pt",
        max_length=MAX_INPUT_LENGTH,
        truncation=True
    ).to(peft_model.device)

    peft_model.eval()

    with torch.no_grad():
        outputs = peft_model.generate(
            **inputs,
            max_length=max_length,
            num_beams=4,  # Using beam search for better summaries
            length_penalty=2.0, # Penalizes shorter sequences if > 1.0, shorter if < 1.0. Encourages longer summaries here.
            early_stopping=True, # Stops beam search early if a likely best sequence is found.
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [18]:
# Example usage
example_text = """
The Inflation Reduction Act lowers prescription drug costs, health care costs,
and energy costs. It's the most aggressive action on tackling the climate crisis
in American history, which will lift up American workers and create good-paying,
union jobs across the country.
"""

print(f"Original text:\n{example_text}\n")
print(f"Generated summary:\n{generate_summary(example_text)}")

Original text:

The Inflation Reduction Act lowers prescription drug costs, health care costs, 
and energy costs. It's the most aggressive action on tackling the climate crisis 
in American history, which will lift up American workers and create good-paying, 
union jobs across the country.


Generated summary:
The Inflation Reduction Act will reduce prescription drug costs, health care costs, and energy costs, and create good-paying, union jobs across the country.
