In [None]:
# Fix numpy + transformers crash in Colab (after runtime reset)
# 🛠 Downgrade NumPy to avoid 2.0 issues
!pip install numpy==1.26.4 --quiet
!pip install -q --upgrade transformers datasets accelerate evaluate rouge_score

# Restart runtime automatically to apply numpy fix
import os
os.kill(os.getpid(), 9)

In [None]:
# Install libraries (only run once)
!pip install --no-cache-dir transformers datasets accelerate



In [None]:
# --- Imports ---
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from torch.optim import AdamW
import pandas as pd
import json
from torch.utils.data import DataLoader
from accelerate import Accelerator
import numpy as np

In [None]:
from itertools import islice

# Number of (document, summary) line pairs to read from the head of each file.
MAX_EXAMPLES = 500

# The text contains non-ASCII punctuation (e.g. "–" and "—"), so the encoding is stated
# explicitly instead of relying on the platform default.
with open("train.src.cleaned", "r", encoding="utf-8") as f:
    # islice stops after MAX_EXAMPLES lines without reading the rest of the file.
    docs = [line.strip() for line in islice(f, MAX_EXAMPLES)]

with open("train.tgt", "r", encoding="utf-8") as f:
    summaries = [line.strip() for line in islice(f, MAX_EXAMPLES)]

In [None]:
# Report how many lines were read from each file.
for label, items in (("Docs:", docs), ("Summaries:", summaries)):
    print(label, len(items))

Docs: 500
Summaries: 500


In [None]:
# Pair each document with its summary, truncating to the shorter of the two lists.
min_len = min(len(docs), len(summaries))

df = pd.DataFrame(
    {
        "document": list(map(str.strip, docs[:min_len])),
        "summary": list(map(str.strip, summaries[:min_len])),
    }
)

In [None]:
# Cut both lists down to the length of the shorter one so rows stay aligned.
min_len = min(len(docs), len(summaries))
docs, summaries = docs[:min_len], summaries[:min_len]

# Strip surrounding whitespace and assemble the two-column DataFrame.
df = pd.DataFrame(
    {
        "document": list(map(str.strip, docs)),
        "summary": list(map(str.strip, summaries)),
    }
)

# Preview the first rows.
df.head()

Unnamed: 0,document,summary
0,National Archives NEWLINE_CHAR NEWLINE_CHAR Ye...,– The unemployment rate dropped to 8.2% last m...
1,LOS ANGELES (AP) — In her first interview sinc...,"– Shelly Sterling plans ""eventually"" to divorc..."
2,"GAITHERSBURG, Md. (AP) — A small, private jet ...",– A twin-engine Embraer jet that the FAA descr...
3,Tucker Carlson Exposes His Own Sexism on Twitt...,– Tucker Carlson is in deep doodoo with conser...
4,A man accused of removing another man's testic...,– What are the three most horrifying words in ...


In [None]:
from datasets import Dataset

# Convert the pandas DataFrame (columns "document" and "summary") into a `datasets.Dataset`.
dataset = Dataset.from_pandas(df)

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer for the "t5-small" checkpoint; it is used for both the
# documents and the summaries in the preprocessing step.
tokenizer = AutoTokenizer.from_pretrained("t5-small")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
def preprocess_function(examples, max_input_length=512, max_target_length=128):
    """Tokenize a batch of documents and summaries for seq2seq training.

    Args:
        examples: Batch mapping with "document" and "summary" keys, each a list of strings.
        max_input_length: Documents are truncated/padded to this many tokens.
        max_target_length: Summaries are truncated/padded to this many tokens.

    Returns:
        The tokenizer output for the prefixed documents, with an extra "labels"
        entry holding the token ids of the summaries.
    """
    # Prepend the task prefix to every document before tokenizing.
    inputs = ["summarize: " + doc for doc in examples["document"]]
    model_inputs = tokenizer(
        inputs, max_length=max_input_length, truncation=True, padding="max_length"
    )

    # `text_target=` replaces the deprecated `tokenizer.as_target_tokenizer()` context
    # manager, which newer transformers releases no longer provide.
    labels = tokenizer(
        text_target=examples["summary"],
        max_length=max_target_length,
        truncation=True,
        padding="max_length",
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize every example, processing them in batches.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# --- Keep only the first 100 examples for faster training ---
small_dataset = tokenized_dataset.select(range(100))
# Return just the model inputs, as torch tensors, when the dataset is indexed.
small_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)

# --- DataLoader over the reduced dataset ---
train_loader = DataLoader(small_dataset, shuffle=True, batch_size=4)


Map:   0%|          | 0/500 [00:00<?, ? examples/s]



In [None]:
# Persist the tokenized 100-example subset to the "tokenized_sample_dataset" directory
# so it can be reloaded without re-running tokenization.
small_dataset.save_to_disk("tokenized_sample_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
from datasets import load_from_disk
# Reload the subset saved under "tokenized_sample_dataset" and rebind `small_dataset` to it.
# NOTE(review): `train_loader` was built from the earlier `small_dataset` object, so this
# rebinding does not change what the DataLoader iterates over.
small_dataset = load_from_disk("tokenized_sample_dataset")

In [None]:
from torch.utils.data import DataLoader
from transformers import AutoModelForSeq2SeqLM
from accelerate import Accelerator

# Accelerator instance used to prepare the training objects and run backward passes.
accelerator = Accelerator()

# Pretrained seq2seq checkpoint to fine-tune.
model_name = "t5-small"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# AdamW over all model parameters.
optimizer = AdamW(params=model.parameters(), lr=5e-5)

# Hand model, optimizer and DataLoader to the accelerator and keep the prepared versions.
prepared = accelerator.prepare(model, optimizer, train_loader)
model, optimizer, train_loader = prepared


In [None]:
from torch.optim import AdamW

# Rebuild the optimizer with fresh state and register it with the accelerator.
# A raw AdamW created after `accelerator.prepare(...)` would replace the prepared
# optimizer, so the training loop would step an optimizer that Accelerate does not manage.
optimizer = accelerator.prepare(AdamW(model.parameters(), lr=5e-5))

In [None]:
model.train()
num_epochs = 3  # You can bump to 5 later

# Labels were padded to a fixed length with the tokenizer's pad token during preprocessing.
# Label positions equal to -100 are ignored by the model's loss, so padding is masked out
# per batch; otherwise the loss also rewards predicting pad tokens.
pad_token_id = tokenizer.pad_token_id

for epoch in range(num_epochs):
    total_loss = 0.0

    for batch in train_loader:
        labels = batch["labels"]
        # masked_fill returns a new tensor, so the labels stored in the dataset
        # (decoded later for evaluation) are left untouched.
        labels = labels.masked_fill(labels == pad_token_id, -100)

        outputs = model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=labels,
        )
        loss = outputs.loss
        # Backward pass goes through the accelerator rather than loss.backward().
        accelerator.backward(loss)

        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()

    print(f"✅ Epoch {epoch+1}/{num_epochs} complete! Avg Loss: {total_loss / len(train_loader):.4f}")


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


✅ Epoch 1/3 complete! Avg Loss: 4.3021
✅ Epoch 2/3 complete! Avg Loss: 3.6566
✅ Epoch 3/3 complete! Avg Loss: 3.5055


In [None]:
# Save the underlying Transformers model rather than the object returned by
# `accelerator.prepare`, which may be a wrapper without a usable `save_pretrained`
# (e.g. in distributed runs). `unwrap_model` returns the original model.
accelerator.unwrap_model(model).save_pretrained("t5_finetuned_summary")
# Store the tokenizer files next to the model so the directory is self-contained.
tokenizer.save_pretrained("t5_finetuned_summary")

('t5_finetuned_summary/tokenizer_config.json',
 't5_finetuned_summary/special_tokens_map.json',
 't5_finetuned_summary/spiece.model',
 't5_finetuned_summary/added_tokens.json',
 't5_finetuned_summary/tokenizer.json')

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Reload tokenizer and model from the directory written by the save step.
checkpoint_dir = "t5_finetuned_summary"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_dir)


In [None]:
# Bundle the saved model directory into a single archive, t5_finetuned_summary.zip.
!zip -r t5_finetuned_summary.zip t5_finetuned_summary/

  adding: t5_finetuned_summary/ (stored 0%)
  adding: t5_finetuned_summary/tokenizer_config.json (deflated 95%)
  adding: t5_finetuned_summary/special_tokens_map.json (deflated 85%)
  adding: t5_finetuned_summary/model.safetensors (deflated 10%)
  adding: t5_finetuned_summary/tokenizer.json (deflated 74%)
  adding: t5_finetuned_summary/config.json (deflated 62%)
  adding: t5_finetuned_summary/generation_config.json (deflated 27%)
  adding: t5_finetuned_summary/spiece.model (deflated 48%)


In [None]:
from google.colab import files
# Download the zipped model through the Colab `files` helper (only available inside Colab).
files.download("t5_finetuned_summary.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!pip install -q evaluate

In [None]:
import evaluate

# Load the ROUGE metric; `compute_rouge` reads this module-level `rouge` object.
rouge = evaluate.load("rouge")

# Function to generate summaries and compute ROUGE
def compute_rouge(model, tokenizer, dataset, num_samples=100, batch_size=8):
    """Generate summaries for the first `num_samples` rows and score them with ROUGE.

    Args:
        model: Seq2seq model exposing `eval()`, `generate()` and a `device` attribute.
        tokenizer: Tokenizer whose `batch_decode` turns token ids back into text.
        dataset: Tokenized dataset with "input_ids", "attention_mask" and "labels"
            columns; slicing it (`dataset[:n]`) must return a mapping of columns.
        num_samples: Number of leading rows to evaluate.
        batch_size: Rows passed to `generate` at a time. Generation is chunked so
            beam search never has to hold every sample in memory at once.

    Returns:
        Whatever the module-level `rouge.compute(...)` returns for the decoded
        predictions and references.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    model.eval()

    # Slice rows first so only the rows being evaluated are materialised.
    rows = dataset[:num_samples]
    input_ids = torch.as_tensor(rows["input_ids"])
    attention_mask = torch.as_tensor(rows["attention_mask"])

    # Convert label IDs back to text
    references_text = tokenizer.batch_decode(rows["labels"], skip_special_tokens=True)

    # Generate summaries chunk by chunk on whatever device the model lives on.
    device = model.device
    predictions_text = []
    with torch.no_grad():
        for start in range(0, len(input_ids), batch_size):
            stop = start + batch_size
            generated_ids = model.generate(
                input_ids=input_ids[start:stop].to(device),
                attention_mask=attention_mask[start:stop].to(device),
                max_length=128,
                num_beams=4,
            )
            predictions_text.extend(
                tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
            )

    # Compute ROUGE scores
    results = rouge.compute(predictions=predictions_text, references=references_text)
    return results


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
# Score the model on the first 100 rows of `small_dataset` and print the ROUGE results.
# NOTE(review): `small_dataset` appears to be the same 100-example subset the training
# DataLoader was built from, so these scores are measured on training data — confirm.
rouge_scores = compute_rouge(model, tokenizer, small_dataset, num_samples=100)
print(rouge_scores)


{'rouge1': 0.3164842014981846, 'rouge2': 0.09548135093056624, 'rougeL': 0.19272925115256562, 'rougeLsum': 0.19302656905109356}


In [None]:
model.eval()

# Print a few examples side by side: source text, reference summary, generated summary.
for i in range(5):  # raise the bound to inspect more examples
    example = small_dataset[i]
    target_device = model.device

    # Add a batch dimension and move the inputs to the model's device.
    input_ids = example["input_ids"].unsqueeze(0).to(target_device)
    attention_mask = example["attention_mask"].unsqueeze(0).to(target_device)

    # Generate without tracking gradients.
    with torch.no_grad():
        output_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=128,
        )

    # Turn token ids back into text, dropping special tokens.
    input_text, generated_summary, reference_summary = (
        tokenizer.decode(token_ids, skip_special_tokens=True)
        for token_ids in (input_ids[0], output_ids[0], example["labels"])
    )

    # Print
    print(f"\n📰 ARTICLE {i+1}:\n{input_text[:1000]}...\n")
    print(f"🔹 Reference Summary:\n{reference_summary}\n")
    print(f"🔸 Generated Summary:\n{generated_summary}\n")
    print("=" * 80)



📰 ARTICLE 1:
summarize: National Archives NEWLINE_CHAR NEWLINE_CHAR Yes, it’s that time again, folks. It’s the first Friday of the month, when for one ever-so-brief moment the interests of Wall Street, Washington and Main Street are all aligned on one thing: Jobs. NEWLINE_CHAR NEWLINE_CHAR A fresh update on the U.S. employment situation for January hits the wires at 8:30 a.m. New York time offering one of the most important snapshots on how the economy fared during the previous month. Expectations are for 203,000 new jobs to be created, according to economists polled by Dow Jones Newswires, compared to 227,000 jobs added in February. The unemployment rate is expected to hold steady at 8.3%. NEWLINE_CHAR NEWLINE_CHAR Here at MarketBeat HQ, we’ll be offering color commentary before and after the data crosses the wires. Feel free to weigh-in yourself, via the comments section. And while you’re here, why don’t you sign up to follow us on Twitter. NEWLINE_CHAR NEWLINE_CHAR Enjoy the show. 

In [None]:
!pip install -q huggingface_hub

In [None]:
from huggingface_hub import notebook_login
# Show the interactive login widget so the session can authenticate with the
# Hugging Face Hub; no token is written into the notebook itself.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from transformers import AutoTokenizer

# Name of the Hub repository that receives both the model and the tokenizer.
hub_repo = "t5-finetuned-summary"

# Refresh the local tokenizer files before uploading.
tokenizer.save_pretrained("t5_finetuned_summary")

# Upload the model first, then the tokenizer, to the same Hub repository.
model.push_to_hub(hub_repo)
tokenizer.push_to_hub(hub_repo)

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/CuHz/t5-finetuned-summary/commit/3065734d6f373f24cdca34a5e9c94ef4d3f1c5dc', commit_message='Upload tokenizer', commit_description='', oid='3065734d6f373f24cdca34a5e9c94ef4d3f1c5dc', pr_url=None, repo_url=RepoUrl('https://huggingface.co/CuHz/t5-finetuned-summary', endpoint='https://huggingface.co', repo_type='model', repo_id='CuHz/t5-finetuned-summary'), pr_revision=None, pr_num=None)