In [1]:
# !pip install datasets
# !pip install evaluate
# !pip install rouge_score
!pip install transformers datasets evaluate rouge_score



In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
import numpy as np
import evaluate
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# **Load Dataset**

In [3]:
ds_full = load_dataset('multi_news')
ds_train = ds_full['train']
ds_5000 = ds_train.shuffle(seed=42).select(range(5000))
dataset = ds_5000.train_test_split(test_size=0.2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
print(dataset.shape)

{'train': (4000, 2), 'test': (1000, 2)}


# **Preprocess Data**

In [5]:
from transformers import AutoTokenizer

checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [6]:
prefix = "summarize: "

def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["document"]]
    model_inputs = tokenizer(inputs, truncation=True)

    # Tokenize the labels without setting a max_length
    labels = tokenizer(text_target=examples["summary"], truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Assuming 'dataset' is already loaded and defined
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Initialize the data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [7]:
# checkpoint = "google-t5/t5-small" # We need an encoder-decoder model since we're going text-text
# tokenizer = AutoTokenizer.from_pretrained(checkpoint) # Use the right tokenizer
# prefix = "summarize: " # This is a multipurpose model - we need to attach a task to tell it what we want

In [8]:
# def preprocess_function(examples):
#     inputs = [prefix + doc for doc in examples["document"]] # add summarize to documents
#     model_inputs = tokenizer(inputs, max_length=1024, truncation=True) # tokenize inputs
#     labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True) # tokenize outputs

#     model_inputs["labels"] = labels["input_ids"] # match up the text and summary. Specific to this application
#     return model_inputs

In [9]:
# tokenized_dataset = dataset.map(preprocess_function, batched=True)

In [10]:
# from transformers import DataCollatorForSeq2Seq
# # data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint) # code when using Pytorch
# data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint, return_tensors="tf")

# **Define Pipeline Evaluation Metrics**

In [11]:
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

# **Define Model**

In [12]:
!pip install transformers[torch]



In [13]:
!pip install accelerate -U



In [14]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [15]:
# Define training args:

training_args = Seq2SeqTrainingArguments(
    output_dir="multi_news_train_4000_t5-small_summary_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01, # regularization in optimizer
    save_total_limit=3, # maximum number of versions to have saved
    num_train_epochs=3,
    predict_with_generate= True
)

In [16]:
# build trainer

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [17]:
# trainer.train()
# trainer.save_model('multi_news_train_800_t5-small_summary_model')

In [18]:
trainer.train()
trainer.save_model('multi_news_train_4000_t5-small_summary_model')

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,3.5688,3.314996,0.0789,0.0253,0.0618,0.0618,18.982
2,3.4929,3.268353,0.0801,0.0261,0.063,0.063,18.984
3,3.4888,3.25791,0.0802,0.026,0.063,0.0631,18.984




# **Login to HuggingFace**

In [23]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [24]:
trainer.push_to_hub('multi_news_train_4000_t5-small_summary_model')

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

events.out.tfevents.1710645699.fee58342db7c.1728.0:   0%|          | 0.00/8.83k [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/AlexandraSnelling/multi_news_train_4000_t5-small_summary_model/commit/3b18c76c913028eed574bdf4db168bc35af5f69a', commit_message='multi_news_train_4000_t5-small_summary_model', commit_description='', oid='3b18c76c913028eed574bdf4db168bc35af5f69a', pr_url=None, pr_revision=None, pr_num=None)

In [27]:
document = ds_full['test'][0]['document']
test_summary_example = ds_full['test'][0]['summary']
test_summary_example

'– It\'s a race for the governor\'s mansion in 11 states today, and the GOP could end the night at the helm of more than two-thirds of the 50 states. The GOP currently controls 29 of the country\'s top state offices; it\'s expected to keep the three Republican ones that are up for grabs (Utah, North Dakota, and Indiana), and wrest North Carolina from the Dems. That brings its toll to 30, with the potential to take three more, reports NPR. Races in Montana, New Hampshire, and Washington are still too close to call, and in all three, Democrat incumbents aren\'t seeking reelection. The results could have a big impact on health care, since a Supreme Court ruling grants states the ability to opt out of ObamaCare\'s Medicaid expansion. "A Romney victory would dramatically empower Republican governors," said one analyst. Click for NPR\'s state-by-state breakdown of what could happen.'

In [28]:
# from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model = AutoModelForSeq2SeqLM.from_pretrained("multi_news_train_4000_t5-small_summary_model", local_files_only=True)
tokenizer = AutoTokenizer.from_pretrained("multi_news_train_4000_t5-small_summary_model", local_files_only=True)

# Tokenize the input text
inputs = tokenizer(document, return_tensors="pt", max_length=1024, truncation=True)

# Generate summary
summary_ids = model.generate(inputs["input_ids"], num_beams=4, min_length=None, max_length= 500, early_stopping=True)

# Decode the summary
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Print the summary
summary

', and the Republican Attorney General Rob McKenna is in a dead-heat battle to keep it that way. Montana: The state\'s governorship has been in Democratic hands for 32 years, and former U.S. Rep. Jay Inslee is in a dead-heat battle to keep it that way. The state\'s Republican Attorney General Rob McKenna has a proven ability to win statewide, and to easily win GOP-held seats in Utah, North Dakota and Indiana, and Republican Gov. Rick Scott of Florida and Scott Walker, who ran unsuccessfully for governor in 1996 and for the U.S. Senate in 2010. "A [Mitt] Romney victory would dramatically empower Republican governors," he says. "It\'s going to be stalemated on the Affordable Care Act," he says. "It\'s going to be stalemated on the Affordable Care Act," he says. "It\'s going to be stalemated on the Affordable Care Act," he says. "A [Mitt] Romney victory would dramatically empower Republican governors," he says. "It\'s going to be stalemated on the Affordable Care Act," he says. "It\'s goi