In [1]:
# !pip install datasets
# !pip install evaluate
# !pip install rouge_score
!pip install transformers datasets evaluate rouge_score



In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
import numpy as np
import evaluate
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# **Login to HuggingFace**

In [3]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# **Load Dataset**

In [4]:
ds_full = load_dataset('multi_news')
ds_train = ds_full['train']
ds_1000 = ds_train.shuffle(seed=42).select(range(1000))
dataset = ds_1000.train_test_split(test_size=0.2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
print(dataset.shape)

{'train': (800, 2), 'test': (200, 2)}


# **Preprocess Data**

In [6]:
from transformers import AutoTokenizer

checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [7]:
prefix = "summarize: "

def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["document"]]
    model_inputs = tokenizer(inputs, truncation=True)

    # Tokenize the labels without setting a max_length
    labels = tokenizer(text_target=examples["summary"], truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Assuming 'dataset' is already loaded and defined
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Initialize the data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [8]:
# checkpoint = "google-t5/t5-small" # We need an encoder-decoder model since we're going text-text
# tokenizer = AutoTokenizer.from_pretrained(checkpoint) # Use the right tokenizer
# prefix = "summarize: " # This is a multipurpose model - we need to attach a task to tell it what we want

In [9]:
# def preprocess_function(examples):
#     inputs = [prefix + doc for doc in examples["document"]] # add summarize to documents
#     model_inputs = tokenizer(inputs, max_length=1024, truncation=True) # tokenize inputs
#     labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True) # tokenize outputs

#     model_inputs["labels"] = labels["input_ids"] # match up the text and summary. Specific to this application
#     return model_inputs

In [10]:
# tokenized_dataset = dataset.map(preprocess_function, batched=True)

In [11]:
# from transformers import DataCollatorForSeq2Seq
# # data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint) # code when using Pytorch
# data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint, return_tensors="tf")

# **Define Pipeline Evaluation Metrics**

In [12]:
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

# **Define Model**

In [13]:
!pip install transformers[torch]



In [14]:
!pip install accelerate -U



In [15]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [16]:
# Define training args:

training_args = Seq2SeqTrainingArguments(
    output_dir="multi_news_train_800_t5-small_summary_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01, # regularization in optimizer
    save_total_limit=3, # maximum number of versions to have saved
    num_train_epochs=3,
    predict_with_generate= True
)

In [17]:
# build trainer

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [18]:
trainer.train()
trainer.save_model('multi_news_train_800_t5-small_summary_model')

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,3.523568,0.0787,0.0277,0.063,0.063,19.0
2,No log,3.4513,0.0795,0.0266,0.0625,0.0625,19.0
3,3.936000,3.434036,0.0789,0.0263,0.0628,0.0628,19.0




In [22]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [23]:
trainer.push_to_hub('multi_news_train_800_t5-small_summary_model')

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

events.out.tfevents.1710640539.37f71654a8ba.1866.0:   0%|          | 0.00/7.78k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/AlexandraSnelling/multi_news_train_800_t5-small_summary_model/commit/b2bf6aa9b7ec01c13ecea57ac00705de84e4ab1a', commit_message='multi_news_train_800_t5-small_summary_model', commit_description='', oid='b2bf6aa9b7ec01c13ecea57ac00705de84e4ab1a', pr_url=None, pr_revision=None, pr_num=None)

In [20]:
# document = ds_full['test'][0]['document']
# # document = ds_full['test'][0]['summary']
# document
# # print(summary)

'GOP Eyes Gains As Voters In 11 States Pick Governors \n \n Enlarge this image toggle caption Jim Cole/AP Jim Cole/AP \n \n Voters in 11 states will pick their governors tonight, and Republicans appear on track to increase their numbers by at least one, with the potential to extend their hold to more than two-thirds of the nation\'s top state offices. \n \n Eight of the gubernatorial seats up for grabs are now held by Democrats; three are in Republican hands. Republicans currently hold 29 governorships, Democrats have 20, and Rhode Island\'s Gov. Lincoln Chafee is an Independent. \n \n Polls and race analysts suggest that only three of tonight\'s contests are considered competitive, all in states where incumbent Democratic governors aren\'t running again: Montana, New Hampshire and Washington. \n \n While those state races remain too close to call, Republicans are expected to wrest the North Carolina governorship from Democratic control, and to easily win GOP-held seats in Utah, North 

In [21]:
# # from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# model = AutoModelForSeq2SeqLM.from_pretrained("multi_news_train_800_t5-small_summary_model", local_files_only=True)
# tokenizer = AutoTokenizer.from_pretrained("multi_news_train_800_t5-small_summary_model", local_files_only=True)

# # Tokenize the input text
# inputs = tokenizer(document, return_tensors="pt", max_length=1024, truncation=True)

# # Generate summary
# summary_ids = model.generate(inputs["input_ids"], num_beams=4, min_length=None, max_length= 500, early_stopping=True)

# # Decode the summary
# summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# # Print the summary
# print("Summary:", summary)

Summary: , and the Republican Attorney General Rob McKenna is in a dead-heat battle to keep it that way, says Thad Kousser, co-author of The Power of American Governors. "No matter who wins the presidency, national politics is going to be stalemated on the Affordable Care Act," says Kousser, of the University of California, San Diego. "No matter who wins the presidency, national politics is going to be stalemated on the Affordable Care Act," says Kousser. "A [Mitt] Romney is in a dead-heat battle to keep it that way. "A [Mitt] Romney is in a dead-heat battle to keep it that way. Inslee is in a dead-heat battle to keep it that way," says Kousser. "No matter who wins the presidency, national politics is going to be stalemated on the Affordable Care Act," says Kousser. "A [Mitt] Romney victory would dramatically empower Republican governors," says Kousser. "No matter who wins the presidency, national politics is going to be stalemated on the Affordable Care Act," says Kousser. "A [Mitt] R