In [52]:
import transformers
print(transformers.__version__)

from datasets import load_dataset
from evaluate import load
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import nltk
import numpy as np
import torch

4.37.2


In [2]:
model_checkpoint = "t5-base-finetuned-multi-news/checkpoint-112000"

In [3]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [6]:
raw_datasets = load_dataset("multi_news")
metric = load("rouge")

In [4]:
max_input_length = 2048
max_target_length = 256

def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["document"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    labels = tokenizer(text_target=examples["summary"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [8]:
prefix = "summarize: "
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

Map:   0%|          | 0/44972 [00:00<?, ? examples/s]

Map:   0%|          | 0/5622 [00:00<?, ? examples/s]

Map:   0%|          | 0/5622 [00:00<?, ? examples/s]

In [9]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [10]:
batch_size = 2
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-multi-news",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    predict_with_generate=True,
    fp16=False,
)

In [11]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    # Note that other metrics may not have a `use_aggregator` parameter
    # and thus will return a list, computing a metric for each sentence.
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True, use_aggregator=True)
    # Extract a few results
    result = {key: value * 100 for key, value in result.items()}
    
    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

In [12]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [30]:
results = trainer.predict(tokenized_datasets["test"].select(range(5)), metric_key_prefix="test")



In [46]:
len(results.predictions[4])

20

In [47]:
tokenizer.decode(tokenized_datasets['test'][0]['input_ids'])

'summarize: GOP Eyes Gains As Voters In 11 States Pick Governors Enlarge this image toggle caption Jim Cole/AP Jim Cole/AP Voters in 11 states will pick their governors tonight, and Republicans appear on track to increase their numbers by at least one, with the potential to extend their hold to more than two-thirds of the nation\'s top state offices. Eight of the gubernatorial seats up for grabs are now held by Democrats; three are in Republican hands. Republicans currently hold 29 governorships, Democrats have 20, and Rhode Island\'s Gov. Lincoln Chafee is an Independent. Polls and race analysts suggest that only three of tonight\'s contests are considered competitive, all in states where incumbent Democratic governors aren\'t running again: Montana, New Hampshire and Washington. While those state races remain too close to call, Republicans are expected to wrest the North Carolina governorship from Democratic control, and to easily win GOP-held seats in Utah, North Dakota and Indiana.

In [74]:
temp = model.generate(torch.tensor(tokenized_datasets['test'][0]['input_ids']).reshape(1, -1), do_sample=True, max_new_tokens=400)

In [75]:
len(temp[0])

274

In [69]:
tokenizer.decode(temp.tolist()[0])

'<pad> – Voters in 11 states will likely end up picking their governors tomorrow, and some Democrats will make an odd one. In Montana, new Hampshire, Washington, and West Virginia, Democrats have 29 state governorships, while Democrat Lincoln Chafee is an Independent. Only three of the 16 states up for grabs are considered competitive, reports NBC: Washington, Montana, New Hampshire, and Washington. Republican Dave Spector in New Hampshire, Montana GOP Sen. Gregg Harris and Former State Rep. Bill Maloney in West Virginia: Mississippi Gov. Mike Pence on the left, Oregon Gov. Bobby Gillen on the left, and New York Gov. Tom Bossman on the left. "We\'ll have to wait for a confirmation hearing, because the Republican will be elected, and we\'re going to have to wait it out for Trump to decide whether to resign for Obama or lose his power," he writes. Statewide: Michigan GOP Gov. Mike Pence in Virginia, Utah, and Indiana, and West Virginia Republican Gov. Peter Shumlin in Delaware. GOP GOP G