In [None]:
# Prints a fixed greeting (presumably a quick check that the kernel is running).
print("Hello world!")

Hello world!


In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
import evaluate
import numpy as np
import torch
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import pipeline
import pandas as pd

# Start by loading the smaller California state bill subset of the BillSum dataset
# from the 🤗 Datasets library. Only the "ca_test" split is requested, so the result
# is a single Dataset object (see the type(billsum) output in the next cell).
billsum = load_dataset("billsum", split="ca_test")

In [None]:
# Inspect the type of the object returned by load_dataset for a single split.
type(billsum)

datasets.arrow_dataset.Dataset

In [None]:
# Split the dataset into a train and a test set with the train_test_split method
# (40% of the rows go to the test set).
# The split shuffles the rows at random, so the seed is pinned: without it every
# Restart & Run All produces different train/test rows and the metrics computed
# downstream cannot be compared between runs.
# NOTE(review): this rebinds `billsum` from the loaded Dataset to the object returned
# by train_test_split, so re-running this cell alone (without re-running the load
# cell) is presumably not possible — confirm.
billsum = billsum.train_test_split(test_size=0.4, seed=42)

In [None]:
# Task prompt prepended to every document so T5 knows it should summarize.
prefix = "summarize: "

# Name of the T5 checkpoint; the tokenizer loaded from it is used to process
# both the bill text and the reference summaries.
checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
#The preprocessing function needs to:
#1) Prefix the input with a prompt so T5 knows this is a summarization task. Some models capable of multiple NLP tasks require prompting for specific tasks.
#2) Use the text_target keyword argument when tokenizing the labels.
#3) Truncate sequences so they are no longer than the limit set by the max_length parameter.

def preprocess_function(examples):
    """Tokenize a batch of bills and their summaries for seq2seq training.

    Args:
        examples: A batch with a "text" column (documents) and a "summary"
            column (reference summaries).

    Returns:
        The tokenizer output for the prompted documents (truncated to 1024
        tokens) with an extra "labels" entry holding the token ids of the
        summaries (truncated to 128 tokens).
    """
    # Prepend the task prompt to every document in the batch.
    prompted_docs = [prefix + document for document in examples["text"]]
    encoded = tokenizer(prompted_docs, truncation=True, max_length=1024)

    # text_target makes the tokenizer treat the summaries as targets.
    target_encoding = tokenizer(
        text_target=examples["summary"], truncation=True, max_length=128
    )
    encoded["labels"] = target_encoding["input_ids"]

    return encoded

In [None]:
# To apply the preprocessing function over the entire dataset, use the 🤗 Datasets map method.
# batched=True hands preprocess_function many rows at once instead of one row per call,
# which speeds up the mapping. The two progress bars in the output below (742 and 495
# examples) show that the function is applied to two splits.
tokenized_billsum = billsum.map(preprocess_function, batched=True)

Map:   0%|          | 0/742 [00:00<?, ? examples/s]

Map:   0%|          | 0/495 [00:00<?, ? examples/s]

In [None]:
# Now create a batch of examples using DataCollatorForSeq2Seq.
# It is more efficient to dynamically pad the sentences to the longest length in a batch
# during collation, instead of padding the whole dataset to the maximum length.
# NOTE(review): `model` receives the checkpoint name (the string "t5-small"), not a loaded
# model object — the model itself is only loaded in a later cell. Confirm against the
# DataCollatorForSeq2Seq documentation whether a model instance is expected here.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [None]:
# Load the ROUGE metric once; compute_metrics below reuses it on every evaluation.
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for one evaluation pass.

    Args:
        eval_pred: A (predictions, labels) pair of token-id arrays. `predictions`
            may also be a tuple whose first element holds the generated ids.

    Returns:
        A dict with the ROUGE results plus "gen_len" (mean number of non-padding
        tokens per prediction), every value rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    # Some setups return a tuple (generated ids, extra outputs); only the ids are decoded.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 is the ignore/padding index, not a vocabulary id. It can appear in the
    # predictions as well as in the labels, and decoding it fails, so map it to the
    # pad token in both arrays before decoding.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Counted on the sanitised predictions so that -100 padding is not mistaken
    # for generated tokens.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Load the T5 checkpoint named by `checkpoint` with AutoModelForSeq2SeqLM.
# NOTE(review): `model` is rebound in the final inference cell to the
# "stevhliu/my_awesome_billsum_model" checkpoint, and no visible cell in between uses
# this instance — confirm whether a training step is missing from the notebook.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [None]:
# Example passage to summarize. For summarization the input must carry the task prompt:
# the literal below already starts with "summarize: " (the same value as `prefix`),
# so it is passed to the model as-is in the cells that follow.
text = "summarize: A period of unrest and civil wars in the 1st century BCE marked the transition of Rome from a republic to an empire. This period encompassed the career of Julius Caesar, who eventually took full power over Rome as its dictator. After his assassination in 44 BCE, the triumvirate of Mark Antony, Lepidus, and Octavian, Caesar’s nephew, ruled. It was not long before Octavian went to war against Antony in northern Africa, and after his victory at Actium (31 BCE) he was crowned Rome’s first emperor, Augustus. His reign, from 27 BCE to 14 CE, was distinguished by stability and peace. With a mind toward maintaining the structure of power entrusted to his rule, Augustus began thinking early about who should follow him. Death played havoc with his attempts to select his successor. He had no son and his nephew Marcellus, his son-in-law Agrippa, and his grandsons Gaius and Lucius each predeceased him. He eventually chose Tiberius, a scion of the ultra-aristocratic Claudia gens, and in 4 CE adopted him as his son. Tiberius (reigned 14–37) became the first successor in the Julio-Claudian dynasty and ruled as an able administrator but cruel tyrant. His great-nephew Caligula (37–41) reigned as an absolutist, his short reign filled with reckless spending, callous murders, and humiliation of the Senate. Claudius (41–54) centralized state finances in the imperial household, thus making rapid strides in organizing the imperial bureaucracy, but was ruthless toward the senators and equites. Nero (54–68) left administration to capable advisers for a few years but then asserted himself as a vicious despot. He brought the dynasty to its end by being the first emperor to suffer damnatio memoriae: his reign was officially stricken from the record by order of the Senate."

In [None]:
#-----------------------------------------------------------------------------------------------------
# The simplest way to try out a finetuned model for inference is to use it in a pipeline().
# Instantiate a summarization pipeline and pass the text to it.
# Note: the pipeline is given the checkpoint id "stevhliu/my_awesome_billsum_model" as a
# string; it does not use the `model` variable loaded in the earlier cell.
summarizer = pipeline("summarization", model="stevhliu/my_awesome_billsum_model")
# Last expression of the cell, so the generated summary is displayed as the cell output.
summarizer(text)

[{'summary_text': 'Julius Caesar was crowned Rome’s first emperor, Augustus, after his assassination in 44 BCE . his reign, from 27 BCE to 14 CE, was distinguished by stability and peace . he was a scion of the ultra-aristocratic Claudia gens, and in 4 CE adopted him as his son .'}]

In [34]:
# Load the seq2seq model and its matching tokenizer from the same checkpoint.
# Both `model` and `tokenizer` are rebound here, replacing the objects created earlier.
model = AutoModelForSeq2SeqLM.from_pretrained("stevhliu/my_awesome_billsum_model")
tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_billsum_model")

# Encode the prompted text as PyTorch tensors and keep only the input ids.
inputs = tokenizer(text, return_tensors="pt")["input_ids"]

# Produce the summary with generate(): sampling disabled, at most 100 new tokens.
outputs = model.generate(inputs, do_sample=False, max_new_tokens=100)

# Convert the first generated sequence back into text; as the last expression
# of the cell it is shown as the cell output.
tokenizer.decode(outputs[0], skip_special_tokens=True)

'Julius Caesar, who eventually took full power over Rome as its dictator, was crowned Rome’s first emperor Augustus. he was distinguished by stability and peace, and died in 14 CE. he was crowned Rome’s first emperor, Augustus, after his assassination in 44 BCE.'