# **Load Dataset**

In [1]:
pip install datasets



In [2]:
from datasets import load_dataset

In [3]:
ds_full = load_dataset('multi_news')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
ds_full

DatasetDict({
    train: Dataset({
        features: ['document', 'summary'],
        num_rows: 44972
    })
    validation: Dataset({
        features: ['document', 'summary'],
        num_rows: 5622
    })
    test: Dataset({
        features: ['document', 'summary'],
        num_rows: 5622
    })
})

In [5]:
dataset = ds_full['train']

In [6]:
dataset = dataset.shuffle(seed=42).select(range(1000))

In [7]:
dataset = dataset.train_test_split(test_size=0.2)

# **AutoTokenize Data**

In [8]:
from transformers import AutoTokenizer

In [9]:
# # use the default preprocessor
# # important to ensure expected input to our model (i.e. same lemmatization modelling, stopwords, etc)
# tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# def tokenize_function(examples):
#    # Map function
#     # padding and truncation control for variable length sequences
#     return tokenizer(examples["text"], padding="max_length", truncation=True)

# # apply to all datasets with .map(). Built in function of the HF datasets class
# tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [10]:
# small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
# small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

In [11]:
checkpoint = "google-t5/t5-small" # We need an encoder-decoder model since we're going text-text
tokenizer = AutoTokenizer.from_pretrained(checkpoint) # Use the right tokenizer
prefix = "summarize: " # This is a multipurpose model - we need to attach a task to tell it what we want
# Just like how you prompt chat GPT with specific questions

In [12]:
def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["document"]] # add summarize to documents
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True) # tokenize inputs
    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True) # tokenize outputs

    model_inputs["labels"] = labels["input_ids"] # match up the text and summary. Specific to this application
    return model_inputs

In [13]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [14]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint) # batch examples
# Better for computation and memory
# Also makes it easier/less disruptive to make all sequences in a batch the same length, rather than the whole dataset

In [15]:
!pip install evaluate
!pip install rouge_score



In [16]:
import numpy as np
import evaluate

In [17]:
rouge = evaluate.load("rouge") # metric
# specially designed metric for summarization tasks: https://huggingface.co/spaces/evaluate-metric/rouge

In [18]:
# From the documentation, don't worry about syntax here

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [19]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint) # load t5

In [20]:
!pip install transformers[torch]



In [21]:
!pip install accelerate -U



In [22]:
# Define training args:

training_args = Seq2SeqTrainingArguments(
    output_dir="dataset_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01, # regularization in optimizer
    save_total_limit=3, # maximum number of versions to have saved
    num_train_epochs=3,
    predict_with_generate= True
)

In [23]:
# build trainer

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [24]:
trainer.train()
trainer.save_model('dataset_model')

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,3.058532,0.1391,0.0398,0.1053,0.1056,19.0
2,No log,2.982237,0.1344,0.038,0.1012,0.1013,19.0
3,3.384000,2.965845,0.1313,0.038,0.1006,0.1007,19.0




In [34]:
document = ds_full['test'][0]['document']
# document = ds_full['test'][0]['summary']
document
# print(summary)

'GOP Eyes Gains As Voters In 11 States Pick Governors \n \n Enlarge this image toggle caption Jim Cole/AP Jim Cole/AP \n \n Voters in 11 states will pick their governors tonight, and Republicans appear on track to increase their numbers by at least one, with the potential to extend their hold to more than two-thirds of the nation\'s top state offices. \n \n Eight of the gubernatorial seats up for grabs are now held by Democrats; three are in Republican hands. Republicans currently hold 29 governorships, Democrats have 20, and Rhode Island\'s Gov. Lincoln Chafee is an Independent. \n \n Polls and race analysts suggest that only three of tonight\'s contests are considered competitive, all in states where incumbent Democratic governors aren\'t running again: Montana, New Hampshire and Washington. \n \n While those state races remain too close to call, Republicans are expected to wrest the North Carolina governorship from Democratic control, and to easily win GOP-held seats in Utah, North 

In [35]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model = AutoModelForSeq2SeqLM.from_pretrained("dataset_model", local_files_only=True)
tokenizer = AutoTokenizer.from_pretrained("dataset_model", local_files_only=True)

# Tokenize the input text
inputs = tokenizer(document, return_tensors="pt", max_length=1024, truncation=True)

# Generate summary
summary_ids = model.generate(inputs["input_ids"], num_beams=4, min_length=None, max_length= 500, early_stopping=True)

# Decode the summary
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Print the summary
print("Summary:", summary)

Summary: , and the state's Republican Attorney General Rob McKenna is in a dead-heat battle to keep it that way. Montana: The state's governorship has been in Democratic hands for 32 years, and former U.S. Rep. Jay Inslee is in a dead-heat battle to keep it that way. The state's Republican Attorney General Rob McKenna has a proven ability to win statewide, but working in Inslee's favor are Obama's poll


In [36]:
summary

", and the state's Republican Attorney General Rob McKenna is in a dead-heat battle to keep it that way. Montana: The state's governorship has been in Democratic hands for 32 years, and former U.S. Rep. Jay Inslee is in a dead-heat battle to keep it that way. The state's Republican Attorney General Rob McKenna has a proven ability to win statewide, but working in Inslee's favor are Obama's poll"

# **Run Text Summarization through Pipeline**

In [None]:
# from transformers import pipeline

In [None]:
  # # Tasks summary https://huggingface.co/docs/transformers/main/en/task_summary
  # pipe = pipeline(task="summarization")

  # # # You can also specify a model to use in the pipeline
  # # pipe = pipeline(model='add the model name here')

In [None]:
# ds['train']['document'][0]

In [None]:
# data = ds['train']['document']
# preds = pipe(data)

In [None]:
# preds