In [None]:
! pip install datasets transformers rouge-score nltk py7zr

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.2.2-py3-none-any.whl (346 kB)
[K     |████████████████████████████████| 346 kB 5.0 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.19.4-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 45.7 MB/s 
[?25hCollecting rouge-score
  Downloading rouge_score-0.0.4-py2.py3-none-any.whl (22 kB)
Collecting py7zr
  Downloading py7zr-0.18.9.tar.gz (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 44.2 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 6.4 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_

# Fine-tuning a model on a summarization task

In [None]:
# NOTE(review): `model_checkpoint` is reassigned by the next two cells before it is
# first read (when the tokenizer is loaded), so on a top-to-bottom run this value is unused.
model_checkpoint = "facebook/bart-base"

In [None]:
# NOTE(review): overwritten by the following cell before it is read on a top-to-bottom run.
# Presumably an encoder-only checkpoint that AutoModelForSeq2SeqLM cannot load — confirm before using it.
model_checkpoint="distilbert-base-cased"

In [None]:
# Checkpoint name passed to the tokenizer and model `from_pretrained` calls below.
model_checkpoint = "t5-small"

## Loading the dataset

In [None]:
from datasets import load_dataset, load_metric

# Load the "samsum" dataset and the "rouge" metric; `metric` is the object
# compute_metrics calls to score decoded predictions against references.
raw_datasets = load_dataset("samsum")
metric = load_metric("rouge")

Downloading builder script:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/770 [00:00<?, ?B/s]

Downloading and preparing dataset samsum/samsum (download: 2.81 MiB, generated: 10.04 MiB, post-processed: Unknown size, total: 12.85 MiB) to /root/.cache/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e...


Downloading data:   0%|          | 0.00/2.94M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

Dataset samsum downloaded and prepared to /root/.cache/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

## Preprocessing the data

In [None]:
from transformers import AutoTokenizer

# Tokenizer matching the selected checkpoint; used for preprocessing, collation and decoding below.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [None]:
max_input_length = 512
max_target_length = 128

# T5 checkpoints expect a task prefix in front of the input text; other seq2seq
# checkpoints (e.g. BART) take the raw text, so the prefix is empty for them.
prefix = "summarize: " if model_checkpoint in ("t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b") else ""


def preprocess_function(examples):
    """Tokenize a batch of examples for seq2seq training.

    Args:
        examples: Batch mapping with a "dialogue" and a "summary" entry, each a
            list of strings (the function is applied with ``batched=True``).

    Returns:
        The tokenizer output for the (prefixed) dialogues, truncated to
        ``max_input_length``, with an added "labels" entry holding the token ids
        of the summaries truncated to ``max_target_length``.
    """
    inputs = [prefix + doc for doc in examples["dialogue"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Tokenize the summaries in target mode; their ids become the labels.
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Apply preprocess_function to every split, one batch of examples at a time.
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

  0%|          | 0/15 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

## Fine-tuning the model

In [None]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pretrained seq2seq weights for the selected checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/231M [00:00<?, ?B/s]

In [None]:
# Per-device batch size shared by training and evaluation.
batch_size = 16

# Hyperparameters for seq2seq fine-tuning: evaluate every epoch and score
# evaluation on generated sequences (`predict_with_generate`).
# NOTE(review): the Seq2SeqTrainer cell further down is constructed with
# `training_args`, not with this `args` object.
args = Seq2SeqTrainingArguments(
    output_dir="t5-dialogue-summarization",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=2,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=5,
    predict_with_generate=True,
    fp16=True,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Collator that batches the tokenized examples for the trainer; it is given the
# model as well as the tokenizer.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
import nltk
import numpy as np

def compute_metrics(eval_pred):
    """Score generated summaries against the references with ROUGE.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. The
            predictions may also arrive as a tuple whose first element holds
            the token ids.

    Returns:
        Dict with one entry per ROUGE key (mid F-measure, scaled to 0-100) plus
        "gen_len", the mean number of non-padding tokens per prediction; all
        values are rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        # Keep only the token ids when extra outputs are returned alongside them.
        predictions = predictions[0]

    # -100 marks ignored/padded positions and cannot be decoded, so map it to
    # the pad token in both the predictions and the labels.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Keep the mid F-measure of every ROUGE variant, as a percentage.
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length (padding positions are not counted).
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
from transformers import Seq2SeqTrainingArguments, TrainingArguments

# `training_args` is what the Seq2SeqTrainer below is built with, so it has to be a
# Seq2SeqTrainingArguments: a plain TrainingArguments has no `predict_with_generate`,
# which generation-based evaluation (and therefore compute_metrics) depends on.
# The hyperparameters are read from `args`, defined earlier, so that the tuned
# values are the ones actually applied; `push_to_hub=True` enables uploading to the Hub.
training_args = Seq2SeqTrainingArguments(
    output_dir=args.output_dir,
    evaluation_strategy=args.evaluation_strategy,
    learning_rate=args.learning_rate,
    per_device_train_batch_size=args.per_device_train_batch_size,
    per_device_eval_batch_size=args.per_device_eval_batch_size,
    gradient_accumulation_steps=args.gradient_accumulation_steps,
    weight_decay=args.weight_decay,
    save_total_limit=args.save_total_limit,
    num_train_epochs=args.num_train_epochs,
    predict_with_generate=args.predict_with_generate,
    fp16=args.fp16,
    push_to_hub=True,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
!pip install huggingface_hub
from huggingface_hub import notebook_login
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [None]:
# Wire the model, tokenized splits, collator and metric function into the trainer.
# NOTE(review): the arguments object used here is `training_args`; generation-based
# evaluation needs it to carry `predict_with_generate` — confirm it is a
# Seq2SeqTrainingArguments instance.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Cloning https://huggingface.co/chanifrusydi/t5-dialogue-summarization into local empty directory.


In [None]:
import nltk
# Fetch the 'punkt' resource before training; compute_metrics calls nltk.sent_tokenize.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Run fine-tuning.
# NOTE(review): no output is recorded for this cell in the saved notebook.
trainer.train()

In [None]:
# Evaluate on the validation split given to the trainer and display the returned metrics.
# NOTE(review): no output is recorded for this cell in the saved notebook.
trainer.evaluate()

In [None]:
# Upload the saved model and tokenizer files to the Hub repository; requires the login performed earlier.
trainer.push_to_hub()

Saving model checkpoint to t5-dialogue-summarization
Configuration saved in t5-dialogue-summarization/config.json
Model weights saved in t5-dialogue-summarization/pytorch_model.bin
tokenizer config file saved in t5-dialogue-summarization/tokenizer_config.json
Special tokens file saved in t5-dialogue-summarization/special_tokens_map.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 3.34k/231M [00:00<?, ?B/s]

Upload file runs/Jun08_05-06-01_0ee5a351b71a/events.out.tfevents.1654664962.0ee5a351b71a.74.3:  54%|#####3    …

remote: Enforcing permissions...        
remote: Allowed refs: all        
To https://huggingface.co/chanifrusydi/t5-dialogue-summarization
   60342c5..dd7bc2c  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Sequence-to-sequence Language Modeling', 'type': 'text2text-generation'}, 'dataset': {'name': 'samsum', 'type': 'samsum', 'args': 'samsum'}}
remote: Enforcing permissions...        
remote: Allowed refs: all        
To https://huggingface.co/chanifrusydi/t5-dialogue-summarization
   dd7bc2c..a4f446a  main -> main



'https://huggingface.co/chanifrusydi/t5-dialogue-summarization/commit/dd7bc2c56a572a4fdc0f9e732a34e3e32004ec46'

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Reload the tokenizer and model from the published Hub repository instead of
# reusing the in-memory `tokenizer` / `model` objects.
tokenizer_pre = AutoTokenizer.from_pretrained("chanifrusydi/t5-dialogue-summarization")
model_pre = AutoModelForSeq2SeqLM.from_pretrained("chanifrusydi/t5-dialogue-summarization")

Downloading:   0%|          | 0.00/1.88k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.74k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/231M [00:00<?, ?B/s]

In [None]:
from transformers import pipeline

# Summarization pipeline built on the reloaded model and tokenizer.
dialog_summarizer = pipeline("summarization", model=model_pre, tokenizer=tokenizer_pre)

In [None]:
# Take the dialogue of test example 100 as sample input and display it.
# NOTE(review): `text` is reassigned by the following cells before the summarizer is called.
text = raw_datasets["test"][100]['dialogue']
text

In [None]:
# Hand-written sample input made of two dialogues; replaces the test-set example picked above.
# NOTE(review): the line "Telephonic Dialogues for Everyday Conversations in English"
# in the middle looks like a pasted section heading rather than dialogue — confirm it is intended.
text='''Sourav: Who’s that woman in a red dress next to Reema?

Raunak: That’s her friend Surabhi. Didn’t you meet her at the picnic last month?

Sourav: No, I couldn’t make it to the picnic, my mother was not well at that time.

Raunak: Oh! Yes, I forgot that. Then let me introduce you to her now. Surabhi, this is my friend Sourav.

Surabhi: Hi, Sourav. Nice to meet you, hope you are doing fine.

Sourav: Yes, hope you are well too. Would you like to have some coffee?

Surabhi: Sure, let’s go get and have two cups of espresso.

Telephonic Dialogues for Everyday Conversations in English

Rancho: Hi, Aditi, it’s Rancho. How are you? What are your plans for today?

Aditi: Oh, hi, Rancho! I was just thinking about giving you a call. Well, I am free today, what about you?

Rancho: That’s nice. I was wondering if you’d like to go to a dinner party tonight

Aditi: Sure, I’d love to! Where is the party?

Rancho: It’s in the Park Hotel?

Aditi: Sounds great!

Rancho: Ok I’ll pick you up around 8:30. We will probably reach the hall by 9 p.m.

Aditi: Great! See you then. Bye!'''

In [None]:
# NOTE(review): this reassigns `text` once more, so on a top-to-bottom run the summarizer
# below receives this news-style passage rather than the dialogue sample. The summary
# recorded at the end of the notebook corresponds to the dialogue sample, which suggests
# the cells were executed out of order.
text='''
The next winter will be the "most difficult" in the more than three decades since Ukraine gained independence, President Volodymyr Zelensky warned Tuesday, as Russia's invasion brings the threat of an energy crisis.
Here are the latest headlines from the Russia-Ukraine war: will be launched in Ukraine. Russia has denied allegations of war crimes, but CNN journalists on the ground have seen firsthand evidence of atrocities at multiple locations across the country.
Crimea land corridor: Russia claims it has opened a land corridor to Russian-occupied Crimea, allowing civilians and goods to pass through the eastern Ukrainian territory. Russia's defense minister said the military, working with Russian Railways, had restored 1,200 kilometers of train tracks and opened roads to allow "full-fledged traffic" between Russia, eastern Ukraine's Donbas region and Crimea, the peninsula annexed by Russian forces from Ukraine in 2014.
Fighting in the east: Ukrainian troops are locked in fierce street battles with Russian forces in Severodonetsk as other cities face increased air assaults in the Donbas region. Satellite images by Maxar Technologies show military strikes have hit at least two hospitals in Severodonetsk and the city of Rubizhne.
Melitopol referendum: High-ranking Russian officials are visiting the occupied city in southeastern Ukraine as they prepare to hold a referendum for the remaining residents on becoming part of Russia. The key city in the Zaporizhizhia region neighbors the Kherson region that has been under Russian control since the beginning of the invasion in late February.
Remains repatriated: Ukrainian officials said the bodies of more than 200 soldiers have been repatriated to Ukraine, most of them "heroic defenders" of the Azovstal steel factory in Mariupol. Ukraine and Russia have conducted an exchange of bodies as part of the agreement that ended that siege. 
Maritime corridors: Russia's Defense Ministry said it has created two maritime humanitarian corridors to allow for the movement of ships in the Black Sea and the Sea of Azov, after facing international condemnation over its months-long blockade of key ports. The European Council president has accused the Kremlin of "using food supplies as a stealth missile against developing countries" by blocking Ukrainian grain exports.
Mariupol cholera risk: According to an exiled local official, Russian officials in control of the ravaged southeastern city of Mariupol are considering imposing a quarantine as decomposing corpses and garbage contaminate drinking water —putting remaining residents at risk of cholera and other diseases.
Merkel on Putin: Former German Chancellor Angela Merkel said Russia made "a big mistake" invading Ukraine, adding she was convinced that — from Russian President Vladimir Putin's perspective — any plan for Ukraine to join NATO during her time in office would have been tantamount to a declaration of war.

'''

In [None]:
# Summarize the current value of `text`; min_length / max_length are passed through to the pipeline call.
result = dialog_summarizer(text, min_length=10, max_length=100)

In [None]:
# Display the summary string of the first result.
result[0]['summary_text']

'Sourav: Oh! Yes, I forgot that. Then let me introduce you to her now. Surabhi: Sure, let’s go get and have two cups of espresso. Telephonic Dialogues for Everyday Conversations in English Rancho: Hi, Aditi: Okay I’ll pick you up around 8:30. We will probably reach the hall by 9 p.m. Adit: Great! See you then. Bye!'