In [3]:
!pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q

In [1]:
from transformers import pipeline

import matplotlib.pyplot as plt
from datasets import load_dataset, load_metric
import pandas as pd
from tqdm import tqdm

import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

In [3]:
# Checkpoint identifier handed to `from_pretrained`; the tokenizer is loaded from it here.
model_checkpoint = "google/pegasus-cnn_dailymail"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [4]:
# Load the sequence-to-sequence model from the same checkpoint the tokenizer was loaded from.
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [5]:
# Load the "samsum" dataset; later cells index the result by split name ('train', 'test', 'validation').
data = load_dataset("samsum")

Found cached dataset samsum (C:/Users/Dr.G.Mahadevan/.cache/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e)
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00,  5.62it/s]


In [6]:
# Display the dataset object's repr (splits, feature names and row counts).
data

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [7]:
# Number of rows in each split, shown as a (train, test, validation) tuple.
tuple(len(data[split_name]) for split_name in ('train', 'test', 'validation'))

(14732, 819, 818)

In [8]:
# List the column names available in the training split.
data['train'].column_names

['id', 'dialogue', 'summary']

In [9]:
# Print the second test example: a heading line followed by the text, for both fields.
print(
    "Dialogue",
    data['test'][1]['dialogue'],
    "Summary",
    data['test'][1]['summary'],
    sep="\n",
)

Dialogue
Eric: MACHINE!
Rob: That's so gr8!
Eric: I know! And shows how Americans see Russian ;)
Rob: And it's really funny!
Eric: I know! I especially like the train part!
Rob: Hahaha! No one talks to the machine like that!
Eric: Is this his only stand-up?
Rob: Idk. I'll check.
Eric: Sure.
Rob: Turns out no! There are some of his stand-ups on youtube.
Eric: Gr8! I'll watch them now!
Rob: Me too!
Eric: MACHINE!
Rob: MACHINE!
Eric: TTYL?
Rob: Sure :)
Summary
Eric and Rob are going to watch a stand-up on youtube.


In [10]:
# Keep the first test example's dialogue and reference summary for the pipeline demo below.
dialogue, summary = (data['test'][0][field] for field in ('dialogue', 'summary'))
print(dialogue, summary, sep="\n")

Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye
Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.


In [11]:
# Reuse the model and tokenizer already loaded above instead of passing the checkpoint
# name, which would make `pipeline` load a second copy of the same weights.
pipe = pipeline('summarization', model = model_pegasus, tokenizer = tokenizer)

In [None]:
# Display the pipeline object's repr to confirm it was constructed.
pipe

In [15]:
# `pipe_out` was not assigned by any earlier cell, so this cell raised NameError on a
# fresh kernel; produce it here by summarising the sample `dialogue`, then display it.
pipe_out = pipe(dialogue)
pipe_out

[{'summary_text': "Amanda: Ask Larry Amanda: He called her last time we were at the park together .<n>Hannah: I'd rather you texted him .<n>Amanda: Just text him ."}]

In [16]:
def generate_batches(list_of_elements, batch_size):
    """Lazily yield consecutive slices of ``list_of_elements``.

    Each slice holds at most ``batch_size`` items; the final slice may be
    shorter when the length is not a multiple of ``batch_size``.
    """
    total = len(list_of_elements)
    yield from (
        list_of_elements[start:start + batch_size]
        for start in range(0, total, batch_size)
    )

In [17]:
def calculate_metric_on_test(dataset,metric,model,tokenizer,batch_size = 16, column_text = "article", column_summary = "highlights"):
    """Generate summaries for ``dataset`` batch by batch and score them with ``metric``.

    Args:
        dataset: Object indexable by column name; ``dataset[column_text]`` and
            ``dataset[column_summary]`` must support ``len()`` and slicing.
        metric: Object exposing ``add_batch(predictions=..., references=...)``
            and ``compute()``.
        model: Model exposing ``generate``; it is called with beam search
            (``num_beams=8``, ``length_penalty=0.8``, ``max_length=128``).
        tokenizer: Callable tokenizer that also provides ``decode``.
        batch_size: Number of examples per generation batch.
        column_text: Name of the column holding the input texts.
        column_summary: Name of the column holding the reference summaries.

    Returns:
        Whatever ``metric.compute()`` returns once every batch has been added.
    """
    article_batches = list(generate_batches(dataset[column_text], batch_size))
    target_batches = list(generate_batches(dataset[column_summary], batch_size))

    # Feed tensors to the device the model lives on so the function also works
    # when the caller has moved the model to a GPU; None leaves tensors untouched.
    device = getattr(model, "device", None)

    for article_batch, target_batch in tqdm(zip(article_batches, target_batches), total = len(article_batches)):
        # Pad only up to the longest item in the batch (still truncated at 1024 tokens).
        # Padding every batch to the full 1024 tokens makes generation needlessly slow;
        # the attention mask hides the padding either way.
        inputs = tokenizer(article_batch, max_length = 1024, truncation = True, padding = "longest", return_tensors = 'pt')
        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]
        if device is not None:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)

        summaries = model.generate(input_ids = input_ids, attention_mask = attention_mask, length_penalty = 0.8, num_beams = 8, max_length = 128)

        decoded_summaries = [tokenizer.decode(s, skip_special_tokens = True, clean_up_tokenization_spaces = True) for s in summaries]
        # The model marks line breaks with the literal "<n>" token; turn it into a space.
        # (Replacing the empty string, as before, inserted a space between every character.)
        decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]
        metric.add_batch(predictions = decoded_summaries, references = target_batch)

    score = metric.compute()
    return score

In [None]:
# Score the Pegasus checkpoint on the SAMSum test split with ROUGE.
rouge_metric = load_metric("rouge")
score = calculate_metric_on_test(
    dataset = data['test'],
    metric = rouge_metric,
    model = model_pegasus,
    tokenizer = tokenizer,
    batch_size = 4,
    column_text = 'dialogue',
    column_summary = 'summary',
)

  """Entry point for launching an IPython kernel.
  2%|████▌                                                                                                                                                                                        | 5/205 [1:28:40<58:45:11, 1057.56s/it]

###### 