In [1]:
#!pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q

In [2]:
#!pip install --upgrade accelerate
#!pip install transformers accelerate

In [4]:
import torch
from transformers import pipeline, set_seed, AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset, load_from_disk, load_metric
import matplotlib.pyplot as plt
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
from tqdm import tqdm

nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adiya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [6]:
device

'cpu'

In [7]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_ckpt = "google/pegasus-cnn_dailymail"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
data_files = {"train": "samsum-train.csv", "test": "samsum-test.csv", 'validation': 'samsum-validation.csv'}
dataset_samsum = load_dataset("summarizer-data", data_files=data_files)
dataset_samsum

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [9]:
split_lengths = [len(dataset_samsum[split]) for split in dataset_samsum]

print(f"Split lengths: {split_lengths}")
print(f"Features: {dataset_samsum['train'].column_names}")
print("\nDialogue:")

print(dataset_samsum["test"][1]["dialogue"])

print("\nSummary:")

print(dataset_samsum['test'][1]["summary"])

Split lengths: [14732, 819, 818]
Features: ['id', 'dialogue', 'summary']

Dialogue:
Eric: MACHINE!
Rob: That's so gr8!
Eric: I know! And shows how Americans see Russian ;)
Rob: And it's really funny!
Eric: I know! I especially like the train part!
Rob: Hahaha! No one talks to the machine like that!
Eric: Is this his only stand-up?
Rob: Idk. I'll check.
Eric: Sure.
Rob: Turns out no! There are some of his stand-ups on youtube.
Eric: Gr8! I'll watch them now!
Rob: Me too!
Eric: MACHINE!
Rob: MACHINE!
Eric: TTYL?
Rob: Sure :)

Summary:
Eric and Rob are going to watch a stand-up on youtube.


In [10]:
def convert_examples_to_features(example_batch):
    dialogues = example_batch['dialogue']
    summaries = example_batch['summary']
    
    dialogues = [str(dialogue) for dialogue in dialogues]
    summaries = [str(summary) for summary in summaries]
    
    input_encodings = tokenizer(dialogues, max_length=512, truncation=True, padding='max_length')
    
    with tokenizer.as_target_tokenizer():
        target_encodings = tokenizer(summaries, max_length=128, truncation=True, padding='max_length')

    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids']
    }

In [12]:
dataset_samsum_pt = dataset_samsum.map(convert_examples_to_features, batched=True)

In [13]:
dataset_samsum_pt["train"]

Dataset({
    features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 14732
})

In [14]:
from transformers import DataCollatorForSeq2Seq

seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)

In [16]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='pegasus_samsum',             
    num_train_epochs=1,
    per_device_train_batch_size=1,          
    per_device_eval_batch_size=1,            
    warmup_steps=500,                        
    weight_decay=0.01,
    logging_steps = 10,
    eval_strategy = 'steps',
    eval_steps = 500,
    gradient_accumulation_steps=16,           
    save_steps = 1e6                              
)

In [17]:
trainer = Trainer(
    model=model_pegasus, args=training_args,                      
    train_dataset=dataset_samsum_pt['test'], #test because of train size 
    eval_dataset=dataset_samsum_pt['validation'],
    data_collator=seq2seq_data_collator
)

In [18]:
try:
    trainer.train()
except Exception as e:
    print(f"Error: {e}")

Step,Training Loss,Validation Loss


Non-default generation parameters: {'max_length': 128, 'min_length': 32, 'num_beams': 8, 'length_penalty': 0.8, 'forced_eos_token_id': 1}


In [19]:
def generate_batch_sized_chunks(list_of_elements, batch_size):
    """split the dataset into smaller batches that we can process simultaneously
    Yield successive batch-sized chunks from list_of_elements."""
    for i in range(0, len(list_of_elements), batch_size):
        yield list_of_elements[i : i + batch_size]


def calculate_metric_on_test_ds(dataset, metric, model, tokenizer,
                                batch_size=16, device=device,
                                column_text='article',
                                column_summary='highlights'):
    article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

    for article_batch, target_batch in tqdm(
        zip(article_batches, target_batches), total=len(article_batches)):

        inputs = tokenizer(article_batch, max_length = 512, truncation=True,
                           padding='max_length', return_tensors='pt')

        summaries = model.generate(input_ids=inputs['input_ids'].to(device),
                                attention_mask=inputs['attention_mask'].to(device),
                                length_penalty=0.8, num_beams=8, max_length=128)
        ''' parameter for length penalty ensures that the model does not generate sequences that are too long. '''

        
        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
                                clean_up_tokenization_spaces=True)
                    for s in summaries]

        decoded_summaries = [d.replace("", " ") for d in decoded_summaries]


        metric.add_batch(predictions=decoded_summaries, references=target_batch)

    score = metric.compute()
    return score

In [21]:
rouge_names = ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
rouge_metric = load_metric('rouge', trust_remote_code=True)

In [22]:
score = calculate_metric_on_test_ds(dataset_samsum['test'][0:5], rouge_metric, 
                                    trainer.model, tokenizer, batch_size=2, 
                                    column_text = 'dialogue', column_summary = 'summary'
)
rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)

pd.DataFrame(rouge_dict, index = [f'pegasus'])

100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [01:23<00:00, 27.79s/it]


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.021488,0.0,0.021488,0.021937


In [25]:
model_pegasus.save_pretrained("pegasus-samsum-model")

Non-default generation parameters: {'max_length': 128, 'min_length': 32, 'num_beams': 8, 'length_penalty': 0.8, 'forced_eos_token_id': 1}


In [26]:
tokenizer.save_pretrained("tokenizer")

('tokenizer\\tokenizer_config.json',
 'tokenizer\\special_tokens_map.json',
 'tokenizer\\spiece.model',
 'tokenizer\\added_tokens.json',
 'tokenizer\\tokenizer.json')

In [37]:
tokenizer = AutoTokenizer.from_pretrained("tokenizer")

In [39]:
gen_kwargs = {"length_penalty": 0.8, "num_beams": 8, "max_length": 128}

sample_text = dataset_samsum["test"][0]["dialogue"]

reference = dataset_samsum["test"][0]["summary"]

pipe = pipeline('summarization', model = 'pegasus-samsum-model', tokenizer=tokenizer)

print('Dialogue:')
print(sample_text)

print('\nReference Summary:')
print(reference)

print('\nModel Summary:')
print(pipe(sample_text, **gen_kwargs)[0]['summary_text'])

Your max_length is set to 128, but your input_length is only 122. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=61)


Dialogue:
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye

Reference Summary:
Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.

Model Summary:
Amanda: Ask Larry Amanda: He called her last time we were at the park together .<n>Hannah: I'd rather you texted him .<n>Amanda: Just text him .
