# In [None]:
!pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q
# `evaluate` is imported below but is not part of the install line above; if it is missing, run: pip install evaluate
from transformers import pipeline, set_seed

import matplotlib.pyplot as plt

import pandas as pd
import evaluate
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import nltk
from nltk.tokenize import sent_tokenize

# Download the Punkt tokenizer data packages (presumably what sent_tokenize
# relies on -- confirm against the installed NLTK version).
for _resource in ("punkt", "punkt_tab"):
    nltk.download(_resource)
# Dataset : CNN_Dailymail
from datasets import load_dataset

dataset = load_dataset("cnn_dailymail", "3.0.0")
print("Features in cnn daily_mail : {}".format(dataset["train"].column_names))

# Preview the training record at index 1: its article (first 500 characters)
# followed by its reference summary, each with its total character length.
sample = dataset["train"][1]
print(
    "\nArticle (excerpt of 500 charachters, total length: {}):\n".format(
        len(sample["article"])
    )
)
print(sample["article"][:500])
print("\nSummary (length: {}):".format(len(sample["highlights"])))
print(sample["highlights"])
## Text Summarization Pipelines
# Shared input for every model below: the first 1000 characters of the
# article in the training record at index 1.
sample_text = dataset["train"][1]["article"][:1000]

# Maps a model name to the summary it generated for sample_text.
summaries = dict()
### Summarization Baseline
def baseline_summary_three_sent(text):
    """Return the first three sentences of ``text``, one sentence per line.

    Sentences are found with ``sent_tokenize``; when ``text`` has fewer than
    three sentences, all of them are returned.
    """
    leading_sentences = sent_tokenize(text)[:3]
    return "\n".join(leading_sentences)
# Store the three-sentence lead of sample_text as the 'baseline' entry.
summaries['baseline'] = baseline_summary_three_sent(sample_text)

# Bare expression: only displays the value when it ends a notebook cell.
summaries['baseline']
### gpt2 model
from transformers import pipeline, set_seed

# Seed before anything else so the generated continuation is reproducible
set_seed(42)

# Text-generation pipeline backed by GPT-2 medium
pipe = pipeline(task="text-generation", model="gpt2-medium")

# The article text followed by a "TL;DR:" cue on its own line
gpt2_query = f"{sample_text}\nTL;DR:\n"

# Generate with max_length set to 512
pipe_out = pipe(gpt2_query, max_length=512)


pipe_out
# Everything after the first len(gpt2_query) characters, i.e. past the prompt
pipe_out[0]["generated_text"][len(gpt2_query):]
summaries["gpt2"] = "\n".join(
    sent_tokenize(pipe_out[0]["generated_text"][len(gpt2_query):])
)
### T5
pipe = pipeline(task="summarization", model="t5-small")

pipe_out = pipe(sample_text)
pipe_out
# Store the summary with one sentence per line.
t5_sentences = sent_tokenize(pipe_out[0]["summary_text"])
summaries["t5"] = "\n".join(t5_sentences)
### BART
pipe = pipeline(task="summarization", model="facebook/bart-large-cnn")

pipe_out = pipe(sample_text)
pipe_out
# Store the summary with one sentence per line.
bart_sentences = sent_tokenize(pipe_out[0]["summary_text"])
summaries["bart"] = "\n".join(bart_sentences)
### PEGASUS
pipe = pipeline(task="summarization", model="google/pegasus-cnn_dailymail")

pipe_out = pipe(sample_text)
pipe_out
# Turn each " .<n>" sequence in the output into a full stop plus a line break.
pegasus_text = pipe_out[0]["summary_text"]
summaries["pegasus"] = pegasus_text.replace(" .<n>", ".\n")
## Comparing different Summaries
# Print the reference highlights first, then every collected summary under
# its upper-cased model name.
print("GROUND TRUTH")

print(dataset["train"][1]["highlights"])

for model_name, model_summary in summaries.items():
    print(model_name.upper())
    print(model_summary)

# SacreBLEU
# pip install sacrebleu
import evaluate
import numpy as np

# Load the BLEU metric (once; a second load only repeats the same work)
bleu_metric = evaluate.load('sacrebleu')

# Define the predictions and references
predictions = [summaries["pegasus"]]
references = [[dataset['train'][1]['highlights']]]  # Note: References should be a list of lists

# Compute the BLEU score
results = bleu_metric.compute(predictions=predictions, references=references, smooth_method='floor', smooth_value=0)

# Round the per-n-gram precisions in place. Writing them back under the
# existing 'precisions' key keeps a single row in the table below, instead of
# an unrounded 'precisions' row plus a second, rounded 'precision' row.
results['precisions'] = [np.round(p, 2) for p in results['precisions']]

# Convert results to a DataFrame (one row per result key)
bleu_results_df = pd.DataFrame.from_dict(results, orient='index', columns=['Value'])
bleu_results_df
# ROUGE
# pip install rouge_score
import pandas as pd
import evaluate

# Load the ROUGE metric (once; it is reused by the evaluation code further down)
rouge_metric = evaluate.load("rouge")

# List of ROUGE metrics to extract
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

# Reference summary
reference = dataset['train'][1]['highlights']

# One row per model: {"model": <name>, <rouge name>: <score>, ...}
records = []

# Compute ROUGE scores for all model summaries
for model_name in summaries:
    score = rouge_metric.compute(
        predictions=[summaries[model_name]], references=[reference]
    )

    # Keep only the requested ROUGE variants that the metric actually returned
    rouge_dict = {rn: score[rn] for rn in rouge_names if rn in score}

    print("ROUGE scores for model:", model_name, rouge_dict)

    records.append({"model": model_name, **rouge_dict})

# Convert results to a DataFrame indexed by model name
df = pd.DataFrame(records).set_index("model")

print(df)


### Evaluating on the TEST set of the CNN/DailyMail dataset
def calculate_metric_on_baseline_test_ds(dataset, metric, column_text='article', column_summary='highlights'):
    """Score the three-sentence baseline against the reference summaries.

    Each entry of ``dataset[column_text]`` is summarized with
    ``baseline_summary_three_sent``; the summaries and
    ``dataset[column_summary]`` are added to ``metric`` as one batch and the
    result of ``metric.compute()`` is returned.
    """
    baseline_predictions = list(map(baseline_summary_three_sent, dataset[column_text]))
    metric.add_batch(
        predictions=baseline_predictions,
        references=dataset[column_summary],
    )
    return metric.compute()
# Draw the 1000-example evaluation sample from the held-out *test* split.
# Sampling from 'train' would contradict both the variable name and the
# section heading, and would score models on text they may have been
# fine-tuned on.
test_sampled = dataset['test'].shuffle(seed=42).select(range(1000))

score = calculate_metric_on_baseline_test_ds(test_sampled, rouge_metric)

# Keep only the requested ROUGE variants that the metric actually returned
rouge_dict = {rn: score[rn] for rn in rouge_names if rn in score}

pd.DataFrame.from_dict(rouge_dict, orient='index', columns=['Value'])
### Strategy to calculate the ROUGE Metric on test dataset for the other Models
from tqdm import tqdm
import torch

# Device string: "cuda" when torch.cuda.is_available() is true, otherwise "cpu".
device = "cuda" if torch.cuda.is_available() else "cpu"
def generate_batch_sized_chunks(list_of_elements, batch_size):
    """Yield consecutive slices of ``list_of_elements``, each at most ``batch_size`` long.

    Slices are produced in order; the last one holds whatever remains and may
    be shorter than ``batch_size``.
    """
    total = len(list_of_elements)
    for start in range(0, total, batch_size):
        stop = start + batch_size
        yield list_of_elements[start:stop]

def calculate_metric_on_test_ds(dataset, metric, model, tokenizer,
                               batch_size, device=device,
                               column_text="article",
                               column_summary="highlights"):
    """Generate summaries batch by batch and score them against the references.

    Args:
        dataset: Object indexable by column name; ``dataset[column_text]`` and
            ``dataset[column_summary]`` must be sliceable sequences
            (presumably a Hugging Face ``Dataset`` -- confirm against caller).
        metric: Object providing ``add_batch(predictions=..., references=...)``
            and ``compute()``.
        model: Object providing ``generate(...)``. The inputs are moved to
            ``device``; the model itself is not moved here.
        tokenizer: Callable tokenizer that also provides ``batch_decode``.
        batch_size: Number of examples per generation batch.
        device: Device the input tensors are moved to before generation.
        column_text: Column holding the texts to summarize.
        column_summary: Column holding the reference summaries.

    Returns:
        Whatever ``metric.compute()`` returns once all batches were added.
    """
    article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

    for article_batch, target_batch in tqdm(zip(article_batches, target_batches), total=len(article_batches)):

        # Pad only up to the longest article in the batch (truncation still
        # caps it at 1024 tokens) instead of always padding to 1024; the
        # attention mask marks which positions are padding.
        inputs = tokenizer(article_batch, max_length=1024, truncation=True,
                           padding="longest", return_tensors="pt")

        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                                   attention_mask=inputs["attention_mask"].to(device),
                                   length_penalty=0.8, num_beams=8, max_length=128)

        decoded_summaries = tokenizer.batch_decode(summaries, skip_special_tokens=True)

        # Replace the literal "<n>" marker with a space (presumably the
        # PEGASUS line-break token -- confirm for other checkpoints).
        decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]

        metric.add_batch(predictions=decoded_summaries, references=target_batch)

    return metric.compute()

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_ckpt = "google/pegasus-cnn_dailymail"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

score = calculate_metric_on_test_ds(test_sampled, rouge_metric, model_pegasus, tokenizer, batch_size=2)

# Rebuild rouge_dict from the PEGASUS score just computed. Without this step
# the table below would show the values left in rouge_dict by the earlier
# baseline evaluation.
rouge_dict = {rn: score[rn] for rn in rouge_names if rn in score}

pd.DataFrame(rouge_dict, index=['pegasus'])
