In [1]:
!pip install rouge_score
!pip install datasets



In [2]:
from transformers import BartForConditionalGeneration, BartTokenizer
from rouge_score import rouge_scorer
from datasets import load_metric
import numpy as np
import pandas as pd

# Load the BART model and tokenizer
model_name = 'facebook/bart-large-cnn'
model = BartForConditionalGeneration.from_pretrained(model_name)
tokenizer = BartTokenizer.from_pretrained(model_name)

# Function to generate summary using different prompts
def generate_summary(prompt, text, max_length=100):
    input_text = prompt + text
    inputs = tokenizer([input_text], max_length=1024, return_tensors='pt', truncation=True)
    summary_ids = model.generate(inputs['input_ids'], max_length=max_length, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Function to evaluate the generated summary using ROUGE and BLEU
def evaluate_summaries(reference_summary, generated_summary):
    rouge_scorer_obj = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = rouge_scorer_obj.score(reference_summary, generated_summary)

    bleu_metric = load_metric("bleu")
    bleu_scores = bleu_metric.compute(predictions=[generated_summary.split()], references=[[reference_summary.split()]])

    return rouge_scores, bleu_scores['bleu']

# Example dataset: a small sample for illustration purposes
dataset = pd.DataFrame({
    'text': [
        "The quick brown fox jumps over the lazy dog. The dog was not happy about it and chased the fox.",
        "Artificial intelligence is rapidly evolving. It is transforming industries and the way we live and work."
    ],
    'summary': [
        "A fox jumps over a dog, who then chases it.",
        "AI is evolving rapidly, transforming industries and daily life."
    ]
})

# List of prompt designs
prompts = {
    'Basic Prompt': "Summarize the following text: ",
    'Guided Prompt': "In 3 sentences, summarize the key points of the following text: ",
    'Role-Playing Prompt': "You are a news editor. Summarize the following article for your readers: ",
    'Question-Driven Prompt': "What are the most important takeaways from the following text? Summarize them: ",
    'Contextual Prompt': "Based on current events, summarize the following text: ",
    'Instructive Prompt': "Provide a concise summary of the following text, highlighting the key aspects: ",
    'Comparative Prompt': "Compare and contrast the following text and summarize the main differences and similarities: "
}

# Dictionary to store results
results = {}

# Generate summaries and evaluate them
for prompt_name, prompt in prompts.items():
    print(f"Evaluating {prompt_name}...")
    generated_summaries = []
    rouge_scores_agg = {'rouge1': [], 'rouge2': [], 'rougeL': []}
    bleu_scores_agg = []

    for i, row in dataset.iterrows():
        text = row['text']
        reference_summary = row['summary']

        # Generate summary using the current prompt
        generated_summary = generate_summary(prompt, text)
        generated_summaries.append(generated_summary)

        # Evaluate summary using ROUGE and BLEU
        rouge_scores, bleu_score = evaluate_summaries(reference_summary, generated_summary)

        # Collect ROUGE scores
        for metric in rouge_scores_agg:
            rouge_scores_agg[metric].append(rouge_scores[metric].fmeasure)

        # Collect BLEU score
        bleu_scores_agg.append(bleu_score)

    # Store average ROUGE and BLEU scores
    results[prompt_name] = {
        metric: np.mean(rouge_scores_agg[metric]) for metric in rouge_scores_agg
    }
    results[prompt_name]['BLEU'] = np.mean(bleu_scores_agg)

# Display the results
print("\nEvaluation Results (ROUGE and BLEU Scores):")
for prompt_name, scores in results.items():
    print(f"\n{prompt_name}:")
    for metric, score in scores.items():
        print(f"  {metric.upper()}: {score:.4f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Evaluating Basic Prompt...


  bleu_metric = load_metric("bleu")


Downloading builder script:   0%|          | 0.00/2.48k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

The repository for bleu contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/bleu.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y
Evaluating Guided Prompt...
Evaluating Role-Playing Prompt...
Evaluating Question-Driven Prompt...
Evaluating Contextual Prompt...
Evaluating Instructive Prompt...
Evaluating Comparative Prompt...

Evaluation Results (ROUGE and BLEU Scores):

Basic Prompt:
  ROUGE1: 0.2221
  ROUGE2: 0.0708
  ROUGEL: 0.1879
  BLEU: 0.0000

Guided Prompt:
  ROUGE1: 0.1935
  ROUGE2: 0.0833
  ROUGEL: 0.1774
  BLEU: 0.0000

Role-Playing Prompt:
  ROUGE1: 0.2415
  ROUGE2: 0.0715
  ROUGEL: 0.1725
  BLEU: 0.0000

Question-Driven Prompt:
  ROUGE1: 0.2433
  ROUGE2: 0.1074
  ROUGEL: 0.1820
  BLEU: 0.0000

Contextual Prompt:
  ROUGE1: 0.2284
  ROUGE2: 0.1164
  ROUGEL: 0.1800
  BLEU: 0.0000

Instructive Prompt:
  ROUGE1