In [9]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Trainer,
    TrainingArguments,
    Pipeline,
    pipeline
)
import pyarrow as pa
import pyarrow.parquet as pq
import torch
from rouge_score import rouge_scorer
import pandas as pd

In [2]:
# Read the local Parquet file into an in-memory pyarrow Table.
# NOTE(review): the file name suggests this is the test split; the evaluation
# cells in this notebook read its "article" and "highlights" columns.
dataset = pq.read_table('./data/test-00000-of-00001.parquet')

In [3]:
# The model weights and the tokenizer come from the same Hub checkpoint, so the
# identifier is spelled once and shared by both loaders.
FLAN_T5_CHECKPOINT = "PrekshaJoon/flan-t5-finetuned-summarization"

model = AutoModelForSeq2SeqLM.from_pretrained(FLAN_T5_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(FLAN_T5_CHECKPOINT)

In [43]:
def generate_summary(article, max_length=256, min_length=100, length_penalty=2.0, num_beams=16):
    """Summarize one article with the notebook-level ``model`` and ``tokenizer``.

    Args:
        article: Text to summarize. Coerced with ``str()`` before use.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        min_length: Forwarded to ``model.generate`` as ``min_length``.
        length_penalty: Forwarded to ``model.generate`` as ``length_penalty``.
        num_beams: Forwarded to ``model.generate`` as ``num_beams``.

    Returns:
        The first generated sequence decoded to a string, special tokens removed.
    """
    prompt = "summarize: " + str(article)

    # The prompt is truncated to at most 1024 tokens by the tokenizer.
    encoded = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True)

    # Keep the input tensors on the same device as the model so the call also
    # works when the model has been moved off the CPU.
    encoded = encoded.to(model.device)

    # Pass the attention mask explicitly instead of letting generate() infer it.
    summary_ids = model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=max_length,
        min_length=min_length,
        num_beams=num_beams,
        length_penalty=length_penalty,
        early_stopping=True,
    )

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


In [46]:
# Score generated summaries against the reference highlights with ROUGE-1/2/L.
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
results = []

article_column = dataset["article"]
highlight_column = dataset["highlights"]

# Row index 50 is the last one evaluated, i.e. at most 51 examples are scored.
for i in range(min(len(article_column), 51)):
    article = str(article_column[i])
    reference_summary = str(highlight_column[i])
    generated_summary = str(generate_summary(article))

    # Compare the generated text with the reference; only F-measures are kept.
    rouge_scores = scorer.score(reference_summary, generated_summary)

    record = {
        "article": article,
        "reference_summary": reference_summary,
        "generated_summary": generated_summary,
    }
    for metric in ("rouge1", "rouge2", "rougeL"):
        record[metric] = rouge_scores[metric].fmeasure
    results.append(record)

# NOTE(review): another evaluation cell in this notebook writes to the same CSV
# path, so whichever cell runs last overwrites the other's results.
results_df = pd.DataFrame(results)
results_df.to_csv("rouge_scores_flanT5_finetuned.csv", index=False)

print(results_df.head())

                                             article  \
0  (CNN)The Palestinian Authority officially beca...   
1  (CNN)Never mind cats having nine lives. A stra...   
2  (CNN)If you've been following the news lately,...   
3  (CNN)Five Americans who were monitored for thr...   
4  (CNN)A Duke student has admitted to hanging a ...   

                                   reference_summary  \
0  Membership gives the ICC jurisdiction over all...   
1  Theia, a bully breed mix, was apparently hit b...   
2  Mohammad Javad Zarif has spent more time with ...   
3  17 Americans were exposed to the Ebola virus w...   
4  Student is no longer on Duke University campus...   

                                   generated_summary    rouge1    rouge2  \
0  The Palestinian Authority officially became th...  0.323529  0.194030   
1  A stray pooch in Washington State has used up ...  0.413793  0.175439   
2  If you've been following the news lately, ther...  0.250000  0.106667   
3  Five Americans who 

In [4]:
def generate_summary(article, max_length=128, num_beams=4, max_input_length=512):
    """Summarize one article with the notebook-level ``model`` and ``tokenizer``.

    NOTE(review): this notebook contains another ``generate_summary`` definition
    with different decoding settings; whichever cell was executed last is the
    one in effect.

    Args:
        article: Text to summarize. Coerced with ``str()`` before use.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        num_beams: Forwarded to ``model.generate`` as ``num_beams``.
        max_input_length: Token limit the tokenizer truncates the prompt to.

    Returns:
        The first generated sequence decoded to a string, special tokens removed.
    """
    prompt = "summarize: " + str(article)
    encoded = tokenizer(
        prompt,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
    )

    # Keep the input tensors on the same device as the model so the call also
    # works when the model has been moved off the CPU.
    encoded = encoded.to(model.device)

    # Pass the attention mask explicitly instead of letting generate() infer it.
    summary_ids = model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [6]:
# Score generated summaries against the reference highlights with ROUGE-1/2/L.
metric_names = ("rouge1", "rouge2", "rougeL")
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
results = []

# Walk the two columns in lockstep; the loop stops after the row at index 50.
for i, (article_cell, highlight_cell) in enumerate(zip(dataset["article"], dataset["highlights"])):
    article = str(article_cell)
    reference_summary = str(highlight_cell)
    generated_summary = str(generate_summary(article))

    # Compare the generated text with the reference; only F-measures are kept.
    rouge_scores = scorer.score(reference_summary, generated_summary)

    results.append({
        "article": article,
        "reference_summary": reference_summary,
        "generated_summary": generated_summary,
        **{metric: rouge_scores[metric].fmeasure for metric in metric_names},
    })

    if i == 50:
        break

# NOTE(review): another evaluation cell in this notebook writes to the same CSV
# path, so whichever cell runs last overwrites the other's results.
results_df = pd.DataFrame(results)
results_df.to_csv("rouge_scores_flanT5_finetuned.csv", index=False)

print(results_df.head())

                                             article  \
0  (CNN)The Palestinian Authority officially beca...   
1  (CNN)Never mind cats having nine lives. A stra...   
2  (CNN)If you've been following the news lately,...   
3  (CNN)Five Americans who were monitored for thr...   
4  (CNN)A Duke student has admitted to hanging a ...   

                                   reference_summary  \
0  Membership gives the ICC jurisdiction over all...   
1  Theia, a bully breed mix, was apparently hit b...   
2  Mohammad Javad Zarif has spent more time with ...   
3  17 Americans were exposed to the Ebola virus w...   
4  Student is no longer on Duke University campus...   

                                   generated_summary    rouge1    rouge2  \
0  ICC treaty, paving the way for war crimes inve...  0.232558  0.097561   
1  A stray pooch in Washington State, was buried ...  0.276923  0.095238   
2  Mohammad Javad Zarif a hero's welcome as he ar...  0.230769  0.080000   
3  Five Americans who 

In [19]:
# ROUGE-1/2/L scorer (with stemming) used to grade the generated summaries.
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

# Summarization pipeline backed by the facebook/bart-large-cnn checkpoint.
summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")

# Function to summarize article with truncation
def summarize_article(article, max_input_length=1024, max_output_length=130, min_output_length=30):
    """Summarize ``article`` with the notebook-level ``summarizer`` pipeline.

    Args:
        article: Text to summarize.
        max_input_length: Maximum number of whitespace-separated words kept
            from the article before it is handed to the pipeline.
        max_output_length: Forwarded to the pipeline as ``max_length``.
        min_output_length: Forwarded to the pipeline as ``min_length``.

    Returns:
        The ``"summary_text"`` field of the first pipeline result.
    """
    # Cheap pre-trim on whitespace words.
    words = article.split()
    if len(words) > max_input_length:
        article = " ".join(words[:max_input_length])

    # A word cap does not bound the number of sub-word tokens, so the trimmed
    # text can still be longer than the model accepts. Ask the pipeline's
    # tokenizer to truncate as well so long articles no longer raise.
    outputs = summarizer(
        article,
        max_length=max_output_length,
        min_length=min_output_length,
        do_sample=False,
        truncation=True,
    )
    return outputs[0]["summary_text"]

# Initialize results storage
results = []
failed_indices = []  # rows whose summarization raised and were skipped
count = 0

# Optional: cap the number of articles visited while testing.
max_articles = 75

# Look the columns up once instead of on every iteration.
article_column = dataset["article"]
highlight_column = dataset["highlights"]

# Process each article in the dataset. The cap is enforced by the loop bound so
# that a failure on the last article cannot skip the stop condition and let the
# loop run over the whole dataset.
for i in range(min(len(article_column), max_articles)):
    article = str(article_column[i])
    reference_summary = str(highlight_column[i])
    count += 1

    # Generate summary for the article; stay best-effort but report failures
    # instead of dropping them silently.
    try:
        generated_summary = summarize_article(article)
    except Exception as e:
        failed_indices.append(i)
        print(f"Skipping article {i}: {type(e).__name__}: {e}")
        continue

    # Calculate ROUGE scores
    rouge_scores = scorer.score(reference_summary, generated_summary)

    # Store results
    results.append({
        "article": article,
        "reference_summary": reference_summary,
        "generated_summary": generated_summary,
        "rouge1": rouge_scores["rouge1"].fmeasure,
        "rouge2": rouge_scores["rouge2"].fmeasure,
        "rougeL": rouge_scores["rougeL"].fmeasure
    })

if failed_indices:
    print(f"{len(failed_indices)} of {count} articles could not be summarized: {failed_indices}")

# Save results to a DataFrame
results_df = pd.DataFrame(results)
results_df.to_csv("bart_large_summaries.csv", index=False)

# Display the first few rows
print(results_df.head())

Your max_length is set to 130, but your input_length is only 122. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=61)


                                             article  \
0  (CNN)The Palestinian Authority officially beca...   
1  (CNN)Never mind cats having nine lives. A stra...   
2  (CNN)If you've been following the news lately,...   
3  (CNN)Five Americans who were monitored for thr...   
4  (CNN)A Duke student has admitted to hanging a ...   

                                   reference_summary  \
0  Membership gives the ICC jurisdiction over all...   
1  Theia, a bully breed mix, was apparently hit b...   
2  Mohammad Javad Zarif has spent more time with ...   
3  17 Americans were exposed to the Ebola virus w...   
4  Student is no longer on Duke University campus...   

                                   generated_summary    rouge1    rouge2  \
0  The Palestinian Authority becomes the 123rd me...  0.535211  0.376812   
1  Theia, a one-year-old bully breed mix, was hit...  0.488372  0.285714   
2  Mohammad Javad Zarif is the Iranian foreign mi...  0.400000  0.205882   
3  The five were expos