In [1]:
#imports
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BartForConditionalGeneration, BartTokenizer
from transformers import Trainer
import pandas as pd
from datasets import load_dataset
from tqdm import tqdm

# import the metrics
from rouge_score import rouge_scorer
from rouge_score.scoring import Score
from bert_score import BERTScorer
from readability import Readability
#from evaluation.readability import flesch_kincaid_grade_level, dale_chall_readability_score, coleman_liau_index, lens

import json

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


# Evaluation

## Metrics
- **Relevance**: ROUGE (1, 2, and L) and BERTScore
- **Readability**: Flesch-Kincaid Grade Level (FKGL), Dale-Chall Readability Score (DCRS), Coleman-Liau Index (CLI), and LENS
- **Factuality**: AlignScore, SummaC

In [2]:
# Load the SAMSum dataset; later cells index it by split (dataset['test'])
# and read the 'dialogue' and 'summary' fields of each row.
dataset = load_dataset("Samsung/samsum", "samsum", trust_remote_code=True)

In [3]:
# Load the base BART checkpoint and its tokenizer.
# NOTE(review): `model` and `model_name` are both reassigned by the next cell
# (which loads from models/samsum_bart_base), so only `tokenizer` from this
# cell is still in use when generate_summaries is called.
model_name = "facebook/bart-base"
model = BartForConditionalGeneration.from_pretrained(model_name)
tokenizer = BartTokenizer.from_pretrained(model_name)



In [4]:
# Load our first fine-tuned model from models/<model_name> and move it to the
# GPU when one is available (CPU otherwise).
# `model_name` is reused further down to build the metrics output folder.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "samsum_bart_base"
model = BartForConditionalGeneration.from_pretrained(f"models/{model_name}")
# As the last expression of the cell, this also makes the notebook display the
# full module repr.
model.to(device)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=

In [5]:
# Read the previously saved summaries for the fine-tuned model back from disk.
# Later cells index the result by 'generated_summaries', 'summary' and 'dialogue'.
with open("models/samsum_bart_base/summaries.json", "r") as f:
    summaries = json.loads(f.read())

In [7]:
import numpy as np

def generate_summaries(model, tokenizer, dataset, limit: int = None, batch_size: int = 8) -> np.ndarray:
    """Generate one beam-search summary per dialogue, in dataset order.

    Args:
        model: Seq2seq model exposing ``eval()``, ``device`` and ``generate()``.
        tokenizer: Tokenizer matching ``model``. It is called on a list of
            strings and used to decode the generated token ids.
        dataset: Indexable collection whose rows provide a ``"dialogue"`` field.
        limit: Maximum number of rows to summarise. ``None`` means every row;
            values above ``len(dataset)`` are clamped and negative values are
            treated as 0.
        batch_size: Number of dialogues sent to ``generate()`` at once (>= 1).

    Returns:
        Object-dtype array holding the decoded summary for each processed row.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    total = len(dataset) if limit is None else max(0, min(limit, len(dataset)))
    summaries = np.empty(total, dtype=object)

    # Mixed precision is only meaningful on CUDA; keep autocast disabled
    # elsewhere instead of requesting a CUDA context on a CPU-only machine.
    use_amp = model.device.type == "cuda"

    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():  # Disable gradient calculations
        for start_idx in tqdm(range(0, total, batch_size), desc="Generating summaries"):
            end_idx = min(start_idx + batch_size, total)
            dialogues = [dataset[i]["dialogue"] for i in range(start_idx, end_idx)]

            # Tokenize batch (padded to the longest dialogue, truncated at 512 tokens)
            inputs = tokenizer(
                dialogues,
                return_tensors="pt",
                max_length=512,
                truncation=True,
                padding=True,
            ).to(model.device)

            # Generate summaries for the batch. The attention mask is passed
            # explicitly so the padding added above is not attended to.
            with torch.amp.autocast("cuda", enabled=use_amp):
                predicted_summaries = model.generate(
                    input_ids=inputs.input_ids,
                    attention_mask=inputs.attention_mask,
                    max_length=150,
                    num_beams=4,
                    early_stopping=True,
                )

            # Decode the whole batch at once and store it at the matching positions
            summaries[start_idx:end_idx] = tokenizer.batch_decode(
                predicted_summaries, skip_special_tokens=True
            )

    return summaries

In [8]:
# NOTE(review): prompt_template is not referenced by any other visible cell —
# presumably a leftover; confirm before removing.
prompt_template = "{article}"
# limit=1 restricts generation to the first test dialogue (quick smoke test).
generated_summaries = generate_summaries(model, tokenizer, dataset['test'], limit=1, batch_size=32)

Generating summaries: 100%|██████████| 1/1 [00:01<00:00,  1.17s/it]


In [10]:
# Show the first test example: source dialogue, reference summary and the
# summary generated above, for a quick visual comparison.
print(f"Dialogue: {dataset['test'][0]['dialogue']}")
print(f"True Summary: {dataset['test'][0]['summary']}")
print(f"Generated Summary: {generated_summaries[0]}")

Dialogue: Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye
True Summary: Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.
Generated Summary: Hannah is looking for Betty's number. Amanda can't find it. Larry called her last time they were at the park together.


In [12]:
# Report every generated summary that has fewer than 100 whitespace-separated words.
for summary in generated_summaries:
    summary_length = len(summary.split())
    if summary_length >= 100:
        continue  # long enough, nothing to report
    print(f"{summary_length} words => Summary {summary} is too short.")
    

95 words => Summary Trypanosomatids are parasites that cause African sleeping sickness, Nagana cattle disease, South-American Chagas’ disease and leishmaniasis. These parasites are caused by a protein called trypanothione, which is found in the mitochondrion of the Trypanosome. This protein is made up of a group of enzymes called thioredoxin reductases, which are found in many different types of cells. The enzymes that are involved in this process are known as tryparedoxin and Tpx. However, it was not clear how the two enzymes work together. To investigate this question, Manta et al. used a technique called “tryparedoxin redox reductase� is too short.


In [14]:
# Take the first stored example from the summaries loaded from disk.
gs = summaries['generated_summaries'][0]  # generated summary
ts = summaries['summary'][0]              # reference ("true") summary
t = summaries['dialogue'][0]              # source dialogue

rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Three pairwise comparisons. The first argument is presumably treated as the
# reference and the second as the prediction — confirm against rouge_score docs.
score1 = rouge.score(t, gs)   # dialogue vs. generated summary
score2 = rouge.score(ts, gs)  # reference summary vs. generated summary
score3 = rouge.score(t, ts)   # dialogue vs. reference summary

In [19]:
# Print only the ROUGE-1 entry of each of the three comparisons.
for s in [score1, score2, score3]:
    print(s['rouge1'])

Score(precision=0.8260869565217391, recall=0.24358974358974358, fmeasure=0.37623762376237624)
Score(precision=0.34782608695652173, recall=0.5, fmeasure=0.41025641025641024)
Score(precision=0.5625, recall=0.11538461538461539, fmeasure=0.19148936170212766)


In [17]:
#compute all metrics for the test set and save them to a json file
#the dataset is a list of dicts with 'article', 'summary', 'section_headings', 'keywords', 'year', 'title'
#add a progress bar
from transformers import PreTrainedModel

def compute_metrics(summaries: list[str], dataset, min_words: int = 100):
    """Compute relevance and readability metrics for generated summaries.

    Args:
        summaries: Generated summaries; ``summaries[i]`` is scored against
            ``dataset[i]["summary"]``.
        dataset: Indexable collection whose rows provide a ``"summary"`` field
            (the reference summary). No other field is read.
        min_words: Summaries with fewer whitespace-separated words than this are
            skipped entirely. The default of 100 reproduces the previous
            hard-coded cut-off.

    Returns:
        Dict of the form
        ``{"relevance": {"rouge": [...], "bert": [...]},
        "readability": {"fkgl": [...], "dcrs": [...], "cli": [...]}}``.
        All five lists have one entry per summary that was not skipped, in
        input order. Readability entries are ``None`` for summaries shorter
        than 100 words, which can only happen when ``min_words`` is lowered.
    """
    # The readability package is understood to reject texts under 100 words
    # (see its documentation), so those scores are only computed above that size.
    readability_min_words = 100

    metrics = {"relevance": {"rouge": [], "bert": []}, "readability": {"fkgl": [], "dcrs": [], "cli": []}}

    bert = BERTScorer(lang="en", model_type="bert-base-uncased")
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    for i in tqdm(range(len(summaries)), desc="Computing metrics"):
        # Only the reference summary is needed. The source text is not used by
        # any metric here, so it is not looked up (its field name differs
        # between datasets).
        expert_summary = dataset[i]["summary"]
        predicted_summary = summaries[i]

        word_count = len(predicted_summary.split())
        if word_count < min_words:
            continue

        # compute the relevance scores
        metrics["relevance"]["rouge"].append(rouge.score(expert_summary, predicted_summary))
        # NOTE(review): index 0 of the BERTScorer result is stored; the scorer
        # presumably returns (precision, recall, F1), so this is precision
        # rather than F1 — confirm this is the intended value.
        metrics["relevance"]["bert"].append(bert.score([predicted_summary], [expert_summary])[0].item())

        # compute the readability scores
        if word_count >= readability_min_words:
            r = Readability(predicted_summary)
            fkgl = r.flesch_kincaid().score
            dcrs = r.dale_chall().score
            cli = r.coleman_liau().score
        else:
            # Keep the lists index-aligned with the relevance scores.
            fkgl = dcrs = cli = None
        metrics["readability"]["fkgl"].append(fkgl)
        metrics["readability"]["dcrs"].append(dcrs)
        metrics["readability"]["cli"].append(cli)

    return metrics

In [10]:
# Fetch the NLTK 'punkt_tab' tokenizer data (no-op when already up to date).
# Presumably required by the readability package's tokenisation — TODO confirm.
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\theav\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [15]:
def exclude_summaries_below_word_thershold(generated_summaries: np.ndarray, threshold: int = 100) -> np.ndarray:
    """Drop summaries that contain fewer than ``threshold`` words.

    Words are counted by splitting on whitespace, the same way the rest of the
    notebook measures summary length. A summary with exactly ``threshold``
    words is kept.

    Args:
        generated_summaries: Array (or any sequence) of summary strings.
            Entries that are not strings count as zero words.
        threshold: Minimum number of words a summary needs to be kept.

    Returns:
        Object-dtype array with the retained summaries, in their original order.
    """
    summaries_arr = np.asarray(generated_summaries, dtype=object)

    # ndarray has no .split(); count the words of each summary individually
    # and use the result as a boolean mask.
    word_counts = np.array(
        [len(s.split()) if isinstance(s, str) else 0 for s in summaries_arr],
        dtype=int,
    )

    return summaries_arr[word_counts >= threshold]

In [18]:
# Score the summaries generated above against the reference summaries of the
# test split; `generated_summaries[i]` is paired with `dataset['test'][i]`.
metrics = compute_metrics(generated_summaries, dataset['test'])

Computing metrics: 100%|██████████| 241/241 [00:17<00:00, 13.48it/s]


In [19]:
# Inspect the ROUGE entry of the first summary that was scored.
print(metrics['relevance']['rouge'][0])

{'rouge1': Score(precision=0.5631067961165048, recall=0.31868131868131866, fmeasure=0.4070175438596491), 'rouge2': Score(precision=0.14634146341463414, recall=0.08264462809917356, fmeasure=0.10563380281690142), 'rougeL': Score(precision=0.2912621359223301, recall=0.16483516483516483, fmeasure=0.21052631578947367)}


In [34]:
# Save the metrics as JSON next to the model (models/<model_name>/metrics.json).
# NOTE(review): the ROUGE entries are Score objects; if they are namedtuples,
# json will write them as plain [precision, recall, fmeasure] arrays without
# field names — confirm that is the format downstream readers expect.
model_folder = f"models/{model_name}"

with open(f"{model_folder}/metrics.json", "w") as f:
    json.dump(metrics, f, indent=4)