In [1]:
import bert_score
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
import matplotlib.pyplot as plt
import os
import pandas as pd
import pprint
from sklearn import metrics
import urllib

from deepeval.models import AmazonBedrockModel

import boto3
from pydantic_ai.models.bedrock import BedrockConverseModel
from pydantic_evals import Dataset, Case
from pydantic_evals.evaluators import LLMJudge
from pydantic_ai.settings import ModelSettings

from ragas.metrics import SummarizationScore
from ragas import EvaluationDataset, evaluate as ragas_evaluate
from langchain_aws import ChatBedrockConverse, BedrockEmbeddings
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper

  from ragas.metrics import SummarizationScore


In [2]:
import nest_asyncio
nest_asyncio.apply()

In [3]:
settings={}
settings["n_prompts"] = 50
settings["region"] = "eu-west-2"
settings["temperatures"] = [0, 0.1, 0.2, 0.3]
settings["threshold"] = 0.7

In [4]:
models = {}

models['Claude 3 Haiku'] = 'anthropic.claude-3-haiku-20240307-v1:0'
#models['Claude 3 Sonnet'] = 'anthropic.claude-3-sonnet-20240229-v1:0'
models['Claude 3.7 Sonnet'] = 'anthropic.claude-3-7-sonnet-20250219-v1:0'
#models['GPT OSS 20b'] = "openai.gpt-oss-20b-1:0"
#models['GPT OSS 120b'] = "openai.gpt-oss-120b-1:0"

## Set Environment Variables

In [5]:
os.environ["AWS_DEFAULT_REGION"] = settings["region"]

## Get Data

In [6]:
datasets = [
    'https://huggingface.co/datasets/EdinburghNLP/xsum/resolve/main/data/validation-00000-of-00001.parquet',
    'https://raw.githubusercontent.com/sylinrl/TruthfulQA/main/data/v1/TruthfulQA.csv'
   ]

In [7]:
for directory in ['data', 'output']:
    if not os.path.exists(directory):
        os.makedirs(directory)
    
for dataset in datasets:
    print(dataset)
    filename=dataset.split('/')[-1]
    if not os.path.isfile(f'data/{filename}'):
        print(f'downloading {filename}')
        urllib.request.urlretrieve(dataset, f'data/{filename}')

https://huggingface.co/datasets/EdinburghNLP/xsum/resolve/main/data/validation-00000-of-00001.parquet
https://raw.githubusercontent.com/sylinrl/TruthfulQA/main/data/v1/TruthfulQA.csv


## Summary Data

In [8]:
summary_df = pd.read_parquet('data/validation-00000-of-00001.parquet')
summary_df

Unnamed: 0,document,summary,id
0,The ex-Reading defender denied fraudulent trad...,Former Premier League footballer Sam Sodje has...,38295789
1,Voges was forced to retire hurt on 86 after su...,Middlesex batsman Adam Voges will be out until...,40202028
2,Seven photographs taken in the Norfolk country...,The Duchess of Cambridge will feature on the c...,36177725
3,"Chris Poole - known as ""moot"" online - created...",Google has hired the creator of one of the web...,35751255
4,Four police officers were injured in the incid...,Two teenagers have been charged in connection ...,35275743
...,...,...,...
11327,Phyllida Lloyd will direct revivals of Julius ...,London's Donmar Warehouse is to open a tempora...,36376969
11328,The decision came after the governor of Rio de...,The government in Brazil says it will send fed...,26691258
11329,The deal will create the second largest cable ...,The US Department of Justice (DoJ) has approve...,36133259
11330,Third seed Murray beat Czech eighth seed Tomas...,Britain's Andy Murray will face world number o...,32178027


In [9]:
summary_correct_df = summary_df.copy()
summary_correct_df['is_correct'] = True

summary_incorrect_df = summary_df.copy()
shuffled_summary_df = summary_df["summary"].sample(frac=1).reset_index(drop=True)
summary_df.drop(columns=["summary"]).join(shuffled_summary_df)
summary_incorrect_df['is_correct'] = False

summary_evaluate_df = pd.concat([summary_correct_df, summary_incorrect_df])
summary_evaluate_df.reset_index(drop=True)
summary_evaluate_df

Unnamed: 0,document,summary,id,is_correct
0,The ex-Reading defender denied fraudulent trad...,Former Premier League footballer Sam Sodje has...,38295789,True
1,Voges was forced to retire hurt on 86 after su...,Middlesex batsman Adam Voges will be out until...,40202028,True
2,Seven photographs taken in the Norfolk country...,The Duchess of Cambridge will feature on the c...,36177725,True
3,"Chris Poole - known as ""moot"" online - created...",Google has hired the creator of one of the web...,35751255,True
4,Four police officers were injured in the incid...,Two teenagers have been charged in connection ...,35275743,True
...,...,...,...,...
11327,Phyllida Lloyd will direct revivals of Julius ...,London's Donmar Warehouse is to open a tempora...,36376969,False
11328,The decision came after the governor of Rio de...,The government in Brazil says it will send fed...,26691258,False
11329,The deal will create the second largest cable ...,The US Department of Justice (DoJ) has approve...,36133259,False
11330,Third seed Murray beat Czech eighth seed Tomas...,Britain's Andy Murray will face world number o...,32178027,False


In [10]:
settings["model_id"]='anthropic.claude-3-haiku-20240307-v1:0'
settings["model_name"]='Claude 3 Haiku'
settings["temperature"] = 0

row = summary_evaluate_df.head(1).to_dict('records')[0]
row

{'document': 'The ex-Reading defender denied fraudulent trading charges relating to the Sodje Sports Foundation - a charity to raise money for Nigerian sport.\nMr Sodje, 37, is jointly charged with elder brothers Efe, 44, Bright, 50 and Stephen, 42.\nAppearing at the Old Bailey earlier, all four denied the offence.\nThe charge relates to offences which allegedly took place between 2008 and 2014.\nSam, from Kent, Efe and Bright, of Greater Manchester, and Stephen, from Bexley, are due to stand trial in July.\nThey were all released on bail.',
 'summary': 'Former Premier League footballer Sam Sodje has appeared in court alongside three brothers accused of charity fraud.',
 'id': '38295789',
 'is_correct': True}

## DeepEval

In [11]:
def connect_to_bedrock(settings):
    bedrock_model = AmazonBedrockModel(
        model=settings["model_id"],
        region=settings["region"],
        generation_kwargs={
            "temperature": settings["temperature"],
            "maxTokens": 2048
        }
    )
    return bedrock_model
#bedrock_model = connect_to_bedrock(settings)

In [12]:
def evaluate_summary_deepeval(bedrock_model, document: str, summary: str, settings: dict):
    
    summarisation_metric = GEval(
        name="Summarization Quality",
        criteria="Summarization Quality: Evaluate if the summary is a concise, faithful, and coherent representation of the source document.",
        evaluation_params=[
            LLMTestCaseParams.INPUT, 
            LLMTestCaseParams.ACTUAL_OUTPUT
        ],
        model=bedrock_model,
        threshold=settings["threshold"],
        async_mode=False
    )

    test_case = LLMTestCase(
        input=document,
        actual_output=summary
    )
    
    summarisation_metric.measure(test_case)

    return {
        'function': 'Summary', 
        'method': 'Deepeval',
        'score': summarisation_metric.measure(test_case),
        'passed': summarisation_metric.is_successful()
    }


#evaluation = evaluate_summary_deepeval(bedrock_model, row["document"], row["summary"], settings)
#evaluation

## Pydantic

In [13]:
def evaluate_summary_pydantic(document: str, summary: str, settings: dict): 
    judge_llm = BedrockConverseModel(settings["model_id"])

    judge = LLMJudge(
        model=judge_llm,
        rubric="""
        Summarization Quality: Evaluate if the summary is a concise, faithful, and coherent representation of the source document.

        Score the summary on the following criteria:
        1. Faithfulness: Does the summary contain only information present in the original? Penalise hallucinations.
        2. Coverage: Does the summary capture the key points of the original?
        3. Conciseness: Is the summary meaningfully shorter without being too terse?
        
        A high score (close to 1.0) means the summary is faithful, covers key points, and is concise.
        A low score (close to 0.0) means the summary contains hallucinations, misses key points, or is poorly written.
        """,
        score={'evaluation_name': 'SummaryQualityScore'},
        model_settings=ModelSettings(
            temperature=settings["temperature"],
            max_tokens=2048
        ),
        include_input=True,
        include_expected_output=False,
    )

    dataset = Dataset(
        cases=[Case(inputs=document)],
        evaluators=[judge],
    )

    report = dataset.evaluate_sync(lambda x: summary)
    score = report.cases[0].scores['SummaryQualityScore'].value
    reason = report.cases[0].assertions.get('LLMJudge_pass').reason

    return {
        'function': 'Summary',
        'method': 'Pydantic',
        'score': score,
        'reason': reason,
        'passed': bool(score >= settings["threshold"])
    }

#evaluation = evaluate_summary_pydantic(row["document"], row["summary"], settings)
#evaluation

## Ragas

In [15]:
def evaluate_summary_ragas(document: str, summary: str, settings: dict):
    langchain_llm = ChatBedrockConverse(
        model_id=settings["model_id"],
        region_name=settings["region"],
        temperature=settings["temperature"]
    )
    
    ragas_llm = LangchainLLMWrapper(langchain_llm)

    metric = SummarizationScore(llm=ragas_llm)

    dataset = EvaluationDataset.from_list([{
        "user_input": document,
        "response": summary,
        "reference_contexts": [document]
    }])

    score = ragas_evaluate(dataset=dataset, metrics=[metric])["summary_score"][0]
    
    return {
        'function': 'Summary',
        'method': 'Ragas',
        'score': score,
        'passed': bool(score >= settings["threshold"])
    }

#evaluation = evaluate_summary_ragas(row["document"], row["summary"], settings)
#evaluation

## Bert Score

In [17]:
def evaluate_summary_bert_score(document, summary, settings):
    model = "distilbert-base-uncased"

    p, r, f1 = bert_score.score(
        [document],
        [summary],
        model_type=model,
        verbose=False
    )

    return {
        'function': 'Summary',
        'method': 'Bert Score',
        'model_name': "Distilbert Base Uncased",
        'model_id': model,
        'score': f1[0].item(),
        'passed': bool(f1[0].item() >= settings["threshold"])
    }

#evaluation = evaluate_summary_bert_score(row["document"], row["summary"], settings)
#evaluation

## Evaluate All Methods

In [19]:
summary_sample_df = summary_evaluate_df.sample(settings["n_prompts"])
evaluations = []

# Deepeval and Pydantic (LLM as as Judge)
for model_name, model_id in models.items():
    settings["model_name"] = model_name
    settings["model_id"] = model_id

    for temperature in settings["temperatures"]:
        printf(f"evaluating deepeval and pydantic using {model_name} at temperature {temperature}")
        settings["temperature"] = temperature

        bedrock_model = connect_to_bedrock(settings)
    
        for row in summary_sample_df.to_dict('records'):     
            evaluation = evaluate_summary_deepeval(bedrock_model, row["document"], row["summary"], settings)
            evaluations.append(row | settings | evaluation)
        
            evaluation = evaluate_summary_pydantic(row["document"], row["summary"], settings)
            evaluations.append(row | settings | evaluation)

# Ragas
printf("evaluating ragas")
for row in summary_sample_df.to_dict('records'):
    evaluation = evaluate_summary_ragas(row["document"], row["summary"], settings)
    evaluations.append(row | settings | evaluation)


# Bert Score
printf("evaluating bert score")
for row in summary_sample_df.to_dict('records'):
    evaluation = evaluate_summary_bert_score(row["document"], row["summary"], settings)
    evaluations.append(row | settings | evaluation)

evaluations_df = pd.DataFrame(evaluations)
evaluations_df.to_csv('output/summary_evaluations.csv', index=False)
evaluations_df

Output()

Output()

Output()

Output()

Output()

Task was destroyed but it is pending!
task: <Task pending name='Task-34' coro=<_async_in_context.<locals>.run_in_context() running at /home/iods/Tresors/Git/ai-spike-evaluation-metrics/.venv/lib/python3.12/site-packages/ipykernel/utils.py:60> wait_for=<Task pending name='Task-36' coro=<Kernel.shell_main() running at /home/iods/Tresors/Git/ai-spike-evaluation-metrics/.venv/lib/python3.12/site-packages/ipykernel/kernelbase.py:597> cb=[Task.__wakeup()]> cb=[ZMQStream._run_callback.<locals>._log_error() at /home/iods/Tresors/Git/ai-spike-evaluation-metrics/.venv/lib/python3.12/site-packages/zmq/eventloop/zmqstream.py:563]>
  obj, end = self.scan_once(s, idx)
Task was destroyed but it is pending!
task: <Task pending name='Task-36' coro=<Kernel.shell_main() running at /home/iods/Tresors/Git/ai-spike-evaluation-metrics/.venv/lib/python3.12/site-packages/ipykernel/kernelbase.py:597> cb=[Task.__wakeup()]>


Output()

Output()

Output()

Output()

Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x72c7cf0ea3c0>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x72c7bc68b5f0>, 8485.290601981)])']
connector: <aiohttp.connector.TCPConnector object at 0x72c7cf0eb380>


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

  ragas_llm = LangchainLLMWrapper(langchain_llm)


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]



Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

[1mDistilBertModel LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_projector.bias    | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_layer_norm.weight | UNEXPECTED |  | 
vocab_transform.bias    | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

[1mDistilBertModel LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_projector.bias    | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_layer_norm.weight | UNEXPECTED |  | 
vocab_transform.bias    | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

[1mDistilBertModel LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_projector.bias    | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_layer_norm.weight | UNEXPECTED |  | 
vocab_transform.bias    | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Unnamed: 0,document,summary,id,is_correct,n_prompts,region,temperatures,threshold,model_id,model_name,temperature,function,method,score,passed
0,Pinewood Group's two biggest shareholders have...,Pinewood film studios - home to the James Bond...,36911786,True,3,eu-west-2,[0],0.7,distilbert-base-uncased,Distilbert Base Uncased,0,Summary,Bert Score,0.71906,True
1,"Put together in 1973, the band enjoyed UK char...",Veteran Celtic rockers Runrig are to appear in...,40379385,False,3,eu-west-2,[0],0.7,distilbert-base-uncased,Distilbert Base Uncased,0,Summary,Bert Score,0.737251,True
2,Nancy McAdam has been presented with a British...,A woman has been recognised for her work with ...,35093364,False,3,eu-west-2,[0],0.7,distilbert-base-uncased,Distilbert Base Uncased,0,Summary,Bert Score,0.71595,True
