In [1]:
import bert_score
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
import matplotlib.pyplot as plt
import os
import pandas as pd
import pprint
from sklearn import metrics
import urllib

from deepeval.models import AmazonBedrockModel

import boto3
from pydantic_ai.models.bedrock import BedrockConverseModel
from pydantic_evals import Dataset, Case
from pydantic_evals.evaluators import LLMJudge
from pydantic_ai.settings import ModelSettings

from ragas.metrics import SummarizationScore
from ragas import EvaluationDataset, evaluate as ragas_evaluate
from langchain_aws import ChatBedrockConverse, BedrockEmbeddings
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper

  from ragas.metrics import SummarizationScore


In [2]:
import nest_asyncio
nest_asyncio.apply()

In [3]:
settings={}
settings["n_prompts"] = 30
settings["region"] = "eu-west-2"
settings["temperatures"] = [0, 0.1, 0.2]
settings["max_tokens"] = 4096
settings["threshold"] = 0.7

In [4]:
models = {}

models['Claude 3 Haiku'] = 'anthropic.claude-3-haiku-20240307-v1:0'
models['Claude 3 Sonnet'] = 'anthropic.claude-3-sonnet-20240229-v1:0'
models['Claude 3.7 Sonnet'] = 'anthropic.claude-3-7-sonnet-20250219-v1:0'
models['GPT OSS 20b'] = "openai.gpt-oss-20b-1:0"
models['GPT OSS 120b'] = "openai.gpt-oss-120b-1:0"

## Set Environment Variables

In [5]:
os.environ["AWS_DEFAULT_REGION"] = settings["region"]

## Get Data

In [6]:
datasets = [
    'https://huggingface.co/datasets/EdinburghNLP/xsum/resolve/main/data/validation-00000-of-00001.parquet',
    'https://raw.githubusercontent.com/sylinrl/TruthfulQA/main/data/v1/TruthfulQA.csv'
   ]

In [7]:
for directory in ['data', 'output']:
    if not os.path.exists(directory):
        os.makedirs(directory)
    
for dataset in datasets:
    print(dataset)
    filename=dataset.split('/')[-1]
    if not os.path.isfile(f'data/{filename}'):
        print(f'downloading {filename}')
        urllib.request.urlretrieve(dataset, f'data/{filename}')

https://huggingface.co/datasets/EdinburghNLP/xsum/resolve/main/data/validation-00000-of-00001.parquet
https://raw.githubusercontent.com/sylinrl/TruthfulQA/main/data/v1/TruthfulQA.csv


## Summary Data

In [8]:
summary_df = pd.read_parquet('data/validation-00000-of-00001.parquet')
summary_df

Unnamed: 0,document,summary,id
0,The ex-Reading defender denied fraudulent trad...,Former Premier League footballer Sam Sodje has...,38295789
1,Voges was forced to retire hurt on 86 after su...,Middlesex batsman Adam Voges will be out until...,40202028
2,Seven photographs taken in the Norfolk country...,The Duchess of Cambridge will feature on the c...,36177725
3,"Chris Poole - known as ""moot"" online - created...",Google has hired the creator of one of the web...,35751255
4,Four police officers were injured in the incid...,Two teenagers have been charged in connection ...,35275743
...,...,...,...
11327,Phyllida Lloyd will direct revivals of Julius ...,London's Donmar Warehouse is to open a tempora...,36376969
11328,The decision came after the governor of Rio de...,The government in Brazil says it will send fed...,26691258
11329,The deal will create the second largest cable ...,The US Department of Justice (DoJ) has approve...,36133259
11330,Third seed Murray beat Czech eighth seed Tomas...,Britain's Andy Murray will face world number o...,32178027


In [9]:
summary_correct_df = summary_df.copy()
summary_correct_df['is_correct'] = True

summary_incorrect_df = summary_df.copy()
shuffled_summary_df = summary_df["summary"].sample(frac=1).reset_index(drop=True)
summary_df.drop(columns=["summary"]).join(shuffled_summary_df)
summary_incorrect_df['is_correct'] = False

summary_evaluate_df = pd.concat([summary_correct_df, summary_incorrect_df])
summary_evaluate_df.reset_index(drop=True)
summary_evaluate_df

Unnamed: 0,document,summary,id,is_correct
0,The ex-Reading defender denied fraudulent trad...,Former Premier League footballer Sam Sodje has...,38295789,True
1,Voges was forced to retire hurt on 86 after su...,Middlesex batsman Adam Voges will be out until...,40202028,True
2,Seven photographs taken in the Norfolk country...,The Duchess of Cambridge will feature on the c...,36177725,True
3,"Chris Poole - known as ""moot"" online - created...",Google has hired the creator of one of the web...,35751255,True
4,Four police officers were injured in the incid...,Two teenagers have been charged in connection ...,35275743,True
...,...,...,...,...
11327,Phyllida Lloyd will direct revivals of Julius ...,London's Donmar Warehouse is to open a tempora...,36376969,False
11328,The decision came after the governor of Rio de...,The government in Brazil says it will send fed...,26691258,False
11329,The deal will create the second largest cable ...,The US Department of Justice (DoJ) has approve...,36133259,False
11330,Third seed Murray beat Czech eighth seed Tomas...,Britain's Andy Murray will face world number o...,32178027,False


In [10]:
settings["model_id"]='anthropic.claude-3-haiku-20240307-v1:0'
settings["model_name"]='Claude 3 Haiku'
settings["temperature"] = 0

row = summary_evaluate_df.head(1).to_dict('records')[0]
row

{'document': 'The ex-Reading defender denied fraudulent trading charges relating to the Sodje Sports Foundation - a charity to raise money for Nigerian sport.\nMr Sodje, 37, is jointly charged with elder brothers Efe, 44, Bright, 50 and Stephen, 42.\nAppearing at the Old Bailey earlier, all four denied the offence.\nThe charge relates to offences which allegedly took place between 2008 and 2014.\nSam, from Kent, Efe and Bright, of Greater Manchester, and Stephen, from Bexley, are due to stand trial in July.\nThey were all released on bail.',
 'summary': 'Former Premier League footballer Sam Sodje has appeared in court alongside three brothers accused of charity fraud.',
 'id': '38295789',
 'is_correct': True}

## DeepEval

In [11]:
def connect_to_bedrock(settings):
    bedrock_model = AmazonBedrockModel(
        model=settings["model_id"],
        region=settings["region"],
        generation_kwargs={
            "temperature": settings["temperature"],
            "maxTokens": settings["max_tokens"]
        }
    )
    return bedrock_model
#bedrock_model = connect_to_bedrock(settings)

In [12]:
def evaluate_summary_deepeval(bedrock_model, document: str, summary: str, settings: dict):
    
    summarisation_metric = GEval(
        name="Summarization Quality",
        criteria="Summarization Quality: Evaluate if the summary is a concise, faithful, and coherent representation of the source document.",
        evaluation_params=[
            LLMTestCaseParams.INPUT, 
            LLMTestCaseParams.ACTUAL_OUTPUT
        ],
        model=bedrock_model,
        threshold=settings["threshold"],
        async_mode=False
    )

    test_case = LLMTestCase(
        input=document,
        actual_output=summary
    )
    
    summarisation_metric.measure(test_case)

    return {
        'function': 'Summary', 
        'method': 'Deepeval',
        'score': summarisation_metric.measure(test_case),
        'passed': summarisation_metric.is_successful()
    }


#evaluation = evaluate_summary_deepeval(bedrock_model, row["document"], row["summary"], settings)
#evaluation

## Pydantic

In [13]:
def evaluate_summary_pydantic(document: str, summary: str, settings: dict): 
    judge_llm = BedrockConverseModel(settings["model_id"])

    judge = LLMJudge(
        model=judge_llm,
        rubric="""
        Summarization Quality: Evaluate if the summary is a concise, faithful, and coherent representation of the source document.

        Score the summary on the following criteria:
        1. Faithfulness: Does the summary contain only information present in the original? Penalise hallucinations.
        2. Coverage: Does the summary capture the key points of the original?
        3. Conciseness: Is the summary meaningfully shorter without being too terse?
        
        A high score (close to 1.0) means the summary is faithful, covers key points, and is concise.
        A low score (close to 0.0) means the summary contains hallucinations, misses key points, or is poorly written.
        """,
        score={'evaluation_name': 'SummaryQualityScore'},
        model_settings=ModelSettings(
            temperature=settings["temperature"],
            max_tokens=settings["max_tokens"]
        ),
        include_input=True,
        include_expected_output=False,
    )

    dataset = Dataset(
        cases=[Case(inputs=document)],
        evaluators=[judge],
    )

    report = dataset.evaluate_sync(lambda x: summary)
    score = report.cases[0].scores['SummaryQualityScore'].value
    reason = report.cases[0].assertions.get('LLMJudge_pass').reason

    return {
        'function': 'Summary',
        'method': 'Pydantic',
        'score': score,
        'reason': reason,
        'passed': bool(score >= settings["threshold"])
    }

#evaluation = evaluate_summary_pydantic(row["document"], row["summary"], settings)
#evaluation

## Ragas

In [14]:
def evaluate_summary_ragas(document: str, summary: str, settings: dict):
    langchain_llm = ChatBedrockConverse(
        model_id=settings["model_id"],
        region_name=settings["region"],
        temperature=settings["temperature"]
    )
    
    ragas_llm = LangchainLLMWrapper(langchain_llm)

    metric = SummarizationScore(llm=ragas_llm)

    dataset = EvaluationDataset.from_list([{
        "user_input": document,
        "response": summary,
        "reference_contexts": [document]
    }])

    score = ragas_evaluate(dataset=dataset, metrics=[metric])["summary_score"][0]
    
    return {
        'function': 'Summary',
        'method': 'Ragas',
        'score': score,
        'passed': bool(score >= settings["threshold"])
    }

#evaluation = evaluate_summary_ragas(row["document"], row["summary"], settings)
#evaluation

## Bert Score

In [15]:
def evaluate_summary_bert_score(document, summary, settings):
    p, r, f1 = bert_score.score(
        [document],
        [summary],
        model_type=settings["model_id"],
        verbose=False
    )

    return {
        'function': 'Summary',
        'method': 'Bert Score',
        'score': f1[0].item(),
        'passed': bool(f1[0].item() >= settings["threshold"])
    }

#evaluation = evaluate_summary_bert_score(row["document"], row["summary"], settings)
#evaluation

## Evaluate All Methods

In [16]:
summary_sample_df = summary_evaluate_df.sample(settings["n_prompts"])
evaluations = []

# Deepeval (LLM as as Judge)
for model_name, model_id in models.items():
    settings["model_name"] = model_name
    settings["model_id"] = model_id

    for temperature in settings["temperatures"]:
        print(f"evaluating deepeval using {model_name} at temperature {temperature}")
        settings["temperature"] = temperature

        bedrock_model = connect_to_bedrock(settings)
    
        for row in summary_sample_df.to_dict('records'):     
            try:
                evaluation = evaluate_summary_deepeval(bedrock_model, row["document"], row["summary"], settings)
                evaluations.append(row | settings | evaluation)
            except:
                pass
        
        pd.DataFrame(evaluations).to_csv('output/summary_evaluations.csv', index=False)

# Pydantic (LLM as as Judge)
for model_name, model_id in models.items():
    settings["model_name"] = model_name
    settings["model_id"] = model_id

    for temperature in settings["temperatures"]:
        print(f"evaluating pydantic using {model_name} at temperature {temperature}")
        settings["temperature"] = temperature

        for row in summary_sample_df.to_dict('records'):
            try:
                evaluation = evaluate_summary_pydantic(row["document"], row["summary"], settings)
                evaluations.append(row | settings | evaluation)
            except:
                pass
        
        pd.DataFrame(evaluations).to_csv('output/summary_evaluations.csv', index=False)

# Ragas
#settings["model_name"] = "Langchain"
#settings["model_id"] = "None"
for model_name, model_id in models.items():
    settings["model_name"] = model_name
    settings["model_id"] = model_id
    for temperature in settings["temperatures"]:
        for row in summary_sample_df.to_dict('records'):   
            try:
                evaluation = evaluate_summary_ragas(row["document"], row["summary"], settings)
                evaluations.append(row | settings | evaluation)
            except:
                pass
        
        pd.DataFrame(evaluations).to_csv('output/summary_evaluations.csv', index=False)

# Bert Score
settings["model_name"] = "Distilbert Base Uncased"
settings["model_id"] = "distilbert-base-uncased"
settings["temperature"] = "None"
print("evaluating bert score")
for row in summary_sample_df.to_dict('records'):
    try:
        evaluation = evaluate_summary_bert_score(row["document"], row["summary"], settings)
        evaluations.append(row | settings | evaluation)
    except:
        pass

evaluations_df = pd.DataFrame(evaluations)
evaluations_df.to_csv('output/summary_evaluations.csv', index=False)
evaluations_df

Output()

evaluating deepeval using Claude 3 Haiku at temperature 0


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x77a5d02b4560>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x77a5d01def30>, 28025.698720313)])']
connector: <aiohttp.connector.TCPConnector object at 0x77a5d02b45c0>


Output()

evaluating deepeval using Claude 3 Haiku at temperature 0.1


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x77a5d00e0c50>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x77a5c01a3c50>, 28206.505737752)])']
connector: <aiohttp.connector.TCPConnector object at 0x77a5d0114e90>


Output()

evaluating deepeval using Claude 3 Haiku at temperature 0.2


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x77a5a074a600>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x77a5c01a1d90>, 28378.847459812)])']
connector: <aiohttp.connector.TCPConnector object at 0x77a5a074b140>


Output()

evaluating deepeval using Claude 3 Sonnet at temperature 0


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x77a5d15a1ac0>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x77a5d2436ed0>, 28641.301698774)])']
connector: <aiohttp.connector.TCPConnector object at 0x77a5d00bcd10>


Output()

evaluating deepeval using Claude 3 Sonnet at temperature 0.1


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x77a59aec0680>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x77a59a65a870>, 28887.372442125)])']
connector: <aiohttp.connector.TCPConnector object at 0x77a59ae96210>


Output()

evaluating deepeval using Claude 3 Sonnet at temperature 0.2


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x77a59a68ce60>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x77a599e86990>, 29133.546330348)])']
connector: <aiohttp.connector.TCPConnector object at 0x77a59a6cbfb0>


Output()

evaluating deepeval using Claude 3.7 Sonnet at temperature 0


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x77a5996e4620>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x77a5996bef90>, 29422.008694555)])']
connector: <aiohttp.connector.TCPConnector object at 0x77a59a6c8980>


Output()

evaluating deepeval using Claude 3.7 Sonnet at temperature 0.1


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x77a5995122a0>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x77a598ee6cf0>, 29712.2142903)])']
connector: <aiohttp.connector.TCPConnector object at 0x77a599513560>


Output()

evaluating deepeval using Claude 3.7 Sonnet at temperature 0.2


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x77a599eedd90>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x77a5d01def90>, 30008.871144742)])']
connector: <aiohttp.connector.TCPConnector object at 0x77a5d15a2900>


Output()

evaluating deepeval using GPT OSS 20b at temperature 0


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x77a5d00d1e20>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x77a599f0d9d0>, 30156.431253242)])']
connector: <aiohttp.connector.TCPConnector object at 0x77a5d2252cf0>


Output()

evaluating deepeval using GPT OSS 20b at temperature 0.1


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x77a5d0273320>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x77a599976ab0>, 30299.79385231)])']
connector: <aiohttp.connector.TCPConnector object at 0x77a5d0232900>


Output()

evaluating deepeval using GPT OSS 20b at temperature 0.2


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x77a59aec3a10>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x77a59ab66870>, 30439.058372217)])']
connector: <aiohttp.connector.TCPConnector object at 0x77a59a68ed80>


Output()

evaluating deepeval using GPT OSS 120b at temperature 0


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x77a59acaeab0>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x77a59acca990>, 30566.654381284)])']
connector: <aiohttp.connector.TCPConnector object at 0x77a5c07e2cf0>


Output()

evaluating deepeval using GPT OSS 120b at temperature 0.1


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x77a59aa081a0>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x77a5998f6870>, 30708.684746027)])']
connector: <aiohttp.connector.TCPConnector object at 0x77a59aa204a0>


Output()

evaluating deepeval using GPT OSS 120b at temperature 0.2


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

evaluating pydantic using Claude 3 Haiku at temperature 0


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

evaluating pydantic using Claude 3 Haiku at temperature 0.1


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

evaluating pydantic using Claude 3 Haiku at temperature 0.2


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

evaluating pydantic using Claude 3 Sonnet at temperature 0


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

evaluating pydantic using Claude 3 Sonnet at temperature 0.1


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

evaluating pydantic using Claude 3 Sonnet at temperature 0.2


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

evaluating pydantic using Claude 3.7 Sonnet at temperature 0


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

evaluating pydantic using Claude 3.7 Sonnet at temperature 0.1


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

evaluating pydantic using Claude 3.7 Sonnet at temperature 0.2


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

evaluating pydantic using GPT OSS 20b at temperature 0


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

evaluating pydantic using GPT OSS 20b at temperature 0.1


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

evaluating pydantic using GPT OSS 20b at temperature 0.2


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

evaluating pydantic using GPT OSS 120b at temperature 0


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

evaluating pydantic using GPT OSS 120b at temperature 0.1


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

evaluating pydantic using GPT OSS 120b at temperature 0.2


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

  ragas_llm = LangchainLLMWrapper(langchain_llm)


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt extract_keyphrase_prompt failed to parse output: The output parser failed to parse the output including retries.
Exception raised in Job[0]: RagasOutputParserException(The output parser failed to parse the output including retries.)


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Exception raised in Job[0]: OutputParserException(Invalid json output: {
    "keyphrases": [
        "three and seven",
        "Weihai",
        "driver",
        "overtime and night shift pay",
        "Xinhua news agency",
        "children\'s teacher",
        "driver\'s seat",
        "lighter",
        "petrol"
    ]
}
For troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE )


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt extract_keyphrase_prompt failed to parse output: The output parser failed to parse the output including retries.
Exception raised in Job[0]: RagasOutputParserException(The output parser failed to parse the output including retries.)


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

  ragas_llm = LangchainLLMWrapper(langchain_llm)


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Exception raised in Job[0]: OutputParserException(Invalid json output: {
    "keyphrases": [
        "Rory McIlroy",
        "European Tour",
        "Wells Fargo Championship",
        "Charlotte",
        "2016",
        "Masters",
        "Ireland",
        "Players\ Championship",
        "Irish Open",
        "K Club",
        "Memorial Tournament",
        "Wentworth",
        "US Open",
        "Northern Irishman",
        "Ryder Cup",
        "French Open",
        "Open",
        "PGA",
        "Olympics",
        "Fed Ex Cup"
    ]
}
For troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE )


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Exception raised in Job[0]: OutputParserException(Failed to parse StringIO from completion {"keyphrases": ["Rory McIlroy", "European Tour", "Wells Fargo Championship", "Charlotte", "2016", "Masters", "Ireland", "Quail Hollow", "Players' Championship", "Irish Open", "K Club", "Memorial Tournament", "Wentworth", "US Open", "Northern Irishman", "Ryder Cup", "French Open", "Open", "PGA", "Olympics", "Fed Ex Cup"]}. Got: 1 validation error for StringIO
text
  Field required [type=missing, input_value={'keyphrases': ['Rory McI...lympics', 'Fed Ex Cup']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.12/v/missing
For troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE )


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

  ragas_llm = LangchainLLMWrapper(langchain_llm)


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt extract_keyphrase_prompt failed to parse output: The output parser failed to parse the output including retries.
Exception raised in Job[0]: RagasOutputParserException(The output parser failed to parse the output including retries.)


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt extract_keyphrase_prompt failed to parse output: The output parser failed to parse the output including retries.
Exception raised in Job[0]: RagasOutputParserException(The output parser failed to parse the output including retries.)


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

  ragas_llm = LangchainLLMWrapper(langchain_llm)


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

  ragas_llm = LangchainLLMWrapper(langchain_llm)


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

  ragas_llm = LangchainLLMWrapper(langchain_llm)


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

  ragas_llm = LangchainLLMWrapper(langchain_llm)


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

  ragas_llm = LangchainLLMWrapper(langchain_llm)


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

  ragas_llm = LangchainLLMWrapper(langchain_llm)


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

  ragas_llm = LangchainLLMWrapper(langchain_llm)


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Exception raised in Job[0]: OutputParserException(Failed to parse StringIO from completion {"keyphrases": ["Henrik Stenson", "Justin Rose", "Jordan Spieth", "Patrick Reed", "Sergio Garcia", "Rafa Cabrera-Bello", "JB Holmes", "Ryan Moore", "Rory McIlroy", "Thomas Pieters", "Dustin Johnson", "Matt Kuchar", "Brooks Koepka", "Brandt Snedeker", "Danny Willet", "Martin Kaymer", "Darren Clarke", "Davis Love", "United States", "Europe", "Hazeltine", "Medinah", "Minnesota", "England", "Belgium", "Ryder Cup", "41st Ryder Cup"]}. Got: 1 validation error for StringIO
text
  Field required [type=missing, input_value={'keyphrases': ['Henrik S...Cup', '41st Ryder Cup']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.12/v/missing
For troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE )


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt extract_keyphrase_prompt failed to parse output: The output parser failed to parse the output including retries.
Exception raised in Job[0]: RagasOutputParserException(The output parser failed to parse the output including retries.)


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Exception raised in Job[0]: OutputParserException(Failed to parse StringIO from completion {"keyphrases": ["Bataclan", "13 November", "89", "Paris", "130", "Jesse Hughes", "iTELE", "Laurence Ferrari", "U2", "Olympia Theatre", "Eagles of Death Metal", "France", "Monday", "Tuesday", "350", "US"]}. Got: 1 validation error for StringIO
text
  Field required [type=missing, input_value={'keyphrases': ['Bataclan...'Tuesday', '350', 'US']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.12/v/missing
For troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE )


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Exception raised in Job[0]: OutputParserException(Failed to parse StringIO from completion {"keyphrases": ["McIlroy", "European Tour", "Wells Fargo Championship", "Players Championship", "Irish Open", "K Club", "Memorial Tournament", "Wentworth event", "US Open", "Ryder Cup", "Fed Ex Cup", "PGA", "Olympics", "Charlotte", "Ireland", "Quail Hollow", "United States", "Augusta"]}. Got: 1 validation error for StringIO
text
  Field required [type=missing, input_value={'keyphrases': ['McIlroy'...ted States', 'Augusta']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.12/v/missing
For troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE )


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt extract_keyphrase_prompt failed to parse output: The output parser failed to parse the output including retries.
Exception raised in Job[0]: RagasOutputParserException(The output parser failed to parse the output including retries.)


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Exception raised in Job[0]: OutputParserException(Invalid json output: 
For troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE )


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Exception raised in Job[0]: OutputParserException(Failed to parse StringIO from completion {"questions": ["Is Cannon Hill Park the location of The 2015 Sousse and Bardo Memorial?", "Was the memorial planned to open in 2018?", "Did the memorial include a balance between \"seclusion and tranquillity\" and \"being a place of public prominence\"?", "Was Tobias Ellwood a Minister for North Africa?", "Did the Foreign and Commonwealth Office say the park was chosen following consultation with the victims' families?", "Was Charles Patrick Evans 78?", "Was Adrian Evans 49?", "Was Joel Richards 19?", "Was Sally Adey 57?", "Did Lisa Trickett say the families were honoured?", "Is the memorial located on an oval-shaped site with views of the boating lake?", "Will a shortlist be announced at a later date?"]}. Got: 1 validation error for StringIO
text
  Field required [type=missing, input_value={'questions': ['Is Cannon...nced at a later date?']}, input_type=dict]
    For further information visit ht

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Exception raised in Job[0]: OutputParserException(Failed to parse StringIO from completion {"keyphrases": ["Mr Clegg", "Mr Cameron", "Nick Robinson", "Robert Peston", "Nicolas Sarkozy", "William Hague", "Ed Miliband", "Liberal Democrats", "Conservative coalition", "Eurosceptic Conservatives", "EU", "European Union", "Brussels", "London", "UK", "Lisbon Treaty", "BBC", "House of Commons", "Prime Minister", "0400 GMT", "last week's summit", "Monday", "10 hours"]}. Got: 1 validation error for StringIO
text
  Field required [type=missing, input_value={'keyphrases': ['Mr Clegg..., 'Monday', '10 hours']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.12/v/missing
For troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE )


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt extract_keyphrase_prompt failed to parse output: The output parser failed to parse the output including retries.
Exception raised in Job[0]: RagasOutputParserException(The output parser failed to parse the output including retries.)


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt extract_keyphrase_prompt failed to parse output: The output parser failed to parse the output including retries.
Exception raised in Job[0]: RagasOutputParserException(The output parser failed to parse the output including retries.)


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Exception raised in Job[0]: OutputParserException(Failed to parse StringIO from completion {"keyphrases": ["Kieran Agard", "Lawrence Vigouroux", "Conor Thomas", "George Williams", "Nicky Ajose", "Ben Reeves", "Charlie Colkett", "Jonathan Obika", "Ed Upson", "Dean Lewington", "Bradley Barry", "George Baldock", "Press Association", "Swindon Town", "MK Dons", "half-time", "second minute", "29th minute", "break", "Second Half"]}. Got: 1 validation error for StringIO
text
  Field required [type=missing, input_value={'keyphrases': ['Kieran A...'break', 'Second Half']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.12/v/missing
For troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE )


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Exception raised in Job[0]: OutputParserException(Failed to parse StringIO from completion {"keyphrases": ["Glenn Murray", "Eagles", "Bolasie", "Costel Pantilimon", "Connor Wickham", "Sunderland", "Black Cats", "QPR", "Burnley", "Arsenal", "Chelsea", "Dick Advocaat", "Gus Poyet", "Aston Villa", "Alan Pardew", "Jermain Defoe", "Steven Fletcher", "Patrick van Aanholt", "Premier League", "Stadium of Light", "14 days", "14-minute spell", "3-0", "4-0", "90th minute"]}. Got: 1 validation error for StringIO
text
  Field required [type=missing, input_value={'keyphrases': ['Glenn Mu..., '4-0', '90th minute']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.12/v/missing
For troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE )


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Exception raised in Job[0]: OutputParserException(Failed to parse StringIO from completion {"keyphrases": ["Dan Kemp", "Kyle Storer", "Kenji Gorre", "Aaron Lewis", "Jonathan Forte", "Haydn Hollis", "Rotherham United", "Spurs", "Hatters", "Chelsea", "Plymouth", "West Ham United", "Swansea City", "Swindon Town", "Notts County", "Everton Under-21s", "Plymouth Argyle", "Chelsea Under-21s", "Cheltenham Town", "Luton Town", "Tottenham Under-21s", "Manchester City Under-21s", "County Ground"]}. Got: 1 validation error for StringIO
text
  Field required [type=missing, input_value={'keyphrases': ['Dan Kemp...-21s', 'County Ground']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.12/v/missing
For troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE )


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Exception raised in Job[0]: OutputParserException(Failed to parse StringIO from completion {"questions": ["Did Ciaran Murtagh score a goal for Roscommon?", "Did Donegal win the match against Roscommon?", "Was Conor Devaney a player mentioned in the text?", "Did Mark Anthony McGinley save a goal?", "Was Ronan Stack a player who had a chance to score?", "Did Seanie Johnston miss a free?", "Was the match played at Dr Hyde Park?", "Did the match include a half-time introduction by Frank McGlynn?", "Was the score at half-time 1-6 to 0-6?", "Did the match involve a substitution by the Roscommon boss?", "Was the final score 0-16 to 2-9?", "Did the match include a 64 minute goal?", "Was the match played in Division One?", "Did the match involve a 21st-minute free?", "Was the match played in the Allianz Football League?"]}. Got: 1 validation error for StringIO
text
  Field required [type=missing, input_value={'questions': ['Did Ciara...ianz Football League?']}, input_type=dict]
    For further 

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Exception raised in Job[0]: OutputParserException(Failed to parse StringIO from completion {"keyphrases": ["Andre Van der Merwe", "Dr Van der Merwe", "Cape Town", "China", "South Africa", "Stellenbosch University", "Tygerberg Hospital", "BBC News", "11 December"]}. Got: 1 validation error for StringIO
text
  Field required [type=missing, input_value={'keyphrases': ['Andre Va...C News', '11 December']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.12/v/missing
For troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE )
  ragas_llm = LangchainLLMWrapper(langchain_llm)


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Exception raised in Job[0]: OutputParserException(Failed to parse StringIO from completion {"keyphrases": ["Henrik Stenson", "Justin Rose", "Jordan Spieth", "Patrick Reed", "Sergio Garcia", "Rafa Cabrera-Bello", "JB Holmes", "Ryan Moore", "Rory McIlroy", "Thomas Pieters", "Dustin Johnson", "Matt Kuchar", "Brooks Koepka", "Brandt Snedeker", "Danny Willet", "Martin Kaymer", "Darren Clarke", "Davis Love", "Lee Westwood", "Mickelson", "Fowler", "Sullivan", "Ian Poulter", "Tom Fordyce", "Sky Sports", "BBC Radio 5 live", "BBC", "BBC Sport", "United States", "Europe", "Ryder Cup", "Hazeltine", "Medinah", "Minnesota", "California", "1981"]}. Got: 1 validation error for StringIO
text
  Field required [type=missing, input_value={'keyphrases': ['Henrik S..., 'California', '1981']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.12/v/missing
For troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE )


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Exception raised in Job[0]: OutputParserException(Failed to parse StringIO from completion {"keyphrases": ["Johnny Sexton", "Joe Schmidt", "Eddie Jones", "Vern Cotter", "WP Nel", "Alasdair Dickinson", "Rob Howley", "Luke Charteris", "Taulupe Faletau", "Chris Robshaw", "Maro Itoje", "Joe Marler", "Leinster", "BBC One", "Scotland", "Edinburgh", "Ireland", "Italy", "Sat, 14:25 GMT", "Sat, 16:50 GMT", "11 February"]}. Got: 1 validation error for StringIO
text
  Field required [type=missing, input_value={'keyphrases': ['Johnny S...50 GMT', '11 February']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.12/v/missing
For troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE )


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Exception raised in Job[0]: OutputParserException(Failed to parse StringIO from completion {"keyphrases": ["MP", "Labour", "Israel", "United States", "Keith Vaz", "Ms Shah", "Jeremy Corbyn", "Ken Livingstone", "Adolf Hitler", "Naz Shah", "Shami Chakrabarti", "Home Affairs Select Committee", "Liberty", "United Kingdom", "London", "Bradford West", "last week", "2014", "May 2015", "April 2014"]}. Got: 1 validation error for StringIO
text
  Field required [type=missing, input_value={'keyphrases': ['MP', 'La...ay 2015', 'April 2014']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.12/v/missing
For troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE )


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Exception raised in Job[0]: OutputParserException(Failed to parse StringIO from completion {"keyphrases": ["Pacquiao", "Floyd Mayweather", "Bob Arum", "Khan", "Andre Berto", "Chris Algieri", "Kell Brook", "WBO", "IBF", "Los Angeles Times", "Las Vegas", "Sheffield", "Los Angeles", "May", "9 April", "2015", "September", "summer", "2012", "2014"]}. Got: 1 validation error for StringIO
text
  Field required [type=missing, input_value={'keyphrases': ['Pacquiao...ummer', '2012', '2014']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.12/v/missing
For troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE )


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt extract_keyphrase_prompt failed to parse output: The output parser failed to parse the output including retries.
Exception raised in Job[0]: RagasOutputParserException(The output parser failed to parse the output including retries.)


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Exception raised in Job[0]: OutputParserException(Invalid json output: \{\"keyphrases\": [\"British\", \"Irish\", \"Red Cross\", \"BBC\", \"ITV\", \"Channel 4\", \"Channel 5\", \"Sky\", \"UK Foreign Office\", \"International Committee of the Red Cross\", \"Disasters Emergency Committee\", \"British Red Cross\", \"Medecins Sans Frontieres\", \"Map Action\", \"British embassy\", \"Jon Kay\", \"Philip Hammond\", \"RAF C-17\", \"RAF Brize Norton\", \"Oxfordshire\", \"Mount Everest\", \"UK\", \"Saturday\", \"90\", \"30\", \"4,300\", \"8,000\", \"1,100\", \"1,700\", \"6 hours\", \"15\", \"5m\", \"3m\", \"2m\", \"£5m\", \"£3m\", \"£2m\", \"$3 trillion\"]\}
For troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE )


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Exception raised in Job[0]: OutputParserException(Failed to parse StringIO from completion {}. Got: 1 validation error for StringIO
text
  Field required [type=missing, input_value={}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.12/v/missing
For troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE )


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt extract_keyphrase_prompt failed to parse output: The output parser failed to parse the output including retries.
Exception raised in Job[0]: RagasOutputParserException(The output parser failed to parse the output including retries.)


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt extract_keyphrase_prompt failed to parse output: The output parser failed to parse the output including retries.
Exception raised in Job[0]: RagasOutputParserException(The output parser failed to parse the output including retries.)


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt extract_keyphrase_prompt failed to parse output: The output parser failed to parse the output including retries.
Exception raised in Job[0]: RagasOutputParserException(The output parser failed to parse the output including retries.)


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Exception raised in Job[0]: OutputParserException(Invalid json output: 
For troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE )


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Exception raised in Job[0]: OutputParserException(Failed to parse StringIO from completion {"keyphrases": ["John McGrogan", "Lord Burns", "Tony Lenehan", "Blackridge", "Edinburgh", "Glasgow", "High Court", "1981", "2005", "2014", "4 years"]}. Got: 1 validation error for StringIO
text
  Field required [type=missing, input_value={'keyphrases': ['John McG...05', '2014', '4 years']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.12/v/missing
For troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE )


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Exception raised in Job[0]: OutputParserException(Failed to parse StringIO from completion {"keyphrases": ["Ward", "Marcus Willis", "Jay Clarke", "Andy Murray", "Johanna Konta", "Roger Federer", "Egor Gerasimov", "Go Soeda", "Gabashvili", "David Goffin", "Wimbledon", "Roehampton", "Northampton", "Centre Court", "last year", "12 months ago", "£35,000"]}. Got: 1 validation error for StringIO
text
  Field required [type=missing, input_value={'keyphrases': ['Ward', '...onths ago', '£35,000']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.12/v/missing
For troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE )


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt extract_keyphrase_prompt failed to parse output: The output parser failed to parse the output including retries.
Exception raised in Job[0]: RagasOutputParserException(The output parser failed to parse the output including retries.)


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

  ragas_llm = LangchainLLMWrapper(langchain_llm)


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt extract_keyphrase_prompt failed to parse output: The output parser failed to parse the output including retries.
Exception raised in Job[0]: RagasOutputParserException(The output parser failed to parse the output including retries.)


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt extract_keyphrase_prompt failed to parse output: The output parser failed to parse the output including retries.
Exception raised in Job[0]: RagasOutputParserException(The output parser failed to parse the output including retries.)


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Exception raised in Job[0]: OutputParserException(Failed to parse StringIO from completion {"keyphrases": ["McIlroy", "Golf Channel", "Charlotte", "Wells Fargo Championship", "Wells Fargo", "Players' Championship", "Irish Open", "K Club", "Memorial Tournament", "United States", "Wentworth event", "US Open", "Ryder Cup", "October", "August", "Augusta"]}. Got: 1 validation error for StringIO
text
  Field required [type=missing, input_value={'keyphrases': ['McIlroy'...', 'August', 'Augusta']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.12/v/missing
For troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE )


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Exception raised in Job[0]: OutputParserException(Failed to parse ExtractedKeyphrases from completion {}. Got: 1 validation error for ExtractedKeyphrases
keyphrases
  Field required [type=missing, input_value={}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.12/v/missing
For troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE )


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Exception raised in Job[0]: OutputParserException(Failed to parse StringIO from completion {}. Got: 1 validation error for StringIO
text
  Field required [type=missing, input_value={}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.12/v/missing
For troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE )


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt extract_keyphrase_prompt failed to parse output: The output parser failed to parse the output including retries.
Exception raised in Job[0]: RagasOutputParserException(The output parser failed to parse the output including retries.)


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt extract_keyphrase_prompt failed to parse output: The output parser failed to parse the output including retries.
Exception raised in Job[0]: RagasOutputParserException(The output parser failed to parse the output including retries.)


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Exception raised in Job[0]: OutputParserException(Failed to parse StringIO from completion {}. Got: 1 validation error for StringIO
text
  Field required [type=missing, input_value={}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.12/v/missing
For troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE )


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt extract_keyphrase_prompt failed to parse output: The output parser failed to parse the output including retries.
Exception raised in Job[0]: RagasOutputParserException(The output parser failed to parse the output including retries.)


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt extract_keyphrase_prompt failed to parse output: The output parser failed to parse the output including retries.
Exception raised in Job[0]: RagasOutputParserException(The output parser failed to parse the output including retries.)


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Exception raised in Job[0]: OutputParserException(Failed to parse StringIO from completion {"keyphrases": ["Prof Francesco Battaglia", "Mr Grillo", "Sandro Pertini", "Nelson Mandela", "Silvio Berlusconi", "Beppe Grillo", "University of Modena and Reggio Emilia", "Five Star", "Prime Minister Matteo Renzi", "centre-left Democratic Party", "2011", "€50,000", "£36,000", "$56,000"]}. Got: 1 validation error for StringIO
text
  Field required [type=missing, input_value={'keyphrases': ['Prof Fra... '£36,000', '$56,000']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.12/v/missing
For troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE )


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Exception raised in Job[0]: OutputParserException(Failed to parse StringIO from completion {"keyphrases": ["Glenn Murray", "Bolasie", "Connor Wickham", "Costel Pantilimon", "Alan Pardew", "Jermain Defoe", "Steven Fletcher", "Patrick van Aanholt", "Dick Advocaat", "Gus Poyet", "Aston Villa", "Premier League", "Stadium of Light", "Arsenal", "Chelsea", "Stoke", "Crystal Palace", "Newcastle", "Sunderland", "Black Cats", "Eagles", "January", "14 days", "4-0", "4-1", "14-minute spell", "six games"]}. Got: 1 validation error for StringIO
text
  Field required [type=missing, input_value={'keyphrases': ['Glenn Mu...te spell', 'six games']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.12/v/missing
For troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE )


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Exception raised in Job[0]: OutputParserException(Failed to parse StringIO from completion {"keyphrases": ["John McGrogan", "Blackridge", "Edinburgh", "High Court", "Glasgow", "1981", "2005", "2014", "four years", "1981 and 2005"]}. Got: 1 validation error for StringIO
text
  Field required [type=missing, input_value={'keyphrases': ['John McG...ears', '1981 and 2005']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.12/v/missing
For troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE )


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt extract_keyphrase_prompt failed to parse output: The output parser failed to parse the output including retries.
Exception raised in Job[0]: RagasOutputParserException(The output parser failed to parse the output including retries.)


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Exception raised in Job[0]: OutputParserException(Failed to parse StringIO from completion {"keyphrases": ["Ciaran Murtagh", "Roscommon", "Donegal", "Conor Devaney", "Mark Anthony McGinley", "Ronan Stack", "Seanie Johnston", "Conor McManus", "Owen Duffy", "Dr Hyde Park", "Rory Gallagher", "Kevin McStay", "Michael Carroll", "Marty O'Reilly", "Caolan Ward", "Jamie Brennan", "Darach O'Connor", "Eoin McHugh", "Frank McGlynn", "Breffnimen", "Farneymen", "Monaghan", "Cavan", "Castleblayney", "Dublin", "Division One", "Division Two", "Division Three", "Division Four", "Clare", "Down", "Fermanagh", "Galway", "Kildare", "Cork", "Meath", "Derry", "Louth", "Longford", "Offaly", "Antrim", "Tipperary", "Sligo", "Armagh", "Laois", "Carlow", "London", "Limerick", "Westmeath", "Leitrim", "Wexford", "Waterford", "Wicklow"]}. Got: 1 validation error for StringIO
text
  Field required [type=missing, input_value={'keyphrases': ['Ciaran M...'Waterford', 'Wicklow']}, input_type=dict]
    For further informa

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

  ragas_llm = LangchainLLMWrapper(langchain_llm)


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

  ragas_llm = LangchainLLMWrapper(langchain_llm)


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

  ragas_llm = LangchainLLMWrapper(langchain_llm)


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

evaluating bert score


Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

[1mDistilBertModel LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_layer_norm.weight | UNEXPECTED |  | 
vocab_transform.bias    | UNEXPECTED |  | 
vocab_projector.bias    | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

[1mDistilBertModel LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_layer_norm.weight | UNEXPECTED |  | 
vocab_transform.bias    | UNEXPECTED |  | 
vocab_projector.bias    | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

[1mDistilBertModel LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_layer_norm.weight | UNEXPECTED |  | 
vocab_transform.bias    | UNEXPECTED |  | 
vocab_projector.bias    | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

[1mDistilBertModel LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_layer_norm.weight | UNEXPECTED |  | 
vocab_transform.bias    | UNEXPECTED |  | 
vocab_projector.bias    | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

[1mDistilBertModel LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_layer_norm.weight | UNEXPECTED |  | 
vocab_transform.bias    | UNEXPECTED |  | 
vocab_projector.bias    | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

[1mDistilBertModel LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_layer_norm.weight | UNEXPECTED |  | 
vocab_transform.bias    | UNEXPECTED |  | 
vocab_projector.bias    | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

[1mDistilBertModel LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_layer_norm.weight | UNEXPECTED |  | 
vocab_transform.bias    | UNEXPECTED |  | 
vocab_projector.bias    | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

[1mDistilBertModel LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_layer_norm.weight | UNEXPECTED |  | 
vocab_transform.bias    | UNEXPECTED |  | 
vocab_projector.bias    | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

[1mDistilBertModel LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_layer_norm.weight | UNEXPECTED |  | 
vocab_transform.bias    | UNEXPECTED |  | 
vocab_projector.bias    | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

[1mDistilBertModel LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_layer_norm.weight | UNEXPECTED |  | 
vocab_transform.bias    | UNEXPECTED |  | 
vocab_projector.bias    | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

[1mDistilBertModel LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_layer_norm.weight | UNEXPECTED |  | 
vocab_transform.bias    | UNEXPECTED |  | 
vocab_projector.bias    | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

[1mDistilBertModel LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_layer_norm.weight | UNEXPECTED |  | 
vocab_transform.bias    | UNEXPECTED |  | 
vocab_projector.bias    | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

[1mDistilBertModel LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_layer_norm.weight | UNEXPECTED |  | 
vocab_transform.bias    | UNEXPECTED |  | 
vocab_projector.bias    | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

[1mDistilBertModel LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_layer_norm.weight | UNEXPECTED |  | 
vocab_transform.bias    | UNEXPECTED |  | 
vocab_projector.bias    | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

[1mDistilBertModel LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_layer_norm.weight | UNEXPECTED |  | 
vocab_transform.bias    | UNEXPECTED |  | 
vocab_projector.bias    | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

[1mDistilBertModel LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_layer_norm.weight | UNEXPECTED |  | 
vocab_transform.bias    | UNEXPECTED |  | 
vocab_projector.bias    | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

[1mDistilBertModel LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_layer_norm.weight | UNEXPECTED |  | 
vocab_transform.bias    | UNEXPECTED |  | 
vocab_projector.bias    | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

[1mDistilBertModel LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_layer_norm.weight | UNEXPECTED |  | 
vocab_transform.bias    | UNEXPECTED |  | 
vocab_projector.bias    | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

[1mDistilBertModel LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_layer_norm.weight | UNEXPECTED |  | 
vocab_transform.bias    | UNEXPECTED |  | 
vocab_projector.bias    | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

[1mDistilBertModel LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_layer_norm.weight | UNEXPECTED |  | 
vocab_transform.bias    | UNEXPECTED |  | 
vocab_projector.bias    | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

[1mDistilBertModel LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_layer_norm.weight | UNEXPECTED |  | 
vocab_transform.bias    | UNEXPECTED |  | 
vocab_projector.bias    | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

[1mDistilBertModel LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_layer_norm.weight | UNEXPECTED |  | 
vocab_transform.bias    | UNEXPECTED |  | 
vocab_projector.bias    | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

[1mDistilBertModel LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_layer_norm.weight | UNEXPECTED |  | 
vocab_transform.bias    | UNEXPECTED |  | 
vocab_projector.bias    | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

[1mDistilBertModel LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_layer_norm.weight | UNEXPECTED |  | 
vocab_transform.bias    | UNEXPECTED |  | 
vocab_projector.bias    | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

[1mDistilBertModel LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_layer_norm.weight | UNEXPECTED |  | 
vocab_transform.bias    | UNEXPECTED |  | 
vocab_projector.bias    | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

[1mDistilBertModel LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_layer_norm.weight | UNEXPECTED |  | 
vocab_transform.bias    | UNEXPECTED |  | 
vocab_projector.bias    | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

[1mDistilBertModel LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_layer_norm.weight | UNEXPECTED |  | 
vocab_transform.bias    | UNEXPECTED |  | 
vocab_projector.bias    | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

[1mDistilBertModel LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_layer_norm.weight | UNEXPECTED |  | 
vocab_transform.bias    | UNEXPECTED |  | 
vocab_projector.bias    | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

[1mDistilBertModel LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_layer_norm.weight | UNEXPECTED |  | 
vocab_transform.bias    | UNEXPECTED |  | 
vocab_projector.bias    | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

[1mDistilBertModel LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_layer_norm.weight | UNEXPECTED |  | 
vocab_transform.bias    | UNEXPECTED |  | 
vocab_projector.bias    | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Unnamed: 0,document,summary,id,is_correct,n_prompts,region,temperatures,max_tokens,threshold,model_id,model_name,temperature,function,method,score,passed,reason
0,Media playback is not supported on this device...,Europe fought back from a 4-0 foursomes whitew...,37526279,True,30,eu-west-2,"[0, 0.1, 0.2]",4096,0.7,anthropic.claude-3-haiku-20240307-v1:0,Claude 3 Haiku,0,Summary,Deepeval,0.800000,True,
1,They are seeking guarantees of his total co-op...,The Northern Ireland Survivors and Victims of ...,17332685,True,30,eu-west-2,"[0, 0.1, 0.2]",4096,0.7,anthropic.claude-3-haiku-20240307-v1:0,Claude 3 Haiku,0,Summary,Deepeval,0.700000,True,
2,McIlroy has opted out of the European Tour's f...,Rory McIlroy has pulled out of the BMW PGA Cha...,36207769,False,30,eu-west-2,"[0, 0.1, 0.2]",4096,0.7,anthropic.claude-3-haiku-20240307-v1:0,Claude 3 Haiku,0,Summary,Deepeval,0.800000,True,
3,The duration of Ireland fly-half Johnny Sexton...,"As the start of the Six Nations nears, the res...",38845499,False,30,eu-west-2,"[0, 0.1, 0.2]",4096,0.7,anthropic.claude-3-haiku-20240307-v1:0,Claude 3 Haiku,0,Summary,Deepeval,0.700000,True,
4,The Californian rock band were performing when...,Eagles of Death Metal have appeared on stage i...,35586337,True,30,eu-west-2,"[0, 0.1, 0.2]",4096,0.7,anthropic.claude-3-haiku-20240307-v1:0,Claude 3 Haiku,0,Summary,Deepeval,0.700000,True,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1368,Media playback is unsupported on your device\n...,A forgotten part of Alexandra Palace could be ...,29582794,False,30,eu-west-2,"[0, 0.1, 0.2]",4096,0.7,distilbert-base-uncased,Distilbert Base Uncased,,Summary,Bert Score,0.733312,True,
1369,City were held to a 1-1 draw at League One Rot...,Manchester City Under-21s won on penalties in ...,40942164,False,30,eu-west-2,"[0, 0.1, 0.2]",4096,0.7,distilbert-base-uncased,Distilbert Base Uncased,,Summary,Bert Score,0.754338,True,
1370,"Ward, 27, won 6-7 (3-7) 6-4 7-6 (8-6) 6-1 in t...",World number 855 Alex Ward became the only Bri...,40449803,True,30,eu-west-2,"[0, 0.1, 0.2]",4096,0.7,distilbert-base-uncased,Distilbert Base Uncased,,Summary,Bert Score,0.722049,True,
1371,Ciaran Murtagh's goal helped Roscommon lead by...,Eoin McHugh's late point helped Donegal snatch...,38951339,True,30,eu-west-2,"[0, 0.1, 0.2]",4096,0.7,distilbert-base-uncased,Distilbert Base Uncased,,Summary,Bert Score,0.751865,True,
