In [26]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Evaluation

In [27]:
import os
import urllib.parse
from typing import cast

import pandas as pd
from datasets import DatasetDict, load_dataset
from intelligence_layer.connectors import (
    DocumentIndexClient,
    DocumentPath,
    CollectionPath,
    DocumentContents,
    DocumentIndexRetriever
)
from intelligence_layer.core import (
    Language,
    ControlModel,
    Llama3InstructModel,
)
from intelligence_layer.evaluation import (
    Dataset,
    Example,
    RepositoryNavigator,
    RunOverview,
    EvaluationOverview,
    AggregationOverview,
    run_lineages_to_pandas,
    evaluation_lineages_to_pandas,
    aggregation_overviews_to_pandas,
    Evaluator,
    Runner,
    Aggregator,
)
from intelligence_layer.examples import (
    RetrieverBasedQaInput,
    MultipleChunkRetrieverQaOutput,
    MultipleChunkRetrieverQa,
)
from pydantic import BaseModel, ConfigDict

from src.workshop.evaluation import (
    RetrieverQaEvaluationLogic,
    RetrieverQaAggregationLogic,
    RetrieverQaEvaluation,
)
from src.workshop.repositories import init_repos

## Dataset
### Load Data

In [28]:
# Hugging Face dataset identifier handed to `load_dataset`.
HF_DATASET_NAME = "deepset/germanquad"
# Document Index namespace and collection that the context passages are stored in.
NAMESPACE = "aleph-alpha"
COLLECTION_NAME = "bwi-workshop-prep-100"
# COLLECTION_NAME = "bwi-workshop-prep"
# Id (and name) of the dataset in the dataset repository; also the id the runner is given.
# DATASET_ID = "deepset/germanquad"
DATASET_ID = "deepset/germanquad-100"

In [29]:
from dotenv import load_dotenv

# Load variables from a local .env file into the environment; AA_TOKEN is read below.
load_dotenv()

# Create the workshop collection only when it is missing.
# `list_collections` returns CollectionPath objects, so membership has to be
# tested with a CollectionPath; comparing against the bare collection name
# never matches and would trigger a create call on every execution.
di_client = DocumentIndexClient(os.getenv("AA_TOKEN"))
collection_path = CollectionPath(namespace=NAMESPACE, collection=COLLECTION_NAME)
if collection_path not in di_client.list_collections(namespace=NAMESPACE):
    di_client.create_collection(collection_path)

In [30]:
def load_german_quad(sample_size: int = 100, random_state: int = 4711) -> pd.DataFrame:
    """Load the GermanQuAD train split and return a reproducible random sample.

    Args:
        sample_size: Number of rows to draw from the train split. Defaults to
            the 100 rows this notebook was built around.
        random_state: Seed passed to ``DataFrame.sample`` so repeated calls
            return the same rows.

    Returns:
        A DataFrame with ``sample_size`` rows of the train split.
    """
    # NOTE(review): trust_remote_code=True presumably lets the dataset's own
    # loading script run locally — confirm this is acceptable for the source.
    dataset = cast(DatasetDict, load_dataset(HF_DATASET_NAME, trust_remote_code=True))
    train_frame = cast(pd.DataFrame, dataset["train"].to_pandas())

    return train_frame.sample(sample_size, random_state=random_state)


In [31]:
# Sampled GermanQuAD rows; later cells read the `context` and `question` columns.
data = load_german_quad()

### Store Data in Document Index for Retrieval

In [32]:
def setup_collection(di_client: DocumentIndexClient) -> CollectionPath:
    """Create the workshop collection and attach the "asymmetric" index to it.

    Args:
        di_client: Document Index client used for both requests.

    Returns:
        The path (NAMESPACE / COLLECTION_NAME) of the collection that was set up.
    """
    target = CollectionPath(collection=COLLECTION_NAME, namespace=NAMESPACE)

    # Creation comes first: the index is assigned to the collection afterwards.
    di_client.create_collection(collection_path=target)
    di_client.assign_index_to_collection(
        index_name="asymmetric", collection_path=target
    )
    return target

def store_german_quad_in_di(data: pd.DataFrame):
    """Upload every distinct context passage of ``data`` to the Document Index.

    Each passage becomes one document in the collection returned by
    ``setup_collection``. Passages whose document already exists are skipped,
    so the function can be re-run safely.

    Document names are the URL-quoted first 10 characters of the passage plus
    a short SHA-256 digest of the whole passage. The prefix alone is not
    unique: two passages sharing their first 10 characters would map to the
    same name and the second would never be stored. Names therefore differ
    from documents uploaded with the earlier prefix-only scheme.

    Args:
        data: DataFrame with a ``context`` column holding the passage texts.
    """
    import hashlib

    from intelligence_layer.connectors import ResourceNotFound

    di_client = DocumentIndexClient(os.getenv("AA_TOKEN"))
    collection_path = setup_collection(di_client)

    for text in data.context.unique():
        digest = hashlib.sha256(text.encode("utf-8")).hexdigest()[:12]
        slug = f"{urllib.parse.quote_plus(text[:10])}-{digest}"
        document_path = DocumentPath(
            collection_path=collection_path, document_name=slug
        )

        try:
            # Only used as an existence probe; the returned contents are not needed.
            di_client.document(document_path)
        except ResourceNotFound:
            # Only "not found" means the document has to be uploaded; any other
            # failure (auth, network, server error) propagates instead of being
            # mistaken for a missing document.
            print(f"Adding new doc: {slug}")
            di_client.add_document(
                document_path=document_path,
                contents=DocumentContents(contents=[text]),
            )



In [33]:
# Upload the sampled context passages (skips documents that are already stored).
store_german_quad_in_di(data)

### Store Data (questions) in DatasetRepository for Evaluation

In [34]:
def store_german_quad_in_dataset_repo(data: pd.DataFrame) -> Dataset:
    """Return the evaluation dataset for DATASET_ID, creating it if necessary.

    If the dataset repository already holds a dataset with id DATASET_ID, that
    dataset is returned unchanged. Otherwise one example per entry of
    ``data.question`` is created (German language, no expected output) and
    stored under DATASET_ID as both id and name.

    Args:
        data: DataFrame with a ``question`` column.

    Returns:
        The existing or newly created dataset.
    """
    dataset_repo = init_repos().dataset_repo
    question_column = data.question

    # Reuse a previously stored dataset instead of creating a duplicate.
    existing = dataset_repo.dataset(DATASET_ID)
    if existing is not None:
        return existing

    examples: list[Example] = []
    for question in question_column:
        qa_input = RetrieverBasedQaInput(question=question, language=Language("de"))
        examples.append(Example(input=qa_input, expected_output=None))

    return dataset_repo.create_dataset(
        examples=examples, dataset_name=DATASET_ID, id=DATASET_ID
    )


In [35]:
# Ensure the question dataset exists; the returned Dataset is shown as the cell output.
store_german_quad_in_dataset_repo(data)

Dataset ID = deepset/germanquad-100
Name = deepset/germanquad-100
Labels = set()
Metadata = {}

## Run Step

In [36]:
# Shared repositories for the run, evaluation and aggregation steps below.
repositories = init_repos()
# The navigator joins dataset, run and evaluation records into lineages for inspection.
navigator = RepositoryNavigator(
    dataset_repository=repositories.dataset_repo,
    run_repository=repositories.run_repo,
    evaluation_repository=repositories.evaluation_repo,
)

In [37]:
class RunConfig(BaseModel):
    """One run configuration: a description paired with the model to run."""

    # Presumably required because ControlModel is not a type pydantic can
    # validate on its own — TODO confirm.
    model_config = ConfigDict(arbitrary_types_allowed=True)
    # Printed while running and passed to the Runner as the run description.
    description: str
    # Model handed to `build_task` for this run.
    model: ControlModel


In [38]:
# One run per model; the model name doubles as the run description.
configs = [
    RunConfig(description=model_name, model=Llama3InstructModel(model_name))
    for model_name in ("pharia-1-llm-7b-control", "llama-3.1-8b-instruct")
]




In [39]:
def build_task(
    model: ControlModel,
    namespace: str = "aleph-alpha",
    collection: str = "wikipedia-de",
    index_name: str = "asymmetric",
    k: int = 3,
):
    """Build the retriever-based QA task for ``model``.

    Args:
        model: Model that generates the answers.
        namespace: Document Index namespace to retrieve from.
        collection: Document Index collection to retrieve from.
        index_name: Name of the index used for retrieval.
        k: Value passed to the retriever as ``k`` (presumably the number of
            chunks retrieved per question — confirm against the retriever docs).

    Returns:
        A ``MultipleChunkRetrieverQa`` task wired to a ``DocumentIndexRetriever``.
    """
    # NOTE(review): the default collection ("wikipedia-de") is not the
    # COLLECTION_NAME collection populated earlier in this notebook — confirm
    # that is intended, or pass collection=COLLECTION_NAME at the call site.
    di_client = DocumentIndexClient(os.getenv("AA_TOKEN"))
    retriever = DocumentIndexRetriever(
        di_client,
        index_name=index_name,
        namespace=namespace,
        collection=collection,
        k=k,
    )

    return MultipleChunkRetrieverQa(retriever=retriever, model=model)

def run_for_configs(configs: list[RunConfig]):
    """Run the QA task on the DATASET_ID dataset once per configuration.

    Args:
        configs: Run configurations; each contributes its model and description.

    Returns:
        The run overviews, in the same order as ``configs``.
    """

    def _run_single(config: RunConfig) -> RunOverview:
        # Build the task, announce the run, then execute it on the dataset.
        task = build_task(config.model)
        print(config.description)
        return Runner(
            task,
            repositories.dataset_repo,
            repositories.run_repo,
            description=config.description,
        ).run_dataset(DATASET_ID)

    return [_run_single(config) for config in configs]

In [40]:
# One RunOverview per entry of `configs`, in the same order.
run_overviews = run_for_configs(configs)

pharia-1-llm-7b-control
llama-3.1-8b-instruct


In [41]:
# Inspect the first run only; it belongs to the first entry of `configs`.
# The `None` argument is the expected-output type: the examples carry no expected output.
run_lineages = list(
    navigator.run_lineages(
        run_overviews[0].id, RetrieverBasedQaInput, None, MultipleChunkRetrieverQaOutput
    )
)
run_lineages_df = run_lineages_to_pandas(run_lineages)
run_lineages_df

Unnamed: 0_level_0,Unnamed: 1_level_0,input,expected_output,metadata,output,lineage
example_id,run_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0484610a-2a1e-4e3d-976f-cb9af1e12e79,cd241456-3428-4e59-8713-e65ae33636ce,question='Wer fördert das Öl in der Republik K...,,,answer=None sources=[AnswerSource(chunk=Enrich...,<intelligence_layer.evaluation.infrastructure....
04957245-6c0d-4e8a-9f4f-b60084e6bb0d,cd241456-3428-4e59-8713-e65ae33636ce,question='in welcher Beziehung steht das Erinn...,,,answer='Das Erinnerungsvermögen ist eng mit an...,<intelligence_layer.evaluation.infrastructure....
054b44a0-e2c1-4d9f-b8a0-a6fd30d89011,cd241456-3428-4e59-8713-e65ae33636ce,question='Welcher neue Flughafen soll in Lissa...,,,"answer='Der neue Flughafen, der in Lissabon ge...",<intelligence_layer.evaluation.infrastructure....
078c24b6-6e98-4cd0-95aa-bcf15c5261b0,cd241456-3428-4e59-8713-e65ae33636ce,question='Wie lange im Jahr waren Flüsse in Os...,,,answer='Die Frage kann anhand des Textes nicht...,<intelligence_layer.evaluation.infrastructure....
0799e943-9b09-4aef-9711-e934c008e74b,cd241456-3428-4e59-8713-e65ae33636ce,question='Anhand welcher Holzmerkmale kann man...,,,answer='Anhand welcher Holzmerkmale kann man H...,<intelligence_layer.evaluation.infrastructure....
...,...,...,...,...,...,...
f48895ac-87de-4029-aaaf-60b53b995154,cd241456-3428-4e59-8713-e65ae33636ce,question='Was passiert heute im Progr in Bern?...,,,answer=None sources=[AnswerSource(chunk=Enrich...,<intelligence_layer.evaluation.infrastructure....
f6f7e84f-a38c-4b8e-818a-7b811fa79f45,cd241456-3428-4e59-8713-e65ae33636ce,question='In welchen Ländern wurde der Neoklas...,,,answer='Der Neoklassizismus war in den 1930er ...,<intelligence_layer.evaluation.infrastructure....
fd2aa7eb-8e6c-4ab5-ab8a-3edc58591bf3,cd241456-3428-4e59-8713-e65ae33636ce,question='Welche Displayauflösung hat der IPod...,,,answer='Die sechste Generation des iPod (iPod ...,<intelligence_layer.evaluation.infrastructure....
fd73e0c9-cd3f-426e-bd7a-dc74159ff781,cd241456-3428-4e59-8713-e65ae33636ce,question='Bei welcher Schlacht im britische-fr...,,,"answer='Die Schlacht, bei der die britische Na...",<intelligence_layer.evaluation.infrastructure....


## Evaluation Step

In [42]:
def evaluate_runs(run_overviews: list[RunOverview]) -> list[EvaluationOverview]:
    """Evaluate each run separately with the retriever-QA evaluation logic.

    Args:
        run_overviews: Runs to evaluate.

    Returns:
        One evaluation overview per run, in input order.
    """
    logic = RetrieverQaEvaluationLogic()
    evaluator = Evaluator(
        dataset_repository=repositories.dataset_repo,
        run_repository=repositories.run_repo,
        evaluation_repository=repositories.evaluation_repo,
        description="default-evaluation",
        evaluation_logic=logic,
    )

    # A single evaluator is reused for all runs; each run gets its own evaluation.
    overviews = []
    for run in run_overviews:
        overviews.append(evaluator.evaluate_runs(run.id))
    return overviews


In [43]:
# One EvaluationOverview per run, in the order of `run_overviews`.
evaluation_overviews = evaluate_runs(run_overviews)

Evaluating: 100it [14:48,  8.88s/it]
Evaluating: 0it [00:00, ?it/s]

Evaluating: 100it [05:34,  3.34s/it]


### Display Evaluation Lineages

In [44]:
# Lineages of the first evaluation: example, run output and evaluation result per example.
# The `None` argument is the expected-output type: the examples carry no expected output.
evaluation_lineages = list(navigator.evaluation_lineages(
    evaluation_overviews[0].id, # of pharia-1-llm-7b-control
    RetrieverBasedQaInput,
    None,
    MultipleChunkRetrieverQaOutput,
    RetrieverQaEvaluation,
))

evaluation_lineages_df = evaluation_lineages_to_pandas(evaluation_lineages)
evaluation_lineages_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,input,expected_output,metadata,output,result,tracer,lineage
example_id,evaluation_id,run_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0484610a-2a1e-4e3d-976f-cb9af1e12e79,c38cefcb-7cdd-4c8e-9c2c-a299de8604d8,cd241456-3428-4e59-8713-e65ae33636ce,question='Wer fördert das Öl in der Republik K...,,,answer=None sources=[AnswerSource(chunk=Enrich...,answer_generated=False world_knowledge_grading...,<intelligence_layer.core.tracer.in_memory_trac...,<intelligence_layer.evaluation.infrastructure....
04957245-6c0d-4e8a-9f4f-b60084e6bb0d,c38cefcb-7cdd-4c8e-9c2c-a299de8604d8,cd241456-3428-4e59-8713-e65ae33636ce,question='in welcher Beziehung steht das Erinn...,,,answer='Das Erinnerungsvermögen ist eng mit an...,answer_generated=True world_knowledge_grading_...,<intelligence_layer.core.tracer.in_memory_trac...,<intelligence_layer.evaluation.infrastructure....
054b44a0-e2c1-4d9f-b8a0-a6fd30d89011,c38cefcb-7cdd-4c8e-9c2c-a299de8604d8,cd241456-3428-4e59-8713-e65ae33636ce,question='Welcher neue Flughafen soll in Lissa...,,,"answer='Der neue Flughafen, der in Lissabon ge...",answer_generated=True world_knowledge_grading_...,<intelligence_layer.core.tracer.in_memory_trac...,<intelligence_layer.evaluation.infrastructure....
078c24b6-6e98-4cd0-95aa-bcf15c5261b0,c38cefcb-7cdd-4c8e-9c2c-a299de8604d8,cd241456-3428-4e59-8713-e65ae33636ce,question='Wie lange im Jahr waren Flüsse in Os...,,,answer='Die Frage kann anhand des Textes nicht...,answer_generated=True world_knowledge_grading_...,<intelligence_layer.core.tracer.in_memory_trac...,<intelligence_layer.evaluation.infrastructure....
0799e943-9b09-4aef-9711-e934c008e74b,c38cefcb-7cdd-4c8e-9c2c-a299de8604d8,cd241456-3428-4e59-8713-e65ae33636ce,question='Anhand welcher Holzmerkmale kann man...,,,answer='Anhand welcher Holzmerkmale kann man H...,answer_generated=True world_knowledge_grading_...,<intelligence_layer.core.tracer.in_memory_trac...,<intelligence_layer.evaluation.infrastructure....
...,...,...,...,...,...,...,...,...,...
f48895ac-87de-4029-aaaf-60b53b995154,c38cefcb-7cdd-4c8e-9c2c-a299de8604d8,cd241456-3428-4e59-8713-e65ae33636ce,question='Was passiert heute im Progr in Bern?...,,,answer=None sources=[AnswerSource(chunk=Enrich...,answer_generated=False world_knowledge_grading...,<intelligence_layer.core.tracer.in_memory_trac...,<intelligence_layer.evaluation.infrastructure....
f6f7e84f-a38c-4b8e-818a-7b811fa79f45,c38cefcb-7cdd-4c8e-9c2c-a299de8604d8,cd241456-3428-4e59-8713-e65ae33636ce,question='In welchen Ländern wurde der Neoklas...,,,answer='Der Neoklassizismus war in den 1930er ...,answer_generated=True world_knowledge_grading_...,<intelligence_layer.core.tracer.in_memory_trac...,<intelligence_layer.evaluation.infrastructure....
fd2aa7eb-8e6c-4ab5-ab8a-3edc58591bf3,c38cefcb-7cdd-4c8e-9c2c-a299de8604d8,cd241456-3428-4e59-8713-e65ae33636ce,question='Welche Displayauflösung hat der IPod...,,,answer='Die sechste Generation des iPod (iPod ...,answer_generated=True world_knowledge_grading_...,<intelligence_layer.core.tracer.in_memory_trac...,<intelligence_layer.evaluation.infrastructure....
fd73e0c9-cd3f-426e-bd7a-dc74159ff781,c38cefcb-7cdd-4c8e-9c2c-a299de8604d8,cd241456-3428-4e59-8713-e65ae33636ce,question='Bei welcher Schlacht im britische-fr...,,,"answer='Die Schlacht, bei der die britische Na...",answer_generated=True world_knowledge_grading_...,<intelligence_layer.core.tracer.in_memory_trac...,<intelligence_layer.evaluation.infrastructure....


In [45]:
def expand_pydantic_column(df: pd.DataFrame, column: str):
    """Append the flattened fields of a column of pydantic objects to ``df``.

    Every value in ``df[column]`` is dumped with ``model_dump()``; the resulting
    dicts are flattened with ``pd.json_normalize`` (nested keys joined by ".")
    and each new column is prefixed with ``"<column>."``.

    Args:
        df: Frame containing the column to expand. It is not modified.
        column: Name of the column whose values expose ``model_dump()``.

    Returns:
        A new DataFrame with the original columns followed by the flattened ones.
    """
    dumped = df[column].map(lambda model: model.model_dump())
    flattened = pd.json_normalize(dumped)
    flattened = flattened.add_prefix(column + ".")

    # Align on the caller's index so the column-wise concat matches row by row.
    flattened.index = df.index
    return pd.concat([df, flattened], axis=1)
# Show the evaluation results with each field of `result` in its own column.
expand_pydantic_column(evaluation_lineages_df, "result")

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,input,expected_output,metadata,output,result,tracer,lineage,result.answer_generated,result.world_knowledge_grading_output,result.world_knowledge_grading_output.reasoning,result.world_knowledge_grading_output.contains_world_knowledge
example_id,evaluation_id,run_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0484610a-2a1e-4e3d-976f-cb9af1e12e79,c38cefcb-7cdd-4c8e-9c2c-a299de8604d8,cd241456-3428-4e59-8713-e65ae33636ce,question='Wer fördert das Öl in der Republik K...,,,answer=None sources=[AnswerSource(chunk=Enrich...,answer_generated=False world_knowledge_grading...,<intelligence_layer.core.tracer.in_memory_trac...,<intelligence_layer.evaluation.infrastructure....,False,,,
04957245-6c0d-4e8a-9f4f-b60084e6bb0d,c38cefcb-7cdd-4c8e-9c2c-a299de8604d8,cd241456-3428-4e59-8713-e65ae33636ce,question='in welcher Beziehung steht das Erinn...,,,answer='Das Erinnerungsvermögen ist eng mit an...,answer_generated=True world_knowledge_grading_...,<intelligence_layer.core.tracer.in_memory_trac...,<intelligence_layer.evaluation.infrastructure....,True,,"Der Vergleichstext enthält Informationen, die ...",False
054b44a0-e2c1-4d9f-b8a0-a6fd30d89011,c38cefcb-7cdd-4c8e-9c2c-a299de8604d8,cd241456-3428-4e59-8713-e65ae33636ce,question='Welcher neue Flughafen soll in Lissa...,,,"answer='Der neue Flughafen, der in Lissabon ge...",answer_generated=True world_knowledge_grading_...,<intelligence_layer.core.tracer.in_memory_trac...,<intelligence_layer.evaluation.infrastructure....,True,,"Der Vergleichstext enthält nur Informationen, ...",False
078c24b6-6e98-4cd0-95aa-bcf15c5261b0,c38cefcb-7cdd-4c8e-9c2c-a299de8604d8,cd241456-3428-4e59-8713-e65ae33636ce,question='Wie lange im Jahr waren Flüsse in Os...,,,answer='Die Frage kann anhand des Textes nicht...,answer_generated=True world_knowledge_grading_...,<intelligence_layer.core.tracer.in_memory_trac...,<intelligence_layer.evaluation.infrastructure....,True,,Der Vergleichstext enthält keine Informationen...,False
0799e943-9b09-4aef-9711-e934c008e74b,c38cefcb-7cdd-4c8e-9c2c-a299de8604d8,cd241456-3428-4e59-8713-e65ae33636ce,question='Anhand welcher Holzmerkmale kann man...,,,answer='Anhand welcher Holzmerkmale kann man H...,answer_generated=True world_knowledge_grading_...,<intelligence_layer.core.tracer.in_memory_trac...,<intelligence_layer.evaluation.infrastructure....,True,,"Der Vergleichstext enthält alle Informationen,...",False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
f48895ac-87de-4029-aaaf-60b53b995154,c38cefcb-7cdd-4c8e-9c2c-a299de8604d8,cd241456-3428-4e59-8713-e65ae33636ce,question='Was passiert heute im Progr in Bern?...,,,answer=None sources=[AnswerSource(chunk=Enrich...,answer_generated=False world_knowledge_grading...,<intelligence_layer.core.tracer.in_memory_trac...,<intelligence_layer.evaluation.infrastructure....,False,,,
f6f7e84f-a38c-4b8e-818a-7b811fa79f45,c38cefcb-7cdd-4c8e-9c2c-a299de8604d8,cd241456-3428-4e59-8713-e65ae33636ce,question='In welchen Ländern wurde der Neoklas...,,,answer='Der Neoklassizismus war in den 1930er ...,answer_generated=True world_knowledge_grading_...,<intelligence_layer.core.tracer.in_memory_trac...,<intelligence_layer.evaluation.infrastructure....,True,,"Der Vergleichstext enthält die Information, da...",True
fd2aa7eb-8e6c-4ab5-ab8a-3edc58591bf3,c38cefcb-7cdd-4c8e-9c2c-a299de8604d8,cd241456-3428-4e59-8713-e65ae33636ce,question='Welche Displayauflösung hat der IPod...,,,answer='Die sechste Generation des iPod (iPod ...,answer_generated=True world_knowledge_grading_...,<intelligence_layer.core.tracer.in_memory_trac...,<intelligence_layer.evaluation.infrastructure....,True,,Der Vergleichstext enthält Informationen über ...,True
fd73e0c9-cd3f-426e-bd7a-dc74159ff781,c38cefcb-7cdd-4c8e-9c2c-a299de8604d8,cd241456-3428-4e59-8713-e65ae33636ce,question='Bei welcher Schlacht im britische-fr...,,,"answer='Die Schlacht, bei der die britische Na...",answer_generated=True world_knowledge_grading_...,<intelligence_layer.core.tracer.in_memory_trac...,<intelligence_layer.evaluation.infrastructure....,True,,"Der Vergleichstext enthält Informationen, die ...",True


In [46]:
from pathlib import Path
from intelligence_layer.core import FileTracer
# Send one stored trace file to the trace viewer.
# NOTE(review): this is an absolute path on one developer's machine, and the
# run id in it is not the run id shown in the lineage tables above, so the cell
# will not work elsewhere — point it at a path derived from the current
# run/example, or make it configurable.
FileTracer(
    Path("/Users/jens.luecke/PycharmProjects/two-day-rag-qa-workshop/day_2/repositories/runs/afc59634-82db-4d85-ad7f-534bfb8dd1f3/trace/0b9d3faf-b049-4354-bd2f-e0b5b6a11f7c.jsonl")
).submit_to_trace_viewer()

True

## Aggregation Step

In [47]:
def aggregate_evaluations(
    evaluation_overviews: list[EvaluationOverview],
) -> list[AggregationOverview]:
    """Aggregate each evaluation separately with the retriever-QA aggregation logic.

    Args:
        evaluation_overviews: Evaluations to aggregate.

    Returns:
        One aggregation overview per evaluation, in input order.
    """
    logic = RetrieverQaAggregationLogic()
    aggregator = Aggregator(
        evaluation_repository=repositories.evaluation_repo,
        aggregation_repository=repositories.aggregation_repo,
        description="default-aggregation",
        aggregation_logic=logic,
    )

    # Aggregating one evaluation at a time keeps the models' statistics separate.
    overviews = []
    for evaluation in evaluation_overviews:
        overviews.append(aggregator.aggregate_evaluation(evaluation.id))
    return overviews

In [48]:
# One AggregationOverview per evaluation, in the order of `evaluation_overviews`.
aggregation_overviews = aggregate_evaluations(evaluation_overviews)

In [24]:
aggregation_df = aggregation_overviews_to_pandas(aggregation_overviews)
# Label each aggregation row with the description of the first run of its
# first evaluation, so rows can be told apart by model.
aggregation_df["description"] = aggregation_df["evaluation_overviews"].map(
    lambda evaluation_overviews: evaluation_overviews[0]["run_overviews"][0][
        "description"
    ]
)

In [25]:
# NOTE(review): the stored output of this cell lists four runs whose
# descriptions do not match the two entries in `configs`, and its execution
# count is lower than that of the cells above — presumably left over from an
# earlier session; re-run the notebook top to bottom to refresh it.
aggregation_df

Unnamed: 0,evaluation_overviews,id,start,end,successful_evaluation_count,crashed_during_evaluation_count,description,labels,contains_no_world_knowledge,n_answers_generated
0,[{'run_overviews': [{'dataset_id': 'deepset/ge...,59744ee9-930a-4142-b7b0-4eebe978665d,2024-07-30T11:17:25.422158Z,2024-07-30T11:17:25.422278Z,100,0,luminous nextgen 7b,[],0.95,20
1,[{'run_overviews': [{'dataset_id': 'deepset/ge...,8ca0d5c1-1bd4-4f85-b18f-31bfd904f291,2024-07-30T11:17:25.436749Z,2024-07-30T11:17:25.436876Z,100,0,luminous nextgen 66b,[],1.0,19
2,[{'run_overviews': [{'dataset_id': 'deepset/ge...,cc78ac30-7406-4a78-bbe1-d5fc3d39774b,2024-07-30T11:17:25.447932Z,2024-07-30T11:17:25.448023Z,100,0,llama 3 8b,[],0.762712,59
3,[{'run_overviews': [{'dataset_id': 'deepset/ge...,910edc10-b234-446d-9e9c-b6163cb799ef,2024-07-30T11:17:25.456109Z,2024-07-30T11:17:25.456169Z,100,0,llama 3.1 8b,[],0.952381,21


In [26]:
# llama_3_1_index = 1
# luminous_nextgen_7b_index = 0

# evaluation_lineages_df = evaluation_lineages_to_pandas(
#     list(
#         navigator.evaluation_lineages(
#             evaluation_overviews[llama_3_1_index].id,
#             RetrieverBasedQaInput,
#             None,
#             MultipleChunkRetrieverQaOutput,
#             RetrieverQaEvaluation,
#         )
#     )
# )


# evaluation_lineages_df["contains_world_knowledge"] = evaluation_lineages_df.result.map(
#     lambda result: (
#         result.world_knowledge_grading_output.contains_world_knowledge
#         if result.world_knowledge_grading_output
#         else None
#     )
# )
# evaluation_lineages_df = evaluation_lineages_df[(evaluation_lineages_df["contains_world_knowledge"] | False)]
# hallucinated_example_ids = evaluation_lineages_df.reset_index().example_id
# evaluation_lineages_df
# # evaluation_lineages_df.iloc[0].lineage

In [27]:
# evaluation_lineages_df.iloc[0].lineage.tracers[0]