In [1]:
%load_ext autoreload
%autoreload 2

# Evaluation

In [2]:
from typing import cast
import os
import urllib.parse

import pandas as pd
from datasets import DatasetDict, load_dataset
from intelligence_layer.connectors import (
    DocumentIndexClient,
    DocumentPath,
    CollectionPath,
    DocumentContents,
 DocumentIndexRetriever
)
from intelligence_layer.evaluation import (
    Dataset,
    Example,
    RepositoryNavigator,
    RunOverview,
    EvaluationOverview,
    AggregationOverview,
    run_lineages_to_pandas,
    evaluation_lineages_to_pandas,
    aggregation_overviews_to_pandas,
    Evaluator,
    Runner,
    Aggregator,
)
from intelligence_layer.examples import (
    RetrieverBasedQaInput,
    MultipleChunkRetrieverQaOutput,
    MultipleChunkRetrieverQa,
)
from intelligence_layer.core import (
    Language,
    ControlModel,
    LuminousControlModel,
    Llama3InstructModel,
)
from pydantic import BaseModel, ConfigDict

from rewe_workshop.repositories import init_repos
from rewe_workshop.evaluation import (
    RetrieverQaEvaluationLogic,
    RetrieverQaAggregationLogic,
    RetrieverQaEvaluation,
)

  from .autonotebook import tqdm as notebook_tqdm


## Dataset
### Load Data

In [3]:
HF_DATASET_NAME = "deepset/germanquad"
COLLECTION_NAME = "rewe-workshop-prep-100"
# COLLECTION_NAME = "rewe-workshop-prep"
# DATASET_ID = "deepset/germanquad"
DATASET_ID = "deepset/germanquad-100"

In [4]:
def load_german_quad():
    dataset = load_dataset(HF_DATASET_NAME, trust_remote_code=True)
    dataset = cast(DatasetDict, dataset)
    dataset = dataset["train"]

    data = dataset.to_pandas()
    data = cast(pd.DataFrame, data)

    data = data.sample(100, random_state=4711)
    # data = data.sample(20, random_state=4711)

    return data


In [5]:
data = load_german_quad()

### Store Data in Document Index for Retrieval

In [6]:
def setup_collection(di_client: DocumentIndexClient) -> CollectionPath:
    collection_path = CollectionPath(
        namespace="aleph-alpha", collection=COLLECTION_NAME
    )

    di_client.create_collection(collection_path=collection_path)

    di_client.assign_index_to_collection(
        collection_path=collection_path, index_name="asymmetric"
    )

    return collection_path

def store_german_quad_in_di(data: pd.DataFrame):
    texts = data.context.unique()

    di_client = DocumentIndexClient(os.getenv("AA_TOKEN"))
    collection_path = setup_collection(di_client)

    for text in texts:
        slug = urllib.parse.quote_plus(text[:10])
        document_path = DocumentPath(
            collection_path=collection_path, document_name=slug
        )

        try:
            document_contents = di_client.document(document_path)
        except Exception:
            print(f"Adding new doc: {slug}")
            document_contents = DocumentContents(contents=[text])
            di_client.add_document(
                document_path=document_path, contents=document_contents
            )



In [7]:
store_german_quad_in_di(data)

### Store Data (questions) in DatasetRepository for Evaluation

In [8]:
def store_german_quad_in_dataset_repo(data: pd.DataFrame) -> Dataset:
    repos = init_repos()

    questions = data.question

    dataset = repos.dataset_repo.dataset(DATASET_ID)
    if dataset is not None:
        return dataset

    examples: list[Example] = [
        Example(
            input=RetrieverBasedQaInput(question=question, language=Language("de")),
            expected_output=None,
        )
        for question in questions
    ]
    return repos.dataset_repo.create_dataset(
        examples=examples, dataset_name=DATASET_ID, id=DATASET_ID
    )


In [9]:
store_german_quad_in_dataset_repo(data)

Dataset ID = deepset/germanquad-100
Name = deepset/germanquad-100
Labels = set()
Metadata = {}

## Run Step

In [10]:
repositories = init_repos()
navigator = RepositoryNavigator(
    dataset_repository=repositories.dataset_repo,
    run_repository=repositories.run_repo,
    evaluation_repository=repositories.evaluation_repo,
)

In [11]:
class RunConfig(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True)
    description: str
    model: ControlModel


In [12]:
configs = [
    RunConfig(
        description="luminous nextgen 7b",
        model=LuminousControlModel("luminous-nextgen-7b-control-384k-kto"),
    ),
    RunConfig(
        description="luminous nextgen 66b",
        model=LuminousControlModel("luminous-nextgen-66b-control-512k"),
    ),
    RunConfig(
        description="llama 3 8b", model=LuminousControlModel("llama-3-8b-instruct")
    ),
    RunConfig(
        description="llama 3.1 8b", model=Llama3InstructModel("llama-3.1-8b-instruct")
    ),
]




No CLIENT_URL specified in environment, using default: https://api.aleph-alpha.com.


In [13]:
def build_task(model: ControlModel):
    di_client = DocumentIndexClient(os.getenv("AA_TOKEN"))
    retriever = DocumentIndexRetriever(
        di_client,
        index_name="asymmetric",
        namespace="aleph-alpha",
        collection="wikipedia-de",
        k=3,
    )

    task = MultipleChunkRetrieverQa(retriever=retriever, model=model)
    return task

def run_for_configs(configs: list[RunConfig]):
    run_overviews: list[RunOverview] = []
    for config in configs:
        task = build_task(config.model)

        print(config.description)
        runner = Runner(
            task,
            repositories.dataset_repo,
            repositories.run_repo,
            description=config.description,
        )

        run_overview = runner.run_dataset(DATASET_ID)
        run_overviews.append(run_overview)
    return run_overviews

In [14]:
run_overviews = run_for_configs(configs)

luminous nextgen 7b
luminous nextgen 66b
llama 3 8b
llama 3.1 8b


In [15]:
run_lineages = list(
    navigator.run_lineages(
        run_overviews[0].id, RetrieverBasedQaInput, None, MultipleChunkRetrieverQaOutput
    )
)
run_lineages_df = run_lineages_to_pandas(run_lineages)
run_lineages_df

Unnamed: 0_level_0,Unnamed: 1_level_0,input,expected_output,metadata,output,lineage
example_id,run_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
073116a6-0fe8-46e2-bdff-c4c70f40aa21,97656db1-fae8-4d67-b88d-b7f7cbbb3156,question='Seit wann gibt es Heavy Metal?' lang...,,,answer='Heavy Metal entstand in den 1970er und...,<intelligence_layer.evaluation.infrastructure....
09873257-a0a1-4e3a-a44b-4bccd1cb0adc,97656db1-fae8-4d67-b88d-b7f7cbbb3156,question='Was ist die Übertragungsgeschwindigk...,,,answer='DisplayPort 2.0 hat eine Übertragungsg...,<intelligence_layer.evaluation.infrastructure....
0c528633-ca2a-48f7-8999-6c022738ed47,97656db1-fae8-4d67-b88d-b7f7cbbb3156,question='Welche Straßen führen von Sichuan na...,,,answer=None sources=[AnswerSource(chunk=Enrich...,<intelligence_layer.evaluation.infrastructure....
0da21a3c-55ac-401a-a6b7-006fbecb35a2,97656db1-fae8-4d67-b88d-b7f7cbbb3156,question='Wer möchte das Tauschbörsen für Film...,,,answer=None sources=[AnswerSource(chunk=Enrich...,<intelligence_layer.evaluation.infrastructure....
15f6bde7-03ab-4c51-bfcc-9968fe327577,97656db1-fae8-4d67-b88d-b7f7cbbb3156,question='Wo in Indien liegt die Wüste Thar?' ...,,,answer=None sources=[AnswerSource(chunk=Enrich...,<intelligence_layer.evaluation.infrastructure....
...,...,...,...,...,...,...
eff4d5a4-a4aa-4083-beb3-46ff8c3ef49a,97656db1-fae8-4d67-b88d-b7f7cbbb3156,question='Ab welchem Jahr nahm die Bedeutung d...,,,"answer='Die Bedeutung der NWA nahm ab 1957 ab,...",<intelligence_layer.evaluation.infrastructure....
f3c2bd27-3711-436e-9c23-fa181f806241,97656db1-fae8-4d67-b88d-b7f7cbbb3156,question='wie lautet der Fachbegriff für das K...,,,answer=None sources=[AnswerSource(chunk=Enrich...,<intelligence_layer.evaluation.infrastructure....
f59748c0-4631-4f2d-bd26-8f97ce8181d6,97656db1-fae8-4d67-b88d-b7f7cbbb3156,question='Was bedeutet phonematisches Orthogra...,,,answer=None sources=[AnswerSource(chunk=Enrich...,<intelligence_layer.evaluation.infrastructure....
f793510f-2303-48e4-99a4-671198ad77d7,97656db1-fae8-4d67-b88d-b7f7cbbb3156,question='Wie wurde die Region Taoudénit Mali ...,,,answer=None sources=[AnswerSource(chunk=Enrich...,<intelligence_layer.evaluation.infrastructure....


## Evaluation Step

In [16]:
def evaluate_runs(run_overviews: list[RunOverview])-> list[EvaluationOverview]:
    eval_logic = RetrieverQaEvaluationLogic()

    evaluator = Evaluator(
        dataset_repository=repositories.dataset_repo,
        run_repository=repositories.run_repo,
        evaluation_repository=repositories.evaluation_repo,
        description="default-evaluation",
        evaluation_logic=eval_logic,
    )

    evaluation_overviews = [
        evaluator.evaluate_runs(run_overview.id) for run_overview in run_overviews
    ]
    return evaluation_overviews


In [17]:
evaluation_overviews = evaluate_runs(run_overviews)

Evaluating: 100it [00:30,  3.25it/s]
Evaluating: 100it [00:28,  3.54it/s]
Evaluating: 100it [02:07,  1.27s/it]
Evaluating: 100it [00:34,  2.86it/s]


In [49]:
evaluation_lineages = list(navigator.evaluation_lineages(
    evaluation_overviews[0].id,
    RetrieverBasedQaInput,
    None,
    MultipleChunkRetrieverQaOutput,
    RetrieverQaEvaluation,
))

evaluation_lineages_df = evaluation_lineages_to_pandas(evaluation_lineages)
evaluation_lineages_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,input,expected_output,metadata,output,result,tracer,lineage
example_id,evaluation_id,run_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
073116a6-0fe8-46e2-bdff-c4c70f40aa21,0fefb6ca-16da-4521-a429-c4fdb37f5414,97656db1-fae8-4d67-b88d-b7f7cbbb3156,question='Seit wann gibt es Heavy Metal?' lang...,,,answer='Heavy Metal entstand in den 1970er und...,answer_generated=True world_knowledge_grading_...,<intelligence_layer.core.tracer.in_memory_trac...,<intelligence_layer.evaluation.infrastructure....
09873257-a0a1-4e3a-a44b-4bccd1cb0adc,0fefb6ca-16da-4521-a429-c4fdb37f5414,97656db1-fae8-4d67-b88d-b7f7cbbb3156,question='Was ist die Übertragungsgeschwindigk...,,,answer='DisplayPort 2.0 hat eine Übertragungsg...,answer_generated=True world_knowledge_grading_...,<intelligence_layer.core.tracer.in_memory_trac...,<intelligence_layer.evaluation.infrastructure....
0c528633-ca2a-48f7-8999-6c022738ed47,0fefb6ca-16da-4521-a429-c4fdb37f5414,97656db1-fae8-4d67-b88d-b7f7cbbb3156,question='Welche Straßen führen von Sichuan na...,,,answer=None sources=[AnswerSource(chunk=Enrich...,answer_generated=False world_knowledge_grading...,<intelligence_layer.core.tracer.in_memory_trac...,<intelligence_layer.evaluation.infrastructure....
0da21a3c-55ac-401a-a6b7-006fbecb35a2,0fefb6ca-16da-4521-a429-c4fdb37f5414,97656db1-fae8-4d67-b88d-b7f7cbbb3156,question='Wer möchte das Tauschbörsen für Film...,,,answer=None sources=[AnswerSource(chunk=Enrich...,answer_generated=False world_knowledge_grading...,<intelligence_layer.core.tracer.in_memory_trac...,<intelligence_layer.evaluation.infrastructure....
15f6bde7-03ab-4c51-bfcc-9968fe327577,0fefb6ca-16da-4521-a429-c4fdb37f5414,97656db1-fae8-4d67-b88d-b7f7cbbb3156,question='Wo in Indien liegt die Wüste Thar?' ...,,,answer=None sources=[AnswerSource(chunk=Enrich...,answer_generated=False world_knowledge_grading...,<intelligence_layer.core.tracer.in_memory_trac...,<intelligence_layer.evaluation.infrastructure....
...,...,...,...,...,...,...,...,...,...
eff4d5a4-a4aa-4083-beb3-46ff8c3ef49a,0fefb6ca-16da-4521-a429-c4fdb37f5414,97656db1-fae8-4d67-b88d-b7f7cbbb3156,question='Ab welchem Jahr nahm die Bedeutung d...,,,"answer='Die Bedeutung der NWA nahm ab 1957 ab,...",answer_generated=True world_knowledge_grading_...,<intelligence_layer.core.tracer.in_memory_trac...,<intelligence_layer.evaluation.infrastructure....
f3c2bd27-3711-436e-9c23-fa181f806241,0fefb6ca-16da-4521-a429-c4fdb37f5414,97656db1-fae8-4d67-b88d-b7f7cbbb3156,question='wie lautet der Fachbegriff für das K...,,,answer=None sources=[AnswerSource(chunk=Enrich...,answer_generated=False world_knowledge_grading...,<intelligence_layer.core.tracer.in_memory_trac...,<intelligence_layer.evaluation.infrastructure....
f59748c0-4631-4f2d-bd26-8f97ce8181d6,0fefb6ca-16da-4521-a429-c4fdb37f5414,97656db1-fae8-4d67-b88d-b7f7cbbb3156,question='Was bedeutet phonematisches Orthogra...,,,answer=None sources=[AnswerSource(chunk=Enrich...,answer_generated=False world_knowledge_grading...,<intelligence_layer.core.tracer.in_memory_trac...,<intelligence_layer.evaluation.infrastructure....
f793510f-2303-48e4-99a4-671198ad77d7,0fefb6ca-16da-4521-a429-c4fdb37f5414,97656db1-fae8-4d67-b88d-b7f7cbbb3156,question='Wie wurde die Region Taoudénit Mali ...,,,answer=None sources=[AnswerSource(chunk=Enrich...,answer_generated=False world_knowledge_grading...,<intelligence_layer.core.tracer.in_memory_trac...,<intelligence_layer.evaluation.infrastructure....


In [56]:
def expand_pydantic_column(df: pd.DataFrame, column: str):
    normalized = pd.json_normalize(df[column].map(lambda x: x.model_dump())).add_prefix(
        column + "."
    )

    normalized.index = df.index
    return pd.concat(
        [
            df,
            normalized,
        ],
        axis=1,
    )
# expand_pydantic_column(evaluation_lineages_df, "result")

(Optional) Manually edit the run example output to add hallucination.

In [20]:
evaluation_lineages_df[
    (evaluation_lineages_df["contains_world_knowledge"] | False)
]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,input,expected_output,metadata,output,result,tracer,lineage,contains_world_knowledge
example_id,evaluation_id,run_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
3173ac7b-3ce7-4a3c-8b48-327c858c5d36,0fefb6ca-16da-4521-a429-c4fdb37f5414,97656db1-fae8-4d67-b88d-b7f7cbbb3156,question='Wann wurde die RSFSR gebildet? ' lan...,,,answer='Die Russische Sozialistische Föderativ...,answer_generated=True world_knowledge_grading_...,<intelligence_layer.core.tracer.in_memory_trac...,<intelligence_layer.evaluation.infrastructure....,True


In [21]:
from intelligence_layer.core import FileTracer
FileTracer(
    os.path.join("eval_tracer/e968469c-09df-410c-8392-afb3a41e1150")
).submit_to_trace_viewer()

True

## Aggregation Step

In [22]:
def aggregate_evaluations(
    evaluation_overviews: list[EvaluationOverview],
) -> list[AggregationOverview]:
    aggregation_logic = RetrieverQaAggregationLogic()
    aggregator = Aggregator(
        evaluation_repository=repositories.evaluation_repo,
        aggregation_repository=repositories.aggregation_repo,
        description="default-aggregation",
        aggregation_logic=aggregation_logic,
    )
    aggregation_overviews = [
        aggregator.aggregate_evaluation(evaluation_overview.id)
        for evaluation_overview in evaluation_overviews
    ]
    return aggregation_overviews

In [23]:
aggregation_overviews = aggregate_evaluations(evaluation_overviews)

In [24]:
aggregation_df = aggregation_overviews_to_pandas(aggregation_overviews)
aggregation_df["description"] = aggregation_df["evaluation_overviews"].map(
    lambda evaluation_overviews: evaluation_overviews[0]["run_overviews"][0][
        "description"
    ]
)

In [25]:
aggregation_df

Unnamed: 0,evaluation_overviews,id,start,end,successful_evaluation_count,crashed_during_evaluation_count,description,labels,contains_no_world_knowledge,n_answers_generated
0,[{'run_overviews': [{'dataset_id': 'deepset/ge...,59744ee9-930a-4142-b7b0-4eebe978665d,2024-07-30T11:17:25.422158Z,2024-07-30T11:17:25.422278Z,100,0,luminous nextgen 7b,[],0.95,20
1,[{'run_overviews': [{'dataset_id': 'deepset/ge...,8ca0d5c1-1bd4-4f85-b18f-31bfd904f291,2024-07-30T11:17:25.436749Z,2024-07-30T11:17:25.436876Z,100,0,luminous nextgen 66b,[],1.0,19
2,[{'run_overviews': [{'dataset_id': 'deepset/ge...,cc78ac30-7406-4a78-bbe1-d5fc3d39774b,2024-07-30T11:17:25.447932Z,2024-07-30T11:17:25.448023Z,100,0,llama 3 8b,[],0.762712,59
3,[{'run_overviews': [{'dataset_id': 'deepset/ge...,910edc10-b234-446d-9e9c-b6163cb799ef,2024-07-30T11:17:25.456109Z,2024-07-30T11:17:25.456169Z,100,0,llama 3.1 8b,[],0.952381,21


In [26]:
# llama_3_1_index = 1
# luminous_nextgen_7b_index = 0

# evaluation_lineages_df = evaluation_lineages_to_pandas(
#     list(
#         navigator.evaluation_lineages(
#             evaluation_overviews[llama_3_1_index].id,
#             RetrieverBasedQaInput,
#             None,
#             MultipleChunkRetrieverQaOutput,
#             RetrieverQaEvaluation,
#         )
#     )
# )


# evaluation_lineages_df["contains_world_knowledge"] = evaluation_lineages_df.result.map(
#     lambda result: (
#         result.world_knowledge_grading_output.contains_world_knowledge
#         if result.world_knowledge_grading_output
#         else None
#     )
# )
# evaluation_lineages_df = evaluation_lineages_df[(evaluation_lineages_df["contains_world_knowledge"] | False)]
# hallucinated_example_ids = evaluation_lineages_df.reset_index().example_id
# evaluation_lineages_df
# # evaluation_lineages_df.iloc[0].lineage

In [27]:
# evaluation_lineages_df.iloc[0].lineage.tracers[0]