In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Evaluation

In [2]:
from typing import cast
import os
import urllib.parse

import pandas as pd
from datasets import DatasetDict, load_dataset
from intelligence_layer.connectors import (
    DocumentIndexClient,
    DocumentPath,
    CollectionPath,
    DocumentContents,
 DocumentIndexRetriever
)
from intelligence_layer.evaluation import (
    Dataset,
    Example,
    RepositoryNavigator,
    RunOverview,
    EvaluationOverview,
    AggregationOverview,
    run_lineages_to_pandas,
    evaluation_lineages_to_pandas,
    aggregation_overviews_to_pandas,
    Evaluator,
    Runner,
    Aggregator,
)
from intelligence_layer.examples import (
    RetrieverBasedQaInput,
    MultipleChunkRetrieverQaOutput,
    MultipleChunkRetrieverQa,
)
from intelligence_layer.core import (
    Language,
    ControlModel,
    LuminousControlModel,
    Llama3InstructModel,
    Task,
    TaskSpan
)
from pydantic import BaseModel, ConfigDict

from rewe_workshop.repositories import init_repos
from rewe_workshop.evaluation import (
    RetrieverQaEvaluationLogic,
    RetrieverQaAggregationLogic,
    RetrieverQaEvaluation,
)

## Dataset

1. Load the following Hugging Face dataset: `deepset/germanquad` (use only a small sample, e.g. 10 examples).
2. Insert the context into the document index.
3. Save the dataset in a `FileDatasetRepository`.


### Load Data

### Store Contexts in Document Index for Retrieval

### Store Questions in DatasetRepository for Evaluation

## Run Step
1. Create a `FileRunRepository` and a `Runner` and run the `MultipleChunkRetrieverQa` task for our dataset.
2. Create a `RepositoryNavigator` and retrieve the lineages for the run.
3. Convert the lineages to a pandas DataFrame with the `run_lineages_to_pandas` function.
4. Restructure the code so that you can specify multiple configurations for the task; each configuration should then create its own run.

## Evaluation Step

1. Write a custom `EvaluationLogic` and use it on your runs with an `Evaluator`. The custom `EvaluationLogic` does not need to do anything sensible as a first step.
2. Write a new task `WorldKnowledgeGrader` that detects whether a given text contains information that is not found in a reference text. Use the classes below as a template.
3. Adapt your custom `EvaluationLogic` to use the `WorldKnowledgeGrader`.
4. Retrieve the `EvaluationLineage`s with the `RepositoryNavigator` and convert them to a Dataframe with the `evaluation_lineages_to_pandas` function.   
**Hint**: To make the DataFrame output more readable, you can use the `expand_pydantic_column` function defined below.

In [5]:
class WorldKnowledgeGradingInput(BaseModel):
    """Input of the `WorldKnowledgeGrader` task: a pair of texts to compare."""

    # Text that serves as the reference for the comparison
    # (presumably the retrieved source text -- confirm when wiring up the evaluation).
    reference_text: str
    # Text that is checked against the reference
    # (presumably the generated answer -- confirm when wiring up the evaluation).
    compare_text: str


class WorldKnowledgeGradingOutput(BaseModel):
    """Result produced by the `WorldKnowledgeGrader` task."""

    # Free-text explanation accompanying the verdict.
    reasoning: str
    # Verdict flag; presumably True when the compared text contains information
    # that is not found in the reference text -- confirm against the grader logic.
    contains_world_knowledge: bool

class WorldKnowledgeGrader(
    Task[WorldKnowledgeGradingInput, WorldKnowledgeGradingOutput]
):
    """Template task that grades a text against a reference text.

    The body is a placeholder: `do_run` ignores both its input and the stored
    model and always returns an empty reasoning with a negative verdict.
    """

    def __init__(self, model: ControlModel):
        """Keep the given model on the instance.

        Args:
            model: Control model stored as `self._model`; the placeholder
                `do_run` does not use it yet.
        """
        super().__init__()
        self._model = model

    def do_run(
        self, input: WorldKnowledgeGradingInput, task_span: TaskSpan
    ) -> WorldKnowledgeGradingOutput:
        """Return a fixed placeholder grading.

        Args:
            input: Reference text and text to compare (currently unused).
            task_span: Tracing span for this run (currently unused).

        Returns:
            A `WorldKnowledgeGradingOutput` with an empty `reasoning` and
            `contains_world_knowledge` set to False.
        """
        placeholder_reasoning = ""
        placeholder_verdict = False
        return WorldKnowledgeGradingOutput(
            reasoning=placeholder_reasoning,
            contains_world_knowledge=placeholder_verdict,
        )



In [6]:
def expand_pydantic_column(df: pd.DataFrame, column: str):
    """Append the flattened fields of a model-valued column to a DataFrame.

    Every value in `df[column]` must provide a `model_dump()` method. The
    resulting dictionaries are flattened with `pd.json_normalize` and each
    new column is named `<column>.<field path>`.

    Args:
        df: Frame holding the column to expand; it is not modified.
        column: Name of the column whose values are dumped and flattened.

    Returns:
        A new DataFrame with the original columns followed by the expanded
        ones, sharing the index of `df`.
    """
    dumped_models = df[column].map(lambda model: model.model_dump())
    flattened = pd.json_normalize(dumped_models)
    expanded = flattened.add_prefix(column + ".")

    # Re-align with the source frame so the column-wise concat matches rows
    # even when `df` does not use a default 0..n-1 index.
    expanded.index = df.index
    return pd.concat([df, expanded], axis=1)

(Optional) Manually edit the output of a run example to add a hallucination.

## Aggregation Step
1. Create a `FileAggregationRepository` and an `Aggregator` and aggregate each evaluation from the previous step.
2. Convert the resulting `AggregationOverview`s to pandas with the `aggregation_overviews_to_pandas` function and look at the final results.
3. (Optional) Edit a run manually to produce a hallucinated answer to check whether the evaluation actually works.

**Hint**: You can use the following code snippet to show the run description in the DataFrame.

```python
# For every row, take the description of the first run overview of the first
# evaluation overview and store it in a new "description" column.
aggregation_df["description"] = aggregation_df["evaluation_overviews"].map(
    lambda evaluation_overviews: evaluation_overviews[0]["run_overviews"][0][
        "description"
    ]
)
```