# Eval

In [None]:
#| default_exp eval

In [None]:
# | hide
%load_ext autoreload
%autoreload 2

In [None]:
#| export
import os
import json
import yaml
from stringdale import (
    Define,
    Scope,
    V,
    E,
    Condition,
    draw_nx
)
from stringdale.podtw import (
    parse_expected_trace,
    align_traces,
    word_overlap,
    regex,
    ExpectedTrace,
    Trace
)


from pathlib import Path
from frozendict import frozendict
from stringdale.core import  checkLogs
import pytest
import asyncio
from pydantic import BaseModel, ConfigDict

from typing import List, Union
import jsonlines

## Using podtw

In [None]:
# TODO multiple inputs
# TODO any comparison so we can check for existence of nodes

In [None]:
#| export

def parse_trace(trace_path:Union[str,Path]) -> List[Trace]:
    """
    Load a JSON-lines trace file and validate every record as a Trace.

    The returned list keeps the order of the records in the file.
    """
    parsed = []
    with jsonlines.open(trace_path) as reader:
        for record in reader:
            parsed.append(Trace.model_validate(record))
    return parsed

In [None]:
from stringdale.core import get_git_root

In [None]:
# Directory holding the sample evaluation fixtures
# (get_git_root() presumably returns the repository root -- confirm in stringdale.core).
sample_data_dir = get_git_root() / "sample_data" / "eval"

# A recorded trace (JSON lines) and the expected-trace spec (YAML) it is aligned against below.
example_trace_path = sample_data_dir / "traces0.jsonl"
example_expected_path = sample_data_dir / "expected0.yaml"


# Comparison functions keyed by name; passed to align_traces as `comparisons`.
example_comparisons = {
    "word_overlap":word_overlap,
    "regex":regex,
}

In [None]:
# Load the recorded trace (list of Trace) and the expected-trace spec from the sample files.
example_trace = parse_trace(example_trace_path)
example_expected = parse_expected_trace(example_expected_path)

In [None]:
# Align the recorded trace with the expected spec. Steps without an explicit comparison
# use word_overlap. `best_match` maps expected-step labels to indices in the actual trace;
# `dist` holds the per-candidate comparison details shown below.
best_match,score,dist =await align_traces(example_trace,example_expected,comparisons=example_comparisons,default_comparison=word_overlap)
best_match

frozendict.frozendict({'node_a1': 1, '3': 2, '1': 5, 'node_z': 4})

In [None]:
dist

{'node_a1': {1: {'comparisons': [{'comparison': 'word_overlap',
     'kwargs': {},
     'expected': 'jimmy went\nto the store\n',
     'actual': 'jimmy went\nto the store\nto buy some milk',
     'distance': 0.375}],
   'distance': 0.375,
   'expected_idx': 0,
   'actual_idx': 1,
   'actual_name': 'node_a',
   'expected_name': 'node_a',
   'expected_label': 'node_a1'}},
 'node_z': {1: {'comparisons': [{'comparison': 'word_overlap',
     'kwargs': {},
     'expected': 'store',
     'actual': {'b': {'c': 'jimmy went\nto the store\nto buy some milk'}},
     'distance': inf}],
   'distance': inf,
   'expected_idx': 2,
   'actual_idx': 1,
   'actual_name': 'node_a',
   'expected_name': 'node_.*',
   'expected_label': 'node_z'},
  2: {'comparisons': [{'comparison': 'word_overlap',
     'kwargs': {},
     'expected': 'store',
     'actual': {'b': {'c': 'store is good'}},
     'distance': inf}],
   'distance': inf,
   'expected_idx': 2,
   'actual_idx': 2,
   'actual_name': 'node_c',
   'expec

## Realistic Comparison Functions

In [None]:
#| export
import numpy as np
import asyncio
from stringdale.db import openai_embed
from stringdale.chat import Chat

In [None]:
#| export
async def cosine_dist(out: str, expected: str, model: str = 'text-embedding-3-small') -> float:
    """Compute the cosine distance between two strings using OpenAI embeddings.

    Both strings are embedded with ``openai_embed`` and the result is
    ``1 - cosine_similarity`` of the two embedding vectors, so smaller values
    mean the strings are closer.

    Args:
        out: First string to compare
        expected: Second string to compare
        model: OpenAI embedding model to use (default: 'text-embedding-3-small')

    Returns:
        float: Cosine distance between the two strings: 0 when the embeddings
        point in the same direction, growing up to 2 when they are opposite.
    """
    # The two embedding requests are independent, so issue them concurrently
    # instead of waiting for the first before starting the second.
    out_embedding, expected_embedding = await asyncio.gather(
        openai_embed(out, model=model),
        openai_embed(expected, model=model),
    )

    # Cosine similarity = dot product normalised by both vector lengths.
    similarity = np.dot(out_embedding, expected_embedding) / (
        np.linalg.norm(out_embedding) * np.linalg.norm(expected_embedding)
    )

    # Convert similarity to a distance so that identical strings score ~0.
    return 1 - similarity



In [None]:
# Sanity check: a string compared with itself should be at (near) zero cosine distance.
basic_dist = await cosine_dist("hello","hello")
basic_dist
assert basic_dist < 0.0001


In [None]:
await cosine_dist("hello","hello stranger")

0.39451383328055045

In [None]:
# Structured-output schema handed to Chat as `output_schema` by ChatEval.__call__.
# Documented with comments rather than a docstring so the model definition itself is untouched.
class ChatEvalScore(BaseModel):
    score:float  # ChatEval's system prompt asks the model for a value between 0 and 1

class ChatEval:
    """Comparison callable that asks a chat model to score how similar two strings are.

    Args:
        model: Name of the chat model passed to ``Chat``.
        system_prompt: Optional extra system message appended after the built-in
            instructions, e.g. to add task-specific scoring rules.

    Instances are awaited like a function: ``score = await ChatEval()(out, expected)``.

    NOTE(review): the built-in prompt asks for a *similarity* (1 = most similar),
    while the other comparisons in this module (``cosine_dist``, ``eq``) return a
    *distance* (0 = match). Confirm which direction the caller expects.
    """

    def __init__(self,model:str="gpt-4o-mini",system_prompt:Union[str,None]=None):
        self.model = model
        base_prompt = """
            You are a helpful assistant that evaluates the similarity of two strings.
            You will be given two strings, and you will need to evaluate the similarity of the two strings.
            You will need to return a score between 0 and 1, where 0 is the lowest similarity and 1 is the highest similarity.
            """
        # self.messages only ever holds the system message(s); the user message
        # for a given comparison is added per call in __call__.
        self.messages = [
            {"role":"system","content":base_prompt},
        ]
        if system_prompt is not None:
            self.messages.append({"role":"system","content":system_prompt})

    async def __call__(self,out:str,expected:str)->float:
        """Return the model's score for the pair ``(out, expected)``."""
        # Build a fresh message list for every call. Appending to self.messages
        # would make each later evaluation re-send all previously compared
        # string pairs, skewing the score and growing the prompt without bound.
        messages = [
            *self.messages,
            {"role":"user","content":f"string1: {out}\nstring2: {expected}"},
        ]
        chat = Chat(model=self.model,messages=messages,output_schema=ChatEvalScore)
        response = await chat()
        # response['content'] is the parsed ChatEvalScore instance.
        return response['content'].score


In [None]:
eval = ChatEval(system_prompt="if one of the strings is world, output 0.5")
result = await eval("hello","world")
assert result == 0.5

In [None]:
#| export
def eq(a,b):
    """Exact-match comparison: distance 0 when ``a == b``, infinite distance otherwise."""
    return 0 if a == b else np.inf

def any(a,b):
    """Comparison that accepts every pair of values: always returns distance 0.

    Presumably intended for steps where only the presence of a node matters,
    not its output -- confirm against the callers.

    NOTE(review): this definition shadows the builtin ``any`` for all code in
    this module that runs after it.
    """
    return 0

## Running and evaluating a single data point

In [None]:
#| export
from typing import List,Dict,Callable

In [None]:
#| export
# One evaluation data point: a list of trace records paired with an expected-trace spec.
# Documented with comments rather than a docstring so the model definition itself is untouched.
class DataPoint(BaseModel):
    traces:List[Trace]  # same element type that parse_trace returns
    expected:ExpectedTrace  # the spec the traces are meant to be checked against
    

In [None]:
from stringdale.examples.react import ReactAgent

In [None]:
#| export
async def evaluate_datapoint(agent,comparisons,default_comparison,expected_yaml,trace_out=None):
    """Run an agent on the inputs of an expected-trace spec and score the resulting trace.

    Args:
        agent: Zero-argument callable (e.g. an agent class) returning an object
            that exposes ``arun(input)`` as an async iterator of traces and a
            ``finished`` attribute.
        comparisons: Mapping from comparison name to comparison function,
            forwarded to ``align_traces``.
        default_comparison: Comparison forwarded to ``align_traces`` as its default.
        expected_yaml: Path (``str`` or ``Path``) of the expected-trace YAML file.
        trace_out: Where to write the recorded trace as JSON lines. Defaults to
            a sibling of ``expected_yaml`` whose name has ``.yaml`` replaced by
            ``.jsonl`` and ``expected`` replaced by ``actual``.

    Returns:
        Tuple ``(aligned_trace, score, debug_info, trace_out)``; the first three
        items are exactly what ``align_traces`` returned.

    Raises:
        ValueError: If ``trace_out`` is the same path as ``expected_yaml``, which
            would overwrite the spec with the recorded trace.
    """
    # Accept plain strings as well as Path objects for the spec location.
    expected_path = Path(expected_yaml)
    if trace_out is None:
        actual_name = expected_path.name.replace(".yaml", ".jsonl").replace("expected", "actual")
        trace_out = expected_path.parent/actual_name

    # When the spec name contains neither ".yaml" nor "expected" (e.g. "spec.yml")
    # the derived name is unchanged; refuse rather than clobber the spec file.
    if Path(trace_out) == expected_path:
        raise ValueError(
            f"trace_out ({trace_out}) would overwrite the expected trace file; "
            "pass an explicit, different trace_out"
        )

    expected_trace = parse_expected_trace(expected_yaml)

    runner = agent()
    with jsonlines.open(trace_out,'w') as writer:
        # Feed the inputs one at a time; stop early once the agent reports it is finished.
        for agent_input in expected_trace.input:
            async for trace in runner.arun(agent_input):
                # Persist only the fields needed for alignment, as JSON-safe values.
                writer.write(json.loads(trace.model_dump_json(include={'name','output','duration'})))
            if runner.finished:
                break

    # Re-read the file so alignment sees exactly what was persisted.
    parsed_trace = parse_trace(trace_out)
    aligned_trace,score,debug_info = await align_traces(parsed_trace,expected_trace,comparisons,default_comparison)

    return aligned_trace,score,debug_info,trace_out


In [None]:
# Arguments for evaluate_datapoint. `agent` is the class itself, not an instance:
# evaluate_datapoint calls it to build the agent it runs.
agent = ReactAgent
expected_yaml = sample_data_dir/"react_expected.yaml"
# Conditions that name a comparison look it up here ...
comparisons = {
    "eq":eq
}
# ... the rest are scored with the default (see the 'cosine_dist' entry in the debug output further down).
default_comparison = cosine_dist


In [None]:
# Run the agent on the spec's inputs, write the trace next to the spec, and align it.
# No trace_out is passed, so evaluate_datapoint derives the output path from expected_yaml.
alignment,score,debug_info,trace_out = await evaluate_datapoint(agent,comparisons,default_comparison,expected_yaml)
alignment,score,trace_out

(frozendict.frozendict({'0': 2, '1': 5}),
 0.3245190060308185,
 PosixPath('/Users/dean/dl/stringdale/sample_data/eval/react_actual.jsonl'))

In [None]:
# TODO from here try moving to UV

In [None]:
# aligned_trace,score,debug_info,trace_out = await evaluate_datapoint(agent,comparisons,default_comparison,expected_yaml)
# aligned_trace,score,trace_out

input=[[{'role': 'user', 'content': 'Question: what is obamas age to the power of 2?'}]] expected=[ExpectedTraceStep(name='use_tool', label='0', conditions=[Condition(accessor=('content', 'name'), value='wikipedia_search', comparison='eq', kwargs={}), Condition(accessor=('content', 'input', 'q'), value='Obama', comparison=None, kwargs={})], before=[], after=[]), ExpectedTraceStep(name='use_tool', label='1', conditions=[Condition(accessor=('content', 'name'), value='run_python_code', comparison='eq', kwargs={}), Condition(accessor=('content', 'output'), value=3969, comparison='eq', kwargs={})], before=[], after=['0'])]
[Trace(name='Start', output=[{'role': 'user', 'content': 'Question: what is obamas age to the power of 2?'}]), Trace(name='thinker', output={'role': 'assistant', 'content': {'type': 'final_answer', 'text': '3969'}, 'meta': {'input_tokens': 18047, 'output_tokens': 18}}), Trace(name='End', output={'role': 'assistant', 'content': {'type': 'final_answer', 'text': '3969'}, 'me

KeyError: '0'

In [None]:
# The live evaluate_datapoint cell binds its first result to `alignment`;
# `aligned_trace` is only assigned in the commented-out cell, so asserting on it
# fails with NameError on a fresh top-to-bottom run.
assert alignment == frozendict({'0':2,'1':5})

In [None]:
debug_info.keys()

dict_keys(['0', '1'])

In [None]:
debug_info['0'][2]

{'comparisons': [{'comparison': 'eq',
   'kwargs': {},
   'expected': 'wikipedia_search',
   'actual': 'wikipedia_search',
   'distance': 0},
  {'comparison': 'cosine_dist',
   'kwargs': {},
   'expected': 'Obama',
   'actual': 'Barack Obama',
   'distance': 0.3245190060308185}],
 'distance': 0.3245190060308185,
 'expected_idx': 0,
 'actual_idx': 2,
 'actual_name': 'use_tool',
 'expected_name': 'use_tool',
 'expected_label': '0'}

In [None]:
# TODO from here:
# get it by evalu
# get a data series of the distance per step, plus total metrics such as sum of distances and coverage

## DataSets

In [None]:
# lift this to a list of datapoints
# make the dataset class
# get it from a directory, and then eval it.

# make the time series into a dataframe of all datapoints (in case of directory take the filename relative to the directory)
# maybe give ability to have nested directories with regexs to filter the files?
 
# then we output a datasetRun class that has the results dataframe but also metadata about the run
# for now lets start with an agent name and run id

# when returning the datasetrun dataframe, also return the debug info which will be a dict of debug info for each datapoint

## Comparing Runs

In [None]:
# a function that takes 2 datasetRuns and returns a comparison per datapoint on the difference between the two runs
# then have a utility function that prints the summary
# and have a utility function that returns the k datapoints that regressed the most
# have a pprint version of it that actually plots the traces and the difference between them



## End to End regression testing

In [None]:
## show the following example

# always have a factory (or a nested factory expression)
# take the same factory and change the prompt
# take the same agent but give it a different db
# or maybe the same db but a different filter
# same factory but different fewshot examples yaml files
# and show regression testing

# our function should take 2 agents, and an EvalDataSet 
# return the comparison and print the summary as the k datapoints that regressed the most



## training and test set

In [None]:
## TODO in future, specify 2 datasets (by dirs and regex)
# one is the train one is the test
# we take an agent (or 2 for comparison)
# and we do the same logic for evaluating and comparing, however, we print the statistics only for the test set
# but we show the regression for the train set.

## Design

In [None]:
"""

Data model

We have a dataset
* containing tests
* each test has the input to the agent
* and the expected output
* test is any object that can be serialized to json
* expected output is a partial trace spec

* partial trace spec is a list of steps
* each step has a name is a dict with accessors and value are how to check them
* names are the node name we expect to see in the trace
* the dict defines what we expect the value to look like


When we run a dataset, we take the input, run the agent, and check the output against the partial trace spec
since the partial trace spec does not 


"""

'\n\nData model\n\nWe have a dataset\n* containing tests\n* each test has the input to the agent\n* and the expected output\n* test is any object that can be serialized to json\n* expected output is a partial trace spec\n\n* partial trace spec is a list of steps\n* each step has a name is a dict with accessors and value are how to check them\n* names are the node name we expect to see in the trace\n* the dict defines what we expect the value to look like\n\n\nWhen we run a dataset, we take the input, run the agent, and check the output against the partial trace spec\nsince the partial trace spec does not \n\n\n'

In [None]:
class ExpectedTrace:
    # Design placeholder with no fields or behavior yet.
    # NOTE(review): running this cell rebinds the name `ExpectedTrace`, which the
    # imports at the top of the notebook bring in from stringdale.podtw.
    pass

class DataPointRun:
    """Design placeholder for the result of running one data point (not implemented)."""
    # basically a list of traces, agent input and agent output
    pass




In [None]:
def collect_traces_from_file(file_path):
    """Design placeholder (not implemented): currently does nothing and returns None."""
    pass

def collect_traces_from_logg_aggregator(logger):
    """Design placeholder (not implemented): currently does nothing and returns None."""
    pass

def run_dataset(agent,dataset,output_dir):
    """Design placeholder (not implemented): currently does nothing and returns None.

    The comments below record the planned behavior.
    """
    # for each data point in the dataset
    # run the agent
    # collect the traces into a file
    # return the file path
    pass

def write_comparison_to_file(dataset_run,expected_traces,output_dir):
    """Design placeholder (not implemented): currently does nothing and returns None."""
    # run the comparison and write the results to a file
    pass


In [None]:
def runs_summary(runs,dir):
    """Design placeholder (not implemented): currently does nothing and returns None.

    The comments below record the planned behavior.
    """
    # get the run files and the comparison files
    # get the total metrics per expected node and total
    # make them into a dataframe
    pass

def plot_runs(runs,dir):
    """Design placeholder (not implemented): currently does nothing and returns None."""
    # call runs_summary
    # plot the results
    pass

def check_regressions(runs,dir):
    """Design placeholder (not implemented): currently does nothing and returns None.

    The comments below record the planned behavior.
    """
    # get two runs
    # for each input, if the second run is worse than the first, then flag it
    # make a dataframe of the regressions on a whole run basis

    # also make a dataframe of the regressions on a per node basis for the runs that regressed.
    pass


In [None]:
class DataSet:
    """Design placeholder for a collection of data points (not implemented)."""
    pass




## Experiment runs

In [None]:
# TODO start with directories of files with traces.
# here we just run the agent on the input and collect the traces to files
# Later, add a way to customize the runs from a logger or something 
# I think the best way would be to be able to turn the logs into a dataset file and work on it locally.

## Experiment scoring

In [None]:
# here we use the DPTW to match each trace to an expected trace
# then we have multiple scores
    # total distance, 
    # total distance per expected trace, 
    # coverage (percent of nodes expected), 
    # time coverage (percent of time of nodes expected), used to ignore nodes with no logic


# this experiment object can be dumped into a directory


## Regression detection

In [None]:

## Regression detection
# here we just compare the runs to each other


## export

In [None]:
# |hide
import nbdev; nbdev.nbdev_export()