# Eval

In [None]:
#| default_exp eval

In [None]:
# | hide
%load_ext autoreload
%autoreload 2

In [None]:
#| export
import os
import json
import yaml
from stringdale import (
    Define,
    Scope,
    V,
    E,
    Condition,
    draw_nx
)
from stringdale.podtw import (
    parse_expected_trace,
    align_traces,
    word_overlap,
    regex,
    ExpectedTrace,
    Trace
)


from pathlib import Path
from frozendict import frozendict
from stringdale.core import  checkLogs
import pytest
import asyncio
from pydantic import BaseModel, ConfigDict

from typing import List, Union
import jsonlines
import logging


In [None]:
#| export
logger = logging.getLogger(__name__)

## Using podtw

In [None]:
# TODO multiple inputs
# TODO add an "any" comparison so we can check for the existence of nodes

In [None]:
#| export

def parse_trace(trace_path:Union[str,Path]) -> List[Trace]:
    """
    Read a JSON-lines trace file and validate every record into a Trace.

    Args:
        trace_path: location of the ``.jsonl`` file to read.

    Returns:
        The validated Trace objects, in file order.
    """
    traces = []
    with jsonlines.open(trace_path) as reader:
        for record in reader:
            traces.append(Trace.model_validate(record))
    return traces

In [None]:
from stringdale.core import get_git_root

In [None]:
sample_data_dir = get_git_root() / "sample_data" / "eval"

example_trace_path = sample_data_dir / "traces0.jsonl"
example_expected_path = sample_data_dir / "expected0.yaml"


example_comparisons = {
    "word_overlap":word_overlap,
    "regex":regex,
}

In [None]:
example_trace = parse_trace(example_trace_path)
example_expected = parse_expected_trace(example_expected_path)

In [None]:
best_match,score,dist =await align_traces(example_trace,example_expected,comparisons=example_comparisons,default_comparison=word_overlap)
best_match

frozendict.frozendict({'node_a1': 1, '3': 2, '1': 5, 'node_z': 4})

In [None]:
dist

{'node_a1': {1: {'comparisons': [{'comparison': 'word_overlap',
     'kwargs': {},
     'expected': 'jimmy went\nto the store\n',
     'actual': 'jimmy went\nto the store\nto buy some milk',
     'distance': 0.375,
     'accessor': 'b.c'}],
   'distance': 0.375,
   'expected_idx': 0,
   'actual_idx': 1,
   'actual_name': 'node_a',
   'expected_name': 'node_a',
   'expected_label': 'node_a1'}},
 'node_z': {1: {'comparisons': [{'comparison': 'word_overlap',
     'kwargs': {},
     'expected': 'store',
     'actual': {'b': {'c': 'jimmy went\nto the store\nto buy some milk'}},
     'distance': inf,
     'accessor': '.'}],
   'distance': inf,
   'expected_idx': 2,
   'actual_idx': 1,
   'actual_name': 'node_a',
   'expected_name': 'node_.*',
   'expected_label': 'node_z'},
  2: {'comparisons': [{'comparison': 'word_overlap',
     'kwargs': {},
     'expected': 'store',
     'actual': {'b': {'c': 'store is good'}},
     'distance': inf,
     'accessor': '.'}],
   'distance': inf,
   'expecte

## Realistic Comparison Functions

In [None]:
#| export
import numpy as np
import asyncio
from stringdale.db import openai_embed
from stringdale.chat import Chat

In [None]:
#| export
async def cosine_dist(out: str, expected: str, model: str = 'text-embedding-3-small') -> float:
    """Compute the cosine distance between two strings using OpenAI embeddings.

    Args:
        out: First string to compare
        expected: Second string to compare
        model: OpenAI embedding model to use (default: 'text-embedding-3-small')

    Returns:
        float: Cosine *distance*, i.e. ``1 - cosine_similarity`` of the two
            embeddings. 0 means the embeddings point in the same direction;
            larger values mean less similar strings (mathematically in [0, 2]).
    """
    # The two embedding requests are independent, so issue them concurrently
    # instead of waiting for the first before starting the second.
    out_embedding, expected_embedding = await asyncio.gather(
        openai_embed(out, model=model),
        openai_embed(expected, model=model),
    )

    # Cosine similarity = dot product divided by the product of the norms.
    dot_product = np.dot(out_embedding, expected_embedding)
    norm_product = np.linalg.norm(out_embedding) * np.linalg.norm(expected_embedding)

    # Convert similarity into a distance so that "smaller is better".
    return 1 - dot_product / norm_product



In [None]:
basic_dist = await cosine_dist("hello","hello")
basic_dist
assert basic_dist < 0.0001


In [None]:
await cosine_dist("hello","hello stranger")

np.float64(0.3944818489490096)

In [None]:
# Structured-output schema handed to Chat (as output_schema) by ChatEval.
class ChatEvalScore(BaseModel):
    # Similarity score produced by the model; the ChatEval base prompt asks for a value between 0 and 1.
    score:float

class ChatEval:
    """
    Comparison function backed by a chat model.

    An instance is awaited as ``await chat_eval(out, expected)`` and returns the
    score the model assigned to the pair of strings.
    """
    def __init__(self,model:str="gpt-4o-mini",system_prompt:str=None):
        """
        Args:
            model: name of the chat model passed to ``Chat``.
            system_prompt: optional extra system message appended after the base prompt.
        """
        self.model = model
        base_prompt = """
            You are a helpful assistant that evaluates the similarity of two strings.
            You will be given two strings, and you will need to evaluate the similarity of the two strings.
            You will need to return a score between 0 and 1, where 0 is the lowest similarity and 1 is the highest similarity.
            """
        # Shared prefix of every conversation: system messages only.
        self.messages = [
            {"role":"system","content":base_prompt},
        ]
        if system_prompt is not None:
            self.messages.append({"role":"system","content":system_prompt})

    async def __call__(self,out:str,expected:str)->float:
        """
        Ask the model to score the similarity of ``out`` and ``expected``.

        Returns:
            The ``score`` field of the ``ChatEvalScore`` returned by the chat.
        """
        # Build a fresh message list per call. Appending to self.messages would
        # make every later comparison see the user messages of earlier ones.
        messages = [
            *self.messages,
            {"role":"user","content":f"string1: {out}\nstring2: {expected}"},
        ]
        chat = Chat(model=self.model,messages=messages,output_schema=ChatEvalScore)
        response = await chat()
        return response['content'].score


In [None]:
chat_eval = ChatEval(system_prompt="if one of the strings is world, output 0.5")
result = await chat_eval("hello","world")
assert result == 0.5

In [None]:
#| export
def eq(a,b):
    """Exact-match comparison: distance 0 when ``a == b``, infinite otherwise."""
    return 0 if a == b else np.inf

def any(a,b):
    """Wildcard comparison: every pair of values is at distance 0."""
    # NOTE: this name shadows the builtin `any` in any module that imports it.
    distance = 0
    return distance

In [None]:
out = 3500
expr = "({0} < 4000) & ({0} > 3000)"

In [None]:
f_expr = expr.format(out)
f_expr

'(3500 < 4000) & (3500 > 3000)'

In [None]:
#| export
from stringdale.tools import run_python_code


In [None]:
#| export
def safe_eval(out,expression):
    """
    Evaluate ``expression`` against ``out`` and turn the result into a distance.

    ``expression`` is a ``str.format`` template; ``out`` is substituted for ``{0}``/``{}``
    and the resulting text is executed with ``run_python_code``.

    Args:
        out: the actual value to check.
        expression: format template containing the Python code to evaluate.

    Returns:
        * 0 if the code evaluates to True, ``np.inf`` if it evaluates to False
        * the value itself if the code evaluates to a float
        * ``np.inf`` if formatting fails, evaluation reports an error, or the
          result is of any other type.
    """
    # NOTE(review): the formatted expression is executed as Python code. Whether
    # run_python_code sandboxes it cannot be seen from here - only use with
    # trusted expected-trace files until confirmed.
    try:
        formatted_expressions = expression.format(out)
    except Exception as e:
        logger.warning(f"Error formatting expression: {expression} with value {out}, error: {e}")
        return np.inf
    value = run_python_code(formatted_expressions)
    if isinstance(value,str) and value.startswith("Error"):
        # The error text is carried by `value`. The exception variable of the
        # `except` clause above is not bound here, so it must not be referenced.
        logger.warning(
            f"Error evaluating expression: {formatted_expressions} = {value}\n"
            f"out: {out}\n"
            f"expression: {expression}"
        )
        return np.inf
    logger.debug(f"safe_eval: {formatted_expressions} = {value}")
    if isinstance(value,bool):
        return 0 if value else np.inf
    elif isinstance(value,float):
        return value
    else:
        logger.debug(
            f"When evaluating {expression} with value {out}\n"
            f"Expected float or bool, got {type(value)} with value {repr(value)}"
            )
        return np.inf

In [None]:
eval_string ="""
x=4000
({0} < x) & ({0} > 3000)
"""

with checkLogs():
    y =safe_eval(3500,eval_string)
y

__main__ - DEBUG - safe_eval: 
x=4000
(3500 < x) & (3500 > 3000)
 = True


0

In [None]:
safe_eval(3500,"""
x=4000
({0} < x) & ({0} > 3000)
""")

0

## Running and evaluating a single data point

In [None]:
#| export
from typing import List,Dict,Callable

In [None]:
#| export
# Pairs the recorded traces of one run with the expected trace they are checked against.
class DataPoint(BaseModel):
    # Trace records of the run (the Trace type comes from stringdale.podtw).
    traces:List[Trace]
    # Expected-trace specification (the ExpectedTrace type comes from stringdale.podtw).
    expected:ExpectedTrace
    

In [None]:
from stringdale.examples.react import ReactAgent

In [None]:
#| export

async def _run_agent(Agent,expected_trace,trace_out):
    """
    Instantiate ``Agent``, feed it the inputs of ``expected_trace`` and record its traces.

    Every trace yielded by ``arun`` is written to ``trace_out`` as one JSON line
    holding only its name, output and duration. Feeding stops early once the
    agent reports ``finished``.
    """
    agent = Agent()
    dumped_fields = {'name','output','duration'}
    with jsonlines.open(trace_out,'w') as writer:
        for agent_input in expected_trace.input:
            async for trace in agent.arun(agent_input):
                record = json.loads(trace.model_dump_json(include=dumped_fields))
                writer.write(record)
            if agent.finished:
                break

async def evaluate_datapoint(Agent,comparisons,default_comparison,expected_yaml,trace_out=None,force_run=False):
    """
    Run (or reuse) an agent trace for one expected-trace file and align the two.

    Args:
        Agent: callable returning a fresh agent instance.
        comparisons: mapping of comparison name to comparison function.
        default_comparison: comparison used when the expected trace names none.
        expected_yaml: path (``str`` or ``Path``) of the expected-trace YAML file.
        trace_out: path of the trace ``.jsonl`` file. Defaults to a sibling of
            ``expected_yaml`` with a ``.jsonl`` suffix and "expected" replaced
            by "actual" in the file name.
        force_run: re-run the agent even if ``trace_out`` already exists.

    Returns:
        Tuple ``(aligned_trace, score, debug_info, trace_out)`` where the first three
        items come from ``align_traces`` and ``trace_out`` is the trace file used.

    Raises:
        ValueError: if the expected trace cannot be parsed.
    """
    expected_yaml = Path(expected_yaml)
    if trace_out is None:
        # Swap the suffix rather than replacing the literal ".yaml": ".yml" files
        # would otherwise keep their extension and could map onto the expected file.
        trace_name = expected_yaml.with_suffix(".jsonl").name.replace("expected", "actual")
        trace_out = expected_yaml.with_name(trace_name)
    else:
        trace_out = Path(trace_out)

    trace_out.parent.mkdir(parents=True,exist_ok=True)

    try:
        expected_trace = parse_expected_trace(expected_yaml)
    except Exception as e:
        raise ValueError(f"Error parsing expected trace {expected_yaml}: {e}") from e

    if force_run or not trace_out.exists():
        if not trace_out.exists():
            logger.info(f"Trace file {trace_out.name} does not exist, running agent")
        else:
            logger.info(f"Force running {trace_out.name}")
        await _run_agent(Agent,expected_trace,trace_out)
    else:
        # An existing trace file acts as a cache of the previous agent run.
        logger.info(f"Trace file {trace_out.name} already exists, skipping agent run")

    parsed_trace = parse_trace(trace_out)
    aligned_trace,score,debug_info = await align_traces(parsed_trace,expected_trace,comparisons,default_comparison)

    return aligned_trace,score,debug_info,trace_out


In [None]:
agent = ReactAgent
expected_yaml = sample_data_dir/"react_expected.yaml"
bad_expected_yaml = sample_data_dir/"react_bad_expected.yaml"
comparisons = {
    "eq":eq,
    "eval":safe_eval,
}
default_comparison = cosine_dist


In [None]:
#| export

with checkLogs():
    alignment,score,debug_info,trace_out = await evaluate_datapoint(agent,comparisons,default_comparison,bad_expected_yaml)

assert alignment is None
alignment,score,trace_out


__main__ - INFO - Trace file react_bad_actual.jsonl already exists, skipping agent run
No viable trace row nums for expected trace 1
No possible mappings found


(None,
 inf,
 PosixPath('/Users/dean/dl/stringdale/sample_data/eval/react_bad_actual.jsonl'))

In [None]:
with checkLogs(level='INFO'):
    alignment,score,debug_info,trace_out = await evaluate_datapoint(agent,comparisons,default_comparison,expected_yaml)

assert dict(alignment) == {'0': 2, '1': 8}
alignment,score,trace_out

__main__ - INFO - Trace file react_actual.jsonl already exists, skipping agent run


(frozendict.frozendict({'0': 2, '1': 8}),
 np.float64(0.3244313858854829),
 PosixPath('/Users/dean/dl/stringdale/sample_data/eval/react_actual.jsonl'))

In [None]:
#| export
import pandas as pd

In [None]:
#| export

def _pd_order_columns_first(df:pd.DataFrame,first_columns:list[str]):
    """
    Return ``df`` with ``first_columns`` moved to the front.

    The remaining columns follow in their original relative order.
    """
    remaining = [col for col in df.columns if col not in first_columns]
    ordered = first_columns + remaining
    return df[ordered]



In [None]:
x = pd.DataFrame([
    {'distance':1,'comparison':'eq','actual':1,'expected':1},
    {'distance':2,'comparison':'eq','actual':2,'expected':2},
    {'distance':3,'comparison':'eq','actual':3,'expected':3},
])

_pd_order_columns_first(x,['actual','expected'])

Unnamed: 0,actual,expected,distance,comparison
0,1,1,1,eq
1,2,2,2,eq
2,3,3,3,eq


In [None]:
# debug_info

In [None]:
#| export
from copy import deepcopy

In [None]:
#| export

def summarize_datapoint(name,alignment,debug_info):
    """
    Summarize a datapoint as one row per comparison of every aligned step.

    Args:
        name: datapoint name, stored in the ``datapoint`` column.
        alignment: mapping of expected node id to the index of the matched trace,
            or ``None`` when no alignment was found.
        debug_info: nested mapping ``debug_info[expected_node_id][trace_idx]`` holding
            the ``comparisons`` list and the ``actual_name``/``expected_name`` of the match.

    Returns:
        A DataFrame whose leading columns are datapoint, expected_node_id, trace_idx,
        accessor, comparison, actual, expected and distance. It is empty (but has
        these columns) when there is nothing aligned.
    """
    first_columns = ['datapoint','expected_node_id','trace_idx','accessor','comparison','actual','expected','distance']
    deep_dive_fit = []

    # alignment is None when no viable mapping exists; treat it as "nothing aligned".
    for expected_node_id,trace_idx in (alignment or {}).items():
        match_data = debug_info[expected_node_id][trace_idx]
        for comp in match_data['comparisons']:
            # Copy so that annotating the row does not mutate debug_info.
            summary = deepcopy(comp)
            summary['node_name'] = match_data['actual_name']
            summary['expected_name'] = match_data['expected_name']
            summary['expected_node_id'] = expected_node_id
            summary['trace_idx'] = trace_idx
            # TODO put node name and node pattern
            deep_dive_fit.append(summary)

    if not deep_dive_fit:
        # Without rows the comparison columns do not exist, so reordering would fail.
        return pd.DataFrame(columns=first_columns)

    df = pd.DataFrame(deep_dive_fit)
    df['datapoint'] = name
    df = _pd_order_columns_first(df,first_columns)
    return df

In [None]:
df = summarize_datapoint('react',alignment,debug_info)
df

Unnamed: 0,datapoint,expected_node_id,trace_idx,accessor,comparison,actual,expected,distance,kwargs,node_name,expected_name
0,react,0,2,content.name,eq,wikipedia_search,wikipedia_search,0.0,{},use_tool,use_tool
1,react,0,2,content.input.q,cosine_dist,Barack Obama,Obama,0.324431,{},use_tool,use_tool
2,react,1,8,content.name,eq,run_python_code,run_python_code,0.0,{},use_tool,use_tool
3,react,1,8,content.output,safe_eval,3844,({0} < 4000) & ({0} > 3000),0.0,{},use_tool,use_tool


In [None]:
assert df['expected_node_id'].to_list() == ['0','0','1','1']
assert df['accessor'].to_list() == ['content.name','content.input.q','content.name','content.output']

## DataSets

In [None]:
#| export
from stringdale import DiagramSchema
from pprint import pprint, pformat

In [None]:
#| export
def _trace_out_path(expected_yaml:Path,expected_dir:Path,trace_dir:Path):
    """Mirror ``expected_yaml``'s location under ``expected_dir`` into ``trace_dir``, with a ``.jsonl`` suffix."""
    relative_yaml = expected_yaml.relative_to(expected_dir)
    relative_trace = relative_yaml.with_suffix(".jsonl")
    return trace_dir / relative_trace



In [None]:
#| export
# Result bundle of evaluating one agent over a directory of expected traces.
class EvalDataset(BaseModel):
    # DataFrames are not pydantic-native types, so arbitrary types must be allowed.
    model_config = ConfigDict(arbitrary_types_allowed=True)
    expected_dir: Path
    trace_dir: Path
    summary: pd.DataFrame
    details: pd.DataFrame
    debug: dict

    def __repr__(self):
        """Compact representation showing the directories and only the shapes of the dataframes."""
        parts = [
            f"EvalDataset(expected_dir={self.expected_dir}",
            f"  trace_dir={self.trace_dir}",
            f"  summary=Dataframe({self.summary.shape})",
            f"  details=Dataframe({self.details.shape})",
            "  debug=dict)",
        ]
        return ", \n".join(parts)
        

In [None]:
#| export

def _find_yamls(expected_dir:Path):
    """Recursively collect the ``.yaml`` files, followed by the ``.yml`` files, under ``expected_dir``."""
    return [
        *expected_dir.glob("**/*.yaml"),
        *expected_dir.glob("**/*.yml"),
    ]


async def eval_dataset(Agent:DiagramSchema,expected_dir,trace_dir,force_run=False,comparisons=None,default_comparison=None):
    """
    Evaluate ``Agent`` on every expected-trace YAML file found under ``expected_dir``.

    Args:
        Agent: agent schema; called to create a fresh agent per datapoint.
        expected_dir: directory searched recursively for ``.yaml``/``.yml`` expected traces.
        trace_dir: directory the trace ``.jsonl`` files are written to / read from,
            mirroring the layout of ``expected_dir``.
        force_run: re-run the agent even when a trace file already exists.
        comparisons: mapping of comparison name to comparison function.
        default_comparison: comparison used when an expected value names none.

    Returns:
        An ``EvalDataset`` with a per-datapoint ``summary`` (name, score, alignment),
        per-comparison ``details`` and the raw ``debug`` info keyed by datapoint name.
    """
    expected_dir = Path(expected_dir)
    trace_dir = Path(trace_dir)

    expected_yamls = _find_yamls(expected_dir)
    trace_files  = [_trace_out_path(expected_yaml,expected_dir,trace_dir) for expected_yaml in expected_yamls]

    logger.info(f"Evaluating {len(expected_yamls)} datapoints, logging to {trace_dir}")
    datapoint_tasks = [evaluate_datapoint(
            Agent=Agent,
            comparisons=comparisons,
            default_comparison=default_comparison,
            expected_yaml=expected_yaml,
            trace_out=trace_file,
            force_run=force_run,
        ) for expected_yaml,trace_file in zip(expected_yamls,trace_files)]

    # All datapoints are evaluated concurrently.
    datapoint_results = await asyncio.gather(*datapoint_tasks)

    summary_data = list()
    deep_dives = list()
    debug_infos = dict()

    for alignment,score,debug_info,trace_out in datapoint_results:
        datapoint_name = trace_out.relative_to(trace_dir).with_suffix("")
        summary = {'datapoint_name':datapoint_name,'score':score,'alignment':alignment}
        summary_data.append(summary)
        debug_infos[datapoint_name] = debug_info
        if alignment:
            deep_dives.append(summarize_datapoint(datapoint_name,alignment,debug_info))
        else:
            # No viable alignment: keep the datapoint in the summary, but there are
            # no matched comparisons to detail, and one failure must not abort the run.
            logger.warning(f"No alignment found for datapoint {datapoint_name}, skipping details")

    summary_df = pd.DataFrame(summary_data)
    if len(deep_dives) > 0:
        deep_dives_df = pd.concat(deep_dives).reset_index(drop=True)
    else:
        deep_dives_df = pd.DataFrame()

    return EvalDataset(
        expected_dir=expected_dir,
        trace_dir=trace_dir,
        summary=summary_df,
        details=deep_dives_df,
        debug=debug_infos
    )

In [None]:
from stringdale.examples.rag import Rag
from stringdale.db import ChromaClient

In [None]:
def load_agent(conf_dir: Path):
    """
    Build a ``Rag`` agent from the configuration files in ``conf_dir``.

    ``agent.yml`` supplies the keyword arguments of ``Rag``. ``vec_db.yml`` maps
    collection names to documents, which are upserted into a new ``ChromaClient``
    that is then passed to the agent as ``db``.
    """
    agent_kwargs = yaml.safe_load((conf_dir / 'agent.yml').read_text())
    collections = yaml.safe_load((conf_dir / 'vec_db.yml').read_text())

    vector_db = ChromaClient()
    for collection_name, docs in collections.items():
        vector_db.add_collection(collection_name, exists_ok=True)
        vector_db.upsert(collection_name, docs)

    agent_kwargs['db'] = vector_db
    return Rag(**agent_kwargs)

In [None]:
expected_dir = get_git_root() / "sample_data" / "eval_datasets" / "expected_traces"
trace_dir = get_git_root() / "sample_data" / "eval_datasets" / "traces"
conf_dir = get_git_root() / "sample_data" / "eval_datasets" / "agent_configs"


comparisons = {
    'eq':eq,
    'eval':safe_eval,
    'cosine_dist':cosine_dist,
    # TODO make a chat_eval where you can put the system prompt as a kwarg
}

default_comparison = cosine_dist

In [None]:
with checkLogs(level='INFO'):
    ds1 = await eval_dataset(
        Agent=load_agent(conf_dir/'v001'),
        expected_dir=expected_dir,
        trace_dir=trace_dir/'v001',
        comparisons=comparisons,
        default_comparison=default_comparison,
        )

    ds2 = await eval_dataset(
        Agent=load_agent(conf_dir/'v002'),
        expected_dir=expected_dir,
        trace_dir=trace_dir/'v002',
        comparisons=comparisons,
        default_comparison=default_comparison)

    ds3 = await eval_dataset(
        Agent=load_agent(conf_dir/'v003'),
        expected_dir=expected_dir,
        trace_dir=trace_dir/'v003',
        comparisons=comparisons,
        default_comparison=default_comparison)



__main__ - INFO - Evaluating 2 datapoints, logging to /Users/dean/dl/stringdale/sample_data/eval_datasets/traces/v001
__main__ - INFO - Trace file pikachus.jsonl already exists, skipping agent run
__main__ - INFO - Trace file huskies.jsonl already exists, skipping agent run
__main__ - INFO - Evaluating 2 datapoints, logging to /Users/dean/dl/stringdale/sample_data/eval_datasets/traces/v002
__main__ - INFO - Trace file pikachus.jsonl already exists, skipping agent run
__main__ - INFO - Trace file huskies.jsonl already exists, skipping agent run
__main__ - INFO - Evaluating 2 datapoints, logging to /Users/dean/dl/stringdale/sample_data/eval_datasets/traces/v003
__main__ - INFO - Trace file pikachus.jsonl already exists, skipping agent run
__main__ - INFO - Trace file huskies.jsonl already exists, skipping agent run


In [None]:
ds1

EvalDataset(expected_dir=/Users/dean/dl/stringdale/sample_data/eval_datasets/expected_traces, 
  trace_dir=/Users/dean/dl/stringdale/sample_data/eval_datasets/traces/v001, 
  summary=Dataframe((2, 3)), 
  details=Dataframe((3, 11)), 
  debug=dict)

In [None]:
ds1.details

Unnamed: 0,datapoint,expected_node_id,trace_idx,accessor,comparison,actual,expected,distance,kwargs,node_name,expected_name
0,pikachus,0,2,content,cosine_dist,"I'm sorry, but I can only provide information ...",They are:\n * dangerous\n * smoke cigarettes,0.943595,{},chat,chat
1,huskies,0,1,.,safe_eval,"[{'id': 'dog1', 'text': 'The Golden Retriever ...",'dog3' in [doc['id'] for doc in {}],0.0,{},get_docs,get_docs
2,huskies,1,2,content,cosine_dist,"The Golden Retriever is a friendly, intelligen...",They are:\n * friendly\n * intelligent,0.655937,{},chat,chat


In [None]:
ds2.details

Unnamed: 0,datapoint,expected_node_id,trace_idx,accessor,comparison,actual,expected,distance,kwargs,node_name,expected_name
0,pikachus,0,2,content,cosine_dist,"Pikachus are not dogs, they are fictional crea...",They are:\n * dangerous\n * smoke cigarettes,0.818815,{},chat,chat
1,huskies,0,1,.,safe_eval,"[{'id': 'dog1', 'text': 'The Golden Retriever ...",'dog3' in [doc['id'] for doc in {}],0.0,{},get_docs,get_docs
2,huskies,1,2,content,cosine_dist,"Golden Retrievers are friendly, intelligent do...",They are:\n * friendly\n * intelligent,0.610838,{},chat,chat


In [None]:
ds3.details

Unnamed: 0,datapoint,expected_node_id,trace_idx,accessor,comparison,actual,expected,distance,kwargs,node_name,expected_name
0,pikachus,0,2,content,cosine_dist,Pikachus are dangerous creatures that smoke to...,They are:\n * dangerous\n * smoke cigarettes,0.443801,{},chat,chat
1,huskies,0,1,.,safe_eval,"[{'id': 'dog1', 'text': 'The Golden Retriever ...",'dog3' in [doc['id'] for doc in {}],0.0,{},get_docs,get_docs
2,huskies,1,2,content,cosine_dist,"The Golden Retriever is a friendly, intelligen...",They are:\n * friendly\n * intelligent,0.651285,{},chat,chat


In [None]:
out = [{'id': 'dog1', 'text': 'The Golden Retriever is a friendly, intelligent breed known for its golden coat. They make excellent family pets and are great with children.', 'metadata': {'breed': 'Golden Retriever'}, 'distance': 0.6440681219100952}, {'id': 'dog3', 'text': 'The Golden Retriever is a friendly, intelligent breed with a beautiful golden coat. They are wonderful family pets that get along well with kids.', 'metadata': {'breed': 'Golden Retriever'}, 'distance': 0.6522300839424133}, {'id': 'dog2', 'text': 'German Shepherds are loyal, protective dogs often used in police work. They are highly trainable and good at various tasks.', 'metadata': {'breed': 'German Shepherd'}, 'distance': 1.287759780883789}, {'id': 'dog4', 'text': 'Huskies are energetic working dogs bred for cold climates. They have thick fur and often blue eyes.', 'metadata': {'breed': 'Husky'}, 'distance': 1.344233751296997}, {'id': 'dog5', 'text': 'Siberian Huskies are active working dogs that thrive in cold weather. They are known for their thick coats and striking blue eyes.', 'metadata': {'breed': 'Husky'}, 'distance': 1.465799331665039}]
expected = """'dog3' in [doc['id'] for doc in {}]  """

safe_eval(out,expected)

0

In [None]:
! ls {conf_dir}

[1m[36mv001[m[m [1m[36mv002[m[m [1m[36mv003[m[m


In [None]:
ds1

EvalDataset(expected_dir=/Users/dean/dl/stringdale/sample_data/eval_datasets/expected, 
  trace_dir=/Users/dean/dl/stringdale/sample_data/eval_datasets/traces/v001, 
  summary=Dataframe((0, 0)), 
  details=Dataframe((0, 0)), 
  debug=dict)

In [None]:
ds1 = eval_dataset(
    Agent=Rag01,
    expected_dir=expected_dir,
    trace_dir=trace_dir/'v001',
    comparisons=comparisons,
    default_comparison=default_comparison)

ds1

In [None]:
# TODO later, add more metrics over traces and expected outs beyond total distance. Such as, total coverage
# TODO see how we can make an "in" modifier, so we can make sure that we retrieved "close" documents from a document list.
# TODO add to "V" the ability to specify the funcname for presenting in the drawing

# TODO add a serialize and deserialize function for the DatasetEval class #rename
# that allows you to write and load it to a directory

## Comparing Runs

In [None]:
# a function that takes 2 datasetRuns and returns a comparison per datapoint on the difference between the two runs
# then have a utility function that prints the summary
# and have a utility function that returns the k datapoints that regressed the most
# have a pprint version of it that actually plots the traces and the difference between them



In [None]:
def describe_changes(ds1,ds2,datapoint,epsilon=1e-3):
    """
    Describe the changes between two datapoints.

    Not implemented yet: the body is only a plan and the function returns None.
    """
    # Plan:
    # 1. take the detailed version of the datasets and limit it to the rows of the given datapoint
    #
    # 2. since these datapoints are not extended or reduced, we expect the same set of expected nodes
    #    and the same set of tuples of the type (content,comparison); assert this in the code
    #
    # 3. make a list of changes with three types of changes:
    #      - expected_nodes that changed trace
    #      - comparisons that saw improvement
    #      - comparisons that saw regression
    #
    # 4. pass on a dataframe of these changes, with a column that is the datapoint
    return None


def compare_datasets(ds1,ds2,epsilon=1e-3,metrics=None):
    """
    Compare two datasets.

    Not implemented yet: the body is only a plan and the function returns None.
    """
    # TODO epsilons are either a fixed value, or a value per metric
    # TODO decide the default used when metrics is None
    #
    # Assumptions:
    #   - all datasets have the same set of datapoints (assert it)
    #   - datapoints with the same name have identical expected outputs (not asserted)
    #
    # Classification of each datapoint:
    #   - improved:  none of the above, at least one metric improved by more than epsilon and no metric worsened
    #   - regressed: none of the above, at least one metric worsened by more than epsilon and no metric improved
    #   - changed:   none of the above, some metrics improved and some worsened by more than epsilon
    #   - unchanged: none of the above, no metric improved or worsened by more than epsilon
    #
    # For each case in improved, regressed and changed we want a detailed summary of the changes:
    #   - compute a detailed summary of the changes for each datapoint
    #   - return one big dataframe of comparison-level conflicts by concatenating the datapoint-level
    #     summaries and marking them with a column holding the tag (improved,regressed,changed)
    return None





In [None]:
# TODO add num expected and numtraces to summary
# TODO and from that derive coverage

In [None]:
ds1.details

Unnamed: 0,datapoint,expected_node_id,trace_idx,accessor,comparison,actual,expected,distance,kwargs,node_name,expected_name
0,pikachus,0,2,content,cosine_dist,"I'm sorry, but I can only provide information ...",They are:\n * dangerous\n * smoke cigarettes,0.943595,{},chat,chat
1,huskies,0,1,.,safe_eval,"[{'id': 'dog1', 'text': 'The Golden Retriever ...",'dog3' in [doc['id'] for doc in {}],0.0,{},get_docs,get_docs
2,huskies,1,2,content,cosine_dist,"The Golden Retriever is a friendly, intelligen...",They are:\n * friendly\n * intelligent,0.655937,{},chat,chat


In [None]:
ds2.details

Unnamed: 0,datapoint,expected_node_id,trace_idx,accessor,comparison,actual,expected,distance,kwargs,node_name,expected_name
0,pikachus,0,2,content,cosine_dist,"Pikachus are not dogs, they are fictional crea...",They are:\n * dangerous\n * smoke cigarettes,0.818815,{},chat,chat
1,huskies,0,1,.,safe_eval,"[{'id': 'dog1', 'text': 'The Golden Retriever ...",'dog3' in [doc['id'] for doc in {}],0.0,{},get_docs,get_docs
2,huskies,1,2,content,cosine_dist,"Golden Retrievers are friendly, intelligent do...",They are:\n * friendly\n * intelligent,0.610838,{},chat,chat


In [None]:
# TODO when pprinting comparisons, give urls to files, so that its easy in nb to navigate to them

## Eval main entrypoint

In [None]:
def eval():
    """
    Planned main entrypoint of the evaluation (not implemented yet, returns None).

    The intended inputs, none of which are parameters yet:
     - a dir with expected traces
     - a dir to write traces to
     - a dict of agents and their names/codes (ie v001, v002 etc)
     - a cache_dir (if not given, a temp dir that we abandon later)
     - a force flag (means invalidate the cache and re run the agents)
     - a baseline version (used to compare to), assumed to be the first in the list of agents unless otherwise specified
     - summary_file=None: if None pprint the summary to the console, else save the summary to the file
     - k=None: how many datapoints of each type to print to the summary at most, by default all
     - silent=False: if True, dont pprint comparisons
     - force_run=False: if True, delete the cache and re run the agents
    """
    # NOTE: this name shadows the builtin `eval` wherever it is defined or imported.

    # TODO we make eval dataset able to load from a cachedir and run only those where the expected is not the same
    #   we check if its the same up to yaml whitespace, by comparing the yaml string after loading and serializing

    # we eval all datasets, expected over all agents concurrently

    # then we run comparisons between the baseline dataset and the other datasets

    # then, we pprint the summary of each comparison separately

    # then we group the per datapoint comps across all dataset comparisons by the datapoint id
    # and for each datapoint we pprint a combined datapoint comparison.
    # combined datapoints for each datapoint, the total metrics of each version
    # and then for each comparison that is different from baseline, say how it is different for every version.

    # we return an EvalResult object that tracks the input of the EvalData, but also has the DataSet and DataSetComp objects for each dataset and comparison

    pass


In [None]:
# TODO add asyncio semaphores to Chat and DB operations etc, so that we don't get rate limited due to tons of async requests

""" # Optional semaphore
from contextlib import asynccontextmanager
from typing import Optional

@asynccontextmanager
async def optional_semaphore(semaphore: Optional[asyncio.Semaphore] = None):
    if semaphore is not None:
        async with semaphore:
            yield
    else:
        yield

# Usage example:
async def my_function(limit_concurrency: bool = False):
    sem = asyncio.Semaphore(2) if limit_concurrency else None
    
    async with optional_semaphore(sem):
        # Your async code here
        await asyncio.sleep(1)
        print("Function executed")


"""

## training and test set

In [None]:
# how do we do training and validation on workflows?

# we have 2 expected datasets, train and test

# we look at the total distance of the validation set to see that we are improving on it

# but we only look at the comparisons and fix our configs or diagrams based on the training set

In [None]:
## TODO in future, specify 2 datasets (by dirs and regex)
# one is the train one is the test
# we take an agent (or 2 for comparison)
# and we do the same logic for evaluating and comparing, however, we print the statistics only for the test set
# but we show the regression for the train set.

## Design

In [None]:
"""

Data model

We have a dataset
* containing tests
* each test has the input to the agent
* and the expected output
* test is any object that can be serialized to json
* expected output is a partial trace spec

* partial trace spec is a list of steps
* each step has a name is a dict with accessors and value are how to check them
* names are the node name we expect to see in the trace
* the dict defines what we expect the value to look like


When we run a dataset, we take the input, run the agent, and check the output against the partial trace spec
since the partial trace spec does not 


"""

'\n\nData model\n\nWe have a dataset\n* containing tests\n* each test has the input to the agent\n* and the expected output\n* test is any object that can be serialized to json\n* expected output is a partial trace spec\n\n* partial trace spec is a list of steps\n* each step has a name is a dict with accessors and value are how to check them\n* names are the node name we expect to see in the trace\n* the dict defines what we expect the value to look like\n\n\nWhen we run a dataset, we take the input, run the agent, and check the output against the partial trace spec\nsince the partial trace spec does not \n\n\n'

In [None]:
class ExpectedTrace:
    """Design placeholder with no behavior yet."""
    # NOTE: running this cell rebinds the name ExpectedTrace imported from stringdale.podtw.

class DataPointRun:
    """Design placeholder: basically a list of traces, agent input and agent output."""




In [None]:
def collect_traces_from_file(file_path):
    """Design placeholder: not implemented, returns None."""
    return None

def collect_traces_from_logg_aggregator(logger):
    """Design placeholder: not implemented, returns None."""
    return None

def run_dataset(agent,dataset,output_dir):
    """
    Design placeholder: not implemented, returns None.

    Planned steps:
      - for each data point in the dataset, run the agent
      - collect the traces into a file
      - return the file path
    """
    return None

def write_comparison_to_file(dataset_run,expected_traces,output_dir):
    """
    Design placeholder: not implemented, returns None.

    Planned: run the comparison and write the results to a file.
    """
    return None


In [None]:
def runs_summary(runs,dir):
    """
    Design placeholder: not implemented, returns None.

    Planned steps:
      - get the run files and the comparison files
      - get the total metrics per expected node and in total
      - make them into a dataframe
    """
    return None

def plot_runs(runs,dir):
    """
    Design placeholder: not implemented, returns None.

    Planned: call runs_summary and plot the results.
    """
    return None

def check_regressions(runs,dir):
    """
    Design placeholder: not implemented, returns None.

    Planned steps:
      - get two runs
      - for each input, flag it if the second run is worse than the first
      - make a dataframe of the regressions on a whole-run basis
      - also make a dataframe of the regressions on a per-node basis for the runs that regressed
    """
    return None


In [None]:
class DataSet:
    """Design placeholder with no behavior yet."""




## Experiment runs

In [None]:
# TODO start with directories of files with traces.
# here we just run the agent on the input and collect the traces to files
# Later, add a way to customize the runs from a logger or something 
# I think the best way would be to be able to turn the logs into a dataset file and work on it locally.

## Experiment scoring

In [None]:
# here we use the DPTW to match each trace to an expected trace
# then we have multiple scores
    # total distance, 
    # total distance per expected trace, 
    # coverage (percent of nodes expected), 
    # time coverage (percent of time of nodes expected), used to ignore nodes with no logic


# this experiment object can be dumped into a directory


## Regression detection

In [None]:

## Regression detection
# here we just compare the runs to each other


In [None]:
# TODO 
# in the end, we want 3 entrypoints:

# eval, eval_single, and align_trace

# eval will get lists of versions, and which comparisons to do, and log dir to save results to etc..

## export

In [None]:
# |hide
import nbdev; nbdev.nbdev_export()