# Eval

In [None]:
#| default_exp eval

In [None]:
# | hide
%load_ext autoreload
%autoreload 2
# %load_ext rich

In [None]:
#| export
import os
import json
import yaml
from stringdale import (
    Define,
    Scope,
    V,
    E,
    Condition,
    draw_nx
)
from stringdale.stream_warping import (
    TestCase,
    parse_test_case,
    TraceLog,
    event_stream_warp,
    word_overlap,
    regex,
)


from pathlib import Path
from frozendict import frozendict
from stringdale.core import  checkLogs,await_all
import pytest
import asyncio
from pydantic import BaseModel, ConfigDict

from typing import List, Union
import jsonlines
import logging


In [None]:
#| export
logger = logging.getLogger(__name__)

## Using podtw

In [None]:
# TODO multiple inputs
# TODO ANY comparison so we can check for the existence of nodes

In [None]:
#| export

def parse_trace_log(trace_path:Union[str,Path]) -> TraceLog:
    """
    Load a JSON-lines trace file and wrap its records in a `TraceLog`.

    Args:
        trace_path: Location of the `.jsonl` file to read; each line is one trace step.

    Returns:
        A `TraceLog` whose `steps` are the decoded lines, in file order.
    """
    with jsonlines.open(trace_path) as reader:
        steps = list(reader)
    return TraceLog(steps=steps)

In [None]:
from stringdale.core import get_git_root

In [None]:
sample_data_dir = get_git_root() / "sample_data" / "eval"

example_trace_log_path = sample_data_dir / "traces0.jsonl"
example_case_path = sample_data_dir / "expected0.yaml"


example_comparisons = {
    "word_overlap":word_overlap,
    "regex":regex,
}

In [None]:
example_test_case = parse_test_case(example_case_path)
example_trace_log  = parse_trace_log(example_trace_log_path)

In [None]:
best_match,score,debug_info =await event_stream_warp(example_trace_log,example_test_case,comparisons=example_comparisons,default_comparison="word_overlap")
best_match

frozendict.frozendict({'3': 2, 'node_a1': 1, 'node_z': 4, '1': 5})

In [None]:
# debug_info

## Realistic Comparison Functions

In [None]:
#| export
import numpy as np
import asyncio
from stringdale.db import openai_embed
from stringdale.chat import Chat

In [None]:
#| export
async def cosine_dist(out: str, expected: str, model: str = 'text-embedding-3-small') -> float:
    """Compute the cosine distance between two strings using OpenAI embeddings.

    The distance is ``1 - cosine_similarity`` of the two embedding vectors, so
    identical texts score (close to) 0 and larger values mean less similar texts.

    Args:
        out: The produced value to compare. A non-string value is treated as
            an impossible match rather than an error.
        expected: The reference string to compare against.
        model: OpenAI embedding model to use (default: 'text-embedding-3-small')

    Returns:
        float: Cosine distance between the two strings (nominally between 0 and 2),
            or ``np.inf`` when `out` is not a string.

    Raises:
        ValueError: If `expected` is not a string.
    """
    # A non-string output can never match: report an infinite distance.
    if not isinstance(out,str):
        return np.inf
    # A non-string expectation is a mistake in the test case itself, so fail loudly.
    if not isinstance(expected,str):
        raise ValueError(f"cosine_dist: expected is not a string: {expected}")

    # Get embeddings for both strings
    out_embedding = await openai_embed(out, model=model)
    expected_embedding = await openai_embed(expected, model=model)

    # Cosine similarity = dot product divided by the product of the vector norms
    similarity = np.dot(out_embedding, expected_embedding) / (
        np.linalg.norm(out_embedding) * np.linalg.norm(expected_embedding)
    )

    # Turn the similarity into a distance
    return 1 - similarity



In [None]:
basic_dist = await cosine_dist("hello","hello")
basic_dist
assert basic_dist < 0.0001


In [None]:
await cosine_dist("hello","hello stranger")

np.float64(0.3944818489490096)

In [None]:
#|export
from stringdale.core import jinja_undeclared_vars
from typing import Any

In [None]:
#| export
class ChatEvalScore(BaseModel):
    """Structured-output schema holding the single numeric score returned by the chat-based comparison."""
    # the numeric score produced by the model
    score:float


async def chat_eval(out:Any,expected:Any,model:str="gpt-4o-mini",system_prompt:str=None)->float:
    """
    Ask a chat model to score how far `out` is from `expected`.

    Like the other comparison functions in this module, the result is used as a
    distance: 0 means a match and larger values mean a worse match.

    Args:
        out: The produced value; rendered into the prompt as `{{out}}`.
        expected: The reference value; rendered into the prompt as `{{expected}}`.
        model: Name of the chat model to query.
        system_prompt: Optional jinja template replacing the default instructions.
            Its undeclared variables must be exactly `out` and `expected`.

    Returns:
        The `score` field of the model's structured `ChatEvalScore` answer.

    Raises:
        ValueError: If the prompt's jinja variables are not exactly `out` and `expected`.
    """

    if system_prompt is None:
        # The default prompt asks for a distance (0 = equivalent) so that its
        # score is oriented the same way as `eq`, `cosine_dist` and `safe_eval`.
        system_prompt = """
            You are a helpful assistant that evaluates how different two strings are.
            You will be given two strings, and you will need to evaluate the distance between the two strings.
            You will need to return a score between 0 and 1, where 0 means the strings are equivalent and 1 means they are completely different.

            string1: {{out}}
            string2: {{expected}}

            return a score between 0 and 1, where 0 means the strings are equivalent and 1 means they are completely different.
            """

    if jinja_undeclared_vars(system_prompt) != {'out','expected'}:
        raise ValueError("System prompt must contain {{out}} and {{expected}} jinja variables")

    chat = Chat(
        model=model,
        messages=[{"role":"system","content":system_prompt}],
        output_schema=ChatEvalScore,
        out=out,
        expected=expected,
    )
    response = await chat()
    return response['content'].score

In [None]:
custom_prompt = """
if one of the strings contains "hello", return 0.5

string1: {{out}}
string2: {{expected}}
"""


result = await chat_eval("hello","world",system_prompt=custom_prompt)
assert result == 0.5

In [None]:
custom_prompt = """
if one of the strings contains "hello", return 0.5

string1: {{outs}}
string2: {{expected}}
"""

with pytest.raises(ValueError,match="System prompt must contain {{out}} and {{expected}} jinja variables"):
    result = await chat_eval("hello","world",system_prompt=custom_prompt)


In [None]:
#| export
def eq(a,b):
    """Exact-match comparison: distance 0 when `a == b`, infinite distance otherwise."""
    return 0 if a == b else np.inf

def any(a,b):
    """Wildcard comparison: always returns distance 0, whatever `a` and `b` are."""
    # NOTE: this definition shadows the builtin `any` for the rest of this module.
    return 0

In [None]:
out = 3500
expr = "({0} < 4000) & ({0} > 3000)"

In [None]:
f_expr = expr.format(out)
f_expr

'(3500 < 4000) & (3500 > 3000)'

In [None]:
#| export
from stringdale.tools import run_python_code


In [None]:
#| export
def safe_eval(out,expression):
    """
    Evaluate a python expression about `out` and turn the result into a distance.

    `expression` is a `str.format` template in which `{0}` stands for `out`,
    e.g. `"({0} < 4000) & ({0} > 3000)"`. The formatted code is executed with
    `run_python_code`.

    Args:
        out: The value substituted into the expression.
        expression: The format template holding the code to evaluate.

    Returns:
        0 if the expression evaluates to True, `np.inf` if it evaluates to False,
        the value itself if it is a float, and `np.inf` for any other result or
        if formatting/evaluation fails.
    """
    try:
        formatted_expression = expression.format(out)
    except Exception as e:
        # Best effort: a template that cannot be formatted simply never matches.
        logger.warning(f"Error formatting expression: {expression} with value {out}, error: {e}")
        return np.inf

    value = run_python_code(formatted_expression)

    # A string result starting with "Error" is treated as a failed evaluation.
    if isinstance(value,str) and value.startswith("Error"):
        # `value` already carries the error text; the exception variable of the
        # `try` above does not exist in this scope and must not be referenced.
        logger.warning(
            f"Error evaluating expression: {formatted_expression} = {value}\n"
            f"out: {out}\n"
            f"expression: {expression}"
        )
        return np.inf

    logger.debug(f"safe_eval: {formatted_expression} = {value}")
    # bool is checked first: True means a perfect match, False means no match.
    if isinstance(value,bool):
        return 0 if value else np.inf
    elif isinstance(value,float):
        return value
    else:
        logger.debug(
            f"When evaluating {expression} with value {out}\n"
            f"Expected float or bool, got {type(value)} with value {repr(value)}"
            )
        return np.inf

In [None]:
eval_string ="""
x=4000
({0} < x) & ({0} > 3000)
"""

with checkLogs():
    y =safe_eval(3500,eval_string)
y

__main__ - DEBUG - safe_eval: 
x=4000
(3500 < x) & (3500 > 3000)
 = True


0

In [None]:
safe_eval(3500,"""
x=4000
({0} < x) & ({0} > 3000)
""")

0

## Running and evaluating a single data point

In [None]:
#| export
from typing import List,Dict,Callable

In [None]:
#| export
class DataPoint(BaseModel):
    """Bundles a `TraceLog` with the `TestCase` it belongs to."""
    # the trace log of the datapoint
    traces:TraceLog
    # the test case the traces are paired with
    expected:TestCase
    

In [None]:
#| export

async def _run_agent(Agent,test_case:TestCase,trace_log_path:Path):
    """
    Instantiate `Agent`, feed it the inputs of `test_case` and record its traces.

    Each trace that has a `node_func` is written to `trace_log_path` as one JSON
    line holding its `name`, `output` and `duration`. Inputs are consumed in
    order until the agent reports that it is finished.
    """
    diagram = Agent()
    recorded_fields = {'name','output','duration'}
    with jsonlines.open(trace_log_path,'w') as writer:
        for agent_input in test_case.inputs:
            async for trace in diagram.arun(agent_input):
                # traces without a node function are not recorded
                if trace.node_func is not None:
                    serialized = trace.model_dump_json(include=recorded_fields)
                    writer.write(json.loads(serialized))
            if diagram.finished:
                break

async def evaluate_datapoint(Agent,comparisons,default_comparison,test_case_path,trace_log_path=None,force_run=False):
    """
    Run (if needed) and score a single test case.

    Args:
        Agent: Callable producing the agent to run; only used when a run is needed.
        comparisons: Mapping of comparison name to comparison function.
        default_comparison: Name of the comparison used by default.
        test_case_path: Path (or string path) of the test case yaml file.
        trace_log_path: Where the agent's traces are (to be) stored. Defaults to a
            sibling of the test case with ".yaml" replaced by ".jsonl" and
            "expected" replaced by "actual".
        force_run: Re-run the agent even if the trace file already exists.

    Returns:
        A tuple `(aligned_trace, score, debug_info, trace_log_path)`: the first
        three items come from `event_stream_warp`, the last is the trace file used.

    Raises:
        ValueError: If the test case cannot be parsed, or if the derived trace
            path would be the test case file itself.
    """
    # Accept plain strings as well as Path objects.
    test_case_path = Path(test_case_path)
    if trace_log_path is None:
        trace_log_name = test_case_path.name.replace(".yaml", ".jsonl").replace("expected", "actual")
        trace_log_path = test_case_path.parent/trace_log_name
        # If neither replacement applied, writing the traces would overwrite the test case.
        if trace_log_path == test_case_path:
            raise ValueError(
                f"Cannot derive a trace log path from {test_case_path}: "
                f"pass trace_log_path explicitly"
            )
    else:
        trace_log_path = Path(trace_log_path)

    trace_log_path.parent.mkdir(parents=True,exist_ok=True)

    try:
        test_case = parse_test_case(test_case_path)
    except Exception as e:
        raise ValueError(f"Error parsing test case {test_case_path}: {e}") from e

    if force_run or not trace_log_path.exists():
        if not trace_log_path.exists():
            logger.info(f"Trace file {trace_log_path.name} does not exist, running agent")
        else:
            logger.info(f"Force running {trace_log_path.name}")
        await _run_agent(Agent,test_case,trace_log_path)
    else:
        logger.info(f"Trace file {trace_log_path.name} already exists, skipping agent run")

    parsed_trace = parse_trace_log(trace_log_path)
    aligned_trace,score,debug_info = await event_stream_warp(parsed_trace,test_case,comparisons,default_comparison)

    return aligned_trace,score,debug_info,trace_log_path


In [None]:
from stringdale.examples.react import ReactAgent

In [None]:
agent = ReactAgent
expected_yaml = sample_data_dir/"react_expected.yaml"
bad_expected_yaml = sample_data_dir/"react_bad_expected.yaml"
comparisons = {
    "eq":eq,
    "eval":safe_eval,
    "cosine_dist":cosine_dist,
}
default_comparison = 'cosine_dist'


In [None]:
#| export

with checkLogs():
    alignment,score,debug_info,trace_out = await evaluate_datapoint(agent,comparisons,default_comparison,bad_expected_yaml)

assert alignment is None
alignment,score,trace_out


__main__ - INFO - Trace file react_bad_actual.jsonl already exists, skipping agent run
No viable trace row nums for expected trace 1
No possible mappings found


(None,
 inf,
 PosixPath('/Users/dean/dl/stringdale/sample_data/eval/react_bad_actual.jsonl'))

In [None]:
with checkLogs(level='INFO'):
    alignment,score,debug_info,trace_out = await evaluate_datapoint(agent,comparisons,default_comparison,expected_yaml)

assert dict(alignment) == {'0': 2, '1': 8}
alignment,score,trace_out

__main__ - INFO - Trace file react_actual.jsonl already exists, skipping agent run


(frozendict.frozendict({'0': 2, '1': 8}),
 np.float64(0.3244313858854829),
 PosixPath('/Users/dean/dl/stringdale/sample_data/eval/react_actual.jsonl'))

In [None]:
#| export
import pandas as pd

In [None]:
#| export

def _pd_order_columns_first(df:pd.DataFrame,first_columns:list[str]):
    """
    Reorder the columns of a pandas dataframe to put the first_columns first.
    """
    return df[first_columns + [c for c in df.columns if c not in first_columns]]



In [None]:
x = pd.DataFrame([
    {'distance':1,'comparison':'eq','actual':1,'expected':1},
    {'distance':2,'comparison':'eq','actual':2,'expected':2},
    {'distance':3,'comparison':'eq','actual':3,'expected':3},
])

_pd_order_columns_first(x,['actual','expected'])

Unnamed: 0,actual,expected,distance,comparison
0,1,1,1,eq
1,2,2,2,eq
2,3,3,3,eq


In [None]:
# debug_info

In [None]:
#| export
from copy import deepcopy
from itertools import count


In [None]:
#| export

def summarize_datapoint(name,alignment,debug_info):
    """
    Build a per-comparison table for one datapoint from its alignment and debug info.

    Args:
        name: Datapoint name, stored (as a string) in the `datapoint` column.
        alignment: Mapping of expected node id to the trace index it was aligned
            to, or None when no alignment was found.
        debug_info: Nested mapping `debug_info[expected_node_id][trace_idx]` giving
            the match data, including its list of `comparisons`.

    Returns:
        A DataFrame with one row per comparison of every aligned node. When there
        is no alignment (None or empty) or no comparison at all, an empty
        DataFrame with the leading columns is returned.
    """
    leading_columns = ['datapoint','node_label','trace_idx','comparison','key','actual','expected','distance']

    def empty_summary():
        # keep `distance` numeric so aggregations such as mean() stay well defined
        frame = pd.DataFrame(columns=leading_columns)
        frame['distance'] = frame['distance'].astype(float)
        return frame

    # A failed alignment is reported as None: there is nothing to summarize.
    if not alignment:
        return empty_summary()

    deep_dive_fit = []

    comp_counter = count()
    for expected_node_id,trace_idx in alignment.items():
        match_data = debug_info[expected_node_id][trace_idx]
        for comp in match_data['comparisons']:
            # one flat record per comparison: node-level data overlaid with the comparison's own data
            summary = deepcopy(match_data) | deepcopy(comp)
            summary['comp_id'] = next(comp_counter)
            summary.pop('comparisons')
            summary['aggregation'] = comp['aggregation']
            deep_dive_fit.append(summary)

    if not deep_dive_fit:
        return empty_summary()

    df = pd.DataFrame(deep_dive_fit)
    df['datapoint'] = str(name)
    df = _pd_order_columns_first(df,leading_columns)
    return df

In [None]:
df = summarize_datapoint('react',alignment,debug_info)
df

Unnamed: 0,datapoint,node_label,trace_idx,comparison,key,actual,expected,distance,node_idx,trace_name,node_name,kwargs,aggregation,comp_id
0,react,0,2,eq,content.name,wikipedia_search,wikipedia_search,0.0,0,use_tool,use_tool,{},,0
1,react,0,2,cosine_dist,content.input.q,Barack Obama,Obama,0.324431,0,use_tool,use_tool,{},,1
2,react,1,8,eq,content.name,run_python_code,run_python_code,0.0,1,use_tool,use_tool,{},,2
3,react,1,8,eval,content.output,3844,({0} < 4000) & ({0} > 3000),0.0,1,use_tool,use_tool,{},,3


In [None]:
assert df['node_label'].to_list() == ['0','0','1','1']
assert df['key'].to_list() == ['content.name','content.input.q','content.name','content.output']

## DataSets

In [None]:
#| export
def filter_and_concat(df1: pd.DataFrame, df2: pd.DataFrame, keys: list) -> pd.DataFrame:
    """
    Merge `df2` into `df1`, letting `df2` win for every key combination it contains.

    Args:
        df1 (pd.DataFrame): Existing rows; those whose `keys` values appear in `df2` are dropped
        df2 (pd.DataFrame): New rows, appended after the surviving rows of `df1`
        keys (list): Column names that together identify a row

    Returns:
        pd.DataFrame: The surviving rows of `df1` followed by `df2`, with a fresh index.
            If either frame is empty, the other one is returned as is.
    """
    # Nothing to merge when one side is empty: hand back the other frame untouched.
    if df1.empty:
        return df2
    if df2.empty:
        return df1

    def key_tuples(frame):
        # one tuple of key values per row
        return frame[keys].apply(tuple, axis=1)

    # rows of df1 that are superseded by a row of df2
    superseded = key_tuples(df1).isin(key_tuples(df2))
    surviving_rows = df1[~superseded]

    return pd.concat([surviving_rows, df2], ignore_index=True)

In [None]:
# Example DataFrames
df1 = pd.DataFrame({
    'A': [1, 2, 3, 4],
    'B': ['a', 'b', 'c', 'd'],
    'C': [10, 20, 30, 40]
})

df2 = pd.DataFrame({
    'A': [3, 4],
    'B': ['c', 'd'],
    'C': [50, 60]
})

# List of keys to match on
keys = ['A', 'B']

# Apply the function
result = filter_and_concat(df1, df2, keys)

assert result.equals(pd.DataFrame({
    'A': [1, 2, 3, 4],
    'B': ['a', 'b', 'c', 'd'],
    'C': [10, 20, 50, 60]
}))
print(result)

   A  B   C
0  1  a  10
1  2  b  20
2  3  c  50
3  4  d  60


In [None]:
#| export
from stringdale import DiagramSchema
from pprint import pprint, pformat
from fastcore.basics import patch
from typing import Optional
from pydantic import BaseModel, ConfigDict, PrivateAttr


In [None]:
#| export
class TestSetRun(BaseModel):
    """
    The stored results of evaluating a directory of test cases.

    Test cases are the `*.yaml` files found (recursively) under `test_dir`; a
    datapoint is named by its path relative to `test_dir`, without the suffix.
    Results are kept under `dir` as `summary.csv`, `details.csv`, `debug.json`
    and one trace file per datapoint at `logs/<datapoint>.jsonl`.
    """
    model_config = ConfigDict(arbitrary_types_allowed=True)
    # Private attributes
    _summary_dict: dict = PrivateAttr(default_factory=dict)
    _details_dict: dict = PrivateAttr(default_factory=dict)

    # Regular fields
    # directory holding the test case yaml files
    test_dir: Path
    # directory holding the results of this run
    dir: Path
    # one row per datapoint
    summary: pd.DataFrame
    # one row per comparison of every datapoint
    details: pd.DataFrame
    # debug info, keyed by datapoint
    debug: dict

    def find_cases(self):
        """Return the names of all datapoints (yaml files) found under `test_dir`."""
        yaml_paths =  list(self.test_dir.glob("**/*.yaml"))
        return [str(p.relative_to(self.test_dir).with_suffix("")) for p in yaml_paths]

    def trace_log_path(self,datapoint:str):
        """Return the path of the trace file of `datapoint`."""
        return self.dir/'logs'/f'{datapoint}.jsonl'

    def trace_log_len(self,datapoint:str):
        """Return the number of lines in the trace file of `datapoint`."""
        log_path = self.trace_log_path(datapoint)
        return len(log_path.read_text().splitlines())

    def testcase_path(self,datapoint:str):
        """Return the path of the yaml file of `datapoint`."""
        return self.test_dir/f'{datapoint}.yaml'

    def serialize_test_case(self,datapoint:str):
        """Return the test case of `datapoint` as an indented JSON string (used to detect edits)."""
        yml_version = yaml.safe_load(self.testcase_path(datapoint).read_text())
        json_version = json.dumps(yml_version,indent=2)
        return json_version

    def datapoint_len(self,datapoint:str):
        """Return the number of recorded trace lines of `datapoint`."""
        # NOTE(review): this used to call an undefined module-level `trace_log_len`
        # with a yaml path and could only raise NameError. Delegating to the method
        # of the same name is the presumed intent - confirm.
        return self.trace_log_len(datapoint)

    def __repr__(self):
        return (
            f"TestSetRun(\n"
            f"  test_dir={self.test_dir}, \n"
            f"  dir={self.dir}, \n"
            f"  summary=Dataframe({self.summary.shape}), \n"
            f"  details=Dataframe({self.details.shape}), \n"
            f"  debug=dict)")

    def __str__(self):
        return self.__repr__()

    def save(self,dir:Path):
        """
        Write the summary, details, debug info and test case location into `dir`.

        The directory is created if needed. The test case location is stored
        relative to `dir`, which is how `load` resolves it.
        """
        dir = Path(dir)
        dir.mkdir(parents=True,exist_ok=True)
        self.summary.to_csv(dir/"summary.csv",index=False)
        self.details.to_csv(dir/"details.csv",index=False)
        test_dir_rel = Path(os.path.relpath(self.test_dir,dir))
        (dir/'test_cases_loc.txt').write_text(str(test_dir_rel))
        with open(dir/"debug.json","w") as f:
            json.dump(self.debug,f)

    @classmethod
    def load(cls, dir: Path,test_dir:Optional[Path]=None):
        """
        Load a run from `dir`; missing or unreadable files yield empty defaults.

        Args:
            dir: Directory previously written by `save` (it may not exist yet).
            test_dir: Directory of the test cases. When omitted, the location
                recorded in `test_cases_loc.txt` is used, falling back to `dir`.
        """
        dir = Path(dir)

        def read_or_default(file_name, reader, default):
            # Every file is loaded independently, so one broken file does not
            # prevent the others from being read.
            path = dir/file_name
            if not path.exists():
                return default
            try:
                return reader(path)
            except Exception as e:
                # Best effort: report the problem but continue with the default value
                logger.warning(f"Error loading {path}: {str(e)}")
                return default

        summary = read_or_default("summary.csv", lambda p: pd.read_csv(p,index_col=False), pd.DataFrame())
        details = read_or_default("details.csv", lambda p: pd.read_csv(p,index_col=False), pd.DataFrame())
        debug = read_or_default("debug.json", lambda p: json.loads(p.read_text()), {})
        stored_test_dir = read_or_default('test_cases_loc.txt', lambda p: dir/p.read_text().strip(), None)

        # an explicit argument wins over the recorded location
        if test_dir is None:
            test_dir = stored_test_dir if stored_test_dir is not None else dir

        return cls(
            test_dir=test_dir,
            dir=dir,
            summary=summary,
            details=details,
            debug=debug
        )

    def is_datapoint_stale(self,datapoint_path):
        """
        Tell whether `datapoint_path` must be (re-)evaluated.

        A datapoint is stale when it has no usable row in the summary, or when
        its test case differs from the serialized version stored in the summary.
        """
        if self.summary.empty:
            return True
        # a summary without the bookkeeping columns cannot vouch for any datapoint
        if not {'datapoint','serialized_test_case'}.issubset(self.summary.columns):
            return True

        recorded = self.summary.loc[self.summary['datapoint'] == datapoint_path]
        # if the datapoint is not in the summary, it is stale
        if recorded.empty:
            return True

        summarized_test_case = recorded['serialized_test_case'].iloc[0]
        current_test_case = self.serialize_test_case(datapoint_path)
        return summarized_test_case != current_test_case
        


In [None]:
eval_ds_dir = get_git_root() / "sample_data" / "eval_datasets"
test_dir = eval_ds_dir / "test_cases"
out_dir = eval_ds_dir / "out_rag"


In [None]:
run = TestSetRun.load(out_dir,test_dir=test_dir)
cases = run.find_cases()
# glob results come in arbitrary order, so compare them order-insensitively
assert sorted(cases) == ['goldens','pikachus'],cases

In [None]:
run.trace_log_path('pikachus')

PosixPath('/Users/dean/dl/stringdale/sample_data/eval_datasets/out_rag/logs/pikachus.jsonl')

In [None]:
run.summary

In [None]:
run.details

In [None]:
#| export

async def eval_dataset(Agent:DiagramSchema,test_dir,out_dir,comparisons,default_comparison,force_run=False):
    """
    Evaluate an agent on every test case of `test_dir`, reusing stored results where possible.

    Only stale datapoints (new or edited test cases) are evaluated, unless
    `force_run` is set. The merged results are saved to `out_dir`.

    Args:
        Agent: The agent to evaluate.
        test_dir: Directory holding the test case yaml files.
        out_dir: Directory where results and trace logs are stored.
        comparisons: Mapping of comparison name to comparison function.
        default_comparison: Name of the comparison used by default.
        force_run: Re-evaluate all datapoints, stale or not.

    Returns:
        The updated `TestSetRun`.
    """
    # Accept plain strings as well as Path objects.
    test_dir = Path(test_dir)
    out_dir = Path(out_dir)

    run = TestSetRun.load(out_dir,test_dir=test_dir)
    datapoints = run.find_cases()

    if not force_run:
        stale_datapoints = [p for p in datapoints if run.is_datapoint_stale(p)]
    else:
        stale_datapoints = datapoints

    if len(stale_datapoints) > 0:
        logger.info(f"{run.dir.name}: Evaluating {len(stale_datapoints)}/{len(datapoints)} datapoints")
    else:
        logger.info(f"{run.dir.name}: No stale datapoints, skipping evaluation")
        return run

    datapoint_results = await await_all(
        [
            evaluate_datapoint(
                Agent=Agent,
                comparisons=comparisons,
                default_comparison=default_comparison,
                test_case_path=run.testcase_path(datapoint),
                trace_log_path=run.trace_log_path(datapoint),
                force_run=True, # since we computed which datapoints to run, we can force run them
            ) for datapoint in stale_datapoints
        ],
        error_prefix=[
            f"When evaluating datapoint {datapoint}"
            for datapoint in stale_datapoints
        ]
    )

    summary_data = list()
    deep_dives = list()
    debug_infos = dict()

    for (alignment,score,debug_info,trace_out),datapoint in zip(datapoint_results,stale_datapoints):
        debug_infos[datapoint] = debug_info

        if alignment:
            deep_dive = summarize_datapoint(datapoint,alignment,debug_info)
            deep_dive['datapoint'] = datapoint
            deep_dives.append(deep_dive)
            avg_distance = deep_dive.distance.mean()
            aligned_nodes = len(alignment)
        else:
            # No viable alignment (reported as None): nothing was matched.
            logger.warning(f"{run.dir.name}: no alignment found for datapoint {datapoint}")
            avg_distance = float('nan')
            aligned_nodes = 0

        # coverage = aligned expected nodes / recorded trace lines (0 for an empty trace log)
        trace_len = run.trace_log_len(datapoint)
        coverage = aligned_nodes / trace_len if trace_len else 0.0

        summary_data.append({
            'datapoint':str(datapoint),
            'distance':score,
            'avg_distance':avg_distance,
            'coverage':coverage,
            'alignment':alignment,
            'serialized_test_case':run.serialize_test_case(datapoint)
            })

    new_summary = pd.DataFrame.from_records(summary_data).reset_index(drop=True)
    run.summary = filter_and_concat(run.summary,new_summary,['datapoint'])

    # Drop the old detail rows of every re-evaluated datapoint, including those
    # that no longer align and therefore contribute no new rows.
    if not run.details.empty and 'datapoint' in run.details.columns:
        outdated = run.details['datapoint'].isin(stale_datapoints)
        run.details = run.details[~outdated].reset_index(drop=True)
    if deep_dives:
        details_data = pd.concat(deep_dives,ignore_index=True)
        run.details = filter_and_concat(run.details,details_data,['datapoint'])

    run.debug = {**run.debug,**debug_infos}
    run.save(out_dir)

    return run

In [None]:
from stringdale.examples.rag import Rag
from stringdale.db import ChromaClient
import tempfile

In [None]:

def load_agent(conf_dir: Path):
    """
    Build a `Rag` agent from the configuration files in `conf_dir`.

    `agent.yml` supplies the keyword arguments of the agent; `vec_db.yml` maps
    collection names to the documents upserted into a Chroma client persisted in
    a fresh temporary directory. The client is handed to the agent as `db`.
    """
    agent_conf = yaml.safe_load((conf_dir / 'agent.yml').read_text())
    vec_db_conf = yaml.safe_load((conf_dir / 'vec_db.yml').read_text())

    db = ChromaClient(persist_path=tempfile.mkdtemp())
    for collection_name, docs in vec_db_conf.items():
        db.add_collection(collection_name, exists_ok=True)
        db.upsert(collection_name, docs)

    # the freshly populated client replaces any `db` entry of the configuration
    agent_kwargs = {**agent_conf, 'db': db}
    return Rag(**agent_kwargs)

In [None]:
conf_dir = eval_ds_dir / "agent_configs"

comparisons = {
    'eq':eq,
    'eval':safe_eval,
    'chat_eval':chat_eval,
    'cosine_dist':cosine_dist,
}

default_comparison = 'cosine_dist'

In [None]:
with checkLogs(level='INFO'):
    run1 = await eval_dataset(
        Agent=load_agent(conf_dir/'v001'),
        test_dir=test_dir,
        out_dir=out_dir/'v001',
        comparisons=comparisons,
        default_comparison=default_comparison,
        )

    run2 = await eval_dataset(
        Agent=load_agent(conf_dir/'v002'),
        test_dir=test_dir,
        out_dir=out_dir/'v002',
        comparisons=comparisons,
        default_comparison=default_comparison)

    run3 = await eval_dataset(
        Agent=load_agent(conf_dir/'v003'),
        test_dir=test_dir,
        out_dir=out_dir/'v003',
        comparisons=comparisons,
        default_comparison=default_comparison)



__main__ - INFO - v001: Evaluating 2/2 datapoints
__main__ - INFO - Force running pikachus.jsonl
__main__ - INFO - Force running goldens.jsonl
__main__ - INFO - v002: Evaluating 2/2 datapoints
__main__ - INFO - Trace file pikachus.jsonl does not exist, running agent
__main__ - INFO - Trace file goldens.jsonl does not exist, running agent
__main__ - INFO - v003: Evaluating 2/2 datapoints
__main__ - INFO - Trace file pikachus.jsonl does not exist, running agent
__main__ - INFO - Trace file goldens.jsonl does not exist, running agent


In [None]:
run1.summary

Unnamed: 0,datapoint,distance,avg_distance,coverage,alignment,serialized_test_case
0,pikachus,0.943595,0.943595,0.5,{'0': 1},"{\n ""inputs"": [\n {\n ""query"": ""tell ..."
1,goldens,0.751374,0.375687,1.0,"{'0': 0, '1': 1}","{\n ""inputs"": [\n {\n ""query"": ""tell ..."


In [None]:
run1.details

Unnamed: 0,datapoint,node_label,trace_idx,comparison,key,actual,expected,distance,node_idx,trace_name,node_name,kwargs,aggregation,comp_id
0,pikachus,0,1,cosine_dist,content,"I'm sorry, but I can only provide information ...",They are:\n * dangerous\n * smoke cigarettes,0.943595,0,chat,chat,{},,0
1,goldens,0,0,chat_eval,.,"[{'id': 'dog1', 'text': 'The Golden Retriever ...","The Golden Retriever is a friendly, intelligen...",0.1,0,get_docs,get_docs,{'system_prompt': 'how close are these 2 docum...,min,0
2,goldens,1,1,cosine_dist,content,"The Golden Retriever is a friendly, intelligen...",They are:\n * friendly\n * intelligent\n,0.651374,1,chat,chat,{},,1


In [None]:
run2.details

Unnamed: 0,datapoint,node_label,trace_idx,comparison,key,actual,expected,distance,node_idx,trace_name,node_name,kwargs,aggregation,comp_id
0,pikachus,0,1,cosine_dist,content,"Pikachus are not dogs, they are fictional crea...",They are:\n * dangerous\n * smoke cigarettes,0.818815,0,chat,chat,{},,0
1,goldens,0,0,chat_eval,.,"[{'id': 'dog1', 'text': 'The Golden Retriever ...","The Golden Retriever is a friendly, intelligen...",0.1,0,get_docs,get_docs,{'system_prompt': 'how close are these 2 docum...,min,0
2,goldens,1,1,cosine_dist,content,"Golden Retrievers are friendly, intelligent do...",They are:\n * friendly\n * intelligent\n,0.602882,1,chat,chat,{},,1


In [None]:
run3.details

Unnamed: 0,datapoint,node_label,trace_idx,comparison,key,actual,expected,distance,node_idx,trace_name,node_name,kwargs,aggregation,comp_id
0,pikachus,0,1,cosine_dist,content,Pikachus are dangerous creatures that smoke to...,They are:\n * dangerous\n * smoke cigarettes,0.443801,0,chat,chat,{},,0
1,goldens,0,0,chat_eval,.,"[{'id': 'dog1', 'text': 'The Golden Retriever ...","The Golden Retriever is a friendly, intelligen...",0.1,0,get_docs,get_docs,{'system_prompt': 'how close are these 2 docum...,min,0
2,goldens,1,1,cosine_dist,content,"The Golden Retriever is a friendly, intelligen...",They are:\n * friendly\n * intelligent\n,0.647368,1,chat,chat,{},,1


## Comparing Runs

In [None]:
# TODO pprint
# have a utility function that prints the summary
# and have a utility function that returns the k datapoints that regressed the most
# have a pprint version of it that actually plots the traces and the difference between them



In [None]:
#| export
import math
from typing import Optional
import textwrap

In [None]:
#| export
class Comparison(BaseModel):
    """
    The differences between two test set runs.

    `summary` and `details` hold the change tables; they are written to `dir`
    by `save` when a directory is set.
    """
    model_config = ConfigDict(arbitrary_types_allowed=True)
    # the run serving as the baseline
    from_run: TestSetRun
    # the run compared against the baseline
    to_run: TestSetRun
    # where to save the comparison; None (the default) disables saving
    dir: Optional[Path] = None
    summary: pd.DataFrame
    details: pd.DataFrame

    def __repr__(self):
        from_run_repr = textwrap.indent(repr(self.from_run), '  ').strip()
        to_run_repr = textwrap.indent(repr(self.to_run), '  ').strip()
        return (
            f"Comparison(\n"
            f"  from_run={from_run_repr}, \n"
            f"  to_run={to_run_repr}, \n"
            f"  summary=Dataframe({self.summary.shape}), \n"
            f"  details=Dataframe({self.details.shape}), \n"
            f")")

    def __str__(self):
        return self.__repr__()

    def save(self):
        """Write `summary.csv` and `details.csv` into `dir`; does nothing when `dir` is None."""
        if self.dir is None:
            return
        self.dir.mkdir(parents=True,exist_ok=True)
        self.summary.to_csv(self.dir/"summary.csv",index=False)
        self.details.to_csv(self.dir/"details.csv",index=False)
    


In [None]:
#| export
def sort_conditions(df):
    """Return `df` sorted by node index, then comparison name, then expected value."""
    sort_keys = ['node_idx','comparison','expected']
    return df.sort_values(by=sort_keys)

def limit_to_datapoint(df,datapoint):
    """Return only the rows of `df` whose `datapoint` column equals `datapoint`."""
    belongs_to_datapoint = df['datapoint'] == datapoint
    return df.loc[belongs_to_datapoint]

def get_datapoint(ds,datapoint):
    """Return the detail rows of `datapoint` in `ds`, in the canonical condition order."""
    datapoint_rows = limit_to_datapoint(ds.details,datapoint)
    return sort_conditions(datapoint_rows)


def describe_changes(ds1,ds2,datapoint,epsilon=1e-3):
    """
    List what changed for one datapoint between two runs.

    The detail rows of the datapoint are taken from both runs, sorted the same
    way and compared pairwise. The result contains at most one
    `alignment_change` row (for the first comparison whose node is aligned to a
    different trace) followed by one `regressed`/`improved` row for every
    comparison whose distance moved by more than `epsilon`.

    Returns:
        A DataFrame of changes; it has no rows (and no columns) when nothing changed.
    """
    datapoint_df1 = get_datapoint(ds1,datapoint)
    datapoint_df2 = get_datapoint(ds2,datapoint)

    # The two runs are expected to be based on the same test case, hence on the
    # same set of (node, comparison) rows; the pairing below relies on it.
    assert datapoint_df1.shape == datapoint_df2.shape, f"Datapoint {datapoint} has different number of rows in the two datasets {ds1} and {ds2}"

    paired_rows = list(zip(datapoint_df1.itertuples(),datapoint_df2.itertuples()))
    changes = []

    # only the first comparison whose node moved to another trace is reported
    moved_pair = next(
        ((old, new) for old, new in paired_rows if old.trace_idx != new.trace_idx),
        None,
    )
    if moved_pair is not None:
        old, new = moved_pair
        changes.append({
            'datapoint':datapoint,
            'change_type':'alignment_change',
            'before':old.trace_idx,
            'after':new.trace_idx,
            'comparison_id':old.comp_id,
        })

    for old, new in paired_rows:
        # distances within epsilon of each other count as unchanged
        if math.isclose(old.distance,new.distance,abs_tol=epsilon):
            continue

        if new.distance + epsilon > old.distance:
            direction = 'regressed'
        elif new.distance - epsilon < old.distance:
            direction = 'improved'
        else:
            # neither ordering holds (e.g. a NaN distance): nothing to report
            continue

        changes.append({
            'datapoint':datapoint,
            'change_type':direction,
            'value':old.distance - new.distance,
            'comp_id':old.comp_id,
            'node_label':old.node_label,
            'expected':old.expected,
            'before':old.actual,
            'after':new.actual,
        })

    return pd.DataFrame(changes)


            

In [None]:
describe_changes(run1,run2,'goldens')

Unnamed: 0,datapoint,change_type,value,comp_id,node_label,expected,before,after
0,goldens,improved,0.048492,1,1,They are:\n * friendly\n * intelligent\n,"The Golden Retriever is a friendly, intelligen...","Golden Retrievers are friendly, intelligent do..."


In [None]:
#| export
def compare_datasets(ds1,ds2,epsilon=1e-3,out_dir=None):
    """
    Compare two runs datapoint by datapoint.

    Datapoints are matched by name, so only those present in both summaries are
    compared; the others are reported in a log warning. A datapoint counts as
    changed when its total distance moved by more than `epsilon` or its
    coverage differs.

    Args:
        ds1: The baseline run.
        ds2: The run compared against the baseline.
        epsilon: Absolute tolerance under which two distances are considered equal.
        out_dir: Optional directory where the comparison is saved.

    Returns:
        A `Comparison` with one summary row per changed datapoint and the
        concatenated detailed changes (already saved when `out_dir` is given).
    """
    # Index the summary rows by datapoint instead of zipping the two tables
    # positionally, which would pair unrelated rows when the datapoint sets differ.
    rows_1 = {row.datapoint: row for row in ds1.summary.itertuples()}
    rows_2 = {row.datapoint: row for row in ds2.summary.itertuples()}

    shared_datapoints = sorted(rows_1.keys() & rows_2.keys(),key=str)
    unshared_datapoints = sorted(rows_1.keys() ^ rows_2.keys(),key=str)
    if unshared_datapoints:
        logger.warning(f"Datapoints present in only one of the runs are not compared: {unshared_datapoints}")

    change_summary = []
    detailed_changes = []

    for datapoint in shared_datapoints:
        row1,row2 = rows_1[datapoint],rows_2[datapoint]
        distance_change = not math.isclose(row1.distance,row2.distance,abs_tol=epsilon)
        coverage_change = row1.coverage != row2.coverage

        if not (distance_change or coverage_change):
            continue

        detailed_change = describe_changes(ds1,ds2,datapoint,epsilon)
        # A frame without changes has no columns at all, so guard the column access.
        if 'change_type' in detailed_change.columns:
            change_types = set(detailed_change['change_type'])
        else:
            change_types = set()
        if not detailed_change.empty:
            detailed_changes.append(detailed_change)

        alignment_change = 'alignment_change' in change_types

        if 'improved' in change_types and 'regressed' not in change_types:
            score_change = 'improved'
        elif 'regressed' in change_types and 'improved' not in change_types:
            score_change = 'regressed'
        else:
            score_change = 'changed'

        change_summary.append({
            'datapoint':datapoint,
            'alignment_change':alignment_change,
            'score_change_type':score_change,
            'total_score_change':row1.distance-row2.distance,
        })

    changes_summary = pd.DataFrame(change_summary)
    if len(detailed_changes) > 0:
        detailed_changes = pd.concat(detailed_changes)
    else:
        detailed_changes = pd.DataFrame()

    comp =  Comparison(
        from_run=ds1,
        to_run=ds2,
        summary=changes_summary,
        details=detailed_changes,
        dir=out_dir,
    )
    comp.save()
    return comp

In [None]:
comparison_2 = compare_datasets(run1,run2)
comparison_2

Comparison(
  from_run=TestSetRun(
    test_dir=/Users/dean/dl/stringdale/sample_data/eval_datasets/test_cases, 
    dir=/Users/dean/dl/stringdale/sample_data/eval_datasets/out_rag/v001, 
    summary=Dataframe((2, 6)), 
    details=Dataframe((3, 14)), 
    debug=dict), 
  to_run=TestSetRun(
    test_dir=/Users/dean/dl/stringdale/sample_data/eval_datasets/test_cases, 
    dir=/Users/dean/dl/stringdale/sample_data/eval_datasets/out_rag/v002, 
    summary=Dataframe((2, 6)), 
    details=Dataframe((3, 14)), 
    debug=dict), 
  summary=Dataframe((2, 4)), 
  details=Dataframe((2, 8)), 
)

In [None]:
comparison_2.summary

Unnamed: 0,datapoint,alignment_change,score_change_type,total_score_change
0,goldens,False,improved,0.048492
1,pikachus,False,improved,0.12478


In [None]:
comparison_2.details

Unnamed: 0,datapoint,change_type,value,comp_id,node_label,expected,before,after
0,goldens,improved,0.048492,1,1,They are:\n * friendly\n * intelligent\n,"The Golden Retriever is a friendly, intelligen...","Golden Retrievers are friendly, intelligent do..."
0,pikachus,improved,0.12478,0,0,They are:\n * dangerous\n * smoke cigarettes,"I'm sorry, but I can only provide information ...","Pikachus are not dogs, they are fictional crea..."


In [None]:
comparison_3 = compare_datasets(run1,run3)
comparison_3.summary


Unnamed: 0,datapoint,alignment_change,score_change_type,total_score_change
0,goldens,False,improved,0.004006
1,pikachus,False,improved,0.499794


In [None]:
comparison_3.details

Unnamed: 0,datapoint,change_type,value,comp_id,node_label,expected,before,after
0,goldens,improved,0.004006,1,1,They are:\n * friendly\n * intelligent\n,"The Golden Retriever is a friendly, intelligen...","The Golden Retriever is a friendly, intelligen..."
0,pikachus,improved,0.499794,0,0,They are:\n * dangerous\n * smoke cigarettes,"I'm sorry, but I can only provide information ...",Pikachus are dangerous creatures that smoke to...


In [None]:
run1

TestSetRun(
  test_dir=/Users/dean/dl/stringdale/sample_data/eval_datasets/test_cases, 
  dir=/Users/dean/dl/stringdale/sample_data/eval_datasets/out_rag/v001, 
  summary=Dataframe((2, 6)), 
  details=Dataframe((3, 14)), 
  debug=dict)

In [None]:
# then continue to make the pprint of a change class with topk changes of each type

In [None]:
# TODO when pprinting comparisons, give URLs to files, so that it's easy to navigate to them in the notebook

## Eval main entrypoint

In [None]:
#| export
from typing import Callable,Dict,List,Optional,Tuple

In [None]:
#| export
# Comparison functions that `eval` always makes available, keyed by comparison name.
EVAL_COMPARISONS = dict(
    eq=eq,
    eval=safe_eval,
    chat_eval=chat_eval,
    cosine_dist=cosine_dist,
)

# Key of EVAL_COMPARISONS that `eval` falls back to when no default comparison is given.
EVAL_DEFAULT_COMPARISON = 'cosine_dist'

In [None]:
#| export
class EvalResult(BaseModel):
    """
    Container for the outcome of an evaluation: every individual run plus the
    comparisons made between runs.

    Attributes:
        runs (Dict[str, TestSetRun]): Test run results, keyed by agent name.
        comparisons (Dict[Tuple[str, str], Comparison]): Comparison results, keyed by the
            (base_run, other_run) pair of agent names that were compared.
    """
    # Let pydantic accept field types it has no built-in validation for.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    runs: Dict[str, TestSetRun]
    comparisons: Dict[Tuple[str, str], Comparison]

    def __repr__(self) -> str:
        # Only the keys are shown; the runs and comparisons themselves are left out.
        run_names = list(self.runs)
        comparison_keys = list(self.comparisons)
        return (
            "EvalResult(\n"
            f"  runs: {run_names},\n"
            f"  comparisons: {comparison_keys}\n"
            ")"
        )

    def __str__(self) -> str:
        # str() and repr() share the same compact form.
        return repr(self)

In [None]:
#| export
async def eval(
    test_dir:Path,
    out_dir:Path,
    agents:List[Tuple[str,DiagramSchema]],
    k:Optional[int]=5,
    force_run:bool=False,
    silent:bool=False,
    comparisons: Optional[Dict[str,Callable]]=None,
    default_comparison:Optional[Union[str,Callable]]=None,
    ):
    """
    The main eval function.
    Evaluates a set of agents on a set of tests and compares the results of
    every agent to those of the first agent.

    Each agent is evaluated into ``out_dir / 'logs' / <agent name>`` and each
    comparison is written to ``out_dir / 'comparisons' / <base name>_<other name>``.

    Args:
        test_dir: Path to the directory containing the tests.
        out_dir: Path to the directory to write the results to.
        agents: A list of tuples of agent names and their DiagramSchema.
            The first agent is the baseline every other agent is compared to.
        k: The number of datapoints to print to the summary at most. Defaults to 5.
            NOTE: not used by the current implementation.
        force_run: Intended to delete the out dir content and rerun the agents.
            Defaults to False. NOTE: not used by the current implementation.
        silent: Intended to suppress pprinting of comparisons. Defaults to False.
            NOTE: not used by the current implementation.
        comparisons: A dictionary of comparison names and their functions, added to
            (and, on a name clash, overriding) EVAL_COMPARISONS. Defaults to None.
        default_comparison: The comparison to use when a test does not specify one.
            Defaults to EVAL_DEFAULT_COMPARISON ('cosine_dist').

    Returns:
        An EvalResult whose ``runs`` are keyed by agent name and whose ``comparisons``
        are keyed by (first agent name, other agent name).

    Raises:
        ValueError: If ``agents`` is empty.
        BaseException: The first failure raised while evaluating an agent, re-raised
            with the agent name prepended to its message.
    """
    if not agents:
        raise ValueError("eval requires at least one agent")

    # User supplied comparisons extend the built-in ones without mutating the module-level dict.
    comparison_funcs = EVAL_COMPARISONS if comparisons is None else EVAL_COMPARISONS | comparisons
    if default_comparison is None:
        default_comparison = EVAL_DEFAULT_COMPARISON

    agent_names = [agent_name for agent_name,_ in agents]

    # All agents are evaluated concurrently; failures are collected so that
    # they can be reported together with the name of the agent that caused them.
    datasets = await asyncio.gather(
        *(
            eval_dataset(
                Agent=agent_schema,
                test_dir=test_dir,
                out_dir=out_dir / 'logs' / agent_name,
                comparisons=comparison_funcs,
                default_comparison=default_comparison,
            )
            for agent_name,agent_schema in agents
        ),
        return_exceptions=True,
    )

    for agent_name,result in zip(agent_names,datasets):
        # BaseException also covers asyncio.CancelledError, which gather returns as a result too.
        if isinstance(result,BaseException):
            prefix = f"When evaluating agent {agent_name}:"
            if result.args:
                result.args = (f"{prefix}\n{result.args[0]}",) + result.args[1:]
            else:
                # Exceptions raised without arguments have nothing to prepend to.
                result.args = (prefix,)
            raise result

    # Comparisons are keyed by agent name, exactly like `runs`, so that both
    # dictionaries of the EvalResult can be cross referenced.
    base_name,base_dataset = agent_names[0],datasets[0]
    run_comparisons = {}
    for other_name,other_dataset in zip(agent_names[1:],datasets[1:]):
        comp_dir = out_dir / 'comparisons' / f'{base_name}_{other_name}'
        run_comparisons[(base_name,other_name)] = compare_datasets(
            base_dataset,other_dataset,out_dir=comp_dir)

    return EvalResult(
        runs=dict(zip(agent_names,datasets)),
        comparisons=run_comparisons)



In [None]:
# One (name, schema) pair per agent version, loaded from its directory under conf_dir.
agents = [(version, load_agent(conf_dir / version)) for version in ('v001', 'v002', 'v003')]
agents

[('v001', <stringdale.base.DiagramSchema>),
 ('v002', <stringdale.base.DiagramSchema>),
 ('v003', <stringdale.base.DiagramSchema>)]

In [None]:
# Input test cases and the output directory of the eval run, both located under the git root.
test_dir = get_git_root().joinpath("sample_data", "eval_datasets", "test_cases")
eval_out = get_git_root().joinpath("sample_data", "eval_datasets", "eval_out")

In [None]:
res = await eval(
    test_dir=test_dir,
    out_dir=eval_out,
    agents=agents,
)

In [None]:
res

EvalResult(
  runs: ['v001', 'v002', 'v003'],
  comparisons: [('v001', 'v002'), ('v001', 'v003')]
)

In [None]:
res.runs['v001'].summary

Unnamed: 0,datapoint,distance,avg_distance,coverage,alignment,serialized_test_case
0,pikachus,0.943595,0.943595,0.5,{'0': 1},"{\n ""inputs"": [\n {\n ""query"": ""tell ..."
1,goldens,0.751374,0.375687,1.0,"{'0': 0, '1': 1}","{\n ""inputs"": [\n {\n ""query"": ""tell ..."


In [None]:
res.runs['v001'].details

Unnamed: 0,datapoint,node_label,trace_idx,comparison,key,actual,expected,distance,node_idx,trace_name,node_name,kwargs,aggregation,comp_id
0,pikachus,0,1,cosine_dist,content,"I'm sorry, but I can only provide information ...",They are:\n * dangerous\n * smoke cigarettes,0.943595,0,chat,chat,{},,0
1,goldens,0,0,chat_eval,.,"[{'id': 'dog1', 'text': 'The Golden Retriever ...","The Golden Retriever is a friendly, intelligen...",0.1,0,get_docs,get_docs,{'system_prompt': 'how close are these 2 docum...,min,0
2,goldens,1,1,cosine_dist,content,"The Golden Retriever is a friendly, intelligen...",They are:\n * friendly\n * intelligent\n,0.651374,1,chat,chat,{},,1


In [None]:
res.comparisons[('v001','v002')].summary

Unnamed: 0,datapoint,alignment_change,score_change_type,total_score_change
0,goldens,False,improved,0.048492
1,pikachus,False,improved,0.12478


In [None]:
res.comparisons[('v001','v002')].details

Unnamed: 0,datapoint,change_type,value,comp_id,node_label,expected,before,after
0,goldens,improved,0.048492,1,1,They are:\n * friendly\n * intelligent\n,"The Golden Retriever is a friendly, intelligen...","Golden Retrievers are friendly, intelligent do..."
0,pikachus,improved,0.12478,0,0,They are:\n * dangerous\n * smoke cigarettes,"I'm sorry, but I can only provide information ...","Pikachus are not dogs, they are fictional crea..."


In [None]:
res.runs['v001'].summary

Unnamed: 0,datapoint,distance,avg_distance,coverage,alignment,serialized_test_case
0,pikachus,0.943595,0.943595,0.5,{'0': 1},"{\n ""inputs"": [\n {\n ""query"": ""tell ..."
1,goldens,0.751374,0.375687,1.0,"{'0': 0, '1': 1}","{\n ""inputs"": [\n {\n ""query"": ""tell ..."


In [None]:
#| export
import rich
from rich.padding import Padding

In [None]:
#| export
def rprint(obj,indent:int=0,sep_by:int=2):
    """
    Print `obj` with rich, padded on the left by `indent * sep_by` columns.

    Args:
        obj: The renderable to print.
        indent: Number of indentation levels. Defaults to 0.
        sep_by: Width of a single indentation level. Defaults to 2.
    """
    left_pad = indent * sep_by
    # pad is (top, right, bottom, left): only the left side is padded.
    padded = Padding(obj, pad=(0, 0, 0, left_pad))
    rich.print(padded)

In [None]:
def pprint_run_summary(res:EvalResult,run_name:str,topk:int=5):
    """
    Placeholder, not implemented yet: does nothing and returns None.

    Planned output: pprint name of the run, average distance and coverage.
    """
    return None

def pprint_comparison_summary(res:EvalResult,comparison_name:Tuple[str,str],topk:int=5):
    """
    Placeholder, not implemented yet: does nothing and returns None.

    Planned output:
        * number of datapoints whose alignment changed (write names of topk)
        * number of datapoints whose alignment improved (write names of topk)
        * number of datapoints whose alignment regressed (write names of topk)
    """
    return None



In [None]:
def pprint_datapoint(res:EvalResult,datapoint:str,default_comparison="cosine_dist"):
    """
    Collect everything known about a single datapoint into a dict of template parameters.

    Despite its name this function does not print anything: it returns the
    parameters that are rendered with `datapoint_template`.

    Args:
        res: The evaluation result to read runs and comparisons from.
        datapoint: Name of the datapoint, as it appears in the 'datapoint' column
            of the summary and details dataframes.
        default_comparison: Currently unused.

    Returns:
        A dict with the keys:
            base_name: Name of the baseline run (the base of the first comparison,
                or the first run when there are no comparisons).
            datapoint: The datapoint name.
            run_summaries: Summary row of the datapoint per run name.
            comp_summaries: Summary row of the datapoint per (base_run, other_run).
            per_comp: For every comparison id listed in the comparison details,
                the detail row of each run that has one. Non baseline runs also get
                'change_type' and 'change_value' when their comparison to the
                baseline lists that comparison id.
            version_style, param_style, comp_config_style, output_style: rich styles.

    Raises:
        ValueError: If `res` holds no runs and no comparisons.
        KeyError: If the datapoint is missing from a summary dataframe.
    """
    # A single agent eval has no comparisons, its only run is then the baseline.
    if res.comparisons:
        base_name = next(iter(res.comparisons))[0]
    elif res.runs:
        base_name = next(iter(res.runs))
    else:
        raise ValueError("EvalResult has no runs to print")

    jinja_params = {
        'base_name':base_name,
        'datapoint':datapoint,
        'run_summaries':{},
        'comp_summaries':{},
        'per_comp':{},

        'version_style':'purple',
        'param_style':'cyan bold',
        'comp_config_style':'green bold',
        'output_style':'#CE9178',
    }

    for run_name,run in res.runs.items():
        jinja_params['run_summaries'][run_name] = run.summary.set_index('datapoint').loc[datapoint].to_dict()

    for (base_run,other_run),comparison in res.comparisons.items():
        jinja_params['comp_summaries'][base_run,other_run] = comparison.summary.set_index('datapoint').loc[datapoint].to_dict()

    # Restrict the detail frames of every run and comparison to the requested datapoint.
    details = {
        run_name:run.details[run.details['datapoint'] == datapoint]
        for run_name,run in res.runs.items()
    }
    comp_details = {
        (base_run,other_run):comparison.details[comparison.details['datapoint'] == datapoint]
        for (base_run,other_run),comparison in res.comparisons.items()
    }

    # Only the comparison ids listed by at least one comparison are reported.
    # Empty columns are dropped first since pd.concat refuses an empty input.
    listed_ids = [
        comparison['comp_id'] for comparison in comp_details.values()
        if not comparison.empty
    ]
    comparison_ids = pd.concat(listed_ids).unique() if listed_ids else []

    per_comp = {}
    for comparison_id in comparison_ids:
        per_run = {}
        for run_name,run in details.items():
            rows = run[run['comp_id'] == comparison_id]
            if rows.empty:
                # This run has no detail row for this comparison id.
                continue
            run_det = rows.iloc[0].to_dict()

            # Empty kwargs are normalized to None, in string form or as a dict.
            kwargs = run_det.get('kwargs')
            if kwargs == '{}' or kwargs == {}:
                run_det['kwargs'] = None

            if run_name != base_name:
                changes = comp_details.get((base_name,run_name))
                if changes is not None:
                    changes = changes[changes['comp_id'] == comparison_id]
                    # A comparison id may be listed by one comparison but not by another.
                    if not changes.empty:
                        run_det['change_type'] = changes['change_type'].iloc[0]
                        run_det['change_value'] = changes['value'].iloc[0]
            per_run[run_name] = run_det
        per_comp[comparison_id] = per_run

    jinja_params['per_comp'] = per_comp
    return jinja_params
    

In [None]:
from stringdale.core import jinja_render

In [None]:
jinja_params = pprint_datapoint(res,'pikachus')
jinja_params

{'base_name': 'v001',
 'datapoint': 'pikachus',
 'run_summaries': {'v001': {'distance': np.float64(0.9435950697036162),
   'avg_distance': np.float64(0.9435950697036162),
   'coverage': np.float64(0.5),
   'alignment': frozendict.frozendict({'0': 1}),
   'serialized_test_case': '{\n  "inputs": [\n    {\n      "query": "tell me about pikachus"\n    }\n  ],\n  "test_nodes": [\n    {\n      "name": "chat",\n      "conditions": [\n        {\n          "key": "content",\n          "value": "They are:\\n  * dangerous\\n  * smoke cigarettes"\n        }\n      ]\n    }\n  ]\n}'},
  'v002': {'distance': np.float64(0.818815274469515),
   'avg_distance': np.float64(0.818815274469515),
   'coverage': np.float64(0.5),
   'alignment': frozendict.frozendict({'0': 1}),
   'serialized_test_case': '{\n  "inputs": [\n    {\n      "query": "tell me about pikachus"\n    }\n  ],\n  "test_nodes": [\n    {\n      "name": "chat",\n      "conditions": [\n        {\n          "key": "content",\n          "value"

In [None]:


# Jinja template (with rich markup) for the parameters returned by `pprint_datapoint`.
# Distances and coverage are plain numbers, so they are formatted without a currency sign.
datapoint_template="""
{{datapoint}}
  summary:
    {%- for run_name,run in run_summaries.items() %}
      [{{version_style}}]{{run_name}}[/{{version_style}}] - Dist: {{"%.2f"|format(run.distance)}} AvgDist: {{"%.2f"|format(run.avg_distance)}} Coverage: {{ "%.2f"|format(run.coverage)}}
    {%- endfor %}
    {%- for (base_run,other_run),comparison in comp_summaries.items() %}
      [{{version_style}}]{{base_run}}[/{{version_style}}] vs [{{version_style}}]{{other_run}}[/{{version_style}}]: \
Alignment change: [{{param_style}}]{{comparison.alignment_change}}[/{{param_style}}] \
Score change: [{{param_style}}]{{comparison.score_change_type}}[/{{param_style}}] \
Score by: {{ "%.2f"|format(comparison.total_score_change)}}
    {%- endfor%}
  details:
    {%- for comp_id,comp in per_comp.items() %}
    Comparison #[{{comp_config_style}}]{{comp_id}}[/{{comp_config_style}}], node_pattern: [{{comp_config_style}}]{{comp[base_name].node_name}}[/{{comp_config_style}}], key: [{{comp_config_style}}]{{comp[base_name].key}}[/{{comp_config_style}}], func: [{{comp_config_style}}]{{comp[base_name].comparison}}[/{{comp_config_style}}]
      {% if comp[base_name].kwargs -%}
        kwargs: [{{output_style}}]{{comp[base_name].kwargs}}[/{{output_style}}]
      {%- endif -%}
      expected: 
[{{output_style}}]{{comp[base_name].expected | wordwrap(width=100) | indent(10,true) }}[/{{output_style}}]  
      {% for run_name,run in comp.items() %}
      {{run_name}} - matched [green]{{run.trace_name}}[/green](#{{run.trace_idx}})
      {%- if run.alignment_change %}
          , Alignment change: {{run.alignment_change}}
      {%- endif -%}
      {%- if run.change_type -%}
          , {{run.change_type}}: {{ "%.2f"|format(run.change_value) }}
      {%- endif -%}:
[{{output_style}}]{{run.actual | wordwrap(width=100) | indent(10,true)}}[/{{output_style}}]
      {% endfor -%}
    {% endfor %}
"""


In [None]:
# Render the datapoint template with the collected parameters and print the result through rich.
with rich.get_console():
    rprint(jinja_render(datapoint_template,jinja_params))

In [None]:
# TODO from here, turn that into a single function that generates the string.
# Then make the templates for the summaries of whole runs and whole comparisons.

In [None]:
pprint_datapoint(res,'pikachus') # TODO from here, make a ppprint function that does the indentations and stuff

{'base_name': 'v001',
 'datapoint': 'pikachus',
 'run_summaries': {'v001': {'distance': np.float64(0.9435950697036162),
   'avg_distance': np.float64(0.9435950697036162),
   'coverage': np.float64(0.5),
   'alignment': "frozendict.frozendict({'0': 1})",
   'serialized_test_case': '{\n  "inputs": [\n    {\n      "query": "tell me about pikachus"\n    }\n  ],\n  "test_nodes": [\n    {\n      "name": "chat",\n      "conditions": [\n        {\n          "key": "content",\n          "value": "They are:\\n  * dangerous\\n  * smoke cigarettes"\n        }\n      ]\n    }\n  ]\n}'},
  'v002': {'distance': np.float64(0.818815274469515),
   'avg_distance': np.float64(0.818815274469515),
   'coverage': np.float64(0.5),
   'alignment': "frozendict.frozendict({'0': 1})",
   'serialized_test_case': '{\n  "inputs": [\n    {\n      "query": "tell me about pikachus"\n    }\n  ],\n  "test_nodes": [\n    {\n      "name": "chat",\n      "conditions": [\n        {\n          "key": "content",\n          "va

In [None]:
from rich.console import Console
from rich.padding import Padding

# Scratch demo of rich's Padding, which is what `rprint` uses for indentation.
console = Console()

console.print("This text is not indented.")
console.print(Padding("This text is indented by 4 spaces on the left.", pad=(0, 0, 0, 4)))
console.print(Padding("This text has padding on all sides.", pad=(1, 2, 3, 4))) # top, right, bottom, left

In [None]:
def pprint_eval(res:EvalResult,topk:int=5):
    """
    Placeholder, not implemented yet: does nothing and returns None.

    Planned output:
        * print the summary of each run separately?
        * then pprint the summary of each comparison separately
        * then group the per datapoint comps across all dataset comparisons by the
          datapoint id, and for each datapoint pprint a combined datapoint comparison:
          the total metrics of each version, and then for each comparison that is
          different from baseline, how it is different for every version.
    """
    return None

In [None]:
# TODO patch the EvalResult class to be able to pprint

In [None]:
# TODO 
# in the end, we want few entrypoints:

# eval, eval_single, and align_trace, validate_tests (just to make sure they are formatted correctly)

# eval will get lists of versions, and which comparisons to do, and log dir to save results to etc..


# TODO add a performance section to the tutorial. Explain the lazy eval of distances
# TODO add to "V" the ability to specify the funcname for presenting in the drawing


In [None]:
# explain in tutorial

# how do we do training and validation on workflows?

# we have 2 expected datasets, train and test

# we look at the total distance of the validation set to see that we are improving on it

# but we only look at the comparisons and fix our configs or diagrams based on the training set

In [None]:
# TODO add asyncio semaphores to Chat and DB operations etc., so that we don't get rate limited due to tons of async requests

""" # Optional semaphore
from contextlib import asynccontextmanager
from typing import Optional

@asynccontextmanager
async def optional_semaphore(semaphore: Optional[asyncio.Semaphore] = None):
    if semaphore is not None:
        async with semaphore:
            yield
    else:
        yield

# Usage example:
async def my_function(limit_concurrency: bool = False):
    sem = asyncio.Semaphore(2) if limit_concurrency else None
    
    async with optional_semaphore(sem):
        # Your async code here
        await asyncio.sleep(1)
        print("Function executed")


"""

' # Optional semaphore\nfrom contextlib import asynccontextmanager\nfrom typing import Optional\n\n@asynccontextmanager\nasync def optional_semaphore(semaphore: Optional[asyncio.Semaphore] = None):\n    if semaphore is not None:\n        async with semaphore:\n            yield\n    else:\n        yield\n\n# Usage example:\nasync def my_function(limit_concurrency: bool = False):\n    sem = asyncio.Semaphore(2) if limit_concurrency else None\n\n    async with optional_semaphore(sem):\n        # Your async code here\n        await asyncio.sleep(1)\n        print("Function executed")\n\n\n'

## export

In [None]:
# |hide
import nbdev; nbdev.nbdev_export()