In [133]:
import sys
from pathlib import Path
import pandas as pd
import json

sys.path.insert(0, str(Path.cwd().parent / 'src'))

from meta_ally.eval.analyze_reports import (
    load_evaluation_run,
    reports_to_dataframe,
    run_to_dataframe,
)

# Display tuning for rendered frames: show every column, truncate cell text at 50 characters.
for _option_name, _option_value in (
    ('display.max_columns', None),
    ('display.max_colwidth', 50),
):
    pd.set_option(_option_name, _option_value)

## 1. Discover All Evaluation Runs

In [134]:
# Results are expected in an 'evaluation_results' folder that is a sibling of
# the current working directory.
eval_base_dir = Path.cwd().parent.joinpath('evaluation_results')

# Keep directories only; sorting the Path objects gives a stable listing order.
run_dirs = sorted(filter(Path.is_dir, eval_base_dir.iterdir()))
print(f"Found {len(run_dirs)} evaluation runs:\n")
for run_dir in run_dirs:
    print(f"  - {run_dir.name}")

Found 12 evaluation runs:

  - multi_agent_orchestrator_improved_descriptions_gpt-4.1-mini_20260218_131941
  - multi_agent_orchestrator_improved_descriptions_gpt-4.1-mini_20260218_135309
  - multi_agent_orchestrator_improved_descriptions_gpt-4.1-mini_20260218_165341
  - single_agent_improved_descriptions_gpt-4.1-mini_20260217_112944
  - single_agent_improved_descriptions_gpt-4.1-mini_20260217_142937
  - single_agent_improved_descriptions_gpt-4.1-mini_20260218_171249
  - single_agent_improved_descriptions_gpt-5-mini_20260217_144852
  - single_agent_improved_descriptions_gpt-5-mini_20260217_153733
  - single_agent_improved_descriptions_gpt-5-mini_20260217_164843
  - single_agent_no_improved_descriptions_gpt-4.1-mini_20260218_100322
  - single_agent_no_improved_descriptions_gpt-4.1-mini_20260218_102954
  - single_agent_no_improved_descriptions_gpt-4.1-mini_20260218_173251


## 2. Load All Case Reports into DataFrames

We'll create a function that loads every report from every run folder and returns one DataFrame per run, keyed by the run folder name.

In [135]:
def _sanitize_column_name(name: str) -> str:
    """Turn an evaluator name into a column-safe suffix (spaces and hyphens become underscores)."""
    return name.replace(' ', '_').replace('-', '_')


def _case_to_row(case: dict, run_folder: str, dataset_id: str, token_scale: float) -> dict:
    """Flatten one case entry of a report into a single flat row dictionary."""
    row = {
        'run_folder': run_folder,
        'dataset_id': dataset_id,
        'case_name': case['name'],
    }

    # Usage metrics; absent metrics fall back to zero.
    metrics = case.get('metrics', {})
    row['input_tokens'] = metrics.get('input_tokens', 0) * token_scale
    row['output_tokens'] = metrics.get('output_tokens', 0) * token_scale
    row['requests'] = metrics.get('requests', 0)
    row['cost'] = metrics.get('cost', 0.0)

    # Each score contributes a value column and a companion '<name>_reason' column.
    for score_name, score_data in case.get('scores', {}).items():
        col_name = _sanitize_column_name(score_name)
        row[f'score_{col_name}'] = score_data.get('value', 0.0)
        row[f'score_{col_name}_reason'] = score_data.get('reason', None)

    # Assertions contribute only their value.
    for assertion_name, assertion_data in case.get('assertions', {}).items():
        col_name = _sanitize_column_name(assertion_name)
        row[f'assertion_{col_name}'] = assertion_data.get('value', None)

    return row


def load_all_case_reports(base_dir: Path, token_scale: float = 0.5) -> dict[str, pd.DataFrame]:
    """
    Load all case reports from all evaluation runs.

    Every sub-directory of ``base_dir`` that contains a ``reports`` folder is
    treated as one run. Each ``*.json`` file in that folder is read, and every
    entry of its ``cases`` list becomes one row.

    Args:
        base_dir: Directory whose sub-directories are the evaluation runs.
        token_scale: Multiplier applied to the input and output token counts.
            The default of 0.5 keeps the original behaviour, which the notebook
            introduced "to fix doubled counts". Pass 1.0 to keep the counts
            exactly as stored in the reports.

    Returns:
        A dictionary where keys are run folder names and values are DataFrames
        containing all cases from all reports in that folder. Runs without a
        ``reports`` directory, or without any cases, are omitted.

    Raises:
        KeyError: If a case entry has no ``name`` field.
    """
    all_reports = {}

    for run_dir in sorted(d for d in base_dir.iterdir() if d.is_dir()):
        reports_dir = run_dir / 'reports'
        if not reports_dir.exists():
            continue

        all_cases = []
        for case_file in sorted(reports_dir.glob('*.json')):
            with open(case_file, 'r', encoding='utf-8') as f:
                report = json.load(f)

            # The file name (without extension) identifies the dataset.
            dataset_id = case_file.stem

            all_cases.extend(
                _case_to_row(case, run_dir.name, dataset_id, token_scale)
                for case in report.get('cases', [])
            )

        if all_cases:
            all_reports[run_dir.name] = pd.DataFrame(all_cases)

    return all_reports

# Build one DataFrame per run folder from the JSON reports on disk.
all_case_dfs = load_all_case_reports(eval_base_dir)

# Report how many cases each run contributed.
print(f"Loaded {len(all_case_dfs)} evaluation runs:\n")
for folder_name in all_case_dfs:
    df = all_case_dfs[folder_name]
    print(f"{folder_name}: {len(df)} cases")

Loaded 12 evaluation runs:

multi_agent_orchestrator_improved_descriptions_gpt-4.1-mini_20260218_131941: 20 cases
multi_agent_orchestrator_improved_descriptions_gpt-4.1-mini_20260218_135309: 24 cases
multi_agent_orchestrator_improved_descriptions_gpt-4.1-mini_20260218_165341: 19 cases
single_agent_improved_descriptions_gpt-4.1-mini_20260217_112944: 24 cases
single_agent_improved_descriptions_gpt-4.1-mini_20260217_142937: 20 cases
single_agent_improved_descriptions_gpt-4.1-mini_20260218_171249: 19 cases
single_agent_improved_descriptions_gpt-5-mini_20260217_144852: 20 cases
single_agent_improved_descriptions_gpt-5-mini_20260217_153733: 24 cases
single_agent_improved_descriptions_gpt-5-mini_20260217_164843: 19 cases
single_agent_no_improved_descriptions_gpt-4.1-mini_20260218_100322: 24 cases
single_agent_no_improved_descriptions_gpt-4.1-mini_20260218_102954: 20 cases
single_agent_no_improved_descriptions_gpt-4.1-mini_20260218_173251: 19 cases


## 3. Create Combined DataFrame

Combine all runs into a single DataFrame for easier analysis.

In [136]:
# Stack the per-run frames into one frame with a fresh, continuous index.
per_run_frames = list(all_case_dfs.values())
combined_df = pd.concat(per_run_frames, ignore_index=True)

print(f"Combined DataFrame shape: {combined_df.shape}")
print(f"\nColumns: {combined_df.columns.tolist()}")
print("\nFirst few rows:")
combined_df.head()

Combined DataFrame shape: (252, 15)

Columns: ['run_folder', 'dataset_id', 'case_name', 'input_tokens', 'output_tokens', 'requests', 'cost', 'score_ToolCallEvaluator', 'score_ToolCallEvaluator_reason', 'score_Helpfulness_and_accuracy', 'score_Helpfulness_and_accuracy_reason', 'score_Tool_Call_Evaluation', 'score_Tool_Call_Evaluation_reason', 'assertion_LLMJudge_pass', 'assertion_LLMJudge_pass_2']

First few rows:


Unnamed: 0,run_folder,dataset_id,case_name,input_tokens,output_tokens,requests,cost,score_ToolCallEvaluator,score_ToolCallEvaluator_reason,score_Helpfulness_and_accuracy,score_Helpfulness_and_accuracy_reason,score_Tool_Call_Evaluation,score_Tool_Call_Evaluation_reason,assertion_LLMJudge_pass,assertion_LLMJudge_pass_2
0,multi_agent_orchestrator_improved_descriptions...,analytics_case_1,INFORM Webseite Analytics - Initiale Anfrage,190918.0,656.0,8,0.077417,0.0,,1.0,"The model's last response provides a clear, ac...",1.0,The model correctly identified the user's busi...,True,True
1,multi_agent_orchestrator_improved_descriptions...,analytics_case_1,INFORM Webseite Analytics - Initiale Anfrage -...,3904.0,85.0,2,0.001698,0.0,,1.0,The model correctly identified the user's busi...,1.0,The model correctly identified the user's busi...,True,True
2,multi_agent_orchestrator_improved_descriptions...,analytics_case_1,INFORM Webseite Analytics - Initiale Anfrage -...,184134.0,364.0,7,0.074236,0.0,,1.0,The model correctly identified that no Copilot...,1.0,The model correctly identified the user's busi...,True,True
3,multi_agent_orchestrator_improved_descriptions...,analytics_case_1,INFORM Webseite Analytics - Initiale Anfrage -...,286270.0,728.0,10,0.115673,0.0,,1.0,The model's last response provides a detailed ...,1.0,The model correctly identified the user's busi...,True,True
4,multi_agent_orchestrator_improved_descriptions...,analytics_case_2,INFORM Webseite Analytics - Vollständige Analy...,60698.0,545.0,4,0.025151,0.0,,1.0,"The model's final response provides a clear, a...",1.0,The model correctly identified the business ar...,True,True


## 4. Parse Configuration Information

Attach the agent type, model, and improved-descriptions flag to each row by looking up its run folder name in a hardcoded mapping.

In [137]:
# Hardcoded configuration mapping.
# Runs are listed grouped by their shared (agent_type, model, improved_descriptions)
# settings; FOLDER_CONFIGS expands that into one independent dict per run folder.
_CONFIG_KEYS = ('agent_type', 'model', 'improved_descriptions')

_RUN_FOLDERS_BY_SETTINGS = (
    (('single_agent', 'gpt-5-mini', True), (
        'single_agent_improved_descriptions_gpt-5-mini_20260217_144852',
        'single_agent_improved_descriptions_gpt-5-mini_20260217_153733',
        'single_agent_improved_descriptions_gpt-5-mini_20260217_164843',
    )),
    (('single_agent', 'gpt-4.1-mini', False), (
        'single_agent_no_improved_descriptions_gpt-4.1-mini_20260218_173251',
        'single_agent_no_improved_descriptions_gpt-4.1-mini_20260218_100322',
        'single_agent_no_improved_descriptions_gpt-4.1-mini_20260218_102954',
    )),
    (('single_agent', 'gpt-4.1-mini', True), (
        'single_agent_improved_descriptions_gpt-4.1-mini_20260217_112944',
        'single_agent_improved_descriptions_gpt-4.1-mini_20260217_142937',
        'single_agent_improved_descriptions_gpt-4.1-mini_20260218_171249',
    )),
    (('multi_agent', 'gpt-4.1-mini', True), (
        'multi_agent_orchestrator_improved_descriptions_gpt-4.1-mini_20260218_131941',
        'multi_agent_orchestrator_improved_descriptions_gpt-4.1-mini_20260218_135309',
        'multi_agent_orchestrator_improved_descriptions_gpt-4.1-mini_20260218_165341',
    )),
)

FOLDER_CONFIGS = {
    folder: dict(zip(_CONFIG_KEYS, settings))
    for settings, folders in _RUN_FOLDERS_BY_SETTINGS
    for folder in folders
}

def get_config_from_folder(folder_name: str) -> dict:
    """
    Look up the configuration of a run folder in FOLDER_CONFIGS.

    Folders that are not listed in the mapping get a placeholder configuration
    with 'unknown' agent type and model and improved_descriptions set to False.

    Returns: {'agent_type': str, 'improved_descriptions': bool, 'model': str}
    """
    if folder_name in FOLDER_CONFIGS:
        return FOLDER_CONFIGS[folder_name]

    return {
        'agent_type': 'unknown',
        'improved_descriptions': False,
        'model': 'unknown'
    }

# Resolve each row's run folder to its configuration dict, then spread the
# dict's fields into separate columns.
config_info = combined_df['run_folder'].apply(get_config_from_folder)
for config_key in ('agent_type', 'improved_descriptions', 'model'):
    combined_df[config_key] = config_info.apply(lambda cfg, key=config_key: cfg[key])

config_counts = combined_df.groupby(['agent_type', 'model', 'improved_descriptions']).size()
print("Configuration distribution:")
print(config_counts)

Configuration distribution:
agent_type    model         improved_descriptions
multi_agent   gpt-4.1-mini  True                     63
single_agent  gpt-4.1-mini  False                    63
                            True                     63
              gpt-5-mini    True                     63
dtype: int64


## 5. Parse Dataset Type

Extract dataset type (addone, analytics, testing_and_access) from dataset_id.

In [138]:
def parse_dataset_type(dataset_id: str) -> str:
    """
    Classify a dataset_id by its leading prefix.

    Ids starting with 'testing' or 'access' share the 'testing_and_access'
    type; ids matching no known prefix are reported as 'unknown'.

    Example: 'addone_case_1' -> 'addone'
             'analytics_case_2' -> 'analytics'
    """
    # Checked in order; str.startswith accepts a tuple of alternative prefixes.
    prefix_table = (
        (('addone',), 'addone'),
        (('analytics',), 'analytics'),
        (('testing', 'access'), 'testing_and_access'),
    )
    for prefixes, dataset_type in prefix_table:
        if dataset_id.startswith(prefixes):
            return dataset_type
    return 'unknown'

# Tag every row with the dataset type derived from its dataset_id.
combined_df['dataset_type'] = combined_df['dataset_id'].apply(parse_dataset_type)

dataset_type_counts = combined_df['dataset_type'].value_counts()
print("Dataset type distribution:")
print(dataset_type_counts)

Dataset type distribution:
dataset_type
addone                96
analytics             80
testing_and_access    76
Name: count, dtype: int64


## 6. Individual DataFrames by Run Folder

Access individual DataFrames by folder name:

In [139]:
# Example: take the first loaded run (dicts keep insertion order) and inspect it.
example_folder, example_df = list(all_case_dfs.items())[0]

print(f"Example: {example_folder}")
print(f"Shape: {example_df.shape}")
example_df.head()

Example: multi_agent_orchestrator_improved_descriptions_gpt-4.1-mini_20260218_131941
Shape: (20, 15)


Unnamed: 0,run_folder,dataset_id,case_name,input_tokens,output_tokens,requests,cost,score_ToolCallEvaluator,score_ToolCallEvaluator_reason,score_Helpfulness_and_accuracy,score_Helpfulness_and_accuracy_reason,score_Tool_Call_Evaluation,score_Tool_Call_Evaluation_reason,assertion_LLMJudge_pass,assertion_LLMJudge_pass_2
0,multi_agent_orchestrator_improved_descriptions...,analytics_case_1,INFORM Webseite Analytics - Initiale Anfrage,190918.0,656.0,8,0.077417,0.0,,1.0,"The model's last response provides a clear, ac...",1.0,The model correctly identified the user's busi...,True,True
1,multi_agent_orchestrator_improved_descriptions...,analytics_case_1,INFORM Webseite Analytics - Initiale Anfrage -...,3904.0,85.0,2,0.001698,0.0,,1.0,The model correctly identified the user's busi...,1.0,The model correctly identified the user's busi...,True,True
2,multi_agent_orchestrator_improved_descriptions...,analytics_case_1,INFORM Webseite Analytics - Initiale Anfrage -...,184134.0,364.0,7,0.074236,0.0,,1.0,The model correctly identified that no Copilot...,1.0,The model correctly identified the user's busi...,True,True
3,multi_agent_orchestrator_improved_descriptions...,analytics_case_1,INFORM Webseite Analytics - Initiale Anfrage -...,286270.0,728.0,10,0.115673,0.0,,1.0,The model's last response provides a detailed ...,1.0,The model correctly identified the user's busi...,True,True
4,multi_agent_orchestrator_improved_descriptions...,analytics_case_2,INFORM Webseite Analytics - Vollständige Analy...,60698.0,545.0,4,0.025151,0.0,,1.0,"The model's final response provides a clear, a...",1.0,The model correctly identified the business ar...,True,True


## 7. Summary Statistics

Calculate summary statistics across configurations.

In [140]:
# Numeric score columns only; the companion '*_reason' columns are excluded.
score_cols = [col for col in combined_df.columns if col.startswith('score_') and not col.endswith('_reason')]

# Work on a copy so the display-friendly labels never leak back into combined_df.
df_for_summary = combined_df.copy()
df_for_summary['agent_type'] = df_for_summary['agent_type'].replace({
    'multi_agent': 'Multi',
    'single_agent': 'Single'
})
df_for_summary['dataset_type'] = df_for_summary['dataset_type'].replace({
    'addone': 'AddOne',
    'analytics': 'Analytics',
    'testing_and_access': 'Testing and Access'
})

# Summary by configuration (rename dataset_type to User Story in groupby):
# case count plus mean tokens, cost and scores per group, rounded to 2 decimals.
summary = df_for_summary.rename(columns={'dataset_type': 'User Story'}).groupby(['agent_type', 'model', 'improved_descriptions', 'User Story']).agg({
    'case_name': 'count',
    'input_tokens': 'mean',
    'output_tokens': 'mean',
    'cost': 'mean',
    **{col: 'mean' for col in score_cols}
}).round(2)

# Rename columns to abbreviated report-style names
column_renames = {
    'case_name': 'Cases',
    'input_tokens': 'Avg. In. Tok.',
    'output_tokens': 'Avg. Out. Tok.',
    'cost': 'Avg. Cost'
}

# Add shortened score column names (remove 'score_' prefix)
for col in score_cols:
    # Convert score_some_name -> Some Name.
    # removeprefix strips only the leading 'score_'; str.replace would also
    # delete any later 'score_' inside the evaluator name itself.
    clean_name = col.removeprefix('score_').replace('_', ' ').title()
    # Special case for Toolcallevaluator
    if clean_name == 'Toolcallevaluator':
        clean_name = 'Tool Names'
    column_renames[col] = clean_name

# Reassign instead of renaming in place so the rename step yields a new frame.
summary = summary.rename(columns=column_renames)

# Rename index levels for cleaner display
summary.index.names = ['Agent Type', 'Model', 'Improved Descriptions', 'User Story']
summary


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Cases,Avg. In. Tok.,Avg. Out. Tok.,Avg. Cost,Tool Names,Helpfulness And Accuracy,Tool Call Evaluation
Agent Type,Model,Improved Descriptions,User Story,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Multi,gpt-4.1-mini,True,AddOne,24,21154.42,294.96,0.01,0.62,0.99,0.96
Multi,gpt-4.1-mini,True,Analytics,20,62307.5,541.6,0.03,0.1,0.93,0.82
Multi,gpt-4.1-mini,True,Testing and Access,19,26000.47,364.68,0.01,0.46,0.96,1.0
Single,gpt-4.1-mini,False,AddOne,24,61426.79,203.33,0.02,0.8,0.96,0.92
Single,gpt-4.1-mini,False,Analytics,20,55883.05,227.85,0.02,0.39,0.96,0.74
Single,gpt-4.1-mini,False,Testing and Access,19,61817.84,110.95,0.02,0.61,0.78,0.78
Single,gpt-4.1-mini,True,AddOne,24,47465.17,215.29,0.02,0.63,0.96,0.92
Single,gpt-4.1-mini,True,Analytics,20,48653.2,242.35,0.02,0.45,0.97,0.74
Single,gpt-4.1-mini,True,Testing and Access,19,64103.16,116.11,0.03,0.58,0.79,0.76
Single,gpt-5-mini,True,AddOne,24,94873.08,2233.88,0.03,0.71,0.96,0.92


In [None]:
# Convert summary to a LaTeX table with line breaks in column names, index names, and values.
# Start from a copy so the LaTeX markup added in this cell does not alter `summary` itself.
summary_latex = summary.copy()

# Helper that wraps long labels in a two-line, top-left aligned \makecell
def add_line_breaks(text, min_length=5):
    """
    Break a label into two lines with a LaTeX \\makecell[tl] wrapper.

    Labels of at most min_length characters are returned unchanged. A label
    containing both '-' and '.' (e.g. a model name like gpt-4.1-mini) is
    broken right after its first hyphen. Any other label with more than one
    whitespace-separated word is broken at the middle word boundary.
    Single-word labels are returned unchanged.
    """
    if len(text) > min_length:
        # Hyphen-and-dot labels: keep the hyphen on the first line.
        if '-' in text and '.' in text:
            prefix, _, remainder = text.partition('-')
            return f"\\makecell[tl]{{{prefix}-\\\\{remainder}}}"

        # Multi-word labels: the second line takes the extra word on odd counts.
        tokens = text.split()
        if len(tokens) > 1:
            half = len(tokens) // 2
            first_line, second_line = ' '.join(tokens[:half]), ' '.join(tokens[half:])
            return f"\\makecell[tl]{{{first_line}\\\\{second_line}}}"

    return text

# Index values have to be edited as ordinary columns, so flatten the index,
# rewrite the 'User Story' and 'Model' labels, then rebuild the same index.
index_levels = ['Agent Type', 'Model', 'Improved Descriptions', 'User Story']


def _break_if_str(value):
    """Apply add_line_breaks to strings and pass every other value through."""
    return add_line_breaks(value) if isinstance(value, str) else value


summary_latex = summary_latex.reset_index()
for level in ('User Story', 'Model'):
    summary_latex[level] = summary_latex[level].apply(_break_if_str)
summary_latex = summary_latex.set_index(index_levels)

# Column headers next.
summary_latex.columns = list(map(add_line_breaks, summary_latex.columns))

# Index level names last (the index must already be set); empty names stay as they are.
summary_latex.index.names = [name and add_line_breaks(name) for name in summary_latex.index.names]

# escape=False keeps the \makecell commands inserted above intact in the output.
latex_options = dict(
    float_format="%.2f",
    caption="Evaluation Summary by Configuration and User Story",
    label="tab:eval_summary",
    escape=False,
    multirow=True,
    multicolumn=True,
)
latex_table = summary_latex.to_latex(**latex_options)


print(latex_table)

\begin{table}
\caption{Evaluation Summary by Configuration and User Story}
\label{tab:eval_summary}
\begin{tabular}{llllrrrrrrr}
\toprule
 &  &  &  & Cases & \makecell[t]{Avg.\\In. Tok.} & \makecell[t]{Avg.\\Out. Tok.} & \makecell[t]{Avg.\\Cost} & \makecell[t]{Tool\\Names} & \makecell[t]{Helpfulness\\And Accuracy} & \makecell[t]{Tool\\Call Evaluation} \\
\makecell[t]{Agent\\Type} & Model & \makecell[t]{Improved\\Descriptions} & \makecell[t]{User\\Story} &  &  &  &  &  &  &  \\
\midrule
\multirow[t]{3}{*}{Multi} & \multirow[t]{3}{*}{\makecell[t]{gpt-\\4.1-mini}} & \multirow[t]{3}{*}{True} & AddOne & 24 & 21154.42 & 294.96 & 0.01 & 0.62 & 0.99 & 0.96 \\
 &  &  & Analytics & 20 & 62307.50 & 541.60 & 0.03 & 0.10 & 0.93 & 0.82 \\
 &  &  & \makecell[t]{Testing\\and Access} & 19 & 26000.47 & 364.68 & 0.01 & 0.46 & 0.96 & 1.00 \\
\cline{1-11} \cline{2-11} \cline{3-11}
\multirow[t]{9}{*}{Single} & \multirow[t]{6}{*}{\makecell[t]{gpt-\\4.1-mini}} & \multirow[t]{3}{*}{False} & AddOne & 24 & 61426

## 8. Export Options

Save DataFrames for further analysis.

In [142]:
# Optional CSV exports. Every export below is commented out, so running this
# cell writes no files and only prints the reminder at the end.

# Export combined dataframe
# combined_df.to_csv('combined_evaluation_results.csv', index=False)

# Export summary
# summary.to_csv('evaluation_summary.csv')

# Export individual run dataframes (one CSV per run folder)
# for folder_name, df in all_case_dfs.items():
#     df.to_csv(f'{folder_name}.csv', index=False)

print("To export, uncomment the desired lines above.")

To export, uncomment the desired lines above.
