In [1]:
import json
from datetime import datetime
from pathlib import Path

import pandas as pd

# Pandas display settings for the whole notebook, applied in a single call
# (set_option accepts alternating option-name / value arguments):
#   - display.max_columns = None  -> no cap on the number of columns shown
#   - display.width = None        -> no fixed display width
#   - display.max_colwidth = 50   -> cell contents are cut off after 50 characters
pd.set_option(
    "display.max_columns", None,
    "display.width", None,
    "display.max_colwidth", 50,
)

In [2]:
# Import functions from the analysis script
# The notebook relies on three names from the project-local module
# `analyze_user_records`:
#   - USER_RECORDS_DIR: interpolated into the message printed below
#   - analyze_user_records: called without arguments to produce `results`
#   - create_token_dataframe: turns a list of token records into a DataFrame
from analyze_user_records import (
    USER_RECORDS_DIR,
    analyze_user_records,
    create_token_dataframe,
)

# NOTE(review): the value printed here is an absolute path inside a user's home
# directory and is stored in the saved cell output — consider clearing the
# output (or printing a relative path) before sharing the notebook.
print(f"User records directory: {USER_RECORDS_DIR}")

User records directory: /home/jschillb/meta_ally/Data/UserRecords


In [3]:
# Run the analysis once and keep the outcome in `results`; every later cell
# reads from it. `results` is indexed by "single_agent" and "multi_agent", each
# holding a sized collection of sessions.
results = analyze_user_records()

# Report how many sessions of each kind were found, one line per kind.
print(
    f"Single-agent sessions: {len(results['single_agent'])}",
    f"Multi-agent sessions: {len(results['multi_agent'])}",
    sep="\n",
)

Single-agent sessions: 12
Multi-agent sessions: 4


## Session Overview DataFrame

In [4]:
# Create session overview DataFrame
def _summarize_session(session, agent_type, token_keys):
    """Build one overview row (a dict) for a single session.

    Args:
        session: Mapping describing one session. ``session["name"]`` is
            required; ``"model"``, ``"sus_score"``, ``"turn_times"`` and the
            keys listed in ``token_keys`` are optional.
        agent_type: Label stored in the ``agent_type`` column
            (``"single"`` or ``"multi"``).
        token_keys: Names of the session entries that hold lists of token
            records. The lists are concatenated in the given order.

    Returns:
        A dict with the columns of the session overview table.
    """
    token_records = [
        record for key in token_keys for record in session.get(key, [])
    ]
    turn_times = session.get("turn_times", [])
    # `or 0` covers a missing key as well as a key that is present but None,
    # which would otherwise make sum() raise a TypeError.
    total_input = sum(record.get("input_tokens") or 0 for record in token_records)
    total_output = sum(record.get("output_tokens") or 0 for record in token_records)
    return {
        "name": session["name"],
        "agent_type": agent_type,
        "model": session.get("model", "Unknown"),
        "sus_score": session.get("sus_score"),
        "num_turns": len(turn_times),
        "num_llm_calls": len(token_records),
        "total_input_tokens": total_input,
        "total_output_tokens": total_output,
        "total_tokens": total_input + total_output,
        # Sessions without any recorded turn report an average of 0.
        "avg_turn_time_s": sum(turn_times) / len(turn_times) if turn_times else 0,
    }


# Single-agent sessions keep their LLM calls under "token_records"; multi-agent
# sessions split them into orchestrator and specialist records, which are
# counted together here. Single-agent rows come first, then multi-agent rows.
session_data = [
    _summarize_session(session, "single", ("token_records",))
    for session in results["single_agent"]
] + [
    _summarize_session(session, "multi", ("orchestrator_tokens", "specialist_tokens"))
    for session in results["multi_agent"]
]

df_sessions = pd.DataFrame(session_data)
df_sessions

Unnamed: 0,name,agent_type,model,sus_score,num_turns,num_llm_calls,total_input_tokens,total_output_tokens,total_tokens,avg_turn_time_s
0,Philipp Langen v2 - fix,single,gpt-5-mini,,12,33,456542,6016,462558,67.716265
1,list collections,single,Unknown,,2,4,6440,476,6916,11.188429
2,natalie_m,single,gpt-4.1-mini,90.0,15,27,677893,2396,680289,12.725024
3,natalie_m,single,gpt-4.1-mini,87.5,20,40,1180873,3513,1184386,15.388313
4,Philipp Langen (2nd run),single,gpt-5-mini,72.5,7,16,357526,12227,369753,47.284064
5,list_all_models,single,Unknown,,1,2,3840,354,4194,24.196873
6,nw-wout-orchestrator-replicated,single,gpt-4.1,57.5,15,24,559558,3086,562644,16.877596
7,joke bot creation,single,Unknown,,12,32,123641,2849,126490,23.658178
8,delete collections,single,Unknown,85.0,3,7,14514,491,15005,19.78366
9,Philipp Langen,single,gpt-4.1-mini,70.0,11,23,552162,3025,555187,25.807288


## Single-Agent Token Details

In [5]:
# Flatten the token records of every single-agent session into one list. Each
# record is copied (the data in `results` stays untouched) and tagged with the
# name of the session it belongs to.
single_token_records = [
    {**record, "session_name": session["name"]}
    for session in results["single_agent"]
    for record in session.get("token_records", [])
]

df_single_tokens = create_token_dataframe(single_token_records)

# Only narrow/reorder the columns when the frame actually carries the session
# tag; otherwise the frame is left exactly as create_token_dataframe returned it.
if "session_name" in df_single_tokens.columns:
    df_single_tokens = df_single_tokens.loc[
        :,
        [
            "session_name",
            "input_tokens",
            "output_tokens",
            "total_tokens",
            "model_name",
            "timestamp",
        ],
    ]

print(f"Single-agent token records: {len(df_single_tokens)}")
df_single_tokens.head(20)

Single-agent token records: 288


Unnamed: 0,session_name,input_tokens,output_tokens,total_tokens,model_name,timestamp
0,Philipp Langen v2 - fix,0,0,0,,2026-02-06 12:55:55+00:00
1,Philipp Langen v2 - fix,0,0,0,,2026-02-06 13:00:17+00:00
2,Philipp Langen v2 - fix,0,0,0,,2026-02-06 13:00:32+00:00
3,Philipp Langen v2 - fix,0,0,0,,2026-02-06 13:01:18+00:00
4,Philipp Langen v2 - fix,0,0,0,,2026-02-06 13:01:36+00:00
5,Philipp Langen v2 - fix,0,0,0,,2026-02-06 13:02:20+00:00
6,Philipp Langen v2 - fix,0,0,0,,2026-02-06 13:03:16+00:00
7,Philipp Langen v2 - fix,0,0,0,,2026-02-06 13:04:18+00:00
8,Philipp Langen v2 - fix,0,0,0,,2026-02-06 13:04:33+00:00
9,Philipp Langen v2 - fix,0,0,0,,2026-02-06 13:11:36.223654+00:00


In [6]:
# Descriptive statistics of the per-call token counts over all single-agent
# sessions.
# NOTE(review): the token table contains rows with 0 tokens and no model name;
# they are part of these statistics (min is 0) — confirm whether they should be
# filtered out before summarising.
df_single_tokens.loc[:, ["input_tokens", "output_tokens", "total_tokens"]].describe()

Unnamed: 0,input_tokens,output_tokens,total_tokens
count,288.0,288.0,288.0
mean,22682.690972,151.378472,22834.069444
std,13398.80999,277.718742,13446.787864
min,0.0,0.0,0.0
25%,22750.5,31.0,22911.75
50%,25483.5,83.0,25615.5
75%,28580.25,142.5,28770.5
max,66815.0,2324.0,66848.0


## Multi-Agent Token Details

### Orchestrator Tokens

In [7]:
# Flatten the orchestrator token records of every multi-agent session into one
# list. Each record is copied and tagged with the name of its session.
orchestrator_records = [
    {**record, "session_name": session["name"]}
    for session in results["multi_agent"]
    for record in session.get("orchestrator_tokens", [])
]

df_orchestrator = create_token_dataframe(orchestrator_records)

# Only narrow/reorder the columns when the frame actually carries the session
# tag; otherwise the frame is left exactly as create_token_dataframe returned it.
if "session_name" in df_orchestrator.columns:
    df_orchestrator = df_orchestrator.loc[
        :,
        [
            "session_name",
            "input_tokens",
            "output_tokens",
            "total_tokens",
            "model_name",
            "timestamp",
        ],
    ]

print(f"Orchestrator token records: {len(df_orchestrator)}")
df_orchestrator.head(20)

Orchestrator token records: 147


Unnamed: 0,session_name,input_tokens,output_tokens,total_tokens,model_name,timestamp
0,nw-with-orchestrator,1770,347,2117,gpt-4.1-2025-04-14,2026-02-04 10:19:07+00:00
1,nw-with-orchestrator,2132,188,2320,gpt-4.1-2025-04-14,2026-02-04 10:19:20+00:00
2,nw-with-orchestrator,2338,25,2363,gpt-4.1-2025-04-14,2026-02-04 10:19:27+00:00
3,nw-with-orchestrator,2382,219,2601,gpt-4.1-2025-04-14,2026-02-04 10:19:28+00:00
4,nw-with-orchestrator,2622,103,2725,gpt-4.1-2025-04-14,2026-02-04 10:20:25+00:00
5,nw-with-orchestrator,2860,132,2992,gpt-4.1-2025-04-14,2026-02-04 10:21:31+00:00
6,nw-with-orchestrator,3041,336,3377,gpt-4.1-2025-04-14,2026-02-04 10:22:16+00:00
7,nw-with-orchestrator,3390,299,3689,gpt-4.1-2025-04-14,2026-02-04 10:24:28+00:00
8,nw-with-orchestrator,3706,161,3867,gpt-4.1-2025-04-14,2026-02-04 10:26:22+00:00
9,nw-with-orchestrator,4068,214,4282,gpt-4.1-2025-04-14,2026-02-04 10:26:40+00:00


In [8]:
# Descriptive statistics of the per-call token counts of the orchestrator
# across all multi-agent sessions.
df_orchestrator.loc[:, ["input_tokens", "output_tokens", "total_tokens"]].describe()

Unnamed: 0,input_tokens,output_tokens,total_tokens
count,147.0,147.0,147.0
mean,5050.666667,117.0,5167.666667
std,2109.655741,64.531398,2109.458732
min,1770.0,16.0,1933.0
25%,3322.0,75.0,3446.0
50%,4855.0,103.0,4988.0
75%,6534.0,152.0,6591.5
max,9626.0,368.0,9708.0


### Specialist Tokens

In [9]:
# Flatten the specialist token records of every multi-agent session into one
# list. Each record is copied and tagged with the name of its session.
specialist_records = [
    {**record, "session_name": session["name"]}
    for session in results["multi_agent"]
    for record in session.get("specialist_tokens", [])
]

df_specialist = create_token_dataframe(specialist_records)

# Only narrow/reorder the columns when the frame actually carries the session
# tag; otherwise the frame is left exactly as create_token_dataframe returned it.
if "session_name" in df_specialist.columns:
    df_specialist = df_specialist.loc[
        :,
        [
            "session_name",
            "input_tokens",
            "output_tokens",
            "total_tokens",
            "model_name",
            "timestamp",
        ],
    ]

print(f"Specialist token records: {len(df_specialist)}")
df_specialist.head(20)

Specialist token records: 140


Unnamed: 0,session_name,input_tokens,output_tokens,total_tokens,model_name,timestamp
0,nw-with-orchestrator,7871,25,7896,gpt-4.1-2025-04-14,2026-02-04 10:20:28+00:00
1,nw-with-orchestrator,7915,15,7930,gpt-4.1-2025-04-14,2026-02-04 10:20:29+00:00
2,nw-with-orchestrator,7973,14,7987,gpt-4.1-2025-04-14,2026-02-04 10:20:37+00:00
3,nw-with-orchestrator,8914,23,8937,gpt-4.1-2025-04-14,2026-02-04 10:20:39+00:00
4,nw-with-orchestrator,8960,148,9108,gpt-4.1-2025-04-14,2026-02-04 10:20:41+00:00
5,nw-with-orchestrator,9130,12,9142,gpt-4.1-2025-04-14,2026-02-04 10:20:53+00:00
6,nw-with-orchestrator,9171,127,9298,gpt-4.1-2025-04-14,2026-02-04 10:21:28+00:00
7,nw-with-orchestrator,9414,60,9474,gpt-4.1-2025-04-14,2026-02-04 10:26:25+00:00
8,nw-with-orchestrator,9510,30,9540,gpt-4.1-2025-04-14,2026-02-04 10:26:28+00:00
9,nw-with-orchestrator,9708,199,9907,gpt-4.1-2025-04-14,2026-02-04 10:26:29+00:00


In [10]:
# Descriptive statistics of the per-call token counts of the specialist agents
# across all multi-agent sessions.
df_specialist.loc[:, ["input_tokens", "output_tokens", "total_tokens"]].describe()

Unnamed: 0,input_tokens,output_tokens,total_tokens
count,140.0,140.0,140.0
mean,15470.907143,88.2,15559.107143
std,6347.023271,66.770109,6346.94908
min,7811.0,12.0,7869.0
25%,10519.75,31.0,10616.0
50%,14412.5,71.5,14559.5
75%,17500.5,129.25,17601.75
max,31271.0,331.0,31407.0


## Turn Response Times

In [15]:
# Build one row per conversation turn: which session it belongs to, the agent
# setup, the 1-based position of the turn within its session, and the recorded
# response time. Single-agent sessions are listed first, then multi-agent ones.
# NOTE(review): session names are not unique (the overview lists "natalie_m"
# twice), so `session_name` alone does not identify a session in this table.
turn_data = [
    {
        "session_name": session["name"],
        "agent_type": agent_type,
        "turn_number": turn_number,
        "response_time_s": turn_time,
    }
    for agent_type, results_key in (("single", "single_agent"), ("multi", "multi_agent"))
    for session in results[results_key]
    for turn_number, turn_time in enumerate(session.get("turn_times", []), start=1)
]

df_turns = pd.DataFrame(turn_data)
print(f"Total turns: {len(df_turns)}")
df_turns.head(20)

Total turns: 221


Unnamed: 0,session_name,agent_type,turn_number,response_time_s
0,Philipp Langen v2 - fix,single,1,4.414581
1,Philipp Langen v2 - fix,single,2,257.033971
2,Philipp Langen v2 - fix,single,3,1.7e-05
3,Philipp Langen v2 - fix,single,4,48.705969
4,Philipp Langen v2 - fix,single,5,1.464313
5,Philipp Langen v2 - fix,single,6,19.369517
6,Philipp Langen v2 - fix,single,7,8.1e-05
7,Philipp Langen v2 - fix,single,8,126.538546
8,Philipp Langen v2 - fix,single,9,3.2e-05
9,Philipp Langen v2 - fix,single,10,2.2e-05


In [12]:
# Descriptive statistics of the turn response times, one row per agent type.
# NOTE(review): some recorded turn times are in the range of microseconds
# (e.g. 1.7e-05 s); they are included here — confirm they are real turns.
response_times_by_agent = df_turns.groupby("agent_type")["response_time_s"]
response_times_by_agent.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
agent_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
multi,82.0,65.131052,72.059728,0.02462,15.520122,38.321744,92.724896,398.049725
single,139.0,24.634023,42.934795,1.7e-05,1.71953,7.20151,33.705101,270.012398


## Comparison Summary

In [13]:
# Create comparison summary
def _total_tokens(token_df):
    """Return the sum of ``token_df["total_tokens"]``, or 0 for a frame without rows."""
    # Length is checked before the column is touched, so a frame without rows
    # never has to provide the column.
    return token_df["total_tokens"].sum() if len(token_df) > 0 else 0


def _turn_times(agent_type):
    """Return the ``response_time_s`` values of ``df_turns`` for one agent type."""
    if len(df_turns) == 0:
        # A frame built from an empty list has no "agent_type" column to filter on.
        return pd.Series(dtype="float64")
    return df_turns.loc[df_turns["agent_type"] == agent_type, "response_time_s"]


# Filter the turn table once per agent type instead of once per metric.
single_turn_times = _turn_times("single")
multi_turn_times = _turn_times("multi")

# Multi-agent LLM calls are the orchestrator and specialist calls combined.
# Each frame is guarded on its own: previously the total was only computed when
# the orchestrator frame had rows, so specialist-only data was reported as 0.
multi_llm_calls = len(df_orchestrator) + len(df_specialist)
multi_total_tokens = _total_tokens(df_orchestrator) + _total_tokens(df_specialist)

comparison_data = {
    "Metric": [
        "Number of sessions",
        "Number of turns",
        "Number of LLM calls",
        "Total tokens",
        "Avg tokens per call",
        "Avg turn response time (s)",
        "Median turn response time (s)",
    ],
    "Single-Agent": [
        len(results["single_agent"]),
        len(single_turn_times),
        len(df_single_tokens),
        _total_tokens(df_single_tokens),
        df_single_tokens["total_tokens"].mean() if len(df_single_tokens) > 0 else 0,
        single_turn_times.mean() if len(single_turn_times) > 0 else 0,
        single_turn_times.median() if len(single_turn_times) > 0 else 0,
    ],
    "Multi-Agent": [
        len(results["multi_agent"]),
        len(multi_turn_times),
        multi_llm_calls,
        multi_total_tokens,
        multi_total_tokens / multi_llm_calls if multi_llm_calls > 0 else 0,
        multi_turn_times.mean() if len(multi_turn_times) > 0 else 0,
        multi_turn_times.median() if len(multi_turn_times) > 0 else 0,
    ],
}

# Every metric falls back to 0 when there is no data for it.
df_comparison = pd.DataFrame(comparison_data)
df_comparison

Unnamed: 0,Metric,Single-Agent,Multi-Agent
0,Number of sessions,12.0,4.0
1,Number of turns,139.0,82.0
2,Number of LLM calls,288.0,287.0
3,Total tokens,6576212.0,2937922.0
4,Avg tokens per call,22834.07,10236.66
5,Avg turn response time (s),24.63402,65.13105
6,Median turn response time (s),7.20151,38.32174
