In [99]:
import json
from datetime import datetime
from pathlib import Path

import pandas as pd

# Configure pandas display
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)
pd.set_option("display.max_colwidth", 50)

In [100]:
# Import functions from the analysis script
from analyze_user_records import (
    USER_RECORDS_DIR,
    analyze_user_records,
    create_token_dataframe,
)

print(f"User records directory: {USER_RECORDS_DIR}")

User records directory: /home/jschillb/meta_ally/Data/UserRecords


In [101]:
# Hardcoded mapping of filenames to users and sessions
# Based on Meta-Ally User Tests CSV and filenames in Data/UserRecords
filename_to_user_session = {
    # Niklas Weingarz (User 4)
    "nw-with-orchestrator_20260204_115154.json": ("User 4", "Niklas Weingarz", 1),
    "nw-wout-orchestrator-replicated_20260204_132143.json": ("User 4", "Niklas Weingarz", 2),
    
    # Maximilian Röttgen (User 2)
    "Maximilian_Roettgen_20260205_114244.json": ("User 2", "Maximilian Röttgen", 1),
    "Maximilian_Roettgen_20260205_120550.json": ("User 2", "Maximilian Röttgen", 2),
    
    # Klaus Reiter (User 3) - MISSING
    
    # Maximilian Bisani (User 1)
    "Meta_Ally_-_Maximilian_Bisani_20260127_150529.json": ("User 1", "Maximilian Bisani", 1),
    "Meta_Ally_-_Maximilian_Bisani_20260129_150101.json": ("User 1", "Maximilian Bisani", 2),
    
    # Natalie Moehlmann (User 5)
    "natalie_m_20260205_101950.json": ("User 5", "Natalie Moehlmann", 1),
    "natalie_m_20260205_104453.json": ("User 5", "Natalie Moehlmann", 2),
    
    # Phillip Langen (User 6)
    "Philipp_Langen_20260206_133810.json": ("User 6", "Phillip Langen", 1),
    "Philipp_Langen_v2_-_fix_20260206_163932.json": ("User 6", "Phillip Langen", 2),
    
    # Florian Winkler (User 7)
    "Florian_W_20260211_110301.json": ("User 7", "Florian Winkler", 1),
    "Florian_W_2_20260211_112632.json": ("User 7", "Florian Winkler", 2),
    
    # Hendrik Freitag (User 8)
    "Hendrik_Freitag_20260210_140656.json": ("User 8", "Hendrik Freitag", 1),
    "Hendrik_Freitag_20260210_151920.json": ("User 8", "Hendrik Freitag", 2),
    
    # Luis Grass (User 9)
    "Luis_20260213_131828.json": ("User 9", "Luis Grass", 1),
    "Luis_20260213_141133.json": ("User 9", "Luis Grass", 2),
    
    # Marco Bender (User 10)
    "Marco_Bender_20260216_100617.json": ("User 10", "Marco Bender", 1),
    "Marco_Bender_20260216_102157.json": ("User 10", "Marco Bender", 2),
}

print(f"Mapped {len(filename_to_user_session)} files to users and sessions")

Mapped 18 files to users and sessions


In [102]:
# Run the analysis
results = analyze_user_records()

print(f"Single-agent sessions: {len(results['single_agent'])}")
print(f"Multi-agent sessions: {len(results['multi_agent'])}")

Single-agent sessions: 13
Multi-agent sessions: 5


In [103]:
# Check which filenames are in results and if they're mapped
all_filenames = []

for session in results["single_agent"]:
    filename = session.get("filename", "")
    if filename:
        all_filenames.append(filename)

for session in results["multi_agent"]:
    filename = session.get("filename", "")
    if filename:
        all_filenames.append(filename)

print(f"Total files found in results: {len(all_filenames)}\n")

# Check mapping status
mapped_files = []
unmapped_files = []

for filename in all_filenames:
    if filename in filename_to_user_session:
        user_info = filename_to_user_session[filename]
        mapped_files.append((filename, user_info))
    else:
        unmapped_files.append(filename)

print(f"Mapped files: {len(mapped_files)}")
for filename, (user_alias, user_name, session_num) in mapped_files:
    print(f"  ✓ {filename} -> {user_alias} ({user_name}), Session {session_num}")

if unmapped_files:
    print(f"\nUnmapped files: {len(unmapped_files)}")
    for filename in unmapped_files:
        print(f"  ✗ {filename}")
else:
    print(f"\n✓ All files are mapped!")

Total files found in results: 18

Mapped files: 18
  ✓ Philipp_Langen_v2_-_fix_20260206_163932.json -> User 6 (Phillip Langen), Session 2
  ✓ natalie_m_20260205_104453.json -> User 5 (Natalie Moehlmann), Session 2
  ✓ natalie_m_20260205_101950.json -> User 5 (Natalie Moehlmann), Session 1
  ✓ Hendrik_Freitag_20260210_151920.json -> User 8 (Hendrik Freitag), Session 2
  ✓ nw-wout-orchestrator-replicated_20260204_132143.json -> User 4 (Niklas Weingarz), Session 2
  ✓ Florian_W_20260211_110301.json -> User 7 (Florian Winkler), Session 1
  ✓ Philipp_Langen_20260206_133810.json -> User 6 (Phillip Langen), Session 1
  ✓ Luis_20260213_141133.json -> User 9 (Luis Grass), Session 2
  ✓ Florian_W_2_20260211_112632.json -> User 7 (Florian Winkler), Session 2
  ✓ Marco_Bender_20260216_100617.json -> User 10 (Marco Bender), Session 1
  ✓ Marco_Bender_20260216_102157.json -> User 10 (Marco Bender), Session 2
  ✓ Luis_20260213_131828.json -> User 9 (Luis Grass), Session 1
  ✓ Meta_Ally_-_Maximilian_B

## Session Overview DataFrame

In [104]:
# Create session overview DataFrame
session_data = []

for session in results["single_agent"]:
    tokens = session.get("token_records", [])
    turns = session.get("turn_times", [])
    total_input = sum(t.get("input_tokens", 0) for t in tokens)
    total_output = sum(t.get("output_tokens", 0) for t in tokens)
    
    # Get user and session info from mapping
    filename = session.get("filename", "")
    user_info = filename_to_user_session.get(filename, ("Unknown", "Unknown", None))
    user_alias, user_name, session_num = user_info
    
    session_data.append({
        "user_alias": user_alias,
        "session": session_num,
        "agent_type": "single",
        "model": session.get("model", "Unknown"),
        "use_improved_descriptions": session.get("use_improved_descriptions", False),
        "sus_score": session.get("sus_score"),
        "num_turns": len(turns),
        "num_llm_calls": len(tokens),
        "total_input_tokens": total_input,
        "total_output_tokens": total_output,
        "avg_turn_time_s": sum(turns) / len(turns) if turns else 0,
    })

for session in results["multi_agent"]:
    orch_tokens = session.get("orchestrator_tokens", [])
    spec_tokens = session.get("specialist_tokens", [])
    turns = session.get("turn_times", [])
    all_tokens = orch_tokens + spec_tokens
    total_input = sum(t.get("input_tokens", 0) for t in all_tokens)
    total_output = sum(t.get("output_tokens", 0) for t in all_tokens)
    
    # Get user and session info from mapping
    filename = session.get("filename", "")
    user_info = filename_to_user_session.get(filename, ("Unknown", "Unknown", None))
    user_alias, user_name, session_num = user_info
    
    session_data.append({
        "user_alias": user_alias,
        "session": session_num,
        "agent_type": "multi",
        "model": session.get("model", "Unknown"),
        "use_improved_descriptions": session.get("use_improved_descriptions", False),
        "sus_score": session.get("sus_score"),
        "num_turns": len(turns),
        "num_llm_calls": len(all_tokens),
        "total_input_tokens": total_input,
        "total_output_tokens": total_output,
        "avg_turn_time_s": sum(turns) / len(turns) if turns else 0,
    })

df_sessions = pd.DataFrame(session_data)

# Extract numeric user number for proper sorting
df_sessions['user_num'] = df_sessions['user_alias'].str.extract(r'(\d+)').astype(float)

# Sort by numeric user number and session
df_sessions = df_sessions.sort_values(['user_num', 'session']).reset_index(drop=True)

# Drop the temporary sorting column
df_sessions = df_sessions.drop(columns=['user_num'])

# Rename columns for report with shortened names
df_sessions = df_sessions.rename(columns={
    "user_alias": "User",
    "session": "Sess.",
    "agent_type": "Type",
    "model": "Model",
    "use_improved_descriptions": "Imp. Desc.",
    "sus_score": "SUS",
    "num_turns": "Turns",
    "num_llm_calls": "LLM \\\\ Calls",
    "total_input_tokens": "In. Tok.",
    "total_output_tokens": "Out. Tok.",
    "avg_turn_time_s": "Avg Time (s)"
})

df_sessions

Unnamed: 0,User,Sess.,Type,Model,Imp. Desc.,SUS,Turns,LLM \\ Calls,In. Tok.,Out. Tok.,Avg Time (s)
0,User 1,1,multi,gpt-4.1-mini,True,72.5,26,94,1152527,9498,68.174934
1,User 1,2,single,gpt-4.1-mini,True,77.5,29,56,1798672,5868,23.694086
2,User 2,1,multi,gpt-4.1-mini,False,37.5,29,97,967366,8286,48.438926
3,User 2,2,multi,gpt-4.1-mini,True,70.0,14,50,439828,5109,54.095567
4,User 4,1,multi,gpt-4.1,True,67.5,13,46,348654,6654,108.163939
5,User 4,2,single,gpt-4.1,True,57.5,15,24,559558,3086,16.877596
6,User 5,1,single,gpt-4.1-mini,False,87.5,20,40,1180873,3513,15.388313
7,User 5,2,single,gpt-4.1-mini,True,90.0,15,27,677893,2396,12.725024
8,User 6,1,single,gpt-4.1-mini,True,70.0,11,23,552162,3025,25.807288
9,User 6,2,single,gpt-5-mini,True,72.5,12,33,456542,6016,67.716265


In [105]:
# Merge session data with cost data for grouped analysis
df_sessions_with_cost = df_sessions.merge(df_cost[['User', 'Sess.', 'Total Cost ($)']], on=['User', 'Sess.'], how='left')

## Grouped Analysis

In [106]:
# Group by agent type and model and calculate averages
df_by_type_model = df_sessions_with_cost.groupby(['Type', 'Model']).agg({
    'User': 'count',
    'Turns': 'mean',
    'LLM \\\\ Calls': 'mean',
    'In. Tok.': 'mean',
    'Out. Tok.': 'mean',
    'Avg Time (s)': 'mean',
    'Total Cost ($)': 'mean'
}).round(2)

df_by_type_model = df_by_type_model.rename(columns={
    'User': 'Count',
    'Turns': 'Avg Turns',
    'LLM \\\\ Calls': 'Avg LLM Calls',
    'In. Tok.': 'Avg In. Tok.',
    'Out. Tok.': 'Avg Out. Tok.',
    'Avg Time (s)': 'Avg Time (s)',
    'Total Cost ($)': 'Avg Cost ($)'
})

df_by_type_model

Unnamed: 0_level_0,Unnamed: 1_level_0,Count,Avg Turns,Avg LLM Calls,Avg In. Tok.,Avg Out. Tok.,Avg Time (s),Avg Cost ($)
Type,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
multi,gpt-4.1,1,13.0,46.0,348654.0,6654.0,108.16,0.83
multi,gpt-4.1-mini,3,23.0,80.33,853240.33,7631.0,56.9,0.39
multi,gpt-5-mini,1,6.0,109.0,2531745.0,61045.0,964.76,0.84
single,gpt-4.1,2,14.5,23.0,674816.5,4521.0,20.25,1.52
single,gpt-4.1-mini,5,17.4,33.8,961240.0,3560.4,19.5,0.43
single,gpt-5-mini,6,13.5,30.33,892359.33,16154.67,60.29,0.29


In [107]:
# Generate LaTeX table for grouped analysis by type and model
latex_lines = []
latex_lines.append("\\begin{table}[h]")
latex_lines.append("\\centering")
latex_lines.append("\\caption{Average Metrics by Agent Type and Model}")
latex_lines.append("\\label{tab:grouped_type_model}")
latex_lines.append("\\begin{tabular}{llrrrrrrr}")
latex_lines.append("\\toprule")
latex_lines.append("Type & Model & Count & Avg Turns & Avg LLM Calls & Avg In. Tok. & Avg Out. Tok. & Avg Time (s) & Avg Cost (\\$) \\\\")
latex_lines.append("\\midrule")

# Iterate through the multi-index dataframe
for (agent_type, model), row in df_by_type_model.iterrows():
    row_parts = [
        agent_type,
        model,
        f"{int(row['Count'])}",
        f"{row['Avg Turns']:.2f}",
        f"{row['Avg LLM Calls']:.2f}",
        f"{row['Avg In. Tok.']:.0f}",
        f"{row['Avg Out. Tok.']:.0f}",
        f"{row['Avg Time (s)']:.2f}",
        f"{row['Avg Cost ($)']:.4f}"
    ]
    latex_lines.append(" & ".join(row_parts) + " \\\\")

latex_lines.append("\\bottomrule")
latex_lines.append("\\end{tabular}")
latex_lines.append("\\end{table}")

latex_type_model_table = "\n".join(latex_lines)
print(latex_type_model_table)

\begin{table}[h]
\centering
\caption{Average Metrics by Agent Type and Model}
\label{tab:grouped_type_model}
\begin{tabular}{llrrrrrrr}
\toprule
Type & Model & Count & Avg Turns & Avg LLM Calls & Avg In. Tok. & Avg Out. Tok. & Avg Time (s) & Avg Cost (\$) \\
\midrule
multi & gpt-4.1 & 1 & 13.00 & 46.00 & 348654 & 6654 & 108.16 & 0.8300 \\
multi & gpt-4.1-mini & 3 & 23.00 & 80.33 & 853240 & 7631 & 56.90 & 0.3900 \\
multi & gpt-5-mini & 1 & 6.00 & 109.00 & 2531745 & 61045 & 964.76 & 0.8400 \\
single & gpt-4.1 & 2 & 14.50 & 23.00 & 674816 & 4521 & 20.25 & 1.5200 \\
single & gpt-4.1-mini & 5 & 17.40 & 33.80 & 961240 & 3560 & 19.50 & 0.4300 \\
single & gpt-5-mini & 6 & 13.50 & 30.33 & 892359 & 16155 & 60.29 & 0.2900 \\
\bottomrule
\end{tabular}
\end{table}


In [108]:
# Group by agent type and calculate averages
df_by_type = df_sessions_with_cost.groupby('Type').agg({
    'User': 'count',
    'Turns': 'mean',
    'LLM \\\\ Calls': 'mean',
    'In. Tok.': 'mean',
    'Out. Tok.': 'mean',
    'Avg Time (s)': 'mean',
    'Total Cost ($)': 'mean'
}).round(2)

df_by_type = df_by_type.rename(columns={
    'User': 'Count',
    'Turns': 'Avg Turns',
    'LLM \\\\ Calls': 'Avg LLM Calls',
    'In. Tok.': 'Avg In. Tok.',
    'Out. Tok.': 'Avg Out. Tok.',
    'Avg Time (s)': 'Avg Time (s)',
    'Total Cost ($)': 'Avg Cost ($)'
})

df_by_type

Unnamed: 0_level_0,Count,Avg Turns,Avg LLM Calls,Avg In. Tok.,Avg Out. Tok.,Avg Time (s),Avg Cost ($)
Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
multi,5,17.6,79.2,1088024.0,18118.4,248.73,0.57
single,13,15.15,30.54,885383.77,9520.92,38.44,0.53


In [109]:
# Calculate overall averages across all sessions
total_count = len(df_sessions_with_cost)
total_avg = pd.DataFrame({
    'Count': [total_count],
    'Avg Turns': [df_sessions_with_cost['Turns'].mean()],
    'Avg LLM Calls': [df_sessions_with_cost['LLM \\\\ Calls'].mean()],
    'Avg In. Tok.': [df_sessions_with_cost['In. Tok.'].mean()],
    'Avg Out. Tok.': [df_sessions_with_cost['Out. Tok.'].mean()],
    'Avg Time (s)': [df_sessions_with_cost['Avg Time (s)'].mean()],
    'Avg Cost ($)': [df_sessions_with_cost['Total Cost ($)'].mean()]
}, index=['Overall']).round(2)

total_avg

Unnamed: 0,Count,Avg Turns,Avg LLM Calls,Avg In. Tok.,Avg Out. Tok.,Avg Time (s),Avg Cost ($)
Overall,18,15.83,44.06,941672.72,11909.11,96.86,0.54


In [110]:
# Define pricing information (USD per million tokens)
pricing = {
    "gpt-5-mini": {
        "fresh_input": 0.28,
        "cached_input": 0.03,
        "output": 2.20
    },
    "gpt-4.1-mini": {
        "fresh_input": 0.44,
        "cached_input": 0.11,
        "output": 1.76
    },
    "gpt-4.1": {
        "fresh_input": 2.20,
        "cached_input": 0.55,
        "output": 8.80
    }
}

# Create cost analysis dataframe
cost_data = []

for _, row in df_sessions.iterrows():
    user = row["User"]
    session = row["Sess."]
    model = row["Model"]
    input_tokens = row["In. Tok."]
    output_tokens = row["Out. Tok."]
    
    # Get pricing for this model (default to gpt-4.1-mini if unknown)
    model_pricing = pricing.get(model, pricing["gpt-4.1-mini"])
    
    # Calculate costs (assuming all input tokens are fresh for simplicity)
    # Divide by 1,000,000 to convert per-million-token pricing
    input_cost = (input_tokens / 1_000_000) * model_pricing["fresh_input"]
    output_cost = (output_tokens / 1_000_000) * model_pricing["output"]
    total_cost = input_cost + output_cost
    
    cost_data.append({
        "User": user,
        "Sess.": session,
        "Model": model,
        "Input Cost ($)": input_cost,
        "Output Cost ($)": output_cost,
        "Total Cost ($)": total_cost
    })

df_cost = pd.DataFrame(cost_data)

# Format costs to 4 decimal places for display
df_cost_display = df_cost.copy()
df_cost_display["Input Cost ($)"] = df_cost_display["Input Cost ($)"].apply(lambda x: f"${x:.4f}")
df_cost_display["Output Cost ($)"] = df_cost_display["Output Cost ($)"].apply(lambda x: f"${x:.4f}")
df_cost_display["Total Cost ($)"] = df_cost_display["Total Cost ($)"].apply(lambda x: f"${x:.4f}")

df_cost_display

Unnamed: 0,User,Sess.,Model,Input Cost ($),Output Cost ($),Total Cost ($)
0,User 1,1,gpt-4.1-mini,$0.5071,$0.0167,$0.5238
1,User 1,2,gpt-4.1-mini,$0.7914,$0.0103,$0.8017
2,User 2,1,gpt-4.1-mini,$0.4256,$0.0146,$0.4402
3,User 2,2,gpt-4.1-mini,$0.1935,$0.0090,$0.2025
4,User 4,1,gpt-4.1,$0.7670,$0.0586,$0.8256
5,User 4,2,gpt-4.1,$1.2310,$0.0272,$1.2582
6,User 5,1,gpt-4.1-mini,$0.5196,$0.0062,$0.5258
7,User 5,2,gpt-4.1-mini,$0.2983,$0.0042,$0.3025
8,User 6,1,gpt-4.1-mini,$0.2430,$0.0053,$0.2483
9,User 6,2,gpt-5-mini,$0.1278,$0.0132,$0.1411


In [111]:
# Generate LaTeX table for session costs
latex_lines = []
latex_lines.append("\\begin{table}[h]")
latex_lines.append("\\centering")
latex_lines.append("\\caption{Session Cost Analysis}")
latex_lines.append("\\label{tab:session_costs}")
latex_lines.append("\\begin{tabular}{llr}")
latex_lines.append("\\toprule")
latex_lines.append("User & Sess. & Total Cost (\\$) \\\\")
latex_lines.append("\\midrule")

# Group by user and generate rows with multirow for users
grouped = df_cost.groupby('User', sort=False)
for user_idx, (user, group) in enumerate(grouped):
    rows = group.values.tolist()
    num_rows = len(rows)
    
    for i, row in enumerate(rows):
        row_parts = []
        if i == 0:
            # First row for this user - add multirow for user column
            row_parts.append(f"\\multirow{{{num_rows}}}{{*}}{{{row[0]}}}")
        else:
            # Subsequent rows - empty user cell
            row_parts.append("")
        
        # Add session and total cost
        session = int(row[1])
        total_cost = row[5]  # Total Cost ($) is at index 5
        row_parts.append(str(session))
        row_parts.append(f"{total_cost:.4f}")
        
        latex_lines.append(" & ".join(row_parts) + " \\\\")
    
    # Add midrule after each user except the last
    if user_idx < len(grouped) - 1:
        latex_lines.append("\\midrule")

latex_lines.append("\\bottomrule")
latex_lines.append("\\end{tabular}")
latex_lines.append("\\end{table}")

latex_cost_table = "\n".join(latex_lines)
print(latex_cost_table)

\begin{table}[h]
\centering
\caption{Session Cost Analysis}
\label{tab:session_costs}
\begin{tabular}{llr}
\toprule
User & Sess. & Total Cost (\$) \\
\midrule
\multirow{2}{*}{User 1} & 1 & 0.5238 \\
 & 2 & 0.8017 \\
\midrule
\multirow{2}{*}{User 2} & 1 & 0.4402 \\
 & 2 & 0.2025 \\
\midrule
\multirow{2}{*}{User 4} & 1 & 0.8256 \\
 & 2 & 1.2582 \\
\midrule
\multirow{2}{*}{User 5} & 1 & 0.5258 \\
 & 2 & 0.3025 \\
\midrule
\multirow{2}{*}{User 6} & 1 & 0.2483 \\
 & 2 & 0.1411 \\
\midrule
\multirow{2}{*}{User 7} & 1 & 0.3154 \\
 & 2 & 0.2041 \\
\midrule
\multirow{2}{*}{User 8} & 1 & 0.8432 \\
 & 2 & 0.3154 \\
\midrule
\multirow{2}{*}{User 9} & 1 & 1.7906 \\
 & 2 & 0.4988 \\
\midrule
\multirow{2}{*}{User 10} & 1 & 0.2376 \\
 & 2 & 0.2678 \\
\bottomrule
\end{tabular}
\end{table}


## Session Cost Analysis

In [112]:
# Generate LaTeX table with merged User cells and bold larger values
import io

# Create LaTeX table manually with multirow for users
latex_lines = []
latex_lines.append("\\begin{table}[h]")
latex_lines.append("\\centering")
latex_lines.append("\\caption{Session Analysis Overview}")
latex_lines.append("\\label{tab:sessions}")
latex_lines.append("\\begin{tabular}{lllllrrrrrr}")
latex_lines.append("\\toprule")

# Header with makecell for multi-line headers
latex_lines.append("User & Sess. & Type & Model & \\makecell{Imp. \\\\Desc.} & SUS & Turns &\\makecell{LLM \\\\Calls} & \\makecell{In.\\\\ Tok.} &\\makecell{ Out.\\\\ Tok.} &\\makecell{ Avg Time\\\\ (s) }\\\\")
latex_lines.append("\\midrule")

# Columns to compare and bold (numeric metrics)
compare_cols = [5, 6, 7, 8, 9, 10]  # SUS, Turns, LLM Calls, In. Tok., Out. Tok., Avg Time

# Group by user and generate rows
grouped = df_sessions.groupby('User', sort=False)
for user_idx, (user, group) in enumerate(grouped):
    rows = group.values.tolist()
    num_rows = len(rows)
    
    # If there are 2 rows, compare values and determine which should be bold
    bold_mask = [[False] * len(rows[0]) for _ in range(num_rows)]
    if num_rows == 2:
        for col_idx in compare_cols:
            val1 = rows[0][col_idx]
            val2 = rows[1][col_idx]
            if val1 is not None and val2 is not None:
                if val1 > val2:
                    bold_mask[0][col_idx] = True
                elif val2 > val1:
                    bold_mask[1][col_idx] = True
    
    for i, row in enumerate(rows):
        row_parts = []
        if i == 0:
            # First row for this user - add multirow for user column
            row_parts.append(f"\\multirow{{{num_rows}}}{{*}}{{{row[0]}}}")
        else:
            # Subsequent rows - empty user cell
            row_parts.append("")
        
        # Add remaining columns
        for j, val in enumerate(row[1:], start=1):
            if isinstance(val, bool):
                val_str = "True" if val else "False"
            elif isinstance(val, float):
                val_str = f"{val:.2f}"
            else:
                val_str = str(val)
            
            # Apply bold if this cell should be bolded
            if bold_mask[i][j]:
                val_str = f"\\textbf{{{val_str}}}"
            
            row_parts.append(val_str)
        
        latex_lines.append(" & ".join(row_parts) + " \\\\")
    
    # Add midrule after each user except the last
    if user_idx < len(grouped) - 1:
        latex_lines.append(" \\midrule")

latex_lines.append("\\bottomrule")
latex_lines.append("\\end{tabular}")
latex_lines.append("\\end{table}")

latex_table = "\n".join(latex_lines)
print(latex_table)

\begin{table}[h]
\centering
\caption{Session Analysis Overview}
\label{tab:sessions}
\begin{tabular}{lllllrrrrrr}
\toprule
User & Sess. & Type & Model & \makecell{Imp. \\Desc.} & SUS & Turns &\makecell{LLM \\Calls} & \makecell{In.\\ Tok.} &\makecell{ Out.\\ Tok.} &\makecell{ Avg Time\\ (s) }\\
\midrule
\multirow{2}{*}{User 1} & 1 & multi & gpt-4.1-mini & True & 72.50 & 26 & \textbf{94} & 1152527 & \textbf{9498} & \textbf{68.17} \\
 & 2 & single & gpt-4.1-mini & True & \textbf{77.50} & \textbf{29} & 56 & \textbf{1798672} & 5868 & 23.69 \\
 \midrule
\multirow{2}{*}{User 2} & 1 & multi & gpt-4.1-mini & False & 37.50 & \textbf{29} & \textbf{97} & \textbf{967366} & \textbf{8286} & 48.44 \\
 & 2 & multi & gpt-4.1-mini & True & \textbf{70.00} & 14 & 50 & 439828 & 5109 & \textbf{54.10} \\
 \midrule
\multirow{2}{*}{User 4} & 1 & multi & gpt-4.1 & True & \textbf{67.50} & 13 & \textbf{46} & 348654 & \textbf{6654} & \textbf{108.16} \\
 & 2 & single & gpt-4.1 & True & 57.50 & \textbf{15} & 24 & \te

## User Feedback and Notes Analysis

In [113]:
# Extract feedback and notes from all user records by reading JSON files directly
import json
from pathlib import Path

user_feedback_data = {}

# Read each JSON file directly
for filename, (user_alias, user_name, session_num) in filename_to_user_session.items():
    file_path = USER_RECORDS_DIR / filename
    
    if not file_path.exists():
        print(f"Warning: File not found: {filename}")
        continue
    
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except Exception as e:
        print(f"Error reading {filename}: {e}")
        continue
    
    # Get metadata
    metadata = data.get("metadata", {})
    notes = metadata.get("notes", {})
    feedback_str = metadata.get("feedback", "")
    config = metadata.get("config", {})
    agent_type = "multi" if config.get("use_multi_agent", False) else "single"
    
    # Initialize user data if not exists
    if user_alias not in user_feedback_data:
        user_feedback_data[user_alias] = {
            "name": user_name,
            "sessions": {}
        }
    
    # Parse feedback string
    achievement = ""
    portal_comparison = ""
    config_preference = ""
    
    if feedback_str:
        parts = feedback_str.split("|")
        for part in parts:
            part = part.strip()
            if part.lower().startswith("achievement:"):
                achievement = part.split(":", 1)[1].strip()
            elif part.lower().startswith("portal comparison:"):
                portal_comparison = part.split(":", 1)[1].strip()
            elif part.lower().startswith("configuration preference:"):
                config_preference = part.split(":", 1)[1].strip()
    
    # Store session data
    user_feedback_data[user_alias]["sessions"][session_num] = {
        "agent_type": agent_type,
        "achievement": achievement,
        "portal_comparison": portal_comparison,
        "config_preference": config_preference,
        "intention": notes.get("intention", ""),
        "achievement_note": notes.get("achievement", ""),
        "what_went_well": notes.get("what_went_well", ""),
        "what_went_poorly": notes.get("what_went_poorly", "")
    }

# Display summary
print(f"Collected feedback for {len(user_feedback_data)} users")
for user_alias in sorted(user_feedback_data.keys(), key=lambda x: int(x.split()[1])):
    print(f"  {user_alias}: {len(user_feedback_data[user_alias]['sessions'])} sessions")

Collected feedback for 9 users
  User 1: 2 sessions
  User 2: 2 sessions
  User 4: 2 sessions
  User 5: 2 sessions
  User 6: 2 sessions
  User 7: 2 sessions
  User 8: 2 sessions
  User 9: 2 sessions
  User 10: 2 sessions


In [114]:
# Display detailed notes and feedback for all users
for user_alias in sorted(user_feedback_data.keys(), key=lambda x: int(x.split()[1])):
    user_data = user_feedback_data[user_alias]
    print(f"\n{'='*80}")
    print(f"{user_alias} - {user_data['name']}")
    print(f"{'='*80}")
    
    for session_num in sorted(user_data['sessions'].keys()):
        session = user_data['sessions'][session_num]
        print(f"\n--- Session {session_num} ({session['agent_type']}) ---")
        
        print(f"\nIntention:")
        print(f"  {session['intention']}")
        
        print(f"\nAchievement Note:")
        print(f"  {session['achievement_note']}")
        
        print(f"\nWhat went well:")
        print(f"  {session['what_went_well']}")
        
        print(f"\nWhat went poorly:")
        print(f"  {session['what_went_poorly']}")
        
        print(f"\nFeedback:")
        print(f"  Achievement: {session['achievement']}")
        print(f"  Portal comparison: {session['portal_comparison']}")
        print(f"  Config preference: {session['config_preference']}")
        
    print(f"\n{'-'*80}")


User 1 - Maximilian Bisani

--- Session 1 (multi) ---

Intention:
  Explore Meta Ally's capabilities by testing three different tasks: setting up a coding assistant, analyzing chat requests, and managing permissions

Achievement Note:
  Partially completed the three tasks with varying degrees of success across setting up a copilot, analyzing chat requests, and modifying user permissions

What went well:
  Analyzing chat requests worked surprisingly well and showed strong potential (would benefit from statistical analysis capabilities). The system demonstrated good understanding of permission concepts overall

What went poorly:
  Multiple function call failures and incomplete requests. GitHub source creation encountered a 500 error. Permission management had issues when removing a user from a role. The system assigned 'owner' role immediately without asking which permission level to grant to the coworker

Feedback:
  Achievement: yes
  Portal comparison: yes, getting started requires l

In [115]:
# Generate LaTeX table for user feedback summary
# Manual assessment based on feedback, notes, and portal comparison data

# Define feedback assessment for each user and session
# Format: user_alias -> session_num -> {portal_improvement, goal_achieved, config_preference}
# Config preference indicates which session (S1 or S2) the user preferred
feedback_assessment = {
    "User 1": {
        1: {"portal": "$\\checkmark$", "goal": "$\\checkmark$"},
        2: {"portal": "$\\checkmark$", "goal": "$\\checkmark$"},
        "config_pref": "S1"  # Preferred multi-agent (S1)
    },
    "User 2": {
        1: {"portal": "$\\checkmark$", "goal": "$\\checkmark$"},
        2: {"portal": "$\\checkmark$", "goal": "$\\checkmark$"},
        "config_pref": "S1"  # Preferred single-agent (both were single)
    },
    "User 4": {
        1: {"portal": "Partial", "goal": "$\\checkmark$"},
        2: {"portal": "$\\times$", "goal": "Partial"},
        "config_pref": ""  # Fill in
    },
    "User 5": {
        1: {"portal": "$\\checkmark$", "goal": "$\\checkmark$"},
        2: {"portal": "$\\checkmark$", "goal": "$\\checkmark$"},
        "config_pref": "Both"  # Preferred multi-agent (both sessions)
    },
    "User 6": {
        1: {"portal": "$\\checkmark$", "goal": "$\\checkmark$"},
        2: {"portal": "$\\checkmark$", "goal": "$\\checkmark$"},
        "config_pref": "Both"  # Preferred multi-agent (both sessions)
    },
    "User 7": {
        1: {"portal": "Partial", "goal": "$\\checkmark$"},
        2: {"portal": "$\\checkmark$", "goal": "$\\checkmark$"},
        "config_pref": "S2"  # Preferred multi-agent (S2)
    },
    "User 8": {
        1: {"portal": "$\\checkmark$", "goal": "$\\checkmark$"},
        2: {"portal": "$\\checkmark$", "goal": "$\\checkmark$"},
        "config_pref": "Both"  # Preferred single-agent (both sessions)
    },
    "User 9": {
        1: {"portal": "$\\checkmark$", "goal": "$\\checkmark$"},
        2: {"portal": "$\\checkmark$", "goal": "$\\checkmark$"},
        "config_pref": ""  # Fill in
    },
    "User 10": {
        1: {"portal": "$\\checkmark$", "goal": "$\\checkmark$"},
        2: {"portal": "$\\checkmark$", "goal": "$\\checkmark$"},
        "config_pref": "Both"  # Preferred multi-agent (both sessions)
    }
}

# Get sorted list of users
sorted_users = sorted(feedback_assessment.keys(), key=lambda x: int(x.split()[1]))

# Generate LaTeX table
latex_lines = []
latex_lines.append("\\begin{table*}[t]")
latex_lines.append("\\centering")
latex_lines.append("\\caption{User Feedback Summary}")
latex_lines.append("\\label{tab:feedback}")

# Create column specification - now with single column per user for config preference
col_spec = "l" + "cc" * len(sorted_users)
latex_lines.append(f"\\begin{{tabular}}{{{col_spec}}}")
latex_lines.append("\\toprule")

# Header row with user columns
header_parts = [""]
for user in sorted_users:
    header_parts.append(f"\\multicolumn{{2}}{{c}}{{{user}}}")
latex_lines.append(" & ".join(header_parts) + " \\\\")

# Sub-header row with session numbers
subheader_parts = ["Metric"]
for _ in sorted_users:
    subheader_parts.extend(["S1", "S2"])
latex_lines.append(" & ".join(subheader_parts) + " \\\\")
latex_lines.append("\\midrule")

# Row 1: Portal Improvement
portal_parts = ["Portal Improvement"]
for user in sorted_users:
    portal_parts.append(feedback_assessment[user][1]["portal"])
    portal_parts.append(feedback_assessment[user][2]["portal"])
latex_lines.append(" & ".join(portal_parts) + " \\\\")

# Row 2: Goal Achieved
goal_parts = ["Goal Achieved"]
for user in sorted_users:
    goal_parts.append(feedback_assessment[user][1]["goal"])
    goal_parts.append(feedback_assessment[user][2]["goal"])
latex_lines.append(" & ".join(goal_parts) + " \\\\")

# Row 3: Config Preference (spans both columns for each user)
latex_lines.append("\\midrule")
config_parts = ["Config Preference"]
for user in sorted_users:
    config_pref = feedback_assessment[user]["config_pref"]
    config_parts.append(f"\\multicolumn{{2}}{{c}}{{{config_pref}}}")
latex_lines.append(" & ".join(config_parts) + " \\\\")

latex_lines.append("\\bottomrule")
latex_lines.append("\\end{tabular}")
latex_lines.append("\\end{table*}")

latex_feedback_table = "\n".join(latex_lines)
print(latex_feedback_table)

\begin{table*}[t]
\centering
\caption{User Feedback Summary}
\label{tab:feedback}
\begin{tabular}{lcccccccccccccccccc}
\toprule
 & \multicolumn{2}{c}{User 1} & \multicolumn{2}{c}{User 2} & \multicolumn{2}{c}{User 4} & \multicolumn{2}{c}{User 5} & \multicolumn{2}{c}{User 6} & \multicolumn{2}{c}{User 7} & \multicolumn{2}{c}{User 8} & \multicolumn{2}{c}{User 9} & \multicolumn{2}{c}{User 10} \\
Metric & S1 & S2 & S1 & S2 & S1 & S2 & S1 & S2 & S1 & S2 & S1 & S2 & S1 & S2 & S1 & S2 & S1 & S2 \\
\midrule
Portal Improvement & $\checkmark$ & $\checkmark$ & $\checkmark$ & $\checkmark$ & Partial & $\times$ & $\checkmark$ & $\checkmark$ & $\checkmark$ & $\checkmark$ & Partial & $\checkmark$ & $\checkmark$ & $\checkmark$ & $\checkmark$ & $\checkmark$ & $\checkmark$ & $\checkmark$ \\
Goal Achieved & $\checkmark$ & $\checkmark$ & $\checkmark$ & $\checkmark$ & $\checkmark$ & Partial & $\checkmark$ & $\checkmark$ & $\checkmark$ & $\checkmark$ & $\checkmark$ & $\checkmark$ & $\checkmark$ & $\checkmark$ 