In [1]:

# Set the working directory to the project root (presumably so the `src.*`
# import below resolves against it -- confirm for your kernel's sys.path).
# The location can be overridden with the TOOL_AWARE_AI_TESTING_ROOT
# environment variable; when it is unset the original hard-coded path is used,
# so existing setups keep working unchanged.
import os
os.chdir(os.environ.get(
    "TOOL_AWARE_AI_TESTING_ROOT",
    "/Users/antonwiklund/Documents/Understanding-Tool-Aware-AI-Agents/code/tool-aware-ai-testing",
))

from src.database.connection import get_connection
from tabulate import tabulate
import pandas as pd


In [2]:
# get_connection() returns a pair that is unpacked here as `conn` and `cur`;
# `cur` is the object the query cells below call execute()/fetchall() on.
conn, cur = get_connection()

In [3]:
# Fetch every row of `results` joined to its prompt text and to the model name
# of its test run, ordered newest first.
results_query = """
    SELECT 
        r.id,
        p.prompt,
        t.model_name,
        r.tool_calls,
        r.time_taken,
        r.success_rate,
        r.error_type,
        r.created_at
    FROM results r
    JOIN prompts p ON r.prompt_id = p.id
    JOIN test_runs t ON r.test_run_id = t.id
    ORDER BY r.created_at DESC;
"""
cur.execute(results_query)
results = cur.fetchall()


In [4]:
# Wrap the fetched rows in a DataFrame; the labels are listed in the same
# order as the columns selected by the query.
result_columns = [
    'ID', 'Prompt', 'Model', 'Tool Calls',
    'Time Taken', 'Success', 'Error', 'Created At',
]
df = pd.DataFrame(results, columns=result_columns)

In [5]:
# Render the DataFrame as a psql-style text table (index column hidden) and print it
results_table = tabulate(df, headers='keys', tablefmt='psql', showindex=False)
print(results_table)


+------+------------------------------------------------------------------------------------------------------------------------------------------------------+-------------+------------------------------------------------------------+--------------+-----------+---------+----------------------------+
|   ID | Prompt                                                                                                                                               | Model       | Tool Calls                                                 |   Time Taken | Success   | Error   | Created At                 |
|------+------------------------------------------------------------------------------------------------------------------------------------------------------+-------------+------------------------------------------------------------+--------------+-----------+---------+----------------------------|
|   11 | Generate a statistical report for the following data: [{"duration": 30, "participants": 

In [6]:
# Summary statistics, starting with one row per model
print("\n=== Test Run Statistics ===")

# Per model: number of results and mean of 'Success', plus the mean and
# standard deviation of 'Time Taken', all rounded to three decimals.
model_aggregations = {
    'Success': ['count', 'mean'],
    'Time Taken': ['mean', 'std'],
}
model_stats = df.groupby('Model').agg(model_aggregations).round(3)

# Only the four value columns get a header; in the recorded output the index
# column (the model name) is printed with a blank header.
model_headers = ['Total Tests', 'Success Rate', 'Avg Time', 'Std Time']

print("\nSuccess Rate by Model:")
print(tabulate(model_stats, headers=model_headers, tablefmt='psql'))

# Per-run statistics need the run id, which the earlier query did not select,
# so the rows are fetched again with r.test_run_id included and ordered by
# run (newest run first), then by creation time within each run.
run_results_query = """
    SELECT 
        r.id,
        r.test_run_id,
        p.prompt,
        t.model_name,
        r.tool_calls,
        r.time_taken,
        r.success_rate,
        r.error_type,
        r.created_at
    FROM results r
    JOIN prompts p ON r.prompt_id = p.id
    JOIN test_runs t ON r.test_run_id = t.id
    ORDER BY r.test_run_id DESC, r.created_at DESC;
"""
cur.execute(run_results_query)
results = cur.fetchall()

# Rebuild `df` from the new rows; this replaces the earlier frame and adds
# the 'Test Run ID' column as the second column.
run_result_columns = [
    'ID', 'Test Run ID', 'Prompt', 'Model', 'Tool Calls',
    'Time Taken', 'Success', 'Error', 'Created At',
]
df = pd.DataFrame(results, columns=run_result_columns)

def _count_unique_tools(tool_call_lists):
    """Return the number of distinct items across all of a group's tool-call lists."""
    return len({tool for calls in tool_call_lists for tool in calls})


# One row per test run: result count and mean success, mean/std of the time
# taken, the run's model name, first and last creation timestamps, and the
# number of distinct tools seen in the run.
run_aggregations = {
    'Success': ['count', 'mean'],
    'Time Taken': ['mean', 'std'],
    'Model': 'first',
    'Created At': ['min', 'max'],
    'Tool Calls': _count_unique_tools,
}
run_stats = df.groupby('Test Run ID').agg(run_aggregations).round(3)

# Replace the aggregated column labels with flat, readable names
# (same order as the aggregations above).
run_stats.columns = [
    'Total Tests', 'Success Rate', 'Avg Time', 'Time Std Dev',
    'Model', 'Start Time', 'End Time', 'Unique Tools',
]

print("\nSuccess Rate by Test Run:")
print(tabulate(run_stats, headers='keys', tablefmt='psql'))

# Count how often each tool appears across all results. The mapping is built
# once and shared by the overall summary and the tool usage table below
# (it was previously computed twice with identical loops).
tool_counts = {}
for tools in df['Tool Calls']:
    for tool in tools:
        tool_counts[tool] = tool_counts.get(tool, 0) + 1

# max() raises ValueError on an empty mapping, so fall back to a placeholder
# when no tool call was recorded at all. Ties resolve to the first tool
# encountered, as before.
most_used_tool = max(tool_counts, key=tool_counts.get, default='N/A')

# Overall statistics
overall_stats = pd.DataFrame({
    'Metric': [
        'Total Test Runs',
        'Total Tests',
        'Overall Success Rate',
        'Average Time per Test',
        'Total Unique Tools Used',
        'Most Used Tool'
    ],
    'Value': [
        len(run_stats),
        len(df),
        f"{df['Success'].mean()*100:.1f}%",
        f"{df['Time Taken'].mean():.3f} seconds",
        len(tool_counts),  # the keys of tool_counts are exactly the distinct tools
        most_used_tool
    ]
})

print("\nOverall Statistics:")
print(tabulate(overall_stats, headers='keys', tablefmt='psql', showindex=False))

# Tool usage statistics: one row per tool, most used first
print("\nTool Usage Statistics:")
tool_stats = pd.DataFrame.from_dict(
    tool_counts,
    orient='index',
    columns=['Count']
).sort_values('Count', ascending=False)

# Add each tool's share of all recorded tool calls, formatted as a string
# such as "33.3%"
total_calls = tool_stats['Count'].sum()
tool_stats['Percentage'] = (tool_stats['Count'] / total_calls * 100).round(1)
tool_stats['Percentage'] = tool_stats['Percentage'].apply(lambda x: f"{x}%")

print(tabulate(tool_stats, headers=['Tool', 'Usage Count', 'Usage %'], tablefmt='psql'))


=== Test Run Statistics ===

Success Rate by Model:
+-------------+---------------+----------------+------------+------------+
|             |   Total Tests |   Success Rate |   Avg Time |   Std Time |
|-------------+---------------+----------------+------------+------------|
| gpt-4o-mini |             5 |            0.8 |          0 |          0 |
+-------------+---------------+----------------+------------+------------+

Success Rate by Test Run:
+---------------+---------------+----------------+------------+----------------+-------------+----------------------------+----------------------------+----------------+
|   Test Run ID |   Total Tests |   Success Rate |   Avg Time |   Time Std Dev | Model       | Start Time                 | End Time                   |   Unique Tools |
|---------------+---------------+----------------+------------+----------------+-------------+----------------------------+----------------------------+----------------|
|             6 |             5 |  