In [1]:
import os

# --- Task ID Definitions ---
# These lists hold the task IDs (UUID-formatted strings) of the Easy, Medium, and Hard tasks.
# Each ID is also used as the name of the per-task directory that contains a result.txt file.
EASY_TASK_IDS = [
    "bb5e4c0d-f964-439c-97b6-bdb9747de3f4",
    "030eeff7-b492-4218-b312-701ec99ee0cc",
    "2ad9387a-65d8-4e33-ad5b-7580065a27ca",
    "7a5a7856-f1b6-42a4-ade9-1ca81ca0f263",
    "9656a811-9b5b-4ddf-99c7-5117bcef0626",
]

MEDIUM_TASK_IDS = [
    "1704f00f-79e6-43a7-961b-cedd3724d5fd",
    "6c4c23a1-42a4-43cc-9db1-2f86ff3738cc",
    "cabb3bae-cccb-41bd-9f5d-0f3a9fecd825",
]

HARD_TASK_IDS = [
    "121ba48f-9e17-48ce-9bc6-a4fb17a7ebba",
    "6766f2b8-8a72-417f-a9e5-56fcaa735837",
]
# Maps each difficulty label to its list of task IDs; the insertion order
# (Easy, Medium, Hard) is the order in which difficulties are reported.
ALL_TASK_DIFFICULTIES = {
    "Easy": EASY_TASK_IDS,
    "Medium": MEDIUM_TASK_IDS,
    "Hard": HARD_TASK_IDS,
}

# --- Helper Functions ---

def calculate_average_results(base_path, category_name=""):
    """
    Average the scores stored in the result.txt of every task directory.

    Each immediate subdirectory of ``base_path`` is inspected. If it contains
    a ``result.txt`` file, the file's content is parsed as a float and added
    to the average. Files that cannot be read or parsed (empty, non-numeric,
    permission problems) are reported with a warning and skipped, so a single
    corrupt result does not abort the whole analysis.

    Args:
        base_path (str): The base path leading to the ID directories.
        category_name (str): An optional name for the location being processed,
            used only to make the printed messages clearer.

    Returns:
        float: The average of the valid results, or None if ``base_path`` is
        missing, is not a directory, or contains no valid result.txt file.
    """
    if not os.path.exists(base_path):
        print(f"Error: Base path for {category_name or 'unnamed location'} ('{base_path}') does not exist.")
        return None

    # A regular file would pass the existence check but make os.listdir() raise.
    if not os.path.isdir(base_path):
        print(f"Error: Base path for {category_name or 'unnamed location'} ('{base_path}') is not a directory.")
        return None

    results = []

    # Sorted so that warnings and the float summation order are reproducible;
    # os.listdir() returns entries in arbitrary order.
    for item in sorted(os.listdir(base_path)):
        result_file_path = os.path.join(base_path, item, "result.txt")

        # Skips plain files in base_path as well as task directories that
        # have not produced a result yet.
        if not os.path.isfile(result_file_path):
            continue

        try:
            with open(result_file_path, 'r') as f:
                results.append(float(f.read().strip()))
        except (OSError, ValueError) as exc:
            print(f"Warning: Skipping invalid result file '{result_file_path}': {exc}")

    if not results:
        print(f"No valid result.txt files found in '{category_name or base_path}' to calculate an average.")
        return None

    return sum(results) / len(results)

def analyze_task_success(base_path, task_ids_by_difficulty):
    """
    Count the successful tasks per difficulty level for one run/system.

    For every task ID, ``<base_path>/<task_id>/result.txt`` is read and parsed
    as a float; the task counts as successful only if that value equals 1.0.
    Tasks without a result file count as not successful. Result files that
    cannot be read or parsed are reported with a warning and also count as
    not successful, so one corrupt file does not abort the analysis.

    Args:
        base_path (str): The base path for the current run/system
            (e.g., .../first/pyautogui/screenshot/ui-tars/chrome).
        task_ids_by_difficulty (dict): A dictionary mapping difficulty levels
            (e.g., "Easy") to a list of task IDs belonging to that difficulty.

    Returns:
        dict: A dictionary where keys are difficulty levels and values are the
              count of successful tasks for that difficulty in the given
              base_path. All counts are zero if base_path is not an existing
              directory.
    """
    success_counts = {difficulty: 0 for difficulty in task_ids_by_difficulty}

    if not os.path.isdir(base_path):
        return success_counts

    for difficulty, task_ids in task_ids_by_difficulty.items():
        for task_id in task_ids:
            # Construct the path to the result.txt for this specific task ID
            result_file_path = os.path.join(base_path, task_id, "result.txt")

            if not os.path.isfile(result_file_path):
                continue

            try:
                with open(result_file_path, 'r') as f:
                    value = float(f.read().strip())
            except (OSError, ValueError) as exc:
                print(f"Warning: Skipping invalid result file '{result_file_path}': {exc}")
                continue

            if value == 1.0:  # A task is successful if its result.txt contains 1.0
                success_counts[difficulty] += 1

    return success_counts



In [3]:
## Set Up Paths and Calculate All Averages and Success Counts

# All results are expected below <current working directory>/results_chrome_thesis.
current_working_directory = os.getcwd()
common_base_path_root = os.path.join(current_working_directory, 'results_chrome_thesis')

# Names of the experiment runs; each one is a sub-directory of the results root.
runs = ['first', 'second', 'third']

# Display name of each evaluated system -> its path relative to a run directory.
systems = {
    'Baseline System': os.path.join('pyautogui', 'screenshot', 'ui-tars', 'chrome'),
    'Hierarchical Manager-Worker System': os.path.join('pyautogui', 'screenshot', 'ui-tars-agent', 'chrome'),
}

# Collected results (average and success counts), shaped as:
# {run_name: {system_name: {'average': avg_value, 'success_counts': {'Easy': X, 'Medium': Y, 'Hard': Z}}}}
all_processed_data = {}

print("--- Calculating Averages and Analyzing Task Success Across All Runs and Systems ---")

for run in runs:
    print(f"\n--- Processing Run: {run.capitalize()} ---")

    # Per-run bucket; filled in place below, one entry per system.
    run_results = all_processed_data[run] = {}

    for system_name, system_relative_path in systems.items():
        # <results root>/<run>/<system path> holds the per-task directories.
        full_path = os.path.join(common_base_path_root, run, system_relative_path)
        category_name = f"{run.capitalize()} - {system_name}"

        # Overall mean score plus the number of solved tasks per difficulty.
        average = calculate_average_results(full_path, category_name)
        success_counts = analyze_task_success(full_path, ALL_TASK_DIFFICULTIES)

        run_results[system_name] = {'average': average, 'success_counts': success_counts}

        if average is None:
            print(f"  Overall Average for {category_name}: No Data")
        else:
            print(f"  Overall Average for {category_name}: {average:.2f}")

        print(f"  Successful Tasks for {category_name}:")
        for difficulty, count in success_counts.items():
            total_tasks_in_difficulty = len(ALL_TASK_DIFFICULTIES[difficulty])
            print(f"    {difficulty}: {count}/{total_tasks_in_difficulty} successful")

--- Calculating Averages and Analyzing Task Success Across All Runs and Systems ---

--- Processing Run: First ---
  Overall Average for First - Baseline System: 0.10
  Successful Tasks for First - Baseline System:
    Easy: 1/5 successful
    Medium: 0/3 successful
    Hard: 0/2 successful
  Overall Average for First - Hierarchical Manager-Worker System: 0.30
  Successful Tasks for First - Hierarchical Manager-Worker System:
    Easy: 3/5 successful
    Medium: 0/3 successful
    Hard: 0/2 successful

--- Processing Run: Second ---
  Overall Average for Second - Baseline System: 0.20
  Successful Tasks for Second - Baseline System:
    Easy: 1/5 successful
    Medium: 0/3 successful
    Hard: 1/2 successful
  Overall Average for Second - Hierarchical Manager-Worker System: 0.10
  Successful Tasks for Second - Hierarchical Manager-Worker System:
    Easy: 1/5 successful
    Medium: 0/3 successful
    Hard: 0/2 successful

--- Processing Run: Third ---
  Overall Average for Third - Base

In [4]:
# --- Overall System Performance Comparison (Aggregated Across All Runs) ---

# One accumulator per configured system:
#   'run_averages'         - average score of every run that produced data
#   'total_success_counts' - successes summed over runs, zero-initialised per known difficulty
#   'runs_with_data_count' - number of runs that contributed an average
system_overall_performance = {
    system_name: {
        'run_averages': [],
        'total_success_counts': {difficulty_level: 0 for difficulty_level in ALL_TASK_DIFFICULTIES},
        'runs_with_data_count': 0,
    }
    for system_name in systems
}

# Fold the per-run results from all_processed_data into the accumulators.
for run_name, systems_data_in_run in all_processed_data.items():
    for system_name, data_for_system in systems_data_in_run.items():
        accumulator = system_overall_performance.get(system_name)
        if accumulator is None:
            # Only configured systems are aggregated.
            continue

        run_average = data_for_system['average']
        if run_average is not None:
            accumulator['run_averages'].append(run_average)
            accumulator['runs_with_data_count'] += 1

        # Difficulties not seen before start from zero instead of being dropped.
        totals = accumulator['total_success_counts']
        for difficulty, count in data_for_system['success_counts'].items():
            totals[difficulty] = totals.get(difficulty, 0) + count

# Print the overall system comparison
print("\n\n--- Overall System Performance Comparison (Aggregated Across All Runs) ---")

num_total_runs = len(runs)

for system_name, performance_data in system_overall_performance.items():
    print(f"\nSystem: {system_name}")

    # Overall score is the mean of the per-run averages that are available.
    if not performance_data['run_averages']:
        print(f"  Overall Average Score: No Data available from any run.")
    else:
        overall_system_avg = sum(performance_data['run_averages']) / len(performance_data['run_averages'])
        print(f"  Overall Average Score (from {len(performance_data['run_averages'])} of {num_total_runs} runs with data): {overall_system_avg:.2f}")

    print(f"  Total Successful Tasks (across {num_total_runs} runs):")
    if not ALL_TASK_DIFFICULTIES:
        print("    Cannot display task success: ALL_TASK_DIFFICULTIES is undefined or empty.")
        continue

    for difficulty, total_success_count in performance_data['total_success_counts'].items():
        if difficulty not in ALL_TASK_DIFFICULTIES:
            # Fallback for difficulties not in the primary definition
            print(f"    {difficulty}: {total_success_count} successful (Warning: Difficulty not in ALL_TASK_DIFFICULTIES)")
            continue

        # Total tasks for this difficulty = (tasks per run) * (number of runs)
        tasks_per_run_for_difficulty = len(ALL_TASK_DIFFICULTIES[difficulty])
        grand_total_tasks_for_difficulty = tasks_per_run_for_difficulty * num_total_runs

        if grand_total_tasks_for_difficulty > 0:
            success_rate = (total_success_count / grand_total_tasks_for_difficulty) * 100
            print(f"    {difficulty}: {total_success_count}/{grand_total_tasks_for_difficulty} successful ({success_rate:.1f}%)")
        elif tasks_per_run_for_difficulty == 0:
            print(f"    {difficulty}: {total_success_count}/0 successful (No tasks defined for this difficulty)")
        else:
            # Reached only when tasks exist but there are no runs at all.
            print(f"    {difficulty}: {total_success_count} successful (Total possible tasks calculation error)")



--- Overall System Performance Comparison (Aggregated Across All Runs) ---

System: Baseline System
  Overall Average Score (from 3 of 3 runs with data): 0.17
  Total Successful Tasks (across 3 runs):
    Easy: 3/15 successful (20.0%)
    Medium: 0/9 successful (0.0%)
    Hard: 2/6 successful (33.3%)

System: Hierarchical Manager-Worker System
  Overall Average Score (from 3 of 3 runs with data): 0.23
  Total Successful Tasks (across 3 runs):
    Easy: 7/15 successful (46.7%)
    Medium: 0/9 successful (0.0%)
    Hard: 0/6 successful (0.0%)
