# Performance Analyzer

This notebook compares the performance of experiment runs whose run IDs share a common prefix.

In [None]:
# Imports
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from round_analysis import RoundAnalyzer

## 1. Find Matching Run IDs

In [None]:
def find_run_ids(users_directory, prefix):
    """Find all run_ids in the user files that start with a given prefix.

    Every ``.jsonl`` file directly inside ``users_directory`` is inspected.
    Only the third line of each file is parsed; it is expected to be a JSON
    object whose ``runs`` entry is a list of objects carrying a ``run_id``.
    Files with fewer than three lines are skipped silently. Files that cannot
    be read, decoded or parsed, or whose third line has a different shape,
    are reported on stdout and skipped so one bad file never aborts the scan.

    :param users_directory: Path to the directory containing user .jsonl files.
    :param prefix: Prefix a run_id must start with to be included.
    :return: Sorted list of the distinct matching run_ids.
    """
    run_ids = set()
    user_files = [f for f in os.listdir(users_directory) if f.endswith('.jsonl')]

    for user_file in user_files:
        user_file_path = os.path.join(users_directory, user_file)
        try:
            runs_line = None
            with open(user_file_path, 'r', encoding='utf-8') as f:
                # Stream the file and stop at the third line instead of
                # loading every line into memory.
                for line_number, line in enumerate(f):
                    if line_number == 2:
                        runs_line = line
                        break
            if runs_line is None:
                # Fewer than three lines: nothing to inspect in this file.
                continue
            runs_content = json.loads(runs_line.strip())
        except (OSError, ValueError) as e:
            # OSError covers missing/unreadable files; ValueError covers both
            # json.JSONDecodeError and UnicodeDecodeError.
            print(f"Could not process file {user_file}: {e}")
            continue

        runs = runs_content.get('runs', []) if isinstance(runs_content, dict) else None
        if not isinstance(runs, list):
            print(f"Could not process file {user_file}: unexpected structure on line 3")
            continue

        for run in runs:
            if not isinstance(run, dict):
                continue
            run_id = run.get('run_id')
            # Non-string ids cannot be prefix-matched; empty ids are ignored.
            if isinstance(run_id, str) and run_id and run_id.startswith(prefix):
                run_ids.add(run_id)

    return sorted(run_ids)

## 2. Load Data and Extract Metrics

In [None]:
def extract_metrics(df):
    """Extract evaluation metrics from the raw data format into a flat DataFrame.

    Each input row must provide ``user_id``, ``round``, ``run_id`` and
    ``timestamp``. Three optional nested fields are flattened when they have
    the expected type:

    * ``statistics`` (dict): ``mean_combined_score``, ``std_combined_score``,
      ``best_score`` and ``worst_score``, each defaulting to 0 when the key
      is absent from the dict.
    * ``overall`` (dict): ``score`` becomes ``overall_score``; nested
      ``rouge``/``bleu`` dicts provide ``overall_rouge1``/``overall_bleu``.
    * ``individual_scores`` (list of dicts): mean, max and min of the
      ``combined_score`` values.

    Non-numeric ``combined_score`` entries (for example ``None``) are ignored
    rather than letting one malformed entry discard the whole row together
    with its otherwise valid metrics.

    :param df: DataFrame of raw records, one row per user/round/run.
    :return: DataFrame with one flat row per successfully processed input row.
        Columns for an absent nested field are simply not set for that row.
    """
    numeric_types = (int, float, np.integer, np.floating)
    extracted_rows = []
    for _, row in df.iterrows():
        try:
            base_row = {
                'user_id': row['user_id'],
                'round': row['round'],
                'run_id': row['run_id'],
                'timestamp': row['timestamp']
            }

            # Extract from statistics dictionary
            stats = row.get('statistics')
            if isinstance(stats, dict):
                base_row['mean_combined_score'] = stats.get('mean_combined_score', 0)
                base_row['std_combined_score'] = stats.get('std_combined_score', 0)
                base_row['best_score'] = stats.get('best_score', 0)
                base_row['worst_score'] = stats.get('worst_score', 0)

            # Extract from overall dictionary
            overall = row.get('overall')
            if isinstance(overall, dict):
                base_row['overall_score'] = overall.get('score', 0)
                rouge = overall.get('rouge')
                if isinstance(rouge, dict):
                    base_row['overall_rouge1'] = rouge.get('rouge1', 0)
                bleu = overall.get('bleu')
                if isinstance(bleu, dict):
                    base_row['overall_bleu'] = bleu.get('bleu', 0)

            # Extract from individual scores list
            individual = row.get('individual_scores')
            if isinstance(individual, list):
                candidates = [s.get('combined_score', 0) for s in individual if isinstance(s, dict)]
                # Keep only values np.mean/np.max/np.min can aggregate.
                scores = [value for value in candidates if isinstance(value, numeric_types)]
                if scores:
                    base_row['individual_mean_combined'] = np.mean(scores)
                    base_row['individual_max_combined'] = np.max(scores)
                    base_row['individual_min_combined'] = np.min(scores)

            extracted_rows.append(base_row)
        except (KeyError, TypeError, ValueError, AttributeError) as e:
            # Best effort: report the bad row and carry on with the rest.
            print(f"Error processing row for user {row.get('user_id')}, run {row.get('run_id')}: {e}")
            continue

    return pd.DataFrame(extracted_rows)

In [None]:
def load_and_extract_data(users_directory, run_ids):
    """Build the flat metrics table for the requested runs.

    A ``RoundAnalyzer`` is created for ``users_directory`` and the result of
    its ``analyze_all_users()`` call is narrowed to the rows whose ``run_id``
    is contained in ``run_ids``. The surviving rows are handed to
    ``extract_metrics``. Progress is reported on stdout.

    :param users_directory: Directory passed to ``RoundAnalyzer``.
    :param run_ids: Collection of run_ids to keep.
    :return: The DataFrame produced by ``extract_metrics``, or an empty
        DataFrame when nothing was loaded or no row matches ``run_ids``.
    """
    print("\nLoading and processing data...")

    # Raw records as delivered by the analyzer (expected to be a DataFrame
    # with a 'run_id' column -- see RoundAnalyzer for the exact schema).
    all_records = RoundAnalyzer(users_directory=users_directory).analyze_all_users()
    if all_records.empty:
        print("No data loaded from RoundAnalyzer.")
        return pd.DataFrame()

    # Keep only the runs of interest; copy so later work never touches the raw frame.
    is_wanted = all_records['run_id'].isin(run_ids)
    selected = all_records[is_wanted].copy()
    if selected.empty:
        print("No data found for the specified run IDs.")
        return pd.DataFrame()

    print(f"Found {len(selected)} records for the specified runs.")

    metrics = extract_metrics(selected)
    print(f"Successfully extracted metrics into {len(metrics)} records.")
    return metrics

## 3. Summarize and Visualize

In [None]:
def get_summary_statistics(df_metrics):
    """Calculate summary statistics for each run.

    Groups ``df_metrics`` by ``run_id`` and computes mean, standard deviation,
    count, minimum and maximum (rounded to 4 decimals) for every known metric
    column that is present in the frame.

    :param df_metrics: Flat metrics DataFrame containing a ``run_id`` column.
    :return: DataFrame indexed by ``run_id`` with (metric, statistic) columns,
        or an empty DataFrame when ``df_metrics`` is empty or contains none of
        the known metric columns.
    """
    if df_metrics.empty:
        return pd.DataFrame()

    # Define the metrics to aggregate
    metrics_to_agg = [
        'mean_combined_score', 'best_score', 'worst_score',
        'overall_score', 'overall_rouge1', 'overall_bleu',
        'individual_mean_combined'
    ]

    # Filter out metrics that are not in the dataframe
    existing_metrics = [m for m in metrics_to_agg if m in df_metrics.columns]
    if not existing_metrics:
        # Nothing to aggregate: avoid grouping over an empty column selection.
        return pd.DataFrame()

    grouped = df_metrics.groupby('run_id')[existing_metrics]
    return grouped.agg(['mean', 'std', 'count', 'min', 'max']).round(4)

In [None]:
def plot_metric_comparison(df_metrics, metric='mean_combined_score'):
    """Show a per-run box plot of ``metric`` with the raw points overlaid.

    One box is drawn per ``run_id``; the individual observations are added on
    top as semi-transparent black markers. When ``df_metrics`` is empty or has
    no ``metric`` column, a message is printed and nothing is drawn.

    :param df_metrics: Flat metrics DataFrame with a ``run_id`` column.
    :param metric: Name of the column to compare across runs.
    """
    can_plot = not df_metrics.empty and metric in df_metrics.columns
    if not can_plot:
        print(f"Cannot plot '{metric}' as it's not available in the data.")
        return

    # Human-readable label shared by the title and the y axis.
    pretty_metric = metric.replace("_", " ").title()

    plt.style.use('seaborn-v0_8-whitegrid')
    fig, ax = plt.subplots(figsize=(12, 7))

    # Both layers share the same data mapping and target axes.
    layer_args = dict(data=df_metrics, x='run_id', y=metric, ax=ax)
    sns.boxplot(**layer_args)
    sns.stripplot(**layer_args, color='black', alpha=0.3, size=4)

    ax.set_title(f'Comparison of {pretty_metric} Across Runs', fontsize=16, fontweight='bold')
    ax.set_xlabel('Run ID', fontsize=12)
    ax.set_ylabel(pretty_metric, fontsize=12)

    # Run ids tend to be long, so slant the tick labels.
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

## 4. Main Analysis Function

In [None]:
def analyze_performance(run_name_prefix, users_directory='data/filtered_users', metrics_to_plot=None):
    """
    Analyzes performance for all runs matching a given prefix.

    The run_ids starting with the prefix are looked up, their metrics are
    loaded, per-run summary statistics are printed and one comparison plot
    per requested metric is shown. The function prints a message and returns
    early when no run matches or no metrics could be extracted.

    :param run_name_prefix: The prefix of the run names to analyze.
    :param users_directory: Path to the directory containing user .jsonl files.
    :param metrics_to_plot: Optional iterable of metric column names to plot.
        Defaults to ``mean_combined_score``, ``best_score`` and
        ``overall_rouge1``.
    """
    if metrics_to_plot is None:
        metrics_to_plot = ('mean_combined_score', 'best_score', 'overall_rouge1')

    print(f"Analyzing runs with prefix: {run_name_prefix}\n")

    # 1. Find all runs matching the prefix
    matching_run_ids = find_run_ids(users_directory, run_name_prefix)

    if not matching_run_ids:
        print(f"No runs found with prefix '{run_name_prefix}'.")
        return

    print(f"Found {len(matching_run_ids)} matching runs:")
    for run_id in matching_run_ids:
        print(f"  - {run_id}")

    # 2. Load data and extract metrics
    df_metrics = load_and_extract_data(users_directory, matching_run_ids)

    if df_metrics.empty:
        print("Analysis halted as no metrics could be extracted.")
        return

    # 3. Aggregate and compare results
    summary_stats = get_summary_statistics(df_metrics)
    print("\n--- Summary Statistics ---")
    print(summary_stats)

    # 4. Plot visualizations
    print("\n--- Visualizations ---")
    for metric in metrics_to_plot:
        plot_metric_comparison(df_metrics, metric=metric)

    print("\nAnalysis complete.")

## 5. Execute Analysis

In [None]:
if __name__ == "__main__":
    # Settings for this analysis: which runs to compare and where the
    # per-user .jsonl files live.
    USERS_DIR = "data/filtered_users"
    RUN_PREFIX = "2025_07_09"  # <<< CHANGE THIS TO YOUR RUN PREFIX

    analyze_performance(run_name_prefix=RUN_PREFIX, users_directory=USERS_DIR)