## Running the Evaluation

Now let's run the complete evaluation pipeline using our synthetic data.

In [None]:
def analyze_rank_ablation(rank_results: Dict) -> Dict:
    """Analyze a rank ablation study and pick the most token-efficient rank.

    Args:
        rank_results: Mapping from rank to that rank's results. Keys may be
            ints or int-like strings (e.g. ``"8"``, as found in JSON data).
            Each value must provide
            ``["token_stats"]["mean_tokens_per_episode"]`` and
            ``["metrics"]["accuracy"]``; ``"compression_ratio"`` and
            ``"param_reduction"`` are optional and default to 0.

    Returns:
        A dict with three entries:

        * ``"rank_comparison"``: per-rank metrics keyed by integer rank.
        * ``"optimal_rank"``: the rank with the highest efficiency score
          (accuracy / mean tokens) and a rationale string. Ties go to the
          lowest rank.
        * ``"diminishing_returns"``: consecutive rank pairs whose efficiency
          score improved by less than 1%.

    Raises:
        ValueError: If ``rank_results`` is empty or a string key cannot be
            parsed as an integer.
        KeyError: If a rank entry lacks a required field.
    """
    logger.info("Analyzing rank ablation study")

    try:
        if not rank_results:
            raise ValueError("rank_results is empty; nothing to analyze")

        # Normalize keys once. Looking entries up again via str(int(key))
        # would miss int keys and non-canonical strings such as "08".
        results_by_rank = {int(key): data for key, data in rank_results.items()}
        ranks = sorted(results_by_rank)
        logger.info(f"Ranks tested: {ranks}")

        analysis = {
            "rank_comparison": {},
            "optimal_rank": None,
            "diminishing_returns": []
        }
        comparison = analysis["rank_comparison"]

        # Compare each rank
        for rank in ranks:
            data = results_by_rank[rank]

            token_mean = data["token_stats"]["mean_tokens_per_episode"]
            accuracy = data["metrics"]["accuracy"]
            # Guard against a zero/negative token mean instead of dividing by it.
            efficiency = float(accuracy / token_mean) if token_mean > 0 else 0.0

            comparison[rank] = {
                "tokens_per_episode": float(token_mean),
                "accuracy": float(accuracy),
                "efficiency_score": efficiency,
                "compression_ratio": float(data.get("compression_ratio", 0)),
                "param_reduction": float(data.get("param_reduction", 0))
            }

        # Find optimal rank (best efficiency). max() keeps the first maximum,
        # and ranks are ascending, so ties resolve to the lowest rank.
        best_rank = max(ranks, key=lambda r: comparison[r]["efficiency_score"])

        logger.info(f"Optimal rank identified: {best_rank}")

        analysis["optimal_rank"] = {
            "rank": int(best_rank),
            "rationale": f"Rank {best_rank} achieves best efficiency score (accuracy/tokens)"
        }

        # Check for diminishing returns between consecutive ranks
        for lower, higher in zip(ranks, ranks[1:]):
            eff_lower = comparison[lower]["efficiency_score"]
            eff_higher = comparison[higher]["efficiency_score"]

            if eff_lower > 0:
                improvement = ((eff_higher - eff_lower) / eff_lower) * 100
            elif eff_higher > 0:
                # Zero -> positive efficiency is an unbounded relative gain,
                # which is the opposite of diminishing returns.
                continue
            else:
                improvement = 0.0

            if improvement < 1.0:  # Less than 1% improvement
                analysis["diminishing_returns"].append({
                    "from_rank": int(lower),
                    "to_rank": int(higher),
                    "efficiency_improvement_percent": improvement,
                    "note": "Diminishing returns observed"
                })

        logger.info(f"Diminishing returns found: {len(analysis['diminishing_returns'])} cases")

        return analysis

    except Exception as e:
        logger.error(f"Error analyzing rank ablation: {e}")
        raise

# Confirmation message emitted once the function definition above has been executed.
print("✓ Rank ablation analysis function defined")

## Rank Ablation Analysis

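This function compares the rank configurations by efficiency score (accuracy divided by mean tokens per episode), selects the best-scoring rank, and flags consecutive ranks whose efficiency gain is below 1%.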

In [None]:
def compute_token_efficiency_metrics(
    baseline_tokens: List[int],
    method_tokens: List[int],
    baseline_acc: float,
    method_acc: float
) -> Dict:
    """Compute comprehensive token efficiency metrics.

    Compares per-sample token counts of a baseline and a method, paired by
    position, together with their accuracies.

    Args:
        baseline_tokens: Token counts for the baseline, one per sample.
        method_tokens: Token counts for the method; must have the same
            length as ``baseline_tokens``.
        baseline_acc: Accuracy of the baseline.
        method_acc: Accuracy of the method.

    Returns:
        A dict with four sections:

        * ``"baseline_stats"`` / ``"method_stats"``: mean and standard
          deviation of the token counts, accuracy, and efficiency score
          (accuracy / mean tokens; 0.0 when the mean is not positive).
        * ``"improvements"``: relative and absolute token reduction,
          relative efficiency improvement, accuracy delta, and whether the
          absolute accuracy delta is below 0.01. An accuracy *gain* of 0.01
          or more is therefore also reported as not maintained.
        * ``"statistical_tests"``: paired t-test (``scipy.stats.ttest_rel``)
          statistic and p-value, significance flags at 0.05 and 0.01,
          Cohen's d (mean difference divided by ``np.std`` of the
          differences) and its textual interpretation.

        Scalar values are cast to built-in ``float`` / ``bool`` so the
        result stays JSON-serializable when numpy scalars are passed in.

    Raises:
        ValueError: If the token sequences differ in length or are empty.
    """
    logger.info("Computing token efficiency metrics")

    try:
        # Validate inputs
        n_samples = len(baseline_tokens)
        if n_samples != len(method_tokens):
            raise ValueError(f"Token arrays have different lengths: {len(baseline_tokens)} vs {len(method_tokens)}")

        if n_samples == 0:
            raise ValueError("Empty token arrays")

        logger.info(f"Number of samples: {len(baseline_tokens)}")
        logger.info(f"Baseline tokens sample: {truncate_for_log(baseline_tokens[:5])}")
        logger.info(f"Method tokens sample: {truncate_for_log(method_tokens[:5])}")

        # Token statistics
        baseline_mean = float(np.mean(baseline_tokens))
        baseline_std = float(np.std(baseline_tokens))
        method_mean = float(np.mean(method_tokens))
        method_std = float(np.std(method_tokens))

        logger.info(f"Baseline: mean={baseline_mean:.2f}, std={baseline_std:.2f}")
        logger.info(f"Method: mean={method_mean:.2f}, std={method_std:.2f}")

        # Token reduction (relative reduction is undefined for a non-positive baseline)
        token_reduction = ((baseline_mean - method_mean) / baseline_mean) * 100 if baseline_mean > 0 else 0.0
        absolute_reduction = baseline_mean - method_mean

        logger.info(f"Token reduction: {token_reduction:.2f}% ({absolute_reduction:.2f} tokens)")

        # Efficiency score: accuracy / tokens (higher is better)
        baseline_efficiency = float(baseline_acc / baseline_mean) if baseline_mean > 0 else 0.0
        method_efficiency = float(method_acc / method_mean) if method_mean > 0 else 0.0
        efficiency_improvement = ((method_efficiency - baseline_efficiency) / baseline_efficiency) * 100 if baseline_efficiency > 0 else 0.0

        logger.info(f"Efficiency improvement: {efficiency_improvement:.2f}%")

        # Statistical significance test (paired t-test)
        t_stat, p_value = stats.ttest_rel(baseline_tokens, method_tokens)

        logger.info(f"Statistical test: t={t_stat:.4f}, p={p_value:.2e}")

        # Effect size (Cohen's d for paired samples); the std of the
        # differences is computed once and reused for the zero check.
        diff = np.asarray(baseline_tokens) - np.asarray(method_tokens)
        diff_std = np.std(diff)
        cohens_d = float(np.mean(diff) / diff_std) if diff_std > 0 else 0.0

        # Task success rate comparison. Cast to built-ins: with numpy scalar
        # inputs the comparison yields np.bool_, which json cannot serialize.
        accuracy_delta = float(method_acc - baseline_acc)
        accuracy_maintained = bool(abs(accuracy_delta) < 0.01)  # Within 1% tolerance

        logger.info(f"Accuracy delta: {accuracy_delta:.4f}, maintained: {accuracy_maintained}")

        return {
            "baseline_stats": {
                "mean_tokens": baseline_mean,
                "std_tokens": baseline_std,
                "accuracy": float(baseline_acc),
                "efficiency_score": baseline_efficiency
            },
            "method_stats": {
                "mean_tokens": method_mean,
                "std_tokens": method_std,
                "accuracy": float(method_acc),
                "efficiency_score": method_efficiency
            },
            "improvements": {
                "token_reduction_percent": token_reduction,
                "token_reduction_absolute": absolute_reduction,
                "efficiency_improvement_percent": efficiency_improvement,
                "accuracy_delta": accuracy_delta,
                "accuracy_maintained": accuracy_maintained
            },
            "statistical_tests": {
                "t_statistic": float(t_stat),
                "p_value": float(p_value),
                "significant_at_0.05": bool(p_value < 0.05),
                "significant_at_0.01": bool(p_value < 0.01),
                "cohens_d": cohens_d,
                "effect_size_interpretation": interpret_effect_size(cohens_d)
            }
        }

    except Exception as e:
        logger.error(f"Error computing efficiency metrics: {e}")
        raise

# Confirmation message emitted once the function definition above has been executed.
print("✓ Token efficiency metrics function defined")

## Token Efficiency Metrics Calculation

The core function computes comprehensive token efficiency metrics, including a paired t-test for statistical significance and Cohen's d as the effect size.

In [None]:
def truncate_for_log(data: Any, max_len: int = 200) -> str:
    """Render ``data`` as a string that is safe to put in a log line.

    The value is converted with ``str()``. If the text is longer than
    ``max_len`` characters, only the first ``max_len`` are kept and a suffix
    reporting the full length is appended; otherwise the text is returned
    unchanged.
    """
    text = str(data)
    total = len(text)
    if total <= max_len:
        return text
    return text[:max_len] + f"... (truncated, total length: {total})"

def interpret_effect_size(d: float) -> str:
    """Map a Cohen's d value to a qualitative label.

    The sign of ``d`` is ignored. Magnitudes below 0.2 are "negligible",
    below 0.5 "small", below 0.8 "medium", and everything else "large".
    """
    magnitude = abs(d)
    # Exclusive upper bounds, checked in ascending order.
    bands = ((0.2, "negligible"), (0.5, "small"), (0.8, "medium"))
    for upper_bound, label in bands:
        if magnitude < upper_bound:
            return label
    return "large"

# Confirmation message emitted once the helper definitions above have been executed.
print("✓ Helper functions defined")

## Helper Functions

The evaluation uses several helper functions to compute token efficiency metrics and analyze results.

In [None]:
# Synthetic experiment data (based on actual results from the evaluation)
# This represents the data that would normally be loaded from method_summary.json files
#
# All random token counts are drawn from a single seeded generator so that
# re-running this cell reproduces the same samples, and therefore the same
# downstream statistics. A local generator is used rather than np.random.seed
# so the global NumPy random state is left untouched.

_N_EPISODES = 200
_SYNTHETIC_SEED = 42
_rng = np.random.default_rng(_SYNTHETIC_SEED)


def _sample_token_counts(mean: float, std: float) -> List[int]:
    """Draw ``_N_EPISODES`` normally distributed token counts as Python ints."""
    return _rng.normal(mean, std, _N_EPISODES).astype(int).tolist()


def _cyclic_predictions() -> List[str]:
    """Return one label per episode, cycling model_a -> model_b -> tie."""
    labels = ("model_a", "model_b", "tie")
    return [labels[i % 3] for i in range(_N_EPISODES)]


# Experiment 1: Empirical Evaluation
exp1_data = {
    "method_summary": {
        "mean_tokens_per_episode": 318.5,
        "accuracy": 0.525,
        "dataset_size": _N_EPISODES,
        "compression_ratio": 0.012
    },
    "baseline_results": {
        "mean_tokens_per_episode": 320.715,
        "accuracy": 0.525
    }
}

# Experiment 2: Rank Ablation Study with detailed per-rank results
exp2_data = {
    "dataset_size": _N_EPISODES,
    "baseline_results": {
        "token_counts": _sample_token_counts(320.715, 50),
        "predictions": _cyclic_predictions(),
        "metrics": {
            "accuracy": 0.525
        }
    },
    "rank_ablation_results": {
        "2": {
            "token_counts": _sample_token_counts(318, 52),
            "predictions": _cyclic_predictions(),
            "metrics": {"accuracy": 0.52},
            "token_stats": {"mean_tokens_per_episode": 318.0},
            "compression_ratio": 0.008,
            "param_reduction": 0.75
        },
        "4": {
            "token_counts": _sample_token_counts(317.2, 51),
            "predictions": _cyclic_predictions(),
            "metrics": {"accuracy": 0.523},
            "token_stats": {"mean_tokens_per_episode": 317.2},
            "compression_ratio": 0.011,
            "param_reduction": 0.50
        },
        "8": {
            "token_counts": _sample_token_counts(316.785, 49),
            "predictions": _cyclic_predictions(),
            "metrics": {"accuracy": 0.525},
            "token_stats": {"mean_tokens_per_episode": 316.785},
            "compression_ratio": 0.012,
            "param_reduction": 0.25
        },
        "16": {
            "token_counts": _sample_token_counts(316.9, 48.5),
            "predictions": _cyclic_predictions(),
            "metrics": {"accuracy": 0.524},
            "token_stats": {"mean_tokens_per_episode": 316.9},
            "compression_ratio": 0.012,
            "param_reduction": 0.125
        }
    }
}

print("✓ Synthetic experimental data loaded successfully")
print(f"Dataset size: {exp2_data['dataset_size']}")
print(f"Ranks tested: {list(exp2_data['rank_ablation_results'].keys())}")

## Sample Data

Instead of loading from external JSON files, we'll create synthetic data that represents the experimental results from two studies:
1. **Experiment 1**: Empirical evaluation of Low-Rank Recurrent Coordinator
2. **Experiment 2**: Rank ablation study with multiple rank configurations

In [None]:
# Import required libraries
import json
import logging
import numpy as np
from scipy import stats
from typing import Dict, List, Any

# Configure logging for demo
# Root logger at INFO level; each record is rendered as "<LEVEL> - <message>".
logging.basicConfig(level=logging.INFO, format='%(levelname)s - %(message)s')
# Module-level logger used by the analysis functions in this notebook.
logger = logging.getLogger(__name__)

# Cross-Task Token Efficiency Evaluation of Low-Rank Coordinator

This notebook demonstrates the evaluation methodology for measuring token efficiency improvements achieved by the Low-Rank Recurrent Coordinator compared to baseline methods.

## Overview

The evaluation analyzes:
- **Token Usage**: Comparing token consumption between baseline and low-rank coordinator methods
- **Accuracy Preservation**: Ensuring task performance is maintained while reducing tokens
- **Statistical Significance**: Using paired t-tests to validate improvements
- **Rank Optimization**: Finding optimal low-rank configurations through ablation studies

The original script evaluated 200 coordination tasks drawn from LMSYS Chatbot Arena conversations, achieving a 1.23% token reduction while maintaining 52.5% accuracy.