In [None]:
def run_experiment(examples: List[Dict], coordinator, tracker: TokenTracker, name: str) -> Tuple[List[str], List[int]]:
    """Run one coordinator over every example and collect predictions and token costs.

    Every example is handled as an independent single-step episode: the
    coordinator is reset, fed the two candidate responses, and the state it
    returns is turned into a winner prediction.

    Args:
        examples: Records whose ``context`` dict holds ``response_a`` and
            ``response_b`` strings.
        coordinator: Object exposing ``reset()`` and ``step(agent_outputs)``,
            the latter returning ``(state, coordinator_message)``.
        tracker: Token tracker; its ``total_tokens`` counter is advanced by
            ``log_coordinator_step`` once per example.
        name: Label used only in the progress messages.

    Returns:
        ``(predictions, token_counts)``: one predicted label and one token
        count per example, in input order.
    """
    # A real newline (not the two characters backslash + n) separates runs.
    print(f"\n{BLUE}Running experiment: {name}{END}")

    predictions = []
    token_counts = []

    for example in examples:
        coordinator.reset()
        tokens_before = tracker.total_tokens

        context = example['context']
        agent_outputs = [context['response_a'], context['response_b']]

        state, coordinator_message = coordinator.step(agent_outputs)
        # Called for its side effect on the tracker; the per-episode cost is
        # read back below as the change in the running total.
        tracker.log_coordinator_step(agent_outputs, coordinator_message)

        predictions.append(predict_winner(state, agent_outputs, context))
        token_counts.append(tracker.total_tokens - tokens_before)

    print(f"{GREEN}Experiment {name} complete: {len(predictions)} predictions{END}")
    return predictions, token_counts


def predict_winner(state: np.ndarray, agent_outputs: List[str], context: Dict) -> str:
    """Pick a winner from response lengths, scaled by the coordinator state.

    Args:
        state: Coordinator state vector; only its positive entries are summed
            into the confidence term.
        agent_outputs: The two candidate responses (index 0 is model A,
            index 1 is model B).
        context: Accepted for interface compatibility; not read here.

    Returns:
        ``"tie"`` when the scaled length scores differ by less than 50,
        otherwise ``"model_a"`` or ``"model_b"`` for the higher score.
    """
    len_a, len_b = len(agent_outputs[0]), len(agent_outputs[1])

    # The same multiplier is applied to both lengths, so it only widens or
    # narrows the gap that is compared against the tie threshold.
    confidence = np.sum(state[state > 0])
    multiplier = 1 + confidence * 0.1

    score_a = len_a * multiplier
    score_b = len_b * multiplier

    if abs(score_a - score_b) < 50:
        return "tie"
    return "model_a" if score_a > score_b else "model_b"


def evaluate_performance(predictions: List[str], ground_truth: List[str]) -> Dict[str, float]:
    """Score predictions against the reference labels.

    Args:
        predictions: Predicted labels.
        ground_truth: Reference labels, aligned with ``predictions``.

    Returns:
        Dict with ``accuracy``, ``f1_macro`` and ``f1_weighted`` as plain
        Python floats. F1 is computed with ``zero_division=0``.
    """
    def _f1(average: str) -> float:
        # Same call for both averaging modes; only ``average`` differs.
        return float(f1_score(ground_truth, predictions, average=average, zero_division=0))

    return {
        "accuracy": float(accuracy_score(ground_truth, predictions)),
        "f1_macro": _f1("macro"),
        "f1_weighted": _f1("weighted"),
    }

# Confirmation banner for the experiment helper functions defined above.
print("🔬 Experiment framework functions implemented!")
print("   - run_experiment: Executes coordinator on dataset")
print("   - predict_winner: Makes predictions based on coordinator state")
print("   - evaluate_performance: Calculates accuracy and F1 scores")

## 🔬 Experiment Framework

Let's implement the core experiment functions for running the rank ablation study.

In [None]:
class FullRankCoordinator:
    """Baseline coordinator whose recurrence uses a dense ``hidden_dim x hidden_dim`` matrix."""

    def __init__(self, hidden_dim: int = 256):
        """Draw the recurrent weights and start from an all-zero state.

        Args:
            hidden_dim: Length of the state vector; ``W`` is square with this size.
        """
        self.hidden_dim = hidden_dim
        # Standard-normal weights scaled down by 0.01.
        self.W = 0.01 * np.random.randn(hidden_dim, hidden_dim)
        self.state = np.zeros(hidden_dim)
        print(f"{GREEN}FullRankCoordinator initialized: hidden_dim={hidden_dim}{END}")

    def reset(self):
        """Replace the state with a fresh zero vector."""
        self.state = np.zeros(self.hidden_dim)

    def step(self, agent_outputs: List[str]) -> Tuple[np.ndarray, str]:
        """Advance the recurrence by one step.

        The new state is ``W @ state + features``, where ``features`` encodes
        ``agent_outputs``.

        Returns:
            A copy of the updated state and the message derived from it.
        """
        encoded = self._encode_outputs(agent_outputs)
        self.state = self.W @ self.state + encoded
        return self.state.copy(), self._generate_message(self.state)

    def _encode_outputs(self, outputs: List[str]) -> np.ndarray:
        """Turn the outputs into a normalised vector of length ``hidden_dim``.

        Each output contributes four numbers, in order: word count, character
        count, number of ``'.'`` and number of ``'?'``. The concatenation is
        cut or zero-padded to ``hidden_dim`` and divided by its Euclidean norm
        (plus 1e-8 to avoid dividing by zero).
        """
        stats = [
            value
            for text in outputs
            for value in (len(text.split()), len(text), text.count('.'), text.count('?'))
        ]
        kept = stats[:self.hidden_dim]

        vector = np.zeros(self.hidden_dim)
        vector[:len(kept)] = kept
        return vector / (np.linalg.norm(vector) + 1e-8)

    def _generate_message(self, state: np.ndarray) -> str:
        """Render every tenth state entry whose magnitude exceeds 0.1.

        Entries are formatted as ``dim<index>:<value>`` with two decimals and
        joined by single spaces; the result is empty if none qualify.
        """
        return " ".join(
            f"dim{i}:{state[i]:.2f}"
            for i in range(0, len(state), 10)
            if abs(state[i]) > 0.1
        )


class LowRankRecurrentCoordinator:
    """Recurrent coordinator whose transition is factored as ``U @ V.T`` with a configurable rank."""

    def __init__(self, hidden_dim: int = 256, rank: int = 32, num_modules: int = 4):
        """Draw the factor matrices and per-module weights; start from a zero state.

        Args:
            hidden_dim: Length of the state vector.
            rank: Inner dimension of the factorisation.
            num_modules: Number of module weight matrices; at most
                ``max(1, num_modules // 2)`` of them are applied per step.
        """
        self.hidden_dim = hidden_dim
        self.rank = rank
        self.num_modules = num_modules

        # Factors of the transition, each hidden_dim x rank, scaled by 0.01.
        self.U = 0.01 * np.random.randn(hidden_dim, rank)
        self.V = 0.01 * np.random.randn(hidden_dim, rank)

        # One hidden_dim x rank matrix per module, drawn after U and V.
        self.module_weights = [
            0.01 * np.random.randn(hidden_dim, rank)
            for _ in range(num_modules)
        ]

        # Number of modules applied on each step.
        self.active_k = max(1, num_modules // 2)

        self.state = np.zeros(hidden_dim)

        compression_ratio = rank / hidden_dim
        print(f"{GREEN}LowRankCoordinator initialized: rank={rank}, compression={compression_ratio:.2%}{END}")

    def reset(self):
        """Replace the state with a fresh zero vector."""
        self.state = np.zeros(self.hidden_dim)

    def step(self, agent_outputs: List[str]) -> Tuple[np.ndarray, str]:
        """Advance the state by one low-rank step.

        The current state is projected to ``rank`` dimensions with ``V.T``,
        mapped back with ``U``, and the selected modules add their own
        mapping of the same projection; the encoded features are added last.

        Returns:
            A copy of the updated state and a message. The message is built
            from the projection of the state as it was *before* this update.
        """
        features = self._encode_outputs(agent_outputs)
        chosen = self._select_active_modules(self.state, features)

        latent = self.V.T @ self.state
        updated = self.U @ latent

        # Accumulate in selection order, then add the input features.
        for module_idx in chosen:
            updated += self.module_weights[module_idx] @ latent
        updated += features

        self.state = updated
        return self.state.copy(), self._generate_compressed_message(latent)

    def _encode_outputs(self, outputs: List[str]) -> np.ndarray:
        """Turn the outputs into a normalised vector of length ``hidden_dim``.

        Each output contributes four numbers, in order: word count, character
        count, number of ``'.'`` and number of ``'?'``. The concatenation is
        cut or zero-padded to ``hidden_dim`` and divided by its Euclidean norm
        (plus 1e-8 to avoid dividing by zero).
        """
        stats = [
            value
            for text in outputs
            for value in (len(text.split()), len(text), text.count('.'), text.count('?'))
        ]
        kept = stats[:self.hidden_dim]

        vector = np.zeros(self.hidden_dim)
        vector[:len(kept)] = kept
        return vector / (np.linalg.norm(vector) + 1e-8)

    def _select_active_modules(self, state: np.ndarray, features: np.ndarray) -> List[int]:
        """Return the indices of the ``active_k`` highest-scoring modules.

        A module's score is the dot product between its projection of
        ``features`` and the ``V.T`` projection of ``state``. ``(score, index)``
        pairs are sorted in descending order, so equal scores are ordered by
        larger index first.
        """
        latent = self.V.T @ state
        ranked = sorted(
            (
                (np.dot((weights.T @ features)[:self.rank], latent), module_idx)
                for module_idx, weights in enumerate(self.module_weights)
            ),
            reverse=True,
        )
        return [module_idx for _, module_idx in ranked[:self.active_k]]

    def _generate_compressed_message(self, state_proj: np.ndarray) -> str:
        """Render the projected entries whose magnitude exceeds 0.1.

        Entries are formatted as ``r<index>:<value>`` with two decimals and
        joined by single spaces; the result is empty if none qualify.
        """
        return " ".join(
            f"r{i}:{val:.2f}"
            for i, val in enumerate(state_proj)
            if abs(val) > 0.1
        )

# Confirmation banner for the coordinator classes defined above.
print("🧠 Coordinator classes implemented successfully!")
print("   - FullRankCoordinator: Baseline with full recurrent matrix")
print("   - LowRankRecurrentCoordinator: Low-rank version with configurable rank")

## 🧠 Coordinator Implementations

Now let's implement both the baseline full-rank coordinator and the low-rank recurrent coordinator that we'll be ablating.

In [None]:
# ANSI escape sequences for coloured log output; END resets the terminal style.
# Each value starts with a real ESC character (\033), not a literal backslash.
BLUE, GREEN, YELLOW, CYAN, RED, END = "\033[94m", "\033[92m", "\033[93m", "\033[96m", "\033[91m", "\033[0m"

def truncate_str(text: str, max_len: int = 100) -> str:
    """Shorten ``text`` for log output.

    Args:
        text: String to shorten.
        max_len: Number of leading characters to keep.

    Returns:
        ``text`` unchanged when it has at most ``max_len`` characters;
        otherwise its first ``max_len`` characters followed by
        ``"... (<original length> chars total)"``.
    """
    total = len(text)
    if total > max_len:
        return f"{text[:max_len]}... ({total} chars total)"
    return text

@dataclass
class ExampleResult:
    """Single example result.

    Plain data holder with eight required string/dict fields, declared in the
    order the generated ``__init__`` expects them.
    """
    # NOTE(review): the field meanings below are inferred from the names and
    # from the record layout of EMBEDDED_DATASET; confirm against the code
    # that builds these objects.
    input: str  # presumably the prompt text of the example
    output: str  # presumably the reference label string (e.g. "Winner: ...")
    context: Dict[str, Any]  # free-form metadata dict
    dataset: str  # presumably the source dataset identifier
    split: str  # presumably the dataset split name
    predict_baseline: str  # presumably the baseline's prediction — TODO confirm
    predict_method: str  # presumably the evaluated method's prediction — TODO confirm
    method: str  # presumably the name of the evaluated method — TODO confirm

@dataclass
class ExperimentResult:
    """Schema matching exp_gen_sol_out.json format.

    Wraps a single ``examples`` list so the object mirrors a top-level
    ``{"examples": [...]}`` layout.
    """
    # One dict per example; the per-example keys are not constrained here.
    examples: List[Dict[str, Any]]

class TokenTracker:
    """Accumulate token counts over coordinator steps."""

    def __init__(self, model: str = "gpt-4"):
        """Set up counters and load a tiktoken encoding.

        The encoding for ``model`` is tried first; if that lookup raises, the
        ``cl100k_base`` encoding is loaded instead and a notice is printed.
        """
        self.total_tokens = 0
        self.episode_tokens = []
        self.call_count = 0

        try:
            self.encoding = tiktoken.encoding_for_model(model)
            print(f"{GREEN}TokenTracker initialized with model: {model}{END}")
        except Exception as err:
            print(f"{YELLOW}Could not load model-specific encoding, using cl100k_base: {err}{END}")
            self.encoding = tiktoken.get_encoding("cl100k_base")

    def count_tokens(self, text: str) -> int:
        """Return the number of tokens in ``text``.

        If encoding fails for any reason, the count is approximated as the
        whitespace-separated word count times 1.3, truncated to an int.
        """
        try:
            return len(self.encoding.encode(text))
        except Exception:
            return int(len(text.split()) * 1.3)

    def log_coordinator_step(self, agent_outputs: List[str], coordinator_message: str = "") -> int:
        """Record one coordinator step and return its token count.

        The step cost is the sum of the token counts of every agent output,
        plus the coordinator message when it is non-empty. The running total,
        the per-step history and the call counter are all updated.
        """
        step_tokens = sum(self.count_tokens(output) for output in agent_outputs)
        if coordinator_message:
            step_tokens += self.count_tokens(coordinator_message)

        self.total_tokens += step_tokens
        self.episode_tokens.append(step_tokens)
        self.call_count += 1
        return step_tokens

    def get_stats(self) -> Dict[str, float]:
        """Summarise everything logged so far.

        Mean and standard deviation are taken over the per-step history and
        are reported as 0 when nothing has been logged.
        """
        history = self.episode_tokens
        has_history = bool(history)
        return {
            "total_tokens": self.total_tokens,
            "num_episodes": len(history),
            "mean_tokens_per_episode": np.mean(history) if has_history else 0,
            "std_tokens_per_episode": np.std(history) if has_history else 0,
            "call_count": self.call_count,
        }

# Confirmation banner for the utility classes and functions defined above.
print("✅ Utility classes and functions defined successfully!")

## 🔧 Utility Classes and Functions

Let's define the core utility classes for token tracking and helper functions.

In [None]:
# Embedded dataset - complete multi-LLM coordination examples
EMBEDDED_DATASET = {
    "examples": [
        {
            "input": "What is the difference between OpenCL and CUDA?",
            "output": "Winner: model_b",
            "context": {
                "model_a": "chatglm-6b",
                "model_b": "koala-13b",
                "winner": "model_b",
                "judge": "arena_user_973",
                "language": "English",
                "turn": 1,
                "response_a": "OpenCL and CUDA are two different programming models that are used for parallel computing.OpenCL is a general-purposeÂπ∂Ë°åÁºñÁ®ãÊé•Âè£ that allows developers to write parallel code that can run on any platform that supportsCL, which includes most modern operating systems and computer systems, including Windows, Linux, and macOS. It provides a lower-level, more flexible API that is more suitable for building large-scale distributed computing systems.CUDA is a specific implementation ofOpenCL that is designed for performance and scalability in devices with multiple GPU(s). It was developed by Nvidia and is widely used for scientific computing, machine learning, andÂõæÂΩ¢ÂõæÂÉèÂ§ÑÁêÜ. The CUDA API provides a high-level, more efficient way to write parallel code that can run on top of a GPU, while still maintaining the ability to write code that can be easily adapted for other parallel computing platforms.",
                "response_b": "OpenCL and CUDA are both programming languages for parallel computing on GPUs, but they differ in several key ways:\\n\\n1.   Architecture: OpenCL is a general-purpose parallel computing language, while CUDA is designed specifically for GPU computing. OpenCL can run on various types of processors, including CPUs, GPUs, and FPGAs, whereas CUDA is only compatible with NVIDIA GPUs.\\n2.   Language: OpenCL is a C++ language, while CUDA is a C++ language with additional features for GPU computing. CUDA provides additional libraries and tools for handling graphics and other tasks unique to GPUs.\\n3.   Driver support: OpenCL has a wider range of device support, with support for not just NVIDIA GPUs but also other manufacturers like AMD, Intel, and ARM. CUDA is only available on NVIDIA GPUs, and requires the CUDA Toolkit to be installed on the host machine.\\n4.   Memory management: OpenCL provides a unified memory model, where all the data is stored in global memory, and the kernel can access any part of it. CUDA, on the other hand, uses a hierarchical memory architecture, where data is divided into host, device, and shared memory.\\n5.   Performance: OpenCL and CUDA have different strengths and weaknesses in terms of performance. OpenCL is generally faster for numerical computations, while CUDA is faster for graphics and other tasks that require fast memory access. CUDA also has better support for 64-bit floating-point operations, which are essential for some scientific applications.\\n\\nIn summary, OpenCL and CUDA are both programming languages for parallel computing on GPUs, but they differ in terms of architecture, language, driver support, memory management, and performance. OpenCL is a general-purpose language that can run on various types of processors, while CUDA is specific to NVIDIA GPUs and provides additional libraries and tools for handling graphics and other tasks unique to GPUs.",
                "token_usage": {
                    "total_input_tokens": 181,
                    "total_output_tokens_a": 171,
                    "total_output_tokens_b": 373,
                    "total_tokens": 725
                }
            },
            "dataset": "lmsys/chatbot_arena_conversations",
            "split": "train"
        },
        {
            "input": "Why did my parent not invite me to their wedding?",
            "output": "Winner: tie", 
            "context": {
                "model_a": "oasst-pythia-12b",
                "model_b": "alpaca-13b",
                "winner": "tie",
                "judge": "arena_user_973",
                "language": "English",
                "turn": 1,
                "response_a": "It is possible that your parent did not invite you to their wedding for a variety of reasons. It could be that they did not want to burden you with the planning or expense of the event, or they may not have had the time or resources to include you. Additionally, it could be that they did not feel comfortable having you attend due to your age or relationship with them. It is important to remember that your parent may have had their own reasons for not inviting you, and it is not your place to judge their choices. If you feel neglected or upset about not being invited, it is best to talk to them about it and express your feelings in a non-accusatory way.",
                "response_b": "It is likely that they wanted to keep the guest list small and intimate. Maybe they felt that inviting you would make the event too crowded, or that they wanted to keep the event reserved for close friends and family. It could be that they wanted to avoid any potential drama or conflict that could arise from having too many people at the event. Whatever the reason, it's important to remember that they made this decision out of love and respect for you, and that their decision doesn't define your relationship.",
                "token_usage": {
                    "total_input_tokens": 150,
                    "total_output_tokens_a": 139,
                    "total_output_tokens_b": 100,
                    "total_tokens": 389
                }
            },
            "dataset": "lmsys/chatbot_arena_conversations",
            "split": "train"
        },
        {
            "input": "Fuji vs. Nikon, which is better?",
            "output": "Winner: model_b",
            "context": {
                "model_a": "koala-13b",
                "model_b": "oasst-pythia-12b", 
                "winner": "model_b",
                "judge": "arena_user_973",
                "language": "English",
                "turn": 1,
                "response_a": "Both Fuji and Nikon are popular camera brands that offer a wide range of cameras for different types of photography. It really depends on your specific needs and preferences as to which brand is \"better\" for you.\\n\\nFujifilm is known for its high image quality, especially in the areas of color accuracy and dynamic range. Many photographers also appreciate the company's focus on ergonomics and the overall user experience of its cameras. Additionally, Fujifilm has a loyal following among enthusiasts and professional photographers, thanks to its commitment to film photography and its high-end mirrorless cameras.\\n\\nNikon, on the other hand, is known for its advanced autofocus system and image stabilization technology, as well as its high-end cameras and lenses. Nikon also has a wide selection of cameras and lenses to choose from, including both DSLR and mirrorless options.\\n\\nUltimately, it's difficult to say which brand is \"better\" without knowing more about your specific needs and preferences. It might be helpful to research both brands further and try out a few different models before making a decision.",
                "response_b": "This is a subjective question and the answer depends on the individual preferences and needs. Both Fujifilm and Nikon are well-known for their high-quality cameras, and the choice between the two ultimately depends on the user's personal preferences and needs.\\n\\nSome factors to consider when deciding between a Fuji and a Nikon camera include:\\n\\n1. Image quality: Fuji cameras are known for their high-quality images, with a distinctive film-like quality. Nikon cameras are also notable for their image quality, but they may not offer the same level of sharpness and color accuracy as Fuji cameras.\\n\\n2. Focusing: Fuji cameras are known for their fast and precise autofocus, while Nikon cameras tend to have a slower and more manual focus system.\\n\\n3. Image size: Fuji cameras are known for their wide range of photo sizes, from small compacts to large-format cameras. Nikon cameras are also capable of producing high-quality images, but they may not be as versatile as Fuji cameras.\\n\\n4. User interface: Fuji cameras have a user-friendly interface, with a simplified design that is easy to navigate. Nikon cameras may have a more complex interface, but they also offer a wealth of features and customization options.\\n\\nUltimately, the best camera for you will depend on your specific needs and preferences. If you're looking for a high-quality camera that offers a unique film-like experience, a Fuji camera may be the right choice for you. If you're looking for a camera that is easy to use and offers a wide range of photo sizes, a Nikon camera may be the best choice.",
                "token_usage": {
                    "total_input_tokens": 222,
                    "total_output_tokens_a": 212,
                    "total_output_tokens_b": 309,
                    "total_tokens": 743
                }
            },
            "dataset": "lmsys/chatbot_arena_conversations",
            "split": "train"
        },
        {
            "input": "How to build an arena for chatbots?",
            "output": "Winner: model_b",
            "context": {
                "model_a": "vicuna-13b",
                "model_b": "oasst-pythia-12b",
                "winner": "model_b",
                "judge": "arena_user_973",
                "language": "English",
                "turn": 1,
                "response_a": "Building an arena for chatbots can be done by following these steps:\\n\\n1. Define the purpose of the arena: The first step is to clearly define the purpose of the arena, such as testing chatbots for a specific task or evaluating their performance in a certain scenario.\\n2. Choose a platform: There are several platforms available for building chatbot arenas, such as Dialogflow, Botpress, and Rasa. Choose a platform that best fits your needs and has the necessary features for building the arena.\\n3. Create the bot: Create the chatbot that will participate in the arena. This bot should be designed to perform a specific task or set of tasks that are relevant to the purpose of the arena.\\n4. Define the evaluation criteria: Determine how the chatbots will be evaluated in the arena. This could include factors such as accuracy, speed, and user experience.\\n5. Develop the arena: Use the platform to build the arena, including defining the tasks and scenarios that the chatbots will encounter.\\n6. Test and refine: Test the arena with the chatbots to ensure that it is functioning as intended. Refine the arena as needed based on the results of the testing.\\n7. Evaluate the chatbots: Use the evaluation criteria to assess the performance of the chatbots in the arena and compare their results.\\n\\nBy following these steps, you can build an arena for chatbots that will allow you to evaluate and compare their performance on a specific task or set of tasks. This can be useful for improving the accuracy and reliability of chatbots and for identifying areas for further development.",
                "response_b": "Building an arena for chatbots is a great way to encourage users to interact with your bot and provides a space for your bot to showcase its abilities. Here are some steps you can follow to build an arena for chatbots:\\n\\n1. Define the objective: Start by defining the objective of your arena. This could be to create a platform where users can interact with chatbots, ask questions, or simply have conversations.\\n\\n2. Choose a location: Decide on a location for your arena and make sure it is accessible and has enough space to accommodate the number of bots you plan to have.\\n\\n3. Set up the infrastructure: Set up the infrastructure needed to run your arena, including a server, database, and programming tools.\\n\\n4. Design the bots: Design the bots that will be participating in your arena. Consider the various features and capabilities that your bots will need to perform and create a design that will allow them to interact with users in a natural and intuitive way.\\n\\n5. Develop the software: Develop the software that will run your arena, including the chatbot platform and any other functionality that your bots will need.\\n\\n6. Test and refine: Test your arena thoroughly and refine it as needed to ensure that it runs smoothly and effectively.\\n\\n7. Launch and promote: Launch your arena and promote it to your target audience. Encourage users to interact with your bots and provide feedback to help improve the experience.\\n\\nBy following these steps, you should be able to build an arena for chatbots that provides a platform for users to interact with your bot and provides a space for your bot to showcase its abilities.",
                "token_usage": {
                    "total_input_tokens": 327,
                    "total_output_tokens_a": 318,
                    "total_output_tokens_b": 320,
                    "total_tokens": 965
                }
            },
            "dataset": "lmsys/chatbot_arena_conversations",
            "split": "train"
        },
        {
            "input": "When is it today?",
            "output": "Winner: model_a",
            "context": {
                "model_a": "vicuna-13b",
                "model_b": "koala-13b",
                "winner": "model_a",
                "judge": "arena_user_973",
                "language": "English",
                "turn": 1,
                "response_a": "I'm sorry, I cannot determine the current date for you as I do not have access to the current date and time. My knowledge cutoff is September 2021, and I do not have real-time access to the internet. Please let me know if there is anything else I can assist you with.",
                "response_b": "Today is February 23, 2023.",
                "token_usage": {
                    "total_input_tokens": 66,
                    "total_output_tokens_a": 61,
                    "total_output_tokens_b": 10,
                    "total_tokens": 137
                }
            },
            "dataset": "lmsys/chatbot_arena_conversations",
            "split": "train"
        }
    ]
}

print(f"üìä Loaded {len(EMBEDDED_DATASET['examples'])} examples from embedded dataset")
print("‚úÖ Dataset is completely self-contained - no external files needed!")

## 📊 Embedded Dataset (Self-Contained)

The dataset below contains multi-LLM coordination examples with token usage annotations. This is a subset of the original dataset embedded directly in the notebook to make it completely self-contained.

In [None]:
# Import required libraries
import json
import logging
import sys
from pathlib import Path
from typing import Dict, List, Tuple, Any
from dataclasses import dataclass, asdict

import numpy as np
import tiktoken
from scipy.stats import ttest_rel
from sklearn.metrics import accuracy_score, f1_score
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use('Agg')  # Non-interactive backend

# Confirmation banner printed after the imports above have succeeded.
print("✅ All required libraries imported successfully!")
print("📊 Setting up experiment configuration...")

# Rank-Ablation Study of Low-Rank Recurrent Coordinator on Multi-LLM Coordination Datasets

**Interactive Demo Notebook**

## Experiment Overview

**Hypothesis:** Systematically evaluate how the dimensionality (rank) of the shared recurrent coordinator affects (1) token-efficiency and (2) task performance across multi-LLM interaction datasets.

**Expected Outcome:** If the low-rank recurrent coordinator retains performance while reducing token usage, we will obtain a clear trade-off curve (rank vs. accuracy vs. tokens) that validates the hypothesis.

This notebook contains a complete, self-contained implementation of the rank ablation experiment that can be run independently without external file dependencies.