In [None]:
# Analyze communication efficiency across examples
def analyze_efficiency_patterns(examples):
    """Print aggregate communication-efficiency statistics for enriched examples.

    Args:
        examples: Sequence of enriched example dicts. Each one is read via
            ``example['context']`` for ``model_a``, ``model_b``, ``winner`` and
            ``token_usage``; ``token_usage`` must provide ``total_tokens`` and a
            ``communication_efficiency`` dict with ``avg_tokens_per_turn``,
            ``input_output_ratio`` and ``coordination_overhead``.

    Returns:
        None. All results are written to stdout. An empty or falsy
        ``examples`` prints a short notice and returns immediately.

    Raises:
        KeyError: If an example lacks one of the keys listed above.
    """

    if not examples:
        print("No examples to analyze")
        return

    def mean(values):
        # Callers only pass lists built from the non-empty `examples`.
        return sum(values) / len(values)

    def tally(labels):
        # Single pass; dict insertion order keeps the report order deterministic
        # (first-seen order) instead of depending on set iteration order.
        counts = {}
        for label in labels:
            counts[label] = counts.get(label, 0) + 1
        return counts

    print(f"{CYAN}=== COMMUNICATION EFFICIENCY ANALYSIS ==={END}")
    # A real newline escape: the original "\\n" printed a literal backslash-n.
    print(f"Analyzing {len(examples)} conversation examples\n")

    # Extract efficiency metrics
    total_tokens = []
    avg_tokens_per_turn = []
    input_output_ratios = []
    coordination_overheads = []
    model_pairs = []
    winners = []

    for example in examples:
        context = example['context']
        token_usage = context['token_usage']
        efficiency = token_usage['communication_efficiency']

        total_tokens.append(token_usage['total_tokens'])
        avg_tokens_per_turn.append(efficiency['avg_tokens_per_turn'])
        input_output_ratios.append(efficiency['input_output_ratio'])
        coordination_overheads.append(efficiency['coordination_overhead'])

        model_pairs.append(f"{context['model_a']} vs {context['model_b']}")
        winners.append(context['winner'])

    # Calculate statistics
    print(f"{BLUE}Token Usage Statistics:{END}")
    print(f"  Average total tokens: {mean(total_tokens):.1f}")
    print(f"  Min total tokens: {min(total_tokens)}")
    print(f"  Max total tokens: {max(total_tokens)}")
    print()

    print(f"{BLUE}Efficiency Metrics:{END}")
    print(f"  Avg tokens per turn: {mean(avg_tokens_per_turn):.1f}")
    print(f"  Avg input/output ratio: {mean(input_output_ratios):.3f}")
    print(f"  Avg coordination overhead: {mean(coordination_overheads):.3f}")
    print()

    print(f"{BLUE}Model Performance:{END}")
    for winner, count in tally(winners).items():
        percentage = (count / len(winners)) * 100
        print(f"  {winner}: {count} ({percentage:.1f}%)")
    print()

    print(f"{BLUE}Model Pairs in Dataset:{END}")
    for pair, count in tally(model_pairs).items():
        print(f"  {pair}: {count} conversations")

# Run the analysis on the examples produced by the dataset-processing cell.
# The function returns nothing; the report is printed to stdout.
analyze_efficiency_patterns(enriched_examples)

## Analyze Communication Efficiency Across Examples

Let's analyze the communication efficiency patterns across all processed examples:

In [None]:
import json

# Display the first example in a readable format
# Guarded so that an empty result list prints nothing instead of raising IndexError.
if enriched_examples:
    example = enriched_examples[0]
    
    print(f"{CYAN}=== EXAMPLE STRUCTURE ==={END}")
    # Only the first 100 characters of the prompt are shown; the "..." suffix is
    # appended unconditionally, even when the prompt is shorter than that.
    print(f"{BLUE}Input:{END} {example['input'][:100]}...")
    print(f"{BLUE}Models:{END} {example['context']['model_a']} vs {example['context']['model_b']}")
    print(f"{BLUE}Winner:{END} {example['context']['winner']}")
    print()
    
    print(f"{CYAN}=== TOKEN USAGE METADATA ==={END}")
    token_usage = example['context']['token_usage']
    
    # Aggregate counts over the whole conversation.
    print(f"{BLUE}Total Tokens:{END}")
    print(f"  Input tokens: {token_usage['total_input_tokens']}")
    print(f"  Model A output: {token_usage['total_output_tokens_a']}")
    print(f"  Model B output: {token_usage['total_output_tokens_b']}")
    print(f"  Total: {token_usage['total_tokens']}")
    print()
    
    # Derived ratios stored under the 'communication_efficiency' key.
    print(f"{BLUE}Communication Efficiency:{END}")
    efficiency = token_usage['communication_efficiency']
    print(f"  Avg tokens per turn: {efficiency['avg_tokens_per_turn']:.1f}")
    print(f"  Input/output ratio: {efficiency['input_output_ratio']:.3f}")
    print(f"  Coordination overhead: {efficiency['coordination_overhead']:.3f}")
    print()
    
    print(f"{BLUE}Per-turn Analysis:{END}")
    for turn in token_usage['turns']:
        # .get(..., 0): a turn dict only carries the token keys for messages that
        # were actually present, so missing entries are reported as 0.
        print(f"  Turn {turn['turn_number']}: {turn.get('input_tokens', 0)} in → "
              f"{turn.get('response_a_tokens', 0)} (A) + {turn.get('response_b_tokens', 0)} (B) out")

## Examine Example Structure

Let's look at the structure of one enriched example to understand the token usage metadata:

In [None]:
# Process the sample dataset
# limit=10 is larger than the inline sample, so every valid record is kept.
enriched_examples = process_chatbot_arena_with_tokens(sample_chatbot_arena_data, limit=10)

# "\n" (not "\\n") so a blank line precedes the banner instead of a literal backslash-n.
print(f"\n{GREEN}Dataset Processing Complete!{END}")
print(f"Generated {len(enriched_examples)} enriched examples")
print("Each example includes:")
print("  ✓ Original conversation data")
print("  ✓ Detailed token usage metadata")
print("  ✓ Communication efficiency metrics")
print("  ✓ API metadata and timestamps")

## Process the Dataset

Let's run the processing function on our sample data and examine the results:

In [None]:
def process_chatbot_arena_with_tokens(data: List[Dict], limit: int = 200) -> List[Dict]:
    """
    Turn chatbot-arena style records into enriched examples with token metadata.

    A record is kept only when both ``conversation_a`` and ``conversation_b``
    hold at least two messages and the first user message and both first
    responses have non-empty ``content``. Processing stops once ``limit``
    examples have been collected.

    Args:
        data: Records exposing ``conversation_a`` / ``conversation_b`` message
            lists plus optional ``model_a``, ``model_b``, ``winner``, ``judge``,
            ``language``, ``turn``, ``question_id``, ``tstamp``,
            ``toxic_chat_tag`` and ``anony`` fields.
        limit: Maximum number of examples to return.

    Returns:
        List of dicts with ``input``, ``context`` (model names, winner, both
        responses, ``token_usage`` from ``compute_token_metadata`` and
        ``api_metadata``), ``output``, ``dataset`` and ``split`` keys.
    """
    print(f"{YELLOW}Processing{END} chatbot_arena with token enrichment (target: {limit} examples)")
    enriched = []

    for record in data:
        # Stop as soon as the requested number of examples has been reached
        if len(enriched) >= limit:
            break

        conv_a = record.get("conversation_a", [])
        conv_b = record.get("conversation_b", [])

        # Both sides need at least a prompt and one reply
        has_full_exchange = conv_a and conv_b and len(conv_a) >= 2 and len(conv_b) >= 2
        if not has_full_exchange:
            continue

        # First user message and the first reply from each model
        prompt = conv_a[0].get("content", "")
        reply_a = conv_a[1].get("content", "")
        reply_b = conv_b[1].get("content", "")

        if not (prompt and reply_a and reply_b):
            continue

        api_metadata = {
            "question_id": record.get("question_id", ""),
            # Falls back to the current wall-clock time when the record has no timestamp
            "timestamp": record.get("tstamp", time.time()),
            "toxic_chat_tag": record.get("toxic_chat_tag", False),
            "anony": record.get("anony", True)
        }

        context = {
            "model_a": record.get("model_a", ""),
            "model_b": record.get("model_b", ""),
            "winner": record.get("winner", ""),
            "judge": record.get("judge", ""),
            "language": record.get("language", ""),
            "turn": record.get("turn", 1),
            "response_a": reply_a,
            "response_b": reply_b,
            # TOKEN USAGE METADATA (NEW)
            "token_usage": compute_token_metadata(conv_a, conv_b),
            "api_metadata": api_metadata
        }

        winner_label = record.get("winner", "unknown")
        enriched.append({
            "input": prompt,
            "context": context,
            "output": f"Winner: {winner_label}",
            "dataset": "lmsys/chatbot_arena_conversations",
            "split": "train"
        })

    print(f"{GREEN}Processed{END} {len(enriched)} enriched examples with token metadata")
    return enriched

print("✓ Dataset processing function defined")

## Dataset Processing Function

Now let's define the main processing function that enriches the conversation data with token usage metadata:

In [None]:
# Sample dataset - inline data to make notebook self-contained
# Three records; each carries two model names, a winner label, judge/language/turn
# fields, a timestamp, toxicity tags, and two parallel message lists
# (conversation_a / conversation_b) holding one user message followed by one
# assistant message.
# NOTE(review): several assistant texts contain the two-character sequence
# backslash + "n" rather than real newlines; presumably a double-escaping
# artifact. Confirm against the source data, since it affects token counts.
sample_chatbot_arena_data = [
    # Record 1: chatglm-6b vs koala-13b, labelled winner "model_b"
    {
        "question_id": "58210e39b3fd4441a2bd4a518bb44c2d",
        "model_a": "chatglm-6b",
        "model_b": "koala-13b",
        "winner": "model_b",
        "judge": "arena_user_973",
        "language": "English",
        "turn": 1,
        "tstamp": 1682351591.1322,
        "conversation_a": [
            {
                "content": "What is the difference between OpenCL and CUDA?",
                "role": "user"
            },
            {
                "content": "OpenCL and CUDA are two different programming models that are used for parallel computing.OpenCL is a general-purpose并行编程接口 that allows developers to write parallel code that can run on any platform that supportsCL, which includes most modern operating systems and computer systems, including Windows, Linux, and macOS. It provides a lower-level, more flexible API that is more suitable for building large-scale distributed computing systems.CUDA is a specific implementation ofOpenCL that is designed for performance and scalability in devices with multiple GPU(s). It was developed by Nvidia and is widely used for scientific computing, machine learning, and图形图像处理. The CUDA API provides a high-level, more efficient way to write parallel code that can run on top of a GPU, while still maintaining the ability to write code that can be easily adapted for other parallel computing platforms.",
                "role": "assistant"
            }
        ],
        "conversation_b": [
            {
                "content": "What is the difference between OpenCL and CUDA?",
                "role": "user"
            },
            {
                "content": "OpenCL and CUDA are both programming languages for parallel computing on GPUs, but they differ in several key ways:\\n\\n1.   Architecture: OpenCL is a general-purpose parallel computing language, while CUDA is designed specifically for GPU computing. OpenCL can run on various types of processors, including CPUs, GPUs, and FPGAs, whereas CUDA is only compatible with NVIDIA GPUs.\\n2.   Language: OpenCL is a C++ language, while CUDA is a C++ language with additional features for GPU computing. CUDA provides additional libraries and tools for handling graphics and other tasks unique to GPUs.\\n3.   Driver support: OpenCL has a wider range of device support, with support for not just NVIDIA GPUs but also other manufacturers like AMD, Intel, and ARM. CUDA is only available on NVIDIA GPUs, and requires the CUDA Toolkit to be installed on the host machine.\\n4.   Memory management: OpenCL provides a unified memory model, where all the data is stored in global memory, and the kernel can access any part of it. CUDA, on the other hand, uses a hierarchical memory architecture, where data is divided into host, device, and shared memory.\\n5.   Performance: OpenCL and CUDA have different strengths and weaknesses in terms of performance. OpenCL is generally faster for numerical computations, while CUDA is faster for graphics and other tasks that require fast memory access. CUDA also has better support for 64-bit floating-point operations, which are essential for some scientific applications.\\n\\nIn summary, OpenCL and CUDA are both programming languages for parallel computing on GPUs, but they differ in terms of architecture, language, driver support, memory management, and performance. OpenCL is a general-purpose language that can run on various types of processors, while CUDA is specific to NVIDIA GPUs and provides additional libraries and tools for handling graphics and other tasks unique to GPUs.",
                "role": "assistant"
            }
        ],
        "toxic_chat_tag": {
            "roberta-large": {"flagged": False, "probability": 0.0083130226},
            "t5-large": {"flagged": False, "score": 7.94367e-05}
        },
        "anony": True
    },
    # Record 2: oasst-pythia-12b vs alpaca-13b, labelled "tie"
    {
        "question_id": "2564acd09e3942fd97657d05282d4389",
        "model_a": "oasst-pythia-12b",
        "model_b": "alpaca-13b",
        "winner": "tie",
        "judge": "arena_user_973",
        "language": "English",
        "turn": 1,
        "tstamp": 1682351654.6747,
        "conversation_a": [
            {
                "content": "Why did my parent not invite me to their wedding?",
                "role": "user"
            },
            {
                "content": "It is possible that your parent did not invite you to their wedding for a variety of reasons. It could be that they did not want to burden you with the planning or expense of the event, or they may not have had the time or resources to include you. Additionally, it could be that they did not feel comfortable having you attend due to your age or relationship with them. It is important to remember that your parent may have had their own reasons for not inviting you, and it is not your place to judge their choices. If you feel neglected or upset about not being invited, it is best to talk to them about it and express your feelings in a non-accusatory way.",
                "role": "assistant"
            }
        ],
        "conversation_b": [
            {
                "content": "Why did my parent not invite me to their wedding?",
                "role": "user"
            },
            {
                "content": "It is likely that they wanted to keep the guest list small and intimate. Maybe they felt that inviting you would make the event too crowded, or that they wanted to keep the event reserved for close friends and family. It could be that they wanted to avoid any potential drama or conflict that could arise from having too many people at the event. Whatever the reason, it's important to remember that they made this decision out of love and respect for you, and that their decision doesn't define your relationship.",
                "role": "assistant"
            }
        ],
        "toxic_chat_tag": {
            "roberta-large": {"flagged": False, "probability": 0.038766861},
            "t5-large": {"flagged": False, "score": 0.0017759029}
        },
        "anony": True
    },
    # Record 3: koala-13b vs oasst-pythia-12b, labelled winner "model_b"
    {
        "question_id": "90bfd142157948aba01931726c888e7f",
        "model_a": "koala-13b",
        "model_b": "oasst-pythia-12b",
        "winner": "model_b",
        "judge": "arena_user_973",
        "language": "English",
        "turn": 1,
        "tstamp": 1682351708.9351,
        "conversation_a": [
            {
                "content": "Fuji vs. Nikon, which is better?",
                "role": "user"
            },
            {
                "content": "Both Fuji and Nikon are popular camera brands that offer a wide range of cameras for different types of photography. It really depends on your specific needs and preferences as to which brand is \"better\" for you.\\n\\nFujifilm is known for its high image quality, especially in the areas of color accuracy and dynamic range. Many photographers also appreciate the company's focus on ergonomics and the overall user experience of its cameras. Additionally, Fujifilm has a loyal following among enthusiasts and professional photographers, thanks to its commitment to film photography and its high-end mirrorless cameras.\\n\\nNikon, on the other hand, is known for its advanced autofocus system and image stabilization technology, as well as its high-end cameras and lenses. Nikon also has a wide selection of cameras and lenses to choose from, including both DSLR and mirrorless options.\\n\\nUltimately, it's difficult to say which brand is \"better\" without knowing more about your specific needs and preferences. It might be helpful to research both brands further and try out a few different models before making a decision.",
                "role": "assistant"
            }
        ],
        "conversation_b": [
            {
                "content": "Fuji vs. Nikon, which is better?",
                "role": "user"
            },
            {
                "content": "This is a subjective question and the answer depends on the individual preferences and needs. Both Fujifilm and Nikon are well-known for their high-quality cameras, and the choice between the two ultimately depends on the user's personal preferences and needs.\\n\\nSome factors to consider when deciding between a Fuji and a Nikon camera include:\\n\\n1. Image quality: Fuji cameras are known for their high-quality images, with a distinctive film-like quality. Nikon cameras are also notable for their image quality, but they may not offer the same level of sharpness and color accuracy as Fuji cameras.\\n\\n2. Focusing: Fuji cameras are known for their fast and precise autofocus, while Nikon cameras tend to have a slower and more manual focus system.\\n\\n3. Image size: Fuji cameras are known for their wide range of photo sizes, from small compacts to large-format cameras. Nikon cameras are also capable of producing high-quality images, but they may not be as versatile as Fuji cameras.\\n\\n4. User interface: Fuji cameras have a user-friendly interface, with a simplified design that is easy to navigate. Nikon cameras may have a more complex interface, but they also offer a wealth of features and customization options.\\n\\nUltimately, the best camera for you will depend on your specific needs and preferences. If you're looking for a high-quality camera that offers a unique film-like experience, a Fuji camera may be the right choice for you. If you're looking for a camera that is easy to use and offers a wide range of photo sizes, a Nikon camera may be the best choice.",
                "role": "assistant"
            }
        ],
        "toxic_chat_tag": {
            "roberta-large": {"flagged": False, "probability": 0.0259171631},
            "t5-large": {"flagged": False, "score": 0.00019426}
        },
        "anony": True
    }
]

# Quick sanity output: record count and the field names of the first record.
print(f"✓ Sample dataset loaded with {len(sample_chatbot_arena_data)} conversation examples")
print("Dataset structure:", list(sample_chatbot_arena_data[0].keys()))

## Sample Dataset

Instead of loading from external files, we'll use sample data inline to make this notebook self-contained. This represents the structure of the lmsys/chatbot_arena_conversations dataset:

In [None]:
def count_tokens(text: str) -> int:
    """Return the token count of ``text``.

    Uses tiktoken's ``cl100k_base`` encoding when ``TIKTOKEN_AVAILABLE`` is
    true. If tiktoken is unavailable, or encoding raises, the count is
    approximated as ``int(word_count * 1.3)`` using whitespace-separated words.
    """
    token_total = None

    if TIKTOKEN_AVAILABLE:
        try:
            token_total = len(tiktoken.get_encoding("cl100k_base").encode(text))
        except Exception as err:
            # Report the problem, then fall through to the approximation below
            print(f"Failed to use tiktoken: {err}")

    if token_total is None:
        # Heuristic fallback: roughly 1.3 tokens per whitespace-separated word
        word_total = len(text.split())
        token_total = int(word_total * 1.3)

    return token_total


def compute_token_metadata(conversation_a: List[Dict], conversation_b: List[Dict]) -> Dict[str, Any]:
    """
    Compute per-turn and aggregate token usage for two parallel conversations.

    Each conversation is a list of message dicts read via ``.get("content", "")``.
    Messages are taken in pairs: the message at an even index is treated as the
    user input of a turn and the message right after it as the model's response.
    (The previous implementation advanced one message at a time, so every
    response was also counted as the next turn's input.)

    Args:
        conversation_a: Messages exchanged with model A.
        conversation_b: Messages exchanged with model B.

    Returns:
        Dictionary with:
          - ``turns``: one dict per turn with ``turn_number`` (1-based),
            ``timestamp_offset`` (simulated, 5 seconds per turn) and, for each
            message that exists, ``*_tokens`` / ``*_length`` entries for
            ``input``, ``response_a`` and ``response_b``.
          - ``total_input_tokens``, ``total_output_tokens_a``,
            ``total_output_tokens_b``, ``total_tokens``.
          - ``message_lengths``: character lengths per message kind.
          - ``communication_efficiency``: ``avg_tokens_per_turn``,
            ``input_output_ratio`` (input tokens / total tokens) and
            ``coordination_overhead`` (|tokens A - tokens B| / total tokens).
            Denominators are clamped to at least 1, so empty input yields 0.0.
    """
    metadata = {
        "turns": [],
        "total_input_tokens": 0,
        "total_output_tokens_a": 0,
        "total_output_tokens_b": 0,
        "total_tokens": 0,
        "message_lengths": {
            "input": [],
            "response_a": [],
            "response_b": []
        }
    }

    len_a, len_b = len(conversation_a), len(conversation_b)

    # Step two messages at a time: index msg_idx is the user input,
    # msg_idx + 1 the model response belonging to the same turn.
    for turn_idx, msg_idx in enumerate(range(0, max(len_a, len_b), 2)):
        turn_data = {
            "turn_number": turn_idx + 1,
            "timestamp_offset": turn_idx * 5.0  # Simulated 5-second intervals
        }

        # Get user input (should be same for both models); if conversation A
        # has already ended, read the shared prompt from conversation B.
        prompt_source = conversation_a if msg_idx < len_a else conversation_b
        user_msg = prompt_source[msg_idx].get("content", "")
        turn_data["input_tokens"] = count_tokens(user_msg)
        turn_data["input_length"] = len(user_msg)
        metadata["total_input_tokens"] += turn_data["input_tokens"]
        metadata["message_lengths"]["input"].append(len(user_msg))

        # Get response from model A
        if msg_idx + 1 < len_a:
            response_a = conversation_a[msg_idx + 1].get("content", "")
            turn_data["response_a_tokens"] = count_tokens(response_a)
            turn_data["response_a_length"] = len(response_a)
            metadata["total_output_tokens_a"] += turn_data["response_a_tokens"]
            metadata["message_lengths"]["response_a"].append(len(response_a))

        # Get response from model B
        if msg_idx + 1 < len_b:
            response_b = conversation_b[msg_idx + 1].get("content", "")
            turn_data["response_b_tokens"] = count_tokens(response_b)
            turn_data["response_b_length"] = len(response_b)
            metadata["total_output_tokens_b"] += turn_data["response_b_tokens"]
            metadata["message_lengths"]["response_b"].append(len(response_b))

        metadata["turns"].append(turn_data)

    # Compute total tokens (input + both agent outputs)
    metadata["total_tokens"] = (
        metadata["total_input_tokens"] +
        metadata["total_output_tokens_a"] +
        metadata["total_output_tokens_b"]
    )

    # Compute communication efficiency metrics; max(..., 1) avoids division by zero
    total_tokens = metadata["total_tokens"]
    output_gap = abs(metadata["total_output_tokens_a"] - metadata["total_output_tokens_b"])
    metadata["communication_efficiency"] = {
        "avg_tokens_per_turn": total_tokens / max(len(metadata["turns"]), 1),
        "input_output_ratio": metadata["total_input_tokens"] / max(total_tokens, 1),
        "coordination_overhead": output_gap / max(total_tokens, 1)
    }

    return metadata

print("✓ Token counting and metadata functions defined")

## Core Functions

Let's define the core functions for token counting and metadata computation.

In [None]:
import json
import sys
import time
from typing import Any, Dict, List

# tiktoken is optional: with it token counts are exact, without it the notebook
# falls back to a word-based approximation.
try:
    import tiktoken
except ImportError:
    TIKTOKEN_AVAILABLE = False
    print("⚠ tiktoken not available, using word-based approximation")
else:
    TIKTOKEN_AVAILABLE = True
    print("✓ tiktoken available for precise token counting")

# ANSI escape sequences used to colorize log output (END resets the style)
BLUE = "\033[94m"
GREEN = "\033[92m"
YELLOW = "\033[93m"
CYAN = "\033[96m"
END = "\033[0m"

print(f"{GREEN}Environment setup complete!{END}")
print(f"Using token counting: {'tiktoken (precise)' if TIKTOKEN_AVAILABLE else 'word approximation'}")

## Setup and Imports

We'll start by importing the necessary libraries and setting up the environment for dataset processing.

# Extended Multi-LLM Coordination Dataset with Token-Usage Annotations

This notebook demonstrates dataset processing for multi-LLM agent coordination with detailed token usage analysis. It processes a small inline sample of conversations in the format of the lmsys/chatbot_arena_conversations dataset and enriches each example with comprehensive token-usage metadata to enable evaluation of communication efficiency in multi-agent systems.

## Key Features

- **Token Usage Analysis**: Per-turn token counts using tiktoken's `cl100k_base` encoding, with a word-count approximation (words × 1.3) as a fallback when tiktoken is not installed
- **Communication Efficiency Metrics**: Average tokens per turn, input/output ratios, coordination overhead
- **Model Comparison**: Side-by-side responses from different language models with performance evaluation
- **Rich Metadata**: Timestamps, message lengths, winner labels, and toxicity scores

This enriched dataset enables research into:
- Communication efficiency in multi-agent coordination
- Token overhead analysis for different model pairs
- Low-rank compression strategies for multi-LLM systems
- Performance-efficiency trade-offs in collaborative AI