In [None]:
# Examine a complete example
# Walks through every top-level field of the first standardized record and
# prints it section by section. Section headers start with a real newline
# ("\n") so each section is separated from the previous one by a blank line.
print("\n" + "=" * 60)
print("🔍 Detailed Example Structure")
print("=" * 60)

# First standardized record; its top-level keys are read below.
example = standardized_examples[0]
print(f"\n📝 Input Query: {example['input']}")
print(f"\n🏆 Coordination Outcome: {example['output']}")
print(f"\n🏷️  Dataset: {example['dataset']}")
print(f"\n📊 Split: {example['split']}")

print("\n🤖 Model Context:")
context = example['context']
print(f"  Model A: {context['model_a']}")
print(f"  Model B: {context['model_b']}")
print(f"  Winner: {context['winner']}")
print(f"  Judge: {context['judge']}")
print(f"  Language: {context['language']}")
print(f"  Turn: {context['turn']}")

# Only the first 100 characters of each response are shown to keep the
# output readable.
print("\n💬 Response Excerpts:")
print(f"  Model A ({context['model_a']}):")
print(f"    '{context['response_a'][:100]}...'")
print(f"  Model B ({context['model_b']}):")
print(f"    '{context['response_b'][:100]}...'")

print("\n📈 Token Usage Details:")
token_usage = context['token_usage']
print(f"  Total tokens: {token_usage['total_tokens']}")
print(f"  Input tokens: {token_usage['total_input_tokens']}")
print(f"  Output A tokens: {token_usage['total_output_tokens_a']}")
print(f"  Output B tokens: {token_usage['total_output_tokens_b']}")

# API metadata fields are optional, so each one falls back to 'N/A'.
print("\n⚙️ API Metadata:")
api_meta = context['api_metadata']
print(f"  Question ID: {api_meta.get('question_id', 'N/A')}")
print(f"  Timestamp: {api_meta.get('timestamp', 'N/A')}")
print(f"  Anonymous: {api_meta.get('anony', 'N/A')}")
if 'toxic_chat_tag' in api_meta:
    # Lists the keys of the toxic_chat_tag mapping.
    print(f"  Toxic content flags: {list(api_meta['toxic_chat_tag'].keys())}")

## Example Data Exploration

Let's examine one complete example to understand the data structure:

In [None]:
# Communication Efficiency Analysis
# Prints the per-example communication-efficiency metrics stored under
# context -> token_usage -> communication_efficiency, then their averages.
print("\n" + "=" * 60)
print("📊 Communication Efficiency Metrics")
print("=" * 60)

efficiency_metrics = []
for i, ex in enumerate(standardized_examples):
    comm_eff = ex["context"]["token_usage"]["communication_efficiency"]
    efficiency_metrics.append(comm_eff)

    # Only the first 40 characters of the query are shown as a label.
    print(f"\nExample {i+1}: {ex['input'][:40]}...")
    print(f"  Average tokens per turn: {comm_eff['avg_tokens_per_turn']:.1f}")
    print(f"  Input/Output ratio: {comm_eff['input_output_ratio']:.3f}")
    print(f"  Coordination overhead: {comm_eff['coordination_overhead']:.3f}")
    print(f"  Winner: {ex['context']['winner']}")

# Overall averages
# Guard against an empty metric list (e.g. when no example was standardized)
# so the averages fall back to 0 instead of raising ZeroDivisionError.
metric_count = len(efficiency_metrics)
avg_tokens_per_turn = (
    sum(m['avg_tokens_per_turn'] for m in efficiency_metrics) / metric_count
    if metric_count else 0
)
avg_io_ratio = (
    sum(m['input_output_ratio'] for m in efficiency_metrics) / metric_count
    if metric_count else 0
)
avg_coord_overhead = (
    sum(m['coordination_overhead'] for m in efficiency_metrics) / metric_count
    if metric_count else 0
)

print("\n📈 Overall Averages:")
print(f"  Average tokens per turn: {avg_tokens_per_turn:.1f}")
print(f"  Average input/output ratio: {avg_io_ratio:.3f}")
print(f"  Average coordination overhead: {avg_coord_overhead:.3f}")

## Communication Efficiency Analysis

Let's dive deeper into the communication efficiency metrics that are crucial for coordinator analysis:

In [None]:
# Dataset Statistics
# Summarizes the standardized examples: size, token totals, how often each
# winner label occurs, and which (model_a, model_b) pairs are present.
print("=" * 60)
print("📈 Dataset Statistics")
print("=" * 60)
print(f"Total examples: {len(standardized_examples)}")
print("Dataset source: lmsys/chatbot_arena_conversations")
print("Split: train")

# Token statistics
total_tokens = sum(
    ex["context"]["token_usage"]["total_tokens"]
    for ex in standardized_examples
)
# Average falls back to 0 when there are no examples.
avg_tokens = total_tokens / len(standardized_examples) if standardized_examples else 0
print("\nToken Usage:")
print(f"  Total tokens: {total_tokens:,}")
print(f"  Average tokens per example: {avg_tokens:.1f}")

# Winner distribution
winner_counts = {}
for ex in standardized_examples:
    winner = ex["context"]["winner"]
    winner_counts[winner] = winner_counts.get(winner, 0) + 1

print("\nWinner Distribution:")
# This loop only runs when at least one example exists, so the percentage
# division cannot hit a zero denominator.
for winner, count in sorted(winner_counts.items()):
    pct = (count / len(standardized_examples)) * 100
    print(f"  {winner}: {count} ({pct:.1f}%)")

# Model coverage
# Pairs are ordered tuples, so (a, b) and (b, a) count as different pairs.
model_pairs = {
    (ex["context"]["model_a"], ex["context"]["model_b"])
    for ex in standardized_examples
}

print(f"\nModel Pairs: {len(model_pairs)} unique combinations")
for pair in sorted(model_pairs):
    print(f"  {pair[0]} vs {pair[1]}")

## Dataset Analysis

Let's examine our processed dataset and generate key statistics about model coordination patterns:

In [None]:
# Process examples through standardization
# Runs every raw example through standardize_example and collects the
# results; an example that fails is reported and skipped rather than
# aborting the whole run.
examples = sample_data.get("examples", [])
print(f"📊 Processing {len(examples)} examples from source dataset")

standardized_examples = []

# Examples are numbered from 1 in both the success and the failure message,
# so the two kinds of log line refer to the same example by the same number.
for i, example in enumerate(examples, start=1):
    try:
        std_example = standardize_example(
            example,
            dataset_name="lmsys/chatbot_arena_conversations"
        )
        standardized_examples.append(std_example)
        print(f"✅ Successfully processed example {i}: '{std_example['input'][:50]}...'")
    except Exception as e:
        # Deliberately broad: this is a best-effort pass over the data.
        print(f"⚠️  Warning: Failed to process example {i}: {e}")

print(f"\n✅ Successfully standardized {len(standardized_examples)} examples")

## Process the Dataset

Let's apply the standardization function to our sample data and explore the results:

In [None]:
def standardize_example(example: Dict[str, Any], dataset_name: str) -> Dict[str, Any]:
    """
    Map one raw ChatBot Arena record onto the standardized schema.

    Args:
        example: Raw record. Must contain a "context" mapping that provides
            "model_a", "model_b", "winner", "response_a", "response_b" and
            "token_usage". "judge", "language", "turn" and "api_metadata"
            are optional, as are the top-level "input", "output" and "split".
        dataset_name: Value stored under "dataset" in the result.

    Returns:
        A dict with the keys input, context, output, dataset, split (in that
        order). Nested values such as "token_usage" are passed through by
        reference, not copied.

    Raises:
        KeyError: If "context" or one of its mandatory fields is missing.
    """
    query = example.get("input", "")
    raw_context = example["context"]

    # Sentinel marking fields that have no fallback and must be present.
    mandatory = object()
    # Field order here defines the key order of the resulting context dict.
    field_spec = (
        ("model_a", mandatory),
        ("model_b", mandatory),
        ("winner", mandatory),
        ("judge", "unknown"),
        ("language", "unknown"),
        ("turn", 1),
        ("response_a", mandatory),
        ("response_b", mandatory),
        ("token_usage", mandatory),
        ("api_metadata", {}),
    )
    context = {
        field: raw_context[field] if fallback is mandatory else raw_context.get(field, fallback)
        for field, fallback in field_spec
    }

    # The outcome label and split come from the record; the dataset name
    # always comes from the caller.
    outcome = example.get("output", "")
    split_name = example.get("split", "train")

    return dict(
        input=query,
        context=context,
        output=outcome,
        dataset=dataset_name,
        split=split_name,
    )

# Confirmation banner; the leading character is a check-mark emoji.
print("✅ Data standardization function defined!")

## Data Standardization Function

This is the core function that converts raw ChatBot Arena data to the standardized schema format required for downstream ML research:

In [None]:
# Sample dataset - inlined for self-contained notebook
sample_data = {
    "examples": [
        {
            "input": "What is the difference between OpenCL and CUDA?",
            "context": {
                "model_a": "chatglm-6b",
                "model_b": "koala-13b", 
                "winner": "model_b",
                "judge": "arena_user_973",
                "language": "English",
                "turn": 1,
                "response_a": "OpenCL and CUDA are two different programming models that are used for parallel computing.OpenCL is a general-purposeÂπ∂Ë°åÁºñÁ®ãÊé•Âè£ that allows developers to write parallel code that can run on any platform that supportsCL, which includes most modern operating systems and computer systems, including Windows, Linux, and macOS. It provides a lower-level, more flexible API that is more suitable for building large-scale distributed computing systems.CUDA is a specific implementation ofOpenCL that is designed for performance and scalability in devices with multiple GPU(s). It was developed by Nvidia and is widely used for scientific computing, machine learning, andÂõæÂΩ¢ÂõæÂÉèÂ§ÑÁêÜ. The CUDA API provides a high-level, more efficient way to write parallel code that can run on top of a GPU, while still maintaining the ability to write code that can be easily adapted for other parallel computing platforms.",
                "response_b": "OpenCL and CUDA are both programming languages for parallel computing on GPUs, but they differ in several key ways:\\n\\n1.   Architecture: OpenCL is a general-purpose parallel computing language, while CUDA is designed specifically for GPU computing. OpenCL can run on various types of processors, including CPUs, GPUs, and FPGAs, whereas CUDA is only compatible with NVIDIA GPUs.\\n2.   Language: OpenCL is a C++ language, while CUDA is a C++ language with additional features for GPU computing. CUDA provides additional libraries and tools for handling graphics and other tasks unique to GPUs.\\n3.   Driver support: OpenCL has a wider range of device support, with support for not just NVIDIA GPUs but also other manufacturers like AMD, Intel, and ARM. CUDA is only available on NVIDIA GPUs, and requires the CUDA Toolkit to be installed on the host machine.\\n4.   Memory management: OpenCL provides a unified memory model, where all the data is stored in global memory, and the kernel can access any part of it. CUDA, on the other hand, uses a hierarchical memory architecture, where data is divided into host, device, and shared memory.\\n5.   Performance: OpenCL and CUDA have different strengths and weaknesses in terms of performance. OpenCL is generally faster for numerical computations, while CUDA is faster for graphics and other tasks that require fast memory access. CUDA also has better support for 64-bit floating-point operations, which are essential for some scientific applications.\\n\\nIn summary, OpenCL and CUDA are both programming languages for parallel computing on GPUs, but they differ in terms of architecture, language, driver support, memory management, and performance. OpenCL is a general-purpose language that can run on various types of processors, while CUDA is specific to NVIDIA GPUs and provides additional libraries and tools for handling graphics and other tasks unique to GPUs.",
                "token_usage": {
                    "turns": [
                        {
                            "turn_number": 1,
                            "timestamp_offset": 0.0,
                            "input_tokens": 10,
                            "input_length": 47,
                            "response_a_tokens": 171,
                            "response_a_length": 892,
                            "response_b_tokens": 373,
                            "response_b_length": 1905
                        },
                        {
                            "turn_number": 2,
                            "timestamp_offset": 5.0,
                            "input_tokens": 171,
                            "input_length": 892
                        }
                    ],
                    "total_input_tokens": 181,
                    "total_output_tokens_a": 171,
                    "total_output_tokens_b": 373,
                    "total_tokens": 725,
                    "message_lengths": {
                        "input": [47, 892],
                        "response_a": [892],
                        "response_b": [1905]
                    },
                    "communication_efficiency": {
                        "avg_tokens_per_turn": 362.5,
                        "input_output_ratio": 0.2496551724137931,
                        "coordination_overhead": 0.2786206896551724
                    }
                },
                "api_metadata": {
                    "question_id": "58210e39b3fd4441a2bd4a518bb44c2d",
                    "timestamp": 1682351591.1322,
                    "toxic_chat_tag": {
                        "roberta-large": {"flagged": False, "probability": 0.0083130226},
                        "t5-large": {"flagged": False, "score": 7.94367e-05}
                    },
                    "anony": True
                }
            },
            "output": "Winner: model_b",
            "dataset": "lmsys/chatbot_arena_conversations",
            "split": "train"
        },
        {
            "input": "Why did my parent not invite me to their wedding?",
            "context": {
                "model_a": "oasst-pythia-12b",
                "model_b": "alpaca-13b",
                "winner": "tie",
                "judge": "arena_user_973",
                "language": "English",
                "turn": 1,
                "response_a": "It is possible that your parent did not invite you to their wedding for a variety of reasons. It could be that they did not want to burden you with the planning or expense of the event, or they may not have had the time or resources to include you. Additionally, it could be that they did not feel comfortable having you attend due to your age or relationship with them. It is important to remember that your parent may have had their own reasons for not inviting you, and it is not your place to judge their choices. If you feel neglected or upset about not being invited, it is best to talk to them about it and express your feelings in a non-accusatory way.",
                "response_b": "It is likely that they wanted to keep the guest list small and intimate. Maybe they felt that inviting you would make the event too crowded, or that they wanted to keep the event reserved for close friends and family. It could be that they wanted to avoid any potential drama or conflict that could arise from having too many people at the event. Whatever the reason, it's important to remember that they made this decision out of love and respect for you, and that their decision doesn't define your relationship.",
                "token_usage": {
                    "turns": [
                        {
                            "turn_number": 1,
                            "timestamp_offset": 0.0,
                            "input_tokens": 11,
                            "input_length": 49,
                            "response_a_tokens": 139,
                            "response_a_length": 660,
                            "response_b_tokens": 100,
                            "response_b_length": 514
                        }
                    ],
                    "total_input_tokens": 150,
                    "total_output_tokens_a": 139,
                    "total_output_tokens_b": 100,
                    "total_tokens": 389,
                    "message_lengths": {
                        "input": [49, 660],
                        "response_a": [660],
                        "response_b": [514]
                    },
                    "communication_efficiency": {
                        "avg_tokens_per_turn": 194.5,
                        "input_output_ratio": 0.3856041131105398,
                        "coordination_overhead": 0.10025706940874037
                    }
                },
                "api_metadata": {
                    "question_id": "2564acd09e3942fd97657d05282d4389",
                    "timestamp": 1682351654.6747,
                    "toxic_chat_tag": {
                        "roberta-large": {"flagged": False, "probability": 0.038766861},
                        "t5-large": {"flagged": False, "score": 0.0017759029}
                    },
                    "anony": True
                }
            },
            "output": "Winner: tie",
            "dataset": "lmsys/chatbot_arena_conversations", 
            "split": "train"
        },
        {
            "input": "Fuji vs. Nikon, which is better?",
            "context": {
                "model_a": "koala-13b",
                "model_b": "oasst-pythia-12b",
                "winner": "model_b",
                "judge": "arena_user_973",
                "language": "English",
                "turn": 1,
                "response_a": "Both Fuji and Nikon are popular camera brands that offer a wide range of cameras for different types of photography. It really depends on your specific needs and preferences as to which brand is \\"better\\" for you.\\n\\nFujifilm is known for its high image quality, especially in the areas of color accuracy and dynamic range. Many photographers also appreciate the company's focus on ergonomics and the overall user experience of its cameras. Additionally, Fujifilm has a loyal following among enthusiasts and professional photographers, thanks to its commitment to film photography and its high-end mirrorless cameras.\\n\\nNikon, on the other hand, is known for its advanced autofocus system and image stabilization technology, as well as its high-end cameras and lenses. Nikon also has a wide selection of cameras and lenses to choose from, including both DSLR and mirrorless options.\\n\\nUltimately, it's difficult to say which brand is \\"better\\" without knowing more about your specific needs and preferences. It might be helpful to research both brands further and try out a few different models before making a decision.",
                "response_b": "This is a subjective question and the answer depends on the individual preferences and needs. Both Fujifilm and Nikon are well-known for their high-quality cameras, and the choice between the two ultimately depends on the user's personal preferences and needs.",
                "token_usage": {
                    "turns": [
                        {
                            "turn_number": 1,
                            "timestamp_offset": 0.0,
                            "input_tokens": 10,
                            "input_length": 32,
                            "response_a_tokens": 212,
                            "response_a_length": 1114,
                            "response_b_tokens": 309,
                            "response_b_length": 1566
                        }
                    ],
                    "total_input_tokens": 222,
                    "total_output_tokens_a": 212,
                    "total_output_tokens_b": 309,
                    "total_tokens": 743,
                    "message_lengths": {
                        "input": [32, 1114],
                        "response_a": [1114],
                        "response_b": [1566]
                    },
                    "communication_efficiency": {
                        "avg_tokens_per_turn": 371.5,
                        "input_output_ratio": 0.29878869448183043,
                        "coordination_overhead": 0.13055181695827725
                    }
                },
                "api_metadata": {
                    "question_id": "90bfd142157948aba01931726c888e7f",
                    "timestamp": 1682351708.9351,
                    "toxic_chat_tag": {
                        "roberta-large": {"flagged": False, "probability": 0.0259171631},
                        "t5-large": {"flagged": False, "score": 0.00019426}
                    },
                    "anony": True
                }
            },
            "output": "Winner: model_b",
            "dataset": "lmsys/chatbot_arena_conversations",
            "split": "train"
        }
    ]
}

print(f"üìä Loaded sample dataset with {len(sample_data['examples'])} examples")

## Sample Dataset

Here's our sample dataset with 3 examples from the LMSYS ChatBot Arena conversations. Each example contains:

- **Input**: User query
- **Context**: Model comparison data including responses, token usage, and metadata
- **Output**: Coordination outcome (winner)
- **Dataset**: Source dataset identifier
- **Split**: Train/test split designation

In [None]:
# Import required libraries
import json
from pathlib import Path
from typing import Any, Dict, List
import pprint

# Confirmation banner; the leading character is a package emoji.
print("📦 Libraries imported successfully!")

# Annotated Low-Rank Coordinator Interaction Dataset

This interactive notebook demonstrates the **Multi-LLM Agent Coordination Dataset Curation Script** that processes multi-LLM conversation data and creates standardized outputs for coordination analysis.

## Dataset Objective

The script processes coordination data with the following features:
- ✅ **Token-level communication logs**: Already available in the dataset
- ⚠️ **Low-rank recurrent coordinator latent states**: To be added in annotation phase
- ⚠️ **Explicit coordination outcome labels**: To be added in annotation phase

## Key Features

- Standardizes multi-LLM conversation data from LMSYS ChatBot Arena
- Tracks token usage and communication efficiency metrics
- Analyzes model coordination outcomes and winner distributions
- Provides structured data for downstream ML research

Let's explore how this dataset curation pipeline works!