In [None]:
from pathlib import Path
from analysis.models.conversation import Conversation, completion_logs_dirs_from_openhands_evaluation

# Set this to the OpenHands evaluation directory you want to analyse.
EVALUATION_PATH = "..."

# Ask the helper for the completion-log directories belonging to this evaluation,
# then build one Conversation per directory, keyed by that directory's path.
evaluation_dir = Path(EVALUATION_PATH)
conversation_paths = completion_logs_dirs_from_openhands_evaluation(evaluation_dir)
conversations = {
    logs_dir: Conversation.from_completion_logs_dir(logs_dir)
    for logs_dir in conversation_paths
}

In [None]:
# To get a view into what happens during condensations, we can create a "heatmap" of the messages
# that survive the condensation. This is done by comparing the messages in the turn before and after
# the condensation, and mapping each message to 1 if it exists in the after messages, and 0 otherwise.

# Since messages don't have an identifier, we'll use hash equality.

def condensation_heatmap(conversation: Conversation) -> list[int]:
    statuses: list[list[int]] = []

    for i, completion in enumerate(conversation.turns):


        # The completion is a condensation if it only has a single message
        if len(completion.messages) == 1:

            # We want to compare the turns before and after the condensation
            before_condensation = conversation.turns[i-1]
            after_condensation = conversation.turns[i+1]

            # Convert turns to list of messages hashes for comparison
            before_messages = [hash(message) for message in before_condensation.messages]
            after_messages = [hash(message) for message in after_condensation.messages]

            # To get a sense of what gets dropped in the condensation, map each before
            # message to 1 if it exists in the after messages, and 0 otherwise.
            message_status = [1 if message in after_messages else 0 for message in before_messages]

            statuses.append(message_status)

    status_heatmap = [sum(indices) for indices in zip(*statuses)]

    return status_heatmap

# One raw survival heatmap per conversation, keyed the same way as `conversations`.
heatmaps = {
    logs_path: condensation_heatmap(loaded_conversation)
    for logs_path, loaded_conversation in conversations.items()
}

In [None]:
# Get a view into what happens during a condensation by plotting average chance to survive a condensation event.
# Uses normalized heatmaps as samples from an index-wise distribution of survival.

import altair as alt
import numpy as np
import pandas as pd

def normalize_heatmap_values(heatmap: list[int]) -> list[float]:
    """Scale a heatmap so that its largest entry becomes 1.0.

    Args:
        heatmap: Per-position counts.

    Returns:
        A new list with every value divided by the heatmap's maximum. An empty
        heatmap yields an empty list. A heatmap whose maximum is 0 yields all
        zeros rather than raising ``ZeroDivisionError``.
    """
    if not heatmap:
        # Return a fresh list so the caller's input is never aliased.
        return []

    max_value = max(heatmap)
    if max_value == 0:
        # Nothing survived at any position; dividing by the maximum is undefined.
        return [0.0] * len(heatmap)

    return [value / max_value for value in heatmap]

def normalize_heatmap_length(heatmap: list[int], target_length: int) -> list[float]:
    """Resample a heatmap to exactly ``target_length`` points.

    The heatmap is treated as samples at positions ``0 .. len(heatmap) - 1``
    and linearly interpolated at ``target_length`` evenly spaced positions
    covering that same range.

    Args:
        heatmap: Values to resample.
        target_length: Number of points in the returned list.

    Returns:
        The resampled values as a plain Python list. An empty heatmap yields
        ``target_length`` copies of 1.0.
    """
    if heatmap:
        known_positions = np.arange(len(heatmap))
        sample_positions = np.linspace(0, len(heatmap) - 1, num=target_length)
        resampled = np.interp(sample_positions, known_positions, np.array(heatmap))
        return resampled.tolist()

    # Nothing to interpolate: fall back to a flat line of ones.
    return [1.0] * target_length

# Bring every heatmap onto a common footing: values scaled by their maximum,
# then resampled to a fixed number of points so they can be averaged index-wise.
# NOTE(review): this overwrites the entries of `heatmaps` in place, so running
# the cell a second time normalizes the already-resampled data again.
resample_points = 40
for path in list(heatmaps):
    heatmaps[path] = normalize_heatmap_length(
        normalize_heatmap_values(heatmaps[path]), resample_points
    )

# Convert the entries in the heatmaps dictionary to a long-form dataframe
# (one row per conversation and position) so Altair can render it.
rows = [
    {"path": str(path), "index": index, "value": value}
    for path, heatmap in heatmaps.items()
    for index, value in enumerate(heatmap)
]
df = pd.DataFrame(rows)

# Plot the average heatmap value across all conversations.
position_axis = alt.X("index:Q").title("Position in context")
survival_axis = alt.Y("average(value):Q").title("Likelihood of surviving condensation")
alt.Chart(df).mark_line().encode(position_axis, survival_axis)

In [None]:
# Plot the average number of messages in each turn of the conversation (across all conversations)

# One row per (conversation, turn) holding how many messages that turn contains.
rows = [
    {"path": str(path), "index": index, "value": len(turn.messages)}
    for path, conversation in conversations.items()
    for index, turn in enumerate(conversation.turns)
]

# Collapse to one row per turn index, averaging the message count over
# every conversation that has a turn at that index.
df = pd.DataFrame(rows).groupby("index").agg({"value": "mean"}).reset_index()

turn_axis = alt.X("index:Q").title("Turns in the conversation")
message_count_axis = alt.Y("value:Q").title("Average number of messages")
alt.Chart(df).mark_line().encode(turn_axis, message_count_axis)