In [6]:
!pip install plotly nbformat numpy pandas


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [7]:
import wandb
import pandas as pd
# Keep DataFrame reprs compact: at most 6 columns and 10 rows are rendered.
pd.set_option("display.max_columns", 6)
pd.set_option("display.max_rows", 10)

# Log in to Weights & Biases and create the API client reused by later cells.
wandb.login()
api = wandb.Api()

# W&B location of the runs.
project = "prompting-validators"
entity = "macrocosmos"

# Run under evaluation; its full logged history is loaded into a DataFrame.
run_id = "y1gvm7fm"  # v2.11.0
df_current = pd.DataFrame(list(api.run(f"{entity}/{project}/{run_id}").scan_history()))


In [8]:

# Baseline run to compare against. The commented-out ids are alternative baselines;
# enable exactly one assignment. Note that this rebinds `run_id` from the cell above.
run_id = "nda21wnr"  # v2.10.0
# run_id = "utrrhymc"  # v2.10.0
# run_id = "dmiyn1nx"  # v2.9.2
# run_id = "d4i2jalp"  # v2.9.0
# run_id = "a300gshd" # v2.8.3
# run_id = "3xqww1c2"  # v2.8.3 unit-tests
df_previous = pd.DataFrame(list(api.run(f"{entity}/{project}/{run_id}").scan_history()))

In [9]:
import plotly.io as pio
import plotly.graph_objects as go
import numpy as np
import pandas as pd


def calculate_task_rewards(events):
    """Collapse a sequence of reward events into one weighted reward vector.

    Each event is indexed with the keys "rewards" and "weight"; the event's
    rewards are scaled by its weight and the scaled vectors are summed
    element-wise across events.

    Args:
        events: A list or numpy array of events.

    Returns:
        The element-wise weighted sum, or ``np.nan`` when ``events`` is not a
        list/array or when any error occurs while combining the events (a
        message is printed in both cases).
    """
    if isinstance(events, (list, np.ndarray)):
        try:
            weighted = [np.array(event["rewards"]) * event["weight"] for event in events]
            return np.sum(np.array(weighted), axis=0)
        except Exception as e:
            # Best effort: report the problem and fall back to the NaN sentinel.
            print(f"Error processing events: {e}")
            return np.nan

    print(f"Unexpected type for events: {type(events)}")
    return np.nan


def plot(wb_df, name=""):
    """Plot reward statistics for one run and return per-task / per-uid averages.

    Prints a banner containing ``name`` and shows two Plotly figures: a bar
    chart of the mean and standard deviation of rewards per task, and a pie
    chart of how many rows each task has.

    Side effects:
        * When ``wb_df`` has no "rewards" column, one is added IN PLACE, computed
          from "reward_events" with ``calculate_task_rewards``.
        * The default Plotly template is set to "plotly_dark".

    Args:
        wb_df: DataFrame with a "task" column, a "uids" column and either a
            "rewards" or a "reward_events" column.
        name: Label used in the banner and in the figure titles.

    Returns:
        A tuple ``(task_stats, avg_reward_per_task_uid, avg_reward_per_uid)``:
        ``task_stats`` has columns task/mean/std/count, the second frame has
        columns task/uid/average_reward and the third uid/average_reward.

    Raises:
        KeyError: If ``wb_df`` has neither a "rewards" nor a "reward_events" column.
    """
    import plotly.express as px

    print("=" * 30)
    print(f"Stats for {name}")
    print("=" * 30)

    # Derive the per-row reward vector when the frame does not carry one yet.
    if "rewards" not in wb_df:
        if "reward_events" not in wb_df:
            raise KeyError(
                "reward_events: wb_df has neither a 'rewards' nor a 'reward_events' column; "
                f"available columns: {list(wb_df.columns)}"
            )
        wb_df["rewards"] = wb_df["reward_events"].apply(calculate_task_rewards)

    # calculate_task_rewards returns a scalar NaN on failure (and an empty event
    # list sums to a scalar 0.0). Scalars can be neither concatenated nor exploded,
    # so such rows are left out of the reward statistics instead of aborting.
    has_reward_vector = wb_df["rewards"].apply(lambda r: np.ndim(r) > 0).to_numpy(dtype=bool)
    scored_df = wb_df[has_reward_vector]
    skipped = int((~has_reward_vector).sum())
    if skipped:
        print(f"Skipping {skipped} row(s) without a usable rewards vector")

    # Set the default template to 'plotly_dark' for all plots
    pio.templates.default = "plotly_dark"

    # Function to calculate the mean and std for each task.
    def calculate_stats(group) -> pd.Series:
        """Pool every reward of one task group and summarise it."""
        rewards = np.concatenate(group["rewards"].values)
        return pd.Series({"mean": np.mean(rewards), "std": np.std(rewards), "count": len(rewards)})

    # Group by "task" and apply the function.
    task_stats = scored_df.groupby("task").apply(calculate_stats).reset_index()

    # Create the stacked bar plot using Plotly.
    fig = go.Figure()

    # Add mean bars (green).
    fig.add_trace(go.Bar(
        x=task_stats["task"],
        y=task_stats["mean"],
        name="Mean",
        marker_color="#5C8374"
    ))

    # Add std bars (red) stacked on top of the mean.
    fig.add_trace(go.Bar(
        x=task_stats["task"],
        y=task_stats["std"],
        name="Standard Deviation",
        marker_color="#A0153E",
        base=task_stats["mean"]
    ))

    # Update the layout for better visualization.
    fig.update_layout(
        barmode="stack",
        title=f"Mean and Standard Deviation by Task ({name})",
        xaxis_title="Task",
        yaxis_title="Value",
        legend_title="Metrics"
    )

    fig.show()

    # Task distribution over all rows of wb_df (rows without a task are ignored).
    task_counts = wb_df["task"].value_counts(dropna=True)
    fig = px.pie(task_counts, names=task_counts.index, values=task_counts.values, title=f"Task Distribution ({name})")
    fig.show()

    # Explode 'uids' and 'rewards' simultaneously: one row per (task, uid, reward).
    long_df = (
        scored_df[['task', 'uids', 'rewards']]
        .explode(['uids', 'rewards'])
        .rename(columns={'uids': 'uid', 'rewards': 'reward'})
        .reset_index(drop=True)
    )

    # Average reward per task and uid.
    avg_reward_per_task_uid = long_df.groupby(['task', 'uid'])['reward'].mean().reset_index()
    avg_reward_per_task_uid = avg_reward_per_task_uid.rename(columns={'reward': 'average_reward'})

    # Average reward per uid across all tasks.
    avg_reward_per_uid = long_df.groupby('uid')['reward'].mean().reset_index()
    avg_reward_per_uid = avg_reward_per_uid.rename(columns={'reward': 'average_reward'})

    # (Optional) Export to CSV
    # avg_reward_per_task_uid.to_csv('average_rewards_per_task_uid.csv', index=False)
    # avg_reward_per_uid.to_csv('average_rewards_per_uid.csv', index=False)
    return task_stats, avg_reward_per_task_uid, avg_reward_per_uid


# Render the figures for both runs and keep the returned summary tables;
# the per-task tables are compared in the cells that follow.
current_stats, current_reward_per_task, current_reward = plot(df_current, name="Current version")
previous_stats, previous_reward_per_task, previous_reward = plot(df_previous, name="Previous version")

Stats for Current version


KeyError: 'reward_events'

In [5]:
import pandas as pd
import plotly.graph_objects as go

# Give each version's average a distinct column name so both fit in one frame.
current = current_reward_per_task.rename(columns={"average_reward": "current_average_reward"})
previous = previous_reward_per_task.rename(columns={"average_reward": "previous_average_reward"})

# Merge on "task" and "uid" using an inner join: only (task, uid) pairs present
# in both versions are compared.
merged_df = pd.merge(current, previous, on=["task", "uid"], how="inner")

unique_tasks = merged_df["task"].unique()

for task in unique_tasks:
    # Rows of this task, ordered by the previous version's reward so that the
    # previous-version line rises from left to right.
    task_data = merged_df[merged_df["task"] == task]
    task_data_sorted = task_data.sort_values(by="previous_average_reward", ascending=True)

    # UIDs as strings so the X axis is treated as categorical.
    sorted_uids = task_data_sorted["uid"].astype(str).tolist()

    # Both reward columns come from the same sorted frame, so they are already
    # aligned with sorted_uids; no re-indexing is needed.
    current_rewards = task_data_sorted["current_average_reward"].tolist()
    previous_rewards = task_data_sorted["previous_average_reward"].tolist()

    # With many UIDs the tick labels are rotated and shrunk to stay legible.
    many_uids = len(sorted_uids) > 20

    fig = go.Figure()

    # Add current version line
    fig.add_trace(go.Scatter(
        x=sorted_uids,
        y=current_rewards,
        mode="lines+markers",
        name="Current Version",
        line=dict(color="cyan"),
        marker=dict(size=6)
    ))

    # Add previous version line
    fig.add_trace(go.Scatter(
        x=sorted_uids,
        y=previous_rewards,
        mode="lines+markers",
        name="Previous Version",
        line=dict(color="orange"),
        marker=dict(size=6)
    ))

    # Update layout for dark theme and categorical X-axis
    fig.update_layout(
        title=f"Average Reward Comparison for Task: {task}",
        xaxis_title="UID",
        yaxis_title="Average Reward",
        template="plotly_dark",
        legend=dict(
            x=0.01,
            y=0.99,
            bgcolor="rgba(0,0,0,0)",
            bordercolor="rgba(255,255,255,0)"
        ),
        xaxis=dict(
            type="category",
            categoryorder="array",
            categoryarray=sorted_uids,
            tickangle=45 if many_uids else 0,
            tickfont=dict(size=8 if many_uids else 10)
        )
    )

    fig.update_xaxes(showgrid=True, gridcolor="rgba(255,255,255,0.1)")
    fig.update_yaxes(showgrid=True, gridcolor="rgba(255,255,255,0.1)")
    fig.show()

In [4]:
# Spot check: average "summarization" reward of UID 355 in the current version.
current_reward_per_task.loc[
    (current_reward_per_task["uid"] == 355)
    & (current_reward_per_task["task"] == "summarization")
]

Unnamed: 0,task,uid,average_reward
5362,summarization,355,0.374975


In [5]:
# Spot check: average "summarization" reward of UID 355 in the previous version.
previous_reward_per_task.loc[
    (previous_reward_per_task["uid"] == 355)
    & (previous_reward_per_task["task"] == "summarization")
]

Unnamed: 0,task,uid,average_reward
5354,summarization,355,0.467765


In [6]:
# Print a few example rows per task: challenge, reference, some completions and rewards.
# Reads df_current["rewards"]; plot() adds that column in place when it is missing.
samples_num = 3
tasks = df_current["task"].unique()
for task in tasks:
    print(f"======= TASK: {task} =======")
    # Filter once per task instead of on every sample (and again for the rewards).
    task_df = df_current[df_current["task"] == task]
    # Show at most samples_num rows, fewer when the task has fewer rows.
    for idx in range(min(samples_num, len(task_df))):
        print(f"TASK: {task} | RUN: {idx}")
        print(f"Challenge\n {task_df.challenge.iloc[idx]}")
        print("-" * 30)
        print(f"Reference\n {task_df.reference.iloc[idx]}")
        print("-" * 30)
        # NOTE: despite the label, these are the first three completions, not a random draw.
        print("Random 3 miners:")
        for comp in task_df.completions.iloc[idx][:3]:
            print(comp)
        print("=" * 30)
        # First five entries of this row's reward vector.
        print(task_df.rewards.iloc[idx][:5])

TASK: multi_choice | RUN: 0
Challenge
 [Example 1]
What is the capital of Texas?
A. Paris
B. London
C. Austin
D. Houston
Answer: C

[Input Question]
Where is Paugussett State Forest located?

A. On the northern shore of Lake Lillinonah and Lake Zoar in the town of New Haven
B. On the southern shore of Lake Lillinonah and Lake Zoar in the town of Stamford
C. On the western shore of Lake Lillinonah and Lake Zoar in the town of Newtown
D. On the eastern shore of Lake Lillinonah and Lake Zoar in the town of Hartford
Answer: 
------------------------------
Reference
 C
------------------------------
Random 3 miners:
{"C":1}
{"C":1}
{"C":1}
[1. 1. 1. 1. 1.]
TASK: multi_choice | RUN: 1
Challenge
 [Example 1]
What is the capital of Texas?
A. Paris
B. London
C. Austin
D. Houston
Answer: C

[Input Question]
Which constituency did M. Kannan represent in the Tamil Nadu legislative assembly?

A. Kandamangalam East
B. Tamil Nadu North
C. Kandamangalam
D. Tamil Nadu South
Answer: 
-------------------