In [None]:
import os
import json
import matplotlib.pyplot as plt
import numpy as np
from eval import get_run_metrics

In [None]:
# Run directories of the two models being compared (CoT-I vs. CoT-IO).
# Both paths are placeholders: substitute real run ids before running this cell.
cot_i_run_path = "../models/cot-I_3nn/YOUR_COT_I_RUN_ID"  # Replace with actual run_id
cot_io_run_path = "../models/cot-IO_3nn/YOUR_COT_IO_RUN_ID"  # Replace with actual run_id


def _stepwise_loss(metrics, step, run_label):
    """Return ``metrics["stepwise_loss.<step>"]`` as a NumPy array.

    Args:
        metrics: Mapping returned by ``get_run_metrics`` for one run.
        step: Integer index of the chain-of-thought step (0, 1 or 2).
        run_label: Human-readable run name, used only in the warning text.

    Returns:
        The stored losses wrapped in ``np.array``. When the key is absent an
        empty array is returned (so the cell keeps running), and a warning is
        printed so that a blank curve is not mistaken for real data.
    """
    key = "stepwise_loss." + str(step)
    if key not in metrics:
        print("WARNING: '{}' not found in {} metrics; using an empty array.".format(key, run_label))
    return np.array(metrics.get(key, []))


# Load metrics
print("Loading CoT-I metrics...")
cot_i_metrics = get_run_metrics(cot_i_run_path, step=-1, cache=True)

print("Loading CoT-IO metrics...")
cot_io_metrics = get_run_metrics(cot_io_run_path, step=-1, cache=True)

# Extract stepwise losses (each is expected to be a list of length n_points)
# stepwise_loss.0 = s1, stepwise_loss.1 = s2, stepwise_loss.2 = y
cot_i_step0, cot_i_step1, cot_i_step2 = (
    _stepwise_loss(cot_i_metrics, step, "CoT-I") for step in range(3)
)
cot_io_step0, cot_io_step1, cot_io_step2 = (
    _stepwise_loss(cot_io_metrics, step, "CoT-IO") for step in range(3)
)

# Create x-axis: position index (1-indexed for "n points seen").
# The axis length is taken from the CoT-I step-0 series only.
n_points = len(cot_i_step0)
n_values = np.arange(1, n_points + 1)  # 1, 2, ..., n_points

# Every curve is later drawn against n_values, so flag any series whose shape
# differs instead of letting the mismatch surface as an opaque plotting error.
_series_shapes = {
    series.shape
    for series in (
        cot_i_step0, cot_i_step1, cot_i_step2,
        cot_io_step0, cot_io_step1, cot_io_step2,
    )
}
if len(_series_shapes) > 1:
    print(
        "WARNING: stepwise loss arrays have differing shapes {}; "
        "n_values follows CoT-I step 0.".format(sorted(_series_shapes))
    )

# Plot each step for both models: one panel per chain-of-thought step,
# with the CoT-I and CoT-IO curves overlaid in every panel.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# (panel title, CoT-I losses, CoT-IO losses) for each of the three steps.
panels = [
    ("Step 0 (s1)", cot_i_step0, cot_io_step0),
    ("Step 1 (s2)", cot_i_step1, cot_io_step1),
    ("Step 2 (y)", cot_i_step2, cot_io_step2),
]
# (legend label, marker) for the two models, in plotting order.
series_styles = [("CoT-I", "o"), ("CoT-IO", "s")]

for ax, (title, cot_i_loss, cot_io_loss) in zip(axes, panels):
    for (label, marker), loss in zip(series_styles, (cot_i_loss, cot_io_loss)):
        # Build the 1-indexed x-axis from this series' own length. Reusing one
        # axis sized from a single series makes Axes.plot raise a shape
        # mismatch as soon as any other series has a different length.
        x = np.arange(1, len(loss) + 1)
        ax.plot(x, loss, label=label, linewidth=2, marker=marker, markersize=3)
    ax.set_xlabel("Number of in-context examples (n)", fontsize=11)
    ax.set_ylabel("Test Error", fontsize=11)
    ax.set_title(title, fontsize=12)
    ax.legend(fontsize=10)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
import os
from tqdm import tqdm

from eval import get_run_metrics

# Root folder holding all trained models, and the task sub-folder to evaluate.
run_dir = "../models"
task = "cot_2nn"
# List the run_id of every run to evaluate; an empty list makes the loop a no-op.
run_ids = []

# All runs of the task live under the same directory, so resolve it once.
task_dir = os.path.join(run_dir, task)

for run_id in tqdm(run_ids):
    run_path = os.path.join(task_dir, run_id)
    # Called for its side effects only; the returned metrics are discarded here.
    get_run_metrics(run_path)

print("DONE!")