In [65]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from datetime import datetime
from scipy import stats

In [66]:
def find_latest_run(base_path):
    """Return the most recent timestamped run directory under ``base_path``.

    Run directories are expected to be named ``run_<YYYY-MM-DD_HH-MM-SS>``.
    Entries that match ``run_*`` but are not directories, or whose suffix is
    not a valid timestamp in that format, are ignored instead of aborting
    the whole lookup.

    Args:
        base_path: Directory (str or Path) that contains the run directories.

    Returns:
        Path of the directory with the latest timestamp in its name.

    Raises:
        FileNotFoundError: If no timestamped run directory exists.
    """
    base_path = Path(base_path)
    prefix = "run_"
    stamped_runs = []
    for candidate in base_path.glob("run_*"):
        if not candidate.is_dir():
            continue
        try:
            stamp = datetime.strptime(candidate.name[len(prefix):], "%Y-%m-%d_%H-%M-%S")
        except ValueError:
            # e.g. "run_backup": matches the glob but carries no timestamp.
            continue
        stamped_runs.append((stamp, candidate))
    if not stamped_runs:
        raise FileNotFoundError(f"No run directories found in {base_path}")
    # Compare on the parsed timestamp only; Path objects are never compared.
    return max(stamped_runs, key=lambda pair: pair[0])[1]

def load_monitor_file(monitor_dir):
    """Load one monitor CSV from ``monitor_dir`` into a DataFrame.

    When several ``*.csv`` files are present, the one whose path sorts first
    is used. Sorting makes the choice deterministic; the raw ``glob`` order
    depends on the operating system and filesystem.

    Args:
        monitor_dir: Directory (str or Path) that holds the CSV file(s).

    Returns:
        pandas.DataFrame parsed from the selected file. The first physical
        line is skipped, so the second line supplies the column names.

    Raises:
        FileNotFoundError: If the directory contains no ``*.csv`` file.
    """
    monitor_dir = Path(monitor_dir)
    csv_files = sorted(monitor_dir.glob("*.csv"))
    if not csv_files:
        raise FileNotFoundError(f"No CSV files found in {monitor_dir}")
    # skiprows=1 drops the first line of the file.
    # NOTE(review): presumably a metadata line written by the monitor wrapper - confirm.
    return pd.read_csv(csv_files[0], skiprows=1)

def calculate_metrics(df, success_threshold=475):
    """Summarise the reward column ``r`` of a monitor DataFrame.

    Args:
        df: DataFrame with a numeric column ``'r'``.
        success_threshold: A row counts as a success when its reward is
            greater than or equal to this value. Defaults to 475.
            NOTE(review): presumably the environment's "solved" score - confirm.

    Returns:
        dict with the keys ``'mean'``, ``'std'`` (pandas default, i.e. sample
        standard deviation), ``'median'``, ``'max'`` and ``'success'``
        (percentage of rows meeting ``success_threshold``).
    """
    rewards = df['r']
    return {
        'mean': rewards.mean(),
        'std': rewards.std(),
        'median': rewards.median(),
        'max': rewards.max(),
        # Mean of the boolean mask is the fraction of successes; scale to percent.
        'success': np.mean(rewards >= success_threshold) * 100,
    }

In [67]:
# Resolve the project root. When the code runs as a script, __file__ exists
# and the root is two levels above the file's directory; otherwise (e.g. in a
# notebook kernel) it is two levels above the current working directory.
if '__file__' in globals():
    root = Path(__file__).resolve().parents[2]
else:
    root = Path.cwd().parents[1]
docs = root.joinpath("documentation", "cartpole")

# Newest run directory of each agent.
ppo_run = find_latest_run(docs.joinpath("ppo-cartpole"))
base_run = find_latest_run(docs.joinpath("random-baseline"))

# Monitor logs of both runs as DataFrames.
ppo_df = load_monitor_file(ppo_run.joinpath("monitor"))
base_df = load_monitor_file(base_run.joinpath("monitor"))

# Every execution writes into a fresh, timestamped comparison directory.
comparison_dir = docs.joinpath(
    "comparison",
    f"comparison_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}",
)
graphs_dir = comparison_dir.joinpath("graphs")
graphs_dir.mkdir(parents=True, exist_ok=True)

In [68]:
# Summary statistics of the reward column for each run.
ppo_metrics, base_metrics = (calculate_metrics(frame) for frame in (ppo_df, base_df))

# Relative change of the PPO mean reward versus the baseline mean, in percent.
improvement = (
    (ppo_metrics['mean'] - base_metrics['mean'])
    / base_metrics['mean']
    * 100
)

# Two-sample t-test on the reward columns; equal_var=False selects the
# variant that does not assume equal variances (Welch's t-test).
t_stat, p_val = stats.ttest_ind(
    ppo_df['r'],
    base_df['r'],
    equal_var=False,
)


In [69]:
def plot_curves(ppo, base, out):
    """Plot the raw reward series of both runs and save them as ``curves.png``.

    Args:
        ppo: DataFrame of the PPO run; its ``'r'`` column is plotted.
        base: DataFrame of the random-baseline run; its ``'r'`` column is plotted.
        out: Path-like directory supporting ``/``; the image is written there.
    """
    fig, ax = plt.subplots(figsize=(10, 5))

    # Each Series is plotted against its own index.
    ax.plot(ppo['r'], label="PPO", color='tab:blue')
    ax.plot(base['r'], label="Random Baseline", color='tab:red')

    ax.set_xlabel("Episode")
    ax.set_ylabel("Reward")
    ax.set_title("Training Curves Comparison")
    ax.legend()
    ax.grid(alpha=0.3)

    fig.savefig(out / "curves.png", dpi=300, bbox_inches='tight')
    plt.close(fig)

def plot_overlay(ppo, base, out, window=10):
    """Plot the rolling-mean reward of both runs and save it as ``overlay.png``.

    Args:
        ppo: DataFrame of the PPO run; uses its ``'r'`` column.
        base: DataFrame of the random-baseline run; uses its ``'r'`` column.
        out: Output directory (str or Path).
        window: Number of consecutive rows in each rolling mean. Defaults
            to 10, which reproduces the original "MA-10" curves and labels.
    """
    fig, ax = plt.subplots(figsize=(10, 5))
    try:
        # The first window-1 points of each rolling mean are NaN and are not drawn.
        ax.plot(ppo['r'].rolling(window).mean(), label=f"PPO (MA-{window})", color='tab:blue')
        ax.plot(base['r'].rolling(window).mean(), label=f"Baseline (MA-{window})", color='tab:red')
        ax.set_title("Overlayed Moving Averages")
        ax.set_xlabel("Episode")
        ax.set_ylabel("Reward")
        ax.legend()
        ax.grid(alpha=0.3)
        fig.savefig(Path(out) / "overlay.png", dpi=300, bbox_inches='tight')
    finally:
        # Release the figure even when plotting or saving fails.
        plt.close(fig)

def plot_metrics(ppo_metrics, base_metrics, out):
    """Draw a grouped bar chart of the summary metrics and save ``metrics.png``.

    Args:
        ppo_metrics: dict with the keys ``'mean'``, ``'median'``, ``'max'``
            and ``'success'`` for the PPO run.
        base_metrics: dict with the same keys for the baseline run.
        out: Path-like directory supporting ``/``; the image is written there.
    """
    metric_keys = ('mean', 'median', 'max', 'success')
    tick_labels = ["Mean", "Median", "Max", "Success Rate (%)"]
    ppo_heights = [ppo_metrics[key] for key in metric_keys]
    base_heights = [base_metrics[key] for key in metric_keys]

    positions = np.arange(len(tick_labels))
    bar_width = 0.35

    fig, ax = plt.subplots(figsize=(8, 5))
    # Two bars per metric, shifted half a bar width to either side of the tick.
    ax.bar(positions - bar_width / 2, ppo_heights, bar_width, label='PPO', color='tab:blue')
    ax.bar(positions + bar_width / 2, base_heights, bar_width, label='Baseline', color='tab:red')

    ax.set_xticks(positions)
    ax.set_xticklabels(tick_labels)
    ax.set_ylabel("Value")
    ax.set_title("Performance Metrics Comparison")
    ax.legend()
    ax.grid(alpha=0.3, axis='y')

    fig.savefig(out / "metrics.png", dpi=300, bbox_inches='tight')
    plt.close(fig)

def plot_reward_distribution(ppo, base, out):
    """Overlay density histograms of both reward columns; save ``reward_distribution.png``.

    Args:
        ppo: DataFrame of the PPO run; uses its ``'r'`` column.
        base: DataFrame of the random-baseline run; uses its ``'r'`` column.
        out: Path-like directory supporting ``/``; the image is written there.
    """
    fig, ax = plt.subplots(figsize=(8, 5))

    # bins=30 is passed to each hist call separately; density=True normalises
    # each histogram so the bars integrate to one.
    series_specs = (
        (ppo, 'PPO', 'tab:blue'),
        (base, 'Baseline', 'tab:red'),
    )
    for frame, legend_name, colour in series_specs:
        ax.hist(frame['r'], bins=30, alpha=0.6, label=legend_name, color=colour, density=True)

    ax.set_title("Reward Distribution per Episode")
    ax.set_xlabel("Reward")
    ax.set_ylabel("Density")
    ax.legend()
    ax.grid(alpha=0.3)

    fig.tight_layout()
    fig.savefig(out / "reward_distribution.png", dpi=300, bbox_inches='tight')
    plt.close(fig)

def plot_stability(ppo, base, out, window=10):
    """Plot the rolling standard deviation of both reward columns; save ``stability.png``.

    Args:
        ppo: DataFrame of the PPO run; uses its ``'r'`` column.
        base: DataFrame of the random-baseline run; uses its ``'r'`` column.
        out: Output directory (str or Path).
        window: Number of consecutive rows in each rolling standard
            deviation. Defaults to 10, matching the original behaviour.
    """
    fig, ax = plt.subplots(figsize=(10, 5))
    try:
        ax.plot(ppo['r'].rolling(window).std(), label="PPO (Rolling Std)", color='tab:blue')
        ax.plot(base['r'].rolling(window).std(), label="Baseline (Rolling Std)", color='tab:red')
        ax.set_title("Rolling Standard Deviation (Stability Over Time)")
        ax.set_xlabel("Episode")
        # The plotted quantity is a rolling std, not a moving average, so the
        # axis label names the window instead of the former "MA-10".
        ax.set_ylabel(f"Std of Reward (rolling window = {window})")
        ax.legend()
        ax.grid(alpha=0.3)
        fig.tight_layout()
        fig.savefig(Path(out) / "stability.png", dpi=300, bbox_inches='tight')
    finally:
        # Release the figure even when plotting or saving fails.
        plt.close(fig)

def plot_cumulative(ppo, base, out):
    """Plot the running total of both reward columns and save ``cumulative.png``.

    Args:
        ppo: DataFrame of the PPO run; uses its ``'r'`` column.
        base: DataFrame of the random-baseline run; uses its ``'r'`` column.
        out: Path-like directory supporting ``/``; the image is written there.
    """
    ppo_running_total = np.cumsum(ppo['r'])
    base_running_total = np.cumsum(base['r'])

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(ppo_running_total, label="PPO (Cumulative)", color='tab:blue')
    ax.plot(base_running_total, label="Baseline (Cumulative)", color='tab:red')

    ax.set_title("Cumulative Rewards Over Episodes")
    ax.set_xlabel("Episode")
    ax.set_ylabel("Total Reward")
    ax.legend()
    ax.grid(alpha=0.3)

    fig.tight_layout()
    fig.savefig(out / "cumulative.png", dpi=300, bbox_inches='tight')
    plt.close(fig)

def plot_overlay_with_mean(ppo, base, out, window=10):
    """Plot rolling-mean rewards plus each run's overall mean; save ``overlay_with_mean.png``.

    Args:
        ppo: DataFrame of the PPO run; uses its ``'r'`` column.
        base: DataFrame of the random-baseline run; uses its ``'r'`` column.
        out: Output directory (str or Path).
        window: Number of consecutive rows in each rolling mean. Defaults
            to 10, which reproduces the original "MA-10" curves and labels.
    """
    # Compute each overall mean once; it is used for both the line and its label.
    ppo_mean = ppo['r'].mean()
    base_mean = base['r'].mean()

    fig, ax = plt.subplots(figsize=(10, 5))
    try:
        ax.plot(ppo['r'].rolling(window).mean(), label=f"PPO (MA-{window})", color='tab:blue')
        ax.plot(base['r'].rolling(window).mean(), label=f"Baseline (MA-{window})", color='tab:red')

        # Dashed horizontal reference lines at the overall mean of each run.
        ax.axhline(ppo_mean, color='blue', linestyle='--', alpha=0.5, label=f"PPO Mean ({ppo_mean:.1f})")
        ax.axhline(base_mean, color='red', linestyle='--', alpha=0.5, label=f"Baseline Mean ({base_mean:.1f})")

        ax.set_title("Moving Averages + Mean Reward Levels")
        ax.set_xlabel("Episode")
        ax.set_ylabel("Reward")
        ax.legend()
        ax.grid(alpha=0.3)
        fig.tight_layout()
        fig.savefig(Path(out) / "overlay_with_mean.png", dpi=300, bbox_inches='tight')
    finally:
        # Release the figure even when plotting or saving fails.
        plt.close(fig)

In [70]:
# Render every comparison figure into graphs_dir, in a fixed order.
# plot_metrics consumes the metric dicts; every other plotter takes the frames.
_plot_jobs = (
    (plot_curves, (ppo_df, base_df)),
    (plot_overlay, (ppo_df, base_df)),
    (plot_metrics, (ppo_metrics, base_metrics)),
    (plot_reward_distribution, (ppo_df, base_df)),
    (plot_stability, (ppo_df, base_df)),
    (plot_cumulative, (ppo_df, base_df)),
    (plot_overlay_with_mean, (ppo_df, base_df)),
)
for _plotter, _plot_inputs in _plot_jobs:
    _plotter(*_plot_inputs, graphs_dir)

In [71]:
# Console report of the headline comparison numbers.
print("\n*PPO v Random Baseline Comparison*")
print(f"PPO Mean Reward: {ppo_metrics['mean']:.2f}")
print(f"Baseline Mean Reward: {base_metrics['mean']:.2f}")
print(f"Improvement: {improvement:.2f}%")
# Scientific notation for the p-value: a fixed 7-decimal format prints any
# value below 5e-8 as "0.0000000", which reads as an exact zero.
print(f"t-stat: {t_stat:.3f}, p-value: {p_val:.3e}")


*PPO v Random Baseline Comparison*
PPO Mean Reward: 37.40
Baseline Mean Reward: 20.40
Improvement: 83.34%
t-stat: 6.387, p-value: 0.0000000


In [72]:
# Persist the headline numbers as a Markdown summary next to the graphs.
summary_file = comparison_dir / "summary.md"
# Explicit UTF-8 so the file's encoding does not depend on the platform default.
with open(summary_file, "w", encoding="utf-8") as f:
    f.write(f"# PPO vs Random Baseline Comparison ({datetime.now():%Y-%m-%d %H:%M:%S})\n\n")
    f.write(f"- PPO Mean Reward: **{ppo_metrics['mean']:.2f}**\n")
    f.write(f"- Baseline Mean Reward: **{base_metrics['mean']:.2f}**\n")
    f.write(f"- Improvement: **{improvement:.2f}%**\n")
    f.write(f"- t-stat: {t_stat:.3f}\n")
    # Scientific notation: a fixed 7-decimal format would write any p-value
    # below 5e-8 as "0.0000000".
    f.write(f"- p-value: {p_val:.3e}\n")
