In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from datetime import datetime
from scipy import stats

In [10]:
def find_latest_run(base_path):
    """Return the most recent timestamped run directory under ``base_path``.

    Candidate directories match ``run_*`` and must carry a name of the form
    ``run_%Y-%m-%d_%H-%M-%S``. Entries that are not directories, or whose
    name does not parse as that timestamp, are ignored rather than aborting
    the whole lookup.

    Args:
        base_path: Directory (str or Path) that contains the run folders.

    Returns:
        Path of the run directory with the latest timestamp in its name.

    Raises:
        FileNotFoundError: If no timestamped run directory exists.
    """
    base_path = Path(base_path)
    stamped_runs = []
    for candidate in base_path.glob("run_*"):
        if not candidate.is_dir():
            continue
        try:
            stamp = datetime.strptime(candidate.name.replace("run_", ""), "%Y-%m-%d_%H-%M-%S")
        except ValueError:
            # e.g. "run_backup": not a timestamped run, so it cannot be ranked.
            continue
        stamped_runs.append((stamp, candidate))
    if not stamped_runs:
        raise FileNotFoundError(f"No run directories found in {base_path}")
    # Rank by the parsed timestamp, not by the raw directory name.
    return max(stamped_runs, key=lambda pair: pair[0])[1]

def load_monitor_file(monitor_dir):
    """Load one monitor CSV from ``monitor_dir`` into a DataFrame.

    When the directory holds several ``*.csv`` files, the one that sorts
    first by path is used, so the choice is the same on every run and
    platform (plain ``glob`` order is arbitrary).

    Args:
        monitor_dir: Directory (str or Path) containing the monitor CSV.

    Returns:
        pandas.DataFrame parsed from the selected file, with its first line
        skipped before the header row is read.

    Raises:
        FileNotFoundError: If the directory contains no ``*.csv`` file.
    """
    monitor_dir = Path(monitor_dir)
    csv_files = sorted(monitor_dir.glob("*.csv"))
    if not csv_files:
        raise FileNotFoundError(f"No CSV files found in {monitor_dir}")
    # skiprows=1 drops the file's first line; presumably a metadata line that
    # precedes the column header -- confirm against whatever writes these files.
    return pd.read_csv(csv_files[0], skiprows=1)

def calculate_metrics(df, success_threshold=475):
    """Summarise the per-episode rewards of one training run.

    Args:
        df: DataFrame with an ``'r'`` column (read here as episode rewards)
            and an ``'l'`` column (summed here as timesteps).
        success_threshold: Minimum reward for an episode to count as a
            success. Defaults to 475, the value this function previously
            hard-coded.

    Returns:
        dict with keys ``'mean'``, ``'std'``, ``'median'``, ``'max'``
        (statistics of ``'r'``), ``'success'`` (percentage of rows with
        ``r >= success_threshold``) and ``'timesteps'`` (sum of ``'l'``).
    """
    rewards = df['r']
    total_timesteps = df['l'].sum()
    # Mean of a boolean mask is the fraction of True rows; scale to percent.
    success_rate = np.mean(rewards >= success_threshold) * 100
    return {
        'mean': rewards.mean(),
        'std': rewards.std(),
        'median': rewards.median(),
        'max': rewards.max(),
        'success': success_rate,
        'timesteps': total_timesteps
    }

In [11]:
# Project root: the third ancestor of this file when executed as a script,
# otherwise the second ancestor of the current working directory.
if '__file__' in globals():
    root = Path(__file__).resolve().parents[2]
else:
    root = Path.cwd().parents[1]
docs = root.joinpath("documentation", "cartpole")

# Most recent run directory for each algorithm.
ppo_run = find_latest_run(docs.joinpath("ppo-cartpole"))
dqn_run = find_latest_run(docs.joinpath("dqn-cartpole"))

# Episode logs from each run's "monitor" folder.
ppo_df = load_monitor_file(ppo_run.joinpath("monitor"))
dqn_df = load_monitor_file(dqn_run.joinpath("monitor"))

# Fresh, timestamped output folder for this comparison and its figures.
run_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
comparison_dir = docs.joinpath("comparison", f"ppo_vs_dqn_{run_stamp}")
graphs_dir = comparison_dir.joinpath("graphs")
graphs_dir.mkdir(parents=True, exist_ok=True)

In [12]:
# Summary statistics for each algorithm's run.
ppo_metrics = calculate_metrics(ppo_df)
dqn_metrics = calculate_metrics(dqn_df)

# Gap between the mean rewards, expressed as a percentage of DQN's mean.
mean_gap = ppo_metrics['mean'] - dqn_metrics['mean']
improvement = mean_gap / dqn_metrics['mean'] * 100

# Two-sample t-test on the 'r' columns without assuming equal variances.
t_stat, p_val = stats.ttest_ind(
    ppo_df['r'],
    dqn_df['r'],
    equal_var=False,
)

In [13]:
def plot_metrics(ppo_metrics, dqn_metrics, out):
    """Save a grouped bar chart of headline metrics to ``out / "metrics.png"``.

    Args:
        ppo_metrics: Mapping with 'mean', 'median', 'max' and 'success' for PPO.
        dqn_metrics: Mapping with the same keys for DQN.
        out: Path of the directory that receives the image.
    """
    metric_keys = ['mean', 'median', 'max', 'success']
    tick_labels = ["Mean", "Median", "Max", "Success Rate (%)"]
    ppo_heights = [ppo_metrics[key] for key in metric_keys]
    dqn_heights = [dqn_metrics[key] for key in metric_keys]

    positions = np.arange(len(tick_labels))
    bar_width = 0.35

    fig, ax = plt.subplots(figsize=(8, 5))
    # Offset each series by half a bar width so the pairs sit side by side.
    ax.bar(positions - bar_width / 2, ppo_heights, bar_width, label='PPO', color='tab:orange')
    ax.bar(positions + bar_width / 2, dqn_heights, bar_width, label='DQN', color='tab:blue')
    ax.set_xticks(positions)
    ax.set_xticklabels(tick_labels)
    ax.set_ylabel("Value")
    ax.set_title("Performance Metrics Comparison")
    ax.legend()
    ax.grid(alpha=0.3, axis='y')

    fig.savefig(out / "metrics.png", dpi=300, bbox_inches='tight')
    plt.close(fig)

def plot_cumulative(ppo, dqn, out):
    """Save a line chart of running reward totals to ``out / "cumulative.png"``.

    Args:
        ppo: DataFrame with an 'r' column for the PPO run.
        dqn: DataFrame with an 'r' column for the DQN run.
        out: Path of the directory that receives the image.
    """
    ppo_running_total = np.cumsum(ppo['r'])
    dqn_running_total = np.cumsum(dqn['r'])

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(ppo_running_total, label="PPO (Cumulative)", color='tab:orange')
    ax.plot(dqn_running_total, label="DQN (Cumulative)", color='tab:blue')
    ax.set_title("Cumulative Rewards Over Episodes (PPO vs DQN)")
    ax.set_xlabel("Episode")
    ax.set_ylabel("Total Reward")
    ax.legend()
    ax.grid(alpha=0.3)

    fig.tight_layout()
    fig.savefig(out / "cumulative.png", dpi=300, bbox_inches='tight')
    plt.close(fig)

def plot_overlay_with_mean(ppo, dqn, out):
    """Save 10-row moving averages plus overall mean lines for both runs.

    The figure is written to ``out / "overlay_with_mean.png"``.

    Args:
        ppo: DataFrame with an 'r' column for the PPO run.
        dqn: DataFrame with an 'r' column for the DQN run.
        out: Path of the directory that receives the image.
    """
    ppo_rewards, dqn_rewards = ppo['r'], dqn['r']
    ppo_mean = ppo_rewards.mean()
    dqn_mean = dqn_rewards.mean()

    fig, ax = plt.subplots(figsize=(10, 5))

    # Smoothed reward curves.
    ax.plot(ppo_rewards.rolling(10).mean(), label="PPO (MA-10)", color='tab:orange')
    ax.plot(dqn_rewards.rolling(10).mean(), label="DQN (MA-10)", color='tab:blue')

    # Dashed horizontal reference lines at each run's overall mean.
    ax.axhline(ppo_mean, color='orange', linestyle='--', alpha=0.5, label=f"PPO Mean ({ppo_mean:.1f})")
    ax.axhline(dqn_mean, color='blue', linestyle='--', alpha=0.5, label=f"DQN Mean ({dqn_mean:.1f})")

    ax.set_title("Moving Averages + Mean Reward Levels")
    ax.set_xlabel("Episode")
    ax.set_ylabel("Reward")
    ax.legend()
    ax.grid(alpha=0.3)

    fig.tight_layout()
    fig.savefig(out / "overlay_with_mean.png", dpi=300, bbox_inches='tight')
    plt.close(fig)



In [14]:

# Render every comparison figure into the graphs directory.
plot_metrics(ppo_metrics=ppo_metrics, dqn_metrics=dqn_metrics, out=graphs_dir)
plot_cumulative(ppo=ppo_df, dqn=dqn_df, out=graphs_dir)
plot_overlay_with_mean(ppo=ppo_df, dqn=dqn_df, out=graphs_dir)

In [15]:
# Console report of the PPO vs DQN comparison.
print("\n*PPO vs DQN Comparison*")
print(f"PPO Mean Reward: {ppo_metrics['mean']:.2f}")
print(f"DQN Mean Reward: {dqn_metrics['mean']:.2f}")
print(f"Improvement (PPO over DQN): {improvement:.2f}%")
# Scientific notation for the p-value: fixed-point formatting rounds very
# small values to a misleading "0.0000000".
print(f"t-stat: {t_stat:.3f}, p-value: {p_val:.3e}")
print(f"PPO Total Timesteps: {ppo_metrics['timesteps']:,}")
print(f"DQN Total Timesteps: {dqn_metrics['timesteps']:,}")


*PPO vs DQN Comparison*
PPO Mean Reward: 217.80
DQN Mean Reward: 69.37
Improvement (PPO over DQN): 213.97%
t-stat: 14.326, p-value: 0.0000000
PPO Total Timesteps: 100,623
DQN Total Timesteps: 100,100


In [16]:
# Write the comparison results as a Markdown report next to the graphs folder.
summary_file = comparison_dir / "summary.md"
# Explicit UTF-8 so the report does not depend on the platform's default encoding.
with open(summary_file, "w", encoding="utf-8") as f:
    f.write(f"# PPO vs DQN Comparison ({datetime.now():%Y-%m-%d %H:%M:%S})\n\n")
    f.write(f"- PPO Mean Reward: **{ppo_metrics['mean']:.2f}**\n")
    f.write(f"- DQN Mean Reward: **{dqn_metrics['mean']:.2f}**\n")
    f.write(f"- Improvement (PPO over DQN): **{improvement:.2f}%**\n")
    f.write(f"- t-stat: {t_stat:.3f}\n")
    # Scientific notation: fixed-point formatting rounds very small p-values to zero.
    f.write(f"- p-value: {p_val:.3e}\n\n")
    f.write("## Training Details\n")
    f.write(f"- PPO Total Timesteps: {ppo_metrics['timesteps']:,}\n")
    f.write(f"- DQN Total Timesteps: {dqn_metrics['timesteps']:,}\n\n")
    f.write("## Run Sources\n")
    f.write(f"- PPO run: {ppo_run}\n")
    f.write(f"- DQN run: {dqn_run}\n")
