# PPO CartPole 

In [262]:
import gymnasium as gym
from pathlib import Path
from gymnasium.wrappers import RecordVideo
from datetime import datetime
from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.evaluation import evaluate_policy
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [263]:
# Root folder that collects the artefacts of the PPO CartPole runs.
docs_path = Path("../../documentation/cartpole/ppo-cartpole")

# Every run writes into its own folder named after the current timestamp.
run_id = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
run_path = docs_path / f"run_{run_id}"

# Per-run output locations.
video_dir = run_path / "videos"
graphs_dir = run_path / "graphs"
report_file = run_path / "run_log.md"
model_dir = run_path / "model"
monitor_dir = run_path / "monitor"

# Create the folders up front; parents=True also creates run_path itself.
for output_dir in (docs_path, graphs_dir, video_dir, model_dir, monitor_dir):
    output_dir.mkdir(parents=True, exist_ok=True)

model_file = model_dir / "ppo_cartpole"
# The monitor target is the directory itself (as a string), not a file inside it.
monitor_file = str(monitor_dir)

## Training Model

In [264]:
# Build CartPole-v1 (rgb_array rendering) and wrap it in Monitor in one expression;
# Monitor is given the monitor path prepared in the configuration cell.
env = Monitor(
    gym.make("CartPole-v1", render_mode="rgb_array"),
    monitor_file,
)

In [265]:
# Train PPO with an MLP policy on the monitored environment.
# A fixed seed makes the training run repeatable; 42 matches the seed passed to
# env.reset() later in this notebook.
model = PPO("MlpPolicy", env, verbose=1, seed=42)

# total_timesteps is a lower bound: the captured run reports 10240 timesteps,
# presumably because PPO collects whole rollouts (2048 steps per iteration in the log).
model.learn(total_timesteps=10000)

# Persist the trained policy into this run's model folder.
model.save(str(model_file))

Using cpu device
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 22.4     |
|    ep_rew_mean     | 22.4     |
| time/              |          |
|    fps             | 6151     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 25.9        |
|    ep_rew_mean          | 25.9        |
| time/                   |             |
|    fps                  | 4065        |
|    iterations           | 2           |
|    time_elapsed         | 1           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009278709 |
|    clip_fraction        | 0.117       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_varia

In [266]:
# Start a fresh, seeded episode and summarise the setup in a single print.
observation, info = env.reset(seed=42)

setup_summary = [
    f"Total Timesteps:{model.num_timesteps}",
    f"Action space: {env.action_space}",
    f"Observation space: {env.observation_space}",
    f"Starting observation: {observation}",
    f"maxiumum number of steps per episode: {env.spec.max_episode_steps}",
]
print("\n".join(setup_summary))

Total Timesteps:10240
Action space: Discrete(2)
Observation space: Box([-4.8               -inf -0.41887903        -inf], [4.8               inf 0.41887903        inf], (4,), float32)
Starting observation: [ 0.0273956  -0.00611216  0.03585979  0.0197368 ]
maxiumum number of steps per episode: 500


In [267]:
# Episode bookkeeping: steps taken, accumulated reward, and the loop's stop flag.
step, total_reward, episode_over = 0, 0, False

In [268]:
labels = ["cart position", "cart velocity", "pole angle", "pole angular velocity"]

# Run the demo episode on a separate, un-monitored environment. `env` is wrapped in
# Monitor, so stepping it here would append this post-training episode to the same
# monitor log that plot_training_curve() later plots as training data.
# Seed 42 is the seed already used for env.reset() earlier in the notebook.
eval_env = gym.make("CartPole-v1")
observation, info = eval_env.reset(seed=42)

# Initialise the bookkeeping in this cell so the cell can be re-run on its own.
step = 0
total_reward = 0
episode_over = False

try:
    while not episode_over:
        step += 1
        # The second return value of predict() is not used by this loop.
        action, _ = model.predict(observation, deterministic=True)
        observation, reward, terminated, truncated, info = eval_env.step(action)

        total_reward += reward
        # Stop on failure (terminated) as well as on the time limit (truncated).
        episode_over = terminated or truncated

        print(f"Step {step}:")
        print(f"Action taken: {action}")

        for label, observe in zip(labels, observation):
            print(f"{label}: {observe}")

        print(f"Reward: {reward}")
        print(f"Terminated: {terminated}, Truncated: {truncated}")
        print("-" * 50)
finally:
    # Always release the demo environment, even if a step raises.
    eval_env.close()

print(f"Episode finished! Total reward: {total_reward}")
# Training is finished at this point, so the training environment is closed here,
# exactly as before.
env.close()

Step 1:
Action taken: 1
cart position: 0.02727336250245571
cart velocity: 0.18847766518592834
pole angle: 0.036254528909921646
pole angular velocity: -0.26141977310180664
Reward: 1.0
Terminated: False, Truncated: False
--------------------------------------------------
Step 2:
Action taken: 0
cart position: 0.0310429148375988
cart velocity: -0.007142550311982632
pole angle: 0.03102613240480423
pole angular velocity: 0.04247424006462097
Reward: 1.0
Terminated: False, Truncated: False
--------------------------------------------------
Step 3:
Action taken: 1
cart position: 0.03090006485581398
cart velocity: 0.1875210702419281
pole angle: 0.0318756178021431
pole angular velocity: -0.24026045203208923
Reward: 1.0
Terminated: False, Truncated: False
--------------------------------------------------
Step 4:
Action taken: 0
cart position: 0.03465048596262932
cart velocity: -0.0080413818359375
pole angle: 0.027070408686995506
pole angular velocity: 0.062304068356752396
Reward: 1.0
Terminated:

## Report Document

In [269]:
# Static Markdown snippets that are embedded verbatim in the run report.
obs_explanation = """\
**Observation vector (4 values):**
1. **Cart Position (m)** — horizontal position on the track (≈ -4.8 to +4.8).
2. **Cart Velocity (m/s)** — how fast the cart moves (unbounded float in practice).
3. **Pole Angle (rad)** — tilt of the pole relative to vertical (≈ -0.4189 to +0.4189 rad ≈ ±24°).
4. **Pole Angular Velocity (rad/s)** — how fast the pole is rotating (unbounded float in practice).
"""

failure_conditions = """\
**Episode ends when (termination/truncation):**
- **Pole tilt exceeds ±0.4189 rad (~±24°)** → `terminated = True`
- **Cart position leaves track bounds (≈ ±4.8 m)** → `terminated = True`
- **Time limit of 500 steps is reached** → `truncated = True`
"""

# Collect every fragment of the report in document order, then write the file once.
report_parts = [
    # Environment facts read from the live env object.
    "## Environment Details\n",
    f"- Action space: {env.action_space}\n",
    f"- Observation space: {env.observation_space}\n",
    f"- Maximum steps per episode: {env.spec.max_episode_steps}\n\n",
    # Fixed explanatory sections.
    "## Observation Meaning\n",
    obs_explanation + "\n",
    "## Failure Conditions\n",
    failure_conditions + "\n",
    # Results of the training run and of the rollout episode.
    "## Performance Summary:\n",
    f"- Total Timesteps: {model.num_timesteps}\n",
    f"- Algorithm: {type(model).__name__}\n",
    f"- Observation: {observation.tolist()}\n",
    f"- Total reward: {total_reward}\n",
    # Reading guide for the two saved graphs.
    "## Graph Interpretation:\n",
    "- Training curve shows how total episode rewards increased during training.\n",
    "- Evaluation curve shows the mean performance of the trained model at fixed intervals.\n",
    "- A smooth upward trend in both curves indicates stable learning.\n",
    "- The final rewards approaching 500 suggest the agent solved the CartPole task.\n\n",
]

with open(report_file, mode="w", encoding="utf-8") as f:
    f.write("".join(report_parts))


## Plotting (Evaluation Curve + Training Curve)

In [270]:
def generate_evaluation_curve(model, graphs_dir, num_eval_episodes=20, eval_freq=200, total_timesteps=2000):
    """
    Repeatedly evaluate ``model`` and save the mean rewards as a line plot.

    The model is NOT trained inside this function: every point comes from a
    separate ``evaluate_policy()`` call on the same, already-trained model.
    The x-axis values (``eval_freq``, ``2 * eval_freq``, ... up to
    ``total_timesteps``) are therefore nominal labels, not training progress,
    and differences between points only reflect episode-to-episode variation.

    NOTE(review): to chart performance *during* training, evaluation has to be
    interleaved with ``model.learn()`` (for example through a
    Stable-Baselines3 ``EvalCallback``) - confirm which of the two is intended.

    Args:
        model: Trained model handed to ``evaluate_policy``.
        graphs_dir: Directory (``str`` or ``Path``) that receives
            ``evaluation_curve.png``.
        num_eval_episodes: Episodes averaged for each plotted point.
        eval_freq: Spacing between consecutive x-axis values; must be > 0.
        total_timesteps: Largest x-axis value (inclusive).

    Raises:
        ValueError: If ``eval_freq`` is not positive.
    """
    # Fail before any environment or figure is created.
    if eval_freq <= 0:
        raise ValueError(f"eval_freq must be a positive integer, got {eval_freq!r}")

    rewards = []
    timesteps = []

    # evaluate_policy() warns when the evaluation env is not wrapped in Monitor;
    # without a filename the wrapper only tracks episode statistics in memory.
    env = Monitor(gym.make("CartPole-v1"))
    try:
        for step in range(eval_freq, total_timesteps + 1, eval_freq):
            # Each call runs num_eval_episodes episodes and returns (mean, std);
            # only the mean is plotted.
            mean_reward, _ = evaluate_policy(
                model, env, n_eval_episodes=num_eval_episodes, deterministic=True
            )
            rewards.append(mean_reward)
            timesteps.append(step)
    finally:
        # Release the environment even if an evaluation fails.
        env.close()

    # Explicit figure/axes so the figure can be closed reliably below.
    fig, ax = plt.subplots(figsize=(8, 5))
    try:
        ax.plot(timesteps, rewards, marker="o", label="Mean Reward (Evaluation)", color="tab:orange")
        ax.set_xlabel("Timesteps")
        ax.set_ylabel("Mean Reward over Episodes")
        ax.set_title("evaluation Curve (PPO on CartPole)")
        ax.grid(True, linestyle="--", alpha=0.6)
        ax.legend()

        # Accept str as well as Path, like plot_training_curve does.
        graph_file = Path(graphs_dir) / "evaluation_curve.png"
        fig.savefig(graph_file, dpi=200, bbox_inches="tight")
    finally:
        plt.close(fig)

    print(f"evaluation curve saved to {graph_file}")


# Evaluate the trained model and save the curve into this run's graphs folder.
generate_evaluation_curve(model=model, graphs_dir=graphs_dir)




evaluation curve saved to ../../documentation/cartpole/ppo-cartpole/run_2025-10-10_14-08-34/graphs/evaluation_curve.png


In [271]:
def plot_training_curve(monitor_dir, graphs_dir):
    """
    Plot the per-episode rewards stored in a monitor CSV and save the figure.

    The function looks for ``*.csv`` files in ``monitor_dir``, reads the first
    one in sorted order, and plots its ``r`` column (one value per logged
    episode) against the episode index. The result is written to
    ``training_learning_curve.png`` inside ``graphs_dir``.

    * TLDR: graph shows the reward of every episode found in the monitor log *

    Args:
        monitor_dir: Directory (``str`` or ``Path``) containing the monitor CSV.
        graphs_dir: Directory (``str`` or ``Path``) that receives the PNG.

    Raises:
        FileNotFoundError: If ``monitor_dir`` contains no ``*.csv`` file.
    """
    monitor_dir = Path(monitor_dir)

    # glob() yields files in no guaranteed order; sorting makes the choice
    # deterministic when more than one CSV is present.
    csv_files = sorted(monitor_dir.glob("*.csv"))
    if not csv_files:
        raise FileNotFoundError(f"No monitor CSV found in {monitor_dir}")
    log_file = csv_files[0]

    print(f"Found monitor file: {log_file.name}")

    # The first line is skipped before the column header is parsed -
    # presumably the metadata line the Monitor wrapper writes; confirm.
    df = pd.read_csv(log_file, skiprows=1)

    # Explicit figure/axes so the figure is closed even if plotting fails.
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        ax.plot(df["r"], label="Reward per Episode", color="tab:blue", alpha=0.7)
        ax.set_xlabel("Episodes")
        ax.set_ylabel("Total Reward")
        ax.set_title("Training Learning Curve (PPO on CartPole)")
        ax.grid(True, linestyle="--", alpha=0.6)
        ax.legend()
        out = Path(graphs_dir) / "training_learning_curve.png"
        fig.savefig(out, dpi=200, bbox_inches="tight")
    finally:
        plt.close(fig)
    print(f"Saved training learning curve → {out}")

# monitor_file holds the monitor directory path as a string.
plot_training_curve(monitor_dir=monitor_file, graphs_dir=graphs_dir)


Found monitor file: monitor.csv
Saved training learning curve → ../../documentation/cartpole/ppo-cartpole/run_2025-10-10_14-08-34/graphs/training_learning_curve.png
