In [53]:
import gymnasium as gym
from pathlib import Path
from gymnasium.wrappers import RecordVideo
from datetime import datetime
from stable_baselines3 import PPO

In [54]:
# Root folder that collects the artefacts of every PPO CartPole run.
docs_path = Path("../../documentation/cartpole/ppo-cartpole")

# Timestamp used to name this run's folder. Only digits, "-" and "_" are used so
# the name is a valid directory on every OS: ":" and ">" are rejected in Windows
# file names, and ">" additionally needs quoting in a shell.
run_id = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
run_path = docs_path / f"run_{run_id}"

# Per-run output locations.
video_dir = run_path / "videos"
graphs_dir = run_path / "graphs"
report_file = run_path / "run_log.md"
model_dir = run_path / "model"

# parents=True also creates run_path (and docs_path) on the way down;
# exist_ok=True keeps the cell re-runnable.
for output_dir in (docs_path, graphs_dir, video_dir, model_dir):
    output_dir.mkdir(parents=True, exist_ok=True)

# Base path the trained model is saved under.
model_file = model_dir / "ppo_cartpole"


In [55]:
# Create CartPole with rendering to RGB arrays, which is what the video wrapper captures.
cartpole_env = gym.make("CartPole-v1", render_mode="rgb_array")


def _always_record(episode_index):
    """Episode trigger that selects every episode for recording."""
    return True


# Wrap the environment so recorded episodes land in this run's video folder.
# NOTE(review): the same wrapped env is later passed to PPO and used for the
# evaluation episode, so the "random_baseline" prefix also labels those videos
# - confirm this naming is intended.
env = RecordVideo(
    cartpole_env,
    video_folder=str(video_dir),
    episode_trigger=_always_record,
    name_prefix="cartpole_random_baseline",
)

  logger.warn(


In [56]:
# Build a PPO agent with an MLP policy on the wrapped CartPole environment.
# A fixed seed makes training repeatable under "Restart Kernel -> Run All";
# 42 matches the seed used for the evaluation reset further down.
model = PPO("MlpPolicy", env, verbose=1, seed=42)

# NOTE(review): 1000 timesteps are requested, but the training log of this cell
# reports total_timesteps = 2048 - presumably PPO always completes a full
# rollout (n_steps) before stopping; confirm against the PPO documentation.
model.learn(total_timesteps=1000)

# Persist the trained agent under this run's model folder.
model.save(str(model_file))

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 21.2     |
|    ep_rew_mean     | 21.2     |
| time/              |          |
|    fps             | 134      |
|    iterations      | 1        |
|    time_elapsed    | 15       |
|    total_timesteps | 2048     |
---------------------------------


In [57]:
# Reset the environment with a fixed seed to obtain the first observation of
# the evaluation episode.
observation, info = env.reset(seed=42)

# Summarise the environment's interface and the episode's starting point.
print(f"Action space: {env.action_space}")
print(f"Observation space: {env.observation_space}")
print(f"Starting observation: {observation}")
print(f"maximum number of steps per episode: {env.spec.max_episode_steps}")

Action space: Discrete(2)
Observation space: Box([-4.8               -inf -0.41887903        -inf], [4.8               inf 0.41887903        inf], (4,), float32)
Starting observation: [ 0.0273956  -0.00611216  0.03585979  0.0197368 ]
maxiumum number of steps per episode: 500


In [58]:
# Bookkeeping for the evaluation episode: number of steps taken, accumulated
# reward, and the flag that stops the rollout loop.
step, total_reward, episode_over = 0, 0, False

In [59]:
# Human-readable names for the observation components, in vector order.
labels = ["cart position", "cart velocity", "pole angle", "pole angular velocity"]

# Play one episode with the trained policy and log every transition.
# try/finally guarantees env.close() runs even if predict()/step() raises, so
# the environment (and its video recorder) is always shut down.
try:
    while not episode_over:
        step += 1
        # The second value returned by predict() is not used in this loop.
        action, _state = model.predict(observation, deterministic=True)
        observation, reward, terminated, truncated, info = env.step(action)

        total_reward += reward
        # The episode stops on either a failure (terminated) or the time limit (truncated).
        episode_over = terminated or truncated

        print(f"Step {step}:")
        print(f"Action taken: {action}")

        for label, observe in zip(labels, observation):
            print(f"{label}: {observe}")

        print(f"Reward: {reward}")
        print(f"Terminated: {terminated}, Truncated: {truncated}")
        print("-" * 50)

    print(f"Episode finished! Total reward: {total_reward}")
finally:
    env.close()

Step 1:
Action taken: 1
cart position: 0.02727336250245571
cart velocity: 0.18847766518592834
pole angle: 0.036254528909921646
pole angular velocity: -0.26141977310180664
Reward: 1.0
Terminated: False, Truncated: False
--------------------------------------------------
Step 2:
Action taken: 0
cart position: 0.0310429148375988
cart velocity: -0.007142550311982632
pole angle: 0.03102613240480423
pole angular velocity: 0.04247424006462097
Reward: 1.0
Terminated: False, Truncated: False
--------------------------------------------------
Step 3:
Action taken: 1
cart position: 0.03090006485581398
cart velocity: 0.1875210702419281
pole angle: 0.0318756178021431
pole angular velocity: -0.24026045203208923
Reward: 1.0
Terminated: False, Truncated: False
--------------------------------------------------
Step 4:
Action taken: 0
cart position: 0.03465048596262932
cart velocity: -0.0080413818359375
pole angle: 0.027070408686995506
pole angular velocity: 0.062304068356752396
Reward: 1.0
Terminated:

In [60]:
# Markdown description of the four observation components. The ranges quoted
# here are the observation-space bounds, not the termination thresholds.
obs_explanation = """\
**Observation vector (4 values):**
1. **Cart Position (m)** — horizontal position on the track (≈ -4.8 to +4.8).
2. **Cart Velocity (m/s)** — how fast the cart moves (unbounded float in practice).
3. **Pole Angle (rad)** — tilt of the pole relative to vertical (≈ -0.4189 to +0.4189 rad ≈ ±24°).
4. **Pole Angular Velocity (rad/s)** — how fast the pole is rotating (unbounded float in practice).
"""

# CartPole-v1 terminates at half of the observation-space bounds: ±12° for the
# pole angle and ±2.4 m for the cart position (see the Gymnasium CartPole docs).
failure_conditions = """\
**Episode ends when (termination/truncation):**
- **Pole tilt exceeds ±0.2095 rad (~±12°)** → `terminated = True`
- **Cart position leaves ±2.4 m (cart centre reaches the edge of the display)** → `terminated = True`
- **Time limit of 500 steps is reached** → `truncated = True`
"""

# Assemble the whole report first, then write it in one call.
report_sections = [
    "# SCRUM-15: Researching Cartpole test write\n\n",
    "## Environment Details\n",
    f"- Action space: {env.action_space}\n",
    f"- Observation space: {env.observation_space}\n",
    f"- Maximum steps per episode: {env.spec.max_episode_steps}\n\n",
    "## Observation Meaning\n",
    obs_explanation + "\n",
    "## Failure Conditions\n",
    failure_conditions + "\n",
    "## Example Run\n",
    # `observation` was overwritten on every step of the rollout loop, so at
    # this point it holds the last observation of the episode, not the first.
    f"- Final observation: {observation.tolist()}\n",
    f"- Total reward: {total_reward}\n",
]

# Explicit UTF-8: the text contains non-ASCII characters (≈, —, ±, °, →) that
# fail to encode under some platform default encodings (e.g. cp1252).
report_file.write_text("".join(report_sections), encoding="utf-8")
