In [1]:
import gymnasium
import math
from gymnasium.envs.classic_control.cartpole import CartPoleEnv
from gymnasium import logger, spaces
import numpy as np
from gymnasium.envs.registration import register


In [17]:
class SwingPole(CartPoleEnv):
    """https://gymnasium.farama.org/environments/classic_control/cart_pole/"""
    def __init__(self, render_mode: str | None = None):
        super().__init__(render_mode=render_mode)
        self.x_threshold = 3
        self.step_num = 0
        high = np.array(
            [
                self.x_threshold * 2,
                np.finfo(np.float32).max,
                np.finfo(np.float32).max,
                # 4*math.pi,
                np.finfo(np.float32).max,
            ],
            dtype=np.float32,
        )

        self.action_space = spaces.Discrete(2)
        self.observation_space = spaces.Box(-high, high, dtype=np.float32)
    
    def step(self, action):
        obs, rew, _, _, info =  super().step(action)
        self.step_num += 1
        term = False
        x, x_dot, theta, theta_dot = self.state
        term = bool(
            x < -self.x_threshold
            or x > self.x_threshold
            or self.step_num > 1000
            or abs(theta_dot) < 0.01
        )
        rew = theta_dot

        return obs, rew, term, False, info

    def reset(self, *, seed: int | None = None, options: dict | None = None):
        self.step_num = 0
        return super().reset(seed=seed, options=options)        

In [None]:
"""Optionally register the environment to use it with gymnasium.make"""
register(
     id="SwingPole",
     entry_point=SwingPole,
)

env = gymnasium.make("SwingPole", render_mode='rgb_array')
model = None

In [5]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

vec_env = make_vec_env(SwingPole, n_envs=10)
model = PPO("MlpPolicy", env, verbose=1, tensorboard_log="./ppo_swingpole_tensorboard/")
model.learn(total_timesteps=100_000, progress_bar=True, tb_log_name="PPO_SwingPole")
model.save("ppo_swingpole")

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to ./ppo_swingpole_tensorboard/PPO_SwingPole_1


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 120      |
|    ep_rew_mean     | 338      |
| time/              |          |
|    fps             | 1093     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 98.1        |
|    ep_rew_mean          | -34.6       |
| time/                   |             |
|    fps                  | 881         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.007813991 |
|    clip_fraction        | 0.0222      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.688      |
|    explained_variance   | 0.00721     |
|    learning_rate        | 0.

In [22]:
"""Here is one episode using the trained algorithm to show a pygame episode"""
from time import sleep
episode_reward = 0
reward = 0
step = 0
env2 = SwingPole(render_mode='human') # get the environment from the model
obs, info = env2.reset()
terminated = truncated = False
while not terminated and not truncated:
    action = model.predict(obs, deterministic=True)[0]  # use the trained model to predict the action
    obs, reward, terminated, truncated, info = env2.step(action)
    episode_reward += reward
    env2.render()
    step += 1
    sleep(0.0001)  # sleep to slow down the rendering a bit

print('episode reward was: ', episode_reward)
print('steps taken: ', step)
env2.close()

  logger.warn(


episode reward was:  14571.514995381593
steps taken:  1001


In [23]:
"""Here is one episode using the trained algorithm to show a pygame episode"""
import moviepy as mpy
episode_reward = 0
reward = 0
step = 0
img_list = []
env2 = SwingPole(render_mode='rgb_array') # get the environment from the model
obs, info = env2.reset()
terminated = truncated = False
while not terminated and not truncated:
    action = model.predict(obs, deterministic=True)[0]  # use the trained model to predict the action
    obs, reward, terminated, truncated, info = env2.step(action)
    episode_reward += reward
    img_list.append(env2.render())
    step += 1

print('episode reward was: ', episode_reward)
print('steps taken: ', step)
clip = mpy.ImageSequenceClip(img_list, fps=30)
clip.write_videofile("cartswing_episode.mp4")

episode reward was:  14698.70725315298
steps taken:  1001
MoviePy - Building video cartswing_episode.mp4.
MoviePy - Writing video cartswing_episode.mp4



                                                                           

MoviePy - Done !
MoviePy - video ready cartswing_episode.mp4
