In [61]:
# !pip install stable-baselines3[extra]


In [7]:
import os
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

### Loading the environment

In [8]:
# loading the environment
env_name = 'CartPole-v0'
env = gym.make(env_name, render_mode='human')


In [6]:
episodes = 5

for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0

    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, info,_ = env.step(action)
        score += reward
    print(f"Episode: {episode} Score: {score}")
# env.close()


Episode: 1 Score: 36.0
Episode: 2 Score: 14.0
Episode: 3 Score: 22.0
Episode: 4 Score: 13.0
Episode: 5 Score: 30.0


In [7]:
env.close()

### Understanding the environment

In [8]:
env.action_space

Discrete(2)

In [9]:
env.action_space.sample()

1

In [10]:
env.observation_space

Box([-4.8               -inf -0.41887903        -inf], [4.8               inf 0.41887903        inf], (4,), float32)

### Train an RL model

In [14]:
log_path = os.path.join('Training', 'Logs')


In [37]:
env = gym.make(env_name, render_mode='human')
env = DummyVecEnv([lambda: env])
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device


  logger.deprecation(


In [38]:
model.learn(total_timesteps=20000)

Logging to Training/Logs/PPO_2
-----------------------------
| time/              |      |
|    fps             | 47   |
|    iterations      | 1    |
|    time_elapsed    | 43   |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 46          |
|    iterations           | 2           |
|    time_elapsed         | 87          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.010068495 |
|    clip_fraction        | 0.126       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.685      |
|    explained_variance   | -0.00207    |
|    learning_rate        | 0.0003      |
|    loss                 | 5.29        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0199     |
|    value_loss           | 48.9        |
-----------------------------------------
---

<stable_baselines3.ppo.ppo.PPO at 0x7efe71313f20>

### Save and reload the model

In [16]:
PPO_Path = os.path.join('Training', 'SavedModel', 'PPO_Model_CartPole')

In [16]:
import gymnasium
import sys

# Monkey-patch to make gymnasium look like gym
sys.modules['gym'] = gymnasium
import gym
gym.__version__ = "26.2.0"  # Or any dummy version string


In [34]:
model.save(PPO_Path)

In [39]:
# del model
# model = PPO.load(PPO_Path, env=env)

### Evaluation

In [47]:
eval_env = gym.make(env_name, render_mode="human")
evaluate_policy(model, eval_env, n_eval_episodes=10, render=True)

(200.0, 0.0)

In [48]:
eval_env.close()

### Testing our model

In [58]:
env = gym.make(env_name)
env = DummyVecEnv([lambda: env])
episodes = 5

for episode in range(1, episodes+1):
    obs = env.reset()
    done = False
    score = 0

    while not done:
        env.render()
        action, _= model.predict(obs)
        obs, reward, done, info= env.step(action)
        score += reward
    print(f"Episode: {episode} Score: {score}")
# env.close()

Episode: 1 Score: [200.]
Episode: 2 Score: [200.]
Episode: 3 Score: [200.]
Episode: 4 Score: [200.]
Episode: 5 Score: [200.]


In [56]:
env.step(action)

(array([[-0.02249668, -0.43952107,  0.03527744,  0.5846608 ]],
       dtype=float32),
 array([1.], dtype=float32),
 array([False]),
 [{'TimeLimit.truncated': False}])

### Logging with tensorboard
Good to do this in the command prompt since it can lock up the notebook here

In [62]:
training_log_path = os.path.join(log_path, 'PPO_1')

In [4]:
# !tensorboard --logdir={training_log_path} # in the terminal as well.
!ls Training

Logs  SavedModel


### Adding a callback to the training stage

In [9]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [11]:
env = gym.make(env_name)
env = DummyVecEnv([lambda: env])
save_path = os.path.join('Training', 'SavedModel')

In [18]:
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1)

eval_callback = EvalCallback(
    env, callback_on_new_best=stop_callback, eval_freq=10000,
    verbose=1, 
    best_model_save_path=save_path
)

In [19]:
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

model.learn(total_timesteps=20000, callback=eval_callback)

Using cpu device
Logging to Training/Logs/PPO_4
-----------------------------
| time/              |      |
|    fps             | 1266 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 988         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008301033 |
|    clip_fraction        | 0.091       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | 0.00269     |
|    learning_rate        | 0.0003      |
|    loss                 | 6.66        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0144     |
|    value_loss           | 51.9        |
----------------------------

<stable_baselines3.ppo.ppo.PPO at 0x7760e75af8c0>

### changing policies

In [21]:
new_arch = [dict(pi=[128,128,128,128], vf=[128,128,128,128])]

In [22]:
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path, policy_kwargs={'net_arch': new_arch})

model.learn(total_timesteps=20000, callback=eval_callback)

Using cpu device
Logging to Training/Logs/PPO_5




-----------------------------
| time/              |      |
|    fps             | 1219 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 759         |
|    iterations           | 2           |
|    time_elapsed         | 5           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.015390305 |
|    clip_fraction        | 0.206       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.682      |
|    explained_variance   | -0.00228    |
|    learning_rate        | 0.0003      |
|    loss                 | 2.97        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.024      |
|    value_loss           | 19.8        |
-----------------------------------------
----------------------------------



Eval num_timesteps=10000, episode_reward=200.00 +/- 0.00
Episode length: 200.00 +/- 0.00
------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 200          |
|    mean_reward          | 200          |
| time/                   |              |
|    total_timesteps      | 10000        |
| train/                  |              |
|    approx_kl            | 0.0155266775 |
|    clip_fraction        | 0.164        |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.562       |
|    explained_variance   | 0.607        |
|    learning_rate        | 0.0003       |
|    loss                 | 8.83         |
|    n_updates            | 40           |
|    policy_gradient_loss | -0.0186      |
|    value_loss           | 40           |
------------------------------------------
------------------------------
| time/              |       |
|    fps             | 637   |
|    iterations      | 5     |
|    time_ela

<stable_baselines3.ppo.ppo.PPO at 0x7760e7482ba0>

### Using an alternate algorithm

In [23]:
from stable_baselines3 import DQN

In [24]:
model = DQN('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

model.learn(total_timesteps=20000, callback=eval_callback)

Using cpu device
Logging to Training/Logs/DQN_1
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.955    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 4298     |
|    time_elapsed     | 0        |
|    total_timesteps  | 94       |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.901    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 1806     |
|    time_elapsed     | 0        |
|    total_timesteps  | 208      |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.539    |
|    n_updates        | 26       |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.854    |
| time/               |          |
|    episodes         | 12       |
|    fp



----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 708      |
|    fps              | 675      |
|    time_elapsed     | 14       |
|    total_timesteps  | 9626     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000239 |
|    n_updates        | 2381     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 712      |
|    fps              | 678      |
|    time_elapsed     | 14       |
|    total_timesteps  | 9804     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000247 |
|    n_updates        | 2425     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rat

<stable_baselines3.dqn.dqn.DQN at 0x7760e74b25a0>