# Import dependencies

In [1]:
import os

import gym
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv

# Load environment

In [2]:
# Single source of truth for the Gym environment id used in this notebook.
env_name = 'CartPole-v1'
# Build the environment from env_name so the id is not duplicated as a literal.
env = gym.make(env_name)

In [3]:
# Baseline: play a few rendered episodes with randomly sampled actions and
# print the total reward collected in each one.
episodes = 5
for episode in range(1, episodes + 1):
    state = env.reset()
    score = 0
    done = False

    while True:
        env.render()
        action = env.action_space.sample()
        next_state, reward, done, info = env.step(action)
        score += reward
        if done:
            # Episode finished; leave the step loop and report the score.
            break
    print(f"Episode: {episode}, Score: {score}")
env.close()

Episode: 1, Score: 28.0
Episode: 2, Score: 15.0
Episode: 3, Score: 22.0
Episode: 4, Score: 30.0
Episode: 5, Score: 12.0


In [4]:
# Start a new episode; the cell output is the observation returned by reset().
env.reset()

array([ 0.0158752 ,  0.04656625, -0.02160056, -0.03279774])

Returns a random action

In [47]:
# Draw one random action from the environment's action space.
env.action_space.sample()

1

In [48]:
# Inspect the action space definition (displayed as the cell output).
env.action_space

Discrete(2)

Returns a random observation

In [49]:
# Draw one random sample from the observation space (sampled from the space
# itself, not produced by stepping the environment).
env.observation_space.sample()

array([2.5956640e+00, 1.4879597e+38, 1.6635396e-01, 1.4719585e+37],
      dtype=float32)

In [8]:
# Inspect the observation space definition (displayed as the cell output).
env.observation_space

Box(-3.4028234663852886e+38, 3.4028234663852886e+38, (4,), float32)

Takes one action in the environment

In [9]:
# Apply action 1 for a single step; the output tuple is
# (observation, reward, done, info), as unpacked in the rollout loop.
env.step(1)

(array([ 0.01680652,  0.2419912 , -0.02225652, -0.33221685]), 1.0, False, {})

In [53]:
# Render the current state of the environment.
env.render()

True

In [None]:
# Close the environment now that the manual exploration is finished.
env.close()

# Train an RL Model

In [12]:
# Directory handed to the models as `tensorboard_log`.
log_path = os.path.join('training', 'logs')

In [13]:
# Show the joined path; the separator depends on the platform.
log_path

'training\\logs'

In [15]:
# Build the vectorized training env from a factory that creates its own
# CartPole instance, instead of a lambda closing over the global name `env`
# that is rebound by this very assignment.
env = DummyVecEnv([lambda: gym.make(env_name)])
# PPO with the default MLP policy; training metrics are logged under log_path.
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device




In [26]:
# Train the PPO model for 20,000 environment timesteps.
model.learn(total_timesteps=20000)

Logging to training\logs\PPO_2


-----------------------------
| time/              |      |
|    fps             | 1321 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 863          |
|    iterations           | 2            |
|    time_elapsed         | 4            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0037674876 |
|    clip_fraction        | 0.0257       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.538       |
|    explained_variance   | 0.465        |
|    learning_rate        | 0.0003       |
|    loss                 | 0.502        |
|    n_updates            | 110          |
|    policy_gradient_loss | -0.000233    |
|    value_loss           | 7.08         |
------------------------------------------
----------------

<stable_baselines3.ppo.ppo.PPO at 0x1c7e9335930>

gym.wrappers.time_limit.TimeLimit

# Save and Reload Model

In [27]:
# Location used by model.save() and PPO.load() for the trained PPO model.
PPO_Path = os.path.join('training', 'saved models', 'ppo_model_cartpole')

In [29]:
# Persist the trained model to PPO_Path.
model.save(PPO_Path)



In [30]:
# Drop the in-memory model so the next cell genuinely reloads it from disk.
del model

In [100]:
# Restore the saved model from PPO_Path and attach `env` to it.
model = PPO.load(PPO_Path, env=env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




# Evaluation

In [57]:
# Run 10 rendered evaluation episodes; the displayed tuple is the mean and
# standard deviation of the per-episode reward.
evaluate_policy(model, env, n_eval_episodes=10, render=True)

(500.0, 0.0)

# Test Model

In [101]:
# Roll out the trained policy for a few rendered episodes and print the
# total reward collected in each one.
episodes = 5
for episode in range(1, episodes + 1):
    obs = env.reset()
    done = False
    score = 0

    while not done:
        env.render()
        # Act deterministically: when testing, take the policy's preferred
        # action instead of sampling from its action distribution, matching
        # the default behaviour of evaluate_policy.
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        score += reward
    print(f"Episode: {episode}, Score: {score}")
env.close()

Episode: 1, Score: 500.0
Episode: 2, Score: 500.0
Episode: 3, Score: 500.0
Episode: 4, Score: 500.0
Episode: 5, Score: 500.0


In [4]:
# Start a new episode and keep the initial observation for the next cells.
obs = env.reset()

In [66]:
# Ask the model for an action given `obs`; the second return value is discarded.
action, _ = model.predict(obs)

In [67]:
# Apply the predicted action for one step and display the resulting tuple.
env.step(action)

(array([ 0.0028137 , -0.16945557,  0.04016525,  0.2665782 ]), 1.0, False, {})

# Viewing Logs in TensorBoard

In [71]:
# Point at one specific run directory under log_path. 'PPO_2' matches the
# "Logging to ..." line printed by the first model.learn() call; the suffix
# changes from run to run (PPO_3, PPO_4 appear later), so adjust as needed.
training_log_path = os.path.join(log_path, 'PPO_2')
training_log_path

'training\\logs\\PPO_2'

In [73]:
%tensorboard --logdir={training_log_path}

^C


# Adding a Callback to the Training Stage

In [74]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [75]:
# Directory given to EvalCallback as `best_model_save_path`.
save_path = os.path.join('training', 'saved models')

In [76]:
# Evaluate on a dedicated environment rather than the training `env`:
# the callback resets and steps the env it is given, which would otherwise
# disturb the rollout the model is collecting on `env`. Monitor records the
# unmodified episode rewards and lengths used for the evaluation statistics.
eval_env = DummyVecEnv([lambda: Monitor(gym.make(env_name))])

# Stop training once the best mean evaluation reward reaches the threshold.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=500, verbose=1)
# Evaluate every 10,000 steps, save the best model under save_path, and
# trigger stop_callback whenever a new best mean reward is found.
eval_callback = EvalCallback(
    eval_env,
    callback_on_new_best=stop_callback,
    eval_freq=10000,
    best_model_save_path=save_path,
    verbose=1,
)



In [77]:
# Create a fresh PPO model to be trained with the evaluation callback.
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




In [78]:
# Train for up to 20,000 timesteps with eval_callback attached, which runs
# periodic evaluations during training.
model.learn(total_timesteps=20000, callback=eval_callback)

Logging to training\logs\PPO_3
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 23.3     |
|    ep_rew_mean     | 23.3     |
| time/              |          |
|    fps             | 1257     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 29          |
|    ep_rew_mean          | 29          |
| time/                   |             |
|    fps                  | 845         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.007785005 |
|    clip_fraction        | 0.0818      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.687      |
|    explained_variance   | -0.0006     |



Eval num_timesteps=10000, episode_reward=427.80 +/- 105.17
Episode length: 427.80 +/- 105.17
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 428         |
|    mean_reward          | 428         |
| time/                   |             |
|    total_timesteps      | 10000       |
| train/                  |             |
|    approx_kl            | 0.006825499 |
|    clip_fraction        | 0.0751      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.608      |
|    explained_variance   | 0.261       |
|    learning_rate        | 0.0003      |
|    loss                 | 25.9        |
|    n_updates            | 40          |
|    policy_gradient_loss | -0.0179     |
|    value_loss           | 65.9        |
-----------------------------------------
New best mean reward!
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 59.2     |
|    ep_rew_mean     | 59.2

<stable_baselines3.ppo.ppo.PPO at 0x1c7f0a07be0>

# Changing Policies

In [89]:
# Four hidden layers of 128 units each for both the policy ("pi") and the
# value-function ("vf") networks.
net_arch = {
    'pi': [128] * 4,
    'vf': [128] * 4,
}

In [90]:
# PPO with the custom network architecture passed through policy_kwargs;
# all other settings match the earlier PPO models.
model = PPO(
    'MlpPolicy',
    env,
    policy_kwargs=dict(net_arch=net_arch),
    tensorboard_log=log_path,
    verbose=1,
)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [91]:
# Train the custom-architecture PPO model with the evaluation callback.
# NOTE(review): this reuses the eval_callback instance already used for the
# previous PPO run — confirm whether its best-reward state should carry over
# or whether a fresh callback is wanted here.
model.learn(total_timesteps=20000, callback=eval_callback)

Logging to training\logs\PPO_4
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 20.4     |
|    ep_rew_mean     | 20.4     |
| time/              |          |
|    fps             | 742      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 28.7        |
|    ep_rew_mean          | 28.7        |
| time/                   |             |
|    fps                  | 468         |
|    iterations           | 2           |
|    time_elapsed         | 8           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.012707664 |
|    clip_fraction        | 0.203       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.681      |
|    explained_variance   | -0.00648    |



Eval num_timesteps=10000, episode_reward=409.20 +/- 106.45
Episode length: 409.20 +/- 106.45
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 409         |
|    mean_reward          | 409         |
| time/                   |             |
|    total_timesteps      | 10000       |
| train/                  |             |
|    approx_kl            | 0.009310633 |
|    clip_fraction        | 0.126       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.569      |
|    explained_variance   | 0.51        |
|    learning_rate        | 0.0003      |
|    loss                 | 12.8        |
|    n_updates            | 40          |
|    policy_gradient_loss | -0.0238     |
|    value_loss           | 42.8        |
-----------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 74.9     |
|    ep_rew_mean     | 74.9     |
| time/        

<stable_baselines3.ppo.ppo.PPO at 0x1c7f06a75b0>

# Using an Alternate Algorithm

In [92]:
from stable_baselines3 import DQN

In [94]:
# Swap the algorithm: a DQN agent on the same env with the same logging setup.
model = DQN('MlpPolicy', env, tensorboard_log=log_path, verbose=1)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [98]:
# Train the DQN model for 80,000 environment timesteps.
model.learn(total_timesteps=80000)

Logging to training\logs\DQN_3
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 21.2     |
|    ep_rew_mean      | 21.2     |
|    exploration_rate | 0.99     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 5097     |
|    time_elapsed     | 0        |
|    total_timesteps  | 85       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 23.1     |
|    ep_rew_mean      | 23.1     |
|    exploration_rate | 0.978    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 5967     |
|    time_elapsed     | 0        |
|    total_timesteps  | 185      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 28.3     |
|    ep_rew_mean      | 28.3     |
|    exploration_rate | 0.96     |
| time/               | 

<stable_baselines3.dqn.dqn.DQN at 0x1c7f0a19360>