In [20]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold
from stable_baselines3 import PPO
import gym

environment_name = 'CartPole-v0'

In [21]:
import os
PPO_PATH = os.path.join('Training','Saved Models','PPO_MODEL_Cartpole')

log_path = os.path.join('Training','Logs')
save_path = os.path.join('Training','Saved Models')

In [22]:
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200,verbose=1)
eval_callback = EvalCallback(env,callback_on_new_best=stop_callback,eval_freq=10000,best_model_save_path=save_path,verbose=1)

In [23]:
env = gym.make(environment_name)
model = PPO('MlpPolicy',env,verbose=1,tensorboard_log=log_path)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [24]:
model.learn(total_timesteps=20000,callback=eval_callback)

Logging to Training\Logs\PPO_10
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 23.1     |
|    ep_rew_mean     | 23.1     |
| time/              |          |
|    fps             | 455      |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 26.8        |
|    ep_rew_mean          | 26.8        |
| time/                   |             |
|    fps                  | 358         |
|    iterations           | 2           |
|    time_elapsed         | 11          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.007472491 |
|    clip_fraction        | 0.0916      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | -0.00819    

<stable_baselines3.ppo.ppo.PPO at 0x2bf10f64150>

In [25]:
# Changing Policies
net_arch = [dict(pi=[128,128,128,128],vf=[128,128,128,128])]

In [27]:
model = PPO('MlpPolicy',env,verbose=1,tensorboard_log=log_path,policy_kwargs={'net_arch':net_arch})

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




In [28]:
model.learn(total_timesteps=20000,callback=eval_callback)

Logging to Training\Logs\PPO_11
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 23.4     |
|    ep_rew_mean     | 23.4     |
| time/              |          |
|    fps             | 320      |
|    iterations      | 1        |
|    time_elapsed    | 6        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 29.1        |
|    ep_rew_mean          | 29.1        |
| time/                   |             |
|    fps                  | 277         |
|    iterations           | 2           |
|    time_elapsed         | 14          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.016139835 |
|    clip_fraction        | 0.254       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.681      |
|    explained_variance   | -0.00156    



Eval num_timesteps=10000, episode_reward=200.00 +/- 0.00
Episode length: 200.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 200         |
|    mean_reward          | 200         |
| time/                   |             |
|    total_timesteps      | 10000       |
| train/                  |             |
|    approx_kl            | 0.011129186 |
|    clip_fraction        | 0.133       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.557      |
|    explained_variance   | 0.6         |
|    learning_rate        | 0.0003      |
|    loss                 | 10.4        |
|    n_updates            | 40          |
|    policy_gradient_loss | -0.0171     |
|    value_loss           | 36          |
-----------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 74.4     |
|    ep_rew_mean     | 74.4     |
| time/            

<stable_baselines3.ppo.ppo.PPO at 0x2bf10f9f690>

In [29]:
#using diff Algorithms
from stable_baselines3 import DQN

In [31]:
model = DQN('MlpPolicy',env,verbose=1,tensorboard_log=log_path)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




In [32]:
model.learn(total_timesteps=20000,callback=eval_callback)

Logging to Training\Logs\DQN_1
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 21.8     |
|    ep_rew_mean      | 21.8     |
|    exploration_rate | 0.959    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 4345     |
|    time_elapsed     | 0        |
|    total_timesteps  | 87       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 26.1     |
|    ep_rew_mean      | 26.1     |
|    exploration_rate | 0.901    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 5181     |
|    time_elapsed     | 0        |
|    total_timesteps  | 209      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 22.8     |
|    ep_rew_mean      | 22.8     |
|    exploration_rate | 0.87     |
| time/               | 



Eval num_timesteps=9520, episode_reward=9.60 +/- 0.80
Episode length: 9.60 +/- 0.80
----------------------------------
| eval/               |          |
|    mean_ep_length   | 9.6      |
|    mean_reward      | 9.6      |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 9520     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 21       |
|    ep_rew_mean      | 21       |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 452      |
|    fps              | 4630     |
|    time_elapsed     | 2        |
|    total_timesteps  | 9596     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 21       |
|    ep_rew_mean      | 21       |
|    exploration_rate | 0.05     |
| time/               |          |
|    e

<stable_baselines3.dqn.dqn.DQN at 0x2bf133ee5d0>