Loading Dependencies

In [3]:
import os
import gym
from stable_baselines3.ppo import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

Environment Setup

In [4]:
environment_name = 'CartPole-v1'
env = gym.make(environment_name)
#env = gym.wrappers.Monitor(env, 'recordings')

In [5]:
environment_name

'CartPole-v1'

In [6]:
episodes = 5
for episode in range(1, episodes):
    state = env.reset()
    done = False
    score = 0
    
    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score += reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

Episode:1 Score:14.0
Episode:2 Score:21.0
Episode:3 Score:33.0
Episode:4 Score:16.0


In [7]:
env.reset()

array([ 0.04125516, -0.00577319,  0.03185437,  0.01989638], dtype=float32)

In [8]:
env.action_space

Discrete(2)

In [9]:
env.observation_space

Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)

In [10]:
log_path = os.path.join('Training', 'Logs')

Model Generation and Training

In [11]:
env = gym.make(environment_name)
env = DummyVecEnv([lambda: env])
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cuda device


In [12]:
model.learn(total_timesteps=20000)

Logging to Training\Logs\PPO_1
-----------------------------
| time/              |      |
|    fps             | 232  |
|    iterations      | 1    |
|    time_elapsed    | 8    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 259         |
|    iterations           | 2           |
|    time_elapsed         | 15          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009249961 |
|    clip_fraction        | 0.107       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | -0.0055     |
|    learning_rate        | 0.0003      |
|    loss                 | 6.65        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0168     |
|    value_loss           | 52.2        |
-----------------------------------------
---

<stable_baselines3.ppo.ppo.PPO at 0x18875917dc8>

In [13]:
PPO_Path = os.path.join('Training', 'Saved Models', 'PPO_CartPole')

In [14]:
model.save(PPO_Path)

Model Evaluation

In [15]:
evaluate_policy(model, env, n_eval_episodes=10, render=True)



(500.0, 0.0)

In [16]:
episodes = 5
for episode in range(1, episodes):
    state = env.reset()
    done = False
    score = 0
    
    while not done:
        env.render()
        action, _states = model.predict(state)
        state, reward, done, info = env.step(action)
        score += reward
    print('Episode:{} Score:{}'.format(episode, score))


Episode:1 Score:[500.]
Episode:2 Score:[500.]
Episode:3 Score:[500.]
Episode:4 Score:[500.]


In [17]:
env.close()

Prediction

In [18]:
obs = env.reset()
action, _ = model.predict(obs)

In [19]:
env.step(action)

(array([[-0.03600594, -0.19213681,  0.02276718,  0.2989331 ]],
       dtype=float32),
 array([1.], dtype=float32),
 array([False]),
 [{}])

TensorBoard Logs

In [20]:
training_log_path = os.path.join(log_path, environment_name)

Callbacks and Threshold

In [22]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [23]:
save_path = os.path.join('Training', 'Saved Models')

In [24]:
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1)
eval_callback = EvalCallback(env, callback_on_new_best=stop_callback, best_model_save_path = save_path, eval_freq=10000, n_eval_episodes=10, verbose=1)

In [25]:
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cuda device


In [26]:
model.learn(total_timesteps=20000, callback=eval_callback)

Logging to Training\Logs\PPO_2
-----------------------------
| time/              |      |
|    fps             | 473  |
|    iterations      | 1    |
|    time_elapsed    | 4    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 391          |
|    iterations           | 2            |
|    time_elapsed         | 10           |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0076647336 |
|    clip_fraction        | 0.0956       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.686       |
|    explained_variance   | -0.00177     |
|    learning_rate        | 0.0003       |
|    loss                 | 10.1         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0148      |
|    value_loss           | 57.6         |
----------------------------

<stable_baselines3.ppo.ppo.PPO at 0x18801f96d88>

Changing Policy (New Architechture)

In [27]:
new_arch = [dict(pi=[128, 128, 128, 128],   vf=[128, 128, 128, 128])]

In [28]:
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path, policy_kwargs=dict(net_arch=new_arch))

Using cuda device


In [29]:
model.learn(total_timesteps=20000, callback=eval_callback)

Logging to Training\Logs\PPO_3
-----------------------------
| time/              |      |
|    fps             | 467  |
|    iterations      | 1    |
|    time_elapsed    | 4    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 344         |
|    iterations           | 2           |
|    time_elapsed         | 11          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.014867399 |
|    clip_fraction        | 0.201       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.682      |
|    explained_variance   | -0.00655    |
|    learning_rate        | 0.0003      |
|    loss                 | 2.37        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0206     |
|    value_loss           | 17.8        |
-----------------------------------------
---

<stable_baselines3.ppo.ppo.PPO at 0x18801f8b6c8>

Alternate Model

In [36]:
from stable_baselines3.a2c import A2C

In [37]:
model = A2C('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cuda device


In [38]:
model.learn(total_timesteps=20000, callback=eval_callback)

Logging to Training\Logs\A2C_1
------------------------------------
| time/                 |          |
|    fps                | 155      |
|    iterations         | 100      |
|    time_elapsed       | 3        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -0.69    |
|    explained_variance | 0.0332   |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 1.25     |
|    value_loss         | 5.48     |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 162      |
|    iterations         | 200      |
|    time_elapsed       | 6        |
|    total_timesteps    | 1000     |
| train/                |          |
|    entropy_loss       | -0.571   |
|    explained_variance | -0.0576  |
|    learning_rate      | 0.0007   |
|    n_updates          | 199      |
|    policy_loss        | 1.39     |
|    va



Eval num_timesteps=10000, episode_reward=354.40 +/- 87.17
Episode length: 354.40 +/- 87.17
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 354      |
|    mean_reward        | 354      |
| time/                 |          |
|    total_timesteps    | 10000    |
| train/                |          |
|    entropy_loss       | -0.49    |
|    explained_variance | 0.000416 |
|    learning_rate      | 0.0007   |
|    n_updates          | 1999     |
|    policy_loss        | 0.0206   |
|    value_loss         | 0.012    |
------------------------------------
------------------------------
| time/              |       |
|    fps             | 206   |
|    iterations      | 2000  |
|    time_elapsed    | 48    |
|    total_timesteps | 10000 |
------------------------------
------------------------------------
| time/                 |          |
|    fps                | 208      |
|    iterations         | 2100     |
|    time_elapsed       | 

<stable_baselines3.a2c.a2c.A2C at 0x188019416c8>

In [39]:
evaluate_policy(model, env, n_eval_episodes=10, render=True)

(52.3, 8.649277426467485)

In [40]:
env.close()