# Import Dependencies

In [None]:
!pip install 'stable-baselines3[extra]'

In [1]:
import os
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

  from .autonotebook import tqdm as notebook_tqdm


# Load Environment

In [2]:
environment_name = 'CartPole-v0'
env = gym.make(environment_name)

In [4]:
episodes = 5
for episode in range(episodes+1):
    state = env.reset()
    done = False
    score = 0

    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score += reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

Episode:0 Score:38.0
Episode:1 Score:31.0
Episode:2 Score:20.0
Episode:3 Score:14.0
Episode:4 Score:16.0
Episode:5 Score:11.0


In [9]:
env.observation_space
env.step(1)

(array([ 0.1986826 ,  1.2201842 , -0.31454405, -2.3454444 ], dtype=float32),
 0.0,
 True,
 {})

# Training

In [11]:
#Make your directories first
log_path = os.path.join('Training', 'Logs')
log_path

'Training/Logs'

In [12]:
env = gym.make(environment_name)
env = DummyVecEnv([lambda: env])
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cuda device


In [33]:
model.learn(total_timesteps=20000)

Logging to Training/Logs/PPO_4
-----------------------------
| time/              |      |
|    fps             | 1152 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 833          |
|    iterations           | 2            |
|    time_elapsed         | 4            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0045640413 |
|    clip_fraction        | 0.0203       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.546       |
|    explained_variance   | 0.446        |
|    learning_rate        | 0.0003       |
|    loss                 | 0.297        |
|    n_updates            | 130          |
|    policy_gradient_loss | -0.00173     |
|    value_loss           | 2.62         |
----------------------------

<stable_baselines3.ppo.ppo.PPO at 0x7f3a05df08e0>

# Save and Reload Model

In [18]:
ppo_path = os.path.join('Training', 'Saved Models', 'ppo_model_cart_model')

In [21]:
model.save(ppo_path)

In [22]:
del model

In [23]:
model = PPO.load(ppo_path, env=env)

# Evaluation

In [24]:
evaluate_policy(model, env, n_eval_episodes=10, render=True)



(200.0, 0.0)

In [None]:
env.close()

# Testing

In [29]:
episodes = 5
for episode in range(episodes+1):
    obs = env.reset()
    done = False
    score = 0

    while not done:
        env.render()
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        score += reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

Episode:0 Score:[200.]
Episode:1 Score:[200.]
Episode:2 Score:[200.]
Episode:3 Score:[200.]
Episode:4 Score:[200.]
Episode:5 Score:[200.]


# Viewing logs in Tensorboard

In [37]:
training_log_path = os.path.join('src', log_path, 'PPO_4')

In [None]:
!tensorboard --logdir=training_log_path

TensorFlow installation not found - running with reduced feature set.

NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784

I0605 23:45:59.788526 140006597322304 plugin.py:429] Monitor runs begin
Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.10.0 at http://localhost:6006/ (Press CTRL+C to quit)


In [39]:
training_log_path

'src/Training/Logs/PPO_4'