The course content and data come from this video:
https://www.youtube.com/watch?v=Mut_u40Sqz4&list=PLhm50nnh4R_QNWQX45Z0u5_C2Yg1PueCu&index=1&t=814s

## Setup

In [1]:
# reinforcement learning package
# documentation: https://stable-baselines3.readthedocs.io/en/master/
!pip install stable-baselines3[extra]



In [2]:
!pip install gym



In [1]:
import os
import gym
from stable_baselines3 import PPO # algorithm
from stable_baselines3.common.vec_env import DummyVecEnv # wrapper to boost training time
from stable_baselines3.common.evaluation import evaluate_policy # avg reward over episodes

## Environment

OpenAI Gym lets you build simulated environments quickly.
Documentation: https://gym.openai.com/docs/



In [None]:
# OpenAI Gym spaces
# Box - continuous values
# Discrete - a set of items
# Tuple - tuple of other spaces (not supported by Stable Baselines3)
# Dict - dictionary of spaces
# MultiBinary - vector of binary (0/1) values (e.g. [0, 1, 0, 0])
# MultiDiscrete - multiple discrete values

In [None]:
# Every environment exposes 2 different spaces:
# Observation (the circumstance the agent sees) and Action (the actions it can take)

In [2]:
# Gym id of the environment used throughout this notebook
env_name = 'CartPole-v0'
# Build the environment instance from its id
env = gym.make(env_name)

In [3]:
env_name  # display the environment id

'CartPole-v0'

In [7]:
# Play a few episodes with purely random actions to see how the environment
# behaves before any training, printing the total reward of each episode.
episodes = 5
try:
    for episode in range(1, episodes + 1):
        state = env.reset()  # initial observation for this episode
        done = False
        score = 0

        while not done:
            env.render()
            action = env.action_space.sample()  # random action (0 or 1)
            # step() returns the next observation, the reward, the done flag and an info dict
            n_state, reward, done, info = env.step(action)
            score += reward

        print('Episode:{}, Score:{}'.format(episode, score))
finally:
    # Close the render window even if the loop is interrupted or raises part-way through.
    env.close()

Episode:1, Score:38.0
Episode:2, Score:10.0
Episode:3, Score:46.0
Episode:4, Score:19.0
Episode:5, Score:59.0


### Understanding the Environment

Documentation: https://github.com/openai/gym/blob/master/gym/envs/classic_control/cartpole.py

In [8]:
env.action_space # Discrete(2): action 0 = left, action 1 = right

Discrete(2)

In [10]:
# NOTE(review): no call parentheses here, so this displays the bound method itself
# rather than drawing an action; use env.action_space.sample() to get a random action.
env.action_space.sample

<bound method Discrete.sample of Discrete(2)>

In [11]:
# 4-element float32 Box with per-entry lower/upper bounds
# (see the cartpole.py link above for what each entry means)
env.observation_space

Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)

In [12]:
# Random point drawn from within the Box bounds (not an actual environment state)
env.observation_space.sample()

array([-3.4120877e+00,  5.4195881e+37,  1.9362877e-01, -2.9853823e+38],
      dtype=float32)

## Training RL Model

In [None]:
# We're focused on
# Model-Free RL -> Policy Optimization -> PPO, A2C
# Model-Free RL -> Q-Learning -> QR-DQN
# Model-free methods learn directly from experience, without building a model of the environment
# Stable Baselines3 only implements model-free algorithms

In [13]:
# Set up the directory layout used below:
#   Training -> Logs          (passed to the models as tensorboard_log)
#   Training -> Saved Models  (where trained models are saved)
# os.path.join only builds the path string; it does not create the folder.
log_path = os.path.join('Training', 'Logs')


In [14]:
# Wrap the environment in a DummyVecEnv (the vectorized-env interface SB3 trains on).
# The gym env is built inside the factory so the lambda does not close over the name
# `env`, which this very statement rebinds to the vectorized wrapper.
env = DummyVecEnv([lambda: gym.make(env_name)])
# 'MlpPolicy' = multi-layer perceptron policy, i.e. a plain fully connected neural network.
# verbose=1 prints training stats; tensorboard_log sets where TensorBoard logs are written.
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [16]:
# Train for a budget of 20,000 environment timesteps; stats are printed and logged under log_path
model.learn(total_timesteps=20000)

Logging to Training\Logs\PPO_1
-----------------------------
| time/              |      |
|    fps             | 2110 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
----------------------------------------
| time/                   |            |
|    fps                  | 1451       |
|    iterations           | 2          |
|    time_elapsed         | 2          |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.00615729 |
|    clip_fraction        | 0.0719     |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.687     |
|    explained_variance   | 0.000759   |
|    learning_rate        | 0.0003     |
|    loss                 | 8.18       |
|    n_updates            | 10         |
|    policy_gradient_loss | -0.011     |
|    value_loss           | 66         |
----------------------------------------
---------------------

<stable_baselines3.ppo.ppo.PPO at 0x1ce49e70190>

## Save and Reload Model

In [19]:
# File path (under Training/Saved Models) the trained PPO model is saved to and reloaded from
PPO_path = os.path.join('Training', 'Saved Models', 'PPO_Model_Cartpole')

In [21]:
model.save(PPO_path)  # write the trained model to disk

In [22]:
del model  # drop the in-memory model so the next cell really restores it from disk

In [24]:
# Reload the saved model and attach it to the existing vectorized environment
model = PPO.load(PPO_path, env=env)

## Evaluation

In [26]:
# Run the policy for 10 evaluation episodes; the result is (mean reward, std of reward).
evaluate_policy(model, env, n_eval_episodes=10, render=True)
# render=True shows the episodes in real time while they are evaluated



(200.0, 0.0)

In [27]:
# Reading the (200.0, 0.0) result above:
#   200.0 = average reward per episode (200 is the max)
#   0.0   = standard deviation of the episode rewards
env.close()  # close the window opened by render=True

## Test Model

In [29]:
# Play a few episodes with the trained model choosing the actions.
# `env` is a vectorized env here, so obs/reward/done come back as arrays with one
# entry per environment - which is why the score prints as e.g. [200.].
episodes = 5
try:
    for episode in range(1, episodes + 1):
        obs = env.reset()  # initial observation for this episode
        done = False
        score = 0

        while not done:
            env.render()
            action, _ = model.predict(obs)  # the model now picks the action
            obs, reward, done, info = env.step(action)
            score += reward

        print('Episode:{}, Score:{}'.format(episode, score))
finally:
    # Close the render window even if the loop is interrupted or raises part-way through.
    env.close()

Episode:1, Score:[200.]
Episode:2, Score:[182.]
Episode:3, Score:[200.]
Episode:4, Score:[200.]
Episode:5, Score:[200.]


In [30]:
obs = env.reset()  # fresh initial observation for stepping through the model by hand

In [33]:
# predict() returns the chosen action plus the policy's hidden state
# (the hidden state is only relevant for recurrent policies and is ignored here)
action, _ = model.predict(obs)

In [None]:
env.action_space.sample()  # draw a random action from the action space

In [34]:
# Apply the model's action; the vectorized env returns
# (observations, rewards, dones, infos), each holding one entry per environment
env.step(action)

(array([[-0.02628942,  0.20397522, -0.0127049 , -0.32968247]],
       dtype=float32),
 array([1.], dtype=float32),
 array([False]),
 [{}])

## Viewing Logs in Tensorboard

In [36]:
# Log folder of the first PPO run (the "Logging to ...PPO_1" line in the training output)
training_log_path = os.path.join(log_path, 'PPO_1')

In [38]:
# Launch TensorBoard on the chosen PPO log folder, then open http://localhost:6006/
# The command keeps running (and blocks this cell) until it is interrupted.
!tensorboard --logdir={training_log_path}

^C


## Adding a callback to the training stage

In [39]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold


In [40]:
# Folder passed to EvalCallback as best_model_save_path
save_path = os.path.join('Training', 'Saved Models')

In [43]:
# Stop training once the evaluated mean reward reaches the 200 reward threshold
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1)
# Evaluate on a separate environment instance so that periodic evaluation does not
# reset or step the environment the model is collecting its training rollouts from.
eval_env = DummyVecEnv([lambda: gym.make(env_name)])
eval_callback = EvalCallback(eval_env,
                             callback_on_new_best=stop_callback,  # run the stop check on each new best result
                             eval_freq=10000,                     # how often (in callback steps) to evaluate
                             best_model_save_path=save_path,      # where the best model is saved
                             verbose=1)

In [44]:
# New, untrained PPO model (same policy and log folder as before) to train with the callback
model=PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [45]:
# Train with the evaluation callback attached, so training can stop early
# once the reward threshold is reached
model.learn(total_timesteps=20000, callback=eval_callback)

Logging to Training\Logs\PPO_2
-----------------------------
| time/              |      |
|    fps             | 2270 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1487        |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008871145 |
|    clip_fraction        | 0.0822      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | 0.0124      |
|    learning_rate        | 0.0003      |
|    loss                 | 6.18        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0126     |
|    value_loss           | 54          |
-----------------------------------------
---

<stable_baselines3.ppo.ppo.PPO at 0x1ce51b364f0>

## Changing Policies

In [48]:
# Custom architecture with 2 separate neural networks:
#   pi - the actor (policy) network: 4 hidden layers of 128 nodes each
#   vf - the value-function (critic) network: 4 hidden layers of 128 nodes each
# NOTE(review): newer Stable Baselines3 releases expect a plain dict here rather than
# a list containing one - confirm against the installed version.
net_arch = [dict(pi=[128, 128, 128, 128], vf=[128, 128, 128, 128])]

In [49]:
# New PPO model whose policy is built with the custom network architecture,
# passed through to the policy via policy_kwargs
model=PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path, policy_kwargs={'net_arch':net_arch})

Using cpu device


In [50]:
# The callback used for the previous model keeps that run's best mean reward, so
# reusing it would judge "new best" (and therefore the stop check and the best-model
# save) against the earlier model's score. Build fresh callbacks for this model,
# evaluating on a separate environment instance from the one used for training.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1)
eval_callback = EvalCallback(DummyVecEnv([lambda: gym.make(env_name)]),
                             callback_on_new_best=stop_callback,
                             eval_freq=10000,
                             best_model_save_path=save_path,
                             verbose=1)
model.learn(total_timesteps=20000, callback=eval_callback)

Logging to Training\Logs\PPO_3
-----------------------------
| time/              |      |
|    fps             | 1374 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 874         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.016260192 |
|    clip_fraction        | 0.277       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.68       |
|    explained_variance   | 0.00013     |
|    learning_rate        | 0.0003      |
|    loss                 | 3.45        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0324     |
|    value_loss           | 19.6        |
-----------------------------------------
---

<stable_baselines3.ppo.ppo.PPO at 0x1ce500d8e50>

## Using an Alternate Algorithm

In [51]:
# try using DQN
from stable_baselines3 import DQN

In [52]:
# Same MLP policy, environment and log folder as before, but using DQN instead of PPO
model=DQN('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [53]:
# Train the DQN model for a budget of 20,000 timesteps (no evaluation callback this time)
model.learn(total_timesteps=20000)

Logging to Training\Logs\DQN_1
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.959    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 5120     |
|    time_elapsed     | 0        |
|    total_timesteps  | 87       |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.913    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 6779     |
|    time_elapsed     | 0        |
|    total_timesteps  | 183      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.874    |
| time/               |          |
|    episodes         | 12       |
|    fps              | 7567     |
|    time_elapsed     | 0        |
|    total_timesteps  | 265      |
----------------------------------
------------------------

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 108      |
|    fps              | 8769     |
|    time_elapsed     | 0        |
|    total_timesteps  | 2482     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 112      |
|    fps              | 8819     |
|    time_elapsed     | 0        |
|    total_timesteps  | 2584     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 116      |
|    fps              | 8826     |
|    time_elapsed     | 0        |
|    total_timesteps  | 2648     |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 216      |
|    fps              | 8960     |
|    time_elapsed     | 0        |
|    total_timesteps  | 4534     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 220      |
|    fps              | 8980     |
|    time_elapsed     | 0        |
|    total_timesteps  | 4607     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 224      |
|    fps              | 9005     |
|    time_elapsed     | 0        |
|    total_timesteps  | 4683     |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 324      |
|    fps              | 9251     |
|    time_elapsed     | 0        |
|    total_timesteps  | 6929     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 328      |
|    fps              | 9234     |
|    time_elapsed     | 0        |
|    total_timesteps  | 7000     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 332      |
|    fps              | 9247     |
|    time_elapsed     | 0        |
|    total_timesteps  | 7093     |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 432      |
|    fps              | 9393     |
|    time_elapsed     | 0        |
|    total_timesteps  | 9206     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 436      |
|    fps              | 9388     |
|    time_elapsed     | 0        |
|    total_timesteps  | 9295     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 440      |
|    fps              | 9353     |
|    time_elapsed     | 1        |
|    total_timesteps  | 9391     |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 540      |
|    fps              | 9157     |
|    time_elapsed     | 1        |
|    total_timesteps  | 11520    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 544      |
|    fps              | 9145     |
|    time_elapsed     | 1        |
|    total_timesteps  | 11597    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 548      |
|    fps              | 9134     |
|    time_elapsed     | 1        |
|    total_timesteps  | 11665    |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 648      |
|    fps              | 9236     |
|    time_elapsed     | 1        |
|    total_timesteps  | 13993    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 652      |
|    fps              | 9254     |
|    time_elapsed     | 1        |
|    total_timesteps  | 14122    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 656      |
|    fps              | 9259     |
|    time_elapsed     | 1        |
|    total_timesteps  | 14214    |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 756      |
|    fps              | 9358     |
|    time_elapsed     | 1        |
|    total_timesteps  | 16453    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 760      |
|    fps              | 9357     |
|    time_elapsed     | 1        |
|    total_timesteps  | 16535    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 764      |
|    fps              | 9353     |
|    time_elapsed     | 1        |
|    total_timesteps  | 16640    |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 864      |
|    fps              | 9421     |
|    time_elapsed     | 2        |
|    total_timesteps  | 18890    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 868      |
|    fps              | 9433     |
|    time_elapsed     | 2        |
|    total_timesteps  | 19027    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 872      |
|    fps              | 9432     |
|    time_elapsed     | 2        |
|    total_timesteps  | 19110    |
----------------------------------
----------------------------------
| rollout/          

<stable_baselines3.dqn.dqn.DQN at 0x1ce51b617c0>

In [None]:
# save() requires a target path; store the DQN model next to the saved PPO model.
DQN_path = os.path.join('Training', 'Saved Models', 'DQN_Model_Cartpole')
model.save(DQN_path)