### 1. Import Packages
!pip3 install 'stable-baselines3[extra]'

In [2]:
import os
import shutil
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

pygame 2.4.0 (SDL 2.26.4, Python 3.9.6)
Hello from the pygame community. https://www.pygame.org/contribute.html


### 2. Load Environment

In [3]:
# Gym environment id; reused below whenever an environment is (re)built.
environment_name = 'CartPole-v0'
env = gym.make(environment_name)

In [4]:
# Optional sanity check: play a few episodes with uniformly sampled actions.
# Leave the flag at 0 to skip the rendering entirely.
initial_runs = 0
if initial_runs:
    episodes = 5
    for episode in range(1, episodes + 1):
        state = env.reset()
        done, score = False, 0

        while not done:
            env.render()
            # Random policy: draw an action from the action space.
            action = env.action_space.sample()
            # This env returns the 4-tuple (n_state, reward, done, info).
            n_state, reward, done, info = env.step(action)
            score += reward

        print("Epiosode: {} Score: {}".format(episode, score))
    env.close()

In [5]:
# Reset, then apply action 1 once; the tuple displayed below the cell is
# (observation, reward, done, info).
env.reset()
env.step(1)

(array([ 0.00971066,  0.2224381 ,  0.04465823, -0.3274855 ], dtype=float32),
 1.0,
 False,
 {})

In [6]:
# Show each space definition followed by one random sample drawn from it.
for label, space in (
    ("Action Space: ", env.action_space),
    ("State_Space:  ", env.observation_space),
):
    print(label, space, space.sample())

Action Space:  Discrete(2) 0
State_Space:   Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32) [ 2.3154364e+00 -1.3044926e+38  2.8292456e-01 -9.0793807e+36]


In [7]:
# Directory that receives the TensorBoard event files.
log_path = 'Logs'

In [8]:
# Wrap a single CartPole instance in a vectorised env; the factory builds the
# environment itself rather than closing over a previously bound variable.
env = DummyVecEnv([lambda: gym.make(environment_name)])
model = PPO(
    'MlpPolicy',
    env,
    verbose=1,
    tensorboard_log=log_path,
)

Using cpu device


In [9]:
# Remove any previous log directory before training starts.
if os.path.exists(log_path):
    shutil.rmtree(log_path)

model.learn(total_timesteps=20000)

Logging to Logs/PPO_1
-----------------------------
| time/              |      |
|    fps             | 1677 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1281        |
|    iterations           | 2           |
|    time_elapsed         | 3           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008291826 |
|    clip_fraction        | 0.101       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | 0.000602    |
|    learning_rate        | 0.0003      |
|    loss                 | 6.89        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0166     |
|    value_loss           | 49.1        |
-----------------------------------------
------------

<stable_baselines3.ppo.ppo.PPO at 0x7fc933cc6ac0>

### 4. Save and Reload Model

In [10]:
# Target path for the saved model, inside the `_weights` directory.
PPO_Path = os.path.join('_weights', 'PPO_Model_CartPole')
model.save(PPO_Path)

In [11]:
# Discard the in-memory model and rebuild it from the saved file,
# re-attaching the current environment.
del model
model = PPO.load(PPO_Path, env=env)

### 5. Evaluation

In [12]:
evaluate_policy(model, env, n_eval_episodes=10, render=True) # Returns (mean reward, standard deviation of reward) over the evaluation episodes

2023-05-07 16:28:35.382 Python[62965:2014007] ApplePersistenceIgnoreState: Existing state will not be touched. New state will be written to /var/folders/j3/7jchtq_n7cz4zm5b85nj95lc0000gn/T/org.python.python.savedState


(200.0, 0.0)

### 6. Test Model

In [14]:
# Roll out the trained policy for a few rendered episodes and report the
# accumulated reward of each one.
episodes = 5
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0

    while not done:
        env.render()
        # predict() returns (action, hidden_state). The hidden state is only
        # used by recurrent policies, so it is not fed back here.
        # deterministic=True picks the greedy action, matching the default
        # behaviour of evaluate_policy used in the evaluation section.
        action, hidden_state = model.predict(state, deterministic=True)
        # `env` is a vectorised env: step() returns batched
        # (observations, rewards, dones, infos), hence the array-valued score.
        state, reward, done, info = env.step(action)
        score += reward

    print("Epiosode: {} Score: {}".format(episode, score))
env.close()

Epiosode: 1 Score: [200.]
Epiosode: 2 Score: [200.]
Epiosode: 3 Score: [200.]
Epiosode: 4 Score: [200.]
Epiosode: 5 Score: [200.]
Epiosode: 6 Score: [200.]
Epiosode: 7 Score: [200.]
Epiosode: 8 Score: [200.]
Epiosode: 9 Score: [200.]
Epiosode: 10 Score: [200.]


In [68]:
state = env.reset()
model.predict(state) # returns (action, hidden_state); the second element is None in the output below and only matters for recurrent policies

(array([0]), None)

### 7. Viewing Logs in TensorBoard

In [71]:
!tensorboard --logdir={log_path}

TensorFlow installation not found - running with reduced feature set.

NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784

^C


### 8. Adding a Callback to the Training Stage

In [72]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [73]:
# Directory where the evaluation callback stores the best model it has seen.
save_path = '_weights'

In [None]:
# Fresh PPO agent with the default MLP policy for the callback-driven run.
model = PPO(
    'MlpPolicy',
    env,
    verbose=1,
    tensorboard_log=log_path,
)

In [None]:
# Remove any previous log directory before training starts.
if os.path.exists(log_path):
    shutil.rmtree(log_path)

# Evaluate on a dedicated environment. Running evaluation episodes on the
# training env would reset it in the middle of a rollout, so the agent would
# continue training from an observation that no longer matches the env state.
eval_env = DummyVecEnv([lambda: gym.make(environment_name)])

# Stop training once a new best mean evaluation reward reaches the threshold.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1)
eval_callback = EvalCallback(
    eval_env,
    callback_on_new_best=stop_callback,
    eval_freq=10000,
    best_model_save_path=save_path,
    verbose=1,
)
model.learn(total_timesteps=20000, callback=eval_callback)

### 9. Changing Policies

In [101]:
# Custom actor-critic architecture:
#   pi -> policy network ("Actor"):          4 hidden layers of 128 units
#   vf -> value-function network ("Critic"): 4 hidden layers of 128 units
net_arch = {
    'pi': [128] * 4,
    'vf': [128] * 4,
}

model = PPO(
    'MlpPolicy',
    env,
    verbose=1,
    tensorboard_log=log_path,
    policy_kwargs=dict(net_arch=net_arch),
)

Using cpu device


In [102]:
# Remove any previous log directory before training starts.
if os.path.exists(log_path):
    shutil.rmtree(log_path)

# Evaluate on a dedicated environment. Running evaluation episodes on the
# training env would reset it in the middle of a rollout, so the agent would
# continue training from an observation that no longer matches the env state.
eval_env = DummyVecEnv([lambda: gym.make(environment_name)])

# Stop training once a new best mean evaluation reward reaches the threshold.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1)
eval_callback = EvalCallback(
    eval_env,
    callback_on_new_best=stop_callback,
    eval_freq=10000,
    best_model_save_path=save_path,
    verbose=1,
)
model.learn(total_timesteps=20000, callback=eval_callback)

Logging to Logs/PPO_1
-----------------------------
| time/              |      |
|    fps             | 1405 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 937         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.014031261 |
|    clip_fraction        | 0.214       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.681      |
|    explained_variance   | 0.00582     |
|    learning_rate        | 0.0003      |
|    loss                 | 2.35        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0245     |
|    value_loss           | 18.9        |
-----------------------------------------
------------

<stable_baselines3.ppo.ppo.PPO at 0x7fc9245083d0>

In [103]:
evaluate_policy(model, env, n_eval_episodes=10, render=True) # Returns (mean reward, standard deviation of reward) over the evaluation episodes

(200.0, 0.0)

### 10. Alternate Algorithm

In [104]:
from stable_baselines3 import DQN

In [105]:
# Swap the algorithm: DQN agent with the default MLP policy on the same env.
model = DQN(
    'MlpPolicy',
    env,
    verbose=1,
    tensorboard_log=log_path,
)

Using cpu device


In [106]:
# Remove any previous log directory before training starts.
if os.path.exists(log_path):
    shutil.rmtree(log_path)

# Evaluate on a dedicated environment. Running evaluation episodes on the
# training env would reset it in the middle of data collection, so the agent
# would continue from an observation that no longer matches the env state.
eval_env = DummyVecEnv([lambda: gym.make(environment_name)])

# Stop training once a new best mean evaluation reward reaches the threshold.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1)
eval_callback = EvalCallback(
    eval_env,
    callback_on_new_best=stop_callback,
    eval_freq=10000,
    best_model_save_path=save_path,
    verbose=1,
)
model.learn(total_timesteps=20000, callback=eval_callback)

Logging to Logs/DQN_1
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.949    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 6450     |
|    time_elapsed     | 0        |
|    total_timesteps  | 108      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.907    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 6262     |
|    time_elapsed     | 0        |
|    total_timesteps  | 196      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.869    |
| time/               |          |
|    episodes         | 12       |
|    fps              | 6430     |
|    time_elapsed     | 0        |
|    total_timesteps  | 275      |
----------------------------------
---------------------------------

<stable_baselines3.dqn.dqn.DQN at 0x7fc924508730>

In [108]:
# Returns (mean reward, standard deviation of reward) over the evaluation episodes.
# NOTE(review): the low score below may reflect the short 20000-step budget
# combined with DQN's default hyperparameters (exploration schedule, warm-up
# before learning starts) rather than the algorithm itself - TODO confirm.
evaluate_policy(model, env, n_eval_episodes=10, render=True)

(9.4, 0.66332495807108)