# Import dependencies

In [1]:
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
import os

# Test Environment

In [26]:
# Identifier of the Gym environment used throughout this notebook.
env_name = "CarRacing-v2"
# Create the environment with an on-screen ("human") render mode.
env = gym.make(id=env_name, render_mode="human")

In [16]:
# Start a new episode; the output below shows reset() returning an (observation, info) pair.
env.reset()

(array([[[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        ...,
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]]], dtype=uint8),
 {})

In [3]:
# Inspect the action space; the output below shows a 3-component float32 Box.
env.action_space

Box([-1.  0.  0.], 1.0, (3,), float32)

In [4]:
# Draw one random action from the action space.
env.action_space.sample()

array([0.7654535 , 0.64423543, 0.22518677], dtype=float32)

In [5]:
# Inspect the observation space; the output below shows a 96x96x3 uint8 Box.
env.observation_space

Box(0, 255, (96, 96, 3), uint8)

In [None]:
# Draw one random observation from the observation space (a full image-sized array).
env.observation_space.sample()

In [8]:
# Render the current environment state (the env was created with render_mode='human').
env.render()

True

In [9]:
# Shut the environment down once the inspection above is done.
env.close()

In [None]:
# Smoke-test the environment by driving it with random actions for a few episodes.
# The preceding cell calls env.close(), so build a fresh environment here instead
# of reusing the closed one; this keeps the cell working on Restart & Run All.
env = gym.make(env_name, render_mode='human')
episodes = 3
try:
    for e in range(1, episodes + 1):
        # reset() returns an (observation, info) pair; unpack it so `obs` is the frame itself.
        obs, info = env.reset()
        done = False
        score = 0

        while not done:
            env.render()
            action = env.action_space.sample()
            obs, reward, terminated, truncated, info = env.step(action)
            # The episode is over on either a terminal state or a truncation.
            done = terminated or truncated
            score += reward
        print(f'Episode: {e}, score: {score}')
finally:
    # Always shut the env down, even if the loop is interrupted; the training
    # section below creates its own environment.
    env.close()

# Train Model

In [29]:
# Wrap a single CarRacing env in a DummyVecEnv for Stable-Baselines3.
# The env is built inside the factory so the lambda does not close over `env`,
# the very name this statement rebinds to the vectorised wrapper.
# NOTE(review): 'human' rendering draws every step and likely slows training —
# confirm whether on-screen rendering is really wanted while learning.
env = DummyVecEnv([lambda: gym.make(env_name, render_mode='human')])



In [10]:
# Directory that receives the TensorBoard logs.
log_path = os.path.join("training", "logs")
# PPO agent with a CNN policy, since the observations are images.
model = PPO(policy="CnnPolicy", env=env, verbose=1, tensorboard_log=log_path)

Using cpu device
Wrapping the env in a VecTransposeImage.


In [31]:
# Checkpoint path prefix; the training loops append the timestep count to it.
save_path = os.path.join("training", "saved models", "project2_model")

In [12]:
# Train in five chunks of TIMESTEPS steps, saving a checkpoint after each chunk.
TIMESTEPS = 10_000
for i in range(1, 6):
    # reset_num_timesteps=False keeps the step counter running across learn() calls.
    model.learn(total_timesteps=TIMESTEPS, reset_num_timesteps=False)
    # Checkpoint name is the prefix followed by the nominal step count (10000, 20000, ...).
    model.save(f"{save_path}{i * TIMESTEPS}")

Logging to training\logs\PPO_4
-----------------------------
| time/              |      |
|    fps             | 25   |
|    iterations      | 1    |
|    time_elapsed    | 79   |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 18          |
|    iterations           | 2           |
|    time_elapsed         | 224         |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008798907 |
|    clip_fraction        | 0.0907      |
|    clip_range           | 0.2         |
|    entropy_loss         | -4.23       |
|    explained_variance   | -0.00101    |
|    learning_rate        | 0.0003      |
|    loss                 | 0.349       |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.00482    |
|    std                  | 0.989       |
|    value_loss           | 0.73        |
---

# Load the model

In [16]:
# Path of the checkpoint written after the fifth training chunk (50000 nominal steps).
ppo_path = os.path.join("training", "saved models", "project2_model50000")

In [30]:
# Restore the saved agent and bind it to the current vectorised environment.
model = PPO.load(ppo_path, env=env)

Wrapping the env in a VecTransposeImage.


# Evaluate and test the model

In [22]:
# Evaluate the agent over 10 rendered episodes; the result shown below is a
# (mean reward, reward standard deviation) pair.
evaluate_policy(model=model, env=env, n_eval_episodes=10, render=True)

(282.04864914789795, 121.89050860509542)

In [36]:
# Watch the trained agent drive for a few episodes on the vectorised env.
# Rewards and done flags come back as one-element arrays here, which is why
# the printed score appears in brackets.
episodes = 3
for e in range(1, episodes + 1):
    obs = env.reset()
    score = 0
    done = False

    while True:
        env.render()
        # The second value returned by predict() is not needed here.
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        score += reward
        if done:
            break
    print(f'Episode: {e}, score: {score}')
# The environment is closed in the following cell.

Episode: 1, score: [226.01773]
Episode: 2, score: [309.89023]
Episode: 3, score: [193.93756]


In [23]:
# Shut the environment down now that evaluation is finished.
env.close()

# Continue training

In [32]:
# Continue training the loaded agent for three more chunks, checkpointing after
# each one. The labels carry on from the 50000-step checkpoint (60000, 70000, 80000).
TIMESTEPS = 10_000
# The evaluation section ends with env.close(), so attach a fresh environment
# before learning again; otherwise a top-to-bottom run trains on a closed env.
env = DummyVecEnv([lambda: gym.make(env_name, render_mode='human')])
model.set_env(env)
for i in range(6, 9):
    # reset_num_timesteps=False keeps the step counter running across learn() calls.
    model.learn(total_timesteps=TIMESTEPS, reset_num_timesteps=False)
    model.save(save_path + str(i * TIMESTEPS))

Logging to training\logs\PPO_4
------------------------------
| time/              |       |
|    fps             | 13    |
|    iterations      | 1     |
|    time_elapsed    | 151   |
|    total_timesteps | 53248 |
------------------------------
----------------------------------------
| time/                   |            |
|    fps                  | 11         |
|    iterations           | 2          |
|    time_elapsed         | 369        |
|    total_timesteps      | 55296      |
| train/                  |            |
|    approx_kl            | 0.02893966 |
|    clip_fraction        | 0.246      |
|    clip_range           | 0.2        |
|    entropy_loss         | -3.38      |
|    explained_variance   | 0.917      |
|    learning_rate        | 0.0003     |
|    loss                 | 0.0613     |
|    n_updates            | 260        |
|    policy_gradient_loss | -0.0312    |
|    std                  | 0.748      |
|    value_loss           | 0.577      |
--------------

In [37]:
%tensorboard --log_dir={log_path}

UsageError: Line magic function `%tensorboard` not found.
