# Breakout Project

### Here I will be working to test out a practical project/example with SB3. I will diverge from the training material in that I will be looking to get a feel for how to implement A2C and compare it with SAC (the algorithm used by TC-Driver). By using both in the same project, I hope to translate this to the TC-Driver codebase so that I can successfully implement A2C.

# 1. Import Dependencies

In [2]:
import gymnasium as gym
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_atari_env



import os

# 2. Test Environment

To get the Atari environment working, run the following:

`pip install gymnasium[atari]` \
`pip install gymnasium[accept-rom-license]` \
`pip install "autorom[accept-rom-license]"` \
`AutoROM` \
Select Y \
`AutoROM --install-dir /path/to/install`


In [3]:
# Gymnasium id of the Atari game used throughout this notebook.
environment_name = 'ALE/Breakout-v5'

# Plain (non-vectorised) environment, used only for the inspection cells below.
env = gym.make(environment_name)

In [4]:
# Reset the environment; the returned (observation, info) pair is displayed as the cell output.
env.reset()

(array([[[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        ...,
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]]], dtype=uint8),
 {'lives': 5, 'episode_frame_number': 0, 'frame_number': 0})

In [5]:
# Inspect the action space; the recorded output is Discrete(4).
env.action_space

Discrete(4)

In [6]:
# Inspect the observation space; the recorded output is a uint8 Box of shape (210, 160, 3).
env.observation_space

Box(0, 255, (210, 160, 3), uint8)

In [None]:
# Random-agent smoke test: play a few episodes with uniformly sampled actions
# to confirm the environment steps and accumulates reward before any training.
episodes = 5

for episode in range(1, episodes+1):
    # Gymnasium's reset() returns an (observation, info) pair.
    obs, info = env.reset()
    done = False
    score = 0

    while not done:
        # No env.render() call here: `env` was created without a render_mode,
        # so render() would only emit a warning on every step. Pass
        # render_mode='human' to gym.make(...) to watch the game instead.
        action = env.action_space.sample()
        # Gymnasium's step() returns five values; the episode is over when it
        # either terminates (game over) or is truncated (e.g. a time limit).
        obs, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        score += reward
    # Report once per episode rather than once per step.
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

# 3. Vectorise Environment and Train Model

In [7]:
# Training environment: 4 seeded Atari copies run as one vectorised env,
# with the last 4 frames stacked into each observation.
env = VecFrameStack(
    make_atari_env(environment_name, n_envs=4, seed=0),
    n_stack=4,
)

In [8]:
# Reset the vectorised, frame-stacked env; the recorded output is a single
# observation array (no info dict), with a trailing dimension of 4.
env.reset()

array([[[[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         ...,
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0]],

        [[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         ...,
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0]],

        [[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         ...,
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0]],

        ...,

        [[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         ...,
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0]],

        [[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         ...,
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0]],

        [[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         ...,
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0]]],


       [[[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0

In [9]:
# Directory for the TensorBoard event files written during training.
log_path = os.path.join('Training', 'Logs')

# A2C agent with a CNN policy, bound to the vectorised env built above.
model = A2C(
    policy='CnnPolicy',
    env=env,
    verbose=1,
    tensorboard_log=log_path,
)

Using cpu device
Wrapping the env in a VecTransposeImage.


In [10]:
# Train for 100k environment steps; progress tables are printed because verbose=1.
model.learn(total_timesteps=100_000)

Logging to Training\Logs\A2C_2
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 225      |
|    ep_rew_mean        | 1.55     |
| time/                 |          |
|    fps                | 173      |
|    iterations         | 100      |
|    time_elapsed       | 11       |
|    total_timesteps    | 2000     |
| train/                |          |
|    entropy_loss       | -1.38    |
|    explained_variance | 0.981    |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -0.156   |
|    value_loss         | 0.0164   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 228      |
|    ep_rew_mean        | 1.66     |
| time/                 |          |
|    fps                | 172      |
|    iterations         | 200      |
|    time_elapsed       | 23       |
|    total_timesteps    | 4000     |
| train

<stable_baselines3.a2c.a2c.A2C at 0x24c497b7490>

# 4. Save and Reload Model

In [11]:
# Where the trained agent is stored; reused by the load cells below.
a2c_path = os.path.join('Training', 'Saved Models', 'A2C_Breakout_Model')

model.save(path=a2c_path)

In [12]:
# Drop the in-memory model so the next cell has to restore it from a2c_path.
del model

In [13]:
# Restore the saved agent and attach it to the current training env.
model = A2C.load(a2c_path, env=env)

Wrapping the env in a VecTransposeImage.


# 5. Evaluate and Test

In [17]:
# Evaluation setup: a single Atari env that renders in 'human' mode,
# frame-stacked the same way as the training env (n_stack=4).
env_keyword_args = dict(render_mode='human')
env = VecFrameStack(
    make_atari_env(
        environment_name,
        n_envs=1,
        seed=0,
        env_kwargs=env_keyword_args,
    ),
    n_stack=4,
)
# Left disabled by the author; presumably meant to adjust playback speed - TODO confirm.
# env.metadata["render_fps"] = 20

# Reload the saved agent against the single-env evaluation setup.
model = A2C.load(a2c_path, env=env)

Wrapping the env in a VecTransposeImage.


: 

In [15]:
# Run 10 rendered evaluation episodes; the displayed pair is presumably
# (mean reward, std of reward) per the SB3 docs - confirm against your SB3 version.
evaluate_policy(model, env, n_eval_episodes=10, render=True)

  logger.warn(


(8.0, 3.4641016151377544)