# 1. Import Dependencies

In [1]:
import gym
from stable_baselines3 import A2C
from sb3_contrib import TRPO
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_atari_env
import os

# 2. Test Environment

[Download Atari ROMs](http://www.atarimania.com/rom/Roms.rar)

[Install atari-py](https://github.com/openai/atari-py)

Installing with pip didn't work for me, so I installed the [conda-forge package](https://anaconda.org/conda-forge/atari_py) instead with `conda install -c conda-forge atari_py`.

In [2]:
# Shell escape: run atari_py's import_roms module on the local ./ROMS folder.
# The output below shows each recognised ROM being copied into the atari_py package directory.
!python -m atari_py.import_roms ./ROMS

copying asterix.bin from ./ROMS/Asterix (AKA Taz) (1984) (Atari, Jerome Domurat, Steve Woita) (CX2696).bin to /home/stark/anaconda3/envs/RL/lib/python3.10/site-packages/atari_py/atari_roms/asterix.bin
copying mr_do.bin from ./ROMS/Mr. Do! (1983) (CBS Electronics, Ed English) (4L4478) (PAL).bin to /home/stark/anaconda3/envs/RL/lib/python3.10/site-packages/atari_py/atari_roms/mr_do.bin
copying star_gunner.bin from ./ROMS/Stargunner (1983) (Telesys, Alex Leavens) (1005) ~.bin to /home/stark/anaconda3/envs/RL/lib/python3.10/site-packages/atari_py/atari_roms/star_gunner.bin
copying video_pinball.bin from ./ROMS/Pinball (AKA Video Pinball) (Zellers).bin to /home/stark/anaconda3/envs/RL/lib/python3.10/site-packages/atari_py/atari_roms/video_pinball.bin
copying bowling.bin from ./ROMS/Bowling (1979) (Atari, Larry Kaplan - Sears) (CX2628 - 6-99842, 49-75117) ~.bin to /home/stark/anaconda3/envs/RL/lib/python3.10/site-packages/atari_py/atari_roms/bowling.bin
copying road_runner.bin from patched v

In [2]:
# Gym id of the game used throughout this section.
environment_name = 'Breakout-v0'
# Single, non-vectorised environment used for the inspection and random-play cells below.
env = gym.make(environment_name)

A.L.E: Arcade Learning Environment (version 0.7.4+069f8bd)
[Powered by Stella]


In [9]:
# Reset the environment; the value returned by reset() is shown as the cell output (a uint8 array).
env.reset()

array([[[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       ...,

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]]], dtype=uint8)

In [5]:
# Show the action-space class and the number of discrete actions, one per line.
print(type(env.action_space), env.action_space.n, sep='\n')

<class 'gym.spaces.discrete.Discrete'>
4


In [6]:
# Show the observation-space class and the shape of a single observation, one per line.
print(type(env.observation_space), env.observation_space.shape, sep='\n')

<class 'gym.spaces.box.Box'>
(210, 160, 3)


In [8]:
# Smoke test: play a few episodes with randomly sampled actions and report
# the total reward collected in each one.
episodes = 5
for episode in range(1, episodes + 1):
    obs, done, score = env.reset(), False, 0

    # Always take at least one step, then keep going until the episode ends.
    while True:
        env.render()
        action = env.action_space.sample()
        obs, reward, done, info = env.step(action)
        score += reward
        if done:
            break

    print(f'Episode {episode} score: {score}')

# Finished with the single test environment.
env.close()

Episode 1 score: 0.0
Episode 2 score: 1.0
Episode 3 score: 2.0
Episode 4 score: 0.0
Episode 5 score: 0.0


# 3. Vectorise Environment and Train Model

In [10]:
# Build the training environment: 4 copies of the game created with seed 0,
# then wrapped so that each observation is a stack of 4 frames.
# The game id is taken from `environment_name` (set in section 2) rather than being
# repeated as a literal, so the test and training environments cannot drift apart.
env = make_atari_env(environment_name, n_envs=4, seed=0)
env = VecFrameStack(env, n_stack=4)

In [11]:
# Directory passed to the model as its TensorBoard log location.
log_path = os.path.join('Training', 'Logs')

# A2C agent with the CNN policy on the vectorised, frame-stacked environment.
model = A2C(
    policy='CnnPolicy',
    env=env,
    verbose=1,
    tensorboard_log=log_path,
)
# model = TRPO('CnnPolicy', env, verbose=1, tensorboard_log=log_path)

Using cuda device
Wrapping the env in a VecTransposeImage.


In [12]:
# Training budget passed to model.learn(); also embedded in the checkpoint file name.
timesteps = 1_000_000

In [18]:
# Train for `timesteps` steps. With verbose=1 the progress tables appear below,
# and the run is logged under the TensorBoard directory given to the model.
model.learn(total_timesteps=timesteps)

Logging to Training/Logs/A2C_1
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 319      |
|    ep_rew_mean        | 2.12     |
| time/                 |          |
|    fps                | 152      |
|    iterations         | 100      |
|    time_elapsed       | 3        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.38    |
|    explained_variance | nan      |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -0.133   |
|    value_loss         | 0.00967  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 289      |
|    ep_rew_mean        | 1.67     |
| time/                 |          |
|    fps                | 150      |
|    iterations         | 200      |
|    time_elapsed       | 6        |
|    total_timesteps    | 1000     |
| train

<stable_baselines3.a2c.a2c.A2C at 0x7f33df649e10>

# 4. Save and Reload Model

In [13]:
# Checkpoint location. The file name records the algorithm and the training budget.
# It is labelled A2C because that is the class this notebook uses to build and reload the model;
# switch to the TRPO line only together with the TRPO model above and the loader below.
save_path = os.path.join('Training', 'Saved_Models', f'A2C_Breakout_model_{timesteps}_timesteps')
# save_path = os.path.join('Training', 'Saved_Models', f'TRPO_Breakout_model_{timesteps}_timesteps')

In [None]:
# Persist the trained model to `save_path` so it can be reloaded without retraining.
model.save(save_path)

In [14]:
# Drop the in-memory model so the next cell demonstrates loading it back from disk.
del model

In [15]:
# Reload the saved model from `save_path` and attach the current environment to it.
model = A2C.load(save_path, env)

Wrapping the env in a VecTransposeImage.


# 5. Evaluate and Test

In [16]:
# Build a single-copy environment for evaluation, with the same 4-frame stacking
# as the training environment.
# The game id is taken from `environment_name` (set in section 2) rather than being
# repeated as a literal, so evaluation always uses the game selected there.
env = make_atari_env(environment_name, n_envs=1, seed=0)
env = VecFrameStack(env, n_stack=4)

In [17]:
# Score the reloaded policy over 10 evaluation episodes, rendering while it plays,
# and report the mean and standard deviation of the episode rewards.
avg, std = evaluate_policy(model, env, n_eval_episodes=10, render=True)
print(f'Average: {avg}\nSTD: {std}')
# Deliberately no env.close() here: the rendering cell below resets and steps this
# same environment and closes it once it is finished.

  logger.warn(


Average: 11.2
STD: 3.2186953878862163


In [18]:
# Render for nice pretty video
from time import time
obs = env.reset()

t_start = time()
while time() - t_start < 60:
    env.render()
    action, _ = model.predict(obs)
    obs, _, _, _ = env.step(action)
env.close()