# Import dependencies

In [39]:
import gym
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_atari_env
import os

# Test Environment

In [40]:
# Environment id of the Atari game explored in this section.
env_name = 'Breakout-v4'
# Build one plain Gym environment so its spaces can be inspected below.
env = gym.make(env_name)

In [41]:
# Display the action space the agent chooses from.
env.action_space

Discrete(4)

In [19]:
# Draw one random action from the action space.
env.action_space.sample()

3

In [20]:
# Display the observation space (the recorded output is a 210x160x3 uint8 Box).
env.observation_space

Box(0, 255, (210, 160, 3), uint8)

In [None]:
# Draw one random observation but display only its shape: the raw sample is a
# full image array and would flood the cell output without adding information.
env.observation_space.sample().shape

In [21]:
# Play a few episodes with uniformly random actions to sanity-check the
# environment and get a baseline score before any training.
episodes = 3
try:
    for e in range(1, episodes + 1):
        obs = env.reset()
        done = False
        score = 0

        while not done:
            env.render()
            action = env.action_space.sample()
            # This step call is unpacked as (observation, reward, done, info).
            obs, reward, done, info = env.step(action)
            score += reward
        print(f'Episode: {e}, score: {score}')
finally:
    # Always release the environment (and its render window), even when the
    # loop is interrupted or an episode raises.
    env.close()

Episode: 1, score: 0.0
Episode: 2, score: 3.0
Episode: 3, score: 0.0


# Vectorise Environment and Train Model

In [22]:
# Training environment: 4 seeded Atari environments run as one vectorised env,
# wrapped so that the 4 most recent frames are stacked (n_stack=4).
# NOTE(review): SB3's Atari examples use the 'NoFrameskip-v4' ids because the
# Atari wrapper applies its own frame skip — confirm 'Breakout-v4' is intended.
env = VecFrameStack(
    make_atari_env('Breakout-v4', n_envs=4, seed=0),
    n_stack=4,
)

In [23]:
# Relative directory handed to the model as its TensorBoard log location.
log_path = os.path.join('training', 'logs')

In [24]:
# A2C agent with the CNN policy on the vectorised env; verbose=1 enables
# progress output and TensorBoard logs are written under `log_path`.
model = A2C('CnnPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device
Wrapping the env in a VecTransposeImage.


In [None]:
# Train for 100k environment timesteps (the long-running cell of this notebook).
model.learn(total_timesteps=100_000)

# Save and Reload Model

In [42]:
# Relative path used below to save and reload the trained model.
a2c_path = os.path.join('training', 'saved models', 'a2c_breakout_model')

In [32]:
# Persist the trained model to `a2c_path`.
model.save(a2c_path)

In [33]:
# Drop the in-memory model so the next cell demonstrates loading it from disk.
del model

In [46]:
# Restore the saved agent from disk and attach it to the current `env`.
model = A2C.load(a2c_path, env=env)

Wrapping the env in a VecTransposeImage.


# Evaluate and Test

In [45]:
# Evaluation environment: a single seeded Atari env with the same 4-frame
# stacking that was used for the training environment.
env = VecFrameStack(
    make_atari_env('Breakout-v4', n_envs=1, seed=0),
    n_stack=4,
)

In [47]:
# Run the agent for 10 evaluation episodes while rendering; the displayed tuple
# is (mean episode reward, standard deviation of episode reward).
evaluate_policy(model, env, n_eval_episodes=10, render=True)

(6.8, 1.8330302779823358)

In [37]:
# Close the evaluation environment and its render window.
# NOTE(review): when the notebook is run top to bottom, the following cell
# still uses `env` after this call — confirm it is recreated before reuse.
env.close()

In [48]:
# Watch the trained agent play. The environment is rebuilt here so this cell
# does not depend on an `env` that an earlier cell may already have closed.
# NOTE(review): per the SB3 docs, make_atari_env applies Atari wrappers that
# clip rewards and end an episode on life loss by default, so these scores are
# likely not comparable with evaluate_policy's — confirm.
env = VecFrameStack(
    make_atari_env('Breakout-v4', n_envs=1, seed=0),
    n_stack=4,
)

episodes = 10
try:
    for e in range(1, episodes + 1):
        obs = env.reset()
        done = False
        score = 0.0

        while not done:
            env.render(mode='human')
            action, _ = model.predict(obs)
            # A vectorised env returns one entry per sub-environment. With
            # n_envs=1, take index 0 so `score` stays a plain float instead of
            # a 1-element array (which previously printed as e.g. `[4.]`).
            obs, rewards, dones, info = env.step(action)
            score += float(rewards[0])
            done = bool(dones[0])
        print(f'Episode: {e}, score: {score}')
finally:
    # Always release the environment and its render window, even when the
    # loop is interrupted or an episode raises.
    env.close()

Episode: 1, score: [0.]
Episode: 2, score: [4.]
Episode: 3, score: [1.]
Episode: 4, score: [0.]
Episode: 5, score: [1.]
Episode: 6, score: [5.]
Episode: 7, score: [2.]
Episode: 8, score: [0.]
Episode: 9, score: [1.]
Episode: 10, score: [0.]
