## Training a reinforcement learning to play Atari Games

### Import Dependencies

In [3]:
import os
import gym
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_atari_env

In [5]:
!pip install -q torch torchvision torchaudio


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.1[0m[39;49m -> [0m[32;49m22.2.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.8 -m pip install --upgrade pip[0m


### Test Environments

In [7]:
environment_name="Breakout-v0"

In [8]:
env=gym.make(environment_name)

A.L.E: Arcade Learning Environment (version 0.7.4+069f8bd)
[Powered by Stella]


In [14]:
print(env.action_space.sample())
print(env.action_space)
# print(env.observation_space.sample())
print(env.observation_space)

1
Discrete(4)
Box([[[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 ...

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]], [[[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 ...

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 25

In [16]:
episodes = 5
for episode in range(1, episodes+1):
    state=env.reset()
    done=False
    score=0
    
    while not done:
        env.render()
        action=env.action_space.sample()
        n_state,reward,done,info=env.step(action)
        score+=reward
    print(f"Episode :{episode} Score : {score}")

env.close()

Episode :1 Score : 2.0
Episode :2 Score : 0.0
Episode :3 Score : 0.0
Episode :4 Score : 0.0
Episode :5 Score : 0.0


In [17]:
env.close()

### Vectorise Environment and Train Model

In [20]:
# Vectorizing the environment, 
# particularly with multiple environments, 
# allows you to train the agent faster by training in parallel

In [22]:
env=make_atari_env("Breakout-v0",n_envs=4,seed=0)
env=VecFrameStack(env,n_stack=4)

In [23]:
env.reset()

array([[[[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         ...,
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0]],

        [[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         ...,
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0]],

        [[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         ...,
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0]],

        ...,

        [[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         ...,
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0]],

        [[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         ...,
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0]],

        [[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         ...,
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0]]],


       [[[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0

In [25]:
env.render() #4 environments

In [29]:
env.close()

In [30]:
#setting up model for training
log_path=os.path.join("Training","logs")
model=A2C("CnnPolicy", env, verbose=1,tensorboard_log=log_path)

Using cpu device
Wrapping the env in a VecTransposeImage.


In [31]:
model.learn(total_timesteps=40000)

Logging to Training/logs/A2C_1
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 287      |
|    ep_rew_mean        | 1.71     |
| time/                 |          |
|    fps                | 103      |
|    iterations         | 100      |
|    time_elapsed       | 19       |
|    total_timesteps    | 2000     |
| train/                |          |
|    entropy_loss       | -1.38    |
|    explained_variance | 0.0472   |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -0.129   |
|    value_loss         | 0.102    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 282      |
|    ep_rew_mean        | 1.58     |
| time/                 |          |
|    fps                | 147      |
|    iterations         | 200      |
|    time_elapsed       | 27       |
|    total_timesteps    | 4000     |
| train

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 279      |
|    ep_rew_mean        | 1.96     |
| time/                 |          |
|    fps                | 242      |
|    iterations         | 1400     |
|    time_elapsed       | 115      |
|    total_timesteps    | 28000    |
| train/                |          |
|    entropy_loss       | -0.109   |
|    explained_variance | 0.861    |
|    learning_rate      | 0.0007   |
|    n_updates          | 1399     |
|    policy_loss        | -0.00955 |
|    value_loss         | 0.0342   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 271      |
|    ep_rew_mean        | 1.78     |
| time/                 |          |
|    fps                | 245      |
|    iterations         | 1500     |
|    time_elapsed       | 122      |
|    total_timesteps    | 30000    |
| train/                |          |
|

<stable_baselines3.a2c.a2c.A2C at 0x109dc7eb0>

In [32]:
# after 40000 tiemsteps, average reward mean is 4.37

### Save and Reload Model

In [33]:
a2c_path=os.path.join("Training","Saved Models", "A2c_model")

In [39]:
model.save(a2c_path)

NameError: name 'model' is not defined

In [42]:
del model

In [43]:
model=A2C.load(a2c_path,env)

Wrapping the env in a VecTransposeImage.


### Evaluate and Test

In [44]:
evaluate_policy(model, env, n_eval_episodes=10,render=True)

(5.8, 2.1354156504062622)

In [47]:
# avg rewards(score), stadard deviation

In [45]:
obs=env.reset()
score=0

while True:
    action,_=model.predict(obs)
    obs,rewards, dones, info=env.step(action)
    env.render()
    score+=rewards
    print(f"Score : {score}")

Score : [0. 0. 0. 0.]
Score : [0. 0. 0. 0.]
Score : [0. 0. 0. 0.]
Score : [0. 0. 1. 0.]
Score : [0. 0. 1. 0.]
Score : [0. 0. 1. 0.]
Score : [0. 0. 1. 0.]
Score : [0. 0. 1. 0.]
Score : [0. 0. 1. 0.]
Score : [0. 0. 1. 0.]
Score : [0. 0. 1. 0.]
Score : [0. 0. 1. 0.]
Score : [1. 0. 1. 0.]
Score : [1. 0. 1. 0.]
Score : [1. 0. 2. 0.]
Score : [1. 0. 2. 0.]
Score : [1. 0. 2. 0.]
Score : [1. 0. 2. 0.]
Score : [1. 0. 2. 0.]
Score : [1. 1. 2. 0.]
Score : [1. 1. 2. 1.]
Score : [1. 1. 2. 1.]
Score : [1. 1. 2. 1.]
Score : [1. 1. 2. 1.]
Score : [1. 1. 2. 1.]
Score : [1. 1. 2. 1.]
Score : [1. 1. 2. 1.]
Score : [1. 1. 2. 1.]
Score : [1. 1. 3. 1.]
Score : [1. 1. 3. 1.]
Score : [1. 1. 3. 1.]
Score : [2. 1. 3. 1.]
Score : [2. 1. 3. 1.]
Score : [2. 1. 3. 1.]
Score : [2. 1. 3. 2.]
Score : [2. 1. 3. 2.]
Score : [2. 1. 3. 2.]
Score : [2. 1. 3. 2.]
Score : [2. 1. 3. 2.]
Score : [2. 1. 3. 2.]
Score : [2. 1. 3. 2.]
Score : [2. 1. 3. 2.]
Score : [2. 1. 3. 2.]
Score : [2. 1. 3. 2.]
Score : [3. 1. 3. 2.]
Score : [3

Score : [17. 17. 19. 19.]
Score : [17. 17. 19. 19.]
Score : [17. 17. 19. 19.]
Score : [17. 17. 19. 19.]
Score : [17. 17. 19. 19.]
Score : [17. 17. 19. 19.]
Score : [17. 17. 19. 19.]
Score : [17. 18. 19. 19.]
Score : [17. 18. 19. 19.]
Score : [17. 18. 19. 19.]
Score : [17. 18. 20. 19.]
Score : [17. 18. 20. 19.]
Score : [17. 18. 20. 19.]
Score : [17. 18. 20. 20.]
Score : [17. 18. 20. 20.]
Score : [17. 18. 20. 20.]
Score : [17. 18. 20. 20.]
Score : [17. 18. 20. 20.]
Score : [17. 18. 20. 20.]
Score : [17. 19. 20. 20.]
Score : [17. 19. 20. 20.]
Score : [17. 19. 20. 20.]
Score : [17. 19. 21. 20.]
Score : [17. 19. 21. 20.]
Score : [17. 19. 21. 20.]
Score : [18. 19. 21. 20.]
Score : [18. 19. 21. 20.]
Score : [18. 19. 21. 20.]
Score : [18. 20. 21. 20.]
Score : [18. 20. 21. 20.]
Score : [18. 20. 21. 20.]
Score : [18. 20. 21. 20.]
Score : [18. 20. 21. 20.]
Score : [18. 20. 21. 20.]
Score : [18. 20. 22. 20.]
Score : [18. 20. 22. 20.]
Score : [18. 20. 22. 20.]
Score : [18. 20. 22. 21.]
Score : [18.

KeyboardInterrupt: 

In [46]:
env.close()