# Breakout Game (using PyTorch)

# Import the dependencies

In [2]:
import gym 
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import VecFrameStack # wrapper applied to the vectorized env below to stack frames (n_stack=4)
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_atari_env # builds the vectorized Atari environments used below
import os

# Test the Atari environment

In [22]:
# Single Breakout environment used for the manual sanity checks below;
# render_mode='human' asks gym for on-screen rendering.
env = gym.make(
    "ALE/Breakout-v5",
    render_mode='human',
)

In [23]:
# Start a new episode. As the output below shows, this returns a tuple of
# (initial observation array, info dict).
env.reset()

(array([[[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        ...,
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]]], dtype=uint8),
 {'lives': 5, 'episode_frame_number': 0, 'frame_number': 0})

In [24]:
# Inspect the set of actions the agent can choose from.
env.action_space

Discrete(4)

In [25]:

# Inspect the observation format: observations are images, which is why an
# image-based (CNN) policy is used for the model later on.
env.observation_space


Box(0, 255, (210, 160, 3), uint8)

In [26]:
# Play a few episodes with uniformly random actions. This checks that the
# environment works and gives a baseline score to compare the trained agent to.
episodes = 5
for episode in range(1, episodes + 1):
    state = env.reset()
    done = False
    score = 0

    while not done:
        action = env.action_space.sample()
        # step() returns five values here, including separate `terminated` and
        # `truncated` flags. The episode is over as soon as either one is set,
        # so both must feed the loop condition (ignoring `truncated` can leave
        # the loop stepping an environment whose episode has already ended).
        n_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        score += reward
    print('Episode:{} Score:{}'.format(episode, score))

Episode:1 Score:2.0
Episode:2 Score:2.0
Episode:3 Score:1.0
Episode:4 Score:1.0
Episode:5 Score:1.0


In [28]:
# Release the single test environment before `env` is rebound to the
# vectorized training environment.
env.close()

# Let's vectorise our environment and run 4 copies of it at the same time (a bit like threading)

In [30]:
# Build 4 copies of the Breakout environment as one vectorized environment.
env = make_atari_env(
    'ALE/Breakout-v5',
    n_envs=4,
    seed=0,
)

In [31]:
# Wrap the vectorized environment so that 4 frames are stacked (n_stack=4).
env = VecFrameStack(venv=env, n_stack=4)

In [34]:
# Directory handed to the model as its `tensorboard_log` location.
log_root, log_subdir = 'pytorch_RL', 'Logs'
log_path = os.path.join(log_root, log_subdir)

In [35]:
# A2C agent with an image-based (CNN) policy, bound to the vectorized
# environment; training metrics are logged under `log_path`.
model = A2C(
    policy="CnnPolicy",
    env=env,
    verbose=1,
    tensorboard_log=log_path,
)

Using cpu device
Wrapping the env in a VecTransposeImage.


In [36]:
# Train the agent. This is the expensive cell of the notebook, and it must run
# before the save/evaluate cells below: with it commented out, a fresh
# "Restart & Run All" would save and evaluate an untrained model and would not
# reproduce the training log recorded under this cell.
model.learn(total_timesteps=400000)

Logging to pytorch_RL/Logs/A2C_1
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 240      |
|    ep_rew_mean        | 1.91     |
| time/                 |          |
|    fps                | 163      |
|    iterations         | 100      |
|    time_elapsed       | 12       |
|    total_timesteps    | 2000     |
| train/                |          |
|    entropy_loss       | -1.38    |
|    explained_variance | 0.015    |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 0.244    |
|    value_loss         | 0.252    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 239      |
|    ep_rew_mean        | 1.97     |
| time/                 |          |
|    fps                | 164      |
|    iterations         | 200      |
|    time_elapsed       | 24       |
|    total_timesteps    | 4000     |
| tra

<stable_baselines3.a2c.a2c.A2C at 0x7f8c175e8d60>

In [37]:
# Destination path the trained model is saved to.
model_dir_parts = ('pytorch_RL', 'saved_model')
a2c_path = os.path.join(*model_dir_parts, 'A2C_BreakOut_Model')

In [38]:
# Persist the model to `a2c_path` so it can be reused without retraining.
model.save(a2c_path)

# Evaluate and test our model

In [39]:
# Evaluation uses a fresh single-environment setup (built in the next cell), so
# release the 4-env training environment first. Creating a raw gym.make() env
# here is unnecessary: it would be rebound by the next cell without ever being
# used or closed.
env.close()

In [46]:

# Single-environment evaluation setup, wrapped with the same 4-frame stacking
# that was applied to the training environment.
env = VecFrameStack(
    make_atari_env('ALE/Breakout-v5', n_envs=1, seed=0),
    n_stack=4,
)

In [47]:
# Run the model for 10 evaluation episodes. The recorded output is a 2-tuple;
# per the Stable-Baselines3 docs this is (mean episode reward, std of episode reward).
evaluate_policy(model, env, n_eval_episodes=10)

(22.3, 4.561797891182818)

In [None]:
# Watch the trained agent play. The loop is bounded by a count of finished
# episodes rather than `while True`, so the cell terminates on its own and the
# closing `env.close()` cell is actually reached on "Run All".
# NOTE(review): with the Atari wrappers an "episode" may end on every lost life
# rather than on game over - confirm and adjust n_render_episodes if needed.
n_render_episodes = 5
finished_episodes = 0

obs = env.reset()
while finished_episodes < n_render_episodes:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render("human")
    # `dones` holds one flag per environment; there is a single env here.
    finished_episodes += int(dones[0])

In [None]:
# Release the evaluation environment now that playback is finished.
env.close()