## 1. Importing Dependencies
https://gym.openai.com/envs/#atari

In [3]:
import gym
from stable_baselines3 import A2C #different algorithm
from stable_baselines3.common.vec_env import VecFrameStack  #using 4 envs at the same time to train our model
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_atari_env
import os



## 2. Test Environment
Download it from here : http://www.atarimania.com/roms/Roms.rar

Breakout-v0   
Maximize your score in the Atari 2600 game Breakout. In this environment, **the observation is an RGB image of the screen**,         
which is an array of shape (210, 160, 3) Each action is repeatedly performed for a duration of kk frames, where kk is uniformly sampled from \{2, 3, 4\}{2,3,4}.


In [2]:
!python -m atari_py.import_roms .\ROMS\ROMS

In [3]:
#Exploring the Env
environment_name = 'Breakout-v0'
env = gym.make(environment_name)

In [4]:
env.action_space

Discrete(4)

In [5]:
env.action_space.sample() # 0 1 2 3 

0

In [6]:
env.observation_space

Box([[[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 ...

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]], [[[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 ...

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
 

In [7]:
env.reset()

array([[[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       ...,

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]]], dtype=uint8)

In [8]:
episodes = 5
for episode in range(1, episodes+1): # looping through 5 episodes
    obs = env.reset()   #get the environment to the initial state
    done = False        #temp variables
    score = 0           #temp variables (Score counter for every episode.)
    
    while not done:
        env.render()    
        action = env.action_space.sample()  # Later to be replaced with the Trained Model
        debug_text = ''
        obs, reward, done, info = env.step(action)
        debug_text = str((obs, reward, done, info))
        #print(debug_text)
        
        score+=reward
    print('\n------------> Episode:{} Score:{}'.format(episode, score))
env.close()




------------> Episode:1 Score:3.0

------------> Episode:2 Score:1.0

------------> Episode:3 Score:7.0

------------> Episode:4 Score:1.0

------------> Episode:5 Score:0.0


## 3. Vectorise Environment and Train the Model

In [8]:
env = make_atari_env('Breakout-v0' , n_envs=4 , seed=0)
env = VecFrameStack(env , n_stack=4)
env.reset()
# env.render()

array([[[[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         ...,
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0]],

        [[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         ...,
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0]],

        [[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         ...,
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0]],

        ...,

        [[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         ...,
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0]],

        [[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         ...,
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0]],

        [[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         ...,
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0]]],


       [[[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0

In [10]:
env.render()

In [11]:
env.close()

In [16]:
#Deleting and Reloading the Model from File
del model
model = PPO.load(PPO_path , env)

NameError: name 'model' is not defined

In [9]:
model = A2C('CnnPolicy', env, verbose=1, tensorboard_log=log_path)

D:\Coursera_Udacity\03_Reinforcement_Learning\02_Main_Course\Training\Logs
Using cuda device
Wrapping the env in a VecTransposeImage.


In [10]:
# Uncomment If you want to Train the Model and Try to Train if for Longer Time for Improvement
#model.learn(total_timesteps=100000)

Logging to D:\Coursera_Udacity\03_Reinforcement_Learning\02_Main_Course\Training\Logs\A2C_1
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 282      |
|    ep_rew_mean        | 1.6      |
| time/                 |          |
|    fps                | 89       |
|    iterations         | 100      |
|    time_elapsed       | 22       |
|    total_timesteps    | 2000     |
| train/                |          |
|    entropy_loss       | -1.38    |
|    explained_variance | 0.0536   |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 0.452    |
|    value_loss         | 0.326    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 289      |
|    ep_rew_mean        | 1.68     |
| time/                 |          |
|    fps                | 124      |
|    iterations         | 200      |
|    time_elapsed   

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 370      |
|    ep_rew_mean        | 3.5      |
| time/                 |          |
|    fps                | 193      |
|    iterations         | 1400     |
|    time_elapsed       | 144      |
|    total_timesteps    | 28000    |
| train/                |          |
|    entropy_loss       | -0.623   |
|    explained_variance | 0.304    |
|    learning_rate      | 0.0007   |
|    n_updates          | 1399     |
|    policy_loss        | -0.164   |
|    value_loss         | 0.231    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 379      |
|    ep_rew_mean        | 3.84     |
| time/                 |          |
|    fps                | 195      |
|    iterations         | 1500     |
|    time_elapsed       | 153      |
|    total_timesteps    | 30000    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 449      |
|    ep_rew_mean        | 5.44     |
| time/                 |          |
|    fps                | 206      |
|    iterations         | 2800     |
|    time_elapsed       | 271      |
|    total_timesteps    | 56000    |
| train/                |          |
|    entropy_loss       | -0.147   |
|    explained_variance | 0.855    |
|    learning_rate      | 0.0007   |
|    n_updates          | 2799     |
|    policy_loss        | -0.00881 |
|    value_loss         | 0.0943   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 446      |
|    ep_rew_mean        | 5.32     |
| time/                 |          |
|    fps                | 207      |
|    iterations         | 2900     |
|    time_elapsed       | 279      |
|    total_timesteps    | 58000    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 470      |
|    ep_rew_mean        | 5.66     |
| time/                 |          |
|    fps                | 212      |
|    iterations         | 4200     |
|    time_elapsed       | 394      |
|    total_timesteps    | 84000    |
| train/                |          |
|    entropy_loss       | -0.698   |
|    explained_variance | 0.807    |
|    learning_rate      | 0.0007   |
|    n_updates          | 4199     |
|    policy_loss        | 0.00522  |
|    value_loss         | 0.066    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 460      |
|    ep_rew_mean        | 5.49     |
| time/                 |          |
|    fps                | 213      |
|    iterations         | 4300     |
|    time_elapsed       | 403      |
|    total_timesteps    | 86000    |
| train/                |          |
|

<stable_baselines3.a2c.a2c.A2C at 0x1c5d39e0550>

## 4. Save and Reload Model

In [12]:
a2c_path = os.path.join(os.getcwd(),'Training','Saved Models','A2C_Breakout_Model')
print(a2c_path)
model.save(a2c_path)

D:\Coursera_Udacity\03_Reinforcement_Learning\02_Main_Course\Training\Saved Models\A2C_Breakout_Model


In [17]:
model = A2C.load(a2c_path , env)

Wrapping the env in a VecTransposeImage.


## 5. Evaluate the Test

In [18]:
env = make_atari_env('Breakout-v0' , n_envs=1 , seed=0)
env = VecFrameStack(env , n_stack=4)

In [21]:
evaluate_policy(model, env, n_eval_episodes=10 , render=True)
# Return : Mean Reward , Std Reward 

(7.3, 2.2825424421026654)

In [22]:
evaluate_policy??