# Reinforcement Learning with Atari Games

## 1. Initial Setup

In [1]:
from core.create import create_model, get_utility_params

import torch

In [2]:
# Load the shared utility settings (read from the project's yaml file)
util_params = get_utility_params()

# Promote the settings to notebook-level hyperparameters and echo them
NUM_EPISODES, SAVE_EVERY = util_params['num_episodes'], util_params['save_every']
print(f'NUM_EPISODES={NUM_EPISODES}, SAVE_EVERY={SAVE_EVERY}')

NUM_EPISODES=100000, SAVE_EVERY=1000


## 2. Model Creation and Training

### 2a. Rainbow Deep Q-Network (RDQN)

In [3]:
# Create Rainbow DQN instance via the project's model factory; the resulting
# `dqn` object is what the following cells inspect (env_details, logger) and train.
dqn = create_model('rainbow')

  logger.warn(


CUDA available. Device set to GPU.


In [4]:
# Display the environment settings attached to the agent (gym id, observation
# and action spaces, input shape, frame stacking, video-capture options).
dqn.env_details

{'gym_name': 'ALE/SpaceInvaders-v5', 'name': 'SpaceInvaders', 'obs_space': Box(0, 255, (4, 128, 128), uint8), 'action_space': Discrete(6), 'input_shape': (4, 128, 128), 'n_actions': 6, 'img_size': 128, 'stack_size': 4, 'capture_video': False, 'record_every': 1000}

In [5]:
# Short demonstration run: train for 4 episodes and report metrics after every
# episode (print_every=1). With save_count=2 the recorded run wrote checkpoints
# at episodes 2 and 4 -- presumably evenly spaced saves; confirm in dqn.train.
dqn.train(num_episodes=4, print_every=1, save_count=2)

Training agent on SpaceInvaders with 4 episodes.
Buffer size: 1k, batch size: 32, max timesteps: 1k, num network updates: 4, replay period: 100.
(1/4)  Episode Score: 120,   Train Loss: 3.65424,  Time taken: 8.26 secs.
(2/4)  Episode Score: 120,   Train Loss: 3.78992,  Time taken: 8.99 secs.
Saved model at episode 2 as: 'rainbow_batch32_buffer_size1000_ep2.pt'.
Saved logger data to 'saved_models/rainbow_logger_data.tar.gz'. Total size: 444 bytes
(3/4)  Episode Score: 110,   Train Loss: 3.53117,  Time taken: 9.63 secs.
(4/4)  Episode Score: 235,   Train Loss: 2.04486,  Time taken: 14.68 secs.
Saved model at episode 4 as: 'rainbow_batch32_buffer_size1000_ep4.pt'.
Saved logger data to 'saved_models/rainbow_logger_data.tar.gz'. Total size: 494 bytes
Training complete. Access metrics from 'logger' attribute. 

In [6]:
# Show the logger's repr, which lists the metric attributes collected during training.
dqn.logger

Available attributes: '['avg_returns', 'actions', 'train_losses', 'ep_scores']'

In [15]:
# Action counts recorded by the logger, displayed as a list of Counter objects
# (presumably keyed by action index 0..n_actions-1 -- confirm in the logger code).
dqn.logger.actions

[Counter({4: 5444, 0: 1380, 5: 2868, 1: 2262, 2: 1497, 3: 1013})]

### 2b. Proximal Policy Optimization (PPO)

In [8]:
# Create PPO instance via the project's model factory; the resulting `ppo`
# object is what the following cells inspect (env_details, params, logger) and train.
ppo = create_model('ppo')

CUDA available. Device set to GPU.


In [9]:
# Display the environment settings attached to the PPO agent (gym id, observation
# and action spaces, input shape, frame stacking, video-capture options).
ppo.env_details

{'gym_name': 'ALE/SpaceInvaders-v5', 'name': 'SpaceInvaders', 'obs_space': Box(0, 255, (4, 128, 128), uint8), 'action_space': Discrete(6), 'input_shape': (4, 128, 128), 'n_actions': 6, 'img_size': 128, 'stack_size': 4, 'capture_video': False, 'record_every': 1000}

In [10]:
# Release unoccupied cached GPU memory held by PyTorch's caching allocator before
# PPO training. NOTE(review): memory owned by live tensors (e.g. anything still
# referenced through `dqn`) is not freed by this call.
torch.cuda.empty_cache()

In [11]:
# PPO's `num_episodes` is a step budget rather than a count of game episodes:
# in the recorded run, rollout_size (100) * num_agents (8) steps correspond to
# one training iteration.
DEMO_ITERATIONS = 4  # training iterations for the short demonstration run
ppo_steps_per_iteration = ppo.params.rollout_size * ppo.params.num_agents

# Step budget for a full-length run (NUM_EPISODES training iterations).
PPO_NUM_EPISODES = ppo_steps_per_iteration * NUM_EPISODES

# Step budget for the demo. Derived directly from the per-iteration size
# instead of dividing PPO_NUM_EPISODES by NUM_EPISODES, which went through a
# float and raised ZeroDivisionError when NUM_EPISODES was 0.
demo_episodes = int(ppo_steps_per_iteration * DEMO_ITERATIONS)

In [12]:
# Short PPO demonstration run. `demo_episodes` is the budget computed in the
# previous cell (reported as 4 training iterations in the recorded output);
# print_every=1 reports metrics after every iteration.
ppo.train(num_episodes=demo_episodes, print_every=1, save_count=2)  # 4 training iterations

Training agent on SpaceInvaders with 3K episodes.
Surrogate clipping size: 0.1, rollout size: 100, num agents: 8, num network updates: 4, batch size: 800, training iterations: 4.
(1/4) Episodic Return: 0.47469,  Approx KL: 0.00071,  Total Loss: 0.11363,  Policy Loss: -0.00132,  Value Loss: 0.26567,  Entropy Loss: 1.78940,  Time taken: 1.89 secs.
(2/4) Episodic Return: 0.38852,  Approx KL: 0.00066,  Total Loss: 0.03287,  Policy Loss: -0.00384,  Value Loss: 0.10916,  Entropy Loss: 1.78648,  Time taken: 1.56 secs.
Saved model at episode 2 as: 'ppo_rollout100_agents8_ep2.pt'.
Saved logger data to 'saved_models/ppo_logger_data.tar.gz'. Total size: 563 bytes
(3/4) Episodic Return: 0.63116,  Approx KL: 0.60172,  Total Loss: -0.13292,  Policy Loss: -0.21828,  Value Loss: 0.19687,  Entropy Loss: 1.30766,  Time taken: 1.76 secs.
(4/4) Episodic Return: 0.44795,  Approx KL: 0.06731,  Total Loss: 0.04140,  Policy Loss: -0.00553,  Value Loss: 0.09404,  Entropy Loss: 0.00873,  Time taken: 1.65 secs.


In [17]:
# Show the PPO logger's repr, which lists the metric attributes it exposes.
ppo.logger

Available attributes: '['actions', 'avg_rewards', 'avg_returns', 'policy_losses', 'value_losses', 'entropy_losses', 'total_losses', 'approx_kl']'

In [18]:
# Action counts recorded by the PPO logger, displayed as a list of Counter objects
# (presumably keyed by action index 0..n_actions-1 -- confirm in the logger code).
ppo.logger.actions

[Counter({2: 1440, 3: 1736, 0: 1568, 4: 1416, 1: 1732, 5: 4908})]