# Reinforcement Learning with Atari Games

## 1. Initial Setup

In [1]:
from core.create import create_model, get_utility_params
from utils.helper import set_device
from utils.model_utils import load_model

import torch

In [2]:
# Load the utility settings that the project keeps in its yaml file
util_params = get_utility_params()

# Promote the training-schedule entries to notebook-level hyperparameters
NUM_EPISODES, SAVE_EVERY = util_params['num_episodes'], util_params['save_every']
print(f'NUM_EPISODES={NUM_EPISODES}, SAVE_EVERY={SAVE_EVERY}')

NUM_EPISODES=50000, SAVE_EVERY=10000


In [3]:
# Set CUDA device
# The project helper chooses the compute device; in the recorded run it
# reported CUDA as available and selected 'cuda:0'.
# NOTE(review): presumably falls back to CPU when CUDA is unavailable —
# confirm in utils.helper.
device = set_device()

CUDA available. Device set to GPU -> 'cuda:0'


In [4]:
# Environment IDs (read from the utility parameters) for the two agents below
env2, env3 = util_params['env_2'], util_params['env_3']
print(f'env2={env2}, env3={env3}')

env2=ALE/Qbert-v5, env3=ALE/MontezumaRevenge-v5


## 2. Model Creation and Training

### 2a. Rainbow Deep Q-Network (RDQN)

In [5]:
# Create Rainbow DQN instance
# Built for the env2 environment (Qbert in the recorded run) on the device
# selected above.
rainbow = create_model('rainbow', env=env2, device=device)

  logger.warn(


In [6]:
# Display the environment metadata stored on the agent (observation/action
# spaces, input shape, frame-stack size, video-capture settings).
rainbow.env_details

{'gym_name': 'ALE/Qbert-v5', 'name': 'Qbert', 'obs_space': Box(0, 255, (4, 84, 84), uint8), 'action_space': Discrete(6), 'input_shape': (4, 84, 84), 'n_actions': 6, 'img_size': 84, 'stack_size': 4, 'capture_video': False, 'record_every': 10000}

In [7]:
# Train model
# Short demonstration run: 4 episodes with progress printed after every
# episode, instead of the full NUM_EPISODES schedule loaded from the config.
# NOTE(review): with save_count=2 the recorded run saved checkpoints at
# episodes 2 and 4 — confirm whether save_count is "number of saves" or
# "save interval", since both readings match this run.
rainbow.train(num_episodes=4, print_every=1, save_count=2)

Training agent on Qbert with 4 episodes.
Buffer size: 1k, batch size: 32, max timesteps: 1k, num network updates: 4, replay period: 100.
(1.0/4)  Episode Score: 250,   Train Loss: 3.62703,  Time taken: 8.84 secs.
(2.0/4)  Episode Score: 275,   Train Loss: 3.69297,  Time taken: 8.98 secs.
Saved model at episode 2 as: 'rainbow_batch32_buffer1000_Qbert_ep2.pt'.
Saved logger data to 'saved_models/rainbow_Qbert_logger_data.tar.gz'. Total size: 623 bytes
(3.0/4)  Episode Score: 50,   Train Loss: 3.34267,  Time taken: 3.64 secs.
(4.0/4)  Episode Score: 75,   Train Loss: 3.12163,  Time taken: 4.08 secs.
Saved model at episode 4 as: 'rainbow_batch32_buffer1000_Qbert_ep4.pt'.
Saved logger data to 'saved_models/rainbow_Qbert_logger_data.tar.gz'. Total size: 673 bytes
Training complete. Access metrics from 'logger' attribute. 

In [8]:
# Display the logger; its repr lists the metric attributes available after
# training.
rainbow.logger

Available attributes: '['avg_returns', 'actions', 'train_losses', 'ep_scores']'

In [9]:
# Display the logged action data: a list of Counter objects keyed by action
# index. NOTE(review): values are presumably how often each action was
# selected during training — confirm against the logger implementation.
rainbow.logger.actions

[Counter({5: 2935, 1: 3012, 0: 1261, 3: 1099, 4: 1028, 2: 553})]

### 2b. Proximal Policy Optimization (PPO)

In [10]:
# Create PPO instance
# Built for the env3 environment (MontezumaRevenge in the recorded run) on the
# same device as the Rainbow agent.
ppo = create_model('ppo', env=env3, device=device)

In [11]:
# Display the environment metadata stored on the PPO agent (observation/action
# spaces, input shape, frame-stack size, video-capture settings).
ppo.env_details

{'gym_name': 'ALE/MontezumaRevenge-v5', 'name': 'MontezumaRevenge', 'obs_space': Box(0, 255, (4, 84, 84), uint8), 'action_space': Discrete(18), 'input_shape': (4, 84, 84), 'n_actions': 18, 'img_size': 84, 'stack_size': 4, 'capture_video': False, 'record_every': 10000}

In [12]:
# Per the training log, one PPO training iteration corresponds to
# rollout_size * num_agents "episodes", so episode budgets are scaled by it.
episodes_per_iteration = ppo.params.rollout_size * ppo.params.num_agents

# Episode budget for a full-length run driven by the configured NUM_EPISODES.
PPO_NUM_EPISODES = episodes_per_iteration * NUM_EPISODES

# Episode budget for the short demo: 4 training iterations. Computed directly
# from the per-iteration factor instead of (PPO_NUM_EPISODES / NUM_EPISODES) * 4,
# which round-trips through float division and raises ZeroDivisionError when
# NUM_EPISODES is 0.
demo_episodes = int(episodes_per_iteration * 4)

In [13]:
# Short demonstration run: demo_episodes is sized so that the agent performs
# only a handful of training iterations, with progress printed after each one.
ppo.train(num_episodes=demo_episodes, print_every=1, save_count=2)  # 4 training iterations

Training agent on MontezumaRevenge with 3K episodes.
Surrogate clipping size: 0.1, rollout size: 100, num agents: 8, num network updates: 4, batch size: 800, training iterations: 4.
(1.0/4) Episodic Return: 0.01683,  Approx KL: -0.00013,  Total Loss: -0.03043,  Policy Loss: -0.00156,  Value Loss: 0.00002,  Entropy Loss: 2.88842,  Time taken: 1.55 secs.
(2.0/4) Episodic Return: 0.00821,  Approx KL: 0.00035,  Total Loss: -0.02935,  Policy Loss: -0.00048,  Value Loss: 0.00001,  Entropy Loss: 2.88770,  Time taken: 2.32 secs.
Saved model at episode 2 as: 'ppo_rollout100_agents8_MonRev_ep2.pt'.
Saved logger data to 'saved_models/ppo_MonRev_logger_data.tar.gz'. Total size: 776 bytes
(3.0/4) Episodic Return: 0.00386,  Approx KL: 0.00045,  Total Loss: -0.02936,  Policy Loss: -0.00049,  Value Loss: 0.00000,  Entropy Loss: 2.88753,  Time taken: 1.44 secs.
(4.0/4) Episodic Return: 0.00117,  Approx KL: -0.00016,  Total Loss: -0.02936,  Policy Loss: -0.00049,  Value Loss: 0.00000,  Entropy Loss: 2.8

In [14]:
# Display the logger; its repr lists the metric attributes available after
# training.
ppo.logger

Available attributes: '['actions', 'avg_rewards', 'avg_returns', 'policy_losses', 'value_losses', 'entropy_losses', 'total_losses', 'approx_kl']'

In [15]:
# Display the logged action data: a list of Counter objects keyed by action
# index. NOTE(review): values are presumably how often each action was
# selected during training — confirm against the logger implementation.
ppo.logger.actions

[Counter({6: 816,
          8: 696,
          2: 776,
          9: 612,
          7: 676,
          10: 684,
          0: 576,
          1: 740,
          14: 612,
          4: 772,
          3: 788,
          5: 700,
          11: 668,
          16: 800,
          15: 876,
          17: 712,
          12: 664,
          13: 632})]