# Reinforcement Learning with Atari Games

## 1. Initial Setup

In [1]:
from core.create import create_model, get_utility_params
from utils.helper import set_device
from utils.model_utils import load_model
from utils.render import video_render

In [2]:
# Load the utility settings that the project reads from its YAML file
util_params = get_utility_params()

# Pull out the two run-length settings as notebook-level hyperparameters
NUM_EPISODES, SAVE_EVERY = (
    util_params[key] for key in ('num_episodes', 'save_every')
)
print(f'NUM_EPISODES={NUM_EPISODES}, SAVE_EVERY={SAVE_EVERY}')

NUM_EPISODES=100000, SAVE_EVERY=10000


In [3]:
# Select the compute device through the project helper. In the recorded run
# CUDA was unavailable, so the helper reported falling back to 'cpu'.
device = set_device()

CUDA unavailable. Device set to CPU -> 'cpu'.


In [4]:
# Look up the ids of the second and third environments from the utility settings
env2, env3 = util_params['env_2'], util_params['env_3']
print(f'env2={env2}, env3={env3}')

env2=ALE/Qbert-v5, env3=ALE/MontezumaRevenge-v5


## 2. Model Creation and Training

### 2a. Deep Q-Network (DQN)

In [5]:
# Create three DQN agents (no env argument is passed, so create_model picks the environment):
# curiosity-driven, no intrinsic method, and empowerment-driven
dqn = create_model('dqn', device=device, im_type='curiosity')
dqn2 = create_model('dqn', device=device)
dqn3 = create_model('dqn', device=device, im_type='empowerment')

  logger.warn(


In [6]:
# Show the environment details held by each of the three DQN agents
tuple(agent.env_details for agent in (dqn, dqn2, dqn3))

({'gym_name': 'ALE/SpaceInvaders-v5', 'name': 'SpaceInvaders', 'obs_space': Box(0, 255, (4, 84, 84), uint8), 'action_space': Discrete(6), 'input_shape': (4, 84, 84), 'n_actions': 6, 'img_size': 84, 'stack_size': 4, 'capture_video': False, 'record_every': 10000},
 {'gym_name': 'ALE/SpaceInvaders-v5', 'name': 'SpaceInvaders', 'obs_space': Box(0, 255, (4, 84, 84), uint8), 'action_space': Discrete(6), 'input_shape': (4, 84, 84), 'n_actions': 6, 'img_size': 84, 'stack_size': 4, 'capture_video': False, 'record_every': 10000},
 {'gym_name': 'ALE/SpaceInvaders-v5', 'name': 'SpaceInvaders', 'obs_space': Box(0, 255, (4, 84, 84), uint8), 'action_space': Discrete(6), 'input_shape': (4, 84, 84), 'n_actions': 6, 'img_size': 84, 'stack_size': 4, 'capture_video': False, 'record_every': 10000})

In [7]:
# Train the curiosity-driven DQN agent for 5 episodes; with print_every=1 the
# recorded run logs one line per episode.
# NOTE(review): save_count is passed straight to train(); confirm its meaning in the project code.
dqn.train(num_episodes=5, print_every=1, save_count=10)

Training agent on SpaceInvaders with 5 episodes.
Buffer size: 1k, batch size: 32, max timesteps: 1k, num network updates: 4, intrinsic method: curiosity
(1.0/5) Episode Score: 120,  Train Loss: 33715.38281,  Curiosity Loss: 3.38288,  Time taken: 17 secs.
(2.0/5) Episode Score: 65,  Train Loss: 2834.34448,  Curiosity Loss: 3.38263,  Time taken: 20 secs.
(3.0/5) Episode Score: 60,  Train Loss: 1425.95959,  Curiosity Loss: 3.38442,  Time taken: 31 secs.
(4.0/5) Episode Score: 30,  Train Loss: 872.49402,  Curiosity Loss: 3.38381,  Time taken: 14 secs.
(5.0/5) Episode Score: 170,  Train Loss: 238.22807,  Curiosity Loss: 3.37853,  Time taken: 26 secs.
Training complete. Access metrics from 'logger' attribute. 

In [8]:
# Train the DQN agent that was created without an intrinsic method for 5 episodes,
# logging every episode (print_every=1)
dqn2.train(num_episodes=5, print_every=1, save_count=10)

Training agent on SpaceInvaders with 5 episodes.
Buffer size: 1k, batch size: 32, max timesteps: 1k, num network updates: 4, intrinsic method: None
(1.0/5) Episode Score: 240,  Train Loss: 2350.07202,  Time taken: 11 secs.
(2.0/5) Episode Score: 320,  Train Loss: 78.95786,  Time taken: 11 secs.
(3.0/5) Episode Score: 45,  Train Loss: 18.63291,  Time taken: 6 secs.
(4.0/5) Episode Score: 75,  Train Loss: 46726.08203,  Time taken: 5 secs.
(5.0/5) Episode Score: 65,  Train Loss: 583.63867,  Time taken: 4 secs.
Training complete. Access metrics from 'logger' attribute. 

In [9]:
# Train the empowerment-driven DQN agent for 5 episodes, logging every episode (print_every=1)
dqn3.train(num_episodes=5, print_every=1, save_count=10)

Training agent on SpaceInvaders with 5 episodes.
Buffer size: 1k, batch size: 32, max timesteps: 1k, num network updates: 4, intrinsic method: empowerment
(1.0/5) Episode Score: 100,  Train Loss: 0.01559,  Source Loss: 0.00044,  Forward Loss: 1.22817,  Time taken: 13 secs.
(2.0/5) Episode Score: 195,  Train Loss: 0.02922,  Source Loss: 0.00015,  Forward Loss: 1.30748,  Time taken: 19 secs.
(3.0/5) Episode Score: 335,  Train Loss: 11.47224,  Source Loss: 0.00007,  Forward Loss: 1.32410,  Time taken: 28 secs.
(4.0/5) Episode Score: 105,  Train Loss: 8.23236,  Source Loss: 0.00005,  Forward Loss: 1.33231,  Time taken: 16 secs.
(5.0/5) Episode Score: 110,  Train Loss: 0.36981,  Source Loss: 0.00004,  Forward Loss: 1.31164,  Time taken: 16 secs.
Training complete. Access metrics from 'logger' attribute. 

In [10]:
# Inspect the logger attached to each DQN agent
tuple(agent.logger for agent in (dqn, dqn2, dqn3))

(Available attributes: '['actions', 'train_losses', 'ep_scores', 'intrinsic_losses']',
 Available attributes: '['actions', 'train_losses', 'ep_scores', 'intrinsic_losses']',
 Available attributes: '['actions', 'train_losses', 'ep_scores', 'intrinsic_losses']')

In [11]:
# Compare the action counts logged for each of the three DQN agents
tuple(agent.logger.actions for agent in (dqn, dqn2, dqn3))

([Counter({4: 357, 1: 383, 0: 388, 2: 371, 5: 368, 3: 418})],
 [Counter({5: 383, 1: 435, 0: 370, 4: 400, 3: 397, 2: 397})],
 [Counter({0: 518, 4: 470, 5: 494, 1: 469, 2: 491, 3: 475})])

### 2b. Rainbow Deep Q-Network (RDQN)

In [12]:
# Create three Rainbow DQN agents on env2: curiosity-driven, no intrinsic method,
# and empowerment-driven
rainbow = create_model('rainbow', env=env2, device=device, im_type='curiosity')
rainbow2 = create_model('rainbow', env=env2, device=device)
rainbow3 = create_model('rainbow', env=env2, device=device, im_type='empowerment')

In [13]:
# Show the environment details held by each of the three Rainbow agents
tuple(agent.env_details for agent in (rainbow, rainbow2, rainbow3))

({'gym_name': 'ALE/Qbert-v5', 'name': 'Qbert', 'obs_space': Box(0, 255, (4, 84, 84), uint8), 'action_space': Discrete(6), 'input_shape': (4, 84, 84), 'n_actions': 6, 'img_size': 84, 'stack_size': 4, 'capture_video': False, 'record_every': 10000},
 {'gym_name': 'ALE/Qbert-v5', 'name': 'Qbert', 'obs_space': Box(0, 255, (4, 84, 84), uint8), 'action_space': Discrete(6), 'input_shape': (4, 84, 84), 'n_actions': 6, 'img_size': 84, 'stack_size': 4, 'capture_video': False, 'record_every': 10000},
 {'gym_name': 'ALE/Qbert-v5', 'name': 'Qbert', 'obs_space': Box(0, 255, (4, 84, 84), uint8), 'action_space': Discrete(6), 'input_shape': (4, 84, 84), 'n_actions': 6, 'img_size': 84, 'stack_size': 4, 'capture_video': False, 'record_every': 10000})

In [14]:
# Train the curiosity-driven Rainbow agent for 5 episodes; with print_every=1 the
# recorded run logs one line per episode.
# NOTE(review): save_count is passed straight to train(); confirm its meaning in the project code.
rainbow.train(num_episodes=5, print_every=1, save_count=10)

Training agent on Qbert with 5 episodes.
Buffer size: 1k, batch size: 32, max timesteps: 1k, num network updates: 4, replay period: 100, intrinsic method: curiosity.
(1.0/5)  Episode Score: 0,  Train Loss: 7.34908,  Curiosity Loss: 3.54450,  Time taken: 20 secs.
(2.0/5)  Episode Score: 200,  Train Loss: 7.33582,  Curiosity Loss: 3.54445,  Time taken: 13 secs.
(3.0/5)  Episode Score: 0,  Train Loss: 7.36223,  Curiosity Loss: 3.54735,  Time taken: 10 secs.
(4.0/5)  Episode Score: 25,  Train Loss: 7.10950,  Curiosity Loss: 3.54097,  Time taken: 14 secs.
(5.0/5)  Episode Score: 50,  Train Loss: 6.64158,  Curiosity Loss: 3.53350,  Time taken: 15 secs.
Training complete. Access metrics from 'logger' attribute. 

In [15]:
# Train the Rainbow agent that was created without an intrinsic method for 5 episodes,
# logging every episode (print_every=1)
rainbow2.train(num_episodes=5, print_every=1, save_count=10)

Training agent on Qbert with 5 episodes.
Buffer size: 1k, batch size: 32, max timesteps: 1k, num network updates: 4, replay period: 100, intrinsic method: None.
(1.0/5)  Episode Score: 125,  Train Loss: 3.73911,  Time taken: 7 secs.
(2.0/5)  Episode Score: 25,  Train Loss: 3.77298,  Time taken: 6 secs.
(3.0/5)  Episode Score: 150,  Train Loss: 3.16742,  Time taken: 6 secs.
(4.0/5)  Episode Score: 200,  Train Loss: 2.54698,  Time taken: 7 secs.
(5.0/5)  Episode Score: 125,  Train Loss: 2.80329,  Time taken: 7 secs.
Training complete. Access metrics from 'logger' attribute. 

In [16]:
# Train the empowerment-driven Rainbow agent for 5 episodes, logging every episode (print_every=1)
rainbow3.train(num_episodes=5, print_every=1, save_count=10)

Training agent on Qbert with 5 episodes.
Buffer size: 1k, batch size: 32, max timesteps: 1k, num network updates: 4, replay period: 100, intrinsic method: empowerment.
(1.0/5)  Episode Score: 375,  Train Loss: 3.87390,  Source Loss: 0.00107,  Forward Loss: 1.24322,  Time taken: 15 secs.
(2.0/5)  Episode Score: 300,  Train Loss: 3.72653,  Source Loss: 0.00022,  Forward Loss: 1.02064,  Time taken: 20 secs.
(3.0/5)  Episode Score: 225,  Train Loss: 3.63405,  Source Loss: 0.00018,  Forward Loss: 0.97620,  Time taken: 6 secs.
(4.0/5)  Episode Score: 350,  Train Loss: 3.29648,  Source Loss: 0.00010,  Forward Loss: 0.65110,  Time taken: 11 secs.
(5.0/5)  Episode Score: 225,  Train Loss: 2.53961,  Source Loss: 0.00009,  Forward Loss: 0.69833,  Time taken: 11 secs.
Training complete. Access metrics from 'logger' attribute. 

In [17]:
# Inspect the logger attached to each Rainbow agent
tuple(agent.logger for agent in (rainbow, rainbow2, rainbow3))

(Available attributes: '['avg_returns', 'actions', 'train_losses', 'ep_scores', 'intrinsic_losses']',
 Available attributes: '['avg_returns', 'actions', 'train_losses', 'ep_scores', 'intrinsic_losses']',
 Available attributes: '['avg_returns', 'actions', 'train_losses', 'ep_scores', 'intrinsic_losses']')

In [18]:
# Compare the action counts logged for each of the three Rainbow agents
tuple(agent.logger.actions for agent in (rainbow, rainbow2, rainbow3))

([Counter({4: 4273, 0: 227, 2: 917, 5: 2543, 1: 1171, 3: 117})],
 [Counter({4: 3040, 0: 193, 2: 2344, 3: 2213, 5: 1428, 1: 62})],
 [Counter({5: 4140, 0: 3503, 3: 3750, 4: 1381, 1: 2058, 2: 976})])

### 2c. Proximal Policy Optimization (PPO)

In [19]:
# Create three PPO agents on env3: curiosity-driven, no intrinsic method,
# and empowerment-driven
ppo = create_model('ppo', env=env3, device=device, im_type='curiosity')
ppo2 = create_model('ppo', env=env3, device=device)
ppo3 = create_model('ppo', env=env3, device=device, im_type='empowerment')

In [20]:
# Show the environment details held by each of the three PPO agents
tuple(agent.env_details for agent in (ppo, ppo2, ppo3))

({'gym_name': 'ALE/MontezumaRevenge-v5', 'name': 'MontezumaRevenge', 'obs_space': Box(0, 255, (4, 84, 84), uint8), 'action_space': Discrete(18), 'input_shape': (4, 84, 84), 'n_actions': 18, 'img_size': 84, 'stack_size': 4, 'capture_video': False, 'record_every': 10000},
 {'gym_name': 'ALE/MontezumaRevenge-v5', 'name': 'MontezumaRevenge', 'obs_space': Box(0, 255, (4, 84, 84), uint8), 'action_space': Discrete(18), 'input_shape': (4, 84, 84), 'n_actions': 18, 'img_size': 84, 'stack_size': 4, 'capture_video': False, 'record_every': 10000},
 {'gym_name': 'ALE/MontezumaRevenge-v5', 'name': 'MontezumaRevenge', 'obs_space': Box(0, 255, (4, 84, 84), uint8), 'action_space': Discrete(18), 'input_shape': (4, 84, 84), 'n_actions': 18, 'img_size': 84, 'stack_size': 4, 'capture_video': False, 'record_every': 10000})

In [21]:
# Train the curiosity-driven PPO agent for 5 episodes, logging every episode (print_every=1).
# NOTE(review): the recorded banner reports "200 episodes" although num_episodes=5 is
# passed and progress counts to 5 — looks like a logging mismatch in the PPO trainer; verify.
ppo.train(num_episodes=5, print_every=1, save_count=10)

Training agent on MontezumaRevenge with 200 episodes.
Surrogate clipping size: 0.1, rollout size: 10, num environments: 4, num network updates: 4, batch size: 40, training iterations: 5, intrinsic method: curiosity.
(1.0/5) Episode Score: 0.00,  Episodic Return: 0.05,  Approx KL: 0.006,  Total Loss: 7875.180,  Policy Loss: -0.001,  Value Loss: 5268.324,  Entropy Loss: 2.889,  Curiosity Loss: 5211.31006,  Time taken: 2 secs.
(2.0/5) Episode Score: 0.00,  Episodic Return: 0.31,  Approx KL: 0.020,  Total Loss: 7913.859,  Policy Loss: -0.117,  Value Loss: 5305.976,  Entropy Loss: 2.851,  Curiosity Loss: 5285.29150,  Time taken: 2 secs.
(3.0/5) Episode Score: 0.00,  Episodic Return: 2.32,  Approx KL: 2.133,  Total Loss: 7864.080,  Policy Loss: -0.245,  Value Loss: 5244.941,  Entropy Loss: 1.231,  Curiosity Loss: 5241.83643,  Time taken: 2 secs.
(4.0/5) Episode Score: 0.00,  Episodic Return: 17.46,  Approx KL: -0.000,  Total Loss: 7672.958,  Policy Loss: 0.000,  Value Loss: 4956.402,  Entrop

In [22]:
# Train the PPO agent that was created without an intrinsic method for 5 episodes,
# logging every episode (print_every=1).
# NOTE(review): the recorded banner reports "200 episodes" despite num_episodes=5; verify in the trainer.
ppo2.train(num_episodes=5, print_every=1, save_count=10)

Training agent on MontezumaRevenge with 200 episodes.
Surrogate clipping size: 0.1, rollout size: 10, num environments: 4, num network updates: 4, batch size: 40, training iterations: 5, intrinsic method: None.
(1.0/5) Episode Score: 0.00,  Episodic Return: -0.05,  Approx KL: 0.001,  Total Loss: -0.040,  Policy Loss: -0.011,  Value Loss: 0.000,  Entropy Loss: 2.889,  Time taken: 0 secs.
(2.0/5) Episode Score: 0.00,  Episodic Return: -0.05,  Approx KL: 0.011,  Total Loss: -0.046,  Policy Loss: -0.017,  Value Loss: 0.000,  Entropy Loss: 2.886,  Time taken: 0 secs.
(3.0/5) Episode Score: 0.00,  Episodic Return: -0.03,  Approx KL: 0.209,  Total Loss: -0.172,  Policy Loss: -0.145,  Value Loss: 0.000,  Entropy Loss: 2.691,  Time taken: 0 secs.
(4.0/5) Episode Score: 0.00,  Episodic Return: -0.03,  Approx KL: 2.401,  Total Loss: -0.218,  Policy Loss: -0.211,  Value Loss: 0.002,  Entropy Loss: 0.733,  Time taken: 0 secs.
(5.0/5) Episode Score: 0.00,  Episodic Return: -0.13,  Approx KL: -0.000,

In [23]:
# Train the empowerment-driven PPO agent for 5 episodes, logging every episode (print_every=1).
# NOTE(review): the recorded banner reports "200 episodes" despite num_episodes=5; verify in the trainer.
ppo3.train(num_episodes=5, print_every=1, save_count=10)

Training agent on MontezumaRevenge with 200 episodes.
Surrogate clipping size: 0.1, rollout size: 10, num environments: 4, num network updates: 4, batch size: 40, training iterations: 5, intrinsic method: empowerment.
(1.0/5) Episode Score: 0.00,  Episodic Return: 0.02,  Approx KL: 0.001,  Total Loss: -0.035,  Policy Loss: -0.006,  Value Loss: 0.000,  Entropy Loss: 2.882,  Source Loss: 0.19936,  Forward Loss: 1.31049,  Time taken: 0 secs.
(2.0/5) Episode Score: 0.00,  Episodic Return: 0.03,  Approx KL: -0.002,  Total Loss: -0.035,  Policy Loss: -0.006,  Value Loss: 0.000,  Entropy Loss: 2.881,  Source Loss: 0.17717,  Forward Loss: 1.16026,  Time taken: 1 secs.
(3.0/5) Episode Score: 0.00,  Episodic Return: 0.03,  Approx KL: -0.004,  Total Loss: -0.034,  Policy Loss: -0.006,  Value Loss: 0.000,  Entropy Loss: 2.879,  Source Loss: 0.11061,  Forward Loss: 1.06365,  Time taken: 0 secs.
(4.0/5) Episode Score: 0.00,  Episodic Return: 0.03,  Approx KL: -0.001,  Total Loss: -0.034,  Policy Los

In [24]:
# Inspect the logger attached to each PPO agent
tuple(agent.logger for agent in (ppo, ppo2, ppo3))

(Available attributes: '['actions', 'avg_returns', 'avg_rewards', 'policy_losses', 'value_losses', 'entropy_losses', 'total_losses', 'approx_kl', 'intrinsic_losses']',
 Available attributes: '['actions', 'avg_returns', 'avg_rewards', 'policy_losses', 'value_losses', 'entropy_losses', 'total_losses', 'approx_kl', 'intrinsic_losses']',
 Available attributes: '['actions', 'avg_returns', 'avg_rewards', 'policy_losses', 'value_losses', 'entropy_losses', 'total_losses', 'approx_kl', 'intrinsic_losses']')

In [25]:
# Compare the action counts logged for each of the three PPO agents
tuple(agent.logger.actions for agent in (ppo, ppo2, ppo3))

([Counter({4: 44,
           5: 24,
           9: 44,
           3: 32,
           12: 24,
           8: 16,
           17: 40,
           14: 20,
           16: 28,
           0: 16,
           7: 20,
           10: 36,
           13: 20,
           11: 16,
           2: 20,
           1: 28,
           15: 8,
           6: 364})],
 [Counter({16: 12,
           10: 32,
           4: 64,
           9: 28,
           14: 44,
           1: 28,
           11: 24,
           0: 36,
           12: 28,
           3: 36,
           13: 24,
           8: 28,
           5: 24,
           7: 44,
           15: 32,
           17: 24,
           6: 64,
           2: 228})],
 [Counter({11: 60,
           9: 52,
           6: 32,
           15: 36,
           1: 64,
           17: 56,
           7: 60,
           14: 28,
           4: 76,
           10: 60,
           13: 48,
           3: 60,
           2: 24,
           5: 28,
           8: 40,
           12: 40,
           16: 20,
           0: 1

## 3. Load Model

In [26]:
from utils.render import video_render
from utils.model_utils import load_model

In [27]:
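# Example checkpoint loads, left disabled: uncomment the line matching the agent to inspect.
# NOTE(review): these examples hard-code 'cuda:0', but this session reported CUDA as
# unavailable — presumably the `device` from set_device() should be passed instead; confirm
# against load_model's signature.
# model = load_model('rainbowcuriosity_batch32_buffer1k_Qbert_ep5', 'cuda:0', 'rainbow')
# model = load_model('dqncuriosity_batch32_SpaInv_ep5', 'cuda:0', 'dqn')
# model = load_model('ppocuriosity_rollout100_agents8_MonRev_ep5', 'cuda:0', 'ppo')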

In [28]:
# Disabled: needs `model` to be defined first by enabling one of the load_model(...) calls.
# video_render(model, 1)