# Reinforcement Learning with Atari Games

## 1. Initial Setup

In [1]:
from core.create import create_model, get_utility_params
from utils.helper import set_device
from utils.model_utils import load_model
from utils.render import video_render

In [2]:
# Load the shared utility settings (per the original note, these come from a yaml file)
util_params = get_utility_params()

# Promote the two run-length settings to notebook-level constants and echo them
NUM_EPISODES, SAVE_EVERY = util_params['num_episodes'], util_params['save_every']
print(f'NUM_EPISODES={NUM_EPISODES}, SAVE_EVERY={SAVE_EVERY}')

NUM_EPISODES=100000, SAVE_EVERY=10000


In [3]:
# Pick the compute device through the project's set_device() helper.
# In the recorded run it reported CUDA as unavailable and fell back to 'cpu'.
device = set_device()

CUDA unavailable. Device set to CPU -> 'cpu'.


In [4]:
# Read the gym ids of the second and third environments from the utility settings
env2, env3 = (util_params[key] for key in ('env_2', 'env_3'))
print(f'env2={env2}, env3={env3}')

env2=ALE/Qbert-v5, env3=ALE/MontezumaRevenge-v5


## 2. Model Creation and Training

### 2a. Deep Q-Network (DQN)

In [15]:
# # Create DQN instance
# dqn = create_model('dqn', device=device, im_type='curiosity')
# dqn2 = create_model('dqn', device=device)
# dqn3 = create_model('dqn', device=device, im_type='empowerment')

In [16]:
# dqn.env_details, dqn2.env_details, dqn3.env_details

({'gym_name': 'ALE/SpaceInvaders-v5', 'name': 'SpaceInvaders', 'obs_space': Box(0, 255, (4, 84, 84), uint8), 'action_space': Discrete(6), 'input_shape': (4, 84, 84), 'n_actions': 6, 'img_size': 84, 'stack_size': 4, 'capture_video': False, 'record_every': 10000},
 {'gym_name': 'ALE/SpaceInvaders-v5', 'name': 'SpaceInvaders', 'obs_space': Box(0, 255, (4, 84, 84), uint8), 'action_space': Discrete(6), 'input_shape': (4, 84, 84), 'n_actions': 6, 'img_size': 84, 'stack_size': 4, 'capture_video': False, 'record_every': 10000},
 {'gym_name': 'ALE/SpaceInvaders-v5', 'name': 'SpaceInvaders', 'obs_space': Box(0, 255, (4, 84, 84), uint8), 'action_space': Discrete(6), 'input_shape': (4, 84, 84), 'n_actions': 6, 'img_size': 84, 'stack_size': 4, 'capture_video': False, 'record_every': 10000})

In [17]:
# # Train model
# dqn.train(num_episodes=5, print_every=1, save_count=5)

Training agent on SpaceInvaders with 5 episodes.
Buffer size: 1k, batch size: 32, max timesteps: 1k, num network updates: 4, intrinsic method: curiosity
(1.0/5) Episode Score: 120,  Train Loss: 28244897020206741716992.00000,  Curiosity Loss: 1844.45471,  Time taken: 19 secs.
(2.0/5) Episode Score: 65,  Train Loss: 297685826675757859676880896.00000,  Curiosity Loss: 1882.88391,  Time taken: 18 secs.
(3.0/5) Episode Score: 80,  Train Loss: 604445200933003826183536640.00000,  Curiosity Loss: 1815.59290,  Time taken: 17 secs.
(4.0/5) Episode Score: 155,  Train Loss: 1042372528556347067987918848.00000,  Curiosity Loss: 1849.64197,  Time taken: 26 secs.
(5.0/5) Episode Score: 155,  Train Loss: 1080812517939929685189197824.00000,  Curiosity Loss: 1780.05713,  Time taken: 20 secs.
Saved model at episode 5 as: 'dqncuriosity_batch32_SpaInv_ep5.pt'.
Saved logger data to 'saved_models/dqncuriosity_SpaInv_logger_data.tar.gz'. Total size: 852 bytes
Training complete. Access metrics from 'logger' att

In [18]:
# dqn2.train(num_episodes=5, print_every=1, save_count=5)

Training agent on SpaceInvaders with 5 episodes.
Buffer size: 1k, batch size: 32, max timesteps: 1k, num network updates: 4, intrinsic method: None
(1.0/5) Episode Score: 215,  Train Loss: 8929011000602317155205120.00000,  Time taken: 10 secs.
(2.0/5) Episode Score: 290,  Train Loss: 668769292665934212021354496.00000,  Time taken: 13 secs.
(3.0/5) Episode Score: 80,  Train Loss: 967477935960347045206687744.00000,  Time taken: 4 secs.
(4.0/5) Episode Score: 105,  Train Loss: 1458471454982388323174055936.00000,  Time taken: 7 secs.
(5.0/5) Episode Score: 260,  Train Loss: 1700385746113830124976603136.00000,  Time taken: 11 secs.
Saved model at episode 5 as: 'dqn_batch32_SpaInv_ep5.pt'.
Saved logger data to 'saved_models/dqn_SpaInv_logger_data.tar.gz'. Total size: 808 bytes
Training complete. Access metrics from 'logger' attribute. 

In [19]:
# dqn3.train(num_episodes=5, print_every=1, save_count=5)

Training agent on SpaceInvaders with 5 episodes.
Buffer size: 1k, batch size: 32, max timesteps: 1k, num network updates: 4, intrinsic method: empowerment
(1.0/5) Episode Score: 15,  Train Loss: 0.00935,  Source Loss: 0.03580,  Forward Loss: 1.16270,  Time taken: 5 secs.
(2.0/5) Episode Score: 110,  Train Loss: 7.36104,  Source Loss: 0.03171,  Forward Loss: 1.25821,  Time taken: 6 secs.
(3.0/5) Episode Score: 240,  Train Loss: 2514.66577,  Source Loss: 0.03427,  Forward Loss: 0.53753,  Time taken: 10 secs.
(4.0/5) Episode Score: 135,  Train Loss: 30789.06836,  Source Loss: 0.03172,  Forward Loss: 0.19171,  Time taken: 6 secs.
(5.0/5) Episode Score: 135,  Train Loss: 412146.56250,  Source Loss: 0.03403,  Forward Loss: 0.09606,  Time taken: 8 secs.
Saved model at episode 5 as: 'dqnempowerment_batch32_SpaInv_ep5.pt'.
Saved logger data to 'saved_models/dqnempowerment_SpaInv_logger_data.tar.gz'. Total size: 905 bytes
Training complete. Access metrics from 'logger' attribute. 

In [20]:
# dqn.logger, dqn2.logger, dqn3.logger

(Available attributes: '['actions', 'train_losses', 'ep_scores', 'intrinsic_losses']',
 Available attributes: '['actions', 'train_losses', 'ep_scores', 'intrinsic_losses']',
 Available attributes: '['actions', 'train_losses', 'ep_scores', 'intrinsic_losses']')

In [21]:
# dqn.logger.actions, dqn2.logger.actions, dqn3.logger.actions

([Counter({4: 389, 1: 399, 0: 397, 2: 373, 5: 379, 3: 430})],
 [Counter({0: 536, 2: 459, 3: 540, 1: 479, 5: 493, 4: 505})],
 [Counter({2: 406, 3: 428, 0: 413, 5: 426, 1: 456, 4: 423})])

### 2b. Rainbow Deep Q-Network (RDQN)

In [22]:
# One Rainbow DQN agent per intrinsic-method setting, all sharing env2 and device:
# curiosity, none (im_type omitted), and empowerment — built in that order
rainbow, rainbow2, rainbow3 = (
    create_model('rainbow', env=env2, device=device, **im_kwargs)
    for im_kwargs in ({'im_type': 'curiosity'}, {}, {'im_type': 'empowerment'})
)

In [23]:
# Display the environment description attached to each Rainbow agent
tuple(agent.env_details for agent in (rainbow, rainbow2, rainbow3))

({'gym_name': 'ALE/Qbert-v5', 'name': 'Qbert', 'obs_space': Box(0, 255, (4, 84, 84), uint8), 'action_space': Discrete(6), 'input_shape': (4, 84, 84), 'n_actions': 6, 'img_size': 84, 'stack_size': 4, 'capture_video': False, 'record_every': 10000},
 {'gym_name': 'ALE/Qbert-v5', 'name': 'Qbert', 'obs_space': Box(0, 255, (4, 84, 84), uint8), 'action_space': Discrete(6), 'input_shape': (4, 84, 84), 'n_actions': 6, 'img_size': 84, 'stack_size': 4, 'capture_video': False, 'record_every': 10000},
 {'gym_name': 'ALE/Qbert-v5', 'name': 'Qbert', 'obs_space': Box(0, 255, (4, 84, 84), uint8), 'action_space': Discrete(6), 'input_shape': (4, 84, 84), 'n_actions': 6, 'img_size': 84, 'stack_size': 4, 'capture_video': False, 'record_every': 10000})

In [24]:
# Short demo run of the curiosity-based Rainbow agent: 5 episodes.
# Judging by the recorded output, print_every=1 logs after every episode and
# save_count=5 yields a checkpoint at episode 5 — exact semantics live in
# the model's train(); TODO confirm.
rainbow.train(num_episodes=5, print_every=1, save_count=5)

Training agent on Qbert with 5 episodes.
Buffer size: 1k, batch size: 32, max timesteps: 1k, num network updates: 4, replay period: 100, intrinsic method: curiosity.
(1.0/5)  Episode Score: 0,  Train Loss: 7.34908,  Curiosity Loss: 3.54450,  Time taken: 13 secs.
(2.0/5)  Episode Score: 200,  Train Loss: 7.33582,  Curiosity Loss: 3.54445,  Time taken: 13 secs.
(3.0/5)  Episode Score: 0,  Train Loss: 7.36223,  Curiosity Loss: 3.54735,  Time taken: 12 secs.
(4.0/5)  Episode Score: 25,  Train Loss: 7.10950,  Curiosity Loss: 3.54097,  Time taken: 15 secs.
(5.0/5)  Episode Score: 50,  Train Loss: 6.64158,  Curiosity Loss: 3.53350,  Time taken: 16 secs.
Saved model at episode 5 as: 'rainbowcuriosity_batch32_buffer1k_Qbert_ep5.pt'.
Saved logger data to 'saved_models/rainbowcuriosity_Qbert_logger_data.tar.gz'. Total size: 714 bytes
Training complete. Access metrics from 'logger' attribute. 

In [25]:
# Same 5-episode demo run for the baseline Rainbow agent (created without im_type;
# the recorded output reports "intrinsic method: None").
rainbow2.train(num_episodes=5, print_every=1, save_count=5)

Training agent on Qbert with 5 episodes.
Buffer size: 1k, batch size: 32, max timesteps: 1k, num network updates: 4, replay period: 100, intrinsic method: None.
(1.0/5)  Episode Score: 125,  Train Loss: 3.73911,  Time taken: 8 secs.
(2.0/5)  Episode Score: 25,  Train Loss: 3.77298,  Time taken: 5 secs.
(3.0/5)  Episode Score: 150,  Train Loss: 3.16742,  Time taken: 6 secs.
(4.0/5)  Episode Score: 200,  Train Loss: 2.54698,  Time taken: 7 secs.
(5.0/5)  Episode Score: 125,  Train Loss: 2.80329,  Time taken: 6 secs.
Saved model at episode 5 as: 'rainbow_batch32_buffer1k_Qbert_ep5.pt'.
Saved logger data to 'saved_models/rainbow_Qbert_logger_data.tar.gz'. Total size: 667 bytes
Training complete. Access metrics from 'logger' attribute. 

In [26]:
# Longer demo run for the empowerment-based Rainbow agent: 50 episodes.
# With print_every=5 the recorded output shows a progress line for episode 1 and
# then every 5th episode; save_count=50 presumably mirrors the 5-episode runs
# (checkpoint at the final episode) — TODO confirm against train().
rainbow3.train(num_episodes=50, print_every=5, save_count=50)

Training agent on Qbert with 50 episodes.
Buffer size: 1k, batch size: 32, max timesteps: 1k, num network updates: 4, replay period: 100, intrinsic method: empowerment.
(1.0/50)  Episode Score: 375,  Train Loss: 3.87390,  Source Loss: 0.00107,  Forward Loss: 1.24322,  Time taken: 7 secs.
(5.0/50)  Episode Score: 225,  Train Loss: 2.53961,  Source Loss: 0.00009,  Forward Loss: 0.69833,  Time taken: 42 secs.
(10.0/50)  Episode Score: 100,  Train Loss: 1.38658,  Source Loss: 0.00005,  Forward Loss: 2.22284,  Time taken: 31 secs.
(15.0/50)  Episode Score: 175,  Train Loss: 1.82600,  Source Loss: 0.00002,  Forward Loss: 0.71608,  Time taken: 35 secs.
(20.0/50)  Episode Score: 750,  Train Loss: 0.54478,  Source Loss: 0.00001,  Forward Loss: 1.54741,  Time taken: 37 secs.
(25.0/50)  Episode Score: 125,  Train Loss: 1.81434,  Source Loss: 0.00001,  Forward Loss: 2.01735,  Time taken: 27 secs.
(30.0/50)  Episode Score: 175,  Train Loss: 1.86770,  Source Loss: 0.00001,  Forward Loss: 1.07069,  T

In [27]:
# Display each Rainbow agent's logger; its repr lists the metric attributes it exposes
tuple(agent.logger for agent in (rainbow, rainbow2, rainbow3))

(Available attributes: '['avg_returns', 'actions', 'train_losses', 'ep_scores', 'intrinsic_losses']',
 Available attributes: '['avg_returns', 'actions', 'train_losses', 'ep_scores', 'intrinsic_losses']',
 Available attributes: '['avg_returns', 'actions', 'train_losses', 'ep_scores', 'intrinsic_losses']')

In [28]:
# Display the action counters recorded by each Rainbow agent's logger
# (presumably action index -> number of times it was selected; verify in the logger)
tuple(agent.logger.actions for agent in (rainbow, rainbow2, rainbow3))

([Counter({4: 4273, 0: 227, 2: 917, 5: 2543, 1: 1171, 3: 117})],
 [Counter({4: 3040, 0: 193, 2: 2344, 3: 2213, 5: 1428, 1: 62})],
 [Counter({5: 28685, 0: 18801, 3: 21871, 4: 12482, 1: 16860, 2: 15765})])

### 2c. Proximal Policy Optimization (PPO)

In [29]:
# One PPO agent per intrinsic-method setting, all sharing env3 and device:
# curiosity, none (im_type omitted), and empowerment — built in that order
ppo, ppo2, ppo3 = (
    create_model('ppo', env=env3, device=device, **im_kwargs)
    for im_kwargs in ({'im_type': 'curiosity'}, {}, {'im_type': 'empowerment'})
)

In [30]:
# Display the environment description attached to each PPO agent
tuple(agent.env_details for agent in (ppo, ppo2, ppo3))

({'gym_name': 'ALE/MontezumaRevenge-v5', 'name': 'MontezumaRevenge', 'obs_space': Box(0, 255, (4, 84, 84), uint8), 'action_space': Discrete(18), 'input_shape': (4, 84, 84), 'n_actions': 18, 'img_size': 84, 'stack_size': 4, 'capture_video': False, 'record_every': 10000},
 {'gym_name': 'ALE/MontezumaRevenge-v5', 'name': 'MontezumaRevenge', 'obs_space': Box(0, 255, (4, 84, 84), uint8), 'action_space': Discrete(18), 'input_shape': (4, 84, 84), 'n_actions': 18, 'img_size': 84, 'stack_size': 4, 'capture_video': False, 'record_every': 10000},
 {'gym_name': 'ALE/MontezumaRevenge-v5', 'name': 'MontezumaRevenge', 'obs_space': Box(0, 255, (4, 84, 84), uint8), 'action_space': Discrete(18), 'input_shape': (4, 84, 84), 'n_actions': 18, 'img_size': 84, 'stack_size': 4, 'capture_video': False, 'record_every': 10000})

In [31]:
# PPO's `num_episodes` budget is expressed in environment steps: per the recorded
# training output, one training iteration corresponds to rollout_size * num_envs
# of them (e.g. 10 * 4 = 40, so 200 "episodes" -> 5 training iterations).
steps_per_iteration = ppo.params.rollout_size * ppo.params.num_envs

# Full-length budget: NUM_EPISODES training iterations.
PPO_NUM_EPISODES = steps_per_iteration * NUM_EPISODES

# Short demo budgets. These are derived directly from steps_per_iteration instead
# of PPO_NUM_EPISODES / NUM_EPISODES, which needlessly round-trips through float
# and raises ZeroDivisionError when NUM_EPISODES is 0.
demo_episodes = int(steps_per_iteration * 5)  # 5 training iterations
demo_episodes2 = int(steps_per_iteration * 50)  # 50 training iterations

In [32]:
# Short demo run of the curiosity-based PPO agent. demo_episodes is the step budget
# for 5 training iterations (the recorded output reports "200 episodes" and
# "training iterations: 5"); progress is printed every iteration.
ppo.train(num_episodes=demo_episodes, print_every=1, save_count=5)

Training agent on MontezumaRevenge with 200 episodes.
Surrogate clipping size: 0.1, rollout size: 10, num environments: 4, num network updates: 4, batch size: 40, training iterations: 5, intrinsic method: curiosity.
(1.0/5) Episode Score: 0.00,  Episodic Return: 0.05,  Approx KL: 0.006,  Total Loss: 7862.735,  Policy Loss: -0.001,  Value Loss: 5255.996,  Entropy Loss: 2.889,  Curiosity Loss: 5197.06201,  Time taken: 1 secs.
(2.0/5) Episode Score: 0.00,  Episodic Return: 0.31,  Approx KL: 0.020,  Total Loss: 7912.451,  Policy Loss: -0.117,  Value Loss: 5304.566,  Entropy Loss: 2.851,  Curiosity Loss: 5283.66455,  Time taken: 1 secs.
(3.0/5) Episode Score: 0.00,  Episodic Return: 2.32,  Approx KL: 2.133,  Total Loss: 7863.190,  Policy Loss: -0.245,  Value Loss: 5244.028,  Entropy Loss: 1.230,  Curiosity Loss: 5238.63428,  Time taken: 1 secs.
(4.0/5) Episode Score: 0.00,  Episodic Return: 17.46,  Approx KL: -0.000,  Total Loss: 7663.471,  Policy Loss: 0.000,  Value Loss: 4947.016,  Entrop

In [33]:
# Same 5-iteration demo run for the baseline PPO agent (created without im_type;
# the recorded output reports "intrinsic method: None").
ppo2.train(num_episodes=demo_episodes, print_every=1, save_count=5)

Training agent on MontezumaRevenge with 200 episodes.
Surrogate clipping size: 0.1, rollout size: 10, num environments: 4, num network updates: 4, batch size: 40, training iterations: 5, intrinsic method: None.
(1.0/5) Episode Score: 0.00,  Episodic Return: -0.05,  Approx KL: 0.001,  Total Loss: -0.040,  Policy Loss: -0.011,  Value Loss: 0.000,  Entropy Loss: 2.889,  Time taken: 0 secs.
(2.0/5) Episode Score: 0.00,  Episodic Return: -0.05,  Approx KL: 0.011,  Total Loss: -0.046,  Policy Loss: -0.017,  Value Loss: 0.000,  Entropy Loss: 2.886,  Time taken: 0 secs.
(3.0/5) Episode Score: 0.00,  Episodic Return: -0.03,  Approx KL: 0.209,  Total Loss: -0.172,  Policy Loss: -0.145,  Value Loss: 0.000,  Entropy Loss: 2.691,  Time taken: 0 secs.
(4.0/5) Episode Score: 0.00,  Episodic Return: -0.03,  Approx KL: 2.401,  Total Loss: -0.218,  Policy Loss: -0.211,  Value Loss: 0.002,  Entropy Loss: 0.733,  Time taken: 0 secs.
(5.0/5) Episode Score: 0.00,  Episodic Return: -0.13,  Approx KL: -0.000,

In [34]:
# Longer demo run for the empowerment-based PPO agent: demo_episodes2 is the step
# budget for 50 training iterations (the recorded output reports "2K episodes" and
# "training iterations: 50"), with a progress line for iteration 1 and every 5th after.
ppo3.train(num_episodes=demo_episodes2, print_every=5, save_count=50)

Training agent on MontezumaRevenge with 2K episodes.
Surrogate clipping size: 0.1, rollout size: 10, num environments: 4, num network updates: 4, batch size: 40, training iterations: 50, intrinsic method: empowerment.
(1.0/50) Episode Score: 0.00,  Episodic Return: 0.02,  Approx KL: 0.001,  Total Loss: -0.035,  Policy Loss: -0.006,  Value Loss: 0.000,  Entropy Loss: 2.882,  Source Loss: 0.20103,  Forward Loss: 1.30656,  Time taken: 0 secs.
(5.0/50) Episode Score: 0.00,  Episodic Return: 0.03,  Approx KL: -0.001,  Total Loss: -0.033,  Policy Loss: -0.005,  Value Loss: 0.000,  Entropy Loss: 2.879,  Source Loss: 0.02357,  Forward Loss: 1.22585,  Time taken: 2 secs.
(10.0/50) Episode Score: 0.00,  Episodic Return: 0.02,  Approx KL: -0.000,  Total Loss: -0.031,  Policy Loss: -0.002,  Value Loss: 0.000,  Entropy Loss: 2.875,  Source Loss: 0.01383,  Forward Loss: 1.38899,  Time taken: 2 secs.
(15.0/50) Episode Score: 0.00,  Episodic Return: 0.03,  Approx KL: -0.004,  Total Loss: -0.033,  Poli

In [35]:
# Display each PPO agent's logger; its repr lists the metric attributes it exposes
tuple(agent.logger for agent in (ppo, ppo2, ppo3))

(Available attributes: '['actions', 'avg_returns', 'avg_rewards', 'policy_losses', 'value_losses', 'entropy_losses', 'total_losses', 'approx_kl', 'intrinsic_losses']',
 Available attributes: '['actions', 'avg_returns', 'avg_rewards', 'policy_losses', 'value_losses', 'entropy_losses', 'total_losses', 'approx_kl', 'intrinsic_losses']',
 Available attributes: '['actions', 'avg_returns', 'avg_rewards', 'policy_losses', 'value_losses', 'entropy_losses', 'total_losses', 'approx_kl', 'intrinsic_losses']')

In [36]:
# Display the action counters recorded by each PPO agent's logger
# (presumably action index -> number of times it was selected; verify in the logger)
tuple(agent.logger.actions for agent in (ppo, ppo2, ppo3))

([Counter({4: 44,
           5: 24,
           9: 44,
           3: 32,
           12: 24,
           8: 16,
           17: 40,
           14: 20,
           16: 28,
           0: 16,
           7: 20,
           10: 36,
           13: 20,
           11: 16,
           2: 20,
           1: 28,
           15: 8,
           6: 364})],
 [Counter({16: 12,
           10: 32,
           4: 64,
           9: 28,
           14: 44,
           1: 28,
           11: 24,
           0: 36,
           12: 28,
           3: 36,
           13: 24,
           8: 28,
           5: 24,
           7: 44,
           15: 32,
           17: 24,
           6: 64,
           2: 228})],
 [Counter({11: 308,
           9: 368,
           6: 188,
           15: 168,
           1: 332,
           17: 316,
           7: 1564,
           14: 236,
           4: 492,
           10: 1484,
           13: 360,
           3: 280,
           2: 404,
           5: 232,
           8: 404,
           12: 324,
           16: 2

## 3. Load Model

In [None]:
from utils.render import video_render
from utils.model_utils import load_model

In [None]:
# model = load_model('rainbowcuriosity_batch32_buffer1k_Qbert_ep5', 'cuda:0', 'rainbow')
# model = load_model('dqncuriosity_batch32_SpaInv_ep5', 'cuda:0', 'dqn')
# model = load_model('ppocuriosity_rollout100_agents8_MonRev_ep5', 'cuda:0', 'ppo')

In [None]:
# video_render(model, 1)