# Reinforcement Learning with Atari Games

## 1. Initial Setup

In [1]:
from core.create import create_model, get_utility_params
from utils.helper import set_device
from utils.model_utils import load_model
from utils.render import video_render

In [2]:
# Load the shared utility settings (backed by the project's yaml file)
util_params = get_utility_params()

# Expose the run-length settings as notebook-level constants.
# NOTE(review): the training cells further down pass literal values
# (num_episodes=5, save_count=10) rather than these constants.
NUM_EPISODES, SAVE_EVERY = util_params['num_episodes'], util_params['save_every']
print(f'NUM_EPISODES={NUM_EPISODES}, SAVE_EVERY={SAVE_EVERY}')

NUM_EPISODES=500000, SAVE_EVERY=5000


In [3]:
# Select the compute device through the project helper.
# The helper prints which device it picked; in the recorded run CUDA was
# unavailable and it fell back to 'cpu'.
device = set_device()

CUDA unavailable. Device set to CPU -> 'cpu'.


In [4]:
# Look up the identifiers of the second and third environments from the
# utility settings (recorded run: ALE/Qbert-v5 and ALE/MontezumaRevenge-v5).
env2, env3 = util_params['env_2'], util_params['env_3']
print(f'env2={env2}, env3={env3}')

env2=ALE/Qbert-v5, env3=ALE/MontezumaRevenge-v5


## 2. Model Creation and Training

### 2a. Deep Q-Network (DQN)

In [5]:
# Create three DQN agents that differ only in their intrinsic-motivation setting.
# No `env` argument is passed; the recorded env_details show all three on
# ALE/SpaceInvaders-v5.
dqn = create_model('dqn', device=device, im_type='curiosity')  # curiosity variant
dqn2 = create_model('dqn', device=device)  # no im_type -> reported as "intrinsic method: None"
dqn3 = create_model('dqn', device=device, im_type='empowerment')  # empowerment variant

  logger.warn(


In [6]:
# Show the environment settings each DQN agent was built with
tuple(agent.env_details for agent in (dqn, dqn2, dqn3))

({'gym_name': 'ALE/SpaceInvaders-v5', 'name': 'SpaceInvaders', 'obs_space': Box(0, 255, (4, 84, 84), uint8), 'action_space': Discrete(6), 'input_shape': (4, 84, 84), 'n_actions': 6, 'img_size': 84, 'stack_size': 4, 'capture_video': False, 'record_every': 10000},
 {'gym_name': 'ALE/SpaceInvaders-v5', 'name': 'SpaceInvaders', 'obs_space': Box(0, 255, (4, 84, 84), uint8), 'action_space': Discrete(6), 'input_shape': (4, 84, 84), 'n_actions': 6, 'img_size': 84, 'stack_size': 4, 'capture_video': False, 'record_every': 10000},
 {'gym_name': 'ALE/SpaceInvaders-v5', 'name': 'SpaceInvaders', 'obs_space': Box(0, 255, (4, 84, 84), uint8), 'action_space': Discrete(6), 'input_shape': (4, 84, 84), 'n_actions': 6, 'img_size': 84, 'stack_size': 4, 'capture_video': False, 'record_every': 10000})

In [7]:
# Train the curiosity DQN agent for a short 5-episode run, printing stats
# after every episode (print_every=1).
# save_count presumably controls how many checkpoints are written — TODO confirm.
dqn.train(num_episodes=5, print_every=1, save_count=10)

Training agent on SpaceInvaders with 5 episodes.
Buffer size: 100k, batch size: 32, max timesteps: 1k, num network updates: 4, intrinsic method: curiosity
(1.0/5) Episode Score: 120,  Train Loss: 33715.38281,  Curiosity Loss: 3.38288,  Time taken: 20 secs.
(2.0/5) Episode Score: 65,  Train Loss: 2834.34448,  Curiosity Loss: 3.38263,  Time taken: 24 secs.
(3.0/5) Episode Score: 60,  Train Loss: 6212.41504,  Curiosity Loss: 3.37696,  Time taken: 19 secs.
(4.0/5) Episode Score: 210,  Train Loss: 11405.25098,  Curiosity Loss: 3.37880,  Time taken: 26 secs.
(5.0/5) Episode Score: 115,  Train Loss: 34.63552,  Curiosity Loss: 3.37863,  Time taken: 19 secs.
Training complete. Access metrics from 'logger' attribute. 

In [8]:
# Same short 5-episode run for the DQN agent created without an im_type
# (its banner reports "intrinsic method: None").
dqn2.train(num_episodes=5, print_every=1, save_count=10)

Training agent on SpaceInvaders with 5 episodes.
Buffer size: 100k, batch size: 32, max timesteps: 1k, num network updates: 4, intrinsic method: None
(1.0/5) Episode Score: 10,  Train Loss: 10.83754,  Time taken: 4 secs.
(2.0/5) Episode Score: 135,  Train Loss: 10441.08203,  Time taken: 10 secs.
(3.0/5) Episode Score: 120,  Train Loss: 19689.09961,  Time taken: 10 secs.
(4.0/5) Episode Score: 105,  Train Loss: 2231.62939,  Time taken: 8 secs.
(5.0/5) Episode Score: 75,  Train Loss: 1471.75049,  Time taken: 6 secs.
Training complete. Access metrics from 'logger' attribute. 

In [9]:
# Same short 5-episode run for the empowerment DQN agent; its progress lines
# additionally report Source Loss and Forward Loss.
dqn3.train(num_episodes=5, print_every=1, save_count=10)

Training agent on SpaceInvaders with 5 episodes.
Buffer size: 100k, batch size: 32, max timesteps: 1k, num network updates: 4, intrinsic method: empowerment
(1.0/5) Episode Score: 215,  Train Loss: 20.10689,  Source Loss: 0.00024,  Forward Loss: 1.22751,  Time taken: 12 secs.
(2.0/5) Episode Score: 120,  Train Loss: 20.15374,  Source Loss: 0.00016,  Forward Loss: 1.31044,  Time taken: 7 secs.
(3.0/5) Episode Score: 125,  Train Loss: 3.03293,  Source Loss: 0.00010,  Forward Loss: 1.32907,  Time taken: 9 secs.
(4.0/5) Episode Score: 210,  Train Loss: 26.67262,  Source Loss: 0.00007,  Forward Loss: 1.33250,  Time taken: 10 secs.
(5.0/5) Episode Score: 100,  Train Loss: 0.21294,  Source Loss: 0.00005,  Forward Loss: 1.32392,  Time taken: 7 secs.
Training complete. Access metrics from 'logger' attribute. 

In [10]:
# Display each DQN agent's logger (its repr lists the available metric attributes)
tuple(agent.logger for agent in (dqn, dqn2, dqn3))

(Available attributes: '['actions', 'train_losses', 'ep_scores', 'intrinsic_losses']',
 Available attributes: '['actions', 'train_losses', 'ep_scores', 'intrinsic_losses']',
 Available attributes: '['actions', 'train_losses', 'ep_scores', 'intrinsic_losses']')

In [11]:
# Display the logged per-action counts for each DQN agent
tuple(agent.logger.actions for agent in (dqn, dqn2, dqn3))

([Counter({4: 390, 1: 394, 0: 414, 2: 391, 5: 371, 3: 384})],
 [Counter({1: 384, 4: 368, 2: 370, 3: 349, 5: 372, 0: 404})],
 [Counter({0: 417, 3: 387, 1: 383, 4: 426, 5: 465, 2: 441})])

### 2b. Rainbow Deep Q-Network (RDQN)

In [12]:
# Create three Rainbow DQN agents on the `env2` environment (ALE/Qbert-v5 in
# the recorded run) that differ only in their intrinsic-motivation setting.
rainbow = create_model('rainbow', env=env2, device=device, im_type='curiosity')  # curiosity variant
rainbow2 = create_model('rainbow', env=env2, device=device)  # no im_type -> "intrinsic method: None"
rainbow3 = create_model('rainbow', env=env2, device=device, im_type='empowerment')  # empowerment variant

In [13]:
# Show the environment settings each Rainbow agent was built with
tuple(agent.env_details for agent in (rainbow, rainbow2, rainbow3))

({'gym_name': 'ALE/Qbert-v5', 'name': 'Qbert', 'obs_space': Box(0, 255, (4, 84, 84), uint8), 'action_space': Discrete(6), 'input_shape': (4, 84, 84), 'n_actions': 6, 'img_size': 84, 'stack_size': 4, 'capture_video': False, 'record_every': 10000},
 {'gym_name': 'ALE/Qbert-v5', 'name': 'Qbert', 'obs_space': Box(0, 255, (4, 84, 84), uint8), 'action_space': Discrete(6), 'input_shape': (4, 84, 84), 'n_actions': 6, 'img_size': 84, 'stack_size': 4, 'capture_video': False, 'record_every': 10000},
 {'gym_name': 'ALE/Qbert-v5', 'name': 'Qbert', 'obs_space': Box(0, 255, (4, 84, 84), uint8), 'action_space': Discrete(6), 'input_shape': (4, 84, 84), 'n_actions': 6, 'img_size': 84, 'stack_size': 4, 'capture_video': False, 'record_every': 10000})

In [14]:
# Train the curiosity Rainbow agent for a short 5-episode run, printing stats
# after every episode (print_every=1).
# save_count presumably controls how many checkpoints are written — TODO confirm.
rainbow.train(num_episodes=5, print_every=1, save_count=10)

Training agent on Qbert with 5 episodes.
Buffer size: 100k, batch size: 32, max timesteps: 1k, num network updates: 4, replay period: 100, intrinsic method: curiosity.
(1.0/5)  Episode Score: 0,  Train Loss: 7.45471,  Curiosity Loss: 3.54361,  Time taken: 15 secs.
(2.0/5)  Episode Score: 200,  Train Loss: 7.36903,  Curiosity Loss: 3.54737,  Time taken: 19 secs.
(3.0/5)  Episode Score: 0,  Train Loss: 7.33716,  Curiosity Loss: 3.54385,  Time taken: 17 secs.
(4.0/5)  Episode Score: 0,  Train Loss: 7.12473,  Curiosity Loss: 3.54115,  Time taken: 18 secs.
(5.0/5)  Episode Score: 125,  Train Loss: 7.11427,  Curiosity Loss: 3.54152,  Time taken: 15 secs.
Training complete. Access metrics from 'logger' attribute. 

In [15]:
# Same short 5-episode run for the Rainbow agent created without an im_type
# (its banner reports "intrinsic method: None").
rainbow2.train(num_episodes=5, print_every=1, save_count=10)

Training agent on Qbert with 5 episodes.
Buffer size: 100k, batch size: 32, max timesteps: 1k, num network updates: 4, replay period: 100, intrinsic method: None.
(1.0/5)  Episode Score: 0,  Train Loss: 3.86856,  Time taken: 7 secs.
(2.0/5)  Episode Score: 125,  Train Loss: 3.72778,  Time taken: 9 secs.
(3.0/5)  Episode Score: 25,  Train Loss: 2.64229,  Time taken: 7 secs.
(4.0/5)  Episode Score: 125,  Train Loss: 1.57758,  Time taken: 8 secs.
(5.0/5)  Episode Score: 200,  Train Loss: 3.34030,  Time taken: 8 secs.
Training complete. Access metrics from 'logger' attribute. 

In [16]:
# Same short 5-episode run for the empowerment Rainbow agent; its progress
# lines additionally report Source Loss and Forward Loss.
rainbow3.train(num_episodes=5, print_every=1, save_count=10)

Training agent on Qbert with 5 episodes.
Buffer size: 100k, batch size: 32, max timesteps: 1k, num network updates: 4, replay period: 100, intrinsic method: empowerment.
(1.0/5)  Episode Score: 175,  Train Loss: 3.89586,  Source Loss: 0.00147,  Forward Loss: 1.28909,  Time taken: 7 secs.
(2.0/5)  Episode Score: 0,  Train Loss: 3.82982,  Source Loss: 0.00055,  Forward Loss: 1.16748,  Time taken: 5 secs.
(3.0/5)  Episode Score: 250,  Train Loss: 3.53767,  Source Loss: 0.00024,  Forward Loss: 1.10967,  Time taken: 8 secs.
(4.0/5)  Episode Score: 125,  Train Loss: 3.29969,  Source Loss: 0.00018,  Forward Loss: 1.11687,  Time taken: 8 secs.
(5.0/5)  Episode Score: 200,  Train Loss: 2.16264,  Source Loss: 0.00015,  Forward Loss: 1.07415,  Time taken: 7 secs.
Training complete. Access metrics from 'logger' attribute. 

In [17]:
# Display each Rainbow agent's logger (its repr lists the available metric attributes)
tuple(agent.logger for agent in (rainbow, rainbow2, rainbow3))

(Available attributes: '['avg_returns', 'actions', 'train_losses', 'ep_scores', 'intrinsic_losses']',
 Available attributes: '['avg_returns', 'actions', 'train_losses', 'ep_scores', 'intrinsic_losses']',
 Available attributes: '['avg_returns', 'actions', 'train_losses', 'ep_scores', 'intrinsic_losses']')

In [18]:
# Display the logged per-action counts for each Rainbow agent
tuple(agent.logger.actions for agent in (rainbow, rainbow2, rainbow3))

([Counter({4: 4205, 0: 178, 2: 1071, 5: 2550, 1: 485, 3: 87})],
 [Counter({4: 3558, 0: 163, 3: 1758, 2: 1997, 5: 1083, 1: 17})],
 [Counter({5: 3900, 3: 1199, 0: 1163, 1: 868, 2: 1787, 4: 747})])

### 2c. Proximal Policy Optimization (PPO)

In [19]:
# Create three PPO agents with different intrinsic-motivation settings.
# NOTE(review): `ppo` and `ppo3` run on `env3` (ALE/MontezumaRevenge-v5 in the
# recorded run) while `ppo2` uses env='primary' (ALE/SpaceInvaders-v5 in the
# recorded run). The DQN and Rainbow sections keep all three variants on one
# environment — confirm the mismatch here is intentional.
ppo = create_model('ppo', env=env3, device=device, im_type='curiosity')  # curiosity variant
ppo2 = create_model('ppo', env='primary', device=device)  # no im_type -> "intrinsic method: None"
ppo3 = create_model('ppo', env=env3, device=device, im_type='empowerment')  # empowerment variant

In [20]:
# Show the environment settings each PPO agent was built with
tuple(agent.env_details for agent in (ppo, ppo2, ppo3))

({'gym_name': 'ALE/MontezumaRevenge-v5', 'name': 'MontezumaRevenge', 'obs_space': Box(0, 255, (4, 84, 84), uint8), 'action_space': Discrete(18), 'input_shape': (4, 84, 84), 'n_actions': 18, 'img_size': 84, 'stack_size': 4, 'capture_video': False, 'record_every': 10000},
 {'gym_name': 'ALE/SpaceInvaders-v5', 'name': 'SpaceInvaders', 'obs_space': Box(0, 255, (4, 84, 84), uint8), 'action_space': Discrete(6), 'input_shape': (4, 84, 84), 'n_actions': 6, 'img_size': 84, 'stack_size': 4, 'capture_video': False, 'record_every': 10000},
 {'gym_name': 'ALE/MontezumaRevenge-v5', 'name': 'MontezumaRevenge', 'obs_space': Box(0, 255, (4, 84, 84), uint8), 'action_space': Discrete(18), 'input_shape': (4, 84, 84), 'n_actions': 18, 'img_size': 84, 'stack_size': 4, 'capture_video': False, 'record_every': 10000})

In [21]:
# Train the curiosity PPO agent for a short 5-episode run, printing stats
# after every episode (print_every=1).
# NOTE(review): the recorded banner says "with 200 episodes" although
# num_episodes=5 is passed and the progress lines count up to 5 — confirm
# where the 200 comes from inside train().
ppo.train(num_episodes=5, print_every=1, save_count=10)

Training agent on MontezumaRevenge with 200 episodes.
Surrogate clipping size: 0.1, rollout size: 10, num environments: 4, num network updates: 4, batch size: 40, training iterations: 5, intrinsic method: curiosity.
(1.0/5) Episode Score: 0.00,  Episodic Return: 0.05,  Approx KL: 0.006,  Total Loss: 7863.952,  Policy Loss: -0.001,  Value Loss: 5257.306,  Entropy Loss: 2.889,  Curiosity Loss: 5196.11914,  Time taken: 2 secs.
(2.0/5) Episode Score: 0.00,  Episodic Return: 0.31,  Approx KL: 0.020,  Total Loss: 7919.022,  Policy Loss: -0.117,  Value Loss: 5311.103,  Entropy Loss: 2.851,  Curiosity Loss: 5283.39502,  Time taken: 1 secs.
(3.0/5) Episode Score: 0.00,  Episodic Return: 2.32,  Approx KL: 2.133,  Total Loss: 7872.543,  Policy Loss: -0.245,  Value Loss: 5253.448,  Entropy Loss: 1.230,  Curiosity Loss: 5242.19043,  Time taken: 1 secs.
(4.0/5) Episode Score: 0.00,  Episodic Return: 17.46,  Approx KL: -0.000,  Total Loss: 7663.472,  Policy Loss: 0.000,  Value Loss: 4947.017,  Entrop

In [22]:
# Same short 5-episode run for the PPO agent created without an im_type
# (its banner reports "intrinsic method: None").
# NOTE(review): the recorded banner says "with 200 episodes" although
# num_episodes=5 is passed — confirm inside train().
ppo2.train(num_episodes=5, print_every=1, save_count=10)

Training agent on SpaceInvaders with 200 episodes.
Surrogate clipping size: 0.1, rollout size: 10, num environments: 4, num network updates: 4, batch size: 40, training iterations: 5, intrinsic method: None.
(1.0/5) Episode Score: 0.00,  Episodic Return: 0.07,  Approx KL: -0.001,  Total Loss: -0.021,  Policy Loss: -0.003,  Value Loss: 0.000,  Entropy Loss: 1.791,  Time taken: 0 secs.
(2.0/5) Episode Score: 0.00,  Episodic Return: 0.07,  Approx KL: -0.003,  Total Loss: -0.021,  Policy Loss: -0.003,  Value Loss: 0.000,  Entropy Loss: 1.790,  Time taken: 0 secs.
(3.0/5) Episode Score: 0.00,  Episodic Return: 0.06,  Approx KL: 0.001,  Total Loss: -0.025,  Policy Loss: -0.007,  Value Loss: 0.000,  Entropy Loss: 1.788,  Time taken: 0 secs.
(4.0/5) Episode Score: 0.00,  Episodic Return: 0.07,  Approx KL: -0.006,  Total Loss: -0.035,  Policy Loss: -0.017,  Value Loss: 0.000,  Entropy Loss: 1.775,  Time taken: 0 secs.
(5.0/5) Episode Score: 0.00,  Episodic Return: 0.07,  Approx KL: 1.885,  Tota

In [23]:
# Same short 5-episode run for the empowerment PPO agent; its progress lines
# additionally report Source Loss and Forward Loss.
# NOTE(review): the recorded banner says "with 200 episodes" although
# num_episodes=5 is passed — confirm inside train().
ppo3.train(num_episodes=5, print_every=1, save_count=10)

Training agent on MontezumaRevenge with 200 episodes.
Surrogate clipping size: 0.1, rollout size: 10, num environments: 4, num network updates: 4, batch size: 40, training iterations: 5, intrinsic method: empowerment.
(1.0/5) Episode Score: 0.00,  Episodic Return: 0.02,  Approx KL: 0.001,  Total Loss: -0.035,  Policy Loss: -0.006,  Value Loss: 0.000,  Entropy Loss: 2.882,  Source Loss: 0.19777,  Forward Loss: 1.30755,  Time taken: 0 secs.
(2.0/5) Episode Score: 0.00,  Episodic Return: 0.03,  Approx KL: -0.002,  Total Loss: -0.035,  Policy Loss: -0.006,  Value Loss: 0.000,  Entropy Loss: 2.881,  Source Loss: 0.17730,  Forward Loss: 1.15840,  Time taken: 0 secs.
(3.0/5) Episode Score: 0.00,  Episodic Return: 0.03,  Approx KL: -0.004,  Total Loss: -0.034,  Policy Loss: -0.006,  Value Loss: 0.000,  Entropy Loss: 2.879,  Source Loss: 0.10939,  Forward Loss: 1.07264,  Time taken: 0 secs.
(4.0/5) Episode Score: 0.00,  Episodic Return: 0.03,  Approx KL: -0.001,  Total Loss: -0.034,  Policy Los

In [24]:
# Display each PPO agent's logger (its repr lists the available metric attributes)
tuple(agent.logger for agent in (ppo, ppo2, ppo3))

(Available attributes: '['actions', 'avg_returns', 'avg_rewards', 'policy_losses', 'value_losses', 'entropy_losses', 'total_losses', 'approx_kl', 'intrinsic_losses']',
 Available attributes: '['actions', 'avg_returns', 'avg_rewards', 'policy_losses', 'value_losses', 'entropy_losses', 'total_losses', 'approx_kl', 'intrinsic_losses']',
 Available attributes: '['actions', 'avg_returns', 'avg_rewards', 'policy_losses', 'value_losses', 'entropy_losses', 'total_losses', 'approx_kl', 'intrinsic_losses']')

In [25]:
# Display the logged per-action counts for each PPO agent
tuple(agent.logger.actions for agent in (ppo, ppo2, ppo3))

([Counter({4: 44,
           5: 24,
           9: 44,
           3: 32,
           12: 24,
           8: 16,
           17: 40,
           14: 20,
           16: 28,
           0: 16,
           7: 20,
           10: 36,
           13: 20,
           11: 16,
           2: 20,
           1: 28,
           15: 8,
           6: 364})],
 [Counter({5: 128, 3: 164, 1: 132, 4: 148, 0: 116, 2: 112})],
 [Counter({11: 60,
           9: 52,
           6: 32,
           15: 36,
           1: 64,
           17: 56,
           7: 60,
           14: 28,
           4: 76,
           10: 60,
           13: 48,
           3: 60,
           2: 24,
           5: 28,
           8: 40,
           12: 40,
           16: 20,
           0: 16})])

## 3. Load Model

In [26]:
# NOTE(review): both names are already imported in the first cell of this
# notebook. They are repeated here, presumably so this section can be run on
# its own after a kernel restart — confirm, or drop the duplicate.
from utils.render import video_render
from utils.model_utils import load_model

In [29]:
# Examples of restoring a previously saved agent; uncomment exactly one line.
# The arguments appear to be (saved model name, device string, model type) —
# TODO confirm against utils.model_utils.load_model.
# NOTE(review): three of the examples hard-code 'cuda:0', while the recorded
# run of this notebook fell back to CPU; they would presumably need 'cpu'
# (or the `device` value set earlier) on such a machine — verify.
# model = load_model('rainbowcuriosity_batch32_buffer1k_Qbert_ep5', 'cuda:0', 'rainbow')
# model = load_model('dqncuriosity_batch32_SpaInv_ep5', 'cuda:0', 'dqn')
# model = load_model('ppocuriosity_rollout100_agents8_MonRev_ep5', 'cuda:0', 'ppo')
# model = load_model('ppo_rollout10_agents4_SpaInv_ep40k', 'cpu', 'ppo')

In [30]:
# Render a video with the loaded agent. `model` is only defined once one of
# the commented-out load_model(...) lines in the previous cell has been run.
# The meaning of the second argument (1) is not visible here — TODO confirm.
# video_render(model, 1)