# Reinforcement Learning with Atari Games

## 1. Initial Setup

In [1]:
from core.create import create_model, get_utility_params
from utils.helper import set_device
from utils.model_utils import load_model
from utils.render import video_render

In [2]:
# Read the utility settings (kept in the project's YAML configuration)
util_params = get_utility_params()

# Expose the two settings used by this notebook as upper-case constants
NUM_EPISODES, SAVE_EVERY = util_params['num_episodes'], util_params['save_every']
print(f'NUM_EPISODES={NUM_EPISODES}, SAVE_EVERY={SAVE_EVERY}')

NUM_EPISODES=50000, SAVE_EVERY=10000


In [3]:
# Select the compute device once; every model created below receives it via
# `device=device`. `set_device` prints its choice (in the recorded run: CUDA
# available, GPU 'cuda:0').
device = set_device()

CUDA available. Device set to GPU -> 'cuda:0'


In [4]:
# Environment ids for the Rainbow (env2) and PPO (env3) sections, taken from the config
env2, env3 = util_params['env_2'], util_params['env_3']
print(f'env2={env2}, env3={env3}')

env2=ALE/Qbert-v5, env3=ALE/MontezumaRevenge-v5


## 2. Model Creation and Training

### 2a. Deep Q-Network (DQN)

In [5]:
# Create two DQN agents that differ only in `im_type`, so their runs can be
# compared side by side:
#   dqn  - intrinsic motivation method 'curiosity'
#   dqn2 - no `im_type` given (its training log reports "intrinsic method: None")
# No `env` argument is passed here; per the recorded output both agents run on
# SpaceInvaders.
dqn = create_model('dqn', device=device, im_type='curiosity')
dqn2 = create_model('dqn', device=device)

  logger.warn(


In [6]:
# Display the environment settings attached to the agent (Gym id, observation
# and action spaces, input shape, frame-stack size, video-capture options).
dqn.env_details

{'gym_name': 'ALE/SpaceInvaders-v5', 'name': 'SpaceInvaders', 'obs_space': Box(0, 255, (4, 84, 84), uint8), 'action_space': Discrete(6), 'input_shape': (4, 84, 84), 'n_actions': 6, 'img_size': 84, 'stack_size': 4, 'capture_video': False, 'record_every': 10000}

In [7]:
# Short demonstration run of the curiosity-driven DQN: 5 episodes, with a
# progress line after every episode (print_every=1).
# NOTE(review): with save_count=5 the recorded run wrote a single checkpoint at
# episode 5 - confirm the exact meaning of `save_count` against `train`.
dqn.train(num_episodes=5, print_every=1, save_count=5)

Training agent on SpaceInvaders with 5 episodes.
Buffer size: 1k, batch size: 32, max timesteps: 1k, num network updates: 4, intrinsic method: curiosity
(1.0/5) Episode Score: 120,  Train Loss: 2426.58936,  Curiosity Loss: 3.57615,  Time taken: 5.83 secs.
(2.0/5) Episode Score: 65,  Train Loss: 308.26587,  Curiosity Loss: 3.57719,  Time taken: 3.27 secs.
(3.0/5) Episode Score: 80,  Train Loss: 251.28535,  Curiosity Loss: 3.57784,  Time taken: 2.72 secs.
(4.0/5) Episode Score: 155,  Train Loss: 173.95744,  Curiosity Loss: 3.57762,  Time taken: 4.03 secs.
(5.0/5) Episode Score: 155,  Train Loss: 16.91314,  Curiosity Loss: 3.57728,  Time taken: 3.91 secs.
Saved model at episode 5 as: 'dqncuriosity_batch32_SpaInv_ep5.pt'.
Saved logger data to 'saved_models/dqncuriosity_SpaInv_logger_data.tar.gz'. Total size: 843 bytes
Training complete. Access metrics from 'logger' attribute. 

In [8]:
# Same 5-episode demonstration run for the DQN created without `im_type`,
# using identical training arguments so the two logs are comparable.
dqn2.train(num_episodes=5, print_every=1, save_count=5)

Training agent on SpaceInvaders with 5 episodes.
Buffer size: 1k, batch size: 32, max timesteps: 1k, num network updates: 4, intrinsic method: None
(1.0/5) Episode Score: 210,  Train Loss: 17073.07812,  Time taken: 2.23 secs.
(2.0/5) Episode Score: 35,  Train Loss: 1859.45862,  Time taken: 1.39 secs.
(3.0/5) Episode Score: 50,  Train Loss: 1251.52161,  Time taken: 1.48 secs.
(4.0/5) Episode Score: 90,  Train Loss: 257.93082,  Time taken: 2.42 secs.
(5.0/5) Episode Score: 205,  Train Loss: 21.34808,  Time taken: 3.44 secs.
Saved model at episode 5 as: 'dqn_batch32_SpaInv_ep5.pt'.
Saved logger data to 'saved_models/dqn_SpaInv_logger_data.tar.gz'. Total size: 794 bytes
Training complete. Access metrics from 'logger' attribute. 

In [9]:
# Display the logger; its repr lists the metric attributes available after training.
dqn.logger

Available attributes: '['actions', 'train_losses', 'ep_scores', 'intrinsic_losses']'

In [10]:
# Display the logged action counts. In the recorded run this is a list holding
# one Counter (presumably keyed by action index - confirm against the logger).
dqn.logger.actions

[Counter({4: 385, 1: 416, 0: 423, 2: 390, 5: 406, 3: 455})]

### 2b. Rainbow Deep Q-Network (RDQN)

In [11]:
# Create two Rainbow DQN agents on the `env2` environment that differ only in
# `im_type`, so their runs can be compared side by side:
#   rainbow  - intrinsic motivation method 'curiosity'
#   rainbow2 - no `im_type` given (its training log reports "intrinsic method: None")
rainbow = create_model('rainbow', env=env2, device=device, im_type='curiosity')
rainbow2 = create_model('rainbow', env=env2, device=device)

In [12]:
# Display the environment settings attached to the Rainbow agent (Gym id,
# observation and action spaces, input shape, frame-stack size, video options).
rainbow.env_details

{'gym_name': 'ALE/Qbert-v5', 'name': 'Qbert', 'obs_space': Box(0, 255, (4, 84, 84), uint8), 'action_space': Discrete(6), 'input_shape': (4, 84, 84), 'n_actions': 6, 'img_size': 84, 'stack_size': 4, 'capture_video': False, 'record_every': 10000}

In [13]:
# Short demonstration run of the curiosity-driven Rainbow agent: 5 episodes,
# with a progress line after every episode (print_every=1).
# NOTE(review): with save_count=5 the recorded run wrote a single checkpoint at
# episode 5 - confirm the exact meaning of `save_count` against `train`.
rainbow.train(num_episodes=5, print_every=1, save_count=5)

Training agent on Qbert with 5 episodes.
Buffer size: 1k, batch size: 32, max timesteps: 1k, num network updates: 4, replay period: 100, intrinsic method: curiosity.
(1.0/5)  Episode Score: 0,  Train Loss: 7.43996,  Curiosity Loss: 3.53157,  Time taken: 4.72 secs.
(2.0/5)  Episode Score: 250,  Train Loss: 7.36464,  Curiosity Loss: 3.54006,  Time taken: 6.64 secs.
(3.0/5)  Episode Score: 125,  Train Loss: 7.14056,  Curiosity Loss: 3.53956,  Time taken: 4.30 secs.
(4.0/5)  Episode Score: 175,  Train Loss: 6.85037,  Curiosity Loss: 3.54415,  Time taken: 4.61 secs.
(5.0/5)  Episode Score: 175,  Train Loss: 5.20295,  Curiosity Loss: 3.54321,  Time taken: 5.11 secs.
Saved model at episode 5 as: 'rainbowcuriosity_batch32_buffer1k_Qbert_ep5.pt'.
Saved logger data to 'saved_models/rainbowcuriosity_Qbert_logger_data.tar.gz'. Total size: 708 bytes
Training complete. Access metrics from 'logger' attribute. 

In [14]:
# Same 5-episode demonstration run for the Rainbow agent created without
# `im_type`, using identical training arguments so the two logs are comparable.
rainbow2.train(num_episodes=5, print_every=1, save_count=5)

Training agent on Qbert with 5 episodes.
Buffer size: 1k, batch size: 32, max timesteps: 1k, num network updates: 4, replay period: 100, intrinsic method: None.
(1.0/5)  Episode Score: 175,  Train Loss: 3.71781,  Time taken: 4.32 secs.
(2.0/5)  Episode Score: 125,  Train Loss: 3.80244,  Time taken: 3.58 secs.
(3.0/5)  Episode Score: 325,  Train Loss: 3.20816,  Time taken: 4.20 secs.
(4.0/5)  Episode Score: 125,  Train Loss: 3.06909,  Time taken: 3.49 secs.
(5.0/5)  Episode Score: 25,  Train Loss: 2.56938,  Time taken: 4.69 secs.
Saved model at episode 5 as: 'rainbow_batch32_buffer1k_Qbert_ep5.pt'.
Saved logger data to 'saved_models/rainbow_Qbert_logger_data.tar.gz'. Total size: 669 bytes
Training complete. Access metrics from 'logger' attribute. 

In [15]:
# Display the logger; its repr lists the metric attributes available after training.
rainbow.logger

Available attributes: '['avg_returns', 'actions', 'train_losses', 'ep_scores', 'intrinsic_losses']'

In [16]:
# Display the logged action counts of both Rainbow agents as a tuple
# (curiosity agent first, agent without `im_type` second).
rainbow.logger.actions, rainbow2.logger.actions

([Counter({4: 3268, 0: 165, 2: 1150, 5: 1339, 3: 3457, 1: 29})],
 [Counter({5: 4121, 2: 1149, 3: 2054, 4: 1367, 0: 599, 1: 54})])

### 2c. Proximal Policy Optimization (PPO)

In [17]:
# Create two PPO agents on the `env3` environment that differ only in
# `im_type`, so their runs can be compared side by side:
#   ppo  - intrinsic motivation method 'curiosity'
#   ppo2 - no `im_type` given (its training log reports "intrinsic method: None")
ppo = create_model('ppo', env=env3, device=device, im_type='curiosity')
ppo2 = create_model('ppo', env=env3, device=device)

In [18]:
# Display the environment settings attached to the PPO agent (Gym id,
# observation and action spaces, input shape, frame-stack size, video options).
ppo.env_details

{'gym_name': 'ALE/MontezumaRevenge-v5', 'name': 'MontezumaRevenge', 'obs_space': Box(0, 255, (4, 84, 84), uint8), 'action_space': Discrete(18), 'input_shape': (4, 84, 84), 'n_actions': 18, 'img_size': 84, 'stack_size': 4, 'capture_video': False, 'record_every': 10000}

In [19]:
# The `num_episodes` value handed to `ppo.train` is expressed in units of
# rollout_size * num_envs per training iteration (the recorded run reports
# rollout size 100, 8 environments and 5 training iterations for "4K episodes").
DEMO_ITERATIONS = 5  # number of PPO training iterations used for the short demo
steps_per_iteration = ppo.params.rollout_size * ppo.params.num_envs

# Full-length run: one training iteration per configured episode.
PPO_NUM_EPISODES = steps_per_iteration * NUM_EPISODES

# Demo-length run. Computed directly from `steps_per_iteration` rather than by
# dividing PPO_NUM_EPISODES by NUM_EPISODES, which went through float division
# and raised ZeroDivisionError when NUM_EPISODES was configured as 0.
demo_episodes = int(steps_per_iteration * DEMO_ITERATIONS)

In [20]:
# Short demonstration run of the curiosity-driven PPO agent. `demo_episodes`
# corresponds to 5 training iterations, and a progress line is printed after
# each one (print_every=1).
# NOTE(review): confirm the exact meaning of `save_count` against `train`.
ppo.train(num_episodes=demo_episodes, print_every=1, save_count=5)

Training agent on MontezumaRevenge with 4K episodes.
Surrogate clipping size: 0.1, rollout size: 100, num environments: 8, num network updates: 4, batch size: 800, training iterations: 5, intrinsic method: curiosity.
(1.0/5) Episodic Return: -0.03268,  Approx KL: -0.00050,  Total Loss: 8451.31329,  Policy Loss: -0.00067,  Value Loss: 5856.00031,  Entropy Loss: 2.88883,  Curiosity Loss: 5523.34271,  Time taken: 2.23 secs.
(2.0/5) Episodic Return: 0.16280,  Approx KL: 0.04365,  Total Loss: 8423.76660,  Policy Loss: -0.05518,  Value Loss: 5821.33765,  Entropy Loss: 2.82460,  Curiosity Loss: 5513.18121,  Time taken: 1.76 secs.
(3.0/5) Episodic Return: 2.12083,  Approx KL: 3.06891,  Total Loss: 8350.86261,  Policy Loss: -0.38449,  Value Loss: 5685.05963,  Entropy Loss: 0.79983,  Curiosity Loss: 5508.72528,  Time taken: 1.74 secs.
(4.0/5) Episodic Return: 16.31793,  Approx KL: -0.00002,  Total Loss: 8203.01459,  Policy Loss: 0.00000,  Value Loss: 5101.84595,  Entropy Loss: 0.00002,  Curiosit

In [21]:
# Same demonstration run for the PPO agent created without `im_type`, using
# identical training arguments so the two logs are comparable.
ppo2.train(num_episodes=demo_episodes, print_every=1, save_count=5)

Training agent on MontezumaRevenge with 4K episodes.
Surrogate clipping size: 0.1, rollout size: 100, num environments: 8, num network updates: 4, batch size: 800, training iterations: 5, intrinsic method: None.
(1.0/5) Episodic Return: -0.03268,  Approx KL: 0.00023,  Total Loss: -0.02987,  Policy Loss: -0.00104,  Value Loss: 0.00012,  Entropy Loss: 2.88933,  Time taken: 1.12 secs.
(2.0/5) Episodic Return: -0.02270,  Approx KL: -0.00004,  Total Loss: -0.02925,  Policy Loss: -0.00038,  Value Loss: 0.00005,  Entropy Loss: 2.88914,  Time taken: 1.17 secs.
(3.0/5) Episodic Return: -0.01221,  Approx KL: -0.00001,  Total Loss: -0.02946,  Policy Loss: -0.00058,  Value Loss: 0.00001,  Entropy Loss: 2.88902,  Time taken: 1.15 secs.
(4.0/5) Episodic Return: -0.00705,  Approx KL: -0.00025,  Total Loss: -0.02917,  Policy Loss: -0.00029,  Value Loss: 0.00000,  Entropy Loss: 2.88858,  Time taken: 1.09 secs.
(5.0/5) Episodic Return: -0.00253,  Approx KL: -0.00014,  Total Loss: -0.02937,  Policy Loss:

In [22]:
# Display the logger; its repr lists the metric attributes available after training.
ppo.logger

Available attributes: '['actions', 'avg_rewards', 'avg_returns', 'policy_losses', 'value_losses', 'entropy_losses', 'total_losses', 'approx_kl', 'intrinsic_losses']'

In [23]:
# Display the logged total losses of both PPO agents as a tuple (curiosity
# agent first). The recorded run shows one value per training iteration.
ppo.logger.total_losses, ppo2.logger.total_losses

([8451.313293457031,
  8423.7666015625,
  8350.862609863281,
  8203.014587402344,
  7447.140594482422],
 [-0.029874820553231984,
  -0.029246532707475126,
  -0.029461760306730866,
  -0.029169123619794846,
  -0.029371113108936697])

## 3. Load Model

In [24]:
# Placeholder, intentionally disabled: to load a saved model, uncomment the line
# below and replace the empty string with the checkpoint to load.
# NOTE(review): presumably one of the '.pt' files saved during training above -
# confirm the expected argument format against `load_model`.
# model = load_model('', device, 'dqn')

In [25]:
# Placeholder, intentionally disabled: requires `model`, which only exists once
# the (also disabled) `load_model` cell in this section has been run.
# video_render(model)