# Reinforcement Learning with Atari Games

## 1. Initial Setup

In [1]:
from core.create import create_model, get_utility_params
from utils.helper import set_device
from utils.model_utils import load_model
from utils.render import video_render

In [2]:
# Load the utility settings that the project keeps in its yaml config
util_params = get_utility_params()

# Expose the run-length settings as notebook-level constants
NUM_EPISODES, SAVE_EVERY = (
    util_params['num_episodes'],
    util_params['save_every'],
)
print(f'NUM_EPISODES={NUM_EPISODES}, SAVE_EVERY={SAVE_EVERY}')

NUM_EPISODES=100000, SAVE_EVERY=10000


In [3]:
# Pick the compute device through the project's set_device helper.
# On the recorded run it reported that CUDA was available and chose 'cuda:0';
# the returned value is passed to every create_model call below.
device = set_device()

CUDA available. Device set to GPU -> 'cuda:0'


In [4]:
# Gym ids of the two additional environments, read from the same config dict.
# env2 is used for the Rainbow agents and env3 for the curiosity PPO agent below.
env2, env3 = util_params['env_2'], util_params['env_3']
print(f'env2={env2}, env3={env3}')

env2=ALE/Qbert-v5, env3=ALE/MontezumaRevenge-v5


## 2. Model Creation and Training

### 2a. Deep Q-Network (DQN)

In [5]:
# Build two DQN agents on the default environment (no `env` argument is passed):
#   dqn  -> trained with the 'curiosity' intrinsic method
#   dqn2 -> baseline created without an intrinsic method
# The tuple is evaluated left to right, so creation order matches the names.
dqn, dqn2 = (
    create_model('dqn', device=device, im_type='curiosity'),
    create_model('dqn', device=device),
)

  logger.warn(


In [6]:
# Show the environment configuration attached to the curiosity DQN agent
# (gym id, observation/action spaces, frame stack size, video capture settings).
dqn.env_details

{'gym_name': 'ALE/SpaceInvaders-v5', 'name': 'SpaceInvaders', 'obs_space': Box(0, 255, (4, 84, 84), uint8), 'action_space': Discrete(6), 'input_shape': (4, 84, 84), 'n_actions': 6, 'img_size': 84, 'stack_size': 4, 'capture_video': False, 'record_every': 10000}

In [7]:
# Short demonstration run of the curiosity DQN: 5 episodes, progress printed
# after every episode. On the recorded run one checkpoint was written at episode 5.
# NOTE(review): the values are hard-coded for the demo; NUM_EPISODES / SAVE_EVERY
# from the config cell are not passed here — confirm that is intended.
dqn.train(num_episodes=5, print_every=1, save_count=5)

Training agent on SpaceInvaders with 5 episodes.
Buffer size: 1k, batch size: 32, max timesteps: 1k, num network updates: 4, intrinsic method: curiosity
(1.0/5) Episode Score: 120,  Train Loss: 2066.30322,  Curiosity Loss: 3.57615,  Time taken: 5 secs.
(2.0/5) Episode Score: 65,  Train Loss: 64.39565,  Curiosity Loss: 3.57719,  Time taken: 2 secs.
(3.0/5) Episode Score: 80,  Train Loss: 202.60321,  Curiosity Loss: 3.57784,  Time taken: 2 secs.
(4.0/5) Episode Score: 155,  Train Loss: 345.67868,  Curiosity Loss: 3.57762,  Time taken: 3 secs.
(5.0/5) Episode Score: 120,  Train Loss: 70.57154,  Curiosity Loss: 3.58028,  Time taken: 3 secs.
Saved model at episode 5 as: 'dqncuriosity_batch32_SpaInv_ep5.pt'.
Saved logger data to 'saved_models/dqncuriosity_SpaInv_logger_data.tar.gz'. Total size: 844 bytes
Training complete. Access metrics from 'logger' attribute. 

In [8]:
# Train the baseline DQN (no intrinsic method) with the same demo settings as
# the curiosity agent so the two short runs can be compared side by side.
dqn2.train(num_episodes=5, print_every=1, save_count=5)

Training agent on SpaceInvaders with 5 episodes.
Buffer size: 1k, batch size: 32, max timesteps: 1k, num network updates: 4, intrinsic method: None
(1.0/5) Episode Score: 210,  Train Loss: 373.15045,  Time taken: 2 secs.
(2.0/5) Episode Score: 70,  Train Loss: 171.66989,  Time taken: 1 secs.
(3.0/5) Episode Score: 125,  Train Loss: 84.75105,  Time taken: 2 secs.
(4.0/5) Episode Score: 225,  Train Loss: 5.63408,  Time taken: 2 secs.
(5.0/5) Episode Score: 55,  Train Loss: 2.66105,  Time taken: 1 secs.
Saved model at episode 5 as: 'dqn_batch32_SpaInv_ep5.pt'.
Saved logger data to 'saved_models/dqn_SpaInv_logger_data.tar.gz'. Total size: 790 bytes
Training complete. Access metrics from 'logger' attribute. 

In [9]:
# Display the logger to list which metric attributes were recorded during training.
dqn.logger

Available attributes: '['actions', 'train_losses', 'ep_scores', 'intrinsic_losses']'

In [10]:
# Compare the action Counters recorded by the curiosity and baseline DQN loggers.
dqn.logger.actions, dqn2.logger.actions

([Counter({4: 383, 1: 417, 0: 419, 2: 390, 5: 403, 3: 452})],
 [Counter({4: 392, 0: 398, 3: 388, 1: 385, 5: 373, 2: 366})])

### 2b. Rainbow Deep Q-Network (RDQN)

In [11]:
# Build two Rainbow DQN agents on the env2 environment:
#   rainbow  -> trained with the 'curiosity' intrinsic method
#   rainbow2 -> baseline created without an intrinsic method
# The tuple is evaluated left to right, so creation order matches the names.
rainbow, rainbow2 = (
    create_model('rainbow', env=env2, device=device, im_type='curiosity'),
    create_model('rainbow', env=env2, device=device),
)

In [12]:
# Show the environment configuration attached to the curiosity Rainbow agent
# to confirm it was created on env2 rather than the default environment.
rainbow.env_details

{'gym_name': 'ALE/Qbert-v5', 'name': 'Qbert', 'obs_space': Box(0, 255, (4, 84, 84), uint8), 'action_space': Discrete(6), 'input_shape': (4, 84, 84), 'n_actions': 6, 'img_size': 84, 'stack_size': 4, 'capture_video': False, 'record_every': 10000}

In [13]:
# Short demonstration run of the curiosity Rainbow agent: 5 episodes, progress
# printed after every episode. On the recorded run one checkpoint was written
# at episode 5.
rainbow.train(num_episodes=5, print_every=1, save_count=5)

Training agent on Qbert with 5 episodes.
Buffer size: 1k, batch size: 32, max timesteps: 1k, num network updates: 4, replay period: 100, intrinsic method: curiosity.
(1.0/5)  Episode Score: 50,  Train Loss: 7.41264,  Curiosity Loss: 3.53169,  Time taken: 5 secs.
(2.0/5)  Episode Score: 0,  Train Loss: 7.37692,  Curiosity Loss: 3.53100,  Time taken: 4 secs.
(3.0/5)  Episode Score: 100,  Train Loss: 7.26354,  Curiosity Loss: 3.53183,  Time taken: 5 secs.
(4.0/5)  Episode Score: 25,  Train Loss: 7.02121,  Curiosity Loss: 3.53367,  Time taken: 4 secs.
(5.0/5)  Episode Score: 100,  Train Loss: 6.48080,  Curiosity Loss: 3.53301,  Time taken: 4 secs.
Saved model at episode 5 as: 'rainbowcuriosity_batch32_buffer1k_Qbert_ep5.pt'.
Saved logger data to 'saved_models/rainbowcuriosity_Qbert_logger_data.tar.gz'. Total size: 709 bytes
Training complete. Access metrics from 'logger' attribute. 

In [14]:
# Train the baseline Rainbow agent (no intrinsic method) with the same demo
# settings as the curiosity agent so the two short runs can be compared.
rainbow2.train(num_episodes=5, print_every=1, save_count=5)

Training agent on Qbert with 5 episodes.
Buffer size: 1k, batch size: 32, max timesteps: 1k, num network updates: 4, replay period: 100, intrinsic method: None.
(1.0/5)  Episode Score: 200,  Train Loss: 3.87803,  Time taken: 4 secs.
(2.0/5)  Episode Score: 125,  Train Loss: 3.80456,  Time taken: 3 secs.
(3.0/5)  Episode Score: 0,  Train Loss: 3.58229,  Time taken: 3 secs.
(4.0/5)  Episode Score: 300,  Train Loss: 3.46488,  Time taken: 4 secs.
(5.0/5)  Episode Score: 125,  Train Loss: 3.14850,  Time taken: 3 secs.
Saved model at episode 5 as: 'rainbow_batch32_buffer1k_Qbert_ep5.pt'.
Saved logger data to 'saved_models/rainbow_Qbert_logger_data.tar.gz'. Total size: 670 bytes
Training complete. Access metrics from 'logger' attribute. 

In [15]:
# Display the logger to list which metric attributes were recorded during training.
rainbow.logger

Available attributes: '['avg_returns', 'actions', 'train_losses', 'ep_scores', 'intrinsic_losses']'

In [16]:
# Compare the action Counters recorded by the curiosity and baseline Rainbow loggers.
rainbow.logger.actions, rainbow2.logger.actions

([Counter({4: 4850, 3: 1187, 0: 48, 5: 951, 2: 1348})],
 [Counter({4: 2982, 2: 1706, 0: 386, 3: 1379, 5: 2489, 1: 18})])

### 2c. Proximal Policy Optimization (PPO)

In [17]:
# Build two PPO agents:
#   ppo  -> 'curiosity' intrinsic method on the env3 environment
#   ppo2 -> no intrinsic method, on the default environment (no `env` passed)
# NOTE(review): unlike the DQN and Rainbow pairs, these two agents run on
# different environments, so their metrics are not directly comparable —
# confirm this is intentional.
ppo, ppo2 = (
    create_model('ppo', env=env3, device=device, im_type='curiosity'),
    create_model('ppo', device=device),
)

In [18]:
# Show both PPO agents' environment configurations; they differ because only
# `ppo` was created with an explicit `env` argument.
ppo.env_details, ppo2.env_details

({'gym_name': 'ALE/MontezumaRevenge-v5', 'name': 'MontezumaRevenge', 'obs_space': Box(0, 255, (4, 84, 84), uint8), 'action_space': Discrete(18), 'input_shape': (4, 84, 84), 'n_actions': 18, 'img_size': 84, 'stack_size': 4, 'capture_video': False, 'record_every': 10000},
 {'gym_name': 'ALE/SpaceInvaders-v5', 'name': 'SpaceInvaders', 'obs_space': Box(0, 255, (4, 84, 84), uint8), 'action_space': Discrete(6), 'input_shape': (4, 84, 84), 'n_actions': 6, 'img_size': 84, 'stack_size': 4, 'capture_video': False, 'record_every': 10000})

In [19]:
# Number of PPO training iterations to run in this short demonstration.
PPO_DEMO_ITERATIONS = 5

# Size of one PPO training iteration: one rollout across all parallel environments.
episodes_per_iteration = ppo.params.rollout_size * ppo.params.num_envs

# Budget for a full-length PPO run, i.e. NUM_EPISODES training iterations.
PPO_NUM_EPISODES = episodes_per_iteration * NUM_EPISODES

# Budget for the demo run. Computed straight from the per-iteration size rather
# than dividing PPO_NUM_EPISODES by NUM_EPISODES again: that round trip went
# through float division and raised ZeroDivisionError when NUM_EPISODES was 0.
demo_episodes = int(episodes_per_iteration * PPO_DEMO_ITERATIONS)

In [20]:
# Short demonstration run of the curiosity PPO agent. demo_episodes is sized so
# that the trainer reports 5 training iterations; progress is printed after each.
ppo.train(num_episodes=demo_episodes, print_every=1, save_count=5)

Training agent on MontezumaRevenge with 200 episodes.
Surrogate clipping size: 0.1, rollout size: 10, num environments: 4, num network updates: 4, batch size: 40, training iterations: 5, intrinsic method: curiosity.
(1.0/5) Episode Score: 0.00,  Episodic Return: -0.05,  Approx KL: 0.004,  Total Loss: 8585.443,  Policy Loss: -0.004,  Value Loss: 5993.761,  Entropy Loss: 2.889,  Curiosity Loss: 5588.595,  Time taken: 0 secs.
(2.0/5) Episode Score: 0.00,  Episodic Return: 0.25,  Approx KL: -0.045,  Total Loss: 8498.282,  Policy Loss: -0.064,  Value Loss: 5902.224,  Entropy Loss: 2.849,  Curiosity Loss: 5547.263,  Time taken: 0 secs.
(3.0/5) Episode Score: 0.00,  Episodic Return: 2.04,  Approx KL: 1.503,  Total Loss: 8544.135,  Policy Loss: -0.198,  Value Loss: 5940.120,  Entropy Loss: 1.592,  Curiosity Loss: 5574.289,  Time taken: 0 secs.
(4.0/5) Episode Score: 0.00,  Episodic Return: 14.52,  Approx KL: 0.170,  Total Loss: 8492.956,  Policy Loss: 0.005,  Value Loss: 5812.961,  Entropy Los

In [21]:
# Train the PPO agent without an intrinsic method using the same demo budget.
# It runs on a different environment than `ppo` (see the env_details cell).
ppo2.train(num_episodes=demo_episodes, print_every=1, save_count=5)

Training agent on SpaceInvaders with 200 episodes.
Surrogate clipping size: 0.1, rollout size: 10, num environments: 4, num network updates: 4, batch size: 40, training iterations: 5, intrinsic method: None.
(1.0/5) Episode Score: 0.00,  Episodic Return: 0.07,  Approx KL: 0.008,  Total Loss: -0.034,  Policy Loss: -0.016,  Value Loss: 0.000,  Entropy Loss: 1.790,  Time taken: 0 secs.
(2.0/5) Episode Score: 0.00,  Episodic Return: 0.07,  Approx KL: -0.004,  Total Loss: -0.076,  Policy Loss: -0.058,  Value Loss: 0.000,  Entropy Loss: 1.773,  Time taken: 0 secs.
(3.0/5) Episode Score: 0.00,  Episodic Return: 0.08,  Approx KL: 2.068,  Total Loss: -0.307,  Policy Loss: -0.301,  Value Loss: 0.000,  Entropy Loss: 0.691,  Time taken: 0 secs.
(4.0/5) Episode Score: 0.00,  Episodic Return: -0.02,  Approx KL: -0.000,  Total Loss: 0.002,  Policy Loss: -0.000,  Value Loss: 0.003,  Entropy Loss: 0.000,  Time taken: 0 secs.
(5.0/5) Episode Score: 0.00,  Episodic Return: 0.06,  Approx KL: 0.000,  Total

In [22]:
# Display both PPO loggers to list which metric attributes were recorded.
ppo.logger, ppo2.logger

(Available attributes: '['actions', 'avg_returns', 'avg_rewards', 'policy_losses', 'value_losses', 'entropy_losses', 'total_losses', 'approx_kl', 'intrinsic_losses']',
 Available attributes: '['actions', 'avg_returns', 'avg_rewards', 'policy_losses', 'value_losses', 'entropy_losses', 'total_losses', 'approx_kl', 'intrinsic_losses']')

In [23]:
# Compare the average rewards logged by the two PPO agents
# (five entries each on the recorded run, matching the five training iterations).
ppo.logger.avg_rewards, ppo2.logger.avg_rewards

([0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0])

## 3. Load Model

In [24]:
# NOTE(review): both names are already imported in the first cell of this
# notebook; this repeat only matters if the section is run on its own kernel.
from utils.render import video_render
from utils.model_utils import load_model

In [25]:
# Uncomment exactly one line to load a saved checkpoint into `model`. The
# arguments appear to be (checkpoint name without '.pt', device string, model type).
# The DQN and Rainbow names match the checkpoints saved by the training cells above.
# NOTE(review): the PPO name encodes rollout100/agents8, whereas the PPO demo run
# above reported rollout size 10 and 4 environments — confirm this checkpoint exists.
# NOTE(review): 'cuda:0' is hard-coded here instead of reusing `device`.
# model = load_model('rainbowcuriosity_batch32_buffer1k_Qbert_ep5', 'cuda:0', 'rainbow')
# model = load_model('dqncuriosity_batch32_SpaInv_ep5', 'cuda:0', 'dqn')
# model = load_model('ppocuriosity_rollout100_agents8_MonRev_ep5', 'cuda:0', 'ppo')

In [26]:
# Uncomment to render video for the loaded agent. Requires `model` to have been
# assigned by uncommenting one of the load_model lines in the previous cell.
# NOTE(review): the meaning of the second argument (1) is not visible here —
# presumably the number of episodes to render; confirm against utils.render.
# video_render(model, 1)