# Reinforcement Learning with Atari Games

## 1. Initial Setup

In [1]:
from core.create import create_model, get_utility_params
from utils.helper import set_device
from utils.model_utils import load_model
from utils.render import video_render

In [2]:
# Load the utility settings (read from the project's yaml file)
util_params = get_utility_params()

# Expose the training schedule as notebook-level constants
NUM_EPISODES, SAVE_EVERY = (
    util_params[key] for key in ('num_episodes', 'save_every')
)
print(f'NUM_EPISODES={NUM_EPISODES}, SAVE_EVERY={SAVE_EVERY}')

NUM_EPISODES=50000, SAVE_EVERY=10000


In [3]:
# Select the compute device used by every model below; the helper prints
# which device it picked (presumably falls back to CPU without CUDA — TODO confirm)
device = set_device()

CUDA available. Device set to GPU -> 'cuda:0'


In [4]:
# Environment ids from the utility settings; env2 is used for the Rainbow
# agent and env3 for the PPO agent further down
env2, env3 = util_params['env_2'], util_params['env_3']
print(f'env2={env2}, env3={env3}')

env2=ALE/Qbert-v5, env3=ALE/MontezumaRevenge-v5


## 2. Model Creation and Training

### 2a. Deep Q-Network (DQN)

In [5]:
# Two DQN agents with no explicit `env` argument: `dqn` uses the curiosity
# intrinsic-motivation method, `dqn2` is the plain baseline for comparison
dqn = create_model('dqn', im_type='curiosity', device=device)
dqn2 = create_model('dqn', device=device)

  logger.warn(


In [6]:
# Inspect the environment configuration the DQN agent was built with
dqn.env_details

{'gym_name': 'ALE/SpaceInvaders-v5', 'name': 'SpaceInvaders', 'obs_space': Box(0, 255, (4, 84, 84), uint8), 'action_space': Discrete(6), 'input_shape': (4, 84, 84), 'n_actions': 6, 'img_size': 84, 'stack_size': 4, 'capture_video': False, 'record_every': 10000}

In [7]:
# Short demonstration run of the curiosity-driven DQN (not a full training schedule)
dqn.train(
    num_episodes=5,
    print_every=1,
    save_count=5,
)

Training agent on SpaceInvaders with 5 episodes.
Buffer size: 1k, batch size: 32, max timesteps: 1k, num network updates: 4, intrinsic method: curiosity
(1.0/5) Episode Score: 120, Train Loss: 2607.47656
(2.0/5) Episode Score: 65, Train Loss: 313.47992
(3.0/5) Episode Score: 80, Train Loss: 222.82666
(4.0/5) Episode Score: 155, Train Loss: 78.88565
(5.0/5) Episode Score: 155, Train Loss: 42.40712
Saved model at episode 5 as: 'dqncuriosity_batch32_SpaInv_ep5.pt'.
Saved logger data to 'saved_models/dqncuriosity_SpaInv_logger_data.tar.gz'. Total size: 800 bytes
Training complete. Access metrics from 'logger' attribute. 

In [8]:
# Same short demonstration run for the baseline DQN (no intrinsic method)
dqn2.train(
    num_episodes=5,
    print_every=1,
    save_count=5,
)

Training agent on SpaceInvaders with 5 episodes.
Buffer size: 1k, batch size: 32, max timesteps: 1k, num network updates: 4, intrinsic method: None
(1.0/5) Episode Score: 90, Train Loss: 311.83264
(2.0/5) Episode Score: 245, Train Loss: 5.62023
(3.0/5) Episode Score: 290, Train Loss: 27.85412
(4.0/5) Episode Score: 70, Train Loss: 6.93439
(5.0/5) Episode Score: 210, Train Loss: 144.98006
Saved model at episode 5 as: 'dqn_batch32_SpaInv_ep5.pt'.
Saved logger data to 'saved_models/dqn_SpaInv_logger_data.tar.gz'. Total size: 785 bytes
Training complete. Access metrics from 'logger' attribute. 

In [9]:
# Show which metric series the logger collected during training
dqn.logger

Available attributes: '['actions', 'train_losses', 'ep_scores']'

In [10]:
# Action-selection counts recorded during training: a list of Counter objects
# (keys look like action indices — TODO confirm)
dqn.logger.actions

[Counter({4: 386, 1: 421, 0: 420, 2: 392, 5: 404, 3: 453})]

### 2b. Rainbow Deep Q-Network (RDQN)

In [11]:
# Rainbow DQN agent on the second configured environment (env2)
rainbow = create_model('rainbow', device=device, env=env2)

In [12]:
# Inspect the environment configuration the Rainbow agent was built with
rainbow.env_details

{'gym_name': 'ALE/Qbert-v5', 'name': 'Qbert', 'obs_space': Box(0, 255, (4, 84, 84), uint8), 'action_space': Discrete(6), 'input_shape': (4, 84, 84), 'n_actions': 6, 'img_size': 84, 'stack_size': 4, 'capture_video': False, 'record_every': 10000}

In [13]:
# Short demonstration run of the Rainbow agent (not a full training schedule)
rainbow.train(
    num_episodes=4,
    print_every=1,
    save_count=2,
)

Training agent on Qbert with 4 episodes.
Buffer size: 1k, batch size: 32, max timesteps: 1k, num network updates: 4, replay period: 100.
(1.0/4)  Episode Score: 0,   Train Loss: 3.56554,  Time taken: 3.86 secs.
(2.0/4)  Episode Score: 225,   Train Loss: 3.73456,  Time taken: 5.76 secs.
Saved model at episode 2 as: 'rainbow_batch32_buffer1000_Qbert_ep2.pt'.
Saved logger data to 'saved_models/rainbow_Qbert_logger_data.tar.gz'. Total size: 602 bytes
(3.0/4)  Episode Score: 50,   Train Loss: 3.21229,  Time taken: 4.72 secs.
(4.0/4)  Episode Score: 150,   Train Loss: 3.10329,  Time taken: 5.28 secs.
Saved model at episode 4 as: 'rainbow_batch32_buffer1000_Qbert_ep4.pt'.
Saved logger data to 'saved_models/rainbow_Qbert_logger_data.tar.gz'. Total size: 657 bytes
Training complete. Access metrics from 'logger' attribute. 

In [14]:
# Show which metric series the logger collected during training
rainbow.logger

Available attributes: '['avg_returns', 'actions', 'train_losses', 'ep_scores']'

In [15]:
# Action-selection counts recorded during training: a list of Counter objects
# (keys look like action indices — TODO confirm)
rainbow.logger.actions

[Counter({4: 2793, 0: 298, 2: 1884, 5: 969, 3: 1108, 1: 52})]

### 2c. Proximal Policy Optimization (PPO)

In [16]:
# PPO agent on the third configured environment (env3)
ppo = create_model('ppo', device=device, env=env3)

In [17]:
# Inspect the environment configuration the PPO agent was built with
ppo.env_details

{'gym_name': 'ALE/MontezumaRevenge-v5', 'name': 'MontezumaRevenge', 'obs_space': Box(0, 255, (4, 84, 84), uint8), 'action_space': Discrete(18), 'input_shape': (4, 84, 84), 'n_actions': 18, 'img_size': 84, 'stack_size': 4, 'capture_video': False, 'record_every': 10000}

In [19]:
# PPO's `num_episodes` is scaled by rollout_size * num_envs: the training log
# reports one training iteration per (rollout_size * num_envs) units
# (NOTE(review): inferred from the printed "training iterations" — confirm in ppo.train)
STEPS_PER_ITERATION = ppo.params.rollout_size * ppo.params.num_envs

# Number of training iterations to run in this demonstration
DEMO_ITERATIONS = 4

# Equivalent of the full NUM_EPISODES schedule, expressed in PPO's units
PPO_NUM_EPISODES = STEPS_PER_ITERATION * NUM_EPISODES

# Computed directly from the per-iteration size instead of multiplying by
# NUM_EPISODES and dividing it back out, which needed a float round-trip and
# raised ZeroDivisionError when NUM_EPISODES was 0
demo_episodes = int(STEPS_PER_ITERATION * DEMO_ITERATIONS)

In [20]:
# Short demonstration run of the PPO agent: 4 training iterations
ppo.train(
    num_episodes=demo_episodes,
    print_every=1,
    save_count=2,
)

Training agent on MontezumaRevenge with 3K episodes.
Surrogate clipping size: 0.1, rollout size: 100, num agents: 8, num network updates: 4, batch size: 800, training iterations: 4.
(1.0/4) Episodic Return: -0.03268,  Approx KL: 0.00036,  Total Loss: -0.02980,  Policy Loss: -0.00095,  Value Loss: 0.00009,  Entropy Loss: 2.88944,  Time taken: 1.15 secs.
(2.0/4) Episodic Return: -0.01729,  Approx KL: -0.00013,  Total Loss: -0.02926,  Policy Loss: -0.00038,  Value Loss: 0.00003,  Entropy Loss: 2.88919,  Time taken: 1.03 secs.
Saved model at episode 2 as: 'ppo_rollout100_agents8_MonRev_ep2.pt'.
Saved logger data to 'saved_models/ppo_MonRev_logger_data.tar.gz'. Total size: 767 bytes
(3.0/4) Episodic Return: -0.00839,  Approx KL: -0.00035,  Total Loss: -0.02974,  Policy Loss: -0.00085,  Value Loss: 0.00001,  Entropy Loss: 2.88890,  Time taken: 1.16 secs.
(4.0/4) Episodic Return: -0.00459,  Approx KL: 0.00026,  Total Loss: -0.02973,  Policy Loss: -0.00085,  Value Loss: 0.00001,  Entropy Loss:

In [21]:
# Show which metric series the logger collected during training
ppo.logger

Available attributes: '['actions', 'avg_rewards', 'avg_returns', 'policy_losses', 'value_losses', 'entropy_losses', 'total_losses', 'approx_kl']'

In [22]:
# Action-selection counts recorded during training: a list of Counter objects
# (keys look like action indices — TODO confirm)
ppo.logger.actions

[Counter({6: 712,
          8: 792,
          2: 740,
          7: 740,
          9: 688,
          0: 664,
          1: 712,
          14: 600,
          4: 812,
          3: 796,
          5: 836,
          11: 676,
          10: 620,
          16: 700,
          15: 748,
          13: 612,
          17: 724,
          12: 628})]

## 3. Load Model

In [None]:
# Template cell: uncomment and replace '' with the saved model to load
# (presumably one of the '.pt' filenames printed by the training cells — TODO confirm)
# model = load_model('', device, 'dqn')

In [None]:
# Template cell: uncomment once `model` has been defined by a load_model call
# video_render(model)