# Reinforcement Learning with Atari Games

## 1. Initial Setup

In [1]:
import os
from dotenv import load_dotenv

from core.create import create_model, set_save_every

import torch

In [2]:
load_dotenv()  # Load variables from the local .env file into the process environment


def _require_int_env(name: str) -> int:
    """Return the environment variable `name` parsed as an integer.

    Raises:
        RuntimeError: if the variable is not set (e.g. missing from .env).
        ValueError: if the value is set but is not a valid integer literal.
    """
    raw = os.getenv(name)
    if raw is None:
        # Fail fast with the variable name instead of the opaque
        # "int() argument must be ... not 'NoneType'" TypeError.
        raise RuntimeError(f"Required environment variable '{name}' is not set; add it to .env")
    return int(raw)


NUM_EPISODES = _require_int_env('NUM_EPISODES')
# PPO's episode count is scaled by ROLLOUT_SIZE * NUM_AGENTS per NUM_EPISODES unit.
# NOTE(review): presumably one PPO training iteration consumes ROLLOUT_SIZE * NUM_AGENTS
# steps — confirm against the PPO implementation in `core`.
PPO_NUM_EPISODES = _require_int_env('ROLLOUT_SIZE') * _require_int_env('NUM_AGENTS') * NUM_EPISODES
# Save interval as returned by the project helper `set_save_every` for an input of 1000.
SAVE_EVERY = set_save_every(1000)
print(f'NUM_EPISODES={NUM_EPISODES}, SAVE_EVERY={SAVE_EVERY}')

NUM_EPISODES=10000, SAVE_EVERY=1000


In [3]:
# Build an agent through the project's `create_model` factory, selected by the 'rainbow' key.
rainbow = create_model('rainbow')

  logger.warn(


CUDA available. Device set to GPU.


In [4]:
# Short smoke-test run of 4 episodes. With print_every=3 the saved output shows
# progress lines for episodes 1, 3 and 4 rather than for every episode.
rainbow.train(num_episodes=4, print_every=3)

Training agent on SpaceInvaders with 4 episodes.
Buffer size: 1k, batch size: 32, max timesteps: 1k, num network updates: 4, replay period: 100.
(1/4)  Episode Score: 120,   Train Loss: 3.92731,  Time taken: 7.86 secs.
(3/4)  Episode Score: 140,   Train Loss: 3.30087,  Time taken: 19.36 secs.
(4/4)  Episode Score: 155,   Train Loss: 3.18888,  Time taken: 11.25 secs.
Training complete. Access metrics from 'logger' attribute. 

In [5]:
# Bare expression so the notebook displays the environment settings attached to the agent.
rainbow.env_details

{'gym_name': 'ALE/SpaceInvaders-v5', 'name': 'SpaceInvaders', 'obs_space': Box(0, 255, (4, 128, 128), uint8), 'action_space': Discrete(6), 'input_shape': (4, 128, 128), 'n_actions': 6, 'img_size': 128, 'stack_size': 4, 'capture_video': False, 'record_every': 1000}

## 2. Model Creation and Training

### 2a. Rainbow Deep Q-Network (RDQN)

In [6]:
# Create the Rainbow DQN instance used in this section.
# NOTE(review): this is a second `create_model('rainbow')` call; the `rainbow` agent
# created earlier remains bound in the kernel — confirm both instances are needed.
dqn = create_model('rainbow')

CUDA available. Device set to GPU.


In [7]:
# Display the environment settings attached to the DQN agent.
dqn.env_details

{'gym_name': 'ALE/SpaceInvaders-v5', 'name': 'SpaceInvaders', 'obs_space': Box(0, 255, (4, 128, 128), uint8), 'action_space': Discrete(6), 'input_shape': (4, 128, 128), 'n_actions': 6, 'img_size': 128, 'stack_size': 4, 'capture_video': False, 'record_every': 1000}

In [8]:
# Train the model for 4 episodes; with print_every=1 the saved output shows
# a progress line for each episode.
dqn.train(num_episodes=4, print_every=1)

Training agent on SpaceInvaders with 4 episodes.
Buffer size: 1k, batch size: 32, max timesteps: 1k, num network updates: 4, replay period: 100.
(1/4)  Episode Score: 120,   Train Loss: 3.92731,  Time taken: 5.69 secs.
(2/4)  Episode Score: 120,   Train Loss: 3.59958,  Time taken: 9.68 secs.
(3/4)  Episode Score: 110,   Train Loss: 3.30087,  Time taken: 9.67 secs.
(4/4)  Episode Score: 100,   Train Loss: 3.18888,  Time taken: 6.05 secs.
Training complete. Access metrics from 'logger' attribute. 

### 2b. Proximal Policy Optimization (PPO)

In [9]:
# Create the PPO instance through the same `create_model` factory, selected by the 'ppo' key.
ppo = create_model('ppo')

CUDA available. Device set to GPU.


In [10]:
# Display the environment settings attached to the PPO agent.
ppo.env_details

{'gym_name': 'ALE/SpaceInvaders-v5', 'name': 'SpaceInvaders', 'obs_space': Box(0, 255, (4, 128, 128), uint8), 'action_space': Discrete(6), 'input_shape': (4, 128, 128), 'n_actions': 6, 'img_size': 128, 'stack_size': 4, 'capture_video': False, 'record_every': 1000}

In [12]:
# Release unoccupied cached GPU memory held by PyTorch's caching allocator before PPO training.
# NOTE(review): memory still referenced by live tensors is not freed by this call; the
# `rainbow` and `dqn` agents are still bound and presumably keep their GPU allocations — confirm.
torch.cuda.empty_cache()

In [13]:
# Number of PPO training iterations to run in this demo.
DEMO_TRAINING_ITERATIONS = 4

# PPO_NUM_EPISODES is defined as ROLLOUT_SIZE * NUM_AGENTS * NUM_EPISODES, so the integer
# quotient recovers ROLLOUT_SIZE * NUM_AGENTS exactly, with no float round-trip through int().
# Scaling by the iteration count gives the `num_episodes` value for the demo run.
demo_episodes = (PPO_NUM_EPISODES // NUM_EPISODES) * DEMO_TRAINING_ITERATIONS

In [14]:
# Run the PPO demo; the saved output reports this `num_episodes` value as 4 training iterations.
# NOTE(review): in the saved output Approx KL jumps to 2.44697 at iteration 3 and the
# entropy loss falls to ~0 by iteration 4 — looks like the policy collapsed; worth investigating.
ppo.train(num_episodes=demo_episodes, print_every=1)  # 4 training iterations

Training agent on SpaceInvaders with 3K episodes.
Surrogate clipping size: 0.1, rollout size: 100, num agents: 8, num network updates: 4, batch size: 800, training iterations: 4.
(1/4) Episodic Return: 0.65218,  Approx KL: -0.00013,  Total Loss: 0.16497,  Policy Loss: -0.00235,  Value Loss: 0.37043,  Entropy Loss: 1.78998,  Time taken: 1.69 secs.
(2/4) Episodic Return: 0.71180,  Approx KL: -0.00021,  Total Loss: 0.10673,  Policy Loss: -0.01412,  Value Loss: 0.27722,  Entropy Loss: 1.77619,  Time taken: 1.49 secs.
(3/4) Episodic Return: 0.56792,  Approx KL: 2.44697,  Total Loss: -0.29434,  Policy Loss: -0.36546,  Value Loss: 0.15562,  Entropy Loss: 0.66896,  Time taken: 1.53 secs.
(4/4) Episodic Return: 0.13229,  Approx KL: -0.00001,  Total Loss: 0.00239,  Policy Loss: 0.00000,  Value Loss: 0.00477,  Entropy Loss: 0.00001,  Time taken: 1.50 secs.
Training complete. Access metrics from 'logger' attribute. 