# Reinforcement Learning with Atari Games

In [1]:
import os
import random
import numpy as np

import torch
from dotenv import load_dotenv

from agents.ppo import PPO
from core.parameters import (
    EnvParameters,
    DQNParameters,
    PPOParameters,
    ModelParameters
)
from core.env_details import EnvDetails
from agents.dqn import DQN
from models.actor_critic import ActorCritic
from models.cnn import CNNModel

import torch.optim as optim
import torch.nn as nn

## 1. Initial Setup

In [2]:
# Load the hyperparameters defined in the .env file into the process environment
load_dotenv()

# Every setting arrives as a string from the environment; parse them grouped by target type.
SEED, NUM_EPISODES, SAVE_EVERY = (
    int(os.getenv(name)) for name in ('SEED', 'NUM_EPISODES', 'SAVE_EVERY')
)
LEARNING_RATE, EPSILON = (
    float(os.getenv(name)) for name in ('LEARNING_RATE', 'EPSILON')
)

# Video capture is enabled only when the variable is exactly the string 'True'.
CAPTURE_VIDEO = os.getenv('CAPTURE_VIDEO') == 'True'

In [3]:
# Seed the PyTorch, NumPy and Python RNGs with the shared SEED.
# The three calls are independent; finishing on random.seed (which returns None)
# keeps the cell from displaying a return value.
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

In [4]:
# Environment settings: the game id comes from ENV_1, the recording options reuse the
# constants parsed earlier, and the integer fields are read from the environment
# variables named after them in upper case (IMG_SIZE, STACK_SIZE).
env_params = EnvParameters(
    env_name=os.getenv('ENV_1'),
    capture_video=CAPTURE_VIDEO,
    record_every=SAVE_EVERY,
    **{field: int(os.getenv(field.upper())) for field in ('img_size', 'stack_size')},
)

In [5]:
# Build the environment description from the parameters configured in env_params
env_details = EnvDetails(env_params)

  logger.warn(


In [6]:
# Display the resolved environment details as the cell output
env_details

{'gym_name': 'ALE/SpaceInvaders-v5', 'name': 'SpaceInvaders', 'obs_space': Box(0, 255, (4, 128, 128), uint8), 'action_space': Discrete(6), 'input_shape': (4, 128, 128), 'n_actions': 6, 'img_size': 128, 'stack_size': 4, 'capture_video': False, 'record_every': 1000}

## 2. Model Creation

### 2a. Deep Q-Network (DQN)

In [7]:
# Q-network: a CNN sized from the environment's input shape and number of actions
network = CNNModel(
    input_shape=env_details.input_shape,
    n_actions=env_details.n_actions,
)

# Bundle the network with its Adam optimizer and an MSE loss
dqn_model_params = ModelParameters(
    network=network,
    loss_metric=nn.MSELoss(),
    optimizer=optim.Adam(network.parameters(), lr=LEARNING_RATE, eps=EPSILON),
)

# DQN hyperparameters. Each field is read from the environment variable named after it
# in upper case (gamma -> GAMMA, eps_start -> EPS_START, ...), grouped by parsed type.
dqn_params = DQNParameters(
    **{
        field: float(os.getenv(field.upper()))
        for field in ('gamma', 'tau', 'eps_start', 'eps_end', 'eps_decay')
    },
    **{
        field: int(os.getenv(field.upper()))
        for field in ('batch_size', 'update_steps', 'max_timesteps')
    },
    # Parsed via float first, presumably so values written like 1e5 are accepted
    buffer_size=int(float(os.getenv('BUFFER_SIZE'))),
)

In [8]:
# Create the DQN agent from the environment details, the model bundle and the DQN
# hyperparameters; SEED is passed as the final positional argument
dqn = DQN(env_details, dqn_model_params, dqn_params, SEED)

CUDA available. Device set to GPU.


In [9]:
# Train model (disabled: uncomment the call below to train the DQN agent)
# dqn.train(num_episodes=3, print_every=1)

### 2b. Proximal Policy Optimization (PPO)

In [10]:
# Actor-critic network sized from the environment's input shape and number of actions
actor_critic = ActorCritic(
    input_shape=env_details.input_shape,
    n_actions=env_details.n_actions,
)

# Bundle the network with its Adam optimizer and an MSE loss
ppo_model_params = ModelParameters(
    network=actor_critic,
    loss_metric=nn.MSELoss(),
    optimizer=optim.Adam(actor_critic.parameters(), lr=LEARNING_RATE, eps=EPSILON),
)

# PPO hyperparameters. Each field is read from the environment variable named after it
# in upper case (gamma -> GAMMA, clip_grad -> CLIP_GRAD, ...), grouped by parsed type.
ppo_params = PPOParameters(
    **{
        field: float(os.getenv(field.upper()))
        for field in (
            'gamma',
            'clip_grad',
            'entropy_coef',
            'value_loss_coef',
            'max_grad_norm',
        )
    },
    **{
        field: int(os.getenv(field.upper()))
        for field in ('update_steps', 'rollout_size', 'num_agents', 'num_mini_batches')
    },
)

In [11]:
# Create the PPO agent from the environment details, the model bundle and the PPO
# hyperparameters; SEED is passed as the final positional argument
ppo = PPO(env_details, ppo_model_params, ppo_params, SEED)

CUDA available. Device set to GPU.


In [12]:
# Release cached GPU memory that PyTorch is holding but not using before training starts
torch.cuda.empty_cache()

In [13]:
# Run PPO training with print_every=1.
# NOTE(review): NUM_EPISODES is read from .env earlier in this notebook, but this call
# hard-codes 240 — confirm which value is intended.
ppo.train(num_episodes=240, print_every=1)

Training agent on SpaceInvaders with 240 episodes.
Gradient clip size: 0.1, rollout size: 10, num_agents: 8, network updates: 4, batch size: 80, training iterations: 3.
(1/3) Episodic Return: 0.01499,  Approx KL: 0.00828,  Total Loss: -0.03070,  Policy Loss: -0.01284,  Value Loss: 0.00001,  Entropy Loss: 1.78657
(2/3) Episodic Return: 0.02089,  Approx KL: -0.01420,  Total Loss: -0.06547,  Policy Loss: -0.04787,  Value Loss: 0.00006,  Entropy Loss: 1.76285
(3/3) Episodic Return: 0.02174,  Approx KL: 1.68957,  Total Loss: -0.32796,  Policy Loss: -0.31982,  Value Loss: 0.00031,  Entropy Loss: 0.82958
Training complete. Access metrics from 'logger' attribute.
