# Reinforcement Learning with Atari Games

In [1]:
import os
import random
import numpy as np

import torch
from dotenv import load_dotenv

from agents.ppo import PPO
from core.parameters import (
    EnvParameters,
    DQNModelParameters,
    DQNParameters,
    PPOModelParameters,
    PPOParameters,
)
from core.env_details import EnvDetails
from agents.dqn import DQN
from models.actor_critic import Actor, Critic
from models.cnn import CNNModel

import torch.optim as optim
import torch.nn as nn

## 1. Initial Setup

In [2]:
 # Create access to .env file (hyperparameters)
load_dotenv()

# Every setting arrives from the environment as a string and is cast here.
# A missing variable makes os.getenv return None, so the int()/float() cast
# on that line raises.
SEED = int(os.getenv('SEED'))
LEARNING_RATE = float(os.getenv('LEARNING_RATE'))
EPSILON = float(os.getenv('EPSILON'))
NUM_EPISODES = int(os.getenv('NUM_EPISODES'))
SAVE_EVERY = int(os.getenv('SAVE_EVERY'))

# Only the exact string 'True' enables video capture; any other value
# (or an unset variable) yields False.
CAPTURE_VIDEO = os.getenv('CAPTURE_VIDEO') == 'True'

In [3]:
# Seeding: apply the same SEED to Python's RNG, NumPy's global RNG and PyTorch,
# in that order. Return values are discarded, so the cell shows no output.
for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    seed_rng(SEED)

In [4]:
# Environment settings: the game id and frame sizes come from the environment
# variables; the video options reuse the constants defined in the setup cell.
env_params = EnvParameters(
    env_name=os.getenv('ENV_1'),
    img_size=int(os.getenv('IMG_SIZE')),
    stack_size=int(os.getenv('STACK_SIZE')),
    capture_video=CAPTURE_VIDEO,
    record_every=SAVE_EVERY,  # recording interval is taken from SAVE_EVERY
)

In [5]:
# Set environment class: build the EnvDetails object from env_params. Its
# input_shape and n_actions attributes are read by the model cells below.
env_details = EnvDetails(env_params)

In [6]:
# Display the environment summary as this cell's output
env_details

{'gym_name': 'ALE/SpaceInvaders-v5', 'name': 'SpaceInvaders', 'env': <FrameStack<GrayScaleObservation<ResizeObservation<RecordEpisodeStatistics<TimeLimit<OrderEnforcing<AtariEnv<ALE/SpaceInvaders-v5>>>>>>>>, 'obs_space': Box(0, 255, (4, 128, 128), uint8), 'action_space': Discrete(6), 'input_shape': (4, 128, 128), 'n_actions': 6, 'img_size': 128, 'stack_size': 4, 'capture_video': False, 'record_every': 1000}

## 2. Model Creation

### 2a. Deep Q-Network (DQN)

In [7]:
# Set DQN hyperparameters
# The CNN is sized from the environment's observation shape and action count.
network = CNNModel(
    input_shape=env_details.input_shape,
    n_actions=env_details.n_actions,
)

# Model bundle: the network, the Adam optimizer over its parameters, and MSE loss.
# EPSILON is passed as Adam's `eps` argument; it is separate from the
# eps_start / eps_end / eps_decay values given to DQNParameters below.
dqn_model_params = DQNModelParameters(
    network=network,
    optimizer=optim.Adam(
        network.parameters(),
        lr=LEARNING_RATE,
        eps=EPSILON,
    ),
    loss_metric=nn.MSELoss(),
)

# Agent settings, all read from the environment.
dqn_params = DQNParameters(
    gamma=float(os.getenv('GAMMA')),
    tau=float(os.getenv('TAU')),
    # int(float(...)) also accepts values written in float/scientific form, e.g. '1e5'
    buffer_size=int(float(os.getenv('BUFFER_SIZE'))),
    batch_size=int(os.getenv('BATCH_SIZE')),
    update_steps=int(os.getenv('UPDATE_STEPS')),
    eps_start=float(os.getenv('EPS_START')),
    eps_end=float(os.getenv('EPS_END')),
    eps_decay=float(os.getenv('EPS_DECAY')),
    max_timesteps=int(os.getenv('DQN_MAX_TIMESTEPS')),
)

In [8]:
# Create DQN instance from the environment details, the model bundle and the
# agent hyperparameters; SEED is passed as the fourth positional argument.
dqn = DQN(env_details, dqn_model_params, dqn_params, SEED)

CUDA available. Device set to GPU.


In [9]:
# Train model: a short 3-episode run that reports after every episode.
# NOTE(review): NUM_EPISODES is loaded in the setup cell but not used here;
# the episode count is hard-coded — confirm this is intentional.
dqn.train(num_episodes=3, print_every=1)

Training agent on SpaceInvaders with 3 episodes.
Buffer size: 100k, batch size: 32, max timesteps: 1k, network updates: 4.
(1/3) Episode Score: 30, Train Loss: 0.00463
(2/3) Episode Score: 80, Train Loss: 0.00396
(3/3) Episode Score: 230, Train Loss: 0.00292
Training complete. Access metrics from 'logger' attribute.


### 2b. Proximal Policy Optimization (PPO)

In [10]:
# Set PPO hyperparameters
# Actor and critic are separate networks, both sized from the environment's
# observation shape and action count.
actor = Actor(input_shape=env_details.input_shape, n_actions=env_details.n_actions)
critic = Critic(input_shape=env_details.input_shape, n_actions=env_details.n_actions)

# Model bundle: each optimizer is built over the parameters of the network it
# is paired with. The actor optimizer must receive actor.parameters() (not the
# DQN `network` from the earlier cell), otherwise stepping it would never
# update the actor's weights.
ppo_model_params = PPOModelParameters(
    actor=actor,
    critic=critic,
    actor_optimizer=optim.Adam(actor.parameters(), lr=LEARNING_RATE, eps=EPSILON),
    critic_optimizer=optim.Adam(critic.parameters(), lr=LEARNING_RATE, eps=EPSILON),
    loss_metric=nn.MSELoss()
)

# Agent settings, all read from the environment. GAMMA and UPDATE_STEPS are the
# same variables the DQN cell reads.
ppo_params = PPOParameters(
    gamma=float(os.getenv('GAMMA')),
    update_steps=int(os.getenv('UPDATE_STEPS')),
    clip_grad=float(os.getenv('CLIP_GRAD')),
    rollout_size=int(os.getenv('ROLLOUT_SIZE')),
    max_timesteps=int(os.getenv('PPO_MAX_TIMESTEPS')),
    num_agents=int(os.getenv('NUM_AGENTS'))
)

In [11]:
# Create PPO instance from the environment details, the actor/critic model
# bundle and the agent hyperparameters; SEED is the fourth positional argument.
ppo = PPO(env_details, ppo_model_params, ppo_params, SEED)

CUDA available. Device set to GPU.


In [12]:
# Release unoccupied cached GPU memory held by PyTorch's caching allocator
torch.cuda.empty_cache()

In [13]:
# PPO training call, currently disabled (left commented out).
# NOTE(review): the reason is not recorded in this notebook — confirm before re-enabling.
# ppo.train(num_episodes=3, print_every=1)