# Use QR-DQN to Play Pong-v4

PyTorch version

In [1]:
%matplotlib inline

import copy
import logging
import itertools
import sys

import numpy as np
np.random.seed(0)
import pandas as pd
import gym
from gym.wrappers.atari_preprocessing import AtariPreprocessing
from gym.wrappers.frame_stack import FrameStack
import matplotlib.pyplot as plt
import torch
torch.manual_seed(0)
from torch import nn
from torch import optim

# Route log records of level DEBUG and above to stdout, prefixing each one
# with an HH:MM:SS timestamp and its level name.
logging.basicConfig(
        stream=sys.stdout,
        level=logging.DEBUG,
        datefmt='%H:%M:%S',
        format='%(asctime)s [%(levelname)s] %(message)s')

Environment

In [2]:
# Build the environment in three stages: the raw Pong game, the Atari
# preprocessing wrapper, and a wrapper that stacks the 4 most recent frames.
env = gym.make('PongNoFrameskip-v4')
env = AtariPreprocessing(env)
env = FrameStack(env, num_stack=4)

# Log every instance attribute of the wrapped environment, then of its spec.
for key, value in vars(env).items():
    logging.info('%s: %s', key, value)
for key, value in vars(env.spec).items():
    logging.info('%s: %s', key, value)

00:00:46 [INFO] env: <AtariPreprocessing<TimeLimit<AtariEnv<PongNoFrameskip-v4>>>>
00:00:46 [INFO] action_space: Discrete(6)
00:00:46 [INFO] observation_space: Box(0, 255, (4, 84, 84), uint8)
00:00:46 [INFO] reward_range: (-inf, inf)
00:00:46 [INFO] metadata: {'render.modes': ['human', 'rgb_array']}
00:00:46 [INFO] num_stack: 4
00:00:46 [INFO] lz4_compress: False
00:00:46 [INFO] frames: deque([], maxlen=4)
00:00:46 [INFO] id: PongNoFrameskip-v4
00:00:46 [INFO] entry_point: gym.envs.atari:AtariEnv
00:00:46 [INFO] reward_threshold: None
00:00:46 [INFO] nondeterministic: False
00:00:46 [INFO] max_episode_steps: 400000
00:00:46 [INFO] _kwargs: {'game': 'pong', 'obs_type': 'image', 'frameskip': 1}
00:00:46 [INFO] _env_name: PongNoFrameskip


Agent

In [3]:
class DQNReplayer:
    """Fixed-size circular experience buffer stored in a pandas DataFrame.

    Each row holds one transition with the columns ``state``, ``action``,
    ``reward``, ``next_state`` and ``done``.  Once ``capacity`` transitions
    have been stored, new ones overwrite the oldest rows.
    """

    def __init__(self, capacity):
        """Allocate an empty buffer with room for ``capacity`` transitions."""
        self.capacity = capacity
        self.count = 0  # number of rows that currently hold a transition
        self.i = 0  # row that the next call to store() will write
        self.memory = pd.DataFrame(
                columns=['state', 'action', 'reward', 'next_state', 'done'],
                index=range(capacity))

    def store(self, *args):
        """Write one transition (one value per column) into the next row."""
        slot = self.i
        self.memory.loc[slot] = args
        # Advance the write position, wrapping around at the end.
        self.i = (slot + 1) % self.capacity
        if self.count < self.capacity:
            self.count += 1

    def sample(self, size):
        """Draw ``size`` stored transitions uniformly, with replacement.

        Returns a generator that yields one stacked numpy array per column,
        in column order.
        """
        picked = np.random.choice(self.count, size=size)
        column_names = self.memory.columns
        return (np.stack(self.memory.loc[picked, name])
                for name in column_names)

In [4]:
class Agent:
    """QR-DQN agent: a DQN whose network predicts return quantiles.

    For every action the network outputs ``quantile_count`` quantile
    estimates of the return; the action value is the mean of those
    quantiles.  Training minimises the quantile Huber loss against a
    bootstrap target in which the next action is chosen by the online
    network (``evaluate_net``) and evaluated by the slowly updated
    ``target_net``.

    Args:
        env: environment; only ``env.action_space.n`` is read.
        gamma: discount factor.
        quantile_count: number of quantiles predicted per action.
        replayer_capacity: capacity of the experience replay buffer.
        batch_size: number of transitions sampled per learning step.
        learn_start: minimum number of buffered transitions before
            learning begins.
        learn_interval: one learning step is run every ``learn_interval``
            stored transitions.
    """

    def __init__(self, env, gamma=0.99, quantile_count=200,
            replayer_capacity=100000, batch_size=32, learn_start=1024,
            learn_interval=10):
        self.action_n = env.action_space.n
        self.gamma = gamma
        self.epsilon = 1.  # exploration rate, decayed in learn()

        # Defined here so that step() is usable even before reset().
        self.mode = None
        self.trajectory = []

        self.replayer = DQNReplayer(capacity=replayer_capacity)
        self.batch_size = batch_size
        self.learn_start = learn_start
        self.learn_interval = learn_interval
        # Total number of transitions ever stored.  The learning schedule
        # uses this counter rather than the buffer size, because the buffer
        # size stops changing once the buffer is full.
        self.store_count = 0

        self.quantile_count = quantile_count
        # Quantile midpoints (2i + 1) / (2N), shaped (1, N, 1) so that they
        # broadcast along the predicted-quantile axis of the loss.
        self.cumprob_tensor = ((torch.arange(quantile_count,
                dtype=torch.float) + 0.5) / quantile_count).view(1, -1, 1)

        # in_features=3136 equals 64 * 7 * 7, the flattened convolution
        # output for a 4 x 84 x 84 input.
        self.evaluate_net = nn.Sequential(
                nn.Conv2d(4, 32, kernel_size=8, stride=4), nn.ReLU(),
                nn.Conv2d(32, 64, kernel_size=4, stride=2), nn.ReLU(),
                nn.Conv2d(64, 64, kernel_size=3, stride=1), nn.ReLU(),
                nn.Flatten(),
                nn.Linear(in_features=3136, out_features=512), nn.ReLU(),
                nn.Linear(in_features=512,
                out_features=self.action_n * self.quantile_count))
        self.target_net = copy.deepcopy(self.evaluate_net)
        self.optimizer = optim.Adam(self.evaluate_net.parameters(), lr=0.0001)

        # Element-wise loss; the reduction is done manually in learn().
        self.loss = nn.SmoothL1Loss(reduction="none")

    def reset(self, mode=None):
        """Start a new episode; ``mode='train'`` enables exploration and
        learning and clears the per-episode trajectory."""
        self.mode = mode
        if mode == 'train':
            self.trajectory = []

    def _greedy_action(self, observation):
        """Return the action with the largest mean quantile value."""
        state_tensor = torch.as_tensor(np.asarray(observation),
                dtype=torch.float).unsqueeze(0)
        # Pure inference: no autograd graph is needed.
        with torch.no_grad():
            q_component_tensor = self.evaluate_net(state_tensor).view(-1,
                    self.action_n, self.quantile_count)
        q_tensor = q_component_tensor.mean(2)
        return q_tensor.argmax(dim=1).numpy()[0]

    def step(self, observation, reward, done):
        """Choose an action for ``observation``.

        ``reward`` and ``done`` describe the outcome of the previous
        action.  In train mode the action is epsilon-greedy, the previous
        transition is stored in the replay buffer, and a learning step is
        run every ``learn_interval`` stored transitions once the buffer
        holds at least ``learn_start`` of them.

        Returns:
            The chosen action index.
        """
        explore = (self.mode == 'train'
                and np.random.rand() < self.epsilon)
        if explore:
            # The network is not evaluated when the action is random.
            action = np.random.randint(0, self.action_n)
        else:
            action = self._greedy_action(observation)

        if self.mode == 'train':
            self.trajectory += [observation, reward, done, action]
            if len(self.trajectory) >= 8:
                # The last eight entries are two consecutive
                # (observation, reward, done, action) records.
                state, _, _, act, next_state, reward, done, _ = \
                        self.trajectory[-8:]
                self.replayer.store(state, act, reward, next_state, done)
                self.store_count += 1
                if (self.replayer.count >= self.learn_start
                        and self.store_count % self.learn_interval == 0):
                    self.learn()
        return action

    def close(self):
        """Nothing to release."""
        pass

    def update_net(self, target_net, evaluate_net, learning_rate=0.005):
        """Move ``target_net`` towards ``evaluate_net`` in place:
        ``target <- learning_rate * evaluate + (1 - learning_rate) * target``.
        """
        with torch.no_grad():
            for target_param, evaluate_param in zip(
                    target_net.parameters(), evaluate_net.parameters()):
                target_param.copy_(learning_rate * evaluate_param
                        + (1 - learning_rate) * target_param)

    def learn(self):
        """Run one optimisation step on a sampled batch, then softly update
        the target network and decay epsilon."""
        # replay
        states, actions, rewards, next_states, dones = \
                self.replayer.sample(self.batch_size)
        state_tensor = torch.as_tensor(states, dtype=torch.float)
        action_tensor = torch.as_tensor(actions, dtype=torch.long)
        reward_tensor = torch.as_tensor(rewards, dtype=torch.float)
        done_tensor = torch.as_tensor(dones, dtype=torch.float)
        next_state_tensor = torch.as_tensor(next_states, dtype=torch.float)
        batch_indices = torch.arange(state_tensor.shape[0])

        # compute target; it is a constant with respect to the loss, so no
        # graph is built and no gradient reaches target_net
        with torch.no_grad():
            next_q_tensor = self.evaluate_net(next_state_tensor).view(
                    -1, self.action_n, self.quantile_count).mean(2)
            next_action_tensor = next_q_tensor.argmax(dim=1)
            all_next_q_quantile_tensor = self.target_net(next_state_tensor
                    ).view(-1, self.action_n, self.quantile_count)
            next_q_quantile_tensor = all_next_q_quantile_tensor[
                    batch_indices, next_action_tensor]
            target_quantile_tensor = reward_tensor.view(-1, 1) \
                    + self.gamma * next_q_quantile_tensor \
                    * (1. - done_tensor).view(-1, 1)

        # quantiles predicted for the actions that were actually taken
        all_q_quantile_tensor = self.evaluate_net(state_tensor).view(-1,
                self.action_n, self.quantile_count)
        q_quantile_tensor = all_q_quantile_tensor[batch_indices,
                action_tensor]

        # Pair every predicted quantile (axis 1) with every target quantile
        # (axis 2): both tensors become (batch, N, N).
        q_quantile_tensor, target_quantile_tensor = torch.broadcast_tensors(
                q_quantile_tensor.unsqueeze(2),
                target_quantile_tensor.unsqueeze(1))
        huber_loss_tensor = self.loss(q_quantile_tensor,
                target_quantile_tensor)
        comparison_tensor = (target_quantile_tensor
                < q_quantile_tensor).float()
        # |tau - 1{target < prediction}| weights the two error signs
        # asymmetrically for each quantile level tau.
        quantile_regression_tensor = (self.cumprob_tensor
                - comparison_tensor).abs()
        quantile_huber_loss_tensor = (huber_loss_tensor
                * quantile_regression_tensor).sum(-1).mean(1)
        loss_tensor = quantile_huber_loss_tensor.mean()
        self.optimizer.zero_grad()
        loss_tensor.backward()
        self.optimizer.step()

        self.update_net(self.target_net, self.evaluate_net)

        self.epsilon = max(self.epsilon - 1e-5, 0.05)


# Build the agent; the environment is passed so the agent can read the
# size of its action space.
agent = Agent(env)

Train & Test

In [None]:
def play_episode(env, agent, max_episode_steps=None, mode=None, render=False):
    """Run one episode of ``agent`` acting in ``env``.

    The agent is reset with ``mode``, then asked for an action at every
    step, including once more on the terminal observation so that it sees
    the final reward and ``done`` flag.  When ``max_episode_steps`` is
    truthy, the episode is cut off after that many environment steps.

    Returns:
        (episode_reward, elapsed_steps): the summed reward and the number
        of environment steps taken.
    """
    current_obs = env.reset()
    last_reward, finished = 0., False
    agent.reset(mode=mode)

    total_reward = 0.
    step_count = 0
    # A falsy limit (None or 0) means the episode is never truncated.
    step_limit = max_episode_steps if max_episode_steps else float('inf')
    while True:
        chosen_action = agent.step(current_obs, last_reward, finished)
        if render:
            env.render()
        if finished:
            break
        current_obs, last_reward, finished, _ = env.step(chosen_action)
        total_reward += last_reward
        step_count += 1
        if step_count >= step_limit:
            break
    agent.close()
    return total_reward, step_count


# ---- Training ----
# Play episodes in 'train' mode with no fixed upper bound on their number
# (itertools.count() never ends); stop as soon as the mean reward over the
# most recent episodes (at most five) exceeds 16.
logging.info('==== train ====')
episode_rewards = []
for episode in itertools.count():
    episode_reward, elapsed_steps = play_episode(env, agent, mode='train')
    episode_rewards.append(episode_reward)
    logging.debug('train episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
    if np.mean(episode_rewards[-5:]) > 16.:
        break
# Plot the per-episode training rewards.
plt.plot(episode_rewards)


# ---- Testing ----
# Play 100 episodes without passing a mode (play_episode then resets the
# agent with mode=None) and report the mean and standard deviation of the
# episode rewards.
logging.info('==== test ====')
episode_rewards = []
for episode in range(100):
    episode_reward, elapsed_steps = play_episode(env, agent)
    episode_rewards.append(episode_reward)
    logging.debug('test episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
logging.info('average episode reward = %.2f ± %.2f',
        np.mean(episode_rewards), np.std(episode_rewards))

00:00:47 [INFO] ==== train ====
00:01:04 [DEBUG] train episode 0: reward = -19.00, steps = 1010
00:01:34 [DEBUG] train episode 1: reward = -21.00, steps = 852
00:02:04 [DEBUG] train episode 2: reward = -21.00, steps = 850
00:02:35 [DEBUG] train episode 3: reward = -21.00, steps = 898
00:03:11 [DEBUG] train episode 4: reward = -20.00, steps = 989
00:03:48 [DEBUG] train episode 5: reward = -20.00, steps = 1040
00:04:25 [DEBUG] train episode 6: reward = -20.00, steps = 1035
00:05:04 [DEBUG] train episode 7: reward = -19.00, steps = 1058
00:05:34 [DEBUG] train episode 8: reward = -21.00, steps = 785
00:06:03 [DEBUG] train episode 9: reward = -21.00, steps = 758
00:06:41 [DEBUG] train episode 10: reward = -20.00, steps = 990
00:07:13 [DEBUG] train episode 11: reward = -20.00, steps = 837
00:07:43 [DEBUG] train episode 12: reward = -21.00, steps = 790
00:08:18 [DEBUG] train episode 13: reward = -19.00, steps = 964
00:08:54 [DEBUG] train episode 14: reward = -19.00, steps = 966
00:09:30 [DEBU