# Use Double DQN to Play MountainCar-v0

PyTorch version

In [1]:
%matplotlib inline

import sys
import logging
import imp
import itertools
import copy

import numpy as np
np.random.seed(0)
import pandas as pd
import gym
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
torch.manual_seed(0)

# Reload the logging module before configuring it (presumably so that
# basicConfig takes effect even when the notebook kernel has already
# configured the root logger -- TODO confirm).
# NOTE(review): the `imp` module is deprecated and was removed in
# Python 3.12; `importlib.reload` is the documented replacement.
imp.reload(logging)
# Emit DEBUG-and-above records to stdout, prefixed with an HH:MM:SS timestamp
# and the level name.
logging.basicConfig(level=logging.DEBUG,
        format='%(asctime)s [%(levelname)s] %(message)s',
        stream=sys.stdout, datefmt='%H:%M:%S')

In [2]:
# Create the MountainCar environment, seed it, and log every instance
# attribute so the spaces and step limit are visible in the notebook output.
env = gym.make('MountainCar-v0')
env.seed(0)
for key, value in vars(env).items():
    logging.info('%s: %s', key, value)

20:28:17 [INFO] env: <MountainCarEnv<MountainCar-v0>>
20:28:17 [INFO] action_space: Discrete(3)
20:28:17 [INFO] observation_space: Box(-1.2000000476837158, 0.6000000238418579, (2,), float32)
20:28:17 [INFO] reward_range: (-inf, inf)
20:28:17 [INFO] metadata: {'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 30}
20:28:17 [INFO] _max_episode_steps: 200
20:28:17 [INFO] _elapsed_steps: None


In [3]:
class DQNReplayer:
    """Fixed-capacity ring buffer of transitions backed by a DataFrame.

    Each row holds one (state, action, reward, next_state, done) record.
    Once `capacity` records have been stored, new records overwrite the
    oldest ones.
    """

    def __init__(self, capacity):
        """Allocate an empty buffer with `capacity` rows."""
        fields = ['state', 'action', 'reward', 'next_state', 'done']
        self.capacity = capacity
        self.memory = pd.DataFrame(index=range(capacity), columns=fields)
        self.i = 0  # row that the next store() call writes to
        self.count = 0  # number of rows currently holding data

    def store(self, *args):
        """Write one transition (one value per column) at the cursor."""
        slot = self.i
        self.memory.loc[slot] = args
        # Advance the cursor, wrapping to row 0 at the end of the buffer.
        self.i = (slot + 1) % self.capacity
        self.count = min(self.capacity, self.count + 1)

    def sample(self, size):
        """Draw `size` stored rows uniformly at random, with replacement.

        Returns a generator that yields one stacked array per column, in
        column order.
        """
        picked = np.random.choice(self.count, size=size)
        return (np.stack(self.memory.loc[picked, column])
                for column in self.memory.columns)

In [4]:
class DoubleDQNAgent:
    """Double DQN agent with an online network and a target network.

    The online network (`evaluate_net`) selects actions and is the only
    network that is optimized. The target network (`target_net`) is a frozen
    copy of the online network, refreshed at the start of every training
    episode, and is used to evaluate the action chosen by the online network
    when building the TD target.
    """

    def __init__(self, env):
        """Build the replay buffer, the online network and its optimizer.

        Args:
            env: environment exposing `action_space.n` and
                `observation_space.shape`.
        """
        self.action_n = env.action_space.n
        self.gamma = 0.99  # discount factor

        self.replayer = DQNReplayer(10000)

        self.evaluate_net = self.build_net(
                input_size=env.observation_space.shape[0],
                hidden_sizes=[64, 64], output_size=self.action_n)
        self.optimizer = optim.Adam(self.evaluate_net.parameters(), lr=0.001)
        self.loss = nn.MSELoss()

    def build_net(self, input_size, hidden_sizes, output_size):
        """Return a fully connected network with ReLU between layers.

        Args:
            input_size: number of input features.
            hidden_sizes: sequence with the width of each hidden layer.
            output_size: number of outputs (no activation on the output).
        """
        layer_sizes = [input_size] + list(hidden_sizes) + [output_size]
        layers = []
        for in_features, out_features in zip(layer_sizes[:-1],
                layer_sizes[1:]):
            layers.append(nn.Linear(in_features, out_features))
            layers.append(nn.ReLU())
        # Drop the activation that follows the output layer.
        layers = layers[:-1]
        return nn.Sequential(*layers)

    def reset(self, mode=None):
        """Start a new episode.

        In 'train' mode the per-episode trajectory is cleared and the target
        network is re-synchronized with the online network.
        """
        self.mode = mode
        if self.mode == 'train':
            self.trajectory = []
            self.target_net = copy.deepcopy(self.evaluate_net)

    def step(self, observation, reward, done):
        """Choose an action; in 'train' mode also record and learn.

        Args:
            observation: current observation from the environment.
            reward: reward received for the previous action.
            done: whether `observation` is terminal.

        Returns:
            The chosen action as an int.
        """
        if self.mode == 'train' and np.random.rand() < 0.001:
            # epsilon-greedy policy in train mode
            action = np.random.randint(self.action_n)
        else:
            state_tensor = torch.as_tensor(observation,
                    dtype=torch.float).reshape(1, -1)
            # Action selection needs no gradients; skip building the graph.
            with torch.no_grad():
                q_tensor = self.evaluate_net(state_tensor)
            action = torch.argmax(q_tensor).item()
        if self.mode == 'train':
            self.trajectory += [observation, reward, done, action]
            if len(self.trajectory) >= 8:
                # The last two (observation, reward, done, action) records
                # form one transition: the state and action come from the
                # older record, the outcome from the newer one.
                state, _, _, action, next_state, reward, done, _ = \
                        self.trajectory[-8:]
                self.replayer.store(state, action, reward, next_state, done)
            if self.replayer.count >= self.replayer.capacity * 0.95:
                # skip first few episodes for speed
                self.learn()
        return action

    def close(self):
        """End-of-episode hook; nothing to release."""
        pass

    def learn(self):
        """Run one Double DQN gradient step on a replayed minibatch."""
        # replay
        states, actions, rewards, next_states, dones = \
                self.replayer.sample(1024) # replay transitions
        state_tensor = torch.as_tensor(states, dtype=torch.float)
        action_tensor = torch.as_tensor(actions, dtype=torch.long)
        reward_tensor = torch.as_tensor(rewards, dtype=torch.float)
        next_state_tensor = torch.as_tensor(next_states, dtype=torch.float)
        done_tensor = torch.as_tensor(dones, dtype=torch.float)

        # Build the TD target as a constant. Without no_grad the backward
        # pass would also run through target_net and accumulate gradients on
        # its parameters, which are never optimized.
        with torch.no_grad():
            # Double DQN: the online network picks the next action ...
            next_eval_q_tensor = self.evaluate_net(next_state_tensor)
            next_action_tensor = next_eval_q_tensor.argmax(axis=-1)
            # ... and the target network evaluates it.
            next_q_tensor = self.target_net(next_state_tensor)
            next_max_q_tensor = torch.gather(next_q_tensor, 1,
                    next_action_tensor.unsqueeze(1)).squeeze(1)
            target_tensor = reward_tensor + \
                    self.gamma * (1. - done_tensor) * next_max_q_tensor

        # train
        pred_tensor = self.evaluate_net(state_tensor)
        q_tensor = pred_tensor.gather(1, action_tensor.unsqueeze(1)).squeeze(1)
        loss_tensor = self.loss(q_tensor, target_tensor)
        self.optimizer.zero_grad()
        loss_tensor.backward()
        self.optimizer.step()


# Instantiate the agent for the environment created above.
agent = DoubleDQNAgent(env)

In [None]:
def play_episode(env, agent, max_episode_steps=None, mode=None, render=False):
    """Run one episode of `agent` acting in `env`.

    The agent is consulted once per observation, including the terminal
    one, so that it can record the final transition. The episode ends when
    the environment reports `done` or, if `max_episode_steps` is truthy,
    once that many environment steps have been taken.

    Args:
        env: environment with `reset()`, `step(action)` and `render()`.
        agent: object with `reset(mode=...)`, `step(observation, reward,
            done)` and `close()`.
        max_episode_steps: optional cap on the number of environment steps.
        mode: forwarded to `agent.reset`.
        render: call `env.render()` after every agent decision when true.

    Returns:
        A tuple `(episode_reward, elapsed_steps)`.
    """
    observation = env.reset()
    reward = 0.
    done = False
    agent.reset(mode=mode)

    total_reward = 0.
    step_count = 0
    truncated = False
    while not truncated:
        action = agent.step(observation, reward, done)
        if render:
            env.render()
        if done:
            break
        observation, reward, done, _ = env.step(action)
        total_reward += reward
        step_count += 1
        truncated = bool(max_episode_steps) and \
                step_count >= max_episode_steps

    agent.close()
    return total_reward, step_count


# Train until the mean reward over the last (up to) 10 episodes exceeds
# -110. Episodes run on the unwrapped environment, with the step limit
# enforced by play_episode itself.
logging.info('==== train ====')
episode_rewards = []
episode = 0
while True:
    episode_reward, elapsed_steps = play_episode(env.unwrapped, agent,
            max_episode_steps=env._max_episode_steps, mode='train')
    episode_rewards.append(episode_reward)
    logging.debug('train episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
    if np.mean(episode_rewards[-10:]) > -110:
        break
    episode += 1
plt.plot(episode_rewards)


# Evaluate the trained agent for 100 episodes in the default (non-train)
# mode, then report the mean and standard deviation of the episode rewards.
logging.info('==== test ====')
episode_rewards = []
for episode in range(100):
    episode_reward, elapsed_steps = play_episode(env, agent)
    episode_rewards.append(episode_reward)
    logging.debug('test episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
mean_reward = np.mean(episode_rewards)
std_reward = np.std(episode_rewards)
logging.info('average episode reward = %.2f ± %.2f',
        mean_reward, std_reward)

20:28:17 [INFO] ==== train ====
20:28:17 [DEBUG] train episode 0: reward = -200.00, steps = 200
20:28:17 [DEBUG] train episode 1: reward = -200.00, steps = 200
20:28:18 [DEBUG] train episode 2: reward = -200.00, steps = 200
20:28:18 [DEBUG] train episode 3: reward = -200.00, steps = 200
20:28:18 [DEBUG] train episode 4: reward = -200.00, steps = 200
20:28:18 [DEBUG] train episode 5: reward = -200.00, steps = 200
20:28:19 [DEBUG] train episode 6: reward = -200.00, steps = 200
20:28:19 [DEBUG] train episode 7: reward = -200.00, steps = 200
20:28:19 [DEBUG] train episode 8: reward = -200.00, steps = 200
20:28:19 [DEBUG] train episode 9: reward = -200.00, steps = 200
20:28:19 [DEBUG] train episode 10: reward = -200.00, steps = 200
20:28:20 [DEBUG] train episode 11: reward = -200.00, steps = 200
20:28:20 [DEBUG] train episode 12: reward = -200.00, steps = 200
20:28:20 [DEBUG] train episode 13: reward = -200.00, steps = 200
20:28:20 [DEBUG] train episode 14: reward = -200.00, steps = 200
20:

21:36:19 [DEBUG] train episode 126: reward = -200.00, steps = 200
21:38:07 [DEBUG] train episode 127: reward = -200.00, steps = 200
21:39:56 [DEBUG] train episode 128: reward = -200.00, steps = 200
21:41:47 [DEBUG] train episode 129: reward = -200.00, steps = 200
21:43:39 [DEBUG] train episode 130: reward = -200.00, steps = 200
21:45:33 [DEBUG] train episode 131: reward = -200.00, steps = 200
21:47:30 [DEBUG] train episode 132: reward = -200.00, steps = 200
21:49:23 [DEBUG] train episode 133: reward = -200.00, steps = 200
21:51:05 [DEBUG] train episode 134: reward = -200.00, steps = 200
21:51:44 [DEBUG] train episode 135: reward = -200.00, steps = 200
21:52:16 [DEBUG] train episode 136: reward = -200.00, steps = 200
21:52:23 [DEBUG] train episode 137: reward = -200.00, steps = 200
21:52:33 [DEBUG] train episode 138: reward = -200.00, steps = 200
21:53:11 [DEBUG] train episode 139: reward = -200.00, steps = 200
21:53:43 [DEBUG] train episode 140: reward = -200.00, steps = 200
21:55:01 [

In [None]:
# Close the environment now that training and evaluation are finished.
env.close()