In [17]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from collections import namedtuple
import random

from itertools import count
import numpy as np
import time


from wrapppers.atari_wrappers import *
from utils import get_state, select_action

import glob
import io
import os
import base64
from IPython.display import Video


# What is this notebook for? 

This notebook is intended to walk through how the network interacts with a test environment and run an episode with a 
pretrained network. It assumes that you've trained a network using the training code in this repo or you have a copy of the trained weights for this notebook. 

Let's start by defining a named tuple that denotes an observation. This helps us provide a convenient data object for storing observations in our Replay Memory. We need to store the `state`, `action`, `next state`, and `reward` in order to train our model later on. 

In [2]:
# One replay-memory record: the state, the action taken in it, the state that
# followed, and the reward received.
Transition = namedtuple('Transition', ['state', 'action', 'next_state', 'reward'])

# DQN Network

Below, we define the DQN neural network. This is not strictly necessary for the testing code, as the network is reconstructed from the weights. Our network consists of three convolutional layers and two linear layers, as defined below. The network takes in a state ($s$) as input and outputs a vector of $Q(s,a)$ for all possible actions $a$ from that state. $Q(s,a)$ is the quality function of a given state-action pair, i.e. an estimate of how good it is to take action $a$ in state $s$. During training, the network's predicted Q-values are fit to targets computed from the rewards the agent actually received. 

In [3]:

class DQN(nn.Module):
    """
    Convolutional Q-network: three conv layers followed by two linear layers.

    Takes an image of the game state as input and outputs a vector with one
    Q-value estimate per possible action.

    Args:
        in_channels (int): number of channels in the input image
        n_actions (int): number of possible actions
    """
    def __init__(self, in_channels = 4, n_actions = 4):
        super().__init__()
        # Convolutional feature extractor.
        self.conv1 = nn.Conv2d(in_channels, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        # Fully connected head. fc1 expects conv3 to produce 64 maps of 7 x 7,
        # which is what the kernels/strides above yield for e.g. 84 x 84 inputs.
        self.fc1 = nn.Linear(64 * 7 * 7, 512)
        self.fc2 = nn.Linear(512, n_actions)

    def forward(self, x):
        """
        Compute the Q-values for a batch of inputs.

        Args:
            x (torch.Tensor): batch of images, batch dimension first.

        Returns:
            torch.Tensor: one row per batch element, one column per action.
        """
        # Cast to float and rescale by 1/255
        # (assumes 8-bit pixel intensities -- TODO confirm against get_state).
        features = x.float() / 255
        for conv in (self.conv1, self.conv2, self.conv3):
            features = F.relu(conv(features))
        # Collapse everything except the batch dimension for the linear layers.
        flat = features.view(features.size(0), -1)
        hidden = F.relu(self.fc1(flat))
        return self.fc2(hidden)

# Memory Replay

We define the `ReplayMemory` data structure below. It exists purely to give us a friendly interface for storing and retrieving observations. Functionally, it's just a list with a sampling function built in. It gives us a nice way to store our observations (using the `Transition` named tuple above) and to sample from them when we need to train. 

In [4]:
class ReplayMemory:
    """
    Fixed-capacity buffer of `Transition` records with uniform sampling.

    Records are appended until `capacity` is reached; after that each new
    record overwrites the oldest one.

    Args:
        capacity (int): maximum number of transitions kept; must be positive.

    Raises:
        ValueError: if `capacity` is not positive.
    """
    def __init__(self, capacity):
        # A non-positive capacity can never hold a transition and would only
        # fail later, inside push(), with an unrelated-looking error.
        if capacity <= 0:
            raise ValueError("capacity must be positive, got {}".format(capacity))
        self.capacity = capacity
        self.memory = []
        # Index of the slot the next push() writes to.
        self.position = 0

    def push(self, *args):
        """Saves a transition.

        Args:
            *args: the `Transition` fields, in order
                (state, action, next_state, reward).
        """
        # Build the record before touching the buffer, so that a failed
        # construction (e.g. wrong number of fields) leaves the memory
        # unchanged instead of leaving a None placeholder behind.
        transition = Transition(*args)
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = transition
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Return `batch_size` distinct stored transitions chosen at random.

        Raises:
            ValueError: (from `random.sample`) if `batch_size` exceeds the
                number of stored transitions.
        """
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

# Testing Function

Here we define our `test` function. This takes in a Gym environment (`env`), a number of episodes to test on (`n_episodes`), and a `render` boolean. The gym environment is what the agent interacts with. In our case, it's an environment for Pong, but Gym has environments for hundreds of games, including robot applications and others. Gym allows our focus to be on the RL algorithm instead of having to worry about physics. 

## Monitor
In the first line of the function, we wrap our environment in `gym.wrappers.Monitor`. This is a wrapper around our Gym environment that allows us to track some metadata about the executed environment, and it saves a video to the file path that we pass in. The `force` parameter decides whether we want to overwrite data in a folder that already has Gym wrapper data in it. The agent can interact with the wrapper in the same way that it could with the raw environment. 


## Testing loop

We have two loops. The first loop iterates through the number of episodes and the second iterates through an episode. In the case of Pong, an episode is just the length of the game. More generally, an episode is from a start state to an end state, so in any game, the start state is the start of the game and the end state is the end of the game. 

Before each episode, we do the following:
1. Reset the environment 
2. Get a start state
3. Reset reward attained in a given episode to zero. 

During each episode, we do the following: 
1. Predict a new action by selecting the action that gives us the maximum Q-value from our network. 
2. Render the environment if that argument is True
3. Tell the environment what our agent's predicted action is and receive information about observation, reward, and whether the game ended. 
4. If the game hasn't ended, get the next state. Otherwise, report the total reward attained during that episode and go to the next episode. 

In [15]:
def test(env, n_episodes, render=True):
    """
    Play `n_episodes` episodes with the greedy policy of `policy_net`.

    The environment is wrapped in `gym.wrappers.Monitor`, which records into
    `SAVE_DIR + 'dqn_pong_video'`. Besides its arguments, the function reads
    the notebook-level names `policy_net`, `SAVE_DIR`, `get_state` and `gym`.

    Args:
        env: Gym environment to play in.
        n_episodes (int): number of episodes to run.
        render (bool): if True, render every step and pause 0.02 s after it.

    Returns:
        None. The total reward of each episode is printed when it finishes.
    """
    env = gym.wrappers.Monitor(env, SAVE_DIR + 'dqn_pong_video', force = True)

    # Feed states to the network on whatever device its weights live on, so
    # the loop also works on a CPU-only machine instead of requiring CUDA.
    net_device = next(policy_net.parameters()).device

    try:
        for episode in range(n_episodes):
            obs = env.reset()
            state = get_state(obs)
            total_reward = 0.0
            while True:
                with torch.no_grad():
                    # Greedy action: index of the largest Q-value.
                    action = policy_net(state.to(net_device)).max(1)[1].view(1, 1)
                if render:
                    env.render()

                    time.sleep(0.02)

                # Hand the environment a plain Python int rather than a
                # (possibly GPU-resident) tensor.
                obs, reward, done, info = env.step(action.item())

                total_reward += reward

                if done:
                    print("Finished Episode {} with reward {}".format(episode, total_reward))
                    break

                state = get_state(obs)
    finally:
        # Always close the wrapper, even if an episode raised, so the
        # recording is not left open.
        env.close()

In [10]:
# Paths: where recordings are written and where the pretrained network is read from.
SAVE_DIR = './videos/'
weights_path = 'train_videos2/dqn_pong_model.pth'

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Pong environment, passed through the project's `make_env` wrapper.
env = make_env(gym.make('PongNoFrameskip-v4'))


# Loading the Network

We use `torch.load` to load the network from the saved weights path. Calling `.to(device)` moves the network to either the GPU or the CPU, depending on whether there's an NVIDIA GPU in the system (and PyTorch CUDA is initialized correctly). 

During training, we use `torch.save` to save the network. 

In [20]:
# NOTE: torch.load unpickles the saved object, which can execute arbitrary
# code -- only load weight files that come from a source you trust.
# map_location remaps the saved tensors onto `device`, so a network that was
# saved on a GPU can still be loaded on a CPU-only machine.
policy_net = torch.load(weights_path, map_location=device).to(device)
# Inference only: put the network in evaluation mode.
policy_net.eval()
print(policy_net)

DQN(
  (conv1): Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4))
  (conv2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
  (conv3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (fc1): Linear(in_features=3136, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=4, bias=True)
)


Now, we simply have to call the function! 

In [12]:
# Play a single rendered episode with the loaded network.
test(env, n_episodes=1, render=True)

  logger.warn(


Finished Episode 0 with reward 21.0


We can replay the video of the agent, as well. 

In [19]:
# The recording's file name contains run-specific identifiers, so look it up
# in the output folder instead of hard-coding the name from one particular run.
video_paths = glob.glob(os.path.join(SAVE_DIR, 'dqn_pong_video', '*.mp4'))
if not video_paths:
    raise FileNotFoundError(
        "No .mp4 recording found in {}".format(os.path.join(SAVE_DIR, 'dqn_pong_video'))
    )
# Show the most recently written recording.
Video(max(video_paths, key=os.path.getmtime), embed=True)