In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd as autograd

import numpy as np
import gym
import random
from collections import deque, namedtuple
import copy
from itertools import count
import math
import random
import time

In [2]:
!git clone 'https://github.com/jmichaux/dqn-pytorch'

fatal: destination path 'dqn-pytorch' already exists and is not an empty directory.


In [3]:
import os
os.chdir('dqn-pytorch')

In [4]:
from wrappers import *
from wrappers import *
from memory import ReplayMemory
from models import *

In [5]:
from collections import namedtuple
import random

Transition = namedtuple('Transion', 
                        ('state', 'action', 'next_state', 'reward'))

Data = namedtuple('Data',('priority','probability','weight','index'))

class ReplayMemory(object):
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0
        
    def push(self, *args):
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity
        
    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)
    
    def __len__(self):
        return len(self.memory)


class ReplayBuffer:
    """Fixed-size buffer to store experience tuples."""

    def __init__(self, action_size, buffer_size, batch_size, experiences_per_sampling, seed, compute_weights):
        """Initialize a ReplayBuffer object.
        Params
        ======
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer
            experiences_per_sampling (int): number of experiences to sample during a sampling iteration
            batch_size (int): size of each training batch
            seed (int): random seed
        """
        self.action_size = action_size
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.experiences_per_sampling = experiences_per_sampling
        
        self.alpha = 0.5
        self.alpha_decay_rate = 0.99
        self.beta = 0.5
        self.beta_growth_rate = 1.001
        self.seed = random.seed(seed)
        self.compute_weights = compute_weights
        self.experience_count = 0
        
        self.experience = namedtuple("Experience", 
            field_names=["state", "action", "reward", "next_state", "done"])
        self.data = namedtuple("Data", 
            field_names=["priority", "probability", "weight","index"])

        indexes = []
        datas = []
        for i in range(buffer_size):
            indexes.append(i)
            d = self.data(0,0,0,i)
            datas.append(d)
        
        self.memory = {key: self.experience for key in indexes}
        self.memory_data = {key: data for key,data in zip(indexes, datas)}
        self.sampled_batches = []
        self.current_batch = 0
        self.priorities_sum_alpha = 0
        self.priorities_max = 1
        self.weights_max = 1
    
    def update_priorities(self, tds, indices):
        for td, index in zip(tds, indices):
            N = min(self.experience_count, self.buffer_size)

            updated_priority = td[0]
            if updated_priority > self.priorities_max:
                self.priorities_max = updated_priority
            
            if self.compute_weights:
                updated_weight = ((N * updated_priority)**(-self.beta))/self.weights_max
                if updated_weight > self.weights_max:
                    self.weights_max = updated_weight
            else:
                updated_weight = 1

            old_priority = self.memory_data[index].priority
            self.priorities_sum_alpha += updated_priority**self.alpha - old_priority**self.alpha
            updated_probability = td[0]**self.alpha / self.priorities_sum_alpha
            data = self.data(updated_priority, updated_probability, updated_weight, index) 
            self.memory_data[index] = data

    def update_memory_sampling(self):
        """Randomly sample X batches of experiences from memory."""
        # X is the number of steps before updating memory
        self.current_batch = 0
        values = list(self.memory_data.values())
        random_values = random.choices(self.memory_data, 
                                       [data.probability for data in values], 
                                       k=self.experiences_per_sampling)
        self.sampled_batches = [random_values[i:i + self.batch_size] 
                                    for i in range(0, len(random_values), self.batch_size)]

    def update_parameters(self):
        self.alpha *= self.alpha_decay_rate
        self.beta *= self.beta_growth_rate
        if self.beta > 1:
            self.beta = 1
        N = min(self.experience_count, self.buffer_size)
        self.priorities_sum_alpha = 0
        sum_prob_before = 0
        for element in self.memory_data.values():
            sum_prob_before += element.probability
            self.priorities_sum_alpha += element.priority**self.alpha
        sum_prob_after = 0
        for element in self.memory_data.values():
            probability = element.priority**self.alpha / self.priorities_sum_alpha
            sum_prob_after += probability
            weight = 1
            if self.compute_weights:
                weight = ((N *  element.probability)**(-self.beta))/self.weights_max
            d = self.data(element.priority, probability, weight, element.index)
            self.memory_data[element.index] = d
        print("sum_prob before", sum_prob_before)
        print("sum_prob after : ", sum_prob_after)
    
    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        self.experience_count += 1
        index = self.experience_count % self.buffer_size

        if self.experience_count > self.buffer_size:
            temp = self.memory_data[index]
            self.priorities_sum_alpha -= temp.priority**self.alpha
            if temp.priority == self.priorities_max:
                self.memory_data[index].priority = 0
                self.priorities_max = max(self.memory_data.items(), key=operator.itemgetter(1)).priority
            if self.compute_weights:
                if temp.weight == self.weights_max:
                    self.memory_data[index].weight = 0
                    self.weights_max = max(self.memory_data.items(), key=operator.itemgetter(2)).weight

        priority = self.priorities_max
        weight = self.weights_max
        self.priorities_sum_alpha += priority ** self.alpha
        probability = priority ** self.alpha / self.priorities_sum_alpha
        e = self.experience(state, action, reward, next_state, done)
        self.memory[index] = e
        d = self.data(priority, probability, weight, index)
        self.memory_data[index] = d
            
    def sample(self):
        sampled_batch = self.sampled_batches[self.current_batch]
        self.current_batch += 1
        experiences = []
        weights = []
        indices = []
        
        for data in sampled_batch:
            experiences.append(self.memory.get(data.index))
            weights.append(data.weight)
            indices.append(data.index)

        states = torch.from_numpy(
            np.vstack([e.state for e in experiences if e is not None])).float().to(device)
        actions = torch.from_numpy(
            np.vstack([e.action for e in experiences if e is not None])).long().to(device)
        rewards = torch.from_numpy(
            np.vstack([e.reward for e in experiences if e is not None])).float().to(device)
        next_states = torch.from_numpy(
            np.vstack([e.next_state for e in experiences if e is not None])).float().to(device)
        dones = torch.from_numpy(
            np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device)

        return (states, actions, rewards, next_states, dones, weights, indices)

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory)

In [6]:
class ConvDQNbn(nn.Module):
    def __init__(self, in_channels=4, n_actions=14):
        """
        Initialize Deep Q Network
        Args:
            in_channels (int): number of input channels
            n_actions (int): number of outputs
        """
        super(ConvDQNbn, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, 32, kernel_size=8, stride=4)
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.bn2 = nn.BatchNorm2d(64)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        self.bn3 = nn.BatchNorm2d(64)
        self.fc4 = nn.Linear(7 * 7 * 64, 512)
        self.head = nn.Linear(512, n_actions)
        
    def forward(self, x):
        x = x.float() / 255
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        x = F.relu(self.fc4(x.view(x.size(0), -1)))
        return self.head(x)


class ConvDQN(nn.Module):
    def __init__(self, in_channels=4, n_actions=14):
        """
        Initialize Deep Q Network
        Args:
            in_channels (int): number of input channels
            n_actions (int): number of outputs
        """
        super(ConvDQN, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, 32, kernel_size=8, stride=4)
        # self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        # self.bn2 = nn.BatchNorm2d(64)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        # self.bn3 = nn.BatchNorm2d(64)
        self.fc4 = nn.Linear(7 * 7 * 64, 512)
        self.head = nn.Linear(512, n_actions)
        
    def forward(self, x):
        x = x.float() / 255
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = F.relu(self.fc4(x.view(x.size(0), -1)))
        return self.head(x)


class DQN(nn.Module):
    
    def __init__(self, input_dim, output_dim):
        super(DQN, self).__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        
        self.fc = nn.Sequential(
            nn.Linear(self.input_dim[0], 128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, self.output_dim)
        )

    def forward(self, state):
        qvals = self.fc(state)
        return qvals

In [7]:
Transition = namedtuple('Transion', 
                        ('state', 'action', 'next_state', 'reward'))


def select_action(state):
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END)* \
        math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1
    with torch.no_grad():
      values = policy_net(state.to(device)).max(1)
    if sample > eps_threshold:
            return values[0], values[1].view(1,1) #choose greedy policy
    else:
            return values[0],torch.tensor([[random.randrange(env.action_space.n)]], device=device, dtype=torch.long)

    
def optimize_model(mode = 'DDQN'):
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    """
    zip(*transitions) unzips the transitions into
    Transition(*) creates new named tuple
    batch.state - tuple of all the states (each state is a tensor)
    batch.next_state - tuple of all the next states (each state is a tensor)
    batch.reward - tuple of all the rewards (each reward is a float)
    batch.action - tuple of all the actions (each action is an int)    
    """
    batch = Transition(*zip(*transitions))
    
    actions = tuple((map(lambda a: torch.tensor([[a]], device=device), batch.action))) 
    rewards = tuple((map(lambda r: torch.tensor([r], device=device), batch.reward))) 

    non_final_mask = torch.tensor(
        tuple(map(lambda s: s is not None, batch.next_state)),
        device=device, dtype=torch.uint8)
    
    non_final_next_states = torch.cat([s for s in batch.next_state
                                       if s is not None]).to(device)
    

    state_batch = torch.cat(batch.state).to(device)
    action_batch = torch.cat(actions).to(device)
    reward_batch = torch.cat(rewards).to(device)
    
    state_action_values = policy_net(state_batch).gather(1, action_batch) # q values


    if mode == 'DQN':
      next_state_values = torch.zeros(BATCH_SIZE, device=device)
      next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0].detach()
      expected_state_action_values = (next_state_values * GAMMA) + reward_batch
    
    
    elif mode == 'DDQN':
      next_state_values = torch.zeros(BATCH_SIZE, device=device)
      next_action_values = policy_net(non_final_next_states).max(1)[1].detach()
      next_state_values[non_final_mask] = target_net(non_final_next_states).gather(1,next_action_values.unsqueeze(1)).detach().squeeze()
      expected_state_action_values = (next_state_values * GAMMA) + reward_batch
    
    loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))

    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()

def get_state(obs):
    state = np.array(obs)
    state = state.transpose((2, 0, 1))
    state = torch.from_numpy(state)
    return state.unsqueeze(0)

In [8]:
def train(env, n_episodes, set_size, mode = 'DQN', render=False):
    val_tensor = []
    s = 0
    total_reward = 0
    result = np.zeros(int(n_episodes))
    for episode in range(n_episodes):
        obs = env.reset()
        state = get_state(obs)
        value_list = []
        total_reward = 0
        n_steps = 0
        for t in count():
            value, action = select_action(state)
            n_steps += 1
            value_list.append(value[0].item())
            if steps_done % 1000 ==0:
              val_tensor.append([np.mean(value_list), np.std(value_list)/np.sqrt(10000-1)])
              value_list = []


            if render:
                env.render()

            obs, reward, done, info = env.step(action)
           
            total_reward += reward

            if not done:
                next_state = get_state(obs)
            else:
                next_state = None
                #total_reward += 1

            reward = torch.tensor([reward], device=device)

            memory.push(state, action.to('cpu'), next_state, reward.to('cpu'))
            state = next_state

            if steps_done > INITIAL_MEMORY:
                optimize_model(mode)

                if steps_done % TARGET_UPDATE == 0:
                    target_net.load_state_dict(policy_net.state_dict())

            if done:
                result[episode] = total_reward / n_steps
                n_steps = 0
                break
        if episode % 20 == 0:
                print('Total steps: {} \t Episode: {}/{} \t Total reward: {}'.format(steps_done, episode, t, total_reward))
    env.close()
    return val_tensor, result

def test(env, n_episodes, policy, render=True):
    env = gym.wrappers.Monitor(env, './videos/' + 'dqn_pong_video')
    for episode in range(n_episodes):
        obs = env.reset()
        state = get_state(obs)
        total_reward = 0.0
        for t in count():
            action = policy(state.to('cuda')).max(1)[1].view(1,1)

            if render:
                env.render()
                time.sleep(0.02)

            obs, reward, done, info = env.step(action)

            total_reward += reward

            if not done:
                next_state = get_state(obs)
            else:
                next_state = None

            state = next_state

            if done:
                print("Finished Episode {} with reward {}".format(episode, total_reward))
                break

    env.close()
    return

In [9]:
# set device

def weight_reset(m):
    if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
        m.reset_parameters()


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# hyperparameters
BATCH_SIZE = 32
GAMMA = 0.99
EPS_START = 1
EPS_END = 0.02
EPS_DECAY = 1000000
TARGET_UPDATE = 1000
RENDER = False
lr = 1e-4
INITIAL_MEMORY = 10000
MEMORY_SIZE = 10 * INITIAL_MEMORY

# create environment
env = gym.make("Alien-v0")
env = make_env(env)


# create networks
policy_net = ConvDQNbn(n_actions=env.action_space.n).to(device)
target_net = ConvDQNbn(n_actions=env.action_space.n).to(device)
target_net.load_state_dict(policy_net.state_dict())

# setup optimizer
optimizer = optim.Adam(policy_net.parameters(), lr=lr)

steps_done = 0

# initialize replay memory
memory = ReplayMemory(MEMORY_SIZE)
policy_net.apply(weight_reset)
target_net.apply(weight_reset)
# train model
val_tensor_DDQN, result_DDQN = train(env, 2000, 10, mode='DDQN')
torch.save(policy_net, "ddqn_alien_model")

Total steps: 92 	 Episode: 0/91 	 Total reward: 130.0
Total steps: 1334 	 Episode: 20/54 	 Total reward: 20.0
Total steps: 2575 	 Episode: 40/46 	 Total reward: 30.0
Total steps: 3638 	 Episode: 60/73 	 Total reward: 160.0
Total steps: 5020 	 Episode: 80/58 	 Total reward: 540.0
Total steps: 6333 	 Episode: 100/70 	 Total reward: 40.0
Total steps: 7673 	 Episode: 120/55 	 Total reward: 130.0
Total steps: 9069 	 Episode: 140/42 	 Total reward: 90.0




Total steps: 10298 	 Episode: 160/51 	 Total reward: 20.0
Total steps: 11678 	 Episode: 180/72 	 Total reward: 130.0
Total steps: 12907 	 Episode: 200/52 	 Total reward: 10.0
Total steps: 14339 	 Episode: 220/59 	 Total reward: 70.0
Total steps: 15651 	 Episode: 240/156 	 Total reward: 310.0
Total steps: 16984 	 Episode: 260/41 	 Total reward: 0.0
Total steps: 18272 	 Episode: 280/69 	 Total reward: 90.0
Total steps: 19526 	 Episode: 300/60 	 Total reward: 90.0
Total steps: 21051 	 Episode: 320/56 	 Total reward: 80.0
Total steps: 22273 	 Episode: 340/54 	 Total reward: 50.0
Total steps: 23610 	 Episode: 360/55 	 Total reward: 50.0
Total steps: 25086 	 Episode: 380/77 	 Total reward: 110.0
Total steps: 26254 	 Episode: 400/46 	 Total reward: 30.0
Total steps: 27374 	 Episode: 420/60 	 Total reward: 130.0
Total steps: 28772 	 Episode: 440/55 	 Total reward: 10.0
Total steps: 30090 	 Episode: 460/96 	 Total reward: 80.0
Total steps: 31356 	 Episode: 480/67 	 Total reward: 110.0
Total ste

In [10]:
steps_done = 0

# initialize replay memory
memory = ReplayMemory(MEMORY_SIZE)

policy_net.apply(weight_reset)
target_net.apply(weight_reset)
#policy_net = torch.load("dqn_pong_model")
val_tensor_DQN, result_DQN = train(env, 2000, 10, mode='DQN')
torch.save(policy_net, "dqn_alien_model")

Total steps: 62 	 Episode: 0/61 	 Total reward: 70.0
Total steps: 1373 	 Episode: 20/53 	 Total reward: 60.0
Total steps: 3052 	 Episode: 40/109 	 Total reward: 270.0
Total steps: 4463 	 Episode: 60/70 	 Total reward: 50.0
Total steps: 5894 	 Episode: 80/70 	 Total reward: 50.0
Total steps: 7078 	 Episode: 100/85 	 Total reward: 180.0
Total steps: 8340 	 Episode: 120/68 	 Total reward: 20.0
Total steps: 9742 	 Episode: 140/65 	 Total reward: 60.0




Total steps: 11082 	 Episode: 160/83 	 Total reward: 210.0
Total steps: 12398 	 Episode: 180/77 	 Total reward: 90.0
Total steps: 13917 	 Episode: 200/65 	 Total reward: 40.0
Total steps: 15041 	 Episode: 220/44 	 Total reward: 110.0
Total steps: 16452 	 Episode: 240/71 	 Total reward: 50.0
Total steps: 17902 	 Episode: 260/159 	 Total reward: 240.0
Total steps: 19207 	 Episode: 280/45 	 Total reward: 80.0
Total steps: 20543 	 Episode: 300/41 	 Total reward: 30.0
Total steps: 21792 	 Episode: 320/46 	 Total reward: 30.0
Total steps: 23279 	 Episode: 340/59 	 Total reward: 110.0
Total steps: 24458 	 Episode: 360/54 	 Total reward: 60.0
Total steps: 25743 	 Episode: 380/44 	 Total reward: 30.0
Total steps: 26965 	 Episode: 400/53 	 Total reward: 100.0
Total steps: 28333 	 Episode: 420/52 	 Total reward: 90.0
Total steps: 29530 	 Episode: 440/127 	 Total reward: 70.0
Total steps: 30920 	 Episode: 460/70 	 Total reward: 140.0
Total steps: 32348 	 Episode: 480/41 	 Total reward: 0.0
Total s

In [25]:
#policy_net = torch.load('dqn_pong_model')
test(env, 1, policy_net, render=False)

Finished Episode 0 with reward 230.0


In [11]:
value_tensor_DDQN = np.array(val_tensor_DDQN)
value_tensor_DQN = np.array(val_tensor_DQN)

In [12]:
import colorlover as cl
import plotly.graph_objects as go
colors = cl.scales['5']['qual']['Set1']

fig = go.Figure([
    go.Scatter(
        name='Double Deep Q Networks',
        x=np.arange(len(value_tensor_DDQN[:,0]))*1000,
        y=value_tensor_DDQN[:,0],
        mode='lines',
        line=dict(color=colors[0], width = 2),
    ),
    go.Scatter(
        name='Upper Bound',
        x=np.arange(len(value_tensor_DDQN[:,0]))*1000,
        y=value_tensor_DDQN[:,0]+(value_tensor_DDQN[:,1])*np.sqrt(10000-1),
        mode='lines',
        marker=dict(color="#444"),
        line=dict(width=0),
        showlegend=False
    ),
    go.Scatter(
        name='Lower Bound',
        x=np.arange(len(value_tensor_DDQN[:,0]))*1000,
        y=value_tensor_DDQN[:,0]-(value_tensor_DDQN[:,1])*np.sqrt(10000-1),
        marker=dict(color="#444"),
        line=dict(width=0),
        mode='lines',
        fillcolor='rgba(68, 68, 68, 0.3)',
        fill='tonexty',
        showlegend=False
    ),
        go.Scatter(
        name='Deep Q Networks',
        x=np.arange(len(value_tensor_DQN[:,0]))*1000,
        y=value_tensor_DQN[:,0],
        mode='lines',
        line=dict(color=colors[1], width = 2),
    ),
    go.Scatter(
        name='Upper Bound',
        x=np.arange(len(value_tensor_DQN[:,0]))*1000,
        y=value_tensor_DQN[:,0]+(value_tensor_DQN[:,1])*np.sqrt(10000-1),
        mode='lines',
        marker=dict(color="#444"),
        line=dict(width=0),
        showlegend=False
    ),
    go.Scatter(
        name='Lower Bound',
        x=np.arange(len(value_tensor_DQN[:,0]))*1000,
        y=value_tensor_DQN[:,0]-(value_tensor_DQN[:,1])*np.sqrt(10000-1),
        marker=dict(color="#444"),
        line=dict(width=0),
        mode='lines',
        fillcolor='rgba(68, 68, 68, 0.3)',
        fill='tonexty',
        showlegend=False
    )
])

fig.update_layout(
    yaxis_title='Avg. Value Estimate (greedy policy)',
    title='Estimation Bias in Deep Q networks',
    hovermode="x",
    paper_bgcolor = 'rgba(0,0,0,0)',
    plot_bgcolor = 'rgba(0,0,0,0)',
    font = dict(size = 16, color = 'black'),
    width = 900,
    height = 500
)
fig.update_xaxes(title = 'Time steps', showgrid=True, gridwidth=1.5, gridcolor='#DFDFDF', showline=True, linecolor = '#AFAFAF', linewidth = 2.5, nticks = 7)
fig.update_yaxes(showgrid=True, gridwidth=1.5,gridcolor='#DFDFDF', showline=True, linecolor = '#AFAFAF', linewidth = 2.5, nticks = 7)
fig.show()


In [13]:
avg_result_DDQN = np.mean(np.array(result_DDQN).reshape(-1, 10), axis=1)
avg_result_DQN = np.mean(np.array(result_DQN).reshape(-1, 10), axis=1)

In [14]:
import colorlover as cl
import plotly.graph_objects as go
colors = cl.scales['5']['qual']['Set1']

fig = go.Figure([
    go.Scatter(
        name='Double Deep Q Networks',
        x=np.arange(len(avg_result_DDQN)),
        y=avg_result_DDQN,
        mode='lines',
        line=dict(color=colors[0], width = 2),
    ),
        go.Scatter(
        name='Deep Q Networks',
        x=np.arange(len(avg_result_DQN)),
        y=avg_result_DQN,
        mode='lines',
        line=dict(color=colors[1], width = 2),
    ),
])

fig.update_layout(
    yaxis_title='Avg. Rewards per episode',
    title='Reward accumulation profile',
    hovermode="x",
    paper_bgcolor = 'rgba(0,0,0,0)',
    plot_bgcolor = 'rgba(0,0,0,0)',
    font = dict(size = 16, color = 'black'),
    width = 900,
    height = 500
)
fig.update_xaxes(title = 'Sets (10 episodes in each set)', showgrid=True, gridwidth=1.5, gridcolor='#DFDFDF', showline=True, linecolor = '#AFAFAF', linewidth = 2.5, nticks = 7)
fig.update_yaxes(showgrid=True, gridwidth=1.5,gridcolor='#DFDFDF', showline=True, linecolor = '#AFAFAF', linewidth = 2.5, nticks = 7)
fig.show()


In [26]:
policy_net = torch.load('ddqn_pong_model')
obs = env.reset()
state = get_state(obs)
for t in count():
  action = policy_net(state.to('cuda')).max(1)[1].view(1,1)
  obs, reward, done, info = env.step(action)
  print(reward)
  
  if not done:
    next_state = get_state(obs)
  else:
    next_state = None
  state = next_state
  if done:
    break


0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
-1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0

In [29]:
env.unwrapped.get_action_meanings()

['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE']

In [33]:
gym.envs.registry.all()

dict_values([EnvSpec(Copy-v0), EnvSpec(RepeatCopy-v0), EnvSpec(ReversedAddition-v0), EnvSpec(ReversedAddition3-v0), EnvSpec(DuplicatedInput-v0), EnvSpec(Reverse-v0), EnvSpec(CartPole-v0), EnvSpec(CartPole-v1), EnvSpec(MountainCar-v0), EnvSpec(MountainCarContinuous-v0), EnvSpec(Pendulum-v0), EnvSpec(Acrobot-v1), EnvSpec(LunarLander-v2), EnvSpec(LunarLanderContinuous-v2), EnvSpec(BipedalWalker-v3), EnvSpec(BipedalWalkerHardcore-v3), EnvSpec(CarRacing-v0), EnvSpec(Blackjack-v0), EnvSpec(KellyCoinflip-v0), EnvSpec(KellyCoinflipGeneralized-v0), EnvSpec(FrozenLake-v0), EnvSpec(FrozenLake8x8-v0), EnvSpec(CliffWalking-v0), EnvSpec(NChain-v0), EnvSpec(Roulette-v0), EnvSpec(Taxi-v3), EnvSpec(GuessingGame-v0), EnvSpec(HotterColder-v0), EnvSpec(Reacher-v2), EnvSpec(Pusher-v2), EnvSpec(Thrower-v2), EnvSpec(Striker-v2), EnvSpec(InvertedPendulum-v2), EnvSpec(InvertedDoublePendulum-v2), EnvSpec(HalfCheetah-v2), EnvSpec(HalfCheetah-v3), EnvSpec(Hopper-v2), EnvSpec(Hopper-v3), EnvSpec(Swimmer-v2), EnvSp

In [35]:
gym.EnvSpec('Alien-v0')

AttributeError: ignored