In [144]:
import sys
import math, random

import gym
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd 
import torch.nn.functional as F
import torchvision.transforms as T

from layers import NoisyLinear
from replay_buffer import PrioritizedReplayBuffer
from replay_buffer import ReplayBuffer

from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline



In [145]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [146]:
env = gym.make('SpaceInvaders-v0').unwrapped
env.reset()

array([[[ 0,  0,  0],
        [ 0,  0,  0],
        [ 0,  0,  0],
        ...,
        [ 0,  0,  0],
        [ 0,  0,  0],
        [ 0,  0,  0]],

       [[ 0,  0,  0],
        [ 0,  0,  0],
        [ 0,  0,  0],
        ...,
        [ 0,  0,  0],
        [ 0,  0,  0],
        [ 0,  0,  0]],

       [[ 0,  0,  0],
        [ 0,  0,  0],
        [ 0,  0,  0],
        ...,
        [ 0,  0,  0],
        [ 0,  0,  0],
        [ 0,  0,  0]],

       ...,

       [[80, 89, 22],
        [80, 89, 22],
        [80, 89, 22],
        ...,
        [80, 89, 22],
        [80, 89, 22],
        [80, 89, 22]],

       [[80, 89, 22],
        [80, 89, 22],
        [80, 89, 22],
        ...,
        [80, 89, 22],
        [80, 89, 22],
        [80, 89, 22]],

       [[80, 89, 22],
        [80, 89, 22],
        [80, 89, 22],
        ...,
        [80, 89, 22],
        [80, 89, 22],
        [80, 89, 22]]], dtype=uint8)

In [147]:
class StateTransform():
    def __init__(self, size=80, use_grayscale=True):
        self.use_grayscale= use_grayscale
        self.size = size

    def transform(self, state):
        state = np.transpose(state, (2,0,1))
        state = torch.tensor(state.copy(), dtype=torch.float32)

        if self.use_grayscale:
            transforms = T.Grayscale()
            state = transforms(state)
        
        transforms = T.Compose(
            [T.Resize((self.size, self.size)), T.Normalize(0,255)]
        )
        state = transforms(state)
        
        return state

In [148]:
class RainbowDQN(nn.Module):
    # distributional dqn + noisyNet
    def __init__(self, w, h, num_actions, num_atoms, Vmin, Vmax, device):
        super(RainbowDQN, self).__init__()
        
        self.num_actions = num_actions
        self.num_atoms = num_atoms
        self.Vmin = Vmin
        self.Vmax = Vmax
        self.device=device

        self.conv1 = nn.Conv2d(1,16,kernel_size=5,stride=2)
        self.bn1 = nn.BatchNorm2d(16)
        self.conv2 = nn.Conv2d(16,32,kernel_size=5, stride=2)
        self.bn2 = nn.BatchNorm2d(32)
        self.conv3 = nn.Conv2d(32,32,kernel_size=5,stride=2)
        self.bn3 = nn.BatchNorm2d(32)

        def conv_size(size, kernel_size=5, stride =2):
            return (size-(kernel_size-1)-1)//stride + 1
        convw = conv_size(conv_size((conv_size(w))))
        convh = conv_size(conv_size((conv_size(h))))
        linear_input_size = convw * convh * 32

        #dueling dqn and noisyNet
        self.noisy_value1= NoisyLinear(linear_input_size, 64, self.device)
        self.noisy_value2 = NoisyLinear(64, self.num_atoms,device)

        self.noisy_adv1 = NoisyLinear(linear_input_size, 64,  self.device)
        self.noisy_adv2 = NoisyLinear(64, self.num_atoms*self.num_actions, device)

    def forward(self, x):
        batch_size = x.size(0)

        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))

        value = F.relu(self.noisy_value1(x.view(x.size(0),-1)))
        value = self.noisy_value2(value)

        adv = F.relu(self.noisy_adv1(x.view(x.size(0),-1)))
        adv = self.noisy_adv2(adv)

        value = value.view(batch_size, 1, self.num_atoms)
        adv = adv.view(batch_size, self.num_actions, self.num_atoms)

        #dueling dqn
        x = value + adv - adv.mean(1, keepdim=True)
        x = F.softmax(x.view(-1, self.num_atoms)).view(-1, self.num_actions, self.num_atoms)

        return x
    
    def reset_noise(self):
        self.noisy_value1.reset_noise()
        self.noisy_value2.reset_noise()
        self.noisy_adv1.reset_noise()
        self.noisy_adv2.reset_noise()

    def act(self, state):
        state= torch.FloatTensor(state).to(device=device)
        with torch.no_grad():
            dist = self.forward(state)
            dist = dist*torch.linspace(self.Vmin, self.Vmax, self.num_atoms)
            #원래 q 값은 분포의 mean임.
            #하지만 이 값이 제일 큰 action을 취하는 것은 sum의 최댓값을 구하기만 해도 됨.
            action = dist.sum(2).max(1)[1].numpy()[0]
        return action
        

 


In [149]:
num_atoms = 51
Vmin = -10
Vmax = 10

w=80
h=80
train_model = RainbowDQN(w, h, env.action_space.n, num_atoms, Vmin, Vmax, device)
target_model = RainbowDQN(w, h, env.action_space.n, num_atoms, Vmin, Vmax, device)

train_model.to(device=device)
target_model = target_model.to(device=device)

optimizer = optim.Adam(train_model.parameters(), 0.001)

replay_buffer = ReplayBuffer(10000)

In [150]:
def update_target(train_model, target_model):
    target_model.load_state_dict(train_model.state_dict())

update_target(train_model, target_model)

In [151]:
def projection_distribution(next_state, rewards, dones):
    batch_size = next_state.size(0)

    delta_z = float(Vmax-Vmin) / (num_atoms-1)
    support = torch.linspace(Vmin, Vmax, num_atoms)

    #q(s,a) = sigma(z*p) 이기 때문에, support를 곱한다. z는 support.
    next_dist = target_model(next_state) * support
    next_action = next_dist.sum(2).max(1)[1]

    #batch, 1, 1, 1? => batch, 1, num_supprot만큼의 action?
    next_action = next_action.unsqueeze(1).unsqueeze(1).expand(next_dist.size(0), 1, next_dist.size(2))

    next_dist = next_dist.gather(1, next_action).squeeze(1)

    rewards = rewards.unsqueeze(1).expand_as(next_dist)
    dones = dones.unsqueeze(1).expand_as(next_dist)
    support = support.unsqueeze(0).expand_as(next_dist)
    
    Tz = rewards + (1-dones)*0.99*support
    Tz = Tz.clamp(min=Vmin, max=Vmax)

    b = (Tz - Vmin)/delta_z
    l = b.floor().long()#내림
    u = b.ceil().long() #올림

    offset = torch.linspace(0, (batch_size-1) * num_atoms, batch_size).long().unsqueeze(1).expand(batch_size, num_atoms)

    proj_dist = torch.zeros(next_dist.size())    
    proj_dist.view(-1).index_add_(0, (l + offset).view(-1), (next_dist * (u.float() - b)).view(-1))
    proj_dist.view(-1).index_add_(0, (u + offset).view(-1), (next_dist * (b - l.float())).view(-1))

    return proj_dist
    

In [152]:
def compute_td_loss(batch_size):
    state, action, reward, next_state, done= replay_buffer.sample(batch_size)

    state = torch.Tensor(np.float32(state)).to(device)
    next_state = torch.Tensor(np.float32(state)).to(device)
    action = torch.from_numpy(action).to(device)
    action = action.type(torch.LongTensor)
    reward = torch.FloatTensor(reward)
    done = torch.FloatTensor(np.float32(done))

    proj_dist = projection_distribution(next_state, reward, done)

    dist = train_model(state)
    action = action.unsqueeze(1).unsqueeze(1).expand(batch_size, 1, num_atoms)
    dist = dist.gather(1, action).squeeze(1)
    dist.data.clamp_(0.01, 0.99)

    loss = -(torch.tensor(proj_dist, device=device)*dist.log()).sum(1)
    loss = loss.mean()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    train_model.reset_noise()
    target_model.reset_noise()

    return loss  


    
    

In [153]:
def plot(frame_idx, rewards, losses):
    clear_output(figsize=(20,5))
    plt.figure(figsize=(20,5))
    plt.subplot(131)
    plt.title('frame %s. reward: %s' % (frame_idx, np.mean(rewards[-10:])))
    plt.plot(rewards)
    plt.subplot(132)
    plt.title('loss')
    plt.plot(losses)
    plt.show()

In [154]:
num_frames = 15000
batch_size = 32
gamma      = 0.99

losses = []
all_rewards = []
episode_reward = 0

state = env.reset()
state_transform = StateTransform()

state= state_transform.transform(state)
for frame_idx in range(1, num_frames + 1):
    action = train_model.act(state.unsqueeze(0))
    
    next_state, reward, done, _ = env.step(action)
    next_state = state_transform.transform(next_state)
    replay_buffer.push(state, action, reward, next_state, done)
    
    state = next_state
    episode_reward += reward
    
    if done:
        state = env.reset()
        state = state_transform.transform(state)
        all_rewards.append(episode_reward)
        episode_reward = 0    
    
    if len(replay_buffer) < batch_size:
        loss = compute_td_loss(batch_size)
        losses.append(loss.data[0])
        
    if frame_idx % 200 == 0:
        plot(frame_idx, all_rewards, losses)
        
    if frame_idx % 1000 == 0:
        update_target(train_model, target_model)

  x = F.softmax(x.view(-1, self.num_atoms)).view(-1, self.num_actions, self.num_atoms)
  loss = -(torch.tensor(proj_dist, device=device)*dist.log()).sum(1)


IndexError: invalid index of a 0-dim tensor. Use `tensor.item()` in Python or `tensor.item<T>()` in C++ to convert a 0-dim tensor to a number