In [1]:
import gym, os
from itertools import count
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical
import torchvision.transforms as T
from wrappers import *

import random
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from PIL import Image



device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env = gym.make("PongNoFrameskip-v4")
env = make_env(env)

state_space = env.observation_space.shape[0]
print(env.observation_space.shape)
action_space = env.action_space.n
print(action_space)

print(device)

(84, 84, 4)
6
cpu


Helper function

In [2]:
resize = T.Compose([T.ToPILImage(),
                    T.Resize(40, interpolation=Image.CUBIC),
                    T.ToTensor()])

EPS_START = 1.0
EPS_END = 0.01
EPS_DECAY = 3000

def compute_returns(next_value, rewards, masks, gamma=0.99):
    R = next_value
    returns = []
    for step in reversed(range(len(rewards))):
        R = rewards[step] + gamma * R * masks[step]
        returns.insert(0, R)
    return returns

def get_screen(x):
    state = np.array(x)
    state = state.transpose((2, 0, 1))
    state = torch.from_numpy(state)
    return state.unsqueeze(0)

def select_action(state):
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1
    if sample > eps_threshold:
        with torch.no_grad():
            # t.max(1) will return largest column value of each row.
            # second column on max result is index of where max element was
            # found, so we pick action with the larger expected reward.
            return policy_net(state).max(1)[1].view(1, 1)
    else:
        return torch.tensor([[random.randrange(n_actions)]], device=device, dtype=torch.long)

Creating two seperate networks the actor and the critic

In [3]:
end_out = 64


class ActorCritic(nn.Module):
    def __init__(self, num_frames, h, w, outputs):
        super(ActorCritic, self).__init__()
        self.conv1 = nn.Conv2d(num_frames, 32, kernel_size=5, stride=2)
        # self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=5, stride=2)
        # self.bn2 = nn.BatchNorm2d(64)
        self.conv3 = nn.Conv2d(64, end_out, kernel_size=5, stride=2)
        
        # Number of Linear input connections depends on output of conv2d layers
        # and therefore the input image size, so compute it.
        def conv2d_size_out(size, kernel_size = 5, stride = 2):
            return (size - (kernel_size - 1) - 1) // stride  + 1
        convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w)))
        convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h)))
        linear_input_size = convw * convh * end_out
        self.down = nn.Linear(linear_input_size, 512)
        self.a = nn.Linear(512, outputs)
        self.c = nn.Linear(512, 1)

    # Called with either one element to determine next action, or a batch
    # during optimization. Returns tensor([[left0exp,right0exp]...]).
    def forward(self, x):
        x = x.float() / 255
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = self.down(x.view(x.size(0), -1))
        a = self.a(x)
        c = x = self.c(x)
        
        a = Categorical(F.softmax(a, dim=-1))
        return a, c

In [4]:
class Critic(nn.Module):
    def __init__(self, num_frames, h, w, outputs):
        super(Critic, self).__init__()
        self.conv1 = nn.Conv2d(num_frames, 8, kernel_size=5, stride=2)
        self.bn1 = nn.BatchNorm2d(8)
        self.conv2 = nn.Conv2d(8, 16, kernel_size=5, stride=2)
        self.bn2 = nn.BatchNorm2d(16)
        self.conv3 = nn.Conv2d(16, end_out, kernel_size=5, stride=2)
        self.bn3 = nn.BatchNorm2d(end_out)
        
        # Number of Linear input connections depends on output of conv2d layers
        # and therefore the input image size, so compute it.
        def conv2d_size_out(size, kernel_size = 5, stride = 2):
            return (size - (kernel_size - 1) - 1) // stride  + 1
        convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w)))
        convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h)))
        linear_input_size = convw * convh * end_out
        self.out = nn.Linear(linear_input_size, 1)

    # Called with either one element to determine next action, or a batch
    # during optimization. Returns tensor([[left0exp,right0exp]...]).
    def forward(self, x):
        #x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.conv1(x))
        #x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.conv2(x))
        #x = F.relu(self.bn3(self.conv3(x)))
        x = F.relu(self.conv3(x))
        x = self.out(x.view(x.size(0), -1))
        return x

In [5]:
learn_rate = 1e-4
n_iters = 5000
avg_over = 1
val_loss_coeff = 1
ent_loss_coeff = 0.01

init_screen = get_screen(env.reset())
_, num_frames, screen_height, screen_width = init_screen.shape

# Get number of actions from gym action space
n_actions = env.action_space.n

actor_critic = ActorCritic(num_frames, screen_height, screen_width, n_actions).to(device)
critic = Critic(num_frames, screen_height, screen_width, n_actions).to(device)

optimizer = optim.Adam(actor_critic.parameters(), lr=learn_rate)
optimizerC = optim.Adam(critic.parameters(), lr=learn_rate)

score_avg = []
ac_loss_list = []


for iter in range(n_iters):
    state = get_screen(env.reset())
    log_probs = []
    values = []
    rewards = []
    masks = []
    entropy = 0
    env.reset()
    reward_sum = 0

    for i in count():
        #env.render()
        #state = torch.FloatTensor(state).to(device)
        dist, value = actor_critic(state)
        
        action = dist.sample()
        next_state, reward, done, _ = env.step(action.cpu().numpy())
        next_state = get_screen(next_state)
        
        reward_sum += reward
        log_prob = dist.log_prob(action).unsqueeze(0)
        entropy += dist.entropy().mean()

        log_probs.append(log_prob)
        values.append(value)
        rewards.append(torch.tensor([reward], dtype=torch.float, device=device))
        masks.append(torch.tensor([1-done], dtype=torch.float, device=device))

        state = next_state

        if done:
            #print('Iteration: {}, Score: {}'.format(iter, i))
            score_avg.append(reward_sum)
            break
    
        
    
    #next_state = torch.FloatTensor(next_state).to(device)
    _, next_value = actor_critic(next_state)
    returns = compute_returns(next_value, rewards, masks)

    log_probs = torch.cat(log_probs)
    returns = torch.cat(returns).detach()
    values = torch.cat(values)

    advantage = returns - values
    
    actor_loss = (-log_probs * advantage.detach()).mean()
    critic_loss = (advantage.pow(2)).mean()
    ent_loss = entropy

    ac_loss = actor_loss+val_loss_coeff*critic_loss+ent_loss_coeff*ent_loss
    print('Iteration: {}, Score avg: {}, Loss: {}'.format(iter, reward_sum, ac_loss))
    
    
    ac_loss_list.append(ac_loss)

    optimizer.zero_grad()
    ac_loss.backward()
    optimizer.step()

Iteration: 0, Score avg: -21.0, Loss: 12.059379577636719
Iteration: 1, Score avg: -20.0, Loss: 14.80235481262207
Iteration: 2, Score avg: -21.0, Loss: 13.2349214553833
Iteration: 3, Score avg: -21.0, Loss: 13.640836715698242
Iteration: 4, Score avg: -21.0, Loss: 14.33635425567627
Iteration: 5, Score avg: -20.0, Loss: 14.084345817565918
Iteration: 6, Score avg: -20.0, Loss: 14.342296600341797
Iteration: 7, Score avg: -19.0, Loss: 19.17304039001465
Iteration: 8, Score avg: -20.0, Loss: 19.73275375366211
Iteration: 9, Score avg: -21.0, Loss: 14.925650596618652
Iteration: 10, Score avg: -21.0, Loss: 15.513360977172852
Iteration: 11, Score avg: -21.0, Loss: 15.023296356201172
Iteration: 12, Score avg: -20.0, Loss: 15.294929504394531
Iteration: 13, Score avg: -20.0, Loss: 14.152772903442383
Iteration: 14, Score avg: -20.0, Loss: 12.161042213439941
Iteration: 15, Score avg: -20.0, Loss: 9.801424980163574
Iteration: 16, Score avg: -20.0, Loss: 7.49735689163208
Iteration: 17, Score avg: -21.0, 

KeyboardInterrupt: 

In [None]:
plt.plot(score_avg)

In [None]:
plt.plot(ac_loss_list)

In [None]:
n_iter = 5
for iter in range(n_iter):
    state = get_screen(env.reset())
    log_probs = []
    values = []
    rewards = []
    masks = []
    entropy = 0
    env.reset()
    reward_sum = 0

    for i in count():
        env.render()
        #state = torch.FloatTensor(state).to(device)
        dist, value = actor_critic(state)
        
        action = dist.sample()
        next_state, reward, done, _ = env.step(action.cpu().numpy())
        next_state = get_screen(next_state)
        
        reward_sum += reward
        log_prob = dist.log_prob(action).unsqueeze(0)
        entropy += dist.entropy().mean()

        log_probs.append(log_prob)
        values.append(value)
        rewards.append(torch.tensor([reward], dtype=torch.float, device=device))
        masks.append(torch.tensor([1-done], dtype=torch.float, device=device))

        state = next_state
        if done:
            break
env.close()

In [None]:
torch.save(actor, 'results/actor.pkl')
torch.save(critic, 'results/critic.pkl')
env.close()