In [26]:
import gym, os
from itertools import count
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical, Multinomial
import torchvision.transforms as T
from wrappers import *
from torch.autograd import Variable

import random
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from PIL import Image
import statistics



device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env = gym.make("Pong-v0")
env = make_env(env)

state_space = env.observation_space.shape[0]
print(env.observation_space.shape)
action_space = env.action_space.n
print(action_space)

print(device)

(84, 84, 4)
6
cpu


Helper function

In [27]:
resize = T.Compose([T.ToPILImage(),
                    T.Resize(40, interpolation=Image.CUBIC),
                    T.ToTensor()])

EPS_START = 1.0
EPS_END = 0.01
EPS_DECAY = 3000

def compute_returns(next_value, rewards, masks, gamma=0.99):
    R = next_value
    returns = []
    for step in reversed(range(len(rewards))):
        R = rewards[step] + gamma * R * masks[step]
        returns.insert(0, R)
    return returns

def process_rollout(gamma = 0.99, lambd = 1.0, num_workers = 1):
    _, _, _, _, last_values = steps[-1]
    returns = last_values.data

    advantages = torch.zeros(num_workers, 1)
    #if cuda: advantages = advantages.cuda()

    out = [None] * (len(steps) - 1)

    # run Generalized Advantage Estimation, calculate returns, advantages
    for t in reversed(range(len(steps) - 1)):
        rewards, masks, actions, policies, values = steps[t]
        _, _, _, _, next_values = steps[t + 1]

        returns = rewards + returns * gamma * masks

        deltas = rewards + next_values.data * gamma * masks - values.data
        advantages = advantages * gamma * lambd * masks + deltas

        out[t] = actions, policies, values, returns, advantages

    # return data as batched Tensors, Variables
    return map(lambda x: torch.cat(x, 0), zip(*out))

def get_screen(x):
    state = np.array(x)
    state = state.transpose((2, 0, 1))
    state = torch.from_numpy(state)
    state = state.float() / 255
    return state.unsqueeze(0)

def ortho_weights(shape, scale=1.):
    """ PyTorch port of ortho_init from baselines.a2c.utils """
    shape = tuple(shape)

    if len(shape) == 2:
        flat_shape = shape[1], shape[0]
    elif len(shape) == 4:
        flat_shape = (np.prod(shape[1:]), shape[0])
    else:
        raise NotImplementedError

    a = np.random.normal(0., 1., flat_shape)
    u, _, v = np.linalg.svd(a, full_matrices=False)
    q = u if u.shape == flat_shape else v
    q = q.transpose().copy().reshape(shape)

    if len(shape) == 2:
        return torch.from_numpy((scale * q).astype(np.float32))
    if len(shape) == 4:
        return torch.from_numpy((scale * q[:, :shape[1], :shape[2]]).astype(np.float32))

def game_initializer(module):
    """ Parameter initializer for Atari models
    Initializes Linear, Conv2d, and LSTM weights.
    """
    classname = module.__class__.__name__

    if classname == 'Linear':
        module.weight.data = ortho_weights(module.weight.data.size(), scale=np.sqrt(2.))
        module.bias.data.zero_()

    elif classname == 'Conv2d':
        module.weight.data = ortho_weights(module.weight.data.size(), scale=np.sqrt(2.))
        module.bias.data.zero_()

    elif classname == 'LSTM':
        for name, param in module.named_parameters():
            if 'weight_ih' in name:
                param.data = ortho_weights(param.data.size(), scale=1.)
            if 'weight_hh' in name:
                param.data = ortho_weights(param.data.size(), scale=1.)
            if 'bias' in name:
                param.data.zero_()

Creating two seperate networks the actor and the critic

In [28]:
end_out = 64


class ActorCritic(nn.Module):
    def __init__(self, num_frames, outputs):
        super(ActorCritic, self).__init__()
        self.conv1 = nn.Conv2d(num_frames, 32, kernel_size=8, stride=4)
        # self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        # self.bn2 = nn.BatchNorm2d(64)
        self.conv3 = nn.Conv2d(64, end_out, kernel_size=3, stride=1)
        
        self.down = nn.Linear(7*7*64, 512)
        self.a = nn.Linear(512, outputs)
        self.c = nn.Linear(512, 1)
        
        self.apply(game_initializer)
        self.a.weight.data = ortho_weights(self.a.weight.size(), scale=.01)
        self.c.weight.data = ortho_weights(self.c.weight.size())

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = self.down(x.view(x.size(0), -1))
        a = self.a(x)
        c = self.c(x)
        return a, c

In [None]:
learn_rate = 1e-4
n_iters = 500000
avg_over = 50
val_coeff = 0.5
ent_coeff = 0.01

#init_screen = get_screen(env.reset())
#_, num_frames, screen_height, screen_width = init_screen.shape

# Get number of actions from gym action space
#n_actions = env.action_space.n

#actor_critic = ActorCritic(num_frames, n_actions).to(device)
#critic = Critic(num_frames, screen_height, screen_width, n_actions).to(device)

#optimizer = optim.Adam(actor_critic.parameters(), lr=learn_rate)
#optimizerC = optim.Adam(critic.parameters(), lr=learn_rate)

#score_avg = []
#ac_loss_list = []
#score_avg_all = []
#ac_loss_list_all = []


for iter in range(n_iters):
    state = get_screen(env.reset())
    #state = Variable(state)
    #log_probs = []
    steps = []
    #values = []
    #rewards = []
    #masks = []
    entropy = 0
    reward_sum = 0

    for i in count():
        #env.render()
        #state = torch.FloatTensor(state).to(device)
        logit, values = actor_critic(state)
        
        probs = F.softmax(logit)
        actions = probs.multinomial(1).data
        
        state, rewards, dones, _ = env.step(actions.cpu().numpy())
        reward_sum += rewards
        #masks = (1. - torch.from_numpy(np.array(dones, dtype=np.float32)))
        #rewards = torch.from_numpy(rewards)
        #rewards = torch.from_numpy(np.array(rewards, dtype=np.float32))
        rewards = (torch.tensor([rewards], dtype=torch.float, device=device)).unsqueeze(-1)
        masks = (torch.tensor([1-dones], dtype=torch.float, device=device)).unsqueeze(-1)
        
        
        state = get_screen(state)
        #state = Variable(state)
        
        #reward_sum += rewards
        
        steps.append((rewards, masks, actions, logit, values))
        if dones:
            #print('Iteration: {}, Score: {}'.format(iter, i))
            score_avg.append(reward_sum)
            reward_sum=0
            break
    
        
    
    #next_state = torch.FloatTensor(next_state).to(device)
    _, final_values = actor_critic(state)
    steps.append((None, None, None, None, final_values))
    actions, logit, values, returns, advantages = process_rollout()
   
    probs = F.softmax(logit)
    log_probs = F.log_softmax(logit)
    log_action_probs = log_probs.gather(1, actions)

    
    policy_loss = (-log_action_probs * advantages).sum()
    value_loss = (0.5*(values - returns) ** 2.).sum()
    entropy_loss = (log_probs * probs).sum()

    ac_loss = policy_loss + value_loss * val_coeff + entropy_loss * ent_coeff

    optimizer.zero_grad()
    ac_loss.backward()
    nn.utils.clip_grad_norm(actor_critic.parameters(), 40.)
    optimizer.step()
    
    ac_loss_list.append(ac_loss)
    #optimizer.zero_grad()
    if iter % avg_over == 0 and iter != 0:
        score_sum = 0
        loss_sum = 0
        for i in range(len(score_avg)):
            score_sum += score_avg[i]
            loss_sum += ac_loss_list[i]
        s = score_sum/avg_over
        l = loss_sum/avg_over
        print('Iteration: {}, Score avg: {}, Loss: {}'.format(iter+8750, s, l))
        score_avg_all.append(s)
        ac_loss_list_all.append(l)
        score_avg.clear()
        ac_loss_list.clear()

              
    



Iteration: 8800, Score avg: -33.24, Loss: 145.86846923828125
Iteration: 8850, Score avg: -20.44, Loss: 162.73435974121094
Iteration: 8900, Score avg: -20.6, Loss: 113.18492126464844
Iteration: 8950, Score avg: -20.5, Loss: 108.09701538085938
Iteration: 9000, Score avg: -20.46, Loss: 101.59829711914062
Iteration: 9050, Score avg: -20.64, Loss: 84.47756958007812
Iteration: 9100, Score avg: -20.78, Loss: 104.55838775634766
Iteration: 9150, Score avg: -20.5, Loss: 155.05332946777344
Iteration: 9200, Score avg: -20.64, Loss: 129.76495361328125
Iteration: 9250, Score avg: -20.62, Loss: 58.61138153076172
Iteration: 9300, Score avg: -20.5, Loss: 98.97052001953125
Iteration: 9350, Score avg: -20.46, Loss: 174.5591278076172
Iteration: 9400, Score avg: -20.32, Loss: 121.80577850341797
Iteration: 9450, Score avg: -20.54, Loss: 82.45608520507812
Iteration: 9500, Score avg: -20.42, Loss: 82.1850357055664
Iteration: 9550, Score avg: -20.54, Loss: 70.83219909667969
Iteration: 9600, Score avg: -20.44, 

Iteration: 15550, Score avg: -20.38, Loss: 41.77320861816406
Iteration: 15600, Score avg: -20.5, Loss: 124.51716613769531
Iteration: 15650, Score avg: -20.36, Loss: 208.81201171875
Iteration: 15700, Score avg: -20.4, Loss: 84.21087646484375
Iteration: 15750, Score avg: -20.3, Loss: 154.0518798828125
Iteration: 15800, Score avg: -20.3, Loss: 228.9610137939453
Iteration: 15850, Score avg: -20.46, Loss: 157.6998291015625
Iteration: 15900, Score avg: -20.26, Loss: 143.63604736328125
Iteration: 15950, Score avg: -20.26, Loss: 153.16795349121094
Iteration: 16000, Score avg: -20.36, Loss: 96.4077377319336
Iteration: 16050, Score avg: -20.5, Loss: 128.59188842773438
Iteration: 16100, Score avg: -20.4, Loss: 173.5692138671875
Iteration: 16150, Score avg: -20.36, Loss: 199.22564697265625
Iteration: 16200, Score avg: -20.58, Loss: 135.1899871826172
Iteration: 16250, Score avg: -20.38, Loss: 152.04737854003906
Iteration: 16300, Score avg: -20.3, Loss: 197.73941040039062
Iteration: 16350, Score avg

In [None]:
torch.save(actor_critic, 'results/actor.pkl')
plt.plot(score_avg)

In [None]:
plt.plot(ac_loss_list)

In [10]:
n_iter = 5
for iter in range(n_iter):
    state = get_screen(env.reset())
    log_probs = []
    values = []
    rewards = []
    masks = []
    entropy = 0
    env.reset()
    reward_sum = 0

    for i in count():
        env.render()
        logit, values = actor_critic(state)
        
        probs = F.softmax(logit)
        actions = probs.multinomial(1).data
        
        state, rewards, dones, _ = env.step(actions.cpu().numpy())
        reward_sum += rewards
        masks = (1. - torch.from_numpy(np.array(dones, dtype=np.float32))).unsqueeze(-1)
        #rewards = torch.from_numpy(rewards)
        #rewards = torch.from_numpy(np.array(rewards, dtype=np.float32)).unsqueeze(-1)
        #rewards.append(torch.tensor([reward], dtype=torch.float, device=device))
        #masks.append(torch.tensor([1-done], dtype=torch.float, device=device))
        
        
        state = get_screen(state)
        #state = Variable(state)
        
        #reward_sum += rewards
        
        steps.append((rewards, masks, actions, logit, values))
        if done:
            break
env.close()

  app.launch_new_instance()


NameError: name 'done' is not defined

In [None]:
torch.save(actor, 'results/actor.pkl')
torch.save(critic, 'results/critic.pkl')
env.close()