In [1]:
import os
import argparse
import gym
import numpy as np
from itertools import count

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.distributions import Categorical

In [3]:
is_cuda = torch.cuda.is_available()

In [4]:
args = {
    "gamma": 0.99,
    "decay_rate": 0.99,
    "learning_rate": 1e-4,
    "batch_size": 20,
    "seed": 87,
    "test": False
}

In [5]:
env = gym.make('Pong-v0')
env.seed(args["seed"])
torch.manual_seed(args["seed"])

<torch._C.Generator at 0x7f9186f9a4b0>

In [6]:
D = 80 * 80
test = args["test"]
if test ==True:
    render = True
else:
    render = False

In [7]:
def prepro(I):
    """ prepro 210x160x3 into 6400 """
    I = I[35:195]
    I = I[::2, ::2, 0]
    I[I == 144] = 0
    I[I == 109] = 0
    I[I != 0 ] = 1
    return I.astype(np.float).ravel()

In [8]:
class PGbaseline(nn.Module):
    def __init__(self, num_actions=2):
        super(PGbaseline, self).__init__()
        self.affine1 = nn.Linear(6400, 200)
        self.action_head = nn.Linear(200, num_actions) # action 1: static, action 2: move up, action 3: move down
        self.value_head = nn.Linear(200, 1)

        self.num_actions = num_actions
        self.saved_log_probs = []
        self.rewards = []

    def forward(self, x):
        x = F.relu(self.affine1(x))
        action_scores = self.action_head(x)
        state_values = self.value_head(x)
        return F.softmax(action_scores, dim=-1), state_values


    def select_action(self, x):
        x = Variable(torch.from_numpy(x).float().unsqueeze(0))
        if is_cuda: x = x.cuda()
        probs, state_value = self.forward(x)
        m = Categorical(probs)
        action = m.sample()

        self.saved_log_probs.append((m.log_prob(action), state_value))
        return action

In [9]:
# built policy network
policy = PGbaseline()
if is_cuda:
    policy.cuda()

In [10]:
# check & load pretrain model
if os.path.isfile('pgb_params.pkl'):
    print('Load PGbaseline Network parametets ...')
    if is_cuda:
        policy.load_state_dict(torch.load('pgb_params.pkl'))
    else:
        policy.load_state_dict(torch.load('pgb_params.pkl', map_location=lambda storage, loc: storage))

In [11]:
# construct a optimal function
optimizer = optim.RMSprop(policy.parameters(), lr=args["learning_rate"], weight_decay=args["decay_rate"])

In [12]:
def finish_episode():
    R = 0
    policy_loss = []
    value_loss = []
    rewards = []
    for r in policy.rewards[::-1]:
        R = r + args["gamma"] * R
        rewards.insert(0, R)
    # turn rewards to pytorch tensor and standardize
    rewards = torch.Tensor(rewards)
    rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-6)
    if is_cuda: rewards = rewards.cuda()
    for (log_prob, value), reward in zip(policy.saved_log_probs, rewards):
        advantage = reward - value
        policy_loss.append(- log_prob * advantage)         # policy gradient
        value_loss.append(F.smooth_l1_loss(value, reward)) # value function approximation
    optimizer.zero_grad()
    policy_loss = torch.stack(policy_loss).sum()
    value_loss = torch.stack(value_loss).sum()
    loss = policy_loss + value_loss
    if is_cuda:
        loss.cuda()
    loss.backward()
    optimizer.step()

    # clean rewards and saved_actions
    del policy.rewards[:]
    del policy.saved_log_probs[:]

In [None]:
# Main loop
running_reward = None
reward_sum = 0
for i_episode in count(1):
    state = env.reset()
    prev_x = None
    for t in range(10000):
        if render: env.render()
        cur_x = prepro(state)
        x = cur_x - prev_x if prev_x is not None else np.zeros(D)
        prev_x = cur_x
        action = policy.select_action(x)
        action_env = action + 2
        state, reward, done, _ = env.step(action_env)
        reward_sum += reward

        policy.rewards.append(reward)
        if done:
            # tracking log
            running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
            print('Policy Gradient with Baseline ep %03d done. reward: %f. reward running mean: %f' % (i_episode, reward_sum, running_reward))
            reward_sum = 0
            break


    # use policy gradient update model weights
    if i_episode % args["batch_size"] == 0 and test == False:
        finish_episode()

    # Save model in every 50 episode
    if i_episode % 50 == 0 and test == False:
        print('ep %d: model saving...' % (i_episode))
        torch.save(policy.state_dict(), 'pgb_params.pkl')

Policy Gradient with Baseline ep 001 done. reward: -21.000000. reward running mean: -21.000000
Policy Gradient with Baseline ep 002 done. reward: -21.000000. reward running mean: -21.000000
Policy Gradient with Baseline ep 003 done. reward: -21.000000. reward running mean: -21.000000
Policy Gradient with Baseline ep 004 done. reward: -18.000000. reward running mean: -20.970000
Policy Gradient with Baseline ep 005 done. reward: -21.000000. reward running mean: -20.970300
Policy Gradient with Baseline ep 006 done. reward: -20.000000. reward running mean: -20.960597
Policy Gradient with Baseline ep 007 done. reward: -19.000000. reward running mean: -20.940991
Policy Gradient with Baseline ep 008 done. reward: -21.000000. reward running mean: -20.941581
Policy Gradient with Baseline ep 009 done. reward: -20.000000. reward running mean: -20.932165
Policy Gradient with Baseline ep 010 done. reward: -21.000000. reward running mean: -20.932844
Policy Gradient with Baseline ep 011 done. reward:

  app.launch_new_instance()


Policy Gradient with Baseline ep 021 done. reward: -20.000000. reward running mean: -20.919972
Policy Gradient with Baseline ep 022 done. reward: -21.000000. reward running mean: -20.920773
Policy Gradient with Baseline ep 023 done. reward: -21.000000. reward running mean: -20.921565
Policy Gradient with Baseline ep 024 done. reward: -21.000000. reward running mean: -20.922349
Policy Gradient with Baseline ep 025 done. reward: -21.000000. reward running mean: -20.923126
Policy Gradient with Baseline ep 026 done. reward: -20.000000. reward running mean: -20.913895
Policy Gradient with Baseline ep 027 done. reward: -21.000000. reward running mean: -20.914756
Policy Gradient with Baseline ep 028 done. reward: -19.000000. reward running mean: -20.895608
Policy Gradient with Baseline ep 029 done. reward: -20.000000. reward running mean: -20.886652
Policy Gradient with Baseline ep 030 done. reward: -19.000000. reward running mean: -20.867785
Policy Gradient with Baseline ep 031 done. reward: