In [1]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import tensorflow as tf
import numpy as np
import gym
import gym.spaces
from pylab import *
import scipy.misc
import time
from collections import namedtuple, deque
import time
import os.path
import os

from train_ops import create_train_ops
from utils import *
import utils

In [3]:
# Discount factor applied when propagating returns backwards in Worker.run_step.
G = 0.99

# Size of the policy head; the network indexes actions 0..N_ACTIONS-1.
N_ACTIONS = 3
# Action ids actually sent to the environment: 1..N_ACTIONS
# (Worker.run_step subtracts 1 to map them back onto network indices).
ACTIONS = np.arange(1, N_ACTIONS + 1)

# Number of consecutive observations kept in each worker's frame stack.
N_FRAMES_STACKED = 4
# Upper bound (inclusive) on the random number of no-op steps taken after a reset.
N_MAX_NOOPS = 30

## Network setup

In [4]:
# Bundle of graph handles produced by create_network: the three input
# placeholders (s, a, r), the policy/value outputs, both losses and the two
# merged summary ops.
Network = namedtuple(
    'Network',
    [
        's',
        'a',
        'r',
        'a_softmax',
        'graph_v',
        'policy_loss',
        'value_loss',
        'summaries_train',
        'summaries_test',
    ])

In [5]:
def create_network(scope):
    """Build a policy/value network inside variable scope `scope`.

    The graph consists of three conv layers and a 512-unit dense layer shared
    by two heads: a softmax over N_ACTIONS actions and a scalar state value.

    Args:
        scope: name of the tf.variable_scope that will own all variables.

    Returns:
        A Network namedtuple holding the input placeholders (states of shape
        [None, 80, 80, 4], int64 action indices, float32 returns), the softmax
        output, the value output, the policy and value losses, and the merged
        train/test summary ops.
    """
    # (filters, kernel_size, strides) for each conv layer, in creation order.
    conv_specs = [(32, 8, 4), (64, 4, 2), (64, 3, 1)]

    with tf.variable_scope(scope):
        states_ph = tf.placeholder(tf.float32, [None, 80, 80, 4])
        actions_ph = tf.placeholder(tf.int64, [None])
        returns_ph = tf.placeholder(tf.float32, [None])

        hidden = states_ph
        for n_filters, kernel, stride in conv_specs:
            hidden = tf.layers.conv2d(
                inputs=hidden,
                filters=n_filters,
                kernel_size=kernel,
                strides=stride,
                activation=tf.nn.relu)

        # Flatten everything except the batch dimension.
        flat_size = 1
        for dim in hidden.shape[1:]:
            flat_size *= int(dim)
        hidden = tf.reshape(hidden, [-1, flat_size])

        hidden = tf.layers.dense(inputs=hidden, units=512, activation=tf.nn.relu)

        # Policy head.
        logits = tf.layers.dense(inputs=hidden, units=N_ACTIONS, activation=None)
        action_probs = tf.nn.softmax(logits)

        # Value head: drop the trailing unit dimension to get shape [batch].
        state_values = tf.layers.dense(inputs=hidden, units=1, activation=None)[:, 0]

        # Probability the policy assigned to the action that was actually taken,
        # selected with a per-action equality mask.
        taken_prob = sum(
            tf.cast(tf.equal(actions_ph, idx), tf.float32) * action_probs[:, idx]
            for idx in range(N_ACTIONS))
        # Negative log-probability weighted by the return: minimising it raises
        # the probability of actions with positive return. The 1e-7 avoids log(0).
        neg_log_prob = -1 * tf.log(taken_prob + 1e-7)
        policy_loss = tf.reduce_mean(neg_log_prob * returns_ph)

        value_loss = tf.reduce_mean((returns_ph - state_values) ** 2)

        # The same two losses are exposed under separate train/test tags.
        policy_train_summ = tf.summary.scalar('policy_loss_train', policy_loss)
        value_train_summ = tf.summary.scalar('value_loss_train', value_loss)
        policy_test_summ = tf.summary.scalar('policy_loss_test', policy_loss)
        value_test_summ = tf.summary.scalar('value_loss_test', value_loss)
        train_summaries = tf.summary.merge([policy_train_summ, value_train_summ])
        test_summaries = tf.summary.merge([policy_test_summ, value_test_summ])

        return Network(
            s=states_ph,
            a=actions_ph,
            r=returns_ph,
            a_softmax=action_probs,
            graph_v=state_values,
            policy_loss=policy_loss,
            value_loss=value_loss,
            summaries_train=train_summaries,
            summaries_test=test_summaries)

In [6]:
def list_set(l, i, val):
    """Append `val` to `l`, asserting that it ends up at index `i`.

    Guards against the parallel per-step lists drifting out of sync: the
    caller states which index it expects to fill, and the append only goes
    ahead if that is the next free slot.
    """
    next_free_index = len(l)
    assert next_free_index == i
    l.append(val)

In [15]:
class Worker:
    """One actor-learner: owns an environment and a local copy of the network.

    Each run_step() call copies the 'global' network into the worker's scope,
    plays up to `t_max` environment steps, then walks the rollout backwards
    accumulating gradients per transition and finally applies them through
    the ops returned by create_train_ops.

    Relies on module-level names defined elsewhere in the notebook: `sess`
    (the tf.Session), the constants G / ACTIONS / N_FRAMES_STACKED /
    N_MAX_NOOPS, and the helpers create_network, list_set, create_train_ops,
    EnvWrapper, prepro2 and copy_network.
    """

    def __init__(self, worker_n, env_name, summary_writer):
        """Create the environment, the worker network and its training ops.

        Args:
            worker_n: integer index; the variable scope is "worker_<n>".
            env_name: gym environment id passed to gym.make.
            summary_writer: writer whose add_summary(summary, step) is used
                for reward (and optionally loss) summaries.
        """
        self.env = EnvWrapper(gym.make(env_name), prepro2=prepro2, frameskip=4)

        worker_scope = "worker_%d" % worker_n
        self.network = create_network(worker_scope)
        self.summary_writer = summary_writer
        self.scope = worker_scope

        # Variable + scalar summary used to publish the smoothed episode reward.
        self.reward_var = tf.Variable(0.0)
        self.smoothed_reward = None
        self.reward_summary = tf.summary.scalar('reward', self.reward_var)

        # TODO: do these need to be separate?
        policy_optimizer = tf.train.AdamOptimizer(learning_rate=0.0005)
        value_optimizer = tf.train.AdamOptimizer(learning_rate=0.0005)

        # NOTE(review): this op updates the worker's own weights directly and is
        # run for every transition in run_step(), in addition to the gradient
        # accumulation below. It looks like a debugging leftover - confirm
        # whether it should stay before changing training behaviour.
        self.train_op = policy_optimizer.minimize(self.network.policy_loss)

        self.update_policy_gradients, self.apply_policy_gradients, self.zero_policy_gradients, self.grad_bufs_policy = \
            create_train_ops(self.network.policy_loss,
                             policy_optimizer,
                             update_scope=worker_scope,
                             apply_scope='global')

        self.update_value_gradients, self.apply_value_gradients, self.zero_value_gradients, self.grad_bufs_value = \
            create_train_ops(self.network.value_loss,
                             value_optimizer,
                             update_scope=worker_scope,
                             apply_scope='global')

        self.frame_stack = deque(maxlen=N_FRAMES_STACKED)
        self.reset_env()

        self.t_max = 10000
        self.steps = 0
        self.lifetime_experience = []
        self.episode_rewards = []

        self.render = False

    def reset_env(self):
        """Reset the environment and refill the frame stack with no-op frames.

        Takes a random number (0..N_MAX_NOOPS) of action-0 steps, then keeps
        stepping until the stack holds N_FRAMES_STACKED observations.
        """
        self.env.reset()
        # Drop frames left over from the previous episode. Without this, a reset
        # that draws fewer than N_FRAMES_STACKED no-ops would leave stale frames
        # in the (already full) stack and skip the top-up loop below.
        self.frame_stack.clear()
        n_noops = np.random.randint(low=0, high=N_MAX_NOOPS+1)
        print("%d no-ops..." % n_noops)
        for _ in range(n_noops):
            o, _, _, _ = self.env.step(0)
            self.frame_stack.append(o)
        while len(self.frame_stack) < N_FRAMES_STACKED:
            print("One more...")
            o, _, _, _ = self.env.step(0)
            self.frame_stack.append(o)
        print("No-ops done")

    def append_to_lifetime_experience(self, feed_dict):
        """Record the (states, actions, returns) of a training feed dict."""
        states = feed_dict[self.network.s]
        actions = feed_dict[self.network.a]
        r = feed_dict[self.network.r]
        self.lifetime_experience.append((states, actions, r))

    def lifetime_experience_to_feed_dict(self):
        """Flatten all recorded experience into a single feed dict."""
        states = []
        actions = []
        r = []
        for tup_s, tup_a, tup_r in self.lifetime_experience:
            states.extend(tup_s)
            actions.extend(tup_a)
            r.extend(tup_r)
        feed_dict = {self.network.s: states,
                     self.network.a: actions,
                     self.network.r: r}
        return feed_dict

    def run_summaries(self, feed_dict, summary_op=None):
        """Evaluate a summary op on `feed_dict` and write it at the current step.

        Args:
            feed_dict: feed for the network placeholders.
            summary_op: op to evaluate; defaults to the network's
                `summaries_train` (Network has no plain `summaries` field).
        """
        if summary_op is None:
            summary_op = self.network.summaries_train
        summaries = sess.run(summary_op, feed_dict)
        self.summary_writer.add_summary(summaries, self.steps)

    def log_rewards(self):
        """Print and publish the exponentially smoothed episode reward sum."""
        reward_sum = sum(self.episode_rewards)
        print("Reward sum was", reward_sum)
        if self.smoothed_reward is None:
            self.smoothed_reward = reward_sum
        else:
            self.smoothed_reward = self.smoothed_reward * 0.99 + reward_sum * 0.01
        print("Smoothed reward sum is %.1f" % self.smoothed_reward)
        # Variable.load writes the value without creating a new op; calling
        # tf.assign here would add one more op to the graph on every episode.
        self.reward_var.load(self.smoothed_reward, sess)
        summ = sess.run(self.reward_summary)
        self.summary_writer.add_summary(summ, self.steps)

    def sync_network(self):
        """Copy the 'global' scope's network into this worker's scope."""
        copy_network(sess,
                     from_scope='global',
                     to_scope=self.scope)

    def run_step(self):
        """Play one rollout, train on it, and report whether the episode ended.

        Returns:
            True if the environment signalled `done` during the rollout,
            False if the rollout stopped because `t_max` steps were reached.
        """
        states = []
        actions = []
        rewards = []
        i = 0

        sess.run([self.zero_policy_gradients,
                  self.zero_value_gradients])
        self.sync_network()

        # Snapshot the stack. Storing the deque itself would alias it, and the
        # appends below would silently turn states[0] into the latest frames.
        list_set(states, i, np.copy(self.frame_stack))

        done = False
        while not done and i < self.t_max:
            # Move the stack axis last to match the network's state placeholder.
            feed_dict = {self.network.s: [np.moveaxis(self.frame_stack, source=0, destination=-1)]}
            a_p = sess.run(self.network.a_softmax, feed_dict=feed_dict)[0]
            a = np.random.choice(ACTIONS, p=a_p)
            list_set(actions, i, a)

            o, r, done, _ = self.env.step(a)
            if self.render:
                self.env.render()

            if r != 0:
                print("Got reward", r)
            self.frame_stack.append(o)
            self.episode_rewards.append(r)
            list_set(rewards, i, r)
            list_set(states, i + 1, np.copy(self.frame_stack))

            i += 1

        if done:
            print("Episode done!")
            r = 0
        else:
            # We're not at the end of an episode, so we have to estimate
            # the value of the current state using the value network
            feed_dict = {self.network.s: [np.moveaxis(states[i], source=0, destination=-1)]} # the last state
            r = sess.run(self.network.graph_v, feed_dict=feed_dict)[0]

        # i - 1 to 0
        # (Why start from i - 1, rather than i?
        #  So that we miss out the last state.)
        for j in reversed(range(i)):
            if rewards[j] != 0:
                # A non-zero reward restarts the return at that reward.
                r = rewards[j]
            else:
                r = rewards[j] + G * r
            feed_dict = {self.network.s: [np.moveaxis(states[j], source=0, destination=-1)],
                         self.network.a: [actions[j] - 1], # map from possible actions (1, 2, 3) -> (0, 1, 2)
                         self.network.r: [r]}
            sess.run(self.train_op, feed_dict)
            sess.run([self.update_policy_gradients,
                      self.update_value_gradients],
                      feed_dict)
        sess.run([self.apply_policy_gradients,
                  self.apply_value_gradients])
        sess.run([self.zero_policy_gradients,
                  self.zero_value_gradients])

        if done:
            self.log_rewards()
            self.episode_rewards = []

        self.steps += 1

        return done

In [16]:
# Start from an empty default graph so that re-running the cells below does
# not pile networks on top of a previous run's graph. `sess` is the
# module-level session that the Worker methods use directly.
tf.reset_default_graph()
sess = tf.Session()

In [17]:
# Shared network under the 'global' scope: workers copy from this scope in
# Worker.sync_network and pass it as `apply_scope` to create_train_ops.
global_network = create_network('global')

In [18]:
# One summary directory per run, named after the current Unix timestamp.
dirname = 'summaries/' + str(int(time.time()))
# exist_ok: re-running this cell within the same second reuses the directory
# instead of raising FileExistsError.
os.makedirs(dirname, exist_ok=True)
summary_writer = tf.summary.FileWriter(dirname, flush_secs=1)

In [19]:
# Build the worker pool (currently a single worker); every worker shares the
# same summary writer.
workers = [
    Worker(worker_index, 'PongNoFrameskip-v4', summary_writer)
    for worker_index in range(1)
]

[2017-08-16 15:07:13,182] Making new env: PongNoFrameskip-v4


22 no-ops...
No-ops done


In [20]:
# Initialise all variables created so far; this must run after the global
# network and every Worker (with its optimizers) have been constructed.
sess.run(tf.global_variables_initializer())

In [21]:
# Make the first worker call env.render() after every environment step
# (checked inside Worker.run_step).
workers[0].render = True

In [22]:
# Train forever: keep running rollouts until one of them finishes an episode,
# then reset the environment and start the next episode. Stop by interrupting
# the kernel.
while True:
    done = False
    while not done:
        done = workers[0].run_step()
    workers[0].reset_env()

Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Episode done!


KeyboardInterrupt: 