In [1]:
import numpy as np
import tensorflow as tf
import gym
import logz
import scipy.signal
import os
import time
import inspect
from multiprocessing import Process

In [2]:
def build_mlp(
		input_placeholder, 
		output_size,
		scope, 
		n_layers=2, 
		size=64, 
		activation=tf.tanh,
		output_activation=None
		):
	#========================================================================================#
	#                           ----------SECTION 3----------
	# Network building
	#
	# Your code should make a feedforward neural network (also called a multilayer perceptron)
	# with 'n_layers' hidden layers of size 'size' units. 
	# 
	# The output layer should have size 'output_size' and activation 'output_activation'.
	#
	# Hint: use tf.layers.dense
	#========================================================================================#

	with tf.variable_scope(scope):
		# YOUR_CODE_HERE
		out = tf.layers.dense(input_placeholder, size, activation=activation, name="fcfirst")
		for i in range(n_layers - 2):
			out = tf.layers.dense(out, size, activation=activation, name = "fc" + str(i+1))
		out = tf.layers.dense(out, output_size, activation=output_activation, name = "fclast")
		return out
def pathlength(path):
	return len(path["reward"])

In [4]:
tf.reset_default_graph()


exp_name=''
env_name='CartPole-v0'
n_iter=100 
gamma=1.0
min_timesteps_per_batch=1000
max_path_length=None
learning_rate=5e-3 
reward_to_go=True
animate=False
logdir=None 
normalize_advantages=True
nn_baseline=False
seed=0
# network arguments
n_layers=1
size=32


start = time.time()

# Configure output directory for logging
"""
logz.configure_output_dir(logdir)

# Log experimental parameters
args = inspect.getargspec(train_PG)[0]
locals_ = locals()
params = {k: locals_[k] if k in locals_ else None for k in args}
logz.save_params(params)

# Set random seeds
tf.set_random_seed(seed)
np.random.seed(seed)
"""
# Make the gym environment
env = gym.make(env_name)

# Is this env continuous, or discrete?
discrete = isinstance(env.action_space, gym.spaces.Discrete)

# Maximum length for episodes
max_path_length = max_path_length or env.spec.max_episode_steps

#========================================================================================#
# Notes on notation:
# 
# Symbolic variables have the prefix sy_, to distinguish them from the numerical values
# that are computed later in the function
# 
# Prefixes and suffixes:
# ob - observation 
# ac - action
# _no - this tensor should have shape (batch size /n/, observation dim)
# _na - this tensor should have shape (batch size /n/, action dim)
# _n  - this tensor should have shape (batch size /n/)
# 
# Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
# is None
#========================================================================================#

# Observation and action sizes
ob_dim = env.observation_space.shape[0]
ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

#========================================================================================#
#                           ----------SECTION 4----------
# Placeholders
# 
# Need these for batch observations / actions / advantages in policy gradient loss function.
#========================================================================================#

sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32)
if discrete:
    sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) 
else:
    sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32) 

# Define a placeholder for advantages
sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32)


#========================================================================================#
#                           ----------SECTION 4----------
# Networks
# 
# Make symbolic operations for
#   1. Policy network outputs which describe the policy distribution.
#       a. For the discrete case, just logits for each action.
#
#       b. For the continuous case, the mean / log std of a Gaussian distribution over 
#          actions.
#
#      Hint: use the 'build_mlp' function you defined in utilities.
#
#      Note: these ops should be functions of the placeholder 'sy_ob_no'
#
#   2. Producing samples stochastically from the policy distribution.
#       a. For the discrete case, an op that takes in logits and produces actions.
#
#          Should have shape [None]
#
#       b. For the continuous case, use the reparameterization trick:
#          The output from a Gaussian distribution with mean 'mu' and std 'sigma' is
#
#               mu + sigma * z,         z ~ N(0, I)
#
#          This reduces the problem to just sampling z. (Hint: use tf.random_normal!)
#
#          Should have shape [None, ac_dim]
#
#      Note: these ops should be functions of the policy network output ops.
#
#   3. Computing the log probability of a set of actions that were actually taken, 
#      according to the policy.
#
#      Note: these ops should be functions of the placeholder 'sy_ac_na', and the 
#      policy network output ops.
#   
#========================================================================================#

if discrete:
    # YOUR_CODE_HERE
    sy_logits_na = build_mlp(sy_ob_no, ac_dim, "nn", size=size, n_layers=n_layers)
    print(tf.shape(sy_logits_na))
    sy_sampled_ac = tf.reshape(tf.multinomial(tf.nn.log_softmax(sy_logits_na), tf.shape(sy_ob_no)[0]), [tf.shape(sy_ob_no)[0]]) # Hint: Use the tf.multinomial op
    sy_logprob_n = tf.nn.softmax_cross_entropy_with_logits(labels=tf.one_hot(sy_ac_na, ac_dim), logits=sy_logits_na)

else:
    # YOUR_CODE_HERE
    sy_mean = build_mlp(sy_ob_no, ac_dim, "mean", size=size, n_layers=n_layers)
    sy_logstd = tf.log(tf.get_variable(name="stdev", shape=[ac_dim, ac_dim])) # logstd should just be a trainable variable, not a network output.
    sy_sampled_ac = tf.matmul(tf.random_normal([tf.shape(sy_ob_no)[0], ac_dim]), sy_logstd) + sy_mean
    sy_logprob_n = tf.nn.softmax_cross_entropy_with_logits(labels=sy_ac_na, logits=sy_sampled_ac)  # Hint: Use the log probability under a multivariate gaussian. //still incorrect, todo



#========================================================================================#
#                           ----------SECTION 4----------
# Loss Function and Training Operation
#========================================================================================#

loss = -1*tf.reduce_mean(sy_logprob_n*sy_adv_n) # Loss function that we'll differentiate to get the policy gradient.
update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)


#========================================================================================#
#                           ----------SECTION 5----------
# Optional Baseline
#========================================================================================#

if nn_baseline:
    baseline_prediction = tf.squeeze(build_mlp(
                            sy_ob_no, 
                            1, 
                            "nn_baseline",
                            n_layers=n_layers,
                            size=size))
    # Define placeholders for targets, a loss function and an update op for fitting a 
    # neural network baseline. These will be used to fit the neural network baseline. 
    # YOUR_CODE_HERE
    baseline_update_op = TODO


#========================================================================================#
# Tensorflow Engineering: Config, Session, Variable initialization
#========================================================================================#

tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) 

sess = tf.Session(config=tf_config)
sess.__enter__() # equivalent to `with sess:`
tf.global_variables_initializer().run() #pylint: disable=E1101



#========================================================================================#
# Training Loop
#========================================================================================#

total_timesteps = 0

for itr in range(n_iter):
    print("********** Iteration %i ************"%itr)

    # Collect paths until we have enough timesteps
    timesteps_this_batch = 0
    paths = []
    while True:
        ob = env.reset()
        obs, acs, rewards = [], [], []
        animate_this_episode=(len(paths)==0 and (itr % 10 == 0) and animate)
        steps = 0
        while True:
            if animate_this_episode:
                env.render()
                time.sleep(0.05)
            obs.append(ob)
            ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no : ob[None]})
            ac = ac[0]
            acs.append(ac)
            ob, rew, done, _ = env.step(ac)
            rewards.append(rew)
            steps += 1
            if done or steps > max_path_length:
                break
        path = {"observation" : np.array(obs), 
                "reward" : np.array(rewards), 
                "action" : np.array(acs)}
        paths.append(path)
        timesteps_this_batch += pathlength(path)
        if timesteps_this_batch > min_timesteps_per_batch:
            break
    total_timesteps += timesteps_this_batch

    # Build arrays for observation, action for the policy gradient update by concatenating 
    # across paths
    ob_no = np.concatenate([path["observation"] for path in paths])
    ac_na = np.concatenate([path["action"] for path in paths])

    #====================================================================================#
    #                           ----------SECTION 4----------
    # Computing Q-values
    #
    # Your code should construct numpy arrays for Q-values which will be used to compute
    # advantages (which will in turn be fed to the placeholder you defined above). 
    #
    # Recall that the expression for the policy gradient PG is
    #
    #       PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )]
    #
    # where 
    #
    #       tau=(s_0, a_0, ...) is a trajectory,
    #       Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
    #       and b_t is a baseline which may depend on s_t. 
    #
    # You will write code for two cases, controlled by the flag 'reward_to_go':
    #
    #   Case 1: trajectory-based PG 
    #
    #       (reward_to_go = False)
    #
    #       Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over 
    #       entire trajectory (regardless of which time step the Q-value should be for). 
    #
    #       For this case, the policy gradient estimator is
    #
    #           E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)]
    #
    #v       where
    #
    #           Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.
    #
    #       Thus, you should compute
    #
    #           Q_t = Ret(tau)
    #
    #   Case 2: reward-to-go PG 
    #
    #       (reward_to_go = True)
    #
    #       Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting
    #       from time step t. Thus, you should compute
    #
    #           Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
    #
    #
    # Store the Q-values for all timesteps and all trajectories in a variable 'q_n',
    # like the 'ob_no' and 'ac_na' above. 
    #
    #====================================================================================#

    # YOUR_CODE_HERE
    qs = []
    if reward_to_go:
        for l in range(len(paths)):
            rewards = paths[l]['reward']
            n_actions = rewards.shape[0]
            q = np.zeros((n_actions))
            for i in range(n_actions):
                for j in range(n_actions-i):
                    q[i] += rewards[i + j]*np.power(gamma, j)
            qs.append(q)
    else:
        for l in range(len(paths)):
            rewards = paths[l]['reward']
            n_actions = rewards.shape[0]
            q = np.zeros((n_actions))
            for i in range(n_actions):
                q += rewards[i]*np.power(gamma, i)
            qs.append(q)
    q_n = np.concatenate(qs)
    #====================================================================================#
    #                           ----------SECTION 5----------
    # Computing Baselines
    #====================================================================================#

    if nn_baseline:
        # If nn_baseline is True, use your neural network to predict reward-to-go
        # at each timestep for each trajectory, and save the result in a variable 'b_n'
        # like 'ob_no', 'ac_na', and 'q_n'.
        #
        # Hint #bl1: rescale the output from the nn_baseline to match the statistics
        # (mean and std) of the current or previous batch of Q-values. (Goes with Hint
        # #bl2 below.)

        b_n = TODO
        adv_n = q_n - b_n
    else:
        adv_n = q_n.copy()

    #====================================================================================#
    #                           ----------SECTION 4----------
    # Advantage Normalization
    #====================================================================================#

    if normalize_advantages:
        # On the next line, implement a trick which is known empirically to reduce variance
        # in policy gradient methods: normalize adv_n to have mean zero and std=1. 
        # YOUR_CODE_HERE
        mean = np.mean(adv_n)
        stdev = np.std(adv_n)
        adv_n = adv_n/stdev - mean


    #====================================================================================#
    #                           ----------SECTION 5----------
    # Optimizing Neural Network Baseline
    #====================================================================================#
    if nn_baseline:
        # ----------SECTION 5----------
        # If a neural network baseline is used, set up the targets and the inputs for the 
        # baseline. 
        # 
        # Fit it to the current batch in order to use for the next iteration. Use the 
        # baseline_update_op you defined earlier.
        #
        # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the 
        # targets to have mean zero and std=1. (Goes with Hint #bl1 above.)

        # YOUR_CODE_HERE
        pass

    #====================================================================================#
    #                           ----------SECTION 4----------
    # Performing the Policy Update
    #====================================================================================#

    # Call the update operation necessary to perform the policy gradient update based on 
    # the current batch of rollouts.
    # 
    # For debug purposes, you may wish to save the value of the loss function before
    # and after an update, and then log them below. 

    # YOUR_CODE_HERE
    sess.run(update_op, feed_dict={sy_ob_no : ob_no, sy_ac_na : ac_na, sy_adv_n : adv_n})

    # Log diagnostics
    returns = [path["reward"].sum() for path in paths]
    ep_lengths = [pathlength(path) for path in paths]
    logz.log_tabular("Time", time.time() - start)
    logz.log_tabular("Iteration", itr)
    logz.log_tabular("AverageReturn", np.mean(returns))
    logz.log_tabular("StdReturn", np.std(returns))
    logz.log_tabular("MaxReturn", np.max(returns))
    logz.log_tabular("MinReturn", np.min(returns))
    logz.log_tabular("EpLenMean", np.mean(ep_lengths))
    logz.log_tabular("EpLenStd", np.std(ep_lengths))
    logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
    logz.log_tabular("TimestepsSoFar", total_timesteps)
    logz.dump_tabular()
    logz.pickle_tf_vars()

[2018-04-03 23:58:57,185] Making new env: CartPole-v0


Tensor("Shape:0", shape=(2,), dtype=int32)
********** Iteration 0 ************
----------------------------------------
|               Time |           0.234 |
|          Iteration |               0 |
|      AverageReturn |            23.3 |
|          StdReturn |            11.4 |
|          MaxReturn |              59 |
|          MinReturn |              10 |
|          EpLenMean |            23.3 |
|           EpLenStd |            11.4 |
| TimestepsThisBatch |           1e+03 |
|     TimestepsSoFar |           1e+03 |
----------------------------------------


TypeError: join() argument must be str or bytes, not 'NoneType'

In [6]:
sess.close()