## Deep Deterministic Policy Gradient Robert Miklos

In [1]:
# imports
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.python.ops.nn import relu, softmax
from tensorflow.contrib.layers import fully_connected, convolution2d, flatten, batch_norm, max_pool2d, dropout
import gym
from gym import envs
from utils import Viewer
from tensorflow.python.framework import ops

In [2]:
# create gym environment
env = gym.make('Pendulum-v0')

[2017-11-27 11:15:27,210] Making new env: Pendulum-v0


In [3]:
# demo the environment
env.reset() # reset the environment
#view = Viewer(env, custom_render=True) # we use this custom viewer to render the environment inline in the notebook
for timestep in range(200):
#   #view.render()
    env.render() # uncomment this to use gym's own render function
    action = env.action_space.sample()
    env.step(action) # take a random action
##view.render(close=True, display_gif=True) # display the environment inline in the notebook
env.render(close=True) # uncomment this to use gym'm own render function

In [4]:
n_actions = 1 # Power to the engine
n_states = 3 # Position and velocity
n_values = 1 # Number of value functions

# Tuning the neural networks
n_hidden_act = 100 # Number of neurons in the hidden layer by the actor
n_hidden_cri = 100 # Number of neurons in the hidden layer by the critic
n_hidden_act_t = n_hidden_act# Number of neurons in the hidden layer by the target actor
n_hidden_cri_t = n_hidden_cri# Number of neurons in the hidden layer by the target critic


# Defining the four neural networks: actor, critic, target actor, target critic.

tf.reset_default_graph()

states_pl = tf.placeholder(tf.float32, [None, n_states], name='states_pl') # States of the environment
actions_pl = tf.placeholder(tf.float32, [None, n_actions], name='actions_pl') # Action to the environment
values_pl = tf.placeholder(tf.float32, [None, n_values], name='values_pl') # The action values
sta_act_pl = tf.placeholder(tf.float32, [None, n_states+n_actions], name='sta_act_pl') # The concatenation of states and actions (for critic network)
targets_pl = tf.placeholder(tf.float32, [None, n_values], name='targets_pl') # The target for the critic network
learning_rate_pl = tf.placeholder(tf.float32, name='learning_rate_pl') # Learning rate

scale = 2
# The actor network
l_hidden_act = tf.layers.dense(inputs=states_pl, units=n_hidden_act, activation=relu, name='l_hidden_act')
l_out_act = tf.scalar_mul(scale,tf.layers.dense(inputs=l_hidden_act, units=n_actions, activation=tf.tanh, name='l_out_act') )

# The target actor network
l_hidden_act_t = tf.layers.dense(inputs=states_pl, units=n_hidden_act_t, activation=relu, name='tl_hidden_act')
l_out_act_t = tf.scalar_mul(scale,tf.layers.dense(inputs=l_hidden_act_t, units=n_actions, activation=tf.tanh, name='tl_out_act') )

# The critic network
l_hidden_cri = tf.layers.dense(inputs=sta_act_pl, units=n_hidden_cri, activation=relu, name='l_hidden_cri',bias_initializer=tf.random_uniform_initializer)
l_out_cri = tf.layers.dense(inputs=l_hidden_cri, units=n_values, activation=None, name='l_out_cri',bias_initializer=tf.random_uniform_initializer) 

# The target critic network
l_hidden_cri_t = tf.layers.dense(inputs=sta_act_pl, units=n_hidden_cri_t, activation=relu, name='tl_hidden_cri')
l_out_cri_t = tf.layers.dense(inputs=l_hidden_cri_t, units=n_values, activation=None, name='tl_out_cri') 

# The critic network for searching for the parameters of the actor network
cl_hidden_cri = tf.layers.dense(inputs=tf.concat([states_pl,l_out_act],axis=1), units=n_hidden_cri, activation=relu, name='cl_hidden_cri',bias_initializer=tf.random_uniform_initializer)
cl_out_cri = tf.layers.dense(inputs=cl_hidden_cri, units=n_values, activation=None, name='cl_out_cri',bias_initializer=tf.random_uniform_initializer) 



print(l_out_act.get_shape())
print(l_out_act_t.get_shape())
print(l_out_cri.get_shape())
print(l_out_cri_t.get_shape())
print(cl_out_cri.get_shape())

(?, 1)
(?, 1)
(?, 1)
(?, 1)
(?, 1)


In [5]:
# test forward pass of actor network
state = env.reset()
#yy = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES,'l_out_act')
#uu = tf.get_variable(yy[1])

# Test gradient function
w = tf.placeholder(tf.float32, name='w');
u = tf.placeholder(tf.float32, name='u');
f = w*u
g = tf.gradients(l_out_cri,sta_act_pl);

optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate_pl)
#g1 = optimizer.compute_gradients(l_out_act,[ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES,'l_hidden_act'),ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES,'l_out_act')])
#g1 = optimizer.compute_gradients(l_out_cri,sta_act_pl)


    
x_values = np.zeros((10,1))
x_var = tf.Variable(x_values, name="xVariable", dtype=tf.float32)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    #sess.run(copy_criact_critic)
    
    state = env.reset()
    action_totake1 = sess.run(fetches=l_out_act, feed_dict={states_pl: np.reshape(state,(1,n_states))})
    print(state)
    print(action_totake1)

    print('-')

# test forward pass of critic network
    #state = env.reset()
    action = (np.random.rand()-0.5)*2; # Uniform distribution between -1 and 1
    action = action_totake1
#with tf.Session() as sess:
    #sess.run(tf.global_variables_initializer())
    action_totake = sess.run(fetches=l_out_cri, feed_dict={sta_act_pl: np.column_stack((np.reshape(state,(1,n_states)),action))})
    print(state)
    #print(action)
    print(action_totake)
    print('-')
    
    action_totake = sess.run(fetches=cl_out_cri, feed_dict={states_pl: np.reshape(state,(1,n_states))})
    print(state)
    print(action_totake)
    print('-------------------------------------------------------------------------')
    


# test forward pass of targetactor network
    state = env.reset()
#with tf.Session() as sess:
    #sess.run(tf.global_variables_initializer())
    value = sess.run(fetches=l_out_act_t, feed_dict={states_pl: np.reshape(state,(1,n_states))})
    print(state)
    print(value)
    print('-')

# test forward pass of target critic network
    #state = env.reset()
    #action = (np.random.rand()-0.5)*2; # Uniform distribution between -1 and 1
#with tf.Session() as sess:
    #sess.run(tf.global_variables_initializer())
    value = sess.run(fetches=l_out_cri_t, feed_dict={sta_act_pl: np.column_stack((np.reshape(state,(1,n_states)),action))})
    print(state)
    #print(action)
    print(value)
    print('-')
    

    
    
    
    
    
    
    
  #  print(x_var.eval())
    aa = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES,'l_out_cri')
    bb = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES,'l_hidden_cri')
    #print(aa)
    #print(g)
    for i in range(len(aa)):
        print(aa[i].eval())
        print(bb[i].eval())
        print('110011')
    
    print('---------------')
    i = 0
    #print('--')
    #print(aa[i])
    #print(aa[i].eval())
    #print(np.shape(aa[i].eval()))
    #aa[i] = aa[i].assign(x_var+1)
    #print(aa[i].eval())
    
    #print('--')
    #ans = sess.run(fetches=f, feed_dict={u: 2,w:4})
    #print(ans)
    #ans = sess.run(fetches=g, feed_dict={u: 2,w:4})
    #print(ans)
    #print('--')
    #print()
#    print(g)
    #print(g1)
    #print(len(g1))
    #aa = []
    #for i in range(len(g1)):
        #aa = g[i].eval(feed_dict={sta_act_pl: np.reshape([1,1,1],(1,3))})
        #print(aa)
        #print(aa[0][-1])
        #print(g[i].eval(feed_dict={sta_act_pl: np.reshape([0,0,1],(1,3))}))
    #    bb = g1[i][0].eval(feed_dict={states_pl: np.reshape([1,1],(1,2))})
    #    print(bb)
        #aa = np.concatenate(aa,bb)
        #print(g1[i][0].eval(feed_dict={states_pl: np.reshape([0,0],(1,2))}))
    #print(bb[:])
    #ff = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES,'l_out_cri')
    #print(ff)
    #print(g)
    #for i in range(len(ff)):
    #    print('..')
    #    print(ff[i].eval())
    #    ss = ff[i].get_shape()
    #    ze = np.zeros((ss[0],1))
    #    ff[i] = ff[i].assign(ze)
    #    aa = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES,'l_out_cri')
    #    print(ff[i].eval())
    #    print(aa[i].eval())
        
        
    
    #print(ff)
    #ze = np.zeros((len(ff),1))
    #ff = ff.assign(ze)
    #print(ff)
    #print(g)
    #print(g1[3])
    #print(g.eval(feed_dict={sta_act_pl: np.reshape([1,1,1],(1,3))}))
    #print(g1[1][0].eval(feed_dict={sta_act_pl: np.reshape([1,1,1],(1,3))}))
    #g_f = sess.run(fetches=g, feed_dict={sta_act_pl: np.reshape([1,1,1],(1,3))})

[-0.61711555 -0.78687254  0.97476273]
[[ 0.15124236]]
-
[-0.61711555 -0.78687254  0.97476273]
[[ 1.52156997]]
-
[-0.61711555 -0.78687254  0.97476273]
[[ 1.46960318]]
-------------------------------------------------------------------------
[-0.69182262 -0.72206749 -0.42741768]
[[ 0.04815253]]
-
[-0.69182262 -0.72206749 -0.42741768]
[[ 0.11577675]]
-
[[ 0.14678001]
 [ 0.239768  ]
 [-0.00263113]
 [-0.10868609]
 [ 0.08241984]
 [-0.19100331]
 [ 0.03067139]
 [-0.00736395]
 [ 0.24042928]
 [-0.03593656]
 [-0.09122492]
 [-0.0340669 ]
 [-0.05000779]
 [ 0.10022172]
 [-0.04302855]
 [ 0.22534269]
 [-0.13738085]
 [ 0.11181656]
 [-0.12166858]
 [-0.03524382]
 [ 0.00488454]
 [-0.02487417]
 [-0.09112579]
 [ 0.07269648]
 [-0.0948287 ]
 [-0.23476003]
 [-0.02628875]
 [-0.07463999]
 [ 0.13870397]
 [-0.06921914]
 [ 0.08938485]
 [-0.07583109]
 [ 0.10872021]
 [ 0.23000601]
 [ 0.17011681]
 [ 0.0343768 ]
 [ 0.23463759]
 [ 0.12259427]
 [-0.16120903]
 [-0.05392572]
 [-0.03417191]
 [ 0.23447716]
 [-0.21879636]
 [-

In [6]:
# Define Q-value loss function
loss_f = tf.reduce_mean(tf.square(targets_pl-l_out_cri))
#loss_f = (tf.square(targets_pl-l_out_cri))

#Regularization
#reg_scale = 0.001
#regularize = tf.contrib.layers.l2_regularizer(reg_scale)
#params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,'l_out_cri')
#reg_term = sum([regularize(param) for param in params])
#loss_f += reg_term

# Maximize the reward (maximize the output of the critic network)
in_to_cri = l_out_act
#print([states_pl, in_to_cri])
print(in_to_cri)
print(l_hidden_cri)
#exReward = l_out_cri(feed_dict={sta_act_pl: [states_pl, in_to_cri]})


# Gradient of the critic network wrt to the action
#g_C = tf.gradients(l_out_cri,sta_act_pl)
#print(g_C)

# Gradient of the actor network wrt its own parameters
#optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate_pl)
#g_A = optimizer.compute_gradients(l_out_act,[tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,'l_hidden_act'),tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,'l_out_act')])
#print(g_A)

#Minus sign due to converting a maximization to a minimization problem
sign = -1
loss_c = tf.reduce_mean(tf.scalar_mul(sign,cl_out_cri))
g_c = optimizer.compute_gradients(loss_c,[tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,'l_hidden_act'),tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,'l_out_act')])
train_c = optimizer.apply_gradients(g_c)
#train_c = optimizer.minimize(cl_out_cri,aa)


optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate_pl)
#optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate_pl)
train_f = optimizer.minimize(loss_f)

saver = tf.train.Saver() # we use this later to save the model
print(loss_c)

Tensor("mul:0", shape=(?, 1), dtype=float32)
Tensor("l_hidden_cri/Relu:0", shape=(?, 100), dtype=float32)
Tensor("Mean_1:0", shape=(), dtype=float32)


In [7]:
# Additional functions

# Update the target network's parameters
tau = 0.5
def assign_trainables(t_p, p, tau=1.0):
    """Update trainable variables with rate tau."""
    obs = []
    for i, t in enumerate(t_p):
        for k in range(len(t)):
            obs.append(t[k].assign((1-tau) * t[k].value() + tau * p[i][k].value()))
    return obs


# Get the target actor parameters
pa_t_h = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='tl_hidden_act') #The hidden layer
pa_t_o = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='tl_out_act')    #The output layer
pa_t = [pa_t_h,pa_t_o]

# Get the actor parameters
pa_h = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='l_hidden_act') #The hidden layer
pa_o = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='l_out_act')    #The output layer
pa = [pa_h,pa_o];

# Get the target critic parameters
pc_t_h = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='tl_hidden_cri') #The hidden layer
pc_t_o = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='tl_out_cri')    #The output layer
pc_t = [pc_t_h,pc_t_o]

# Get the critic parameters
pc_h = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='l_hidden_cri') #The hidden layer
pc_o = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='l_out_cri')    #The output layer
pc = [pc_h,pc_o];

# Get the critic-actor parameters
p_h_c = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='cl_hidden_cri') #The hidden layer
p_o_c = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='cl_out_cri')    #The output layer
p = [p_h_c,p_o_c];

update_target_actor = assign_trainables( pa_t,pa,tau)
update_target_critic = assign_trainables( pc_t,pc,tau)
copy_target_actor = assign_trainables( pa_t,pa,tau=1) #copy with tau=1 (clever trick)
copy_target_critic = assign_trainables( pc_t,pc,tau=1)
copy_critic_criact = assign_trainables( pc,p,tau=1)
copy_criact_critic = assign_trainables( p,pc,tau=1)




In [8]:
# Continuous mountain car problem.









# training settings
noise_size = 0.1
learning_rate = 0.001# you know this by now
episodes = 10000 #Number of episodes (Outer loop)
maxTime = 200 #Maximum time for an episode (Inner loop)
epsilon = 0.1 #The epsilon in the epsilon-greedy method for exploration
gamma = 0.95 #Discount factor
valid_episode = 10 #Compute validation at every valid_episode'th episode
ep = 0 #Episode counter for valid_episode
Nd = 100000 #Size of D matrix for experience replay
Nb = 64 #Size of minibatch in the stochastic gradient descent with experience replay
ex_rep_counter = 1
try:
    statistics = []
    r_stat = np.zeros((maxTime,valid_episode))
    loss_stat = []
    loss2_stat = []
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(copy_target_actor)
        sess.run(copy_target_critic)
        sess.run(copy_criact_critic)
        print('start training')
        D = np.zeros((Nd,n_states*2+1+1+1)) #5 is for (aj,xj,rj,xj+1,done). It's the history matrix for experience replay
        D_counter = 0 #Counter in circular D-buffer
        for episode in range(episodes):
            x = env.reset()
            a_noise = 0
            for timestep in range(maxTime):
                # Action noise strategy (at least respect the bounds on the action)
                #------------------------------------------------------------------
                at = sess.run(fetches=l_out_act, feed_dict={states_pl: np.reshape(x,(1,n_states))}) #Take action according to current best policy
                #print(at)
                a_noise = a_noise*0.9 + noise_size*np.random.normal(0,1,1) #sample the exploration noise from a normal distribution (can be made more sophisticated)
                at = at + a_noise
                at = np.clip(at,-2,2)
                #Apply action and observe reward and the new state
                #-------------------------------------------------
                
                #print(at)
                #print(x)
                x_next, r, done, _ = env.step(at) #Apply the action

                r_stat[timestep,ep] = r #Save for statistics
                
                #Save step into D-buffer for Experience replay
                #---------------------------------------------
                #Update into a a circular buffer
                update_index_ER = D_counter
                D[update_index_ER,:] = np.concatenate((np.reshape(x,n_states),np.reshape(at,n_actions),np.reshape(r,1),np.reshape(x_next,n_states),np.reshape(done,1)))
                D_counter = D_counter + 1
                if D_counter == Nd:
                    D_counter = 0

                #Sample the minibatch with Experience replay
                #------------------------------------------------------------
                y_mb = np.zeros((Nb,1))
                at_mb = np.zeros((Nb,n_actions))
                x_mb = np.zeros((Nb,n_states))
                for batch_number in range(Nb):
                    sample_index_ER = np.random.randint(0,ex_rep_counter)
                    Drow= D[sample_index_ER,:]
                    #Unpack the history quintuple
                    xj = Drow[0:n_states]
                    aj = Drow[n_states]
                    rj = Drow[n_states+1]
                    xj1 = Drow[n_states+2:2*n_states+2]
                    done1 = Drow[-1]
                    
                    #Fill out the y-value (right hand side of the Bellman equation)
                    if done1:  #If we have terminated the game
                        y_mb[batch_number] = rj
                    else: #If we have not terminated the game 
                        a_next = sess.run(fetches=l_out_act_t, feed_dict={states_pl: np.reshape(xj1,(1,n_states))})
                        value_next = sess.run(fetches=l_out_cri_t, feed_dict={sta_act_pl: np.column_stack((np.reshape(xj1,(1,n_states)),np.reshape(a_next,(1,1))))})
                        y_mb[batch_number,:] = rj+gamma*value_next
                        
                    #Fill out the action a_mb
                    at_mb[batch_number] = aj
                    
                    #Fill out the state x_mb
                    x_mb[batch_number,:] = xj
                
                
                if ex_rep_counter != Nd: # Handle initial filling of experience replay
                    ex_rep_counter = ex_rep_counter + 1
                #Do a stochastic gradient descent on the critic network
                #------------------------------------------------------
                loss,_ = sess.run(fetches=[loss_f,train_f], feed_dict={
                    targets_pl: y_mb,
                    sta_act_pl: np.column_stack((x_mb,at_mb)),
                    learning_rate_pl: learning_rate
                })    
                
                loss_stat.append(loss) #Save for statistics
                sess.run(copy_criact_critic)
                # Compute the gradients for the actor network (with same mini-batches)
                #---------------------------------------------------------------------
                
                #print(pa_h[0].eval())
                loss2,_,gc = sess.run(fetches=[loss_c,train_c,g_c], feed_dict={
                    states_pl: x_mb,
                    learning_rate_pl: learning_rate
                })  
                loss2_stat.append(loss2) #Save for statistics
                #print('-')
                #print(gc)
                #print(pa_h[0].eval())
                #print(sess.run(cl_out_cri,feed_dict={states_pl: x_mb}))
                #print('-')


                
                # Update target networks
                #------------------------------
                ops = sess.run(update_target_actor)  #Update the actor target network
                ops = sess.run(update_target_critic) #Update the critic target network


                

                #Prepare for next timestep
                #----------------------------
                
                x = x_next
                if done: 
                    #print('Done before time!!!')
                    break #Game over! And end the episode
                
             #Validation time!
            #----------------
            if (episode % 1000) == 0:
                noise_size = noise_size/2
                print('Noise is changed')
                print(noise_size)
            
            if (episode % valid_episode) == 0: 
                #Computation of validation by playing the game once
                #--------------------------------------------------
                x = env.reset()
                r_valid = np.zeros(maxTime)
                #print(x)
                for time_valid in range(maxTime):
                    #Only pick greedy action in validation!
                    action = sess.run(fetches=l_out_act, feed_dict={states_pl: [x]})
                    at = action
                
                    x_next, r, done, _ = env.step(at) #Apply the action
                    r_valid[time_valid] = r #Save for statistics
                    if done: break #Game over! And end the episode
                    x = np.reshape(x_next,(n_states,))
                
                
                #Print out validation statistics
                #-------------------------------
                r_stat_mean = np.mean(np.sum(r_stat,axis=0))
                r_valid_sum = np.sum(r_valid)
                loss_stat_mean = np.mean(loss_stat,axis=0)
                loss2_stat_mean = np.mean(loss2_stat,axis=0)
                print('Episode: %4d. Mean training reward: %6.2f. Validation reward: %6.2f. Mean loss target: %6.2f. Mean loss: %6.2f.' % (episode, r_stat_mean, r_valid_sum, loss_stat_mean, loss2_stat_mean))
                #Reset statistics variables
                #--------------------------
                ep = 0 #Reset episode counter between validations
                r_stat = np.zeros((maxTime,valid_episode)) 
                #saver.save(sess, 'tmp2/model.ckpt')
                
                
            else:
                ep = ep + 1
                
        print('done')
        # save session
        saver.save(sess, 'tmp2/model.ckpt')
except KeyboardInterrupt:
    pass            
        
        
        
        
        
        







start training
Noise is changed
0.05
Episode:    0. Mean training reward: -103.68. Validation reward: -1081.85. Mean loss target:  31.82. Mean loss:  35.31.
Episode:   10. Mean training reward: -1370.99. Validation reward: -940.75. Mean loss target:  69.97. Mean loss: 108.99.
Episode:   20. Mean training reward: -1085.05. Validation reward: -754.26. Mean loss target:  65.44. Mean loss: 106.08.
Episode:   30. Mean training reward: -879.14. Validation reward: -1011.64. Mean loss target:  57.31. Mean loss: 100.08.
Episode:   40. Mean training reward: -762.43. Validation reward: -815.74. Mean loss target:  52.16. Mean loss:  96.61.
Episode:   50. Mean training reward: -856.78. Validation reward: -519.47. Mean loss target:  49.17. Mean loss:  93.86.
Episode:   60. Mean training reward: -732.70. Validation reward: -968.82. Mean loss target:  46.89. Mean loss:  91.53.
Episode:   70. Mean training reward: -687.02. Validation reward: -1015.75. Mean loss target:  44.51. Mean loss:  89.46.
Episod

Episode:  690. Mean training reward: -309.91. Validation reward: -416.11. Mean loss target:  18.93. Mean loss:  39.68.
Episode:  700. Mean training reward: -217.06. Validation reward: -119.87. Mean loss target:  18.79. Mean loss:  39.50.
Episode:  710. Mean training reward: -233.46. Validation reward: -118.33. Mean loss target:  18.66. Mean loss:  39.30.
Episode:  720. Mean training reward: -208.87. Validation reward: -566.68. Mean loss target:  18.54. Mean loss:  39.11.
Episode:  730. Mean training reward: -220.10. Validation reward: -244.14. Mean loss target:  18.41. Mean loss:  38.92.
Episode:  740. Mean training reward: -260.35. Validation reward: -130.34. Mean loss target:  18.31. Mean loss:  38.69.
Episode:  750. Mean training reward: -182.75. Validation reward: -125.54. Mean loss target:  18.20. Mean loss:  38.48.
Episode:  760. Mean training reward: -158.82. Validation reward:  -3.53. Mean loss target:  18.09. Mean loss:  38.30.
Episode:  770. Mean training reward: -182.09. Val

Episode: 1380. Mean training reward: -238.47. Validation reward: -255.30. Mean loss target:  17.99. Mean loss:  20.61.
Episode: 1390. Mean training reward: -241.71. Validation reward: -128.07. Mean loss target:  17.93. Mean loss:  20.52.
Episode: 1400. Mean training reward: -374.94. Validation reward: -635.27. Mean loss target:  17.89. Mean loss:  20.41.
Episode: 1410. Mean training reward: -346.96. Validation reward: -244.33. Mean loss target:  17.85. Mean loss:  20.30.
Episode: 1420. Mean training reward: -254.35. Validation reward: -859.80. Mean loss target:  17.80. Mean loss:  20.22.
Episode: 1430. Mean training reward: -447.93. Validation reward: -392.56. Mean loss target:  17.77. Mean loss:  20.11.
Episode: 1440. Mean training reward: -292.49. Validation reward: -119.85. Mean loss target:  17.72. Mean loss:  20.05.
Episode: 1450. Mean training reward: -229.26. Validation reward: -245.59. Mean loss target:  17.67. Mean loss:  19.98.
Episode: 1460. Mean training reward: -275.97. Va

In [26]:
# review solution
with tf.Session() as sess:
    saver.restore(sess, "tmp2/model.ckpt")
    x = env.reset()
    print(x)
    view = Viewer(env, custom_render=True)
    r_sum = 0
    for _ in range(100):
        env.render() # uncomment this to use gym's own render function
        #a = get_action(sess, s, stochastic=False)
        a = sess.run(fetches=l_out_act, feed_dict={states_pl: np.reshape(x,(1,n_states))})
        #print(a)
        #value = sess.run(fetches=l_out, feed_dict={states_pl: [x],is_training_pl: False})
        #a=value.argmax()
        x, r, done, _ = env.step(a)
        r_sum = r_sum + r
    env.render(close=True) # uncomment this to use gym'm own render function
    print(r_sum)

INFO:tensorflow:Restoring parameters from tmp/model.ckpt


[2017-11-27 14:14:33,538] Restoring parameters from tmp/model.ckpt


[-0.42629543 -0.904584    0.09391583]
[-119.01763153]


In [10]:

with tf.Session() as sess:
    saver.restore(sess, "tmp2/model.ckpt")
    x = [0,0,0]
    a = sess.run(fetches=l_out_act, feed_dict={states_pl: np.reshape(x,(1,n_states))})
    b = sess.run(fetches=l_out_cri, feed_dict={sta_act_pl: np.column_stack((np.reshape(x,(1,n_states)),np.reshape(a,(1,1))))})
    print(a)
    print(b)


INFO:tensorflow:Restoring parameters from tmp2/model.ckpt


[2017-11-27 14:13:15,920] Restoring parameters from tmp2/model.ckpt


[[-1.63175869]]
[[-48.95029831]]
