In [1]:
import tensorflow.compat.v1 as tf
import gym
import numpy as np
import warnings

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline
from IPython import display

In [3]:
# Switch TensorFlow to graph mode *before* seeding: the rest of the notebook
# builds a static graph (placeholders, Session.run), and the graph-level seed
# should be set while graph mode is already active so it applies to the
# default graph that the networks are created in.
tf.disable_eager_execution()

# Fix both RNGs so weight initialisation and exploration are repeatable.
tf.set_random_seed(1)
np.random.seed(1)

# Silence library warnings to keep the notebook output readable.
warnings.filterwarnings('ignore')

# Hyperparameters and Environment Setup

In [4]:
# --- Learning hyperparameters -------------------------------------------
# NOTE: `bathc_size` is misspelled, but learn() reads it under this exact
# name, so the spelling is kept here.
bathc_size = 32             # number of transitions sampled per learn() call
learning_rate = 0.01        # step size handed to the Adam optimizer
epsilon = 0.9               # probability of acting greedily in choose_action()
gamma = 0.9                 # discount applied to the max next-state Q-value
target_replace_iter = 100   # learn() refreshes the target net every N calls

# --- Environment --------------------------------------------------------
env = gym.make('CartPole-v0').unwrapped
n_actions = env.action_space.n
n_states = env.observation_space.shape[0]

# --- Replay memory and step counters ------------------------------------
memory_capacity = 2000
memory_counter = 0          # total transitions stored so far
learning_step_counter = 0   # number of learn() calls so far
# One row per transition, laid out as [s (n_states), a, r, s_ (n_states)].
memory = np.zeros((memory_capacity, n_states * 2 + 2))

In [5]:
# Graph inputs for one minibatch of transitions (s, a, r, s_).
# The leading dimension is left as None so any batch size can be fed.
tf_s = tf.placeholder(dtype=tf.float32, shape=[None, n_states])   # current states
tf_a = tf.placeholder(dtype=tf.int32, shape=[None])               # actions taken
tf_r = tf.placeholder(dtype=tf.float32, shape=[None])             # rewards received
tf_s_ = tf.placeholder(dtype=tf.float32, shape=[None, n_states])  # next states

# Evaluation Network (trained)

In [6]:
# Evaluation (online) network: maps the current state batch `tf_s` to one
# Q-value per action. Both layers must stay trainable -- this is the network
# the optimizer updates; only the separate target network is frozen.
with tf.variable_scope('q'):
    l_eval = tf.layers.dense(tf_s, 10, tf.nn.relu, kernel_initializer = tf.random_normal_initializer(0, 0.1))
    # The output layer was previously created with trainable=False, which
    # excluded the Q-value head from the optimizer's variable list.
    q = tf.layers.dense(l_eval, n_actions)


# Target Network (not trained)

In [7]:
# Target network: same layer sizes as the evaluation network, but fed with the
# next-state batch `tf_s_`. Every layer is created with trainable=False, so
# its weights only change when learn() assigns new values to them.
with tf.variable_scope('q_next'):
    l_target = tf.layers.dense(
        inputs=tf_s_,
        units=10,
        activation=tf.nn.relu,
        trainable=False,
    )
    q_next = tf.layers.dense(
        inputs=l_target,
        units=n_actions,
        trainable=False,
    )
    
    

In [8]:
# TD target: r + gamma * max_a' Q_target(s', a').
max_q_next = tf.reduce_max(q_next, axis=1)
q_target = tf_r + gamma * max_q_next

# Select Q(s, a) for the action actually taken in each row by pairing every
# row index with its action index and gathering from `q`.
batch_rows = tf.range(tf.shape(tf_a)[0], dtype=tf.int32)
a_indices = tf.stack([batch_rows, tf_a], axis=1)
q_wrt_a = tf.gather_nd(params=q, indices=a_indices)

# Mean squared TD error, minimised with Adam.
td_error_sq = tf.squared_difference(q_target, q_wrt_a)
loss = tf.reduce_mean(td_error_sq)
optimizer = tf.train.AdamOptimizer(learning_rate)
train_op = optimizer.minimize(loss)




In [9]:
def choose_action(s):
    """Select an action for state `s` with an epsilon-greedy policy.

    With probability `epsilon` the action with the highest Q-value predicted
    by the evaluation network is returned; otherwise an action is drawn
    uniformly at random from `range(n_actions)`.

    Args:
        s: 1-D NumPy array holding a single observation.

    Returns:
        The chosen action index as a Python int.
    """
    # The network expects a batch dimension, so turn (n,) into (1, n).
    s = s[np.newaxis, :]
    if np.random.uniform() < epsilon:
        actions_value = sess.run(q, feed_dict = {tf_s:s})
        # Previously referenced an undefined name here, which raised
        # NameError whenever the greedy branch was taken.
        action = int(np.argmax(actions_value))
    else:
        action = np.random.randint(0, n_actions)

    return action


In [10]:
def store_transition(s, a, r, s_):
    """Write one transition into the replay memory.

    The transition is flattened to a single row `[s, a, r, s_]` and stored at
    position `memory_counter % memory_capacity`, so once the buffer is full
    the oldest rows are overwritten. `memory_counter` is incremented by one.

    Args:
        s: state before the action.
        a: action taken.
        r: reward received.
        s_: state after the action.
    """
    global memory_counter
    slot = memory_counter % memory_capacity
    memory[slot, :] = np.hstack((s, [a, r], s_))
    memory_counter += 1


In [11]:
def learn():
    """Run one training step of the evaluation network.

    Every `target_replace_iter` calls (including the first), the target
    network's variables are overwritten with the evaluation network's.
    A minibatch of `bathc_size` rows is then sampled from `memory`, split
    into its (s, a, r, s_) columns, and fed to `train_op`.

    Rows are sampled from the whole buffer, so this should only be called
    once the replay memory has been filled.
    """
    global learning_step_counter

    # Periodically sync the target network with the evaluation network.
    if learning_step_counter % target_replace_iter == 0:
        # tf.get_collection filters by regex *prefix*, so a bare 'q' would
        # also match 'q_next/...'. The trailing slash pins each scope.
        t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope = 'q_next/')
        e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope = 'q/')
        # Variables are paired by creation order. e_params may also contain
        # extra entries (e.g. optimizer slot variables created later under
        # 'q/'); zip stops at the end of t_params, so those are ignored.
        # NOTE: this builds fresh assign ops on every sync.
        sess.run([tf.assign(t, e) for t, e in zip(t_params, e_params)])

    learning_step_counter += 1

    # Sample a minibatch (with replacement) and unpack the row layout
    # [s (n_states), a, r, s_ (n_states)] written by store_transition().
    sample_index = np.random.choice(memory_capacity, bathc_size)
    b_memory = memory[sample_index, :]
    # Slice the first n_states columns; indexing with a bare `n_states`
    # would return only the action column as a 1-D array.
    b_s = b_memory[:, :n_states]
    b_a = b_memory[:, n_states].astype(int)
    b_r = b_memory[:, n_states + 1]
    b_s_ = b_memory[:, -n_states:]

    sess.run(train_op, {tf_s:b_s, tf_a:b_a, tf_r:b_r,tf_s_:b_s_})
    

In [None]:
# Create the session used by choose_action() and learn(), and initialise
# every variable in the graph (both networks and the optimizer state).
# No earlier cell defines `sess`, so it has to exist before the loop starts.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

print('Collecting Experience ....')

for i_episode in range(400):
    s = env.reset()
    ep_r = 0

    while True:
        env.render()
        # Was `choice_action`, which is not defined anywhere in the notebook.
        a = choose_action(s)

        s_, r, done, info = env.step(a)

        # Replace the environment's reward with a shaped one: larger when
        # the cart is near the centre (r1) and the pole is near upright (r2).
        x, x_dot, theta, theta_dot = s_

        r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
        r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
        r = r1 + r2

        store_transition(s, a, r, s_)
        ep_r += r

        # Only start learning once the replay memory has been filled, since
        # learn() samples from the entire buffer.
        if memory_counter > memory_capacity:
            learn()
            if done:
                print('Ep: ', i_episode, '|Ep_r: ', round(ep_r, 2))

        if done:
            break

        # Advance to the next state; without this every step would act on
        # (and store) the episode's initial observation.
        s = s_

    display.clear_output(wait=True)
    display.display(plt.gcf())