# Created by Faisal
# 20/7/2019
# REINFORCE algorithm on the cart and pole problem using a neural network policy
# The code for REINFORCE is based on Chapter 13, Section 3 of 'Reinforcement Learning: An Introduction' by Sutton and Barto
# The reward for each episode will be saved during training for comparison with other algorithms

## The policy network model is built using Tensorflow

In [1]:
import gym
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

## Building the Network architecture

In [2]:
# Widths of the policy network layers, unpacked in order: input, hidden,
# output.  The input width matches the 4-value state that is reshaped and fed
# to X; the output width matches the two entries of `Actions`.
n_input, n_hidden, n_output = 4, 4, 2

## Building the Input layer

In [3]:
# Network input: a float32 placeholder holding a batch of states, one row per
# state and n_input columns; the batch dimension is left unspecified.
X = tf.placeholder(dtype=tf.float32, shape=(None, n_input), name='X')

## Building the hidden layer

In [4]:
# Hidden layer parameters, both drawn from tf.random_normal at initialisation.
w_hidden = tf.Variable(
    initial_value=tf.random_normal(shape=(n_input, n_hidden)),
    name='w_hidden',
)
b_hidden = tf.Variable(
    initial_value=tf.random_normal(shape=(n_hidden,)),
    name='b_hidden',
)
# Affine transform of the input followed by a ReLU non-linearity.
x_hidden = tf.nn.relu(tf.add(tf.matmul(X, w_hidden), b_hidden))

## Building the output layer

In [5]:
# Output layer parameters, both drawn from tf.random_normal at initialisation.
w_output = tf.Variable(
    initial_value=tf.random_normal(shape=(n_hidden, n_output)),
    name='w_output',
)
b_output = tf.Variable(
    initial_value=tf.random_normal(shape=(n_output,)),
    name='b_output',
)
# Softmax over the affine transform of the hidden activations: each row of
# `output` holds one probability per action.
output = tf.nn.softmax(tf.add(tf.matmul(x_hidden, w_output), b_output))

## Define the loss function and the gradient update

In [6]:
# Create the CartPole-v0 environment and reset it once.
env = gym.make('CartPole-v0')
# Initial observation returned by the reset.
# NOTE(review): `obs` is not read anywhere else in the code visible here, and
# the training cell creates its own `env` — confirm whether this cell is needed.
obs = env.reset()

In [7]:
# The two discrete action indices, one per output unit of the policy network.
Actions = list(range(2))

In [8]:
with tf.name_scope('loss'):
    # Scalar return G_t for the step being trained on, fed as a 1x1 array.
    gt = tf.placeholder(dtype=tf.float32, shape=(1, 1))
    # Negative log of every entry of `output`, scaled by the return; gt
    # broadcasts across the rows and columns of `output`.
    # NOTE(review): the log is taken over the probabilities of *both* actions,
    # not only the action that was taken — confirm this is the intended
    # REINFORCE objective.
    wieighted_loss = tf.negative(tf.multiply(tf.math.log(output), gt))

In [9]:
# Step size for plain gradient descent.
# The variables are float32 with O(1) magnitude, so a step size of 1e-12
# produced updates far below float32 resolution (~1e-7 near 1.0) and the
# weights never actually changed.  1e-3 is a starting value; tune as needed.
Learning_rate = 1e-3
optimizer = tf.train.GradientDescentOptimizer(learning_rate=Learning_rate)
# One gradient-descent step on the return-weighted loss.
training_op = optimizer.minimize(wieighted_loss)

In [10]:
# Op that initialises all global variables; it is run once at the start of the
# training session.
init = tf.global_variables_initializer()
# Saver used by the training loop to write checkpoints of the network.
saver = tf.train.Saver()

In [11]:
# Discount factor handed to G() when the returns are computed during training.
Gamma = 0.95
# Intended number of training episodes.  The training loop visible in this
# file iterates over its own `num_iterations` and does not read this value.
num_episodes = float(10 ** 6)
# Reward accumulated in a single episode (reset at the start of each episode).
r = 0
# Per-episode rewards, kept so the learning curve can be plotted and saved.
Rewards = list()

## Monte Carlo return

In [12]:
def G(history, t, gamma = 0.99):
    """Return the discounted Monte Carlo return from time step ``t`` onwards.

    Args:
        history: sequence of episode steps; the element at index 2 of each
            step is read as that step's reward.
        t: index of the first step to include in the return.
        gamma: discount factor applied once per step of delay.

    Returns:
        reward[t] + gamma * reward[t+1] + gamma**2 * reward[t+2] + ...,
        or 0 when ``t`` lies past the last step.
    """
    discounted_return = 0
    # Walk backwards from the final step so each earlier reward picks up one
    # more factor of gamma (Horner-style accumulation).
    index = len(history) - 1
    while index >= t:
        discounted_return = (gamma * discounted_return) + history[index][2]
        index -= 1
    return discounted_return

## Train the agent

In [None]:
env = gym.make('CartPole-v0')
num_iterations = 5000000
# Hard cap on the number of steps played in a single episode.
max_episode_steps = 1000

# REINFORCE follows the gradient of G_t * log pi(a_t | s_t), i.e. the log
# probability of the action that was actually taken.  The module-level
# `wieighted_loss` takes the log of *both* action probabilities, so a loss that
# selects the taken action is built here and minimised with the same optimizer.
# It is created once, outside the loops, so the graph does not grow per step.
action_taken = tf.placeholder(tf.int32, (1,), name='action_taken')
taken_action_prob = tf.reduce_sum(
    output * tf.one_hot(action_taken, n_output), axis=1, keepdims=True)
policy_loss = -tf.math.log(taken_action_prob) * gt
policy_training_op = optimizer.minimize(policy_loss)

try:
    with tf.Session() as sess:
        init.run()
        for i in range(num_iterations):
            r = 0
            history = []
            # Generate an episode#
            ######################
            state = env.reset()
            # The episode must not exceed max_episode_steps time steps
            for _step in range(max_episode_steps):
                # Take actions according to the policy
                actions_probs = sess.run(output, feed_dict={X: state.reshape(1, 4)})
                # Sample the action from the policy's own distribution.  The
                # float32 softmax output is renormalised in float64 because
                # np.random.choice rejects probabilities that do not sum to 1
                # within its tolerance.
                probs = actions_probs.reshape(2).astype(np.float64)
                probs /= probs.sum()
                step_action = int(np.random.choice(len(probs), p=probs))
                next_state, reward, done, _info = env.step(step_action)
                # One is added per step; CartPole pays +1 per step, so this
                # equals the undiscounted reward of the episode.
                r += 1
                # Store the state the action was chosen in, not its successor.
                history.append([state, step_action, reward])
                state = next_state
                if done:
                    break
            Rewards.append(r)
            print('\rreward: {}, iteration: {} , max_rewrad: {}, avergae_rewrad {:.2f} \r'.format(
                r, i + 1, np.max(Rewards), np.mean(Rewards)), end='')
            ########################
            # save the policy neural network every 10 episodes
            if i % 10 == 0:
                saver.save(sess, './REINFORCE_for_cart_and_pole_copy.ckpt')
            # Apply the REINFORCE algorithm: one gradient step per visited
            # (state, action) pair, weighted by the return that followed it.
            for t, (step_state, step_action, _reward) in enumerate(history):
                Gt = G(history, t, gamma=Gamma)
                sess.run(
                    policy_training_op,
                    feed_dict={
                        X: step_state.reshape(1, 4),
                        action_taken: [step_action],
                        gt: [[Gt]],
                    },
                )
finally:
    # Write the learning curve even when training is interrupted, so the
    # rewards collected so far are not lost.
    np.savetxt('REINFORCE_for_cart_and_pole_rewards_copy.txt', Rewards)

reward: 37, iteration: 216 , max_rewrad: 114, avergae_rewrad 29.10  