# Exercise 4: Q-Learning with Monte Carlo Updates

In [None]:
from __future__ import division

import json
import numpy as np
import random
import os
import tensorflow as tf

from unityagents import UnityEnvironment

### Hyperparameters

In [None]:
# --- Return computation ---
# Discount rate applied to future rewards.
y = 0.99

# --- Exploration schedule ---
# Epsilon (chance of a random action) at the start of training.
start_e = 1
# Lowest value epsilon is annealed down to.
end_e = 0.1
# Number of environment steps over which epsilon is annealed.
annealing_steps = 10000

# --- Training run ---
# Total number of episodes to run the environment for.
num_episodes = 1000
# Step size used by the agent's optimizer.
learning_rate = 0.01

# --- Reporting ---
# Number of episodes between progress reports.
summary_freq = 50
# Directory the training statistics are written to.
summary_path = './summaries/q-mc'

### Load the Unity environment

In [None]:
# Launch the Unity environment found at ./envs/Tabular.
# NOTE(review): worker_id presumably keeps this instance's communication
# channel separate from other running environments - confirm against unityagents.
env = UnityEnvironment("./envs/Tabular", worker_id=3)
# Use the first brain exposed by the environment for everything that follows.
default_brain = env.brain_names[0]

### Examine the state space

In [None]:
# Reset the environment; the result is indexed by brain name below.
brain = env.reset()
# Pick out the information belonging to the default brain.
bandit_brain = brain[default_brain]
# Show the observations reported after the reset.
print(bandit_brain.vector_observations)

The state (s) printed above is an integer that identifies which discrete state the agent is currently in.

## The Q-Learning Agent

In [None]:
class QAgent(object):
    """Tabular Q-function expressed as a single bias-free dense layer.

    The integer state is one-hot encoded and passed through a dense layer
    with ``num_actions`` units and no bias, so the layer's kernel (one row
    per state, one column per action) acts as the Q-table. Every entry of
    the kernel starts at 1.

    Args:
        num_states: Number of discrete states (depth of the state one-hot).
        num_actions: Number of discrete actions (width of the Q output).
        lr: Learning rate handed to the Adam optimizer.
    """

    def __init__(self, num_states, num_actions, lr):
        # Feed: integer state indices, one per row, shape [batch, 1].
        self.state_input = tf.placeholder(shape=[None, 1], dtype=tf.int32)

        # The one-hot state selects a single row of the dense kernel.
        state_onehot = tf.contrib.layers.one_hot_encoding(self.state_input, num_states)
        q_values = tf.layers.dense(state_onehot, num_actions,
                                   # use_bias is a boolean option; the original passed
                                   # None, which only worked because None is falsy.
                                   use_bias=False,
                                   kernel_initializer=tf.ones_initializer(),
                                   activation=None)
        # Collapse the extra axis introduced by the [batch, 1] input so that
        # q_out has one row of action values per fed state.
        self.q_out = tf.layers.flatten(q_values)

        # Greedy action: index of the largest Q-value for each fed state.
        self.predict = tf.argmax(self.q_out, 1)

        # Feeds used for training: the target value for each sample and the
        # action that was actually taken in it.
        self.q_next = tf.placeholder(shape=[None], dtype=tf.float32)
        self.action = tf.placeholder(shape=[None], dtype=tf.int32)
        self.action_onehot = tf.contrib.layers.one_hot_encoding(self.action, num_actions)
        # Keep only the Q-value of the taken action for each sample.
        self.selected_q = tf.reduce_sum(self.q_out * self.action_onehot, axis=1)
        # Loss: sum over the batch of squared differences between the target
        # and the predicted Q-value of the taken action.
        self.loss = tf.reduce_sum(tf.squared_difference(self.q_next, self.selected_q))

        # Running `update` performs one Adam step on the loss.
        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        self.update = optimizer.minimize(self.loss)

### Training the network

In [1]:
def discount_rewards(r, gamma):
    """Return the discounted sum of future rewards for every time step.

    For each index ``t`` the result holds
    ``r[t] + gamma * r[t+1] + gamma**2 * r[t+2] + ...``.

    The computation is always carried out in float64. The previous version
    allocated its output with ``np.zeros_like(r)``, so an integer reward
    array silently truncated every discounted value, and plain Python lists
    were rejected because they have no ``size`` attribute.

    Args:
        r: One-dimensional sequence of per-step rewards (list, tuple or
            NumPy array with numeric entries).
        gamma: Discount factor applied per step.

    Returns:
        A float64 NumPy array with the same shape as ``r``.
    """
    rewards = np.asarray(r, dtype=np.float64)
    discounted_r = np.zeros(rewards.shape, dtype=np.float64)
    running_add = 0.0
    # Walk backwards so each step reuses the already-discounted tail.
    for t in reversed(range(rewards.size)):
        running_add = running_add * gamma + rewards[t]
        discounted_r[t] = running_add
    return discounted_r

In [None]:
# Train the Q-agent with Monte Carlo targets: each episode is played to the
# end, the rewards are turned into discounted returns, and one optimizer step
# is taken on the whole episode.
if not os.path.exists(summary_path):
    os.makedirs(summary_path)

# Create our tensorflow agent in a fresh graph.
tf.reset_default_graph()
space_size = env.brains[default_brain].vector_observation_space_size
action_size = env.brains[default_brain].vector_action_space_size
agent = QAgent(space_size, action_size, learning_rate)
# First trainable variable in the fresh graph (the agent's weight matrix).
# Looked up once here rather than on every training step.
q_weights = tf.trainable_variables()[0]

# Start an interactive TensorFlow session.
sess = tf.InteractiveSession()
init = tf.global_variables_initializer()
sess.run(init)
summary_writer = tf.summary.FileWriter(summary_path)

# Create variables which will be used throughout training.
e_drop = (start_e - end_e) / annealing_steps  # Epsilon decrease per environment step
e = start_e
value_table = np.zeros([space_size])
episode_list = []
reward_list = []
loss_list = []

# Reset the environment before training.
brains = env.reset()
state = brains[default_brain].vector_observations[0]

# Start training loop
for i in range(num_episodes):
    # Per-episode bookkeeping. The environment is NOT reset here: `state`
    # carries over from the last step of the previous episode.
    # NOTE(review): this relies on the Unity environment restarting the agent
    # on its own once `done` is reported - confirm.
    total_reward = 0
    done = False
    episode_steps = 0
    episode_buffer = []
    while not done:
        episode_steps += 1
        # Choose the greedy action from the Q-network, replaced by a uniformly
        # random action with probability e.
        action = sess.run(agent.predict, feed_dict={agent.state_input: [state]})[0]
        if np.random.rand() < e:
            action = np.random.randint(0, action_size)

        # Get new state and reward from environment. The current value table
        # is sent along as the text action.
        brains = env.step(vector_action=action, text_action=json.dumps(value_table.tolist()))
        tabular_brain = brains[default_brain]
        state_1 = tabular_brain.vector_observations[0]
        reward = tabular_brain.rewards[0]
        done = tabular_brain.local_done[0]
        episode_buffer.append([state, action, reward, state_1, done])

        total_reward += reward
        state = state_1
        # Anneal epsilon by one step until it reaches its final value.
        if e > end_e:
            e -= e_drop

    # Train our network on the finished episode: column 2 (rewards) is
    # replaced by the discounted return from each step onwards, which serves
    # as the target for the Q-value of the action taken.
    episode_matrix = np.array(episode_buffer)
    episode_matrix[:, 2] = discount_rewards(episode_matrix[:, 2], y)
    _, q_table, v_loss = sess.run(
        [agent.update, q_weights, agent.loss],
        feed_dict={agent.state_input: np.vstack(episode_matrix[:, 0]),
                   agent.q_next: episode_matrix[:, 2],
                   agent.action: episode_matrix[:, 1]})

    loss_list.append(v_loss)
    episode_list.append(episode_steps)
    reward_list.append(total_reward)
    # One value per state: the mean of that state's Q-values over all actions.
    value_table = np.mean(q_table, axis=1)

    # Update our running tally of scores and save information to Tensorboard.
    if i % summary_freq == 0 and i != 0:
        recent_reward = np.mean(reward_list[-summary_freq:])
        summary = tf.Summary()
        summary.value.add(tag='Info/Reward', simple_value=float(recent_reward))
        summary.value.add(tag='Info/Value Loss', simple_value=float(np.mean(loss_list[-summary_freq:])))
        summary.value.add(tag='Info/Epsilon', simple_value=float(e))
        summary.value.add(tag='Info/Q Estimate', simple_value=float(np.mean(value_table)))
        summary.value.add(tag='Info/Episode Length', simple_value=float(np.mean(episode_list[-summary_freq:])))
        summary_writer.add_summary(summary, i)
        summary_writer.flush()
        print("Episode: {}, Epsilon: {}, Mean Reward: {}".format(str(i), str(e), str(round(recent_reward, 3))))

# The environment is deliberately left open here: the final cell of the
# notebook calls env.close(), and closing it in both places would close the
# same environment twice.

In [None]:
# Close the Unity environment once all work with it is finished.
env.close()