# Exercise 2: Contextual Bandit

In [None]:
from __future__ import division

import json
import numpy as np
import os
import tensorflow as tf

from unityagents import UnityEnvironment

### Hyperparameters

In [None]:
# Directory where summary statistics are saved.
summary_path = './summaries/c-bandit'
# Learning rate used by the agent.
learning_rate = 0.005
# Total number of episodes to run in the environment.
total_episodes = 4000

### Load the Unity Environment

In [None]:
# Path handed to UnityEnvironment (presumably the built Unity executable --
# confirm against the unityagents documentation).
env_path = "./envs/Contextual"
env = UnityEnvironment(env_path)

# Work with the first brain name the environment reports.
default_brain = env.brain_names[0]

Select "Contextual Bandit" and press "Start Learning."

### Examine the state space

In [None]:
# Reset the environment; the result is indexed by brain name.
reset_info = env.reset()
bandit_brain = reset_info[default_brain]

# Show the vector observations reported for our brain after the reset.
print(bandit_brain.vector_observations)

The state (s) is a single integer index identifying which of a fixed set of discrete states the agent is currently in.

### The Contextual Agent

In [None]:
class Agent(object):
    """Value-estimating agent for a contextual bandit with discrete states.

    The graph one-hot encodes a single state index, maps it through a
    bias-free linear layer to one value estimate per action, and trains the
    estimate of the chosen action toward the observed reward.

    Attributes:
        state_input: int32 placeholder of shape [1] holding the state index.
        value_estimates: flat tensor with one value estimate per action.
        action_probabilities: softmax of ``value_estimates / temperature``.
        reward_holder: float32 placeholder of shape [1] for the reward.
        action_holder: int32 placeholder of shape [1] for the chosen action.
        responsible_value: the value estimate of the chosen action.
        loss: squared difference between ``responsible_value`` and the reward.
        update: op running one Adam step that minimizes ``loss``.
    """

    def __init__(self, learning_rate, num_states, num_actions, temperature=0.5):
        """Build the feed-forward and training parts of the graph.

        Args:
            learning_rate: learning rate passed to the Adam optimizer.
            num_states: number of discrete states (depth of the one-hot code).
            num_actions: number of actions (width of the linear layer).
            temperature: positive Boltzmann temperature dividing the value
                estimates before the softmax. Defaults to 0.5.

        Raises:
            ValueError: if ``temperature`` is not strictly positive.
        """
        if temperature <= 0:
            raise ValueError("temperature must be positive, got {}".format(temperature))

        # These lines establish the feed-forward part of the network.
        self.state_input = tf.placeholder(shape=[1], dtype=tf.int32)
        state = tf.one_hot(self.state_input, num_states)
        # With a one-hot input, no bias and an all-ones kernel, every action
        # starts with the same value estimate of 1.
        self.value_estimates = tf.layers.dense(
            state,
            num_actions,
            activation=None,
            use_bias=False,
            kernel_initializer=tf.ones_initializer())
        # Flatten the [1, num_actions] layer output into a vector.
        self.value_estimates = tf.reshape(self.value_estimates, [-1])
        self.action_probabilities = tf.nn.softmax(self.value_estimates / temperature)

        # These lines establish the training procedure.
        # We feed the reward and chosen action into the network
        # to compute the loss, and use it to update the network.
        self.reward_holder = tf.placeholder(shape=[1], dtype=tf.float32)
        self.action_holder = tf.placeholder(shape=[1], dtype=tf.int32)

        # Pick out the single value estimate belonging to the chosen action.
        self.responsible_value = tf.slice(self.value_estimates, self.action_holder, [1])

        # We take the difference between the empirical reward and the value estimate.
        self.loss = tf.squared_difference(self.responsible_value, self.reward_holder)
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        self.update = optimizer.minimize(self.loss)

### Training the Agent

In [None]:
# Train the agent: explore with Boltzmann sampling for the first half of the
# episodes, then act greedily, logging 50-episode running means as we go.
rewards = []
losses = []
# Sizes of the state and action spaces, as reported by the brain parameters.
space_size = env.brains[default_brain].vector_observation_space_size
action_size = env.brains[default_brain].vector_action_space_size

if not os.path.exists(summary_path):
    os.makedirs(summary_path)

# Create our tensorflow agent
tf.reset_default_graph()
agent = Agent(learning_rate, space_size, action_size)
init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    summary_writer = tf.summary.FileWriter(summary_path)
    try:
        brains = env.reset()
        state = brains[default_brain].vector_observations[0]
        for i in range(total_episodes):
            # Start a new episode: evaluate the network for the current state.
            actions, values = sess.run([agent.action_probabilities, agent.value_estimates],
                                       feed_dict={agent.state_input: state})
            if i < total_episodes / 2:
                # Pick action according to Boltzmann distribution.
                # Sample the index directly: drawing a probability value and
                # looking it up again always resolves ties to the lowest index.
                action = np.random.choice(len(actions), p=actions)
            else:
                # Pick action greedily according to value estimates
                action = np.argmax(actions)

            #Get our reward from picking one of the chests.
            brains = env.step(vector_action = action, text_action = json.dumps(values.tolist()))
            bandit_brain = brains[default_brain]
            reward = bandit_brain.rewards[0]
            next_state = bandit_brain.vector_observations[0]
            rewards.append(reward)

            #Update the agent.
            # The reward was earned for acting in `state`, so the update must be
            # fed that state rather than the observation returned after the step.
            _, v_loss = sess.run([agent.update, agent.loss],
                                 feed_dict={agent.reward_holder: [reward],
                                            agent.action_holder: [action],
                                            agent.state_input: state})
            losses.append(v_loss)
            state = next_state

            #Update our running tally of scores.
            if i % 50 == 0 and i > 0:
                mean_reward = np.mean(rewards[-50:])
                summary = tf.Summary()
                summary.value.add(tag='Info/Reward', simple_value=float(mean_reward))
                summary.value.add(tag='Info/Value Loss', simple_value=float(np.mean(losses[-50:])))
                summary_writer.add_summary(summary, i)
                summary_writer.flush()
                print("Trial: {}, Mean Reward: {}".format(str(i), str(round(mean_reward, 3))))
    finally:
        # Release the event file even if training is interrupted.
        summary_writer.close()
env.close()

In [None]:
# Close the Unity environment. The training cell already calls env.close()
# once its loop finishes, so this cell is presumably meant for cleaning up
# after an interrupted run -- confirm before running both in sequence.
env.close()