# Exercise 1: Multi-Armed Bandit

In [1]:
from __future__ import division

import numpy as np
import os
import tensorflow as tf
import json

from unityagents import UnityEnvironment

  return f(*args, **kwds)


### Hyperparameters

In [2]:
# Directory that receives the summary statistics written during training.
summary_path = './summaries/bandit-1'

# Total number of episodes to run in the environment.
total_episodes = 4000

# Learning rate handed to the agent's optimizer.
learning_rate = 0.01

### Load the Unity Environment

In [3]:
# Create the Unity environment from the build at ./envs/Bandit. The resulting
# `env` handle is shared by every cell below (reset, step, close).
env = UnityEnvironment("./envs/Bandit")

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: BanditBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: discrete
        Vector Observation space size (per agent): 1
        Number of stacked Vector Observation: 1
        Vector Action space type: discrete
        Vector Action space size (per agent): 3
        Vector Action descriptions: , , 


### Examine the state space

In [4]:
# Reset the environment; the result is indexed by brain name below.
brain = env.reset()
# Select the entry belonging to the single brain used in this exercise.
bandit_brain = brain["BanditBrain"]
# Show the vector observations the agent receives right after a reset.
print(bandit_brain.vector_observations)

[[0.]]


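The environment reports a single observation with the value 0, which gives the agent nothing to distinguish one step from another. It is therefore effectively stateless: the agent only has to learn how valuable each of the three actions is.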

### The Stateless Agent

In [5]:
class Agent(object):
    """Stateless bandit agent that learns one value estimate per action.

    The graph built here exposes:
      * value_estimates       -- trainable vector with one entry per action,
                                 initialised to 1.0.
      * action_probabilities  -- softmax (Boltzmann) distribution over the
                                 value estimates, scaled by `temperature`.
      * reward_holder         -- float32 placeholder of shape [1] for the
                                 observed reward.
      * action_holder         -- int32 placeholder of shape [1] for the index
                                 of the action that was taken.
      * responsible_value     -- the value estimate of the chosen action.
      * loss                  -- squared difference between that estimate and
                                 the observed reward.
      * update                -- Adam step that minimises `loss`.

    Args:
        learning_rate: Learning rate passed to the Adam optimizer.
        num_actions: Number of available actions (size of the value vector).
        temperature: Positive softmax temperature. Lower values make the
            distribution more peaked around the highest estimate. Defaults
            to 0.5, the value that was previously hard-coded.

    Raises:
        ValueError: If `temperature` is not strictly positive.
    """

    def __init__(self, learning_rate, num_actions, temperature=0.5):
        if temperature <= 0:
            raise ValueError(
                "temperature must be positive, got {}".format(temperature))

        # These two lines establish the feed-forward part of the network.
        self.value_estimates = tf.Variable(tf.ones([num_actions]))
        self.action_probabilities = tf.nn.softmax(self.value_estimates / temperature)

        # These lines establish the training procedure.
        # We feed the reward and chosen action into the network
        # to compute the loss, and use it to update the network.
        self.reward_holder = tf.placeholder(shape=[1],dtype=tf.float32)
        self.action_holder = tf.placeholder(shape=[1],dtype=tf.int32)

        # Pick out the single value estimate that belongs to the chosen action,
        # so only that estimate contributes to the loss.
        self.responsible_value = tf.slice(self.value_estimates,self.action_holder,[1])

        # We take the difference between the empirical reward and the value estimate.
        self.loss = tf.squared_difference(self.responsible_value, self.reward_holder)
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        self.update = optimizer.minimize(self.loss)

### Training the Agent

In [6]:
# Make sure the directory for summary statistics exists.
if not os.path.exists(summary_path):
    os.makedirs(summary_path)

# Create our tensorflow agent on a fresh graph so re-running this cell does
# not accumulate duplicate variables.
tf.reset_default_graph()
agent = Agent(learning_rate, env.brains["BanditBrain"].vector_action_space_size)
init = tf.global_variables_initializer()
sess = tf.InteractiveSession()
sess.run(init)
summary_writer = tf.summary.FileWriter(summary_path)

rewards = []
losses = []

try:
    env.reset()

    for i in range(total_episodes):
        # Both exploration and exploitation need the current policy and value
        # estimates, so fetch them once per step.
        action_probs, values = sess.run([agent.action_probabilities, agent.value_estimates])

        if i < total_episodes / 2:
            # Pick action according to Boltzmann distribution.
            # Sample the action *index* directly. Sampling a probability value
            # and mapping it back with argmax(actions == a) always resolves
            # ties to the lowest index, which biases exploration whenever two
            # actions share the same probability (e.g. at initialisation).
            action = np.random.choice(len(action_probs), p=action_probs)
        else:
            # Pick action greedily
            action = np.argmax(action_probs)

        # Get our reward from picking one of the chests. The current value
        # estimates are passed along as a JSON-encoded text action.
        brains = env.step(vector_action = action, text_action = json.dumps(values.tolist()))
        bandit_brain = brains["BanditBrain"]
        reward = bandit_brain.rewards[0]
        rewards.append(reward)

        # Update the agent with the observed reward for the chosen action.
        _, value_loss = sess.run([agent.update, agent.loss], feed_dict={agent.reward_holder:[reward],
                                              agent.action_holder:[action]})

        losses.append(value_loss)

        # Update our running tally of scores every 50 steps, averaging over
        # the 50 most recent rewards and losses.
        if i % 50 == 0 and i > 0:
            summary = tf.Summary()
            summary.value.add(tag='Info/Reward', simple_value=float(np.mean(rewards[-50:])))
            summary.value.add(tag='Info/Value Loss', simple_value=float(np.mean(losses[-50:])))
            summary_writer.add_summary(summary, i)
            summary_writer.flush()
            print("Trial: {}, Mean Reward: {}".format(str(i), str(round(np.mean(rewards[-50:]), 3))))
finally:
    # Always release the summary file and the Unity environment, even if the
    # loop is interrupted or raises part-way through training.
    summary_writer.close()
    env.close()

Mean Reward: -0.4
Mean Reward: -0.2
Mean Reward: -0.36
Mean Reward: -0.2
Mean Reward: -0.04
Mean Reward: -0.4
Mean Reward: -0.16
Mean Reward: -0.08
Mean Reward: 0.08
Mean Reward: 0.48
Mean Reward: 0.24
Mean Reward: 0.32
Mean Reward: 0.4
Mean Reward: 0.08
Mean Reward: 0.08
Mean Reward: -0.04
Mean Reward: 0.2
Mean Reward: 0.12
Mean Reward: -0.04
Mean Reward: 0.12
Mean Reward: 0.32
Mean Reward: 0.4
Mean Reward: 0.12
Mean Reward: -0.04
Mean Reward: 0.04
Mean Reward: 0.12
Mean Reward: 0.04
Mean Reward: 0.08
Mean Reward: 0.08
Mean Reward: -0.08
Mean Reward: -0.12
Mean Reward: 0.28
Mean Reward: 0.04
Mean Reward: 0.04
Mean Reward: 0.28
Mean Reward: 0.04
Mean Reward: 0.16
Mean Reward: 0.2
Mean Reward: 0.04
Mean Reward: 0.16
Mean Reward: 0.32
Mean Reward: 0.52
Mean Reward: 0.52
Mean Reward: 0.6
Mean Reward: 0.64
Mean Reward: 0.28
Mean Reward: 0.36
Mean Reward: 0.44
Mean Reward: 0.48
Mean Reward: 0.44
Mean Reward: 0.6
Mean Reward: 0.56
Mean Reward: 0.44
Mean Reward: 0.28
Mean Reward: 0.4
Mean Rew

In [None]:
# NOTE(review): the training cell already calls env.close() when it finishes;
# confirm that closing the environment a second time is safe before running
# this cell after training. It is mainly useful after an interrupted run.
env.close()