# Simple Policy Gradient Descent using Tensorflow
Used to solve a multi-armed bandit problem.

https://medium.com/@awjuliani/super-simple-reinforcement-learning-tutorial-part-1-fd544fab149

In [1]:
import tensorflow as tf
import numpy as np

# The bandits
Four bandits, each with its own probability of generating a positive reward. Each bandit is represented by a threshold value: the lower the value, the higher the probability of a positive reward.

In [2]:
# Thresholds of the bandit arms, one entry per arm (consumed by pullArm)
bandits = [
    0.2,
    0,
    0.2,
    -5,
]
num_bandits = len(bandits)  # number of arms

def pullArm(bandit):
    """Pull one bandit arm and return the resulting reward.

    A single sample is drawn from the standard normal distribution and
    compared with the arm's threshold.

    Args:
        bandit: Threshold of the arm being pulled.

    Returns:
        1 if the sample is greater than ``bandit``, otherwise -1.
    """
    draw = np.random.randn(1)  # one standard-normal sample, array of shape (1,)
    return 1 if draw > bandit else -1

# The Agent
The agent has one weight per bandit. Each weight is an estimate of the return from choosing that bandit.
We use a policy gradient method to update the agent by moving the value for the selected action toward the received reward.

In [4]:
# Reset the global default graph before building the ops below
tf.reset_default_graph()

# Forward pass: one trainable weight per bandit, all initialised to 1;
# the greedy action is the index of the largest weight
weights = tf.Variable(tf.ones(shape=[num_bandits]))
chosen_action = tf.argmax(weights, 0)

# Training ops
lr = 0.001  # gradient-descent step size
reward_holder = tf.placeholder(tf.float32, shape=[1])  # fed with the reward of the pulled arm
action_holder = tf.placeholder(tf.int32, shape=[1])  # fed with the index of the pulled arm
# Single-element slice of `weights` at the pulled arm's index
responsible_weight = tf.slice(weights, begin=action_holder, size=[1])
# loss = -log(weight of the pulled arm) * reward
loss = -(tf.log(responsible_weight) * reward_holder)
optimizer = tf.train.GradientDescentOptimizer(lr)
update = optimizer.minimize(loss)

# Training the agent
The agent takes actions in our environment and receives rewards. Using the rewards and actions, we can update the network so that it more often chooses the actions that yield the highest rewards over time.

In [17]:
total_episodes = 1000
total_rewards = np.zeros(num_bandits)  # running reward tally, one slot per bandit
e = 0.1 # exploration probability (taking random action)

init = tf.global_variables_initializer()

with tf.Session() as session:
    session.run(init)

    for i in range(total_episodes):
        # Epsilon-greedy: with probability e pull a random arm,
        # otherwise pull the arm with the largest weight
        if np.random.rand(1) < e:
            action = np.random.randint(num_bandits)
        else:
            action = session.run(chosen_action)

        reward = pullArm(bandits[action]) # Get reward from bandit

        # Update network: one gradient step on the weight of the pulled arm
        session.run(update, feed_dict={reward_holder: [reward], action_holder: [action]})

        # Update our running tally of scores
        total_rewards[action] += reward
        if i % 50 == 0:
            print('Reward for the bandits: ', total_rewards)

    # Read the weights in their own run, after training. Fetching them in the
    # same run as `update` leaves the read/update order undefined, so the value
    # could predate the last gradient step. Reading here also keeps `ww`
    # defined when total_episodes is 0.
    ww = session.run(weights)

# +1 converts the zero-based arg-max index into a 1-based bandit number
print('Agent learning that the best bandit is: ', np.argmax(ww) + 1)
print('Learned weights: ', ww)

Reward for the bandits:  [-1.  0.  0.  0.]
Reward for the bandits:  [ -2.  -2.  -2.  45.]
Reward for the bandits:  [ -3.  -2.  -3.  93.]
Reward for the bandits:  [  -5.   -3.   -3.  140.]
Reward for the bandits:  [  -5.   -5.   -3.  182.]
Reward for the bandits:  [  -5.   -6.   -3.  229.]
Reward for the bandits:  [  -7.   -6.   -2.  274.]
Reward for the bandits:  [  -8.   -6.   -4.  321.]
Reward for the bandits:  [  -8.   -5.   -4.  368.]
Reward for the bandits:  [  -9.   -6.   -5.  415.]
Reward for the bandits:  [ -13.   -5.   -5.  460.]
Reward for the bandits:  [ -12.   -4.   -6.  501.]
Reward for the bandits:  [ -12.   -4.   -6.  551.]
Reward for the bandits:  [ -14.   -4.   -4.  595.]
Reward for the bandits:  [ -13.   -5.   -6.  641.]
Reward for the bandits:  [ -12.   -5.   -6.  688.]
Reward for the bandits:  [ -12.   -5.   -6.  734.]
Reward for the bandits:  [ -12.   -4.   -6.  779.]
Reward for the bandits:  [ -14.   -4.   -5.  824.]
Reward for the bandits:  [ -12.   -4.   -6.  87