In [None]:
import numpy as np
import tensorflow as tf
import tf_agents as tfa
import matplotlib.pyplot as plt

from tf_agents.trajectories import time_step as ts

In [None]:
class TenArmedTestbed(tfa.environments.py_environment.PyEnvironment):
    """Ten-armed bandit environment with Gaussian rewards.

    The mean reward of each of the ten arms is drawn once, at construction
    time, from a standard normal distribution.  Every step samples a reward
    around the chosen arm's mean and ends the episode.
    """

    def __init__(self):
        """Create the observation/action specs and draw the arm means."""
        self._observation_spec = tfa.specs.array_spec.BoundedArraySpec(
            shape=(1,), dtype=np.float32, minimum=-4, maximum=4, name='observation')
        # NOTE(review): the spec declares a length-10 action vector although
        # `_step` uses the action as an index into the ten arms.  Left as-is
        # because the policies in this notebook declare the same spec.
        self._action_spec = tfa.specs.array_spec.BoundedArraySpec(
            shape=(10,), dtype=np.int32, minimum=0, maximum=9, name='action')

        # True (hidden) mean reward of each arm.
        self._action_values = np.random.normal(size=(10,))

        # Name this class, not the base class: super(Base, self) starts the
        # MRO lookup *after* Base and would skip PyEnvironment.__init__.
        super(TenArmedTestbed, self).__init__()

    def action_spec(self):
        """Return the spec describing valid actions."""
        return self._action_spec

    def observation_spec(self):
        """Return the spec describing observations."""
        return self._observation_spec

    def _reset(self):
        """Start a new episode with a zero observation."""
        return ts.restart(0.0, batch_size=self.batch_size)

    def _step(self, action):
        """Sample one reward per arm and return the one selected by `action`.

        The sampled value is used both as the observation and as the reward,
        and the returned time step always terminates the episode.
        """
        observation = np.random.normal(self._action_values, size=(10,))[action]
        reward = observation
        return ts.termination(observation, reward)

In [None]:
class GreedyPolicy(tfa.policies.tf_policy.TFPolicy):
    """Policy that always selects the arm with the largest value estimate."""

    def __init__(self, action_values):
        """Keep a reference to the value estimates and register the specs.

        Args:
            action_values: per-arm value estimates; `_action` takes their
                argmax each time it is called.
        """
        self._action_values = action_values

        obs_spec = tfa.specs.tensor_spec.BoundedTensorSpec(
            shape=(1,), dtype=np.float32, minimum=-4, maximum=4, name='observation')
        act_spec = tfa.specs.tensor_spec.BoundedTensorSpec(
            shape=(10,), dtype=np.int32, minimum=0, maximum=9)

        super().__init__(time_step_spec=ts.time_step_spec(obs_spec),
                         action_spec=act_spec)

    def _action(self, time_step, policy_state, seed=0):
        """Return the index of the highest value estimate.

        `time_step` and `seed` are not used; `policy_state` is passed through
        unchanged.
        """
        best_arm = tf.argmax(self._action_values, output_type=tf.int32)
        return tfa.trajectories.policy_step.PolicyStep(best_arm, policy_state)

In [None]:
class EpsilonGreedyPolicy(tfa.policies.tf_policy.TFPolicy):
    """Policy that explores uniformly with probability epsilon, else is greedy."""

    def __init__(self, epsilon, action_values):
        """Store epsilon and the value estimates and register the specs.

        Args:
            epsilon: threshold compared against a uniform [0, 1) draw; a draw
                below it triggers a uniformly random action.
            action_values: per-arm value estimates used by the greedy branch.
        """
        observation_spec = tfa.specs.tensor_spec.BoundedTensorSpec(
            shape=(1,), dtype=np.float32, minimum=-4, maximum=4, name='observation')
        time_step_spec = ts.time_step_spec(observation_spec)

        action_spec = tfa.specs.tensor_spec.BoundedTensorSpec(
            shape=(10,), dtype=np.int32, minimum=0, maximum=9)

        self._epsilon = epsilon
        self._action_values = action_values

        super(EpsilonGreedyPolicy, self).__init__(time_step_spec=time_step_spec,
                                                  action_spec=action_spec)

    def _action(self, time_step, policy_state, seed=0):
        """Pick a random arm with probability epsilon, otherwise the best arm.

        The returned action always has shape (1,).  `time_step` and `seed`
        are not used; `policy_state` is passed through unchanged.
        """
        if tf.random.uniform([1], maxval=1) < self._epsilon:
            # Explore: uniform integer in [0, 10).
            action = tf.random.uniform([1], maxval=10, dtype=tf.int32)
        else:
            # Exploit: argmax is a scalar, so add a leading axis to give both
            # branches the same (1,) shape.
            greedy_arm = tf.argmax(self._action_values, output_type=tf.int32)
            action = tf.reshape(greedy_arm, [1])

        return tfa.trajectories.policy_step.PolicyStep(action, policy_state)

In [None]:
class Agent(tfa.agents.tf_agent.TFAgent):
    """Sample-average action-value agent for the ten-armed bandit.

    Attributes set in `__init__`:
        action_values: float32 variable with one value estimate per arm.
        naction: float32 variable counting how often each arm was trained on.
        history: Python list of the running average reward; starts as `[0]`
            and gains one entry per call to `_train`.
        nsteps: int32 variable counting calls to `_train`.
    """

    def __init__(self, epsilon, niter=None):
        """Build the greedy and epsilon-greedy policies over shared estimates.

        Args:
            epsilon: exploration probability handed to the collect policy.
            niter: not used by the agent; optional so that both
                `Agent(epsilon)` and `Agent(epsilon, niter)` work.
        """
        self.action_values = tf.Variable(np.zeros(10), dtype=tf.float32)
        self.naction = tf.Variable(np.zeros(10), dtype=tf.float32)
        self.history = [0]
        self.nsteps = tf.Variable(0, dtype=tf.int32)

        epsilon = tf.constant(epsilon)
        # Both policies read the same variable, so updates made in `_train`
        # are visible to them immediately.
        policy = GreedyPolicy(self.action_values)
        collect_policy = EpsilonGreedyPolicy(epsilon, self.action_values)

        time_step_spec = policy.time_step_spec
        action_spec = policy.action_spec

        super(Agent, self).__init__(time_step_spec=time_step_spec,
                                    action_spec=action_spec,
                                    policy=policy,
                                    collect_policy=collect_policy,
                                    train_sequence_length=None)

    def _train(self, experience, weights=None):
        """Update the estimates from a single (action, reward) experience.

        Performs the incremental sample-average update for the chosen arm and
        appends the new running average reward to `history`.  `weights` is
        not used.  Calls `.numpy()`, so it needs eager execution.

        Returns:
            An empty `LossInfo`.
        """
        action = experience.action
        reward = experience.reward
        self.nsteps.assign_add(1)

        action_index = tf.reshape(action, (1, 1))
        arm = tf.squeeze(action)

        # `history` has one entry per previous step plus the initial 0, so its
        # last element is the running average before this step.
        previous_average = self.history[-1]
        average_rewards = previous_average + \
                          1/tf.cast(self.nsteps, tf.float32) * \
                          (reward - previous_average)

        self.history.append(tf.squeeze(average_rewards).numpy())

        # Count this pull before using the count as the step-size denominator.
        self.naction.assign(
            tf.tensor_scatter_nd_add(
                self.naction,
                action_index,
                [1]))

        # Q[a] += (reward - Q[a]) / N[a]
        action_value_update = tf.reshape(
            1/self.naction[arm] * (reward - self.action_values[arm]),
            (1,))
        self.action_values.assign(
            tf.tensor_scatter_nd_add(
                self.action_values,
                action_index,
                action_value_update))

        return tfa.agents.tf_agent.LossInfo((), ())

In [None]:
def trajectory_for_bandit(initial_step, action_step, final_step):
    """Assemble a one-step Trajectory from a bandit interaction.

    Observation and step type come from `initial_step`; reward, discount and
    next step type come from `final_step`; the action comes from
    `action_step`.  Each of those fields gets a new leading axis of size 1,
    while `action_step.info` is passed through unchanged.
    """
    def add_leading_axis(value):
        return tf.expand_dims(value, 0)

    fields = dict(
        step_type=add_leading_axis(initial_step.step_type),
        observation=add_leading_axis(initial_step.observation),
        action=add_leading_axis(action_step.action),
        policy_info=action_step.info,
        next_step_type=add_leading_axis(final_step.step_type),
        reward=add_leading_axis(final_step.reward),
        discount=add_leading_axis(final_step.discount),
    )
    return tfa.trajectories.trajectory.Trajectory(**fields)

In [None]:
# Experiment settings for a single bandit run.
epsilon = 0.1
niter = 1000

# Wrap the Python environment so that it produces and consumes tensors.
environment = TenArmedTestbed()
tf_environment = tfa.environments.tf_py_environment.TFPyEnvironment(environment)

agent = Agent(epsilon, niter)

# Act with the exploratory policy, observe the outcome, train on it, repeat.
step = tf_environment.reset()
for _ in range(niter):
    action_step = agent.collect_policy.action(step)
    next_step = tf_environment.step(action_step.action)
    experience = trajectory_for_bandit(step, action_step, next_step)
    agent.train(experience)
    step = next_step

In [None]:
# agent.history holds the running average reward after each training step
# (index 0 is the initial placeholder value).
plt.plot(agent.history)
plt.xlabel('Training step')
plt.ylabel('Running average reward')
plt.show()

In [None]:
epsilon = 0.1
n_runs = 100     # independent bandit problems to average over
n_steps = 1000   # training steps per problem

# Sum of the per-step running average reward over all runs.
history = np.zeros(n_steps)
for _ in range(n_runs):
    environment = TenArmedTestbed()
    tf_environment = tfa.environments.tf_py_environment.TFPyEnvironment(environment)
    agent = Agent(epsilon, n_steps)

    step = tf_environment.reset()
    for __ in range(n_steps):
        action_step = agent.collect_policy.action(step)
        next_step = tf_environment.step(action_step.action)
        experience = trajectory_for_bandit(step, action_step, next_step)
        agent.train(experience)
        step = next_step

    # agent.history is a plain list whose first entry is the initial
    # placeholder; slicing it off always leaves exactly n_steps values,
    # whereas trimming zeros could produce arrays of differing length.
    history = history + np.asarray(agent.history[1:], dtype=np.float64)
history = history / n_runs

plt.plot(history)
plt.xlabel('Training step')
plt.ylabel('Running average reward (mean over runs)')
plt.show()