A very simple contextual bandit example with 3 arms

In [1]:
import gym
from gym.spaces import Discrete, Box
import numpy as np
import random
from ray import tune
import time

In [2]:
class SimpleContextualBandit(gym.Env):
    """One-step contextual bandit with three arms and two possible contexts.

    Every episode draws a context of -1.0 or 1.0 in `reset` and finishes
    after a single `step`. Arm 0 pays +10 in context 1.0 and -10 in
    context -1.0, arm 2 pays the opposite, and arm 1 always pays 0.
    """

    def __init__(self, config=None):
        """Create the spaces; `config` is accepted but not read here."""
        self.action_space = Discrete(3)
        self.observation_space = Box(low=-1., high=1., shape=(2, ))
        # No context exists until reset() has been called.
        self.cur_context = None

    def reset(self):
        """Draw a new context at random and return `[context, -context]`."""
        context = random.choice([-1., 1.])
        self.cur_context = context
        return np.array([context, -context])

    def step(self, action):
        """Pull arm `action` under the current context.

        Returns:
            A tuple `(observation, reward, done, info)`. The observation is
            `[-context, context]` (the reverse sign order of what `reset`
            returns), `done` is always True, and `info["regret"]` is
            `10 - reward`, i.e. the gap to the best payoff of 10.
        """
        context = self.cur_context

        # Payoff per arm for context 1.0; context -1.0 is the mirror image.
        payoffs = [10, 0, -10]
        reward_table = {1.: payoffs, -1.: payoffs[::-1]}
        reward = reward_table[context][action]

        info = {"regret": 10 - reward}
        observation = np.array([-context, context])
        return observation, reward, True, info

In [3]:
# Build the environment with its default (unused) config.
bandit = SimpleContextualBandit(config=None)
# Start an episode; the cell displays the initial observation [context, -context].
bandit.reset()



array([-1.,  1.])

In [4]:
# Pull ten random arms without calling reset() in between, so every
# transition below is for the context drawn by the last reset().
for i in range(10):
    action = bandit.action_space.sample()
    # Show the chosen arm so each reward can be checked against the payoff table.
    print("action", action)
    # Despite its name, `obs` holds the full (observation, reward, done, info) tuple.
    obs = bandit.step(action)
    print(obs)

action 2
(array([ 1., -1.]), 10, True, {'regret': 0})
action 0
(array([ 1., -1.]), -10, True, {'regret': 20})
action 2
(array([ 1., -1.]), 10, True, {'regret': 0})
action 1
(array([ 1., -1.]), 0, True, {'regret': 10})
action 0
(array([ 1., -1.]), -10, True, {'regret': 20})
action 0
(array([ 1., -1.]), -10, True, {'regret': 20})
action 1
(array([ 1., -1.]), 0, True, {'regret': 10})
action 0
(array([ 1., -1.]), -10, True, {'regret': 20})
action 2
(array([ 1., -1.]), 10, True, {'regret': 0})
action 2
(array([ 1., -1.]), 10, True, {'regret': 0})


In [6]:
# Stopping criteria passed to `tune.run` as `stop`; per the Tune docs the trial
# ends as soon as any one of these result fields reaches its threshold.
stop = dict(
    training_iteration=200,
    timesteps_total=100000,
    episode_reward_mean=10.0,
)

# Trainer configuration: only the environment class is set, everything else
# is left at the trainer's defaults.
config = dict(env=SimpleContextualBandit)

In [7]:
# Time the whole Tune run with a monotonic clock so the reported duration
# cannot be skewed by system clock adjustments. The measured span covers
# everything tune.run does, not just the training iterations.
start_time = time.perf_counter()

# Train the "contrib/LinUCB" trainable on the bandit until a `stop` criterion is met.
analysis = tune.run("contrib/LinUCB", config=config, stop=stop)

print("The trials took", time.perf_counter() - start_time, "seconds\n")

2020-05-25 22:20:45,852	INFO resource_spec.py:212 -- Starting Ray with 3.81 GiB memory available for workers and up to 1.91 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-05-25 22:20:46,265	INFO services.py:1170 -- View the Ray dashboard at localhost:8265


Trial name,status,loc
contrib_LinUCB_SimpleContextualBandit_00000,RUNNING,


[2m[36m(pid=49223)[0m 2020-05-25 22:20:56,301	INFO trainer.py:421 -- Tip: set 'eager': true or the --eager flag to enable TensorFlow eager execution
[2m[36m(pid=49223)[0m 2020-05-25 22:20:56,305	INFO trainer.py:580 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=49223)[0m 2020-05-25 22:20:56,368	INFO trainable.py:217 -- Getting current IP.
Result for contrib_LinUCB_SimpleContextualBandit_00000:
  custom_metrics: {}
  date: 2020-05-25_22-20-56
  done: false
  episode_len_mean: 1.0
  episode_reward_max: 10.0
  episode_reward_mean: 9.9
  episode_reward_min: 0.0
  episodes_this_iter: 100
  episodes_total: 100
  experiment_id: 9605be5530a649efb330e6914bb5a8ef
  experiment_tag: '0'
  grad_time_ms: 0.317
  hostname: derwen
  info:
    grad_time_ms: 0.317
    learner:
      cumulative_regret: 10.0
      update_latency: 0.00015091896057128906
    num_steps_sampled: 100
    num_steps_trained: 100
    opt_peak

Trial name,status,loc,iter,total time (s),ts,reward
contrib_LinUCB_SimpleContextualBandit_00000,TERMINATED,,2,0.302497,200,10


The trials took 10.897155046463013 seconds

