In [23]:
import numpy as np
from scipy.stats import entropy

from IPython.display import Image
import matplotlib
import matplotlib.pyplot as plt

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from infomercial.util import Distribution
from infomercial.discrete.value import information_value
from infomercial.local_gym import BanditFourArmedDeterministicFixed

# Get a 4-armed bandit

In [24]:
# Number of bandit arms; used below to size the value/weight arrays of the agents.
num_arms = 4
# Bandit environment shared by the experiments below (each episode calls
# env.reset() followed by a single env.step(action)).
env = BanditFourArmedDeterministicFixed()

In [25]:
def softmax(x, beta=1):
    """Compute softmax probabilities for the scores in x.

    Parameters
    ----------
    x : array-like
        Scores. Normalization is performed along axis 0.
    beta : float, optional
        Multiplier applied to the scores before exponentiation
        (larger values make the distribution more peaked).

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``x`` whose entries along axis 0
        are non-negative and sum to 1.
    """
    scaled = np.asarray(x, dtype=float) * beta
    if scaled.size == 0:
        # Nothing to normalize; mirror the empty result of the naive formula.
        return scaled
    # Softmax is invariant to adding a constant to every score, so shift by
    # the maximum: the largest exponent becomes 0 and exp() cannot overflow
    # (the naive form returns nan once x * beta exceeds ~709).
    shifted = scaled - np.max(scaled, axis=0, keepdims=True)
    weights = np.exp(shifted)
    return weights / np.sum(weights, axis=0)

def policy(x, beta=1):
    """Sample an action from a softmax distribution over the values in x.

    Parameters
    ----------
    x : sequence of float
        One value per action; its length defines the number of actions.
    beta : float, optional
        Forwarded to ``softmax`` as its ``beta`` argument.

    Returns
    -------
    tuple
        ``(action, probs)``: the sampled action index and the probability
        vector it was drawn from.
    """
    probs = softmax(x, beta=beta)
    # Action indices are simply 0 .. len(x) - 1.
    sampled = np.random.choice(np.arange(len(x)), p=probs)
    return sampled, probs
    
def learn(agent, delta, lr=0.1):
    """Move ``agent`` a fraction ``lr`` of the way along ``delta``.

    Note: the update uses ``+=``, so when ``agent`` is a NumPy array it is
    modified in place and the very same object is returned. Callers that
    need the previous values must copy them before calling.

    Parameters
    ----------
    agent : numpy.ndarray or float
        Current estimates.
    delta : numpy.ndarray or float
        Error signal to add, scaled by ``lr``.
    lr : float, optional
        Learning rate.

    Returns
    -------
    The updated ``agent``.
    """
    step = lr * delta
    agent += step
    return agent

# Max reward

In [26]:
# Hyperparameters for the reward-maximizing agent.
num_episodes = 10000  # number of single-step episodes (one arm pull each)
lr = .01              # learning rate passed to learn()
beta = 6              # softmax multiplier passed to policy()

# -----------------------------------------------
# Per-episode logs: rewards received and action probabilities used.
R = []
P = []

# Value estimate per arm, randomly initialized in [0, 1).
r_agent = np.random.rand(num_arms)
# Snapshot of the estimates before the most recent update. learn() updates
# its first argument in place, so a plain `r_old = r_agent` would alias the
# live array and never hold the previous values; copy instead.
r_old = r_agent.copy()
p_action = Distribution()

# -----------------------------------------------
for n in range(num_episodes):
    env.reset()

    # Act: sample an arm from the softmax over current estimates.
    a, p = policy(r_agent, beta=beta)
    p_action.update(a)
    # Only the second element of the step result (the reward) is used.
    _, r, _, _ = env.step(a)

    # RW learning (presumably Rescorla-Wagner -- confirm).
    # The target vector is zero everywhere except the chosen arm, so the
    # estimates of the arms that were not chosen are pulled toward 0.
    reward = np.zeros(num_arms)
    reward[a] = r

    delta = reward - r_agent
    r_old = r_agent.copy()
    r_agent = learn(r_agent, delta, lr)

    # Log
    R.append(r)
    P.append(p)

    # Periodic progress report with the running mean reward.
    if n % 1000 == 0:
        print(f"{n} Action {a}, p {p} -> Avg. R {np.mean(R)}")

0 Action 1, p [0.00726541 0.81519239 0.16996908 0.00757312] -> Avg. R 1.0
1000 Action 1, p [0.00251383 0.99245818 0.00251417 0.00251383] -> Avg. R 0.98001998001998
2000 Action 1, p [0.00266565 0.99200306 0.00266565 0.00266565] -> Avg. R 0.9855072463768116
3000 Action 1, p [0.00247417 0.99257748 0.00247417 0.00247417] -> Avg. R 0.9893368877040987
4000 Action 1, p [0.00268625 0.99194126 0.00268625 0.00268625] -> Avg. R 0.9900024993751562
5000 Action 1, p [0.00254745 0.99235766 0.00254745 0.00254745] -> Avg. R 0.9894021195760848
6000 Action 1, p [0.00246083 0.9926175  0.00246083 0.00246083] -> Avg. R 0.9903349441759707

In [27]:
# Show which actions were taken and the value p_action tracks for each
# (these look like empirical choice frequencies -- confirm against Distribution).
p_action.keys(), p_action.values()

([1, 2, 3, 0], [0.992, 0.0031, 0.0031, 0.0018])

# Max information

In [28]:
# Hyperparameters for the entropy-driven (max information) agent.
num_episodes = 10000  # number of single-step episodes run by the loop below
lr = 0.01             # learning rate handed to the Actor
beta = 6              # softmax multiplier handed to the Actor

# ---------------------------------------------------------------
class Actor:
    """Softmax actor whose per-arm weights are adjusted by H(p(r=1)).

    Calling the instance samples an action through ``policy`` using the
    current weights; ``learn`` shifts the weight of the chosen arm by the
    gap between the binary reward entropy and its maximum, ``log(2)``.
    """

    def __init__(self, num_arms=4, lr=0.1, beta=4):
        """Store the hyperparameters and start from uniform weights.

        Parameters
        ----------
        num_arms : int
            Number of actions (length of the weight vector).
        lr : float
            Step size used by ``learn``.
        beta : float
            Value forwarded to ``policy`` as ``beta``.
        """
        self.num_arms = num_arms
        self.beta = beta
        self.lr = lr
        # Every arm starts with the same weight, 1 / num_arms.
        uniform = np.ones(num_arms)
        self.W = uniform / num_arms

    def __call__(self):
        """Return whatever ``policy`` yields for the current weights."""
        sampled = policy(self.W, beta=self.beta)
        return sampled

    def learn(self, a, p_reward):
        """Update the weight of action ``a`` from the reward probability.

        The entropy of ``[p_reward, 1 - p_reward]`` is at most ``log(2)``,
        so the increment is never positive; it is zero only when
        ``p_reward`` is 0.5.
        """
        h_reward = entropy([p_reward, 1 - p_reward])
        self.W[a] += self.lr * (h_reward - np.log(2))

# ---------------------------------------------------------------
# Actor picks the arms; the critic tracks the observed reward outcomes.
h_actor = Actor(num_arms=num_arms, lr=lr, beta=beta)
h_critic = Distribution()
# Seed the critic with one observation of each outcome (0 and 1) before any
# real reward is seen -- presumably so neither outcome starts at probability
# zero; confirm against Distribution.
h_critic.update(0)
h_critic.update(1)
# Fresh action tally for this run (rebinds the name used by the previous run).
p_action = Distribution()

# Per-episode logs: reward entropy and the action probabilities used.
H = []
P = []
for n in range(num_episodes):
    env.reset()
    
    # Act
    a, p = h_actor()
    p_action.update(a)
    # Only the second element of the step result (the reward) is used.
    _, r, _, _ = env.step(a)
    
    # Entropy grad. learning
    h_critic.update(r)
    # h_critic(1) is used as the current estimate of p(r = 1)
    # (assumed from how it is consumed below -- confirm against Distribution).
    p_reward = h_critic(1)
    h_actor.learn(a, p_reward)
    
    # Log
    # Same binary entropy the actor just used in its update, kept for logging.
    h = entropy([p_reward, 1 - p_reward])
    H.append(h)
    P.append(p)
    
    # Periodic progress report.
    if n % 1000 == 0:    
        print(f"{n}: a {a}, r {r}, p_r {p_reward} -> H {h}")

0: a 1, r 1, p_r 0.6666666666666666 -> H 0.6365141682948128
1000: a 1, r 1, p_r 0.2622133599202393 -> H 0.5753593141339879
2000: a 2, r 0, p_r 0.2556165751372941 -> H 0.5684218751893065
3000: a 0, r 0, p_r 0.2524142524142524 -> H 0.5649719621848507
4000: a 1, r 1, p_r 0.2533100174868848 -> H 0.5659424392334094
5000: a 0, r 0, p_r 0.25344793124125525 -> H 0.5660914786901137
6000: a 1, r 1, p_r 0.25170747959353656 -> H 0.5642032398052755
7000: a 1, r 1, p_r 0.2527488219334571 -> H 0.5653349337013692
8000: a 3, r 0, p_r 0.25228039485193055 -> H 0.5648265751846111

In [29]:
# Show which actions were taken and the value p_action tracks for each
# (these look like empirical choice frequencies -- confirm against Distribution).
p_action.keys(), p_action.values()

([1, 0, 3, 2], [0.2511, 0.2501, 0.2491, 0.2497])

# Max information value