In [12]:
import numpy as np
# from scipy.stats import entropy

from IPython.display import Image
import matplotlib
import matplotlib.pyplot as plt

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from infomercial.util import Distribution
from infomercial.discrete.value import information_value
from infomercial.discrete.value import entropy
from infomercial.discrete.value import estimate_prob

from infomercial.local_gym import BanditFourArmedDeterministicFixed

# Get a 4-armed bandit

In [2]:
# Shared bandit environment; the training loops below call env.reset() and
# env.step(a) on it.
env = BanditFourArmedDeterministicFixed()

# Number of arms; sizes the agents' value / weight vectors.
num_arms = 4

## Define some utility functions

In [3]:
def softmax(x, beta=1):
    """Compute softmax probabilities for a set of scores.

    Parameters
    ----------
    x : array-like
        Scores. Normalization is along axis 0, so a 1-D input yields a
        single probability vector.
    beta : float
        Inverse temperature; the scores are multiplied by `beta` before
        exponentiation.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as `x` whose entries along axis 0 sum to 1.
    """
    z = np.asarray(x, dtype=float) * beta
    if z.size == 0:
        # Nothing to normalize (np.max would raise on an empty array).
        return z

    # Softmax is invariant to adding a constant to every score. Shifting by
    # the max keeps every exponent <= 0, so np.exp cannot overflow to inf
    # (which would otherwise produce inf / inf = nan for large beta * x).
    z = z - np.max(z, axis=0, keepdims=True)
    e = np.exp(z)
    return e / np.sum(e, axis=0, keepdims=True)

def policy(x, beta=1):
    """Sample one action index from a softmax over the scores `x`.

    Returns a tuple `(action, probs)`: the sampled index in
    `0 .. len(x) - 1` and the softmax probabilities it was drawn from.
    """
    probs = softmax(x, beta=beta)
    choice = np.random.choice(np.arange(0, len(x)), p=probs)
    return choice, probs
    
def learn(agent, delta, lr=0.1):
    """Return `agent` moved a step of size `lr` along `delta`.

    Parameters
    ----------
    agent : numpy.ndarray
        Current value estimates.
    delta : numpy.ndarray
        Prediction error, same shape as `agent`.
    lr : float
        Learning rate.

    Returns
    -------
    numpy.ndarray
        A new array `agent + lr * delta`. The input `agent` is left
        untouched, so a caller that kept a reference to the previous
        estimates (e.g. `r_old = r_agent`) still sees the old values.
    """
    # Deliberately not `agent += ...`: the in-place form silently rewrites
    # every alias of the caller's array.
    return agent + lr * delta

# Max reward

In [4]:
# Hyperparameters for the reward-maximizing run.
num_episodes = 10000
lr = .01
beta = 6

# -----------------------------------------------
# Per-episode logs: obtained reward and the action probabilities used.
R = []
P = []

# Value estimates start at random values in [0, 1).
r_agent = np.random.rand(num_arms)
# Snapshot, not alias: `r_old = r_agent` would track every later in-place
# change to r_agent and never hold the previous estimates.
r_old = r_agent.copy()
actions = []

# -----------------------------------------------
for n in range(num_episodes):
    env.reset()

    # Act!
    a, p = policy(r_agent, beta=beta)
    actions.append(a)
    _, r, _, _ = env.step(a)

    # RW learning.
    # The target is r for the chosen arm and 0 for every other arm, so the
    # estimates of unchosen arms are pulled toward 0 on each step.
    reward = np.zeros(num_arms)
    reward[a] = r

    delta = reward - r_agent
    # Copy before updating so r_old keeps the pre-update values even if
    # learn() modifies its argument in place.
    r_old = r_agent.copy()
    r_agent = learn(r_agent, delta, lr)

    # Log
    R.append(r)
    P.append(p)

    if n % 1000 == 0:
        print(f"{n} Action {a}, p {p} -> Avg. R {np.mean(R)}")

0 Action 3, p [0.01218356 0.02312752 0.04387168 0.92081725] -> Avg. R 0.0
1000 Action 1, p [0.00254124 0.99237566 0.00254138 0.00254171] -> Avg. R 0.8091908091908092
2000 Action 1, p [0.00254502 0.99236493 0.00254502 0.00254502] -> Avg. R 0.9020489755122438
3000 Action 1, p [0.00251266 0.99246203 0.00251266 0.00251266] -> Avg. R 0.931356214595135
0 Action 3, p [0.01218356 0.02312752 0.04387168 0.92081725] -> Avg. R 0.0
1000 Action 1, p [0.00254124 0.99237566 0.00254138 0.00254171] -> Avg. R 0.8091908091908092
2000 Action 1, p [0.00254502 0.99236493 0.00254502 0.00254502] -> Avg. R 0.9020489755122438
3000 Action 1, p [0.00251266 0.99246203 0.00251266 0.00251266] -> Avg. R 0.931356214595135
0 Action 3, p [0.01218356 0.02312752 0.04387168 0.92081725] -> Avg. R 0.0
1000 Action 1, p [0.00254124 0.99237566 0.00254138 0.00254171] -> Avg. R 0.8091908091908092
2000 Action 1, p [0.00254502 0.99236493 0.00254502 0.00254502] -> Avg. R 0.9020489755122438
3000 Action 1, p [0.00251266 0.99246203 0.00

In [19]:
# Summarize the actions taken during the run: the estimated probability of
# each distinct action, and the entropy of the action sequence.
probs, cond = estimate_prob(actions)
h = entropy(actions)

print(cond, probs)
print(h)

[0, 1, 2, 3] [0.0051 0.9732 0.0066 0.0151]
0.14980979257720722
[0, 1, 2, 3] [0.0051 0.9732 0.0066 0.0151]
0.14980979257720722
[0, 1, 2, 3] [0.0051 0.9732 0.0066 0.0151]
0.14980979257720722


# Max information

In [None]:
# Settings for the max-information run: number of episodes in the loop
# below, and the learning rate / softmax beta handed to the Actor.
num_episodes, lr, beta = 10000, 0.01, 6

# ---------------------------------------------------------------
class Actor(object):
    """A max H(p(r=1)) actor agent.

    Keeps one weight per arm and samples actions through the softmax
    `policy`. Each learning step shifts the chosen arm's weight by
    `lr * (H(p_reward) - log 2)`, where H is the entropy (in nats) of a
    Bernoulli variable with success probability `p_reward`.

    Parameters
    ----------
    num_arms : int
        Number of arms (length of the weight vector).
    lr : float
        Learning rate applied to each weight update.
    beta : float
        Inverse temperature passed to `policy`.
    """
    def __init__(self, num_arms=4, lr=0.1, beta=4):
        self.lr = lr
        self.beta = beta
        self.num_arms = num_arms
        # Uniform initial weights: every arm starts equally likely.
        self.W = np.ones(num_arms) / num_arms

    def __call__(self):
        """Sample an action; returns whatever `policy` returns for W."""
        return policy(self.W, beta=self.beta)

    @staticmethod
    def _binary_entropy(p):
        """Entropy in nats of a Bernoulli(p) variable, with 0 * log(0) = 0."""
        q = np.array([p, 1.0 - p], dtype=float)
        q = q[q > 0]
        return float(-np.sum(q * np.log(q)))

    def learn(self, a, p_reward):
        """Update the weight of arm `a` from the reward probability.

        The entropy is computed directly from the probabilities. The
        notebook's `entropy` helper is called elsewhere on raw samples
        (`entropy(actions)`), so passing it `[p, 1 - p]` would treat the two
        probabilities as two observations and return log 2 for any
        p != 0.5, making this update a no-op.
        """
        # H <= log 2, so the change is never positive; it is zero only when
        # p_reward == 0.5.
        self.W[a] += self.lr * (self._binary_entropy(p_reward) - np.log(2))

# ---------------------------------------------------------------
h_actor = Actor(num_arms=num_arms, lr=lr, beta=beta)
h_critic = Distribution()
# Prime the critic with one observation of each reward outcome (0 and 1).
# NOTE(review): presumably this keeps p(r=1) away from 0 or 1 on the first
# episodes; confirm against Distribution.
h_critic.update(0)
h_critic.update(1)
p_action = Distribution()

# Per-episode logs: reward entropy and the action probabilities used.
H = []
P = []
for n in range(num_episodes):
    env.reset()

    # Act
    a, p = h_actor()
    p_action.update(a)
    _, r, _, _ = env.step(a)

    # Entropy grad. learning
    # NOTE(review): h_critic(1) is assumed to return the estimated
    # probability of reward == 1; confirm against Distribution.
    h_critic.update(r)
    p_reward = h_critic(1)
    h_actor.learn(a, p_reward)

    # Log
    # Bernoulli entropy (nats) computed from the probabilities themselves.
    # `entropy([p, 1 - p])` would treat the two numbers as raw samples, as in
    # `entropy(actions)` earlier in this notebook, and report log 2 for any
    # p != 0.5.
    reward_probs = np.array([p_reward, 1 - p_reward], dtype=float)
    reward_probs = reward_probs[reward_probs > 0]  # 0 * log(0) counts as 0
    h = float(-np.sum(reward_probs * np.log(reward_probs)))
    H.append(h)
    P.append(p)

    if n % 1000 == 0:
        print(f"{n}: a {a}, r {r}, p_r {p_reward} -> H {h}")

In [None]:
# Show what p_action accumulated over the run (it was updated with every
# chosen action). Kept as the cell's last bare expression so the notebook
# displays it.
# NOTE(review): presumably keys() are the observed actions and probs() their
# empirical probabilities; confirm against Distribution.
p_action.keys(), p_action.probs()

# Max information value