In [22]:
%load_ext autoreload
%autoreload 2

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

import algos
import features
import parametric
import policy
import chicken
from agents import HordeAgent
from rlbench import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
def make_agents(algo_lst, target, phi, update_params):
    """Build one HordeAgent per algorithm class in `algo_lst`.

    Each class is instantiated with `phi.length` as its only argument. The
    entries of `update_params` whose keys appear in that instance's
    `update_params` attribute are then handed, together with `target` and
    `phi`, to `HordeAgent`.

    Brittle by design: it relies on every class accepting a single
    positional argument and exposing an `update_params` attribute.

    Returns a list of agents in the same order as `algo_lst`.
    """
    def _wrap(algo_cls):
        learner = algo_cls(phi.length)
        # Keep only the settings this particular learner declares.
        accepted = {}
        for name, value in update_params.items():
            if name in learner.update_params:
                accepted[name] = value
        return HordeAgent(learner, target, phi, accepted)

    return [_wrap(algo_cls) for algo_cls in algo_lst]

In [19]:
# --- Experiment definition -------------------------------------------------
num_states = 8
num_features = 8

# Environment under study.
env = chicken.Chicken(num_states)

# Hard-coded reference ("true") value for each state, keyed by state index.
# NOTE(review): the numbers look like 0.9 ** (7 - s); provenance is not shown
# here -- confirm before changing `num_states`.
true_values = dict(enumerate([
    0.4782968999999999,
    0.531441,
    0.5904899999999993,
    0.6560999999999992,
    0.7289999999999992,
    0.8099999999999997,
    0.8999999999999999,
    0.9999999999999998,
]))

# --- Algorithm parameters --------------------------------------------------
# Per-state discount: 0.9 for every state, then overridden to 0 for state 0.
state_gamma = dict.fromkeys(env.states, 0.9)
state_gamma[0] = 0

# The same table is wrapped twice: once keyed on the current state and once
# on the next state (going by the helper names).
f_gm = parametric.MapState(state_gamma)
f_gm_p = parametric.MapNextState(state_gamma)

update_params = dict(
    alpha=0.02,
    beta=0.005,
    gm=f_gm,
    gm_p=f_gm_p,
    lm=0.0,
    lm_p=0.0,
    interest=1.0,
)

# --- Policies --------------------------------------------------------------
# Each state maps to a dict keyed by action (presumably action -> probability).
# Target policy: every state maps to {0: 1}.
pol_pi = policy.FixedPolicy(dict((s, {0: 1}) for s in env.states))
# Behavior policy: same as the target below state 4, an even split between
# actions 0 and 1 from state 4 upward.
pol_mu = policy.FixedPolicy({
    s: ({0: 1} if s < 4 else {0: 0.5, 1: 0.5})
    for s in env.states
})

# --- Feature mapping -------------------------------------------------------
# Seeded random binary features; features.Int2Unary(num_states) is the
# alternative mapping that was tried here.
phi = features.RandomBinary(num_features, num_features // 2, random_seed=101011)


# --- Run -------------------------------------------------------------------
max_steps = 1000
agent_lst = make_agents(list(algos.algo_registry.values()), pol_pi, phi, update_params)
data = run_many(agent_lst, pol_mu, env, max_steps)

In [25]:
# Report, for each agent, the root-mean-square error between its estimated
# state values and the reference values.
tvec = dct2vec(true_values)
for agent in agent_lst:
    print(agent.algo.__class__)
    estimated = dct2vec(get_values(env.states, phi, agent.theta))
    error = estimated - tvec
    rmse = np.sqrt(np.mean(error ** 2))
    print(rmse)


<class 'algos.GTD2'>
0.704496309448
<class 'algos.TDC'>
0.609349583968
<class 'algos.LSTD'>
0.622027146307
<class 'algos.TD'>
0.714806322764
<class 'algos.GTD'>
0.703004178962
<class 'algos.ETD'>
0.327433749876
