In [1]:
import gym
import numpy as np

bins_range = [(-2.4, 2.4), (-5.0, 5.0), (-0.42, 0.42), (-3.5, 3.5)]
bins = [np.linspace(mn, mx, 100)[1:-1] for mn, mx in bins_range]

from collections import defaultdict
Q = defaultdict(float)

def build_state(obs):
    state = ''
    for feature, _bin in zip(obs, bins):
        state += str(np.digitize(feature, _bin))
    return state


def update_Q(s, r, a, s_next, done):
    max_q_next = max([Q[s_next, i] for i in actions]) 
    # Do not include the next state's value if currently at the terminal state.
    Q[s, a] += alpha * (r + gamma * max_q_next * (1.0 - done) - Q[s, a])
    
    

def Q_policy(state):
    if np.random.random() < epsilon:
        return env.action_space.sample()
    state = build_state(ob)
    # Pick the action with highest q value.
    qvals = {a: Q[state, a] for a in actions}
    max_q = max(qvals.values())
    # In case multiple actions have the same maximum q value.
    actions_with_max_q = [a for a, q in qvals.items() if q == max_q]
    return np.random.choice(actions_with_max_q)


n_steps = 500
episodes = 1000
epsilon = 0.2  # 10% chances to apply a random action
gamma = 0.95  # Discounting factor
alpha = 0.01 # soft update param

env = gym.make("CartPole-v0")
actions = range(2)
ob = env.reset()
rewards = []


for episode in range(episodes):
    reward = 0.0
    for step in range(n_steps):
        state = build_state(ob)
        action = Q_policy(state)
        ob_next, r, done, _ = env.step(action)
        state_next = build_state(ob_next)
        update_Q(state, r, action, state_next, done)
        reward += r
        if done:
            rewards.append(reward)
            reward = 0.0
            ob = env.reset()
        else:
            ob = ob_next
print np.mean(rewards), np.std(rewards), np.min(rewards), np.max(rewards)

23.036634556067007 12.881722355726799 1.0 157.0
