In [4]:
import gym
import gym.spaces
import math
import numpy as np
import time
from qlearning_template import QLearningAgent

def discretize_range(lower_bound, upper_bound, num_bins):
    return np.linspace(lower_bound, upper_bound, num_bins + 1)[1:-1]


def discretize_value(value, bins):
    return np.digitize(x=value, bins=bins)


def build_state(observation):
	# you should use discretize_value() functions to build single state representation.
    state = None
	
    return state


def play_and_train(env, agent, visualize=False, t_max=10 ** 4):
    """This function should
    - run a full game, actions given by agent.get_action(s)
    - train agent using agent.update(...) whenever possible
    - return total reward"""
    total_reward = 0.0
    s = env.reset()

    for t in range(t_max):
        d_s = build_state(s)

        a = agent.get_action(d_s)

        next_s, r, done, _ = env.step(a)
        if visualize:
            env.render()
            time.sleep(0.05)
        d_next_s = build_state(next_s)
        agent.update(d_s, a, d_next_s, r)
        s = next_s
        total_reward += r
        if done:
            break

    return total_reward


if __name__ == '__main__':
    env = gym.make("CartPole-v0").env
    env.reset()
    n_actions = env.action_space.n

    print(env.observation_space.high)
    print(env.observation_space.low)
    print('CartPole state: %s' % (env.reset()))

    agent = QLearningAgent(alpha = 0.3, get_legal_actions=lambda s: range(n_actions), epsilon=0.5, discount=1.0)

    # (x, x', theta, theta')
    state_bins = [  # Cart position.
        discretize_range(-2.4, 2.4, 2),
        # Cart velocity.
        discretize_range(-3.0, 3.0, 2),
        # Pole angle.
        discretize_range(-0.5, 0.5, 7),
        # Tip velocity.
        discretize_range(-2.0, 2.0, 7)
    ]
    max_bins = max(len(bin) for bin in state_bins)

    rewards = []
    for i in range(2000):
        rewards.append(play_and_train(env, agent))
        agent.epsilon *= 0.999

        if i % 10 == 0:
            print('Iteration {}, Average reward {:.2f}, Epsilon {:.3f}'.format(i, np.mean(rewards), agent.epsilon))

    print('Reward of Test agent = %.3f' % play_and_train(env, agent, visualize=True))

[4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]
[-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38]
CartPole state: [ 0.01939038  0.04881888 -0.00071829  0.01545599]
Iteration 0, Average reward 14.00, Epsilon 0.499
Iteration 10, Average reward 12.45, Epsilon 0.495
Iteration 20, Average reward 12.19, Epsilon 0.490
Iteration 30, Average reward 12.71, Epsilon 0.485
Iteration 40, Average reward 12.32, Epsilon 0.480
Iteration 50, Average reward 12.37, Epsilon 0.475
Iteration 60, Average reward 12.66, Epsilon 0.470
Iteration 70, Average reward 12.79, Epsilon 0.466
Iteration 80, Average reward 13.11, Epsilon 0.461
Iteration 90, Average reward 12.98, Epsilon 0.456
Iteration 100, Average reward 13.26, Epsilon 0.452
Iteration 110, Average reward 13.32, Epsilon 0.447
Iteration 120, Average reward 13.28, Epsilon 0.443
Iteration 130, Average reward 13.18, Epsilon 0.439
Iteration 140, Average reward 13.24, Epsilon 0.434
Iteration 150, Average reward 13.19, Epsilon 0.430
Iteration

Iteration 1920, Average reward 10.83, Epsilon 0.073
Iteration 1930, Average reward 10.83, Epsilon 0.072
Iteration 1940, Average reward 10.83, Epsilon 0.072
Iteration 1950, Average reward 10.82, Epsilon 0.071
Iteration 1960, Average reward 10.82, Epsilon 0.070
Iteration 1970, Average reward 10.81, Epsilon 0.070
Iteration 1980, Average reward 10.81, Epsilon 0.069
Iteration 1990, Average reward 10.80, Epsilon 0.068
Reward of Test agent = 12.000
