In [1]:
import gym 
import numpy as np

In [3]:
# Build the CartPole environment used by every later cell.
env_name = "CartPole-v1"
env = gym.make(env_name)
# Bare expression on the last line so the notebook displays the initial
# observation returned by reset (a 4-element array).
env.reset()

array([ 0.02067652,  0.01344542, -0.03101065, -0.00176886])

In [6]:
# Smoke-test the environment: drive it with random actions for 250 steps
# while rendering, to check that stepping and rendering work.
env.reset()
try:
    for _ in range(250):
        action = env.action_space.sample()
        new_state, reward, done, _ = env.step(action)
        env.render()
        if done:
            # Stepping an environment that has already reported done is
            # undefined behaviour in Gym, so start a fresh episode instead.
            env.reset()
finally:
    # Always release the render window, even if the loop is interrupted.
    env.close()



In [7]:
# Show the bounds of the observation space and the number of actions, then
# list the upper/lower bound of each observation component on its own line.
obs_space = env.observation_space
print("Observation space high: ", obs_space.high)
print("Observation space low", obs_space.low)
print("Actions: ", env.action_space.n)
print("-----------------------------------------------------------------")
component_labels = (
    "Cart Possition: ",
    "Cart Velosity: ",
    "Pole Angle: ",
    "Pole Angular Velocity: ",
)
for label, upper, lower in zip(component_labels, obs_space.high, obs_space.low):
    print(label, upper, lower)

Observation space high:  [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]
Observation space low [-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38]
Actions:  2
-----------------------------------------------------------------
Cart Possition:  4.8 -4.8
Cart Velosity:  3.4028235e+38 -3.4028235e+38
Pole Angle:  0.41887903 -0.41887903
Pole Angular Velocity:  3.4028235e+38 -3.4028235e+38


In [8]:
# The two velocity components of the observation are reported with bounds of
# about +/-3.4e38, which cannot be split into bins.  Velocities are therefore
# clipped to [-MAX_MIN_VELOCITY, +MAX_MIN_VELOCITY] before discretisation.
MAX_MIN_VELOCITY = 1.9

In [9]:
# Number of bins per observation component; together with the number of
# actions this fixes the shape of the Q-table.
DISCRETE_OS_SIZE = [10] * 4

In [10]:
# Width of one bin for each observation component.
# Components 0 and 2 use the bounds declared by the environment;
# components 1 and 3 (the velocities) use the clipped range
# [-MAX_MIN_VELOCITY, +MAX_MIN_VELOCITY] instead.
obs_high = env.observation_space.high
obs_low = env.observation_space.low
velocity_span = 2*MAX_MIN_VELOCITY
DISCRETE_OS_WIN_SIZE = [
    (obs_high[0] - obs_low[0])/DISCRETE_OS_SIZE[0],
    velocity_span/DISCRETE_OS_SIZE[1],
    (obs_high[2] - obs_low[2])/DISCRETE_OS_SIZE[2],
    velocity_span/DISCRETE_OS_SIZE[3],
]

In [11]:
# Q-table initialised with uniform random values in [0, 1).  It has one axis
# per discretised observation component plus a trailing axis with one entry
# per action.
q_table_shape = tuple(DISCRETE_OS_SIZE) + (env.action_space.n,)
q_table = np.random.uniform(low=0, high=1, size=q_table_shape)

In [12]:
 # Convert the state.
def get_discrete_state(state):
    """Map a continuous observation to a tuple of Q-table bin indices.

    Components 1 and 3 (the velocities) are clipped to
    ``[-MAX_MIN_VELOCITY, MAX_MIN_VELOCITY]`` and binned relative to
    ``-MAX_MIN_VELOCITY``; components 0 and 2 are binned relative to the lower
    bound in ``env.observation_space.low``.  Bin widths come from
    ``DISCRETE_OS_WIN_SIZE`` and every index is limited to
    ``0 .. DISCRETE_OS_SIZE[i] - 1`` so the result is always a valid index
    into the Q-table.

    Neither ``state`` nor ``env.observation_space.low`` is modified.

    Parameters
    ----------
    state : array-like of float
        Observation with one value per entry of ``DISCRETE_OS_SIZE``.

    Returns
    -------
    tuple of int
        One bin index per observation component.
    """
    # Work on private float copies: the caller's observation and the
    # environment's declared bounds must not be changed by a lookup.
    values = np.array(state, dtype=np.float64)
    lower = np.array(env.observation_space.low, dtype=np.float64)

    for axis in (1, 3):
        values[axis] = np.clip(values[axis], -MAX_MIN_VELOCITY, MAX_MIN_VELOCITY)
        lower[axis] = -MAX_MIN_VELOCITY

    bin_width = np.asarray(DISCRETE_OS_WIN_SIZE, dtype=np.float64)
    # Built-in int replaces the np.int alias, which newer NumPy no longer has.
    bins = ((values - lower) / bin_width).astype(int)

    # A value sitting exactly on an upper bound would land one past the last
    # bin; clamp every axis (not only the velocities) into range.
    last_bin = np.asarray(DISCRETE_OS_SIZE) - 1
    return tuple(int(b) for b in np.clip(bins, 0, last_bin))

In [13]:
# Q-learning update parameters.
LEARNING_RATE = 0.1  # "a" in the update equation
DISCOUNT = 0.99      # "gamma" in the update equation

# Training schedule: total number of episodes, and how often (in episodes)
# an episode is rendered.
EPISODES = 15_000
VIEW_EVERY = 1_000

 Q(state, action) = (1 - a) * Q(state, action) + a * ( r(new_state) + gamma * Max{Q(new_state,..)} )

In [15]:
# Tabular Q-learning over EPISODES episodes.
# The policy is purely greedy with respect to the current Q-table (argmax,
# no random exploration).  max_score records the longest episode, in steps.
max_score = 0

try:
    for episode in range(EPISODES):
        # Render only every VIEW_EVERY-th episode.
        view = episode % VIEW_EVERY == 0

        new_score = 0
        done = False
        # reset() is called here for every episode, so no separate reset is
        # needed before the loop.
        discrete_state = get_discrete_state(env.reset())

        while not done:
            action = np.argmax(q_table[discrete_state])
            new_state, reward, done, info = env.step(action)
            new_discrete_state = get_discrete_state(new_state)
            if view:
                env.render()

            # Gym's TimeLimit wrapper flags episodes that ended only because
            # the step limit was hit; those are not real failures, so the
            # future value is still bootstrapped for them.
            truncated = info.get("TimeLimit.truncated", False)
            if done and not truncated:
                # True terminal transition: there is no future return.
                # Learning from it (instead of skipping the update) lets the
                # Q-value of the action that ended the episode move away from
                # its random initial value.
                target = reward
            else:
                target = reward + DISCOUNT * np.max(q_table[new_discrete_state])

            q_index = discrete_state + (action, )
            q_table[q_index] = (1 - LEARNING_RATE)*q_table[q_index] + LEARNING_RATE*target

            new_score += 1
            discrete_state = new_discrete_state

        # Episode finished: keep the best episode length seen so far.
        max_score = max(max_score, new_score)
finally:
    # Release the render window even if training is interrupted.
    env.close()

print("The max score was {} ticks".format(max_score))

The max score was 208 ticks
