In [1]:
import gym

In [2]:
env = gym.make('LunarLander-v2')

In [3]:
ACTIONS = env.action_space.n

In [4]:
ACTIONS

4

In [5]:
env.reset()

array([ 0.00631599,  1.4053323 ,  0.63972884, -0.24836452, -0.00731189,
       -0.14490816,  0.        ,  0.        ], dtype=float32)

In [6]:
import random
for rep in range(10):
    i=0
    while i < 100:
        env.step(random.choice([0,1,2,3]))
        env.render()
        i+=1
    env.reset()

In [7]:
import numpy as np
import random

In [8]:
def discretize_state(state):
    dstate = list(state[:5])
    dstate[0] = int(0.5*(state[0]+0.7)*10/2.0) # pos x
    dstate[1] = int(0.5*(state[1]+0.5)*10/2.0) # pos y
    dstate[2] = int(0.5*(state[2]+1.5)*10/3.0) # vel x
    dstate[3] = int(0.5*(state[3]+2)*10/3.0) # vel y
    dstate[4] = int(0.5*(state[4]+3.14159)*10/(2*3.14159)) # angle
    if dstate[0] >= 5: dstate[0] = 4
    if dstate[1] >= 5: dstate[1] = 4
    if dstate[2] >= 5: dstate[2] = 4
    if dstate[3] >= 5: dstate[3] = 4
    if dstate[4] >= 5: dstate[4] = 4
    if dstate[0] < 0: dstate[0] = 0
    if dstate[1] < 0: dstate[1] = 0
    if dstate[2] < 0: dstate[2] = 0
    if dstate[3] < 0: dstate[3] = 0
    if dstate[4] < 0: dstate[4] = 0
    return tuple(dstate)

In [18]:
def run(num_episodes, alpha, gamma, explore_mult):
    max_rewards = []
    last_reward = []
    qtable = np.subtract(np.zeros((5, 5, 5, 5, 5, ACTIONS)), 100) # start all rewards at -100
    explore_rate = 1.0
    for episode in range(num_episodes):
        s = env.reset()
        state = discretize_state(s)
        
        for step in range(10000):

            # select action
            if random.random() < explore_rate:
                action = random.choice(range(ACTIONS))
            else:
                action = np.argmax(qtable[state])

            (new_s, reward, done, _) = env.step(action)
            new_state = discretize_state(new_s)

            # update Q
            best_future_q = np.amax(qtable[new_state]) # returns best possible reward from next state
            prior_val = qtable[state + (action,)]
            qtable[state + (action,)] = (1.0-alpha)*prior_val + alpha*(reward + gamma * best_future_q)
            state = new_state
            
            if done or step == 9999:
                last_reward.append(reward)
                break
        
        if explore_rate > 0.01:
            explore_rate *= explore_mult    
        max_rewards.append(np.amax(qtable))
        
    return (max_rewards, last_reward[-50:], qtable) # return rewards from last 50 episodes

In [19]:
num_episodes = 100
for alpha in [0.05, 0.10, 0.15]:
    for gamma in [0.85, 0.90, 0.95]:
        (max_rewards, last_reward, _) = run(num_episodes=num_episodes, alpha=alpha, gamma=gamma, explore_mult=0.995)
        print("alpha = %.2f, gamma = %.2f, mean last 50 outcomes = %.2f, q max: %.2f, q mean: %.2f" % (alpha, gamma, np.mean(last_reward), np.max(max_rewards), np.mean(max_rewards)))

alpha = 0.05, gamma = 0.85, mean last 50 outcomes = -100.00, q max: -1.85, q mean: -32.01
alpha = 0.05, gamma = 0.90, mean last 50 outcomes = -100.00, q max: -23.57, q mean: -51.64
alpha = 0.05, gamma = 0.95, mean last 50 outcomes = -100.00, q max: -37.14, q mean: -63.29
alpha = 0.10, gamma = 0.85, mean last 50 outcomes = -100.00, q max: 1.02, q mean: -19.94
alpha = 0.10, gamma = 0.90, mean last 50 outcomes = -100.00, q max: -5.10, q mean: -28.52
alpha = 0.10, gamma = 0.95, mean last 50 outcomes = -100.00, q max: -16.78, q mean: -41.41
alpha = 0.15, gamma = 0.85, mean last 50 outcomes = -100.00, q max: 7.87, q mean: -10.83
alpha = 0.15, gamma = 0.90, mean last 50 outcomes = -100.00, q max: 0.63, q mean: -18.82
alpha = 0.15, gamma = 0.95, mean last 50 outcomes = -100.00, q max: -9.07, q mean: -37.61


In [20]:
(max_rewards, last_reward, qtable) = run(num_episodes=200, alpha=0.1, gamma=0.95, explore_mult=0.995)
print("mean last 50 outcomes = %.2f, q max: %.2f, q mean: %.2f" % (np.mean(last_reward), np.max(max_rewards), np.mean(max_rewards)))
np.save('qtable.npy', qtable)

mean last 50 outcomes = -100.00, q max: -6.85, q mean: -33.35


In [22]:
import gym
import numpy as np
env = gym.make('LunarLander-v2')
qtable =np.load('qtable.npy')
for i in range(100):
    s=env.reset()
    state=discretize_state(s)
    for step in range(10000):
        env.render()
        action = np.argmax(qtable[state])
        (new_s,reward,done,_)=env.step(action)
        new_state = discretize_state(new_s)
        if done or step == 9999:
            break
        state = new_state