In [1]:
# Gym environment setup
import gym
import numpy as np
import matplotlib.pyplot as plt

# Create the MountainCar-v0 task; the later cells all use this single `env` instance.
env = gym.make("MountainCar-v0")
# Reset to obtain an initial observation (displayed as this cell's output).
env.reset()

array([-0.56192849,  0.        ])

In [2]:
# Upper bound of each observation dimension; used below to size the discretisation bins.
env.observation_space.high

array([0.6 , 0.07], dtype=float32)

In [3]:
# Lower bound of each observation dimension; used below as the origin of the discretisation grid.
env.observation_space.low

array([-1.2 , -0.07], dtype=float32)

In [4]:
# Q Table setup

LEARNING_RATE = 0.5  # step size (alpha) handed to the learners in the run cells
DISCOUNT = 0.95  # discount factor applied to the bootstrapped next-state value
EPISODES = 1000  # number of training episodes
SHOW_EVERY = 200  # NOTE(review): not referenced by any cell visible here
Q_TABLE_LEN = 20  # number of bins per observation dimension

# One bin count per observation dimension.
DISCRETE_OS_SIZE = [Q_TABLE_LEN] * len(env.observation_space.high) 
# Original author's note: self.high = np.array([self.max_position, self.max_speed])
# [20] * 2 = [20, 20]
# Width of a single bin along each dimension: (high - low) / number of bins.
discrete_os_win_size = (env.observation_space.high - env.observation_space.low) / DISCRETE_OS_SIZE
# discrete_os_win_size = array([0.09 , 0.007])
print(DISCRETE_OS_SIZE)

[20, 20]


In [5]:
# Epsilon decay schedule

epsilon = 1  # initial exploration probability
START_EPSILON_DECAYING = 1  # first episode index at which decay is applied
END_EPSILON_DECAYING = EPISODES // 2  # last such episode index; // is integer (floor) division
# Amount subtracted from epsilon each time the decay is applied.
epsilon_decay_value = epsilon / (END_EPSILON_DECAYING - START_EPSILON_DECAYING)

In [6]:
def get_discrete_state(state, low=None, win_size=None, bins=None):
    """Map a continuous observation to a tuple of Q-table bin indices.

    Args:
        state: Array-like continuous observation, one value per dimension.
        low: Lower bound per dimension. Defaults to ``env.observation_space.low``.
        win_size: Bin width per dimension. Defaults to ``discrete_os_win_size``.
        bins: Number of bins per dimension. Defaults to ``DISCRETE_OS_SIZE``.

    Returns:
        Tuple of integer indices, one per dimension, each within
        ``[0, bins - 1]`` so it can always index the Q-table.
    """
    if low is None:
        low = env.observation_space.low
    if win_size is None:
        win_size = discrete_os_win_size
    if bins is None:
        bins = DISCRETE_OS_SIZE

    raw_index = (np.asarray(state) - low) // win_size
    # A value exactly on the upper bound floor-divides to `bins`, one past the
    # last valid index; values outside the bounds go negative or even larger.
    # Clamp so every observation lands in an edge bin instead of raising
    # IndexError (or silently wrapping around via a negative index).
    last_index = np.asarray(bins) - 1
    return tuple(np.clip(raw_index, 0, last_index).astype(int))  # e.g. (18, 12)

In [7]:
def take_epsilon_greedy_action(q_table, state, epsilon):
    """Choose an action with an epsilon-greedy policy.

    Args:
        q_table: Q-value array indexed by discretised state, then by action.
        state: Continuous observation; discretised via ``get_discrete_state``.
        epsilon: Probability of picking a uniformly random action.

    Returns:
        Index of the chosen action.
    """
    explore = np.random.random() < epsilon
    if explore:
        # Exploration: sample uniformly over the environment's actions.
        return np.random.randint(0, env.action_space.n)
    # Exploitation: take the action with the highest estimated value.
    action_values = q_table[get_discrete_state(state)]
    return np.argmax(action_values)

In [8]:
def q_learning(env, episodes, discount, epsilon, alpha):
    """Train a tabular Q-learning agent over a discretised observation space.

    Args:
        env: Gym environment exposing the 4-tuple ``step`` API
            ``(observation, reward, done, info)``.
        episodes: Number of training episodes to run.
        discount: Discount factor applied to the bootstrapped next-state value.
        epsilon: Initial exploration probability of the epsilon-greedy policy.
        alpha: Learning rate (step size) of the TD update.

    Returns:
        The learned Q-table, shaped ``DISCRETE_OS_SIZE + [env.action_space.n]``.
    """
    q_table = np.zeros((DISCRETE_OS_SIZE + [env.action_space.n]))
    # q_table.shape = (20, 20, 3)
    for episode in range(episodes):
        state = env.reset()
        done = False
        # Stop as soon as the environment reports the episode is over, rather
        # than continuing to step a finished environment until the goal is hit.
        while not done:
            action = take_epsilon_greedy_action(q_table, state, epsilon)
            next_state, reward, done, _ = env.step(action)
            state_action = get_discrete_state(state) + (action,)

            if next_state[0] >= 0.5:
                # Position >= 0.5 is treated as reaching the goal: the value of
                # the final state-action pair is pinned to 0 and the episode
                # ends. No TD update is done first; it would be overwritten.
                q_table[state_action] = 0
                break

            # Off-policy target: bootstrap from the best action in next_state.
            best_next_value = np.max(q_table[get_discrete_state(next_state)])
            td_target = reward + discount * best_next_value
            q_table[state_action] += alpha * (td_target - q_table[state_action])
            state = next_state

        # The schedule constants are expressed in episodes, so decay once per
        # episode (not once per step) and never let epsilon drop below zero.
        if START_EPSILON_DECAYING <= episode <= END_EPSILON_DECAYING:
            epsilon = max(0.0, epsilon - epsilon_decay_value)

    return q_table

In [9]:
def sarsa(env, episodes, discount, epsilon, alpha):
    """Train a tabular SARSA agent over a discretised observation space.

    Args:
        env: Gym environment exposing the 4-tuple ``step`` API
            ``(observation, reward, done, info)``.
        episodes: Number of training episodes to run.
        discount: Discount factor applied to the bootstrapped next-state value.
        epsilon: Initial exploration probability of the epsilon-greedy policy.
        alpha: Learning rate (step size) of the TD update.

    Returns:
        The learned Q-table, shaped ``DISCRETE_OS_SIZE + [env.action_space.n]``.
    """
    q_table = np.zeros((DISCRETE_OS_SIZE + [env.action_space.n]))
    # q_table.shape = (20, 20, 3)
    for episode in range(episodes):
        state = env.reset()
        action = take_epsilon_greedy_action(q_table, state, epsilon)
        done = False
        # Stop as soon as the environment reports the episode is over, rather
        # than continuing to step a finished environment until the goal is hit.
        while not done:
            next_state, reward, done, _ = env.step(action)
            state_action = get_discrete_state(state) + (action,)

            if next_state[0] >= 0.5:
                # Position >= 0.5 is treated as reaching the goal: the value of
                # the final state-action pair is pinned to 0 and the episode
                # ends. No TD update is done first; it would be overwritten.
                q_table[state_action] = 0
                break

            # On-policy target: bootstrap from the action the policy will
            # actually take next, which is then carried into the next step.
            next_action = take_epsilon_greedy_action(q_table, next_state, epsilon)
            next_state_action = get_discrete_state(next_state) + (next_action,)
            td_target = reward + discount * q_table[next_state_action]
            q_table[state_action] += alpha * (td_target - q_table[state_action])
            state, action = next_state, next_action

        # The schedule constants are expressed in episodes, so decay once per
        # episode (not once per step) and never let epsilon drop below zero.
        if START_EPSILON_DECAYING <= episode <= END_EPSILON_DECAYING:
            epsilon = max(0.0, epsilon - epsilon_decay_value)

    return q_table

In [10]:
def montain_car_render(q_table):
    """Replay one episode with the greedy policy from ``q_table``, rendering each step.

    Args:
        q_table: Q-value array indexed by discretised state, then by action.

    The rollout uses the notebook-level ``env`` and stops when the environment
    reports ``done``.
    """
    observation = env.reset()
    finished = False
    while not finished:
        # Always exploit: pick the highest-valued action for the current state.
        greedy_action = np.argmax(q_table[get_discrete_state(observation)])
        observation, _, finished, _ = env.step(greedy_action)
        env.render()

### Run Q-Learning

In [11]:
# Train with Q-learning using the constants defined above, then replay the greedy policy.
q_table_QLearning = q_learning(env, EPISODES, DISCOUNT, epsilon, LEARNING_RATE)
montain_car_render(q_table_QLearning)
# NOTE(review): the SARSA cell below reuses this same `env` after close() — confirm the installed gym version permits that.
env.close()

### Run SARSA

In [11]:
# Train with SARSA using the same constants, then replay the greedy policy.
q_table_sarsa = sarsa(env, EPISODES, DISCOUNT, epsilon, LEARNING_RATE)
montain_car_render(q_table_sarsa)
# Release the environment's resources now that the replay is finished.
env.close()