### 采用Q-Learning算法求解该问题中的最优路径

In [1]:
# 导入相关包
import gym
import matplotlib
import numpy as np
from collections import defaultdict
from gym.envs.toy_text.cliffwalking import CliffWalkingEnv

In [2]:
# Instantiate the cliff-walking environment (CliffWalkingEnv).
# This object is passed to q_learning and read as a global by td_render.
env = CliffWalkingEnv()

In [3]:
def epsilon_greedy(Q, epsilon, nA):
    """Build an epsilon-greedy policy function on top of an action-value table.

    Args:
        Q: Mapping from state to a sequence of per-action values. It is
            indexed as ``Q[state]`` on every policy call, so updates made to
            Q after this function returns are seen by the policy.
        epsilon: Probability mass that is spread uniformly over all actions.
        nA: Number of actions.

    Returns:
        A function ``policy(state)`` that returns a length-``nA`` NumPy array
        of action probabilities: every action receives ``epsilon / nA`` and
        the greedy action (first argmax of ``Q[state]``) receives an
        additional ``1 - epsilon``.
    """
    def policy(state):
        # Uniform exploration share for every action.
        action_probs = np.ones(nA) * epsilon / nA
        # The remaining mass goes to the currently best-valued action.
        greedy_action = np.argmax(Q[state])
        action_probs[greedy_action] = action_probs[greedy_action] + (1 - epsilon)
        return action_probs

    return policy

In [4]:
def q_learning(env, num_epi, discount=1.0, epsilon=0.1, alpha=0.5):
    """Learn an action-value table with tabular Q-learning.

    Actions are chosen epsilon-greedily with respect to the table being
    learned, while each update bootstraps from the greedy (maximum) value of
    the next state.

    Both Gym calling conventions are accepted:
      * classic: ``reset() -> obs`` and
        ``step(a) -> (obs, reward, done, info)``
      * Gym >= 0.26 / Gymnasium: ``reset() -> (obs, info)`` and
        ``step(a) -> (obs, reward, terminated, truncated, info)``

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` and
            ``action_space.n``. States are used as dictionary keys, so they
            must be hashable.
        num_epi: Number of episodes to run.
        discount: Discount factor applied to the bootstrapped next-state value.
        epsilon: Exploration rate handed to ``epsilon_greedy``.
        alpha: Step size of the temporal-difference update.

    Returns:
        A ``defaultdict`` mapping each state to a NumPy array holding one
        value per action, i.e. ``{state: [action0_value, action1_value, ...]}``.
        States that were never seen default to an all-zero array.
    """
    nA = env.action_space.n
    Q = defaultdict(lambda: np.zeros(nA))
    # The policy closes over Q, so it always acts on the latest estimates.
    policy = epsilon_greedy(Q, epsilon, nA)

    for _ in range(num_epi):
        state = env.reset()
        # Gym >= 0.26 returns (observation, info) from reset(); older versions
        # return the bare observation. Unwrap so the state stays hashable.
        if isinstance(state, tuple) and len(state) == 2 and isinstance(state[1], dict):
            state = state[0]

        while True:
            # Action probabilities for the current state under the policy.
            prob = policy(state)
            # Sample the action to take.
            action = np.random.choice(np.arange(len(prob)), p=prob)

            # Apply the action and observe the transition.
            step_result = env.step(action)
            if len(step_result) == 5:
                # New API: the episode ends on termination OR truncation.
                next_state, reward, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                next_state, reward, done, _ = step_result

            # Off-policy target: bootstrap from the best action in next_state.
            td_target = reward + discount * np.max(Q[next_state])
            Q[state][action] += alpha * (td_target - Q[state][action])

            if done:
                break
            state = next_state
    return Q

In [5]:
def td_render(Q, max_steps=1000):
    """Follow the greedy policy implied by ``Q`` and render every step.

    The rollout uses the module-level ``env``: it is reset, then the action
    with the highest value in ``Q[state]`` is taken repeatedly until the
    environment reports ``done``. ``env.render()`` is called after each step,
    so the first frame shown is the state reached by the first action (the
    initial state itself is not rendered).

    Args:
        Q: Mapping from state to per-action values, e.g. the table returned
            by ``q_learning``.
        max_steps: Upper bound on the number of steps taken. A greedy policy
            derived from a table that has not converged can cycle without
            ever reaching a terminal state; the bound prevents the cell from
            hanging and flooding the output in that case. Pass ``None`` to
            run without a bound.
    """
    state = env.reset()
    steps_taken = 0
    while max_steps is None or steps_taken < max_steps:
        # Act greedily with respect to the learned action values.
        next_state, reward, done, _ = env.step(np.argmax(Q[state]))
        env.render()
        steps_taken += 1
        if done:
            return
        state = next_state
    # Only reached when the step budget ran out before the episode ended.
    print("td_render: stopped after {} steps without reaching a terminal state".format(max_steps))

In [6]:
Q = q_learning(env, 1000)
td_render(Q)

o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
x  o  o  o  o  o  o  o  o  o  o  o
o  C  C  C  C  C  C  C  C  C  C  T

o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  x  o  o  o  o  o  o  o  o  o  o
o  C  C  C  C  C  C  C  C  C  C  T

o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  x  o  o  o  o  o  o  o  o  o
o  C  C  C  C  C  C  C  C  C  C  T

o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  x  o  o  o  o  o  o  o  o
o  C  C  C  C  C  C  C  C  C  C  T

o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  x  o  o  o  o  o  o  o
o  C  C  C  C  C  C  C  C  C  C  T

o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  x  o  o  o  o  o  o
o  C  C  C  C  C  C  C  C  C  C  T

o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  x  o  o  o  o  o
o  C  C  C  C  C  C  C  C  C  C  T

o  o  o  o  o