In [3]:
import time
import pandas as pd
import numpy as np

# 设定随机数种子
np.random.seed(2)
# 刷新地图的间隔
INTERVAL = 0.2
# 最大训练次数
EPISODE = 10

# 地图长度: 默认最右为宝藏
MAP_SIZE = 6
# 动作: 只有 l-左 或 r-右
ACTIONS = ['l', 'r']
# 探索率
EXPLORE_RATE = 0.8
# 学习率
LEARN_RATE = 0.2
# 折扣因子
DISCOUNT_FAC = 0.9

# 初始化 R 表
def create_R():
    r_table = pd.DataFrame(
        np.zeros((MAP_SIZE, len(ACTIONS))),
        columns = ACTIONS
    )
    # 在宝藏左侧一格位置执行 'r' 的奖励设为 1
    r_table.loc[MAP_SIZE-2, 'r'] = 1
    return r_table

# 初始化 Q 表
def create_Q():
    return pd.DataFrame(
        np.zeros((MAP_SIZE, len(ACTIONS))),
        columns = ACTIONS
    )

# 选择下一步动作
def choose_action(state, q_table):
    rewards = q_table.iloc[state, :]
    if (np.random.uniform() > EXPLORE_RATE) or (rewards.all() == 0):
        next_action = np.random.choice(ACTIONS)
    else:
        next_action = rewards.idxmax()
    return next_action

# 执行动作
def move(state, action):
    if state == 0:
        if action == 'l':
            return 0
        else:
            return 1
    else:
        if action == 'l':
            return state - 1
        else:
            return state + 1

# 渲染当前地图/本次训练结果
def render(state, episode, counter):
    map = ['-'] * (MAP_SIZE - 1) + ['T']
    if state == MAP_SIZE - 1:
        print('\rEpisode %d: use %d steps' % (episode, counter))
        time.sleep(INTERVAL * 5)
    else:
        map[state] = 'o'
        print('\r%s' % (''.join(map)), end='')
        time.sleep(INTERVAL)

# 主体代码
def rl(r_table, q_table):
    for episode in range(EPISODE):
        counter = 0
        state = 0
        finish = False
        render(state, episode+1, counter)
        # 没有寻得宝藏就一直进行训练
        while not finish:
            action = choose_action(state, q_table)
            next_state = move(state, action)
            reward = r_table.loc[state, action]
            exp = 0
            # 宝藏位的 Q 表不会被更新, 因此 MAX(Q[S']) 始终为 0
            if next_state == MAP_SIZE - 1:
                exp = reward
                finish = True
            else:
                exp = reward + DISCOUNT_FAC * q_table.loc[next_state, :].max()
            # 通过 Bellman 公式更新 Q 表
            q_table.loc[state, action] += LEARN_RATE * exp - LEARN_RATE * q_table.loc[state, action]
            # 开始下一次移动
            state = next_state
            counter += 1
            render(state, episode+1, counter)
    return q_table

R = create_R()
Q = create_Q()
rl(R, Q)


Episode 1: use 38 steps
Episode 2: use 28 steps
Episode 3: use 6 steps
Episode 4: use 7 steps
Episode 5: use 6 steps
Episode 6: use 7 steps
Episode 7: use 5 steps
Episode 8: use 6 steps
Episode 9: use 5 steps
Episode 10: use 5 steps


Unnamed: 0,l,r
0,0.001551,0.028993
1,0.000686,0.099193
2,0.000983,0.271637
3,0.001166,0.566603
4,0.00648,0.892626
5,0.0,0.0
