In [None]:
import gym
import numpy as np

In [None]:
# Create the FrozenLake environment used for training.
env = gym.make('FrozenLake-v1')

# Q-learning hyperparameters
alpha = 0.85  # learning rate
gamma = 0.99  # discount factor

# Q-table with one row per state and one column per action, initialised to zero.
n_actions = env.action_space.n
q_table = np.zeros([env.observation_space.n, n_actions])

# Training parameters
num_episodes = 10000
max_steps_per_episode = 200

for episode in range(num_episodes):
    state, init_info = env.reset()
    total_reward = 0
    for step in range(max_steps_per_episode):
        # Pick the greedy action over noise-perturbed Q-values. The Gaussian
        # noise is scaled by 1 / (episode + 1), so exploration shrinks as
        # training progresses. The noise vector is sized from the action space
        # rather than a hard-coded 4 so it always matches the Q-table row.
        noise = np.random.randn(n_actions) * (1 / (episode + 1))
        action = np.argmax(q_table[state, :] + noise)

        # Apply the action to the environment.
        new_state, reward, done, truncated, info = env.step(action)

        # Q-learning update: move Q(s, a) towards the bootstrapped target
        # r + gamma * max_a' Q(s', a'). This is equivalent to
        # (1 - alpha) * Q(s, a) + alpha * target.
        target = reward + gamma * np.max(q_table[new_state, :])
        update = alpha * (target - q_table[state, action])
        q_table[state, action] += update

        state = new_state
        total_reward += reward

        # Stop on either end-of-episode signal. The environment may report
        # truncation (e.g. a time limit) before max_steps_per_episode is
        # reached, and it must not be stepped again until it has been reset.
        if done or truncated:
            break

    print(f"Episode {episode + 1}, Total Reward: {total_reward}")

In [None]:
# Play the game greedily with the learned Q-values.
# Close the previous environment before rebinding `env`; otherwise the
# training environment is never closed once the name points to the new one.
env.close()
env = gym.make('FrozenLake-v1', render_mode="human")
num_episodes_play = 10

for _ in range(num_episodes_play):
    state, init_info = env.reset()
    total_reward = 0

    for step in range(max_steps_per_episode):
        # Always take the action with the highest learned Q-value.
        action = np.argmax(q_table[state, :])

        # Apply the action to the environment.
        new_state, reward, done, truncated, info = env.step(action)
        state = new_state
        total_reward += reward

        # End the rollout on termination or truncation; a finished episode
        # must be reset before the environment is stepped again.
        if done or truncated:
            break

    print(f"Playing Episode, Total Reward: {total_reward}")

In [None]:
env.close()  # Close the environment