In [None]:
import torch
import gym
import numpy as np
from torch import optim

In [None]:
# --- Run-mode switches -------------------------------------------------------
# Neither flag is referenced by the cells visible in this notebook; they are
# kept so that any code relying on them continues to work.
train, test = True, True

# --- Q-learning hyperparameters ---------------------------------------------
gamma = 0.9      # discount factor applied to the bootstrapped next-state value
epsilon = 0.1    # probability of perturbing the Q-values with exploration noise

# --- Loop bounds -------------------------------------------------------------
num_episodes = 10_000          # number of training episodes
max_steps_per_episode = 200    # hard cap on environment steps within one episode

In [None]:
class QNet(torch.nn.Module):
    """Single-layer, bias-free linear Q-function approximator.

    The network maps a state encoding of width ``n_states`` to one value per
    action. With a one-hot input, the output is simply the weight column of the
    active state, so the weight matrix plays the role of a Q-table.

    Args:
        n_states: Width of the input encoding (defaults to 16, the value the
            notebook originally hard-coded).
        n_actions: Number of outputs / actions (defaults to 4, the value the
            notebook originally hard-coded).
    """

    def __init__(self, n_states=16, n_actions=4):
        super().__init__()
        self.fc1 = torch.nn.Linear(in_features=n_states, out_features=n_actions, bias=False)
        self.initialize_weights()

    def initialize_weights(self):
        """Fill the layer weights in place with samples from U(0, 0.001)."""
        torch.nn.init.uniform_(self.fc1.weight, a=0, b=0.001)

    def forward(self, x):
        """Return the per-action values for a batch of encoded states ``x``."""
        return self.fc1(x)


def to_onehot(i, state_n):
    """Build a one-hot ``uint8`` vector.

    Args:
        i: Index of the entry to set to 1.
        state_n: Length of the returned vector.

    Returns:
        A NumPy array of ``state_n`` zeros (dtype ``uint8``) with position
        ``i`` set to 1.
    """
    encoded = np.zeros(state_n, dtype='uint8')
    encoded[i] = 1
    return encoded


env = gym.make('FrozenLake-v1')
q_net = QNet()
optimizer = optim.Adam(
    q_net.parameters(),
    lr=0.01,
    weight_decay=0.0005
)
MSE = torch.nn.MSELoss()


def _state_to_tensor(state):
    """Encode a discrete state as a float32 one-hot tensor of shape (1, 16).

    Goes through ``torch.from_numpy`` on a single ndarray rather than
    ``torch.tensor([ndarray])``: building a tensor from a Python list of
    ndarrays is slow and makes PyTorch emit a warning on every call.
    """
    return torch.from_numpy(to_onehot(state, 16)).float().unsqueeze(0)


# The network stays in training mode for the whole loop, so set it once.
q_net.train()

for episode in range(num_episodes):
    # Reset the environment at the start of each episode.
    state, init_info = env.reset()
    total_reward = 0

    for step in range(max_steps_per_episode):
        state_tensor = _state_to_tensor(state)
        with torch.no_grad():
            q_values = q_net(state_tensor).numpy()

        # Exploration: with probability epsilon, perturb the Q-values with
        # Gaussian noise whose scale decays as 1 / (episode + 1). The noise is
        # applied to a separate array so it only influences action selection
        # and never leaks into the regression target below.
        action_scores = q_values
        if np.random.rand() < epsilon:
            action_scores = q_values + np.random.randn(1, env.action_space.n) * (1 / (episode + 1))
        action = int(np.argmax(action_scores))

        # Take the action in the environment.
        new_state, reward, done, truncated, info = env.step(action)

        # Q-learning is greedy with respect to the next state: bootstrap from
        # the maximum predicted value of the successor state.
        with torch.no_grad():
            max_q = q_net(_state_to_tensor(new_state)).max().item()

        # Regression target: identical to the current prediction everywhere
        # except for the action taken. A terminal transition (done) has no
        # future return, so the target is the reward alone; a time-limit
        # truncation is not terminal and still bootstraps.
        target_q = q_values.copy()
        target_q[0, action] = reward if done else reward + gamma * max_q

        # Update the q_net parameters with one gradient step on the MSE loss.
        optimizer.zero_grad()
        action_predict = q_net(state_tensor)
        loss = MSE(action_predict, torch.tensor(target_q, dtype=torch.float32))
        loss.backward()
        optimizer.step()

        state = new_state
        total_reward += reward

        # Stop on a terminal state or when the environment's own time limit
        # fires; stepping an environment after either signal is invalid.
        if done or truncated:
            break
    print(f"Episode {episode + 1}, Total Reward: {total_reward}")

In [None]:
env = gym.make('FrozenLake-v1', render_mode="human")
num_episodes_play = 10

for _ in range(num_episodes_play):
    state, init_info = env.reset()
    total_reward = 0

    for step in range(max_steps_per_episode):
        # Pick the greedy action under the trained network. The one-hot state
        # is converted via torch.from_numpy on a single ndarray, because
        # torch.tensor([ndarray]) is slow and warns on every call.
        with torch.no_grad():
            state_tensor = torch.from_numpy(to_onehot(state, 16)).float().unsqueeze(0)
            action_predict = q_net(state_tensor).numpy()
        action = int(np.argmax(action_predict))

        # Take the action in the environment.
        new_state, reward, done, truncated, info = env.step(action)

        state = new_state
        total_reward += reward

        # End the episode on a terminal state or when the environment's time
        # limit truncates it; the environment must be reset after either.
        if done or truncated:
            break
    print(f"Playing Episode, Total Reward: {total_reward}")

In [None]:
env.close()  # Close the environment