In [None]:
import gym
import numpy as np
import torch
import torch.optim as optim
from collections import deque
import random

In [None]:
# Hyperparameters
gamma = 0.98  # discount factor applied to the bootstrapped next-state value in update_net
epsilon = 0.2  # initial exploration rate read by get_action (threshold and noise scale)
epsilon_decay = 0.98  # epsilon is multiplied by this factor at the start of every episode
num_episodes = 1000  # upper bound on the number of training episodes
batch_size = 128  # number of transitions sampled from the replay memory per update

In [None]:
class QNet(torch.nn.Module):
    """Small fully connected Q-network with a dueling-style output.

    Two hidden layers of 8 ReLU units feed two linear heads of equal width:
    ``fc3_1`` (advantage-like term) and ``fc3_2`` (value-like term). The
    network returns ``value + (advantage - mean(advantage))`` where the mean
    is taken over the last (action) dimension of each sample.

    Note: unlike the textbook dueling architecture, the value head here emits
    one number per action rather than a single scalar; that layout is kept
    so existing parameter shapes do not change.

    Args:
        input_dims: Size of the input feature vector.
        output_dims: Number of outputs of each head (one per action).
    """

    def __init__(self, input_dims, output_dims):
        super().__init__()
        self.fc1 = torch.nn.Linear(input_dims, 8)
        self.fc2 = torch.nn.Linear(8, 8)
        self.fc3_1 = torch.nn.Linear(8, output_dims)  # advantage head
        self.fc3_2 = torch.nn.Linear(8, output_dims)  # value head
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        """Compute Q-values for a single state or a batch of states.

        Args:
            x: Float tensor whose last dimension has size ``input_dims``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size ``output_dims``.
        """
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        avalue = self.fc3_1(x)
        svalue = self.fc3_2(x)
        # Centre the advantages per sample. A mean over *all* elements would
        # make every row of a batch depend on the other rows in that batch.
        sub = avalue - avalue.mean(dim=-1, keepdim=True)
        return svalue + sub


In [None]:
# Action-selection policy
def get_action(q_net, state, eps=None):
    """Choose an action for ``state`` with noisy-greedy exploration.

    With probability ``eps`` Gaussian noise scaled by ``eps`` is added to the
    Q-values before taking the argmax; otherwise the plain argmax is taken.

    Args:
        q_net: Callable mapping a state tensor to a tensor of Q-values.
        state: State tensor forwarded to ``q_net``.
        eps: Exploration rate. When ``None`` (the default) the module-level
            ``epsilon`` is used, which matches the previous behaviour.

    Returns:
        A 0-dim integer tensor with the (flattened) index of the chosen
        action, so callers can keep using ``.detach().numpy()`` on it.
    """
    if eps is None:
        eps = epsilon
    with torch.no_grad():
        action_q_value = q_net(state)
    if np.random.rand() <= eps:
        # Noise takes its shape from the Q-values instead of a hard-coded
        # (1, 2), so any number of actions works, and everything stays in torch.
        action_q_value = action_q_value + torch.randn_like(action_q_value) * eps
    return torch.argmax(action_q_value)

In [None]:
# Network update step
def update_net(memory, q_net, target_net):
    """Run one Double-DQN gradient step on a random minibatch.

    A minibatch of ``batch_size`` transitions ``[state, action, next_state,
    reward, done]`` is sampled from ``memory``. The regression target equals
    the current prediction for every action except the one actually taken,
    which is replaced by ``reward`` (terminal transition) or
    ``reward + gamma * Q_target(next_state, argmax_a Q_online(next_state, a))``.

    Relies on the module-level ``batch_size``, ``gamma``, ``optimizer`` and
    ``mse`` objects.

    Args:
        memory: Sequence of stored transitions; must hold at least
            ``batch_size`` items (``random.sample`` raises otherwise).
        q_net: Online network being trained.
        target_net: Target network used to evaluate the next-state action.

    Returns:
        Tuple ``(q_net, loss)`` where ``loss`` is the scalar loss tensor of
        this step.
    """
    batch = random.sample(memory, batch_size)

    # Build tensors once; as_tensor accepts both ndarrays and tensors, and the
    # explicit float32 cast matches the network parameters.
    states = torch.stack([torch.as_tensor(t[0], dtype=torch.float32) for t in batch])
    next_states = torch.stack([torch.as_tensor(t[2], dtype=torch.float32) for t in batch])
    actions = torch.tensor([int(t[1]) for t in batch], dtype=torch.long)
    rewards = torch.tensor([float(t[3]) for t in batch], dtype=torch.float32)
    dones = torch.tensor([bool(t[4]) for t in batch], dtype=torch.bool)

    with torch.no_grad():
        # Start from the current predictions so untouched actions contribute
        # zero error to the loss.
        target_q = q_net(states)
        # Double DQN: the online net picks the action, the target net scores it.
        best_next = q_net(next_states).argmax(dim=1)
        next_value = target_net(next_states).gather(1, best_next.unsqueeze(1)).squeeze(1)
        td_target = torch.where(dones, rewards, rewards + gamma * next_value)
        target_q[torch.arange(len(batch)), actions] = td_target

    optimizer.zero_grad()
    predicted_q = q_net(states)
    loss = mse(predicted_q, target_q)
    loss.backward()
    optimizer.step()
    return q_net, loss

In [None]:
def update_target_net(q_net, target_net):
    """Hard-copy every parameter of ``q_net`` into ``target_net``.

    Parameters are paired in iteration order, so both networks are expected
    to share the same architecture. ``q_net`` is left untouched.

    Args:
        q_net: Network whose parameter values are read.
        target_net: Network whose parameter values are overwritten in place.
    """
    paired = zip(target_net.parameters(), q_net.parameters())
    with torch.no_grad():
        for destination, source in paired:
            destination.data.copy_(source.data)

In [None]:
# Program setup: online/target networks, loss, replay memory and optimiser.
q_net = QNet(4, 2)
target_net = QNet(4, 2)
# Start the target network as an exact copy of the online network. Otherwise
# the first bootstrap targets come from an unrelated random initialisation
# until the first periodic sync happens.
target_net.load_state_dict(q_net.state_dict())
mse = torch.nn.MSELoss()
memory = deque(maxlen=5000)  # replay memory; oldest transitions are dropped first
optimizer = optim.Adam(q_net.parameters(), lr=0.001)

In [None]:
# Main training loop
REWARD_TARGET = 3000  # shaped return at which an episode is cut off and training stops

env = gym.make("CartPole-v1")

for i in range(num_episodes):
    s, _ = env.reset()
    total_reward = 0
    total_loss = []
    loss = 0
    step = 0
    epsilon = epsilon * epsilon_decay  # decay exploration once per episode
    while total_reward < REWARD_TARGET:
        s = torch.tensor(s)
        action = get_action(q_net, s).detach().numpy()
        # Only the termination flag ends an episode here; the truncation flag
        # (4th return value) is deliberately ignored so that an episode can
        # run until REWARD_TARGET is reached.
        ns, r, done, _, _ = env.step(action)
        step += 1
        # Reward shaping: penalise failure, scale down the per-step reward.
        if done:
            r = -2
        else:
            r = r / 10
        total_reward += r
        memory.append([s, action, ns, r, done])
        s = ns
        if len(memory) > batch_size:
            q_net, loss = update_net(memory, q_net, target_net)
            if (step+1)%5 ==0:
                update_target_net(q_net, target_net)
            total_loss.append(loss.detach().numpy())
        if done:
            # No update has run yet on very early episodes; avoid the
            # "mean of empty slice" warning while still reporting nan.
            ave_loss = np.mean(total_loss) if total_loss else float('nan')
            print(
                'epoch{},step:{},total_rewards:{},ave_loss:{}'.format(i, step, total_reward, ave_loss))
            break
    # total_reward is a float sum of 0.1 increments, so an exact equality
    # check against the target practically never succeeds; use >= instead.
    if total_reward >= REWARD_TARGET:
        print("训练完毕")
        break

# Release the training environment.
env.close()

In [None]:
# Roll out one rendered episode with the trained network and report its length.
env = gym.make("CartPole-v1", render_mode='human')
try:
    s, _ = env.reset()
    step = 0
    while True:
        s = torch.tensor(s)
        action = get_action(q_net, s).detach().numpy()
        s, r, terminated, truncated, _ = env.step(action)
        step += 1
        # Stop on failure *or* on truncation (the 4th value returned by
        # env.step); without the latter a policy that never fails would
        # keep this loop running forever.
        done = terminated or truncated
        if done:
            break
    print('step:{}'.format(step))
finally:
    # Always close the render window, even if the rollout raises.
    env.close()


In [None]:
# Closes the environment once more. The evaluation cell already calls
# env.close(), so this looks like a manual cleanup cell — presumably for the
# case where that cell was interrupted before reaching its own close.
env.close()