In [None]:
import os
import shutil
import time

import gym
import torch
from torch.utils.tensorboard import SummaryWriter

In [None]:
class Net(torch.nn.Module):
    """Two-layer fully connected network: Linear -> ReLU -> Linear.

    The same architecture is used for both the actor (outputs one logit per
    action) and the critic (outputs a single state value).

    Args:
        input_dims: Size of the input feature vector.
        output_dims: Number of output units.
        hidden_dims: Width of the hidden layer. Defaults to 10, the value that
            was previously hard-coded.
    """

    def __init__(self, input_dims, output_dims, hidden_dims=10):
        super().__init__()
        # Attribute names are kept stable so existing state_dict keys still load.
        self.fc1 = torch.nn.Linear(input_dims, hidden_dims)
        self.relu1 = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(hidden_dims, output_dims)

    def forward(self, x):
        """Return the raw (un-normalised) outputs of the second linear layer."""
        hidden = self.relu1(self.fc1(x))
        return self.fc2(hidden)

In [None]:
def choose_action(actor_net, state):
    """Sample an action from the policy defined by ``actor_net``.

    Args:
        actor_net: Callable mapping a ``(1, n_features)`` float tensor to a
            ``(1, n_actions)`` tensor of action logits.
        state: A single 1-D observation (array-like or tensor).

    Returns:
        int: Index of the sampled action.
    """
    # as_tensor converts lists/arrays and passes tensors through without the
    # "copy construct" UserWarning that wrapping a tensor in torch.tensor() raises.
    state = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
    with torch.no_grad():  # action selection must not build an autograd graph
        logits = actor_net(state)
    actions_probability = torch.softmax(logits, dim=1)
    action = torch.distributions.Categorical(actions_probability).sample()
    return action.item()

In [None]:
def learn_critic(critic_net, critic_optimizer, state, reward, new_state, gamma, done=False):
    """Run one TD(0) update of the critic and return the TD error.

    The TD error is ``reward + gamma * V(new_state) - V(state)``; the critic is
    trained to minimise its square. ``V(new_state)`` is computed without
    gradients so only ``V(state)`` is pushed towards the target.

    Args:
        critic_net: Module mapping a ``(1, n_features)`` tensor to a ``(1, 1)`` value.
        critic_optimizer: Optimizer over ``critic_net``'s parameters.
        state: 1-D observation before the transition.
        reward: Scalar reward received for the transition.
        new_state: 1-D observation after the transition.
        gamma: Discount factor.
        done: Set to True when ``new_state`` is terminal. The bootstrap term is
            then dropped, because no future reward follows a terminal state.
            Defaults to False, which reproduces the previous behaviour of
            always bootstrapping.

    Returns:
        float: The TD error, detached from the graph.
    """
    state = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
    new_state = torch.as_tensor(new_state, dtype=torch.float32).unsqueeze(0)
    reward = torch.as_tensor(reward, dtype=torch.float32).unsqueeze(0)
    with torch.no_grad():
        next_value = critic_net(new_state)
        if done:
            next_value = torch.zeros_like(next_value)
    value = critic_net(state)
    td_error = reward + gamma * next_value - value
    loss = td_error.pow(2)
    critic_optimizer.zero_grad()
    loss.backward()
    critic_optimizer.step()
    return td_error.item()

In [None]:
def learn_actor(actor_net, actor_optimizer, state, action, td_error):
    """Run one policy-gradient update of the actor.

    Minimises ``-log pi(action | state) * td_error``, i.e. raises the
    log-probability of the taken action when the TD error is positive and
    lowers it when negative.

    Args:
        actor_net: Module mapping a 1-D state tensor to 1-D action logits.
        actor_optimizer: Optimizer over ``actor_net``'s parameters.
        state: 1-D observation in which ``action`` was taken.
        action: Index of the action that was taken.
        td_error: TD error used as the advantage estimate. It is treated as a
            constant weight, so no gradient reaches the critic from here.
    """
    logits = actor_net(torch.as_tensor(state, dtype=torch.float32))
    # log_softmax with an explicit dim is numerically stable (no log(0) when a
    # probability underflows) and avoids the implicit-dim deprecation warning
    # raised by torch.nn.Softmax() without a dim argument.
    log_prob = torch.log_softmax(logits, dim=-1)
    loss = - log_prob[action] * td_error
    actor_optimizer.zero_grad()
    loss.backward()
    actor_optimizer.step()

In [None]:
# Start every run from an empty TensorBoard log directory so curves from
# earlier runs are not mixed into the new one. Whichever branch is taken, the
# directory exists afterwards unless deletion itself failed.
log_dir = './runs'
if os.path.exists(log_dir):
    try:
        shutil.rmtree(log_dir)
    except OSError as error:
        # Best effort: report the failure and carry on with the existing directory.
        print(f'删除文件夹 {log_dir} 失败: {error}')
    else:
        print(f'文件夹 {log_dir} 已成功删除。')
        # Recreate it so this branch leaves the same state as the one below.
        os.makedirs(log_dir)
else:
    os.makedirs(log_dir)
    print(f'文件夹 {log_dir} 不存在，已创建文件夹 {log_dir}。')

In [None]:
# One-step actor-critic training on CartPole-v1.
summary_writer = SummaryWriter(log_dir=log_dir)
env = gym.make('CartPole-v1')
gamma = 0.98
n_actions = env.action_space.n
n_features = env.observation_space.shape[0]
actor_net = Net(n_features, n_actions)
actor_optimizer = torch.optim.Adam(actor_net.parameters(), lr=1e-3)
critic_net = Net(n_features, 1)
critic_optimizer = torch.optim.Adam(critic_net.parameters(), lr=1e-3)

episodes = 5000          # maximum number of training episodes
steps = 5000             # maximum number of environment steps per episode
terminal_reward = -20    # penalty substituted for the reward of the failing step
solved_steps = 1000      # stop training once an episode lasts this many steps

try:
    for episode in range(episodes):
        start_time = time.time()
        state, _ = env.reset()
        step = 0
        while step < steps:
            _action = choose_action(actor_net, state)
            # `done` is gym's `terminated` flag. `truncated` is unpacked but not
            # used to end the episode, so episodes can run past the
            # environment's own time limit, which the `solved_steps` criterion
            # below depends on.
            # NOTE(review): gym documents that reset() should follow
            # truncation; confirm that stepping past it is intended.
            new_state, reward, done, truncated, info = env.step(_action)
            if done:
                reward = terminal_reward
            td_error = learn_critic(critic_net, critic_optimizer, state, reward, new_state, gamma)
            learn_actor(actor_net, actor_optimizer, state, _action, td_error)
            step += 1
            state = new_state
            if done:
                break
        # Log every episode, including those that hit the step cap without failing.
        summary_writer.add_scalar('step', step, episode)
        print('Episode: {}/{}  | Step: {}  | Running Time: {:.4f}'.format(episode,
                                                                          episodes,
                                                                          step,
                                                                          time.time() - start_time))
        if step >= solved_steps:
            break
finally:
    # Flush pending TensorBoard events and release the environment even if
    # training is interrupted.
    summary_writer.close()
    env.close()


In [None]:
# Roll out the trained actor for one rendered episode.
env = gym.make("CartPole-v1", render_mode='human')
state, _ = env.reset()
step = 0
total_reward = 0
try:
    while True:
        # Actions are sampled from the policy. For a greedy rollout, take the
        # argmax of actor_net's output instead.
        a = choose_action(actor_net, state)
        new_state, reward, terminated, truncated, _ = env.step(a)
        step += 1
        total_reward += reward
        state = new_state
        # Stop on truncation as well as termination; otherwise a policy that
        # never drops the pole keeps this loop running indefinitely.
        done = terminated or truncated
        if done:
            break
finally:
    # Close the render window even if the rollout is interrupted.
    env.close()
print('step:{}'.format(step))

In [None]:
# Extra close in case the previous cell was interrupted before its own
# env.close() call was reached.
env.close()