In [None]:
import os
import shutil
import time

import gym
import numpy as np
import torch
from torch.utils.tensorboard import SummaryWriter

# Module-level Pendulum-v1 environment. The network and agent classes defined below read
# its observation/action space sizes for their default arguments, and the training loop
# steps it directly.
env = gym.make('Pendulum-v1')

In [None]:
class CriticNet(torch.nn.Module):
    """Value network: an MLP with two 64-unit ReLU hidden layers and a linear head.

    Args:
        state_dim: Number of input features. The default is taken from the
            module-level ``env`` observation space when the class is defined.
        output_dims: Number of output units (the PPO agent in this file uses 1).
    """

    def __init__(self, state_dim=env.observation_space.shape[0], output_dims=1):
        super().__init__()
        hidden_units = 64
        # Attribute names (fc1, fc2, fc4, relu1, relu2) are part of the saved
        # state_dict layout, and the Linear layers are created in the same order
        # as before so seeded initialisation is unchanged.
        self.fc1 = torch.nn.Linear(state_dim, hidden_units)
        self.relu1 = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(hidden_units, hidden_units)
        self.relu2 = torch.nn.ReLU()
        self.fc4 = torch.nn.Linear(hidden_units, output_dims)

    def forward(self, x):
        """Run ``x`` through both hidden layers and return the unactivated head output."""
        first_hidden = self.relu1(self.fc1(x))
        second_hidden = self.relu2(self.fc2(first_hidden))
        return self.fc4(second_hidden)

In [None]:
class ActorNet(torch.nn.Module):
    """Gaussian policy network producing a mean and a standard deviation per action.

    Two 64-unit ReLU hidden layers feed two separate linear heads: one for the
    mean (squashed with tanh and scaled by 2) and one for the standard deviation
    (passed through softplus).

    Args:
        state_dim: Number of input features. The default is taken from the
            module-level ``env`` observation space when the class is defined.
        action_dim: Number of action components. The default is taken from the
            module-level ``env`` action space when the class is defined.
    """

    def __init__(self, state_dim=env.observation_space.shape[0], action_dim=env.action_space.shape[0]):
        super().__init__()
        hidden_units = 64

        # Shared trunk. Attribute names are part of the saved state_dict layout and
        # the Linear layers are created in the same order as before.
        self.fc1 = torch.nn.Linear(state_dim, hidden_units)
        self.relu1 = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(hidden_units, hidden_units)
        self.relu2 = torch.nn.ReLU()

        # Output heads and their activations.
        self.fc_mu = torch.nn.Linear(hidden_units, action_dim)
        self.fc_sigma = torch.nn.Linear(hidden_units, action_dim)
        self.tanh = torch.nn.Tanh()
        self.softplus = torch.nn.Softplus()

    def forward(self, x):
        """Return ``(mu, sigma)`` for the input ``x``.

        ``mu`` is ``2 * tanh(.)`` and therefore lies in [-2, 2]; ``sigma`` is a
        softplus output.
        """
        features = self.relu2(self.fc2(self.relu1(self.fc1(x))))
        mean = 2 * self.tanh(self.fc_mu(features))
        std = self.softplus(self.fc_sigma(features))
        return mean, std


In [None]:
class ProximalPolicyOptimization:
    """PPO agent (clipped surrogate objective) with a Gaussian actor and a value critic.

    Holds the trainable actor, a frozen snapshot of the actor used as the "old"
    policy during an update, the critic, and one Adam optimizer for each of the
    actor and the critic.

    Args:
        state_dim: Observation size. The default is read from the module-level
            ``env`` when the class is defined.
        action_dim: Action size. The default is read from the module-level
            ``env`` when the class is defined.
    """

    def __init__(self, state_dim=env.observation_space.shape[0], action_dim=env.action_space.shape[0]):
        self.state_dim = state_dim
        self.actor_net = ActorNet(state_dim, action_dim)
        self.old_actor_net = ActorNet(state_dim, action_dim)
        self.critic_net = CriticNet(state_dim, 1)
        self.actor_optimizer = torch.optim.Adam(self.actor_net.parameters(), lr=1e-4)
        self.critic_optimizer = torch.optim.Adam(self.critic_net.parameters(), lr=2e-4)
        self.epsilon = 0.2  # clip range of the probability ratio, used in train()
        self.gamma = 0.9  # discount factor, passed by callers to compute_discounted_reward()

    def choose_action(self, state):
        """Sample an action from the current policy for a single state.

        Args:
            state: One observation (array-like or tensor) of length ``state_dim``.

        Returns:
            A list of Python floats, one per action component, each clipped to
            [-2, 2].
        """
        # as_tensor accepts arrays and tensors alike, so a caller passing a tensor
        # no longer triggers torch.tensor's copy-construct warning.
        state = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
        with torch.no_grad():
            mu, sigma = self.actor_net(state)
            action = torch.clip(torch.distributions.Normal(mu, sigma).sample(), -2, 2)
        # Drop the batch axis; tolist() works for any action_dim, whereas .item()
        # only worked for a single-component action.
        return action.squeeze(0).tolist()

    def compute_discounted_reward(self, rewards, gamma, new_state):
        """Compute bootstrapped discounted returns for a trajectory segment.

        Args:
            rewards: Sequence of per-step rewards, oldest first.
            gamma: Discount factor.
            new_state: Observation that follows the last reward; the critic's value
                for it seeds the backward recursion.

        Returns:
            A list with one discounted return per reward, in the same order as
            ``rewards``.
        """
        # Only the scalar value is needed, so skip building an autograd graph.
        with torch.no_grad():
            v_ = self.critic_net(torch.as_tensor(new_state, dtype=torch.float32).unsqueeze(0)).item()
        discounted_reward = []
        for r in reversed(rewards):
            v_ = r + gamma * v_
            discounted_reward.append(v_)
        discounted_reward.reverse()
        return discounted_reward

    def train(self, state, action, reward, actor_updates=10, critic_updates=10):
        """Run one PPO update on a batch of transitions.

        The actor is updated first (``actor_updates`` gradient steps on the clipped
        surrogate objective), then the critic (``critic_updates`` steps on the mean
        squared error against ``reward``).

        Args:
            state: Batch of observations; the caller in this file passes an
                ``(N, state_dim)`` array.
            action: Batch of actions taken; the caller passes an ``(N, 1)`` array.
            reward: Batch of discounted returns; the caller passes an ``(N, 1)`` array.
            actor_updates: Number of actor gradient steps.
            critic_updates: Number of critic gradient steps.
        """
        # Freeze a snapshot of the current policy to act as the "old" policy.
        self.old_actor_net.load_state_dict(self.actor_net.state_dict())

        # The batch does not change during the update, so convert it once.
        states = torch.as_tensor(state, dtype=torch.float32)
        actions = torch.as_tensor(action, dtype=torch.float32)
        returns = torch.as_tensor(reward, dtype=torch.float32)

        # Compute the advantage (i.e. the TD error) and the old-policy log-probs.
        # Neither depends on the actor parameters being optimised and the critic is
        # not updated until the second loop, so both are constant across the actor
        # steps. no_grad keeps the actor loss from back-propagating into the critic
        # and the old-policy snapshot.
        with torch.no_grad():
            adv = returns - self.critic_net(states)
            old_log_prob = torch.distributions.Normal(*self.old_actor_net(states)).log_prob(actions)

        for _ in range(actor_updates):
            pi = torch.distributions.Normal(*self.actor_net(states))
            ratio = torch.exp(pi.log_prob(actions) - old_log_prob)
            # Clip with self.epsilon rather than a hard-coded copy of the same value.
            clipped_ratio = torch.clamp(ratio, 1. - self.epsilon, 1. + self.epsilon)
            loss = -torch.mean(torch.min(ratio * adv, clipped_ratio * adv))

            self.actor_optimizer.zero_grad()
            loss.backward()
            self.actor_optimizer.step()

        for _ in range(critic_updates):
            value_error = returns - self.critic_net(states)
            loss = torch.mean(torch.square(value_error))
            self.critic_optimizer.zero_grad()
            loss.backward()
            self.critic_optimizer.step()

    def model_save(self, path):
        """Write all three networks' weights and both optimizer states to ``path``."""
        torch.save({
            'actor_net_model_state_dict': self.actor_net.state_dict(),
            'old_actor_net_model_state_dict': self.old_actor_net.state_dict(),
            'critic_net_model_state_dict': self.critic_net.state_dict(),
            'actor_optimizer_state_dict': self.actor_optimizer.state_dict(),
            'critic_optimizer_state_dict': self.critic_optimizer.state_dict(),
        }, path)

    def model_load(self, path):
        """Restore networks and optimizers from a checkpoint written by ``model_save``."""
        # NOTE: torch.load unpickles the file; only load checkpoints from trusted sources.
        checkpoint = torch.load(path)
        self.actor_net.load_state_dict(checkpoint['actor_net_model_state_dict'])
        self.old_actor_net.load_state_dict(checkpoint['old_actor_net_model_state_dict'])
        self.critic_net.load_state_dict(checkpoint['critic_net_model_state_dict'])
        self.actor_optimizer.load_state_dict(checkpoint['actor_optimizer_state_dict'])
        self.critic_optimizer.load_state_dict(checkpoint['critic_optimizer_state_dict'])

In [None]:
# Start every run from an empty TensorBoard log directory.
log_dir = './runs'
if os.path.exists(log_dir):
    try:
        shutil.rmtree(log_dir)
        print(f'文件夹 {log_dir} 已成功删除。')
    except OSError as error:
        print(f'删除文件夹 {log_dir} 失败: {error}')
    # Recreate the directory so both branches leave it present; exist_ok covers
    # the case where the removal above failed and the directory is still there.
    os.makedirs(log_dir, exist_ok=True)
else:
    os.makedirs(log_dir)
    print(f'文件夹 {log_dir} 不存在，已创建文件夹 {log_dir}。')

In [None]:
# Training configuration.
summary_writer = SummaryWriter(log_dir=log_dir)
batch = 32      # transitions collected between PPO updates
ppo = ProximalPolicyOptimization()
episode = 500   # number of training episodes
steps = 320     # environment steps per episode
all_reward = []

for epoch in range(episode):
    state, _ = env.reset()
    step = 0
    episode_rewards = 0
    buffer_state, buffer_action, buffer_reward = [], [], []
    while step < steps:
        # Sample an action from the current policy and advance the environment.
        # NOTE(review): the termination/truncation flags returned by env.step are
        # ignored, so the rollout always runs for `steps` steps - confirm intended.
        action = ppo.choose_action(state)
        new_state, reward, done, _, _ = env.step(action)
        buffer_state.append(state)
        buffer_action.append(action)
        # Rescale the reward before it is used as a training target.
        buffer_reward.append((reward + 6) / 10)
        state = new_state
        episode_rewards += reward
        step += 1
        # `step` now counts the transitions collected so far. Update every `batch`
        # transitions and once more at the end of the rollout, so no transition
        # is left unused in the buffer.
        if step % batch == 0 or step == steps:
            discounted_reward = ppo.compute_discounted_reward(buffer_reward, ppo.gamma, new_state)
            buffer_state, buffer_action, discounted_reward = np.vstack(buffer_state), np.vstack(
                buffer_action), np.vstack(discounted_reward)
            ppo.train(buffer_state, buffer_action, discounted_reward)
            buffer_state, buffer_action, buffer_reward = [], [], []
    all_reward.append(episode_rewards)
    summary_writer.add_scalar('episode_rewards', episode_rewards, epoch)
    print("Epoch/Episode: {}/{},reward: {}".format(epoch + 1, episode, episode_rewards))

# Flush pending events and release the event file.
summary_writer.close()

In [None]:
# Roll out one rendered episode with the trained policy.
env = gym.make("Pendulum-v1", render_mode='human')
state, _ = env.reset()
step = 0
episode_rewards = 0
done = False
# Stop when the environment reports termination or truncation, so the cells
# after this one are reached when the notebook is run top to bottom.
while not done:
    # choose_action converts the observation itself; no tensor wrapping is needed.
    a = ppo.choose_action(state)
    new_state, reward, terminated, truncated, _ = env.step(a)
    done = terminated or truncated
    episode_rewards += reward
    step += 1
    state = new_state
print("Evaluation steps: {}, reward: {}".format(step, episode_rewards))

In [None]:
# Release the environment; when cells are run in order this is the render_mode='human'
# instance created in the evaluation cell.
env.close()

In [None]:
# Save the actor, old-actor and critic weights together with both optimizer states.
ppo.model_save('PPO_Pendulum_V1.pth')