In [None]:
import os
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Normal
import torch.nn.functional as F
import matplotlib.pyplot as plt

from collections import deque, namedtuple
import random

In [None]:
random_seed = 42
# Seed every RNG this notebook draws from. Python's `random` is included
# because ReplayBuffer.sample relies on random.sample.
random.seed(random_seed)
torch.manual_seed(random_seed)
np.random.seed(random_seed)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

env = gym.make('LunarLanderContinuous-v2', render_mode='rgb_array')
observation_space = env.observation_space.shape[0]  # state dimension
action_space = env.action_space.shape[0]   # action dimension
# Upper bound of the first action component; DDPGAgent.act clips to
# [-action_bound, action_bound].
action_bound = env.action_space.high[0]
# [low, high] action bounds as tensors on `device`, used to clamp sampled actions.
action_bound_tensor = [torch.tensor(env.action_space.low, device=device), torch.tensor(env.action_space.high, device=device)]

BATCH_SIZE = 10        # episodes collected per batch (one logged/plotted point)
MAX_NUM_BATCH = 200    # bound on the batch counter in the training loops

## PG

In [None]:
class Policy_Gradient:
    """REINFORCE-style agent with a diagonal Gaussian policy.

    A two-layer MLP produces the action mean; a state-independent learnable
    vector ``action_log_std`` provides the log standard deviation. The caller
    appends per-step log-probabilities to ``episode_actions`` and rewards to
    ``episode_rewards``, then calls :meth:`backward` once per episode.

    Args:
        hidden_size: Width of the hidden layer.
        input_size: Dimension of the observation vector.
        output_size: Dimension of the action vector.
        learning_rate: Adam learning rate.
        eps: Adam epsilon.
        gamma: Discount factor used for the reward-to-go.
    """

    def __init__(self, hidden_size, input_size, output_size, learning_rate, eps, gamma):
        self.model = nn.Sequential(
            nn.Linear(input_size, hidden_size, bias=False),
            nn.ReLU(),
            nn.Linear(hidden_size, output_size)
        ).to(device)

        # Created directly on `device` so it can be combined with the model
        # output when running on CUDA.
        self.action_log_std = nn.Parameter(torch.zeros(output_size, device=device))

        self.lr = learning_rate
        self.eps = eps
        self.gamma = gamma

        # `action_log_std` is not part of `self.model`, so it has to be handed
        # to the optimizer explicitly or it would never be trained.
        self.optimizer = optim.Adam(
            list(self.model.parameters()) + [self.action_log_std],
            lr=learning_rate,
            eps=eps,
        )
        self.reset()

    def forward(self, x):
        """Return ``(action_mean, action_std)`` of the Gaussian policy for state(s) ``x``."""
        action_mean = self.model(x)
        action_log_std = self.action_log_std.expand_as(action_mean)
        action_std = torch.exp(action_log_std)
        return action_mean, action_std

    def reset(self):
        """Clear the per-episode log-probability tensor and reward list."""
        self.episode_actions = torch.tensor([], requires_grad=True, device=device)
        self.episode_rewards = []

    def save_checkpoint(self, directory, episode):
        """Write the policy weights to ``<directory>/checkpoint_<episode>.pth``."""
        os.makedirs(directory, exist_ok=True)
        filename = os.path.join(directory, 'checkpoint_{}.pth'.format(episode))
        torch.save(
            {
                'model': self.model.state_dict(),
                'action_log_std': self.action_log_std.detach().cpu(),
            },
            f=filename,
        )
        print('保存当前模型至 \'{}\''.format(filename))

    def load_checkpoint(self, directory, filename):
        """Restore weights from a checkpoint and return the batch index encoded in its name.

        Accepts both the current format (dict with ``'model'`` and
        ``'action_log_std'``) and the legacy format (a bare ``model`` state dict).
        ``filename`` must look like ``checkpoint_<int>.pth``.
        """
        checkpoint = torch.load(os.path.join(directory, filename), map_location=device)
        if 'model' in checkpoint:
            self.model.load_state_dict(checkpoint['model'])
            with torch.no_grad():
                self.action_log_std.copy_(checkpoint['action_log_std'])
        else:
            # Legacy checkpoint: only the mean network was stored.
            self.model.load_state_dict(checkpoint)
        print('重新开始训练 checkpoint \'{}\'.'.format(filename))
        # Strip the 'checkpoint_' prefix (11 chars) and the '.pth' suffix (4 chars).
        return int(filename[11:-4])

    def backward(self):
        """Run one policy-gradient update from the stored episode, then reset the buffers."""
        if not self.episode_rewards:
            # Nothing was collected; there is no gradient to take.
            self.reset()
            return

        # Discounted reward-to-go, accumulated from the last step backwards.
        future_reward = 0
        returns = []
        for r in reversed(self.episode_rewards):
            future_reward = r + self.gamma * future_reward
            returns.append(future_reward)
        returns.reverse()
        returns = torch.tensor(returns, dtype=torch.float32, device=device)

        # Standardise the returns; skipped for a single step because the
        # standard deviation of one element is NaN and would poison the weights.
        if returns.numel() > 1:
            returns = (returns - returns.mean()) / (returns.std() + np.finfo(np.float32).eps)

        loss = -(self.episode_actions * returns).sum()
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.reset()

In [None]:
# Train the vanilla policy-gradient agent: each batch runs BATCH_SIZE episodes,
# updating the policy after every episode, and logs the mean episode return.
env = gym.make('LunarLanderContinuous-v2', render_mode='rgb_array')
load_filename = None
save_directory = "./pg_weight"
batch_rewards = []
batch = 0
epsilon = 1e-4  # not referenced anywhere in this notebook
pg_reward_list = []

hidden_size = 1024
lr = 1e-4
eps = 1e-4
gamma = 0.99

pg_agent = Policy_Gradient(hidden_size=hidden_size, input_size=observation_space, output_size=action_space, learning_rate=lr, eps=eps, gamma=gamma)

if load_filename is not None:
    batch = pg_agent.load_checkpoint(save_directory, load_filename)

# Strict `<` so that exactly MAX_NUM_BATCH batches run when starting from 0.
while batch < MAX_NUM_BATCH:
    batch_rewards = []

    for _ in range(BATCH_SIZE):
        state = env.reset()[0]
        for _ in range(10000):
            state_tensor = torch.tensor(state, dtype=torch.float32, device=device)
            action_mean, action_std = pg_agent.forward(state_tensor)
            dist = Normal(action_mean, action_std)
            sampled_action = dist.sample()
            # Score the action that was actually drawn from the policy; the
            # clamp below only makes it valid for the environment.
            log_prob = dist.log_prob(sampled_action).sum().reshape(1)
            action = torch.clamp(sampled_action, action_bound_tensor[0], action_bound_tensor[1])
            # env.step returns (obs, reward, terminated, truncated, info);
            # either flag ends the episode.
            next_state, reward, terminated, truncated, _ = env.step(action.cpu().numpy())
            pg_agent.episode_actions = torch.cat([pg_agent.episode_actions, log_prob])
            pg_agent.episode_rewards.append(reward)
            state = next_state
            if terminated or truncated:
                break

        # Record the undiscounted episode return before backward() clears the buffers.
        batch_rewards.append(np.sum(pg_agent.episode_rewards))
        pg_agent.backward()

    batch += 1

    if batch % 50 == 0 and save_directory is not None:
        pg_agent.save_checkpoint(save_directory, batch)

    print('Batch: {}, average reward: {}'.format(batch, np.array(batch_rewards).mean()))
    pg_reward_list.append(np.array(batch_rewards).mean())

env.close()


In [None]:
# Learning curve of the policy-gradient agent: one point per training batch.
plt.plot(pg_reward_list)
plt.xlim([0,200])
plt.grid(True)
# To pin the y-range as well, uncomment: plt.ylim([-300,300])
plt.xlabel('Number of Policy Iterations')
plt.ylabel('Batch Average Reward')

## PPO

In [None]:
class Memory:
    """Rollout buffer holding parallel per-step lists for PPO.

    The lists are ``actions``, ``states``, ``logprobs``, ``rewards`` and
    ``is_terminals``.
    """

    # Names of the per-step lists kept on every instance.
    _FIELDS = ('actions', 'states', 'logprobs', 'rewards', 'is_terminals')

    def __init__(self):
        for field in self._FIELDS:
            setattr(self, field, [])

    def clear_memory(self):
        """Empty every list in place (the list objects themselves are kept)."""
        for field in self._FIELDS:
            getattr(self, field).clear()


In [None]:
class ActorCriticContinuous(nn.Module):
    """Actor-critic network with a diagonal Gaussian policy head.

    The actor MLP outputs the action mean, ``action_log_std`` is a learnable
    state-independent log standard deviation, and a separate MLP outputs a
    scalar state value.

    Args:
        input_size: Dimension of the observation vector.
        output_size: Dimension of the action vector.
        hidden_size: Width of the hidden layer in both MLPs.
    """

    def __init__(self, input_size, output_size, hidden_size):
        super(ActorCriticContinuous, self).__init__()

        self.action_layer = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, output_size),
        )

        self.action_log_std = nn.Parameter(torch.zeros(output_size))

        self.value_layer = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1)
        )

    def act(self, state, memory):
        """Sample an action for a NumPy ``state`` and record the step in ``memory``.

        Appends the state tensor, the sampled action and its summed
        log-probability to ``memory`` and returns the action as a NumPy array.
        """
        # Rollout collection needs no autograd graph: the stored tensors are
        # only ever consumed as detached constants.
        with torch.no_grad():
            state = torch.from_numpy(state).float().to(device)
            action_mean = self.action_layer(state)
            action_log_std = self.action_log_std.expand_as(action_mean)
            action_std = torch.exp(action_log_std)
            dist = Normal(action_mean, action_std)
            action = dist.sample()
            logprob = dist.log_prob(action).sum(dim=-1)

        memory.states.append(state)
        memory.actions.append(action)
        memory.logprobs.append(logprob)

        return action.cpu().numpy()

    def evaluate(self, state, action):
        """Score ``action`` under the current policy and value ``state``.

        Returns:
            Tuple ``(action_logprobs, state_value, dist_entropy)``. Log-prob
            and entropy are summed over the action dimension; the value has
            its trailing size-1 dimension removed.
        """
        action_mean = self.action_layer(state)
        action_log_std = self.action_log_std.expand_as(action_mean)
        action_std = torch.exp(action_log_std)
        dist = Normal(action_mean, action_std)

        action_logprobs = dist.log_prob(action).sum(dim=-1)
        dist_entropy = dist.entropy().sum(dim=-1)
        state_value = self.value_layer(state)

        # Drop only the trailing dimension so a batch of one keeps its batch
        # axis and stays aligned with the log-probs.
        return action_logprobs, state_value.squeeze(-1), dist_entropy


In [None]:
class PPOAgent:
    """Clipped-surrogate PPO agent for continuous actions.

    Actions are sampled from ``policy_old``; every ``update_timestep`` calls to
    :meth:`step` the collected rollout is used for ``K_epochs`` gradient steps
    on ``policy``, after which ``policy_old`` is synchronised with it.

    Args:
        input_size: Dimension of the observation vector.
        output_size: Dimension of the action vector.
        hidden_size: Hidden width of the actor and critic MLPs.
        lr: Adam learning rate.
        eps: Adam epsilon.
        gamma: Discount factor.
        K_epochs: Gradient steps per update.
        eps_clip: Clipping range for the probability ratio.
        update_timestep: Number of environment steps between updates.
    """

    def __init__(self, input_size, output_size, hidden_size, lr, eps, gamma, K_epochs, eps_clip, update_timestep):
        self.lr = lr
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs
        self.timestep = 0
        self.memory = Memory()
        self.update_timestep = update_timestep

        self.policy = ActorCriticContinuous(input_size, output_size, hidden_size).to(device)
        self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr, eps=eps)
        self.policy_old = ActorCriticContinuous(input_size, output_size, hidden_size).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())

        self.MseLoss = nn.MSELoss()

    def update(self):
        """Optimise ``policy`` on the stored rollout, then sync ``policy_old``."""
        # Discounted returns, accumulated backwards and reset at every step
        # flagged as terminal. Built with append + reverse (linear time).
        returns = []
        discounted_reward = 0
        for reward, is_terminal in zip(reversed(self.memory.rewards), reversed(self.memory.is_terminals)):
            if is_terminal:
                discounted_reward = 0
            discounted_reward = reward + (self.gamma * discounted_reward)
            returns.append(discounted_reward)
        returns.reverse()

        returns = torch.tensor(returns, dtype=torch.float32).to(device)
        returns = (returns - returns.mean()) / (returns.std() + 1e-5)

        old_states = torch.stack(self.memory.states).detach().to(device)
        old_actions = torch.stack(self.memory.actions).detach().to(device)
        old_logprobs = torch.stack(self.memory.logprobs).detach().to(device)

        for _ in range(self.K_epochs):
            logprobs, state_values, dist_entropy = self.policy.evaluate(old_states, old_actions)
            # Probability ratio pi_new / pi_old, computed in log space.
            ratios = torch.exp(logprobs - old_logprobs)

            advantages = returns - state_values.detach()
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * advantages
            # Clipped surrogate + 0.5 * value MSE - 0.01 * entropy bonus.
            # The scalar MSE broadcasts over the per-sample terms.
            loss = -torch.min(surr1, surr2) + 0.5 * self.MseLoss(state_values, returns) - 0.01 * dist_entropy

            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

        self.policy_old.load_state_dict(self.policy.state_dict())

    def step(self, reward, done):
        """Record the outcome of the latest action; update once enough steps are stored."""
        self.timestep += 1
        self.memory.rewards.append(reward)
        self.memory.is_terminals.append(done)

        if self.timestep % self.update_timestep == 0:
            self.update()
            self.memory.clear_memory()
            self.timestep = 0

    def act(self, state):
        """Sample an action from ``policy_old`` and store the step in memory."""
        return self.policy_old.act(state, self.memory)

    def save_checkpoint(self, directory, episode):
        """Write ``policy`` weights to ``<directory>/checkpoint_<episode>.pth``."""
        os.makedirs(directory, exist_ok=True)
        filename = os.path.join(directory, 'checkpoint_{}.pth'.format(episode))
        torch.save(self.policy.state_dict(), filename)
        print('保存当前模型至 \'{}\''.format(filename))

    def load_checkpoint(self, directory, filename):
        """Restore weights and return the batch index encoded in ``checkpoint_<int>.pth``."""
        state_dict = torch.load(os.path.join(directory, filename), map_location=device)
        self.policy.load_state_dict(state_dict)
        # `act` samples from policy_old, so it must receive the restored
        # weights too; otherwise the agent acts with its random
        # initialisation until the first update.
        self.policy_old.load_state_dict(self.policy.state_dict())
        print('重新开始训练 checkpoint \'{}\'.'.format(filename))
        # Strip the 'checkpoint_' prefix (11 chars) and the '.pth' suffix (4 chars).
        return int(filename[11:-4])


In [None]:
# Train the PPO agent: each batch runs BATCH_SIZE episodes and logs the mean
# episode return; the agent updates itself every `update_timestep` steps.
hidden_size = 1024
update_timestep = 1200
lr = 1e-4
eps = 1e-4
gamma = 0.99
K_epochs = 4
eps_clip = 0.2
load_filename = None
batch = 0
save_directory = "./ppo_weight"

# The environment used by the previous section was closed there, so build a
# fresh one rather than stepping a closed env.
env = gym.make('LunarLanderContinuous-v2', render_mode='rgb_array')

ppo_agent = PPOAgent(observation_space, action_space, hidden_size, lr, eps, gamma, K_epochs, eps_clip, update_timestep)

ppo_reward_list = []

if load_filename is not None:
    batch = ppo_agent.load_checkpoint(save_directory, load_filename)

# Strict `<` so that exactly MAX_NUM_BATCH batches run when starting from 0.
while batch < MAX_NUM_BATCH:
    total_rewards = []

    for _ in range(BATCH_SIZE):
        state = env.reset()[0]
        total_reward = 0
        for _ in range(10000):
            action = ppo_agent.act(state)
            # env.step returns (obs, reward, terminated, truncated, info);
            # either flag ends the episode.
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            state = next_state
            total_reward += reward
            ppo_agent.step(reward, done)
            if done:
                break
        # Record every episode, so the logged value is a true batch average.
        total_rewards.append(total_reward)

    batch += 1

    if batch % 50 == 0 and save_directory is not None:
        ppo_agent.save_checkpoint(save_directory, batch)

    print('Batch: {}, average reward: {}'.format(batch, np.array(total_rewards).mean()))
    ppo_reward_list.append(np.array(total_rewards).mean())

env.close()


In [None]:
# Learning curve of the PPO agent: one point per training batch.
plt.plot(ppo_reward_list)
plt.xlim([0,200])
plt.grid(True)
# To pin the y-range as well, uncomment: plt.ylim([-300,300])
plt.xlabel('Number of Policy Iterations')
plt.ylabel('Batch Average Reward')

## DDPG


In [None]:
class ReplayBuffer:
    """Bounded FIFO store of transitions with uniform random minibatch sampling.

    Args:
        buffer_size: Maximum number of transitions kept; the oldest are dropped first.
        batch_size: Number of transitions returned by :meth:`sample`.
    """

    def __init__(self, buffer_size, batch_size):
        self.batch_size = batch_size
        self.memory = deque(maxlen=buffer_size)
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])

    def add(self, state, action, reward, next_state, done):
        """Append one transition to the buffer."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Draw ``batch_size`` transitions without replacement.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` of float
            tensors on ``device``, each built by vertically stacking the
            corresponding field of the drawn transitions.
        """
        drawn = [e for e in random.sample(self.memory, k=self.batch_size) if e is not None]

        def stacked(field):
            # One row per drawn transition.
            return np.vstack([getattr(e, field) for e in drawn])

        def to_float_tensor(array):
            return torch.from_numpy(array).float().to(device)

        states = to_float_tensor(stacked("state"))
        actions = to_float_tensor(stacked("action"))
        rewards = to_float_tensor(stacked("reward"))
        next_states = to_float_tensor(stacked("next_state"))
        # Booleans are converted to uint8 first, then to float like the rest.
        dones = to_float_tensor(stacked("done").astype(np.uint8))

        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)


In [None]:
class Actor(nn.Module):
    """Deterministic policy network: state -> action squashed by tanh.

    Args:
        input_size: Dimension of the state vector.
        output_size: Dimension of the action vector.
        hidden_size: Width of both hidden layers.
    """

    def __init__(self, input_size, output_size, hidden_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-draw the weights: hidden layers from ``hidden_init``, output layer from U(-3e-3, 3e-3)."""
        for hidden_layer in (self.fc1, self.fc2):
            low, high = hidden_init(hidden_layer)
            hidden_layer.weight.data.uniform_(low, high)
        self.fc3.weight.data.uniform_(-3e-3, 3e-3)

    def forward(self, state):
        """Map ``state`` to an action whose components lie in [-1, 1]."""
        hidden = F.relu(self.fc1(state))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden).tanh()

class Critic(nn.Module):
    """Q-network: (state, action) -> scalar value.

    The state passes through the first layer alone; the action is concatenated
    to that hidden representation before the second layer.

    Args:
        input_size: Dimension of the state vector.
        output_size: Dimension of the action vector.
        hidden_size: Width of both hidden layers.
    """

    def __init__(self, input_size, output_size, hidden_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size + output_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, 1)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-draw the weights: hidden layers from ``hidden_init``, output layer from U(-3e-3, 3e-3)."""
        for hidden_layer in (self.fc1, self.fc2):
            low, high = hidden_init(hidden_layer)
            hidden_layer.weight.data.uniform_(low, high)
        self.fc3.weight.data.uniform_(-3e-3, 3e-3)

    def forward(self, state, action):
        """Return Q(state, action) with one value per batch row."""
        state_features = F.relu(self.fc1(state))
        joint = torch.cat((state_features, action), dim=1)
        hidden = F.relu(self.fc2(joint))
        return self.fc3(hidden)

def hidden_init(layer):
    """Return the symmetric uniform range ``(-1/sqrt(fan_in), 1/sqrt(fan_in))`` for ``layer``.

    ``nn.Linear`` stores its weight as ``(out_features, in_features)``, so the
    fan-in (number of inputs per unit) is the second dimension of the weight.

    Args:
        layer: A layer exposing a 2-D ``weight`` tensor, e.g. ``nn.Linear``.

    Returns:
        Tuple ``(low, high)`` suitable for ``tensor.uniform_(low, high)``.
    """
    fan_in = layer.weight.data.size()[1]
    lim = 1. / np.sqrt(fan_in)
    return (-lim, lim)



In [None]:
class DDPGAgent:
    """DDPG agent: deterministic actor, Q-critic, target copies of both, replay buffer and OU noise.

    Args:
        input_size: Dimension of the state vector.
        action_size: Dimension of the action vector.
        hidden_size: Hidden width of actor and critic.
        lr_actor: Adam learning rate of the actor.
        lr_critic: Adam learning rate of the critic.
        gamma: Discount factor.
        tau: Soft-update interpolation factor for the target networks.
        buffer_size: Replay buffer capacity.
        batch_size: Minibatch size drawn from the replay buffer.
    """

    def __init__(self, input_size, action_size, hidden_size, lr_actor, lr_critic, gamma, tau, buffer_size, batch_size):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(buffer_size, batch_size)

        self.actor_local = Actor(input_size, action_size, hidden_size).to(device)
        self.actor_target = Actor(input_size, action_size, hidden_size).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

        self.critic_local = Critic(input_size, action_size, hidden_size).to(device)
        self.critic_target = Critic(input_size, action_size, hidden_size).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic, weight_decay=0.0001)

        self.noise = OUNoise(action_size)

        self.update_targets(1.0)  # hard update: targets start as exact copies

    def act(self, state, add_noise=True):
        """Return the actor's action for a NumPy ``state``, optionally perturbed by OU noise.

        The result is clipped to ``[-action_bound, action_bound]``.
        """
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -action_bound, action_bound)

    def step(self, state, action, reward, next_state, done):
        """Store a transition and learn from a minibatch once the buffer holds more than ``batch_size`` items."""
        self.memory.add(state, action, reward, next_state, done)

        if len(self.memory) > self.memory.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

    def learn(self, experiences):
        """Run one critic update, one actor update and a soft target update.

        Args:
            experiences: Tuple ``(states, actions, rewards, next_states, dones)``
                as returned by ``ReplayBuffer.sample``.
        """
        states, actions, rewards, next_states, dones = experiences

        # Critic update. The TD target is a constant for this step, so it is
        # computed without autograd; this avoids building a graph through the
        # target networks and accumulating never-used gradients on them.
        with torch.no_grad():
            actions_next = self.actor_target(next_states)
            Q_targets_next = self.critic_target(next_states, actions_next)
            Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Actor update: ascend the critic's estimate of the actor's own actions.
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Soft update of the target networks.
        self.update_targets(self.tau)

    def update_targets(self, tau):
        """Blend local weights into the targets: ``target = tau * local + (1 - tau) * target``."""
        for target_param, local_param in zip(self.actor_target.parameters(), self.actor_local.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
        for target_param, local_param in zip(self.critic_target.parameters(), self.critic_local.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def save_checkpoint(self, directory, episode):
        """Write local actor and critic weights to ``<directory>/checkpoint_<episode>.pth``."""
        os.makedirs(directory, exist_ok=True)
        filename = os.path.join(directory, 'checkpoint_{}.pth'.format(episode))
        torch.save({
            'actor_state_dict': self.actor_local.state_dict(),
            'critic_state_dict': self.critic_local.state_dict()
        }, filename)
        print('保存当前模型至 \'{}\''.format(filename))

    def load_checkpoint(self, directory, filename):
        """Restore local weights, re-sync the targets and return the batch index from the file name."""
        checkpoint = torch.load(os.path.join(directory, filename), map_location=device)
        self.actor_local.load_state_dict(checkpoint['actor_state_dict'])
        self.critic_local.load_state_dict(checkpoint['critic_state_dict'])
        # Target weights are not stored in the checkpoint; without this hard
        # update the TD targets would come from unrelated random networks.
        self.update_targets(1.0)
        print('重新开始训练 checkpoint \'{}\'.'.format(filename))
        return int(filename.split('_')[1].split('.')[0])

class OUNoise:
    """Ornstein-Uhlenbeck noise process.

    Args:
        size: Length of the noise vector.
        mu: Mean the process reverts to.
        theta: Mean-reversion rate.
        sigma: Scale of the Gaussian increment.
    """

    def __init__(self, size, mu=0., theta=0.15, sigma=0.2):
        self.size = size
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Set the internal state back to the mean."""
        self.state = np.ones(self.size) * self.mu

    def sample(self):
        """Advance the process by one step and return the new state."""
        previous = self.state
        pull_to_mean = self.theta * (self.mu - previous)
        gaussian_kick = self.sigma * np.random.randn(len(previous))
        self.state = previous + (pull_to_mean + gaussian_kick)
        return self.state


In [None]:
# Train the DDPG agent: each batch runs BATCH_SIZE episodes and logs the mean
# episode return; the agent learns from replay after every environment step.
hidden_size = 256          # units per hidden layer
lr_actor = 1e-4            # actor learning rate
lr_critic = 1e-3           # critic learning rate
gamma = 0.99               # discount factor
tau = 1e-3                 # soft-update coefficient
buffer_size = int(1e6)     # replay buffer capacity
batch_size = 128           # replay minibatch size

# The environment used by the previous section was closed there, so build a
# fresh one rather than stepping a closed env.
env = gym.make('LunarLanderContinuous-v2', render_mode='rgb_array')

ddpg_agent = DDPGAgent(observation_space, action_space, hidden_size, lr_actor, lr_critic, gamma, tau, buffer_size, batch_size)


ddpg_reward_list = []

load_filename = None
batch = 0
save_directory = "./ddpg_weight"

if load_filename is not None:
    batch = ddpg_agent.load_checkpoint(save_directory, load_filename)

# Strict `<` so that exactly MAX_NUM_BATCH batches run when starting from 0.
while batch < MAX_NUM_BATCH:
    total_rewards = []

    # Episodes per batch come from BATCH_SIZE (as in the PG and PPO loops),
    # not from `batch_size`, which is the replay minibatch size.
    for _ in range(BATCH_SIZE):
        state = env.reset()[0]
        # Start each episode's exploration noise from its mean.
        ddpg_agent.noise.reset()
        total_reward = 0
        done = False
        while not done:
            action = ddpg_agent.act(state)
            # env.step returns (obs, reward, terminated, truncated, info).
            next_state, reward, terminated, truncated, _ = env.step(action)
            # Only a true termination should cut off bootstrapping in the TD
            # target, so `terminated` is stored as the transition's done flag.
            ddpg_agent.step(state, action, reward, next_state, terminated)
            state = next_state
            total_reward += reward
            # Either flag ends the episode; without `truncated` this loop
            # would keep stepping an episode that has been cut off.
            done = terminated or truncated
        total_rewards.append(total_reward)

    batch += 1

    if batch % 50 == 0 and save_directory is not None:
        ddpg_agent.save_checkpoint(save_directory, batch)

    print('Batch: {}, average reward: {}'.format(batch, np.array(total_rewards).mean()))
    ddpg_reward_list.append(np.array(total_rewards).mean())

env.close()

In [None]:
# Learning curve of the DDPG agent: one point per training batch.
plt.plot(ddpg_reward_list)
plt.xlim([0,200])
plt.grid(True)
# To pin the y-range as well, uncomment: plt.ylim([-300,300])
plt.xlabel('Number of Policy Iterations')
plt.ylabel('Batch Average Reward')

In [None]:
# Overlay the three learning curves on one set of axes for comparison.
# For a wider canvas, uncomment: plt.figure(figsize=(20, 6))
for curve, tag in (
    (ppo_reward_list, 'PPO'),
    (pg_reward_list, 'PG'),
    (ddpg_reward_list, 'DDPG'),
):
    plt.plot(curve, label=tag)
plt.xlim([0,200])
plt.grid(True)
plt.legend()
plt.xlabel('Number of Policy Iterations')
plt.ylabel('Batch Average Reward')