# DDPG

In [None]:
import os
import random
import shutil
import time
from collections import deque

import gym
import numpy as np
import torch
from torch.utils.tensorboard import SummaryWriter

In [None]:
class CriticNet(torch.nn.Module):
    """State-action value network Q(s, a) used as the DDPG critic.

    The observation and action vectors are concatenated, passed through one
    128-unit ReLU hidden layer, and reduced by a linear head to a single
    scalar Q-value per sample.

    Args:
        env: Object exposing ``observation_space.shape`` and
            ``action_space.shape``; only the first entry of each shape is
            read, to size the input layer.
    """

    def __init__(self, env):
        super().__init__()
        obs_dim = env.observation_space.shape[0]
        act_dim = env.action_space.shape[0]
        self.fc1 = torch.nn.Linear(obs_dim + act_dim, 128)
        self.relu1 = torch.nn.ReLU()
        # A critic scores each (state, action) pair with ONE scalar, so the
        # head is 1 wide regardless of the action dimension. The layer keeps
        # the name ``fc4`` so its state-dict keys stay unchanged.
        self.fc4 = torch.nn.Linear(128, 1)

    def forward(self, observation, action):
        """Return the Q-values for a batch of state-action pairs.

        Args:
            observation: Float tensor of shape ``(batch, obs_dim)``.
            action: Float tensor of shape ``(batch, act_dim)``.

        Returns:
            Float tensor of shape ``(batch, 1)``.
        """
        x = torch.cat([observation, action], dim=1)
        x = self.relu1(self.fc1(x))
        return self.fc4(x)

In [None]:
class ActorNet(torch.nn.Module):
    """Deterministic policy network mapping observations to bounded actions.

    One 128-unit ReLU hidden layer feeds a linear head with ``tanh``
    activation; the ``tanh`` output is then rescaled linearly from
    ``[-1, 1]`` to ``[low, high]``.

    Args:
        env: Object exposing ``observation_space.shape`` and
            ``action_space.shape``; the first entry of each sizes the input
            and output layers.
        boundary: Two-element sequence ``(low, high)`` giving the range the
            produced actions are scaled into.
    """

    # The default is an immutable tuple so it cannot be shared-and-mutated
    # across instances; any indexable pair (list, tuple) is still accepted.
    def __init__(self, env, boundary=(-2, 2)):
        super().__init__()
        self.fc1 = torch.nn.Linear(env.observation_space.shape[0], 128)
        self.relu1 = torch.nn.ReLU()

        self.fc4 = torch.nn.Linear(128, env.action_space.shape[0])
        self.tanh = torch.nn.Tanh()

        self.low = boundary[0]
        self.high = boundary[1]

    def forward(self, x):
        """Return actions in ``[low, high]`` for a batch of observations.

        Args:
            x: Float tensor whose last dimension is the observation size.

        Returns:
            Float tensor with the same leading dimensions and the action
            size as its last dimension.
        """
        x = self.relu1(self.fc1(x))
        x = self.tanh(self.fc4(x))
        # Affine map of tanh's [-1, 1] range onto [low, high].
        return (self.high - self.low) / 2 * (x + 1) + self.low

In [None]:
class DDPG:
    """Deep Deterministic Policy Gradient agent with a replay buffer.

    Holds an actor/critic pair, target copies of both that track them through
    an exponential moving average, one Adam optimizer per online network, and
    a bounded FIFO buffer. Callers append transitions to ``self.buffer``
    directly as ``[state, action, [reward], new_state]``.

    Args:
        env: Environment handed to ``ActorNet`` / ``CriticNet`` to size the
            networks.
        batch_size: Number of transitions sampled by each :meth:`learn` call.
    """

    # Action range used both for the actor's output scaling and for clipping
    # the exploration noise.
    ACTION_LOW = -2
    ACTION_HIGH = 2

    def __init__(self, env, batch_size=64):
        self.critic = CriticNet(env)
        self.critic_target = CriticNet(env)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=2e-3)
        self.actor = ActorNet(env, boundary=[self.ACTION_LOW, self.ACTION_HIGH])
        self.actor_target = ActorNet(env, boundary=[self.ACTION_LOW, self.ACTION_HIGH])
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=1e-3)
        self.mse = torch.nn.MSELoss()
        self.buffer = deque(maxlen=50000)
        self.batch_size = batch_size
        self.gamma = 0.9
        # The targets are constructed with their own random weights; start
        # them as exact copies of the online networks so the first TD targets
        # are consistent with the networks being trained.
        self.update_target()

    def choose_action(self, state, explore=True):
        """Pick an action for a single (unbatched) observation.

        Args:
            state: Observation as an array-like or tensor.
            explore: When true, Gaussian noise with unit standard deviation
                is added and the result is clipped to the action range.

        Returns:
            The action as a list of Python floats, one per action dimension.
        """
        # as_tensor accepts lists, ndarrays and tensors alike without the
        # copy-construct warning torch.tensor() emits for tensor input.
        state = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
        with torch.no_grad():
            action = self.actor(state).squeeze(0).numpy()
        if explore:
            action = np.clip(np.random.normal(action, 1), self.ACTION_LOW, self.ACTION_HIGH)
        return action.tolist()

    def update_target(self):
        """Hard-copy the online actor and critic weights into their targets."""
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())

    def ema_update_target(self, source_model, target_model, tau=0.01):
        """Move ``target_model`` a fraction ``tau`` towards ``source_model``.

        Every state-dict entry of the target becomes
        ``tau * source + (1 - tau) * target``.
        """
        # Build each state dict once instead of once per key.
        source_state = source_model.state_dict()
        target_state = target_model.state_dict()
        blended = {
            key: tau * source_state[key] + (1 - tau) * target_state[key]
            for key in source_state
        }
        target_model.load_state_dict(blended)

    def learn(self):
        """Run one critic update and one actor update on a sampled mini-batch.

        Does nothing while the buffer holds fewer than ``batch_size``
        transitions, since a full batch cannot be sampled yet.
        """
        if len(self.buffer) < self.batch_size:
            return

        self.ema_update_target(self.critic, self.critic_target)
        self.ema_update_target(self.actor, self.actor_target)

        batch_samples = random.sample(self.buffer, self.batch_size)
        # Stack each column into one ndarray first: building a tensor from a
        # sequence of separate ndarrays is very slow.
        state_lst, action_lst, reward_lst, new_state_lst = (
            torch.as_tensor(np.asarray(column), dtype=torch.float32)
            for column in zip(*batch_samples)
        )

        # Update the critic: regress Q(s, a) towards the one-step TD target
        # computed from the target networks (no gradient flows through it).
        with torch.no_grad():
            action_target = self.actor_target(new_state_lst)
            new_q = self.critic_target(new_state_lst, action_target)
        q_target = reward_lst + self.gamma * new_q
        q_value = self.critic(state_lst, action_lst)
        td_error = self.mse(q_value, q_target)
        self.critic_optimizer.zero_grad()
        td_error.backward()
        self.critic_optimizer.step()

        # Update the actor: minimising loss_actor maximises the mean Q-value
        # the critic assigns to the actor's own actions.
        action = self.actor(state_lst)
        q_value = self.critic(state_lst, action)
        loss_actor = -torch.mean(q_value)
        self.actor_optimizer.zero_grad()
        # This backward pass also leaves gradients on the critic parameters;
        # they are cleared by critic_optimizer.zero_grad() in the next call.
        loss_actor.backward()
        self.actor_optimizer.step()

    def model_save(self, path):
        """Write all four networks and both optimizer states to ``path``."""
        torch.save({
            'actor_model_state_dict': self.actor.state_dict(),
            'actor_target_model_state_dict': self.actor_target.state_dict(),
            'critic_model_state_dict': self.critic.state_dict(),
            'critic_target_model_state_dict': self.critic_target.state_dict(),
            'actor_optimizer_state_dict': self.actor_optimizer.state_dict(),
            'critic_optimizer_state_dict': self.critic_optimizer.state_dict(),
        }, path)

    def model_load(self, path):
        """Restore the networks and optimizers from a ``model_save`` checkpoint.

        NOTE: ``torch.load`` deserialises with pickle; only load checkpoint
        files that come from a trusted source.
        """
        checkpoint = torch.load(path)
        self.actor.load_state_dict(checkpoint['actor_model_state_dict'])
        self.actor_target.load_state_dict(checkpoint['actor_target_model_state_dict'])
        self.critic.load_state_dict(checkpoint['critic_model_state_dict'])
        self.critic_target.load_state_dict(checkpoint['critic_target_model_state_dict'])
        self.actor_optimizer.load_state_dict(checkpoint['actor_optimizer_state_dict'])
        self.critic_optimizer.load_state_dict(checkpoint['critic_optimizer_state_dict'])

In [None]:
# Reset the TensorBoard log directory so each run starts from an empty folder.
log_dir = './runs'
if os.path.exists(log_dir):
    try:
        shutil.rmtree(log_dir)
    except OSError as error:
        # Best effort: report the failure and continue with whatever is there.
        print(f'删除文件夹 {log_dir} 失败: {error}')
    else:
        print(f'文件夹 {log_dir} 已成功删除。')
        # Recreate it so both branches leave an existing, empty directory.
        os.makedirs(log_dir)
else:
    os.makedirs(log_dir)
    print(f'文件夹 {log_dir} 不存在，已创建文件夹 {log_dir}。')

In [None]:
# Train a DDPG agent on Pendulum-v1 and log the raw per-episode return.
summary_writer = SummaryWriter(log_dir=log_dir)
env = gym.make('Pendulum-v1')
batch_size = 64
ddpg = DDPG(env, batch_size)
episode = 2000
steps = 64 * 8
all_reward = []
try:
    for epoch in range(episode):
        state, _ = env.reset()
        episode_rewards = 0
        # Every episode runs for exactly `steps` environment steps; the
        # termination/truncation flags returned by env.step are ignored.
        for step in range(1, steps + 1):
            # choose action
            action = ddpg.choose_action(state)
            new_state, reward, done, _, _ = env.step(action)
            # NOTE(review): `-(reward) ** 2` parses as `-(reward ** 2)`, so the
            # agent learns from the negated squared reward divided by 10 while
            # the logged episode total below uses the raw reward. Confirm this
            # shaping is intended.
            ddpg.buffer.append([state, action, [-(reward) ** 2 / 10], new_state])
            state = new_state
            episode_rewards += reward
            # One gradient update per `batch_size` collected steps, plus one
            # at the end of the episode.
            if step % batch_size == 0 or step == steps:
                ddpg.learn()
        all_reward.append(episode_rewards)
        summary_writer.add_scalar('episode_rewards', episode_rewards, epoch)
        print("Epoch/Episode: {}/{},reward: {}".format(epoch + 1, episode, episode_rewards))
finally:
    # Flush pending TensorBoard events and release the training environment
    # even when training is interrupted.
    summary_writer.close()
    env.close()

In [None]:
# Roll out the trained policy without exploration noise in a rendered
# environment, for one episode.
env = gym.make("Pendulum-v1", render_mode='human')
state, _ = env.reset()
step = 0
episode_rewards = 0
done = False
# Stop once the environment reports termination or truncation instead of
# looping forever, so the cells after this one can be reached.
while not done:
    # choose_action converts the observation to a tensor itself.
    a = ddpg.choose_action(state, False)
    new_state, reward, terminated, truncated, _ = env.step(a)
    episode_rewards += reward
    step += 1
    state = new_state
    done = terminated or truncated

In [None]:
# Release the rendering environment created for the evaluation rollout.
env.close()

In [None]:
# Persist the actor, critic, their target networks and both optimizer states.
ddpg.model_save('DDPG Pendulum-v1.pth')