In [2]:
from SheepDogEnv import SheepDogEnv
import torch
from torch import nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import rl_utils

In [3]:
def orthogonal_init(layer, gain=1.0):
    nn.init.orthogonal_(layer.weight, gain=gain)
    nn.init.constant_(layer.bias, 0)


class PolicyNetContinuous(torch.nn.Module):
    def __init__(self, state_dim, hidden_dim, action_dim, use_orthogonal_init=False):
        super(PolicyNetContinuous, self).__init__()
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc_mu = torch.nn.Linear(hidden_dim, action_dim)
        self.fc_std = torch.nn.Linear(hidden_dim, action_dim)
        if use_orthogonal_init:
            print("------use_orthogonal_init------")
            orthogonal_init(self.fc1)
            orthogonal_init(self.fc2)
            orthogonal_init(self.fc_mu, gain=0.01)

    def forward(self, x):
        x = torch.tanh(self.fc1(x))
        x = torch.tanh(self.fc2(x))
        # print(self.fc_mu(x))
        mu = 2*np.pi * torch.tanh(self.fc_mu(x))
        std = F.softplus(self.fc_std(x))
        return mu, std


class ValueNet(torch.nn.Module):
    def __init__(self, state_dim, hidden_dim, use_orthogonal_init=False):
        super(ValueNet, self).__init__()
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = torch.nn.Linear(hidden_dim, 1)
        if use_orthogonal_init:
            print("------use_orthogonal_init------")
            orthogonal_init(self.fc1)
            orthogonal_init(self.fc2)
            orthogonal_init(self.fc3)

    def forward(self, x):
        x = torch.tanh(self.fc1(x))
        x = torch.tanh(self.fc2(x))
        return self.fc3(x)


class PPOContinuous:
    ''' 处理连续动作的PPO算法 '''

    def __init__(self, state_dim, hidden_dim, action_dim, actor_lr, critic_lr,
                 lmbda, epochs, eps, gamma, device, use_orthogonal_init=False):
        self.actor = PolicyNetContinuous(state_dim, hidden_dim,
                                         action_dim, use_orthogonal_init).to(device)
        self.critic = ValueNet(state_dim, hidden_dim,
                               use_orthogonal_init).to(device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=actor_lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=critic_lr)
        self.gamma = gamma
        self.lmbda = lmbda
        self.epochs = epochs
        self.eps = eps
        self.device = device

    def take_action(self, state):
        state = torch.tensor(state, dtype=torch.float).to(self.device)
        mu, sigma = self.actor(state)
        print(mu, sigma)
        action_dist = torch.distributions.Normal(mu, sigma)
        action = action_dist.sample()
        return [action.item()]

    def update(self, transition_dict):
        states = torch.tensor(transition_dict['states'],
                              dtype=torch.float).to(self.device)
        actions = torch.tensor(transition_dict['actions'],
                               dtype=torch.float).view(-1, 1).to(self.device)
        rewards = torch.tensor(transition_dict['rewards'],
                               dtype=torch.float).view(-1, 1).to(self.device)
        next_states = torch.tensor(transition_dict['next_states'],
                                   dtype=torch.float).to(self.device)
        dones = torch.tensor(transition_dict['dones'],
                             dtype=torch.float).view(-1, 1).to(self.device)
        rewards = (rewards) / 8.0  # 和TRPO一样,对奖励进行修改,方便训练
        td_target = rewards + self.gamma * self.critic(next_states) * (1 -
                                                                       dones)
        # print("td_target:", td_target, self.critic(states))
        td_delta = td_target - self.critic(states)
        advantage = rl_utils.compute_advantage(self.gamma, self.lmbda,
                                               td_delta.cpu()).to(self.device)
        mu, std = self.actor(states)
        action_dists = torch.distributions.Normal(mu.detach(), std.detach())
        # 动作是正态分布
        old_log_probs = action_dists.log_prob(actions)

        for _ in range(self.epochs):
            mu, std = self.actor(states)
            action_dists = torch.distributions.Normal(mu, std)
            log_probs = action_dists.log_prob(actions)
            ratio = torch.exp(log_probs - old_log_probs)
            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio, 1 - self.eps, 1 + self.eps) * advantage
            actor_loss = torch.mean(-torch.min(surr1, surr2))
            critic_loss = torch.mean(
                F.mse_loss(self.critic(states), td_target.detach()))
            # print(actor_loss,critic_loss)
            self.actor_optimizer.zero_grad()
            self.critic_optimizer.zero_grad()
            actor_loss.backward()
            critic_loss.backward()
            self.actor_optimizer.step()
            self.critic_optimizer.step()


In [4]:
actor_lr = 1e-5
critic_lr = 5e-4
num_episodes = 2000
hidden_dim = 128
gamma = 0.9
lmbda = 0.9
epochs = 10
eps = 0.2
device = torch.device("cuda") if torch.cuda.is_available() else torch.device(
    "cpu")


env = SheepDogEnv(circle_R=350, sheep_v=70, dog_v=80,
                  sec_split_n=10, store_mode=True, render_mode=False)
torch.manual_seed(0)
state_dim = env._get_obs_array().shape[0]
action_dim = env.action_space.shape[0]  # 连续动作空间


In [6]:
transition_dict = {
    'states': [],
    'actions': [],
    'next_states': [],
    'rewards': [],
    'dones': []
}
agent = PPOContinuous(state_dim, hidden_dim, action_dim,
                      actor_lr, critic_lr,
                      lmbda, epochs, eps, gamma, device,
                      use_orthogonal_init=True)
for i in range(10):
    env.reset()
    _st = env._get_obs_array()
    _st[2]=(_st[2]-np.pi)/np.pi
    print(_st)
    act = agent.take_action(state=_st)[0]
    st, reward, done, _, _ = env.step(action=act)
    st[2]=(st[2]-np.pi)/np.pi
    transition_dict["states"].append(_st)
    transition_dict["actions"].append(act)
    transition_dict["next_states"].append(st)
    transition_dict["rewards"].append(reward)
    transition_dict["dones"].append(False)
    print(_st, act, reward, st)

agent.update(transition_dict=transition_dict)


------use_orthogonal_init------
------use_orthogonal_init------
[0.         0.         0.53476684]
tensor([-0.0058], device='cuda:0', grad_fn=<MulBackward0>) tensor([0.7132], device='cuda:0', grad_fn=<SoftplusBackward0>)
[0.         0.         0.53476684] -0.8195993304252625 -5.676776790374618 [7.         5.46358598 0.54204249]
[ 0.          0.         -0.16720806]
tensor([0.0018], device='cuda:0', grad_fn=<MulBackward0>) tensor([0.7157], device='cuda:0', grad_fn=<SoftplusBackward0>)
[ 0.          0.         -0.16720806] 0.5945245623588562 2.9635501095933137 [ 7.          0.59452456 -0.17448371]
[ 0.          0.         -0.99770946]
tensor([0.0108], device='cuda:0', grad_fn=<MulBackward0>) tensor([0.7189], device='cuda:0', grad_fn=<SoftplusBackward0>)
[ 0.          0.         -0.99770946] -0.5185834169387817 -6.116693278956518 [7.         5.76460189 0.99501489]
[ 0.          0.         -0.53937474]
tensor([0.0059], device='cuda:0', grad_fn=<MulBackward0>) tensor([0.7171], device='cuda:

In [18]:
actor_lr = 1e-4
critic_lr = 5e-3
num_episodes = 2000
hidden_dim = 128
gamma = 0.9
lmbda = 0.9
epochs = 1
eps = 0.2
device = torch.device("cuda") if torch.cuda.is_available() else torch.device(
    "cpu")

agent = PPOContinuous(state_dim, hidden_dim, action_dim,
                      actor_lr, critic_lr,
                      lmbda, epochs, eps, gamma, device,
                      use_orthogonal_init=True)
for ii in range(200):
    transition_dict = {
        'states': [],
        'actions': [],
        'next_states': [],
        'rewards': [],
        'dones': []
    }
    for i in range(10):
        env.reset()
        _st = env._get_obs_array()
        # act1 = (_st[2]+np.pi) % (2*np.pi)
        _st[2]=(_st[2]-np.pi)/np.pi
        # print(_st)
        act = agent.take_action(state=_st)[0]
        # st, reward, done, _, _ =  env.step(action=act1) if np.random.random(1)<0.5 else env.step(action=act)
        st, reward, done, _, _ =  env.step(action=act)
        st[2]=(st[2]-np.pi)/np.pi
        transition_dict["states"].append(_st)
        transition_dict["actions"].append(act)
        transition_dict["next_states"].append(st)
        transition_dict["rewards"].append(reward)
        transition_dict["dones"].append(True)
        # print(_st, act, reward, st)
    # print(np.mean(transition_dict['rewards']))
    agent.update(transition_dict)
    if(ii%50==0):
        test_agent(env,agent)



------use_orthogonal_init------
------use_orthogonal_init------
tensor([0.0018], device='cuda:0', grad_fn=<MulBackward0>) tensor([0.6691], device='cuda:0', grad_fn=<SoftplusBackward0>)
tensor([-0.0042], device='cuda:0', grad_fn=<MulBackward0>) tensor([0.6969], device='cuda:0', grad_fn=<SoftplusBackward0>)
tensor([-0.0023], device='cuda:0', grad_fn=<MulBackward0>) tensor([0.6879], device='cuda:0', grad_fn=<SoftplusBackward0>)
tensor([0.0043], device='cuda:0', grad_fn=<MulBackward0>) tensor([0.6578], device='cuda:0', grad_fn=<SoftplusBackward0>)
tensor([-0.0024], device='cuda:0', grad_fn=<MulBackward0>) tensor([0.6882], device='cuda:0', grad_fn=<SoftplusBackward0>)
tensor([-0.0030], device='cuda:0', grad_fn=<MulBackward0>) tensor([0.6911], device='cuda:0', grad_fn=<SoftplusBackward0>)
tensor([-0.0027], device='cuda:0', grad_fn=<MulBackward0>) tensor([0.6900], device='cuda:0', grad_fn=<SoftplusBackward0>)
tensor([-0.0006], device='cuda:0', grad_fn=<MulBackward0>) tensor([0.6802], device='

In [19]:
def test_agent(env,agent):
    test_dict = {
        'states': [],
        'actions': [],
        'next_states': [],
        'rewards': [],
        'dones': []
    }
    for i in range(100):
        env.reset()
        _st = env._get_obs_array()
        _st[3:6]=0
        _st[2]=(_st[2]-np.pi)/np.pi
        act = agent.take_action(state=_st)[0]
        st, reward, done, _, _ = env.step(action=act)
        st[3:6]=0
        st[2]=(st[2]-np.pi)/np.pi
        test_dict["states"].append(_st)
        test_dict["actions"].append(act)
        test_dict["next_states"].append(st)
        test_dict["rewards"].append(reward)
        test_dict["dones"].append(False)
        # print(_st, act, reward, st)
    print("test:",np.mean(test_dict['rewards']))
test_agent(env=env,agent=agent)

tensor([2.5535], device='cuda:0', grad_fn=<MulBackward0>) tensor([0.6720], device='cuda:0', grad_fn=<SoftplusBackward0>)
tensor([2.6699], device='cuda:0', grad_fn=<MulBackward0>) tensor([0.6727], device='cuda:0', grad_fn=<SoftplusBackward0>)
tensor([2.8604], device='cuda:0', grad_fn=<MulBackward0>) tensor([0.6738], device='cuda:0', grad_fn=<SoftplusBackward0>)
tensor([1.0420], device='cuda:0', grad_fn=<MulBackward0>) tensor([0.6631], device='cuda:0', grad_fn=<SoftplusBackward0>)
tensor([-2.5133], device='cuda:0', grad_fn=<MulBackward0>) tensor([0.6442], device='cuda:0', grad_fn=<SoftplusBackward0>)
tensor([-2.7271], device='cuda:0', grad_fn=<MulBackward0>) tensor([0.6432], device='cuda:0', grad_fn=<SoftplusBackward0>)
tensor([0.8977], device='cuda:0', grad_fn=<MulBackward0>) tensor([0.6622], device='cuda:0', grad_fn=<SoftplusBackward0>)
tensor([-1.4420], device='cuda:0', grad_fn=<MulBackward0>) tensor([0.6495], device='cuda:0', grad_fn=<SoftplusBackward0>)
tensor([-1.8564], device='cud

In [43]:
actor_lr = 1e-5
critic_lr = 5e-4
num_episodes = 2000
hidden_dim = 64
gamma = 0.9
lmbda = 0.9
epochs = 5
eps = 0.2
device = torch.device("cuda") if torch.cuda.is_available() else torch.device(
    "cpu")

agent = PPOContinuous(state_dim, hidden_dim, action_dim,
                      actor_lr, critic_lr,
                      lmbda, epochs, eps, gamma, device,
                      use_orthogonal_init=True)

------use_orthogonal_init------
------use_orthogonal_init------


In [363]:

env.reset()
_st = env._get_obs_array()
# _st[3:6]=0
_st[2]=(_st[2]-np.pi)/np.pi
act = agent.take_action(state=_st)[0]
# act=(_st[2]+np.pi)%(2*np.pi)
st, reward, done, _, info = env.step(action=act)
# print(_st)
print(act, reward)
# print(st)
print(info)
print("critic:\t_st_score:",agent.critic(torch.tensor(_st,dtype=torch.float).to(device)))
print("critic:\tst_score:",agent.critic(torch.tensor(st,dtype=torch.float).to(device)))

tensor([0.3158], device='cuda:0', grad_fn=<MulBackward0>) tensor([0.4872], device='cuda:0', grad_fn=<SoftplusBackward0>)
0.31879761815071106 0.09255172245929089
{'distance': array([350.09255172])}
critic:	_st_score: tensor([0.1605], device='cuda:0', grad_fn=<AddBackward0>)
critic:	st_score: tensor([0.1102], device='cuda:0', grad_fn=<AddBackward0>)


In [33]:
env = SheepDogEnv(circle_R=450, sheep_v=60, dog_v=60,
                  sec_split_n=10, store_mode=True, render_mode=False)

env.reset()
_st = env._get_obs_array()
_st[2]=(_st[2]-np.pi)/np.pi
act = agent.take_action(state=_st)[0]
st, reward, done, _, info = env.step(action=act)
print(act, reward)
print(info)
print("critic:\t_st_score:",agent.critic(torch.tensor(_st,dtype=torch.float).to(device)))
print("critic:\tst_score:",agent.critic(torch.tensor(st,dtype=torch.float).to(device)))

tensor([-2.5511], device='cuda:0', grad_fn=<MulBackward0>) tensor([0.4890], device='cuda:0', grad_fn=<SoftplusBackward0>)
-3.0257201194763184 4.561458604206848
{'distance': array([454.5614586])}
critic:	_st_score: tensor([0.6903], device='cuda:0', grad_fn=<AddBackward0>)
critic:	st_score: tensor([0.2774], device='cuda:0', grad_fn=<AddBackward0>)


In [20]:
torch.save(agent.actor.state_dict(),"arctor-1")
torch.save(agent.critic.state_dict(),"critic-1")

In [17]:
env = SheepDogEnv(circle_R=350, sheep_v=80, dog_v=80,
                  sec_split_n=10, store_mode=False, render_mode=False)

env.reset()
for i in range(2000):
    _st = env._get_obs_array()
    _st[2]=(_st[2]-np.pi)/np.pi
    action = agent.take_action(state=_st)[0]
    sheep_cur_theta = env.observation_space[1]
    if(sheep_cur_theta < np.pi/2 and action > sheep_cur_theta+np.pi*3/2):
        action -= 2*np.pi
    if(sheep_cur_theta > np.pi*3/2 and action < sheep_cur_theta-np.pi*3/2):
        action += 2*np.pi
    action = np.clip(
        action,
        sheep_cur_theta-np.pi/2,
        sheep_cur_theta+np.pi/2
    )
    print(action)
    action %= (2*np.pi)
    observation, reward, done, _, info = env.step(action)  # 和环境交互
    if done:
        env.save()
        break


tensor([-1.1243], device='cuda:0', grad_fn=<MulBackward0>) tensor([0.5612], device='cuda:0', grad_fn=<SoftplusBackward0>)
-1.159391164779663
tensor([-1.5893], device='cuda:0', grad_fn=<MulBackward0>) tensor([0.8395], device='cuda:0', grad_fn=<SoftplusBackward0>)
3.6140430609332483
tensor([-1.6975], device='cuda:0', grad_fn=<MulBackward0>) tensor([0.9413], device='cuda:0', grad_fn=<SoftplusBackward0>)
2.798122274871689
tensor([-1.7600], device='cuda:0', grad_fn=<MulBackward0>) tensor([0.9977], device='cuda:0', grad_fn=<SoftplusBackward0>)
2.196530119884251
tensor([-1.8083], device='cuda:0', grad_fn=<MulBackward0>) tensor([1.0370], device='cuda:0', grad_fn=<SoftplusBackward0>)
1.6815193442331835
tensor([-1.8469], device='cuda:0', grad_fn=<MulBackward0>) tensor([1.0654], device='cuda:0', grad_fn=<SoftplusBackward0>)
1.223854123826373
tensor([-1.8785], device='cuda:0', grad_fn=<MulBackward0>) tensor([1.0865], device='cuda:0', grad_fn=<SoftplusBackward0>)
0.80779434633101
tensor([-1.9302], 