In [None]:
import os

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import trange

In [None]:
class Map:
    """Random 2-D world with a start point, a goal point and point obstacles.

    Attributes set by the constructor:
        size        -- world extent, fixed at [100, 100].
        dsize       -- element-wise reciprocal of ``size`` (used to normalise coordinates).
        dagsize     -- reciprocal of the world diagonal length.
        point_size  -- contact radius (2).
        init_size   -- minimum clearance enforced between generated points (5 * point_size).
        drift       -- constant displacement vector of length 0.1 in a random direction.
        BEGIN, END  -- start and goal coordinates.
        blocks      -- (blocks_num, 2) array of obstacle coordinates.
        vd_blocks   -- flattened copy of ``blocks`` taken at construction time.
    """

    def __init__(self, blocks_num, random_seed) -> None:
        """Build a map with ``blocks_num`` obstacles generated from ``random_seed``."""
        self.size = np.array([100, 100])
        self.dsize = 1 / self.size
        self.dagsize = 1 / np.linalg.norm(self.size)
        self.random_seed = random_seed
        self.blocks_num = blocks_num
        self.point_size = 2
        self.init_size = self.point_size * 5
        self._random_map(random_seed)
        return

    def _clear_of_blocks(self, point, count=None) -> bool:
        """Return True when ``point`` is farther than ``init_size`` from every obstacle.

        Only the first ``count`` rows of ``self.blocks`` are checked when ``count``
        is given (used while the obstacle array is still being filled). With no
        obstacles to check the point is trivially clear.
        """
        placed = self.blocks if count is None else self.blocks[:count]
        if len(placed) == 0:
            return True
        return bool(np.min(np.linalg.norm(placed - point, axis=1)) > self.init_size)

    def _random_map(self, random_seed) -> None:
        """Generate drift, BEGIN, END and the obstacles.

        Note: this seeds NumPy's *global* random generator. Obstacles are placed
        by rejection sampling with no attempt limit, so a ``blocks_num`` that
        cannot fit with ``init_size`` spacing never terminates.
        """
        np.random.seed(random_seed)
        self.drift = np.random.rand(2)*2-1
        self.drift = self.drift/np.linalg.norm(self.drift) *0.1

        self.blocks = np.zeros((self.blocks_num, 2))
        self.BEGIN = np.random.rand(2) * self.size
        self.END = np.random.rand(2) * self.size
        # Keep the goal away from the start, matching the rule reRandom() applies.
        # Seeds whose first END draw is already far enough are unaffected.
        while np.linalg.norm(self.END - self.BEGIN) <= self.init_size:
            self.END = np.random.rand(2) * self.size

        placed = 0
        while placed < self.blocks_num:
            candidate = np.random.rand(2) * self.size
            near_begin = np.linalg.norm(candidate - self.BEGIN) <= self.init_size
            near_end = np.linalg.norm(candidate - self.END) <= self.init_size
            if near_begin or near_end:
                continue
            if not self._clear_of_blocks(candidate, placed):
                continue
            self.blocks[placed] = candidate
            placed += 1

        self.vd_blocks = self.blocks.flatten()
        return

    def reRandom(self):
        """Draw a new BEGIN and END for the existing obstacles.

        BEGIN must clear every obstacle by more than ``init_size``; END must
        additionally be more than ``init_size`` from the new BEGIN. Uses the
        global NumPy generator without reseeding. Works with zero obstacles.
        """
        while True:
            self.BEGIN = np.random.rand(2) * self.size
            if self._clear_of_blocks(self.BEGIN):
                break

        while True:
            self.END = np.random.rand(2) * self.size
            far_from_begin = np.linalg.norm(self.END - self.BEGIN) > self.init_size
            if far_from_begin and self._clear_of_blocks(self.END):
                break

        return

In [None]:
class Agent:
    """Point agent moving through a map under thrust, constant drift and random jitter.

    The ``map`` object must expose BEGIN, END, blocks, drift, size, dsize,
    dagsize and point_size.
    """

    # Class-level placeholders; reset() rebinds both on every instance.
    pos = np.zeros(2)
    vel = np.zeros(2)

    def __init__(self, Map) -> None:
        """Attach the agent to ``Map`` and place it at the map's BEGIN point."""
        self.map = Map
        self.reset()
        self.max_step = 500

    def reset(self) -> None:
        """Return to the map's BEGIN with zero velocity and cleared history."""
        start = self.map.BEGIN.copy()
        self.pos = start
        self.old_pos = start.copy()
        self.vel = np.zeros(2)
        self.old_vel = np.zeros(2)
        # Note: step() never increments this counter.
        self.steps = 0

    def step(self, act):
        """Advance one tick.

        ``act[0]`` sets the thrust (``act[0] + 1``) and ``act[1]`` the heading
        (``act[1] * pi`` radians).

        Returns ``(reward, arrive, min_dis)``: the summed reward, whether the
        goal is within ``point_size``, and the distance to the nearest obstacle.
        All three are measured at the position reached *before* any
        out-of-bounds rollback.
        """
        world = self.map

        # Remember the previous state (also used for the rollback below).
        self.old_pos = self.pos.copy()
        self.old_vel = self.vel.copy()

        # Heading and thrust.
        angle = act[1]*np.pi
        heading = np.array([np.cos(angle), np.sin(angle)])
        thrust = act[0] + 1
        self.vel = heading * thrust *0.2 + self.vel *0.8
        jitter = (np.random.rand(2) *2-1)*0.01
        self.pos += self.vel + jitter + world.drift

        # Goal terms: distance penalty plus a bonus of 10 inside the goal radius.
        goal_dist = np.linalg.norm(world.END - self.pos)
        arrive = bool(goal_dist <= world.point_size)
        progress_term = -goal_dist *world.dagsize
        effort_term = thrust *-0.001
        goal_term = 10 if arrive else 0

        # Obstacle terms: small clearance bonus, -10 on contact.
        min_dis = np.min(np.linalg.norm(world.blocks - self.pos, axis=1))
        safety_term = min_dis *world.dagsize *0.002
        if min_dis <= world.point_size:
            safety_term -= 10

        # Leaving the map costs 0.1 and undoes the move.
        if np.any(self.pos < 0) or np.any(self.pos > world.size):
            safety_term -= 0.1
            self.pos = self.old_pos.copy()
            self.vel = self.old_vel.copy()

        return progress_term+effort_term+goal_term+safety_term, arrive, min_dis

    def get_state(self):
        """Return the flat observation vector.

        Layout: obstacle coordinates scaled by ``dsize`` with their rows in a
        freshly shuffled order (global NumPy RNG), then drift, pos, vel,
        old_pos, old_vel and END -- everything except drift scaled by ``dsize``.
        """
        scale = self.map.dsize
        obstacles = self.map.blocks * scale
        np.random.shuffle(obstacles)
        parts = [
            obstacles.ravel(),
            self.map.drift,
            self.pos * scale,
            self.vel * scale,
            self.old_pos * scale,
            self.old_vel * scale,
            self.map.END * scale,
        ]
        return np.concatenate(parts)

In [None]:
class ReplayBuffer(object):
    """Fixed-capacity ring buffer of (state, action, next_state, reward, not_done) rows.

    Once ``max_size`` transitions have been added, new ones overwrite the oldest.
    """

    def __init__(self, state_dim, action_dim, max_size=int(1e6)):
        """Allocate storage for ``max_size`` transitions."""
        self.max_size = max_size
        self.ptr = 0    # index the next add() writes to
        self.size = 0   # number of valid rows, capped at max_size

        # sample() always hands out float32 tensors, so storing float32 yields the
        # same sampled values as float64 storage at half the memory.
        self.state = np.zeros((max_size, state_dim), dtype=np.float32)
        self.action = np.zeros((max_size, action_dim), dtype=np.float32)
        self.next_state = np.zeros((max_size, state_dim), dtype=np.float32)
        self.reward = np.zeros((max_size, 1), dtype=np.float32)
        self.not_done = np.zeros((max_size, 1), dtype=np.float32)

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def add(self, state, action, next_state, reward, done):
        """Store one transition at the write pointer; ``done`` is saved as ``1 - done``."""
        self.state[self.ptr] = state
        self.action[self.ptr] = action
        self.next_state[self.ptr] = next_state
        self.reward[self.ptr] = reward
        self.not_done[self.ptr] = 1. - done

        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size):
        """Draw ``batch_size`` stored transitions uniformly at random, with replacement.

        Returns a 5-tuple of float32 tensors on ``self.device``:
        ``(state, action, next_state, reward, not_done)``.

        Raises:
            ValueError: if nothing has been added yet.
        """
        if self.size == 0:
            raise ValueError("cannot sample from an empty ReplayBuffer")

        ind = np.random.randint(0, self.size, size=batch_size)

        def to_tensor(storage):
            # Fancy indexing copies, so the tensor never aliases the buffer.
            return torch.as_tensor(storage[ind], dtype=torch.float32, device=self.device)

        return (
            to_tensor(self.state),
            to_tensor(self.action),
            to_tensor(self.next_state),
            to_tensor(self.reward),
            to_tensor(self.not_done),
        )

In [None]:
class Actor_Net(nn.Module):
    """Policy network: maps a ``dim``-sized state to two action components in [-1, 1]."""

    def __init__(self, dim):
        super().__init__()
        hidden = 512
        self.liner1 = nn.Linear(dim, hidden)
        self.liner2 = nn.Linear(hidden, hidden)
        self.liner3 = nn.Linear(hidden, 2)

    def forward(self, x):
        """Two ReLU hidden layers followed by a tanh-squashed 2-unit output."""
        for hidden_layer in (self.liner1, self.liner2):
            x = F.relu(hidden_layer(x))
        return self.liner3(x).tanh()

In [None]:
class Critic_Net(nn.Module):
    """Q-network: scores a (state, action) pair with a single scalar.

    State and action are embedded separately (256 units each), concatenated
    along the feature axis, and passed through one 512-unit hidden layer.
    """

    def __init__(self, dim):
        super(Critic_Net, self).__init__()
        self.actliner = nn.Linear(2, 256)
        self.xliner = nn.Linear(dim, 256)
        self.liner = nn.Linear(512, 512)
        self.liner2 = nn.Linear(512, 1)

    def forward(self, x, act):
        """Return Q(x, act).

        ``x`` has ``dim`` features and ``act`` has 2 on the last axis; any
        leading (batch) axes must match. Output has a trailing axis of size 1.
        """
        act = F.relu(self.actliner(act))
        x = F.relu(self.xliner(x))
        # Concatenate on the last axis so unbatched 1-D inputs work as well as
        # (batch, features) inputs; for 2-D inputs this is identical to dim=1.
        z = torch.cat((x, act), dim=-1)
        z = F.relu(self.liner(z))
        return self.liner2(z)

In [None]:
# --- Environment: one training map (seed 42) and the agent that lives on it ---
blocks_num = 20
AMap = Map(blocks_num, 42)
agent = Agent(AMap)

# --- Hyper-parameters ---
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
dim = len(agent.get_state())    # observation length
learning_rate = 2e-4
wd = 1e-4                       # Adam weight decay


def _adam_for(net):
    """Adam optimiser over ``net``'s parameters with the shared lr / weight decay."""
    return torch.optim.Adam(net.parameters(), lr = learning_rate, weight_decay = wd)


# --- Actor and its target copy (names prefixed T_ are target networks) ---
actor_net = Actor_Net(dim).to(device)
T_actor_net = Actor_Net(dim).to(device)
T_actor_net.load_state_dict(actor_net.state_dict())
actor_net_optim = _adam_for(actor_net)

# --- Twin critics, each with its own target copy ---
critic_net_1 = Critic_Net(dim).to(device)
T_critic_net_1 = Critic_Net(dim).to(device)
T_critic_net_1.load_state_dict(critic_net_1.state_dict())
critic_net_1_optim = _adam_for(critic_net_1)

critic_net_2 = Critic_Net(dim).to(device)
T_critic_net_2 = Critic_Net(dim).to(device)
T_critic_net_2.load_state_dict(critic_net_2.state_dict())
critic_net_2_optim = _adam_for(critic_net_2)

# --- Replay buffer for 2-component actions ---
exp_pool = ReplayBuffer(dim, 2)

In [None]:
# NOTE(review): the commented-out plotly snippet below is stale. It reads
# AMap.blocks[:, 2], but Map allocates blocks as a (blocks_num, 2) array, so
# there is no third (z) column to plot.
# import plotly.graph_objects as go

# fig = go.Figure(data=[go.Scatter3d(x=AMap.blocks[:, 0],
#                 y=AMap.blocks[:, 1], z=AMap.blocks[:, 2], mode='markers')])
# fig.update_layout(scene=dict(xaxis_title='X', yaxis_title='Y', zaxis_title='Z'),
#                   title='3D Scatter Plot')
# fig.show()
# Show the current start and goal coordinates of AMap.
print(AMap.BEGIN, AMap.END)

In [None]:
# Pre-fill the replay buffer: 100 episodes of 100 uniformly random actions each.
for _episode in trange(100):
    agent.reset()
    for _tick in range(100):
        obs = agent.get_state()
        random_act = np.random.rand(2)*2-1      # both components uniform in [-1, 1)
        reward, arrive, _min_dis = agent.step(random_act)
        # The next observation is taken after the step, as the third argument.
        exp_pool.add(obs, random_act, agent.get_state(), reward, arrive)

In [None]:
gamma = 0.996  # discount factor used in the TD target



def soft_update(target, source, t):
    """Blend ``source``'s parameters into ``target`` in place.

    Each target parameter becomes ``(1 - t) * target + t * source``; ``t = 1``
    copies ``source`` outright. Parameters are paired by iteration order, so
    both modules must have the same architecture. ``source`` is not modified.
    """
    # torch.no_grad() instead of poking `.data`: the update stays out of the
    # autograd graph but is still visible to autograd's in-place safety checks.
    with torch.no_grad():
        for target_param, source_param in zip(target.parameters(), source.parameters()):
            target_param.copy_((1 - t) * target_param + t * source_param)

def train(update_actor = True, batch_size = 618, tau = 0.005):
    """Run one optimisation step on a batch sampled from ``exp_pool``.

    Both critics are always regressed onto a shared TD target built from the
    target actor and the minimum of the two target critics. When
    ``update_actor`` is true, the actor is additionally updated to maximise
    ``critic_net_1`` and all three target networks are soft-updated.

    Args:
        update_actor: also update the actor and the target networks.
        batch_size: number of transitions to sample.
        tau: interpolation factor passed to ``soft_update``.

    Returns:
        ``(td_error_1, td_error_2, q)`` when ``update_actor`` is true, otherwise
        ``(td_error_1, td_error_2)``. ``td_error_*`` are the critics' MSE losses
        and ``q`` is the actor loss (negated mean Q).
    """
    # The sampled not_done mask is discarded, so the target always bootstraps
    # from the next state.
    # NOTE(review): rollouts in this notebook keep stepping after `arrive`,
    # which may be why the mask is unused -- confirm this is intended.
    state1, act, state2, reward, _ = exp_pool.sample(batch_size)

    # Nothing in the target needs gradients, so build it without a graph.
    with torch.no_grad():
        target_act = T_actor_net(state2)
        # Target-policy smoothing: independent clipped Gaussian noise for every
        # sample and action component (a single shared 2-vector would perturb
        # the whole batch identically). The 0.8 / 0.2 mix keeps the result
        # inside [-1, 1].
        noise = torch.randn_like(target_act).clamp(-1, 1)
        pred_act = target_act*0.8 + noise *0.2

        pred_q1 = T_critic_net_1(state2, pred_act)
        pred_q2 = T_critic_net_2(state2, pred_act)

        # Taking the smaller of the two target critics for the bootstrap value.
        td_target = reward + gamma * torch.min(pred_q1, pred_q2)

    td_error_1 = F.mse_loss(critic_net_1(state1, act), td_target)
    td_error_2 = F.mse_loss(critic_net_2(state1, act), td_target)

    critic_net_1_optim.zero_grad()
    td_error_1.backward()
    #torch.nn.utils.clip_grad_value_(critic_net_1.parameters(), clip_value=1)
    critic_net_1_optim.step()

    critic_net_2_optim.zero_grad()
    td_error_2.backward()
    #torch.nn.utils.clip_grad_value_(critic_net_2.parameters(), clip_value=1)
    critic_net_2_optim.step()

    if update_actor:
        # Actor loss: negated mean Q of the actor's own actions under critic 1.
        # This backward pass also leaves gradients on critic_net_1; they are
        # cleared by its zero_grad() at the start of the next call.
        q = -critic_net_1(state1, actor_net(state1)).mean()

        actor_net_optim.zero_grad()
        q.backward()
        #torch.nn.utils.clip_grad_value_(actor_net.parameters(), clip_value=1)
        actor_net_optim.step()

        soft_update(T_actor_net, actor_net, tau)
        soft_update(T_critic_net_1, critic_net_1, tau)
        soft_update(T_critic_net_2, critic_net_2, tau)

        return td_error_1, td_error_2, q

    return td_error_1, td_error_2

# Smoke test: one full update (critics, actor, targets) on the warm-up data.
train()

In [None]:
import matplotlib.pyplot as plt

# One evaluation map, and one agent on it, per seed 11, 22, ..., 99.
maps = [Map(blocks_num, seed) for seed in range(11, 100, 11)]
agents = list(map(Agent, maps))

# Per-rollout statistics, appended to by test().
rewards, floss, first = [], [], []
min_d, min_d_average, vel_average = [], [], []

# Number of test rollouts run so far.
count = 0

In [None]:
def test(episode, agent):
    """Roll out the current actor on ``agent`` for ``agent.max_step`` steps.

    With probability 0.2 per step the action is blended 50/50 with a uniformly
    random action. Every transition is added to ``exp_pool``. One entry is
    appended to each of the module-level statistics lists (``rewards``,
    ``floss``, ``first``, ``min_d``, ``min_d_average``, ``vel_average``).

    When ``episode`` is a multiple of 2000, the path plot, the actor,
    ``critic_net_1`` and the statistics are also written to disk.

    Args:
        episode: training iteration number, used for logging and file names.
        agent: the Agent to roll out; it is reset first.
    """
    agent.reset()

    # Per-rollout accumulators.
    episode_reward = 0
    fsum = 0                            # summed thrust (act[0] + 1)
    first_arrive = agent.max_step       # first step index at which the goal was reached
    close_block = 1e8                   # smallest obstacle distance seen
    close = np.zeros(agent.max_step)    # obstacle distance at each step
    vel_con = np.zeros(agent.max_step)  # speed at each step
    # Visited positions, starting with the reset position.
    positions = [agent.pos.copy()]

    # leave=False: drop each rollout's finished bar instead of stacking one per call.
    for step in trange(agent.max_step, leave=False):
        state1 = agent.get_state()

        # Inference only, so skip building an autograd graph.
        with torch.no_grad():
            act = actor_net(torch.FloatTensor(state1).to(device))
        act = act.cpu().numpy()
        if np.random.random() < 0.2:
            act = act * 0.5 + (np.random.rand(2)*2-1) * 0.5
        reward, arrive, min_dis = agent.step(act)
        state2 = agent.get_state()
        exp_pool.add(state1, act, state2, reward, arrive)

        # Update the accumulators.
        episode_reward += reward
        fsum += act[0]+1
        if arrive: first_arrive = min(first_arrive, step)
        close_block = min(close_block, min_dis)
        close[step] = min_dis
        vel_con[step] = np.linalg.norm(agent.vel)
        # Record the position.
        positions.append(agent.pos.copy())

    positions = np.array(positions)

    # Append this rollout's statistics.
    rewards.append(episode_reward)
    floss.append(fsum)
    first.append(first_arrive)
    min_d.append(close_block)
    min_d_average.append(np.average(close))
    vel_average.append(np.average(vel_con))

    print('Test ----- episode = {:4d} Reward = {:5.3f}'.format(episode, episode_reward))

    if episode % 2000 == 0:
        # Make sure the output folders exist before writing into them.
        os.makedirs('./img', exist_ok=True)
        os.makedirs('./model', exist_ok=True)

        # Plot the agent's path.
        fig, ax = plt.subplots()
        ax.plot(positions[:, 0], positions[:, 1], label='Agent Path')
        ax.scatter(agent.map.BEGIN[0], agent.map.BEGIN[1], color='b', label='Begin', s=20)
        ax.scatter(agent.map.END[0], agent.map.END[1], color='r', label='Goal', s=20)
        ax.scatter(agent.map.blocks[:, 0], agent.map.blocks[:, 1], color='k', label='Obstacle', s=10)
        ax.set_xlabel('X')
        ax.set_ylabel('Y')
        ax.legend()
        ax.set_title('Test Episode {:4d}, Reward {:.3f}'.format(episode, episode_reward))

        fig.savefig(f'./img/pic-{episode}.png')
        # The figure is only needed on disk; close it so figures do not pile up
        # in memory over the run.
        plt.close(fig)

        torch.save(actor_net, f'./model/actor_net{episode}.pt')
        torch.save(critic_net_1, f'./model/critic_net{episode}.pt')
        np.save('rewards', rewards)
        np.save('floss', floss)
        np.save('first', first)
        np.save('min_d', min_d)
        np.save('min_d_avg', min_d_average)
        np.save('vel_avg', vel_average)


# Main loop: five critic-only updates, then one update that also trains the
# actor and soft-updates the targets. Every 20 iterations, roll out the next
# evaluation agent in round-robin order and then re-draw its start and goal.
for i in range(160001):
    for _ in range(5):
        train(False)
    train(True)
    if (i % 20 == 0):
        count += 1
        # Cycle by the actual number of agents rather than a hard-coded 9.
        test_agent = agents[count % len(agents)]
        test(i, test_agent)
        test_agent.map.reRandom()