## Problem 2. Reinforcement Learning for BipedalWalker-v3 (35 points)
This problem is to design a reinforcement learning algorithm that is applied to a robot, with the objective to maximize its reward, in the following game. 
### BipedalWalker-v3
Reward is given for moving forward and accumulates to over 300 points by the far end. If the robot falls, it is penalized by 100 points. Applying motor torque costs a small number of points. The state of the robot consists of hull angle speed, angular velocity, horizontal speed, vertical speed, positions of joints, angular speeds of joints, contact positions of legs with the ground, and 10 lidar rangefinder measurements. There are no coordinates in the state vector.  
![Alt Text](https://media.giphy.com/media/R89toZzap04ZDKHPkd/giphy.gif)  
This game has continuous action space. You are required to apply the Twin Delayed DDPG (TD3) method in this game.  
### References:
You can read [this link](https://spinningup.openai.com/en/latest/algorithms/td3.html) to understand the **TD3** algorithm better.  
You can visit [this link](https://dllglobal.com/challenges/reinforcement-learning) to understand the **BipedalWalker-v3** environment better.  
### Requirements:
* All of your code should be shown in this file.
* Your network must be based on GRU; otherwise, you will get 0 points.
* You must save your trained model named as **best_model.pt**.
* The RL method you need to implement is TD3; otherwise, you will get 0 points.
* Please give some comments to your code.

### 2.1 Import the packages and define helper functions and variables (5 points)

In [1]:
from collections import namedtuple
from itertools import count
import os, sys, random
import numpy as np
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal
from tensorboardX import SummaryWriter
import tqdm.notebook as tqdm
# Prefer the GPU when one is available; training on the CPU is very slow.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'


class Replay_buffer():
    """Fixed-capacity FIFO store of transitions with uniform random sampling.

    Each stored item is a 5-tuple ``(state, next_state, action, reward, done)``
    (the order in which :meth:`sample` unpacks it).
    """

    def __init__(self, max_size):
        """Create an empty buffer that holds at most ``max_size`` transitions."""
        self.storage = []
        self.max_size = max_size
        # Index of the oldest entry; only advances once the buffer is full.
        self.ptr = 0

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.storage)

    def push(self, data):
        """Store one transition, overwriting the oldest one when full."""
        if len(self.storage) == self.max_size:
            self.storage[int(self.ptr)] = data
            self.ptr = (self.ptr + 1) % self.max_size
        else:
            self.storage.append(data)

    def sample(self, batch_size):
        """Draw ``batch_size`` transitions uniformly at random, with replacement.

        Returns:
            A tuple ``(states, next_states, actions, rewards, dones)`` of
            NumPy arrays; ``rewards`` and ``dones`` are reshaped to
            ``(batch_size, 1)``.

        Raises:
            ValueError: if the buffer is empty.
        """
        if not self.storage:
            raise ValueError("cannot sample from an empty replay buffer")

        indices = np.random.randint(0, len(self.storage), size=batch_size)
        batch = [self.storage[i] for i in indices]

        # np.asarray avoids a copy when possible and, unlike
        # np.array(..., copy=False), does not raise on NumPy >= 2.0 when a
        # copy is unavoidable (e.g. for Python scalars such as the reward).
        states, next_states, actions, rewards, dones = (
            np.array([np.asarray(transition[field]) for transition in batch])
            for field in range(5)
        )

        return (
            states,
            next_states,
            actions,
            rewards.reshape(-1, 1),
            dones.reshape(-1, 1),
        )

### 2.2 Build your network (<font color=red>which should include GRU cells</font>) (7 points)

In [2]:
class Actor(nn.Module):
    """Policy network: a single-layer GRU followed by a three-layer MLP.

    The GRU's hidden size equals ``state_dim``; the MLP maps
    ``state_dim -> 400 -> 300 -> action_dim`` and the output is squashed with
    ``tanh`` and scaled by ``max_action``.
    """

    def __init__(self, state_dim, action_dim, max_action):
        super().__init__()

        # Submodule names and creation order are kept stable because saved
        # state_dicts are keyed by these attribute names.
        self.GRU_layer = nn.GRU(
            input_size=state_dim,
            hidden_size=state_dim,
            num_layers=1,
        )
        self.fc1 = nn.Linear(state_dim, 400)
        self.fc2 = nn.Linear(400, 300)
        self.fc3 = nn.Linear(300, action_dim)

        self.max_action = max_action

    def forward(self, state):
        """Map ``state`` to an action with components in ``[-max_action, max_action]``.

        A leading dimension of size 1 is added before the GRU and removed
        afterwards, so the GRU sees each input as a length-1 sequence.
        The GRU's final hidden state is kept on ``self.hidden``.
        """
        as_sequence = state.unsqueeze(0)
        gru_out, self.hidden = self.GRU_layer(as_sequence)
        features = gru_out.squeeze(0)

        hidden_1 = F.relu(self.fc1(features))
        hidden_2 = F.relu(self.fc2(hidden_1))
        squashed = torch.tanh(self.fc3(hidden_2))
        return squashed * self.max_action


class Critic(nn.Module):
    """Q-network: an MLP over the concatenated state and action.

    Layer sizes are ``state_dim + action_dim -> 400 -> 300 -> 1``.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()

        input_dim = state_dim + action_dim
        self.fc1 = nn.Linear(input_dim, 400)
        self.fc2 = nn.Linear(400, 300)
        self.fc3 = nn.Linear(300, 1)

    def forward(self, state, action):
        """Return one Q-value per row for the given state/action pairs.

        ``state`` and ``action`` are concatenated along dimension 1.
        """
        joint_input = torch.cat([state, action], 1)
        hidden_1 = F.relu(self.fc1(joint_input))
        hidden_2 = F.relu(self.fc2(hidden_1))
        return self.fc3(hidden_2)

### 2.3 Define your TD3 algorithm (8 points)

In [3]:
class TD3():
    """Twin Delayed DDPG agent: one actor, two critics, and a target copy of each.

    Besides its constructor arguments, the class reads these module-level
    names when its methods run: ``device``, ``capacity``, ``directory``,
    ``batch_size``, ``gamma``, ``tau``, ``policy_noise``, ``noise_clip`` and
    ``policy_delay``.
    """

    # Attribute names of the networks written by save() and read by load();
    # each is stored as ``directory + name + '.pth'``.
    _CHECKPOINT_NAMES = (
        'actor',
        'actor_target',
        'critic_1',
        'critic_1_target',
        'critic_2',
        'critic_2_target',
    )

    def __init__(self, state_dim, action_dim, max_action):
        """Build the six networks, their optimizers, the replay buffer and the logger."""
        self.actor = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target = Actor(state_dim, action_dim, max_action).to(device)
        self.critic_1 = Critic(state_dim, action_dim).to(device)
        self.critic_1_target = Critic(state_dim, action_dim).to(device)
        self.critic_2 = Critic(state_dim, action_dim).to(device)
        self.critic_2_target = Critic(state_dim, action_dim).to(device)

        # NOTE(review): the module-level `learning_rate` is not passed here, so
        # Adam runs with its library default step size -- confirm this is intended.
        self.actor_optimizer = optim.Adam(self.actor.parameters())
        self.critic_1_optimizer = optim.Adam(self.critic_1.parameters())
        self.critic_2_optimizer = optim.Adam(self.critic_2.parameters())

        # Target networks start as exact copies of their online counterparts.
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_1_target.load_state_dict(self.critic_1.state_dict())
        self.critic_2_target.load_state_dict(self.critic_2.state_dict())

        self.max_action = max_action
        self.memory = Replay_buffer(capacity)
        self.writer = SummaryWriter(directory)
        self.num_critic_update_iteration = 0
        self.num_actor_update_iteration = 0
        self.num_training = 0

    @staticmethod
    def _soft_update(source, target):
        """Blend ``source`` parameters into ``target``: target <- (1 - tau) * target + tau * source."""
        for param, target_param in zip(source.parameters(), target.parameters()):
            target_param.data.copy_(((1 - tau) * target_param.data) + tau * param.data)

    def select_action(self, state):
        """Return the actor's action for a single ``state`` as a flat NumPy array."""
        state = torch.tensor(state.reshape(1, -1)).float().to(device)
        return self.actor(state).cpu().data.numpy().flatten()

    def update(self, num_iteration):
        """Run ``num_iteration`` mini-batch updates sampled from the replay buffer.

        Both critics are updated on every mini-batch; the actor and all
        target networks are updated once every ``policy_delay`` mini-batches.
        Losses are written to the TensorBoard writer.
        """
        if self.num_training % 500 == 0:
            # Message text: "the model has been trained {} times".
            print("====================================")
            print("模型已经训练了{}次了".format(self.num_training))
            print("====================================")

        for i in range(num_iteration):
            x, y, u, r, d = self.memory.sample(batch_size)
            state = torch.FloatTensor(x).to(device)
            action = torch.FloatTensor(u).to(device)
            next_state = torch.FloatTensor(y).to(device)
            done = torch.FloatTensor(d).to(device)
            reward = torch.FloatTensor(r).to(device)

            # The bootstrap target is a constant for the critic losses, so it
            # is computed without building an autograd graph.
            with torch.no_grad():
                # Target policy smoothing: clipped Gaussian noise on the target action.
                noise = torch.ones_like(action).data.normal_(0, policy_noise).to(device)
                noise = noise.clamp(-noise_clip, noise_clip)
                next_action = self.actor_target(next_state) + noise
                next_action = next_action.clamp(-self.max_action, self.max_action)

                # Clipped double-Q: bootstrap from the smaller of the two target critics.
                target_Q1 = self.critic_1_target(next_state, next_action)
                target_Q2 = self.critic_2_target(next_state, next_action)
                target_Q = torch.min(target_Q1, target_Q2)
                target_Q = reward + (1 - done) * gamma * target_Q

            # Optimize critic 1.
            current_Q1 = self.critic_1(state, action)
            loss_Q1 = F.mse_loss(current_Q1, target_Q)
            self.critic_1_optimizer.zero_grad()
            loss_Q1.backward()
            self.critic_1_optimizer.step()
            self.writer.add_scalar('Loss/Q1_loss', loss_Q1.item(),
                                   global_step=self.num_critic_update_iteration)

            # Optimize critic 2.
            current_Q2 = self.critic_2(state, action)
            loss_Q2 = F.mse_loss(current_Q2, target_Q)
            self.critic_2_optimizer.zero_grad()
            loss_Q2.backward()
            self.critic_2_optimizer.step()
            self.writer.add_scalar('Loss/Q2_loss', loss_Q2.item(),
                                   global_step=self.num_critic_update_iteration)

            # Advance the critic step once per mini-batch so that every logged
            # loss gets its own global_step (previously this was incremented
            # only once per update() call, after the loop).
            self.num_critic_update_iteration += 1

            # Delayed policy update.
            if i % policy_delay == 0:
                actor_loss = -self.critic_1(state, self.actor(state)).mean()

                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()
                self.writer.add_scalar('Loss/actor_loss', actor_loss.item(),
                                       global_step=self.num_actor_update_iteration)

                self._soft_update(self.actor, self.actor_target)
                self._soft_update(self.critic_1, self.critic_1_target)
                self._soft_update(self.critic_2, self.critic_2_target)

                self.num_actor_update_iteration += 1

        self.num_training += 1

    def save(self):
        """Write the state_dict of every network to ``directory``."""
        # Make sure the target directory exists before writing into it.
        os.makedirs(directory, exist_ok=True)
        for name in self._CHECKPOINT_NAMES:
            torch.save(getattr(self, name).state_dict(), directory + name + '.pth')
        # Message text: "the model has been saved".
        print("====================================")
        print("模型已经保存了")
        print("====================================")

    def load(self):
        """Restore every network from the files written by :meth:`save`."""
        for name in self._CHECKPOINT_NAMES:
            # map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
            weights = torch.load(directory + name + '.pth', map_location=device)
            getattr(self, name).load_state_dict(weights)
        # Message text: "the model has been loaded".
        print("====================================")
        print("模型已经加载了")
        print("====================================")

### 2.4 Define your training process and train your model (5 points)  
You must use some data structures to collect the mean reward and mean loss in each iteration.

In [4]:
# ---- Run configuration and hyper-parameters ----
# Names marked "unused here" are not referenced by the code shown in this file.

# General
mode = 'train'                # unused here
env_name = 'BipedalWalker-v3'
seed = False                  # when true, seed env / torch / numpy with random_seed
random_seed = 9527            # random seed
load = False                  # load model (the call that used it is commented out)

# TD3 algorithm
gamma = 0.99                  # discount factor
tau = 0.005                   # target smoothing coefficient
policy_noise = 0.2
noise_clip = 0.5
policy_delay = 2
exploration_noise = 0.1
learning_rate = 3e-4          # learning rate (unused here)
target_update_interval = 1    # unused here

# Replay buffer and batching
capacity = 5000               # replay buffer size
batch_size = 100              # mini-batch size

# Training / evaluation schedule
num_iteration = 10000         # number of games
max_episode = 2000
test_iteration = 10           # number of test runs

# Optional parameters
num_hidden_layers = 2         # unused here
sample_frequency = 256        # unused here

# Rendering and logging
render = True                 # show UI or not (set to False to disable)
render_interval = 100         # env.render() is only called once i >= render_interval
log_interval = 100
print_log = 100





# Build the environment first; everything below is derived from it.
env = gym.make(env_name)

# Optional reproducibility: seed NumPy, PyTorch and the environment.
if seed:
    np.random.seed(random_seed)
    torch.manual_seed(random_seed)
    env.seed(random_seed)

# Output location used for TensorBoard logs and saved weights.
# NOTE(review): this evaluates to './exp<env_name>./' -- the trailing './'
# looks unintended; confirm before changing, since checkpoints live there.
directory = './exp' + env_name +'./'

# Problem dimensions taken from the environment's spaces.
action_dim = env.action_space.shape[0]
state_dim = env.observation_space.shape[0]
max_action = float(env.action_space.high[0])

# Small positive constant (min value); not referenced by the code shown here.
min_Val = torch.tensor(1e-7).float().to(device)

agent = TD3(state_dim, action_dim, max_action)

# Running reward/score of the episode currently being played.
total_reward = 0



# Training loop: play one episode per outer iteration, storing every
# transition in the replay buffer and running agent.update(10) when the
# episode ends.
print("====================================")
print("Collection Experience...")
print("====================================")
# NOTE(review): the `load` flag is currently ignored -- the call that would
# resume from saved weights (`if load: agent.load()`) is disabled.
for i in tqdm.tqdm(range(num_iteration)):
    state = env.reset()  # start a new episode

    # Cap the episode length at max_episode steps. The bound was previously
    # hard-coded as 2000 while the end-of-episode check used max_episode.
    for t in range(max_episode):
        # Deterministic policy action plus Gaussian exploration noise,
        # clipped to the environment's action bounds.
        action = agent.select_action(state)
        action = action + np.random.normal(0, exploration_noise, size=env.action_space.shape[0])
        action = action.clip(env.action_space.low, env.action_space.high)

        next_state, reward, done, info = env.step(action)
        total_reward += reward

        if render and i >= render_interval:
            env.render()

        # `np.float` was removed in NumPy 1.24; the builtin float is equivalent.
        agent.memory.push((state, next_state, action, reward, float(done)))

        state = next_state
        if done or t == max_episode - 1:
            agent.update(10)
            agent.writer.add_scalar('total_reward', total_reward, global_step=i)
            if i % print_log == 0:
                print("Epoch {}, the total_reward is \t{:0.2f}, the step is \t{}".format(i, total_reward, t))
            total_reward = 0
            break

    # Periodically checkpoint all networks.
    if i % log_interval == 0:
        agent.save()



Collection Experience...


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10000.0), HTML(value='')))

模型已经训练了0次了
Epoch 0, the total_reward is 	-93.66, the step is 	91
模型已经保存了
Epoch 100, the total_reward is 	-113.28, the step is 	92
模型已经保存了



KeyboardInterrupt: 

### 2.5 Show your change curves of reward and loss in two sub-pictures (5 points)
Your reward and loss must be able to converge; otherwise this part will get 0 point.

### 2.6 Test your model 10 times and print the mean reward (5 points) 
You should load your trained model **best_model.pt**. If your mean reward is >= 200 but < 250, you will get 3 points. If your mean reward is >= 250, you will get 5 points. If your mean reward is < 200, you will get 0 points.

In [None]:
# Evaluation: load the saved weights, play `test_iteration` episodes with the
# deterministic policy (no exploration noise) and report the mean reward.
agent.load()
episode_rewards = []
for i in tqdm.tqdm(range(test_iteration)):
    state = env.reset()
    # Start every test episode from zero so that a partial sum left over from
    # an interrupted training run cannot leak into the first result.
    total_reward = 0
    for t in count():
        action = agent.select_action(state)
        next_state, reward, done, info = env.step(np.float32(action))
        total_reward += reward
        env.render()
        if done or t == 2000:
            print("Ep_i \t{}, the total_reward is \t{:0.2f}, the step is \t{}".format(i, total_reward, t))
            episode_rewards.append(total_reward)
            total_reward = 0
            break
        state = next_state

# Report the mean over all test episodes, as this section requires.
if episode_rewards:
    print("Mean reward over {} test episodes: {:0.2f}".format(len(episode_rewards), np.mean(episode_rewards)))