# Twin Delayed Deep Deterministic Policy Gradient (TD3)

## Packages install


In [1]:
!pip install pybullet

Collecting pybullet
[?25l  Downloading https://files.pythonhosted.org/packages/91/39/c56526c130f092d0123c471c1a749edf45cb74e97b4cdf6a5230a0ce4054/pybullet-3.0.8-cp36-cp36m-manylinux1_x86_64.whl (76.6MB)
[K     |████████████████████████████████| 76.6MB 38kB/s 
[?25hInstalling collected packages: pybullet
Successfully installed pybullet-3.0.8


## Import the libraries

In [2]:
import os
import time
import random
import numpy as np
import matplotlib.pyplot as plt
import pybullet_envs
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
from gym import wrappers
from torch.autograd import Variable
from collections import deque

## 1. Experience replay memory initialization

In [3]:
class ReplayBuffer(object):
  """Fixed-capacity store of transitions that overwrites the oldest entry once full.

  Each transition is a tuple (state, next_state, action, reward, done).

  Args:
    max_size: maximum number of transitions kept. Floats such as 1e6 are accepted
      and converted to int.

  Raises:
    ValueError: if max_size is smaller than 1.
  """
  def __init__(self, max_size = 1e6):
    self.storage = []
    # Stored as int so it can be compared with len() and used in index arithmetic
    self.max_size = int(max_size)
    if self.max_size < 1:
      raise ValueError("max_size must be at least 1")
    # Index of the oldest transition, i.e. the next slot to overwrite once the buffer is full
    self.ptr = 0

  def add(self, transition):
    """Store one transition, replacing the oldest one when the buffer is at capacity."""
    if len(self.storage) >= self.max_size:
      # Assignment (not comparison): actually overwrite the oldest slot
      self.storage[self.ptr] = transition
      self.ptr = (self.ptr + 1) % self.max_size
    else:
      self.storage.append(transition)

  def sample(self, batch_size):
    """Draw batch_size transitions uniformly at random, with replacement.

    Returns:
      Tuple of numpy arrays (states, next_states, actions, rewards, dones); rewards
      and dones are reshaped to a single column.
    """
    ind = np.random.randint(0, len(self.storage), size = batch_size)
    batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = [], [], [], [], []
    for i in ind:
      state, next_state, action, reward, done = self.storage[i]
      # np.asarray avoids a copy when possible and, unlike np.array(..., copy=False),
      # still accepts Python scalars on NumPy 2.x
      batch_states.append(np.asarray(state))
      batch_next_states.append(np.asarray(next_state))
      batch_actions.append(np.asarray(action))
      batch_rewards.append(np.asarray(reward))
      batch_dones.append(np.asarray(done))
    return np.array(batch_states), np.array(batch_next_states), np.array(batch_actions), np.array(batch_rewards).reshape(-1,1), np.array(batch_dones).reshape(-1,1)
    


## 2. Build one neural network for the actor model and one for the actor target

In [4]:
class Actor(nn.Module):
  """Policy network mapping a batch of states to actions scaled by max_action.

  Three fully connected layers (state_dim -> 400 -> 300 -> action_dim) with ReLU
  after the first two and tanh on the output.
  """
  def __init__(self, state_dim, action_dim, max_action):
    super().__init__()
    self.layer_1 = nn.Linear(state_dim, 400)
    self.layer_2 = nn.Linear(400, 300)
    self.layer_3 = nn.Linear(300, action_dim)
    self.max_action = max_action

  def forward(self, x):
    """Return max_action * tanh(layer_3(...)) for the given states."""
    hidden = torch.relu(self.layer_1(x))
    hidden = torch.relu(self.layer_2(hidden))
    squashed = torch.tanh(self.layer_3(hidden))
    return self.max_action * squashed
    

## 3. Build two neural networks for the twin critic models and two for the twin critic targets

In [5]:
class Critic(nn.Module):
  """Twin Q-networks evaluated on the same (state, action) input.

  layer_1..layer_3 form the first critic and layer_4..layer_6 the second; both
  map state_dim + action_dim features to a single value through 400 and 300
  hidden units.
  """
  def __init__(self, state_dim, action_dim):
    super().__init__()
    joint_dim = state_dim + action_dim

    # First twin
    self.layer_1 = nn.Linear(joint_dim, 400)
    self.layer_2 = nn.Linear(400, 300)
    self.layer_3 = nn.Linear(300, 1)

    # Second twin
    self.layer_4 = nn.Linear(joint_dim, 400)
    self.layer_5 = nn.Linear(400, 300)
    self.layer_6 = nn.Linear(300, 1)

  def forward(self, x, u):
    """Return the pair (Q1, Q2) for states x and actions u."""
    # Join state and action features side by side (along dimension 1)
    joint = torch.cat((x, u), dim=1)

    q_first = self.layer_3(torch.relu(self.layer_2(torch.relu(self.layer_1(joint)))))
    q_second = self.layer_6(torch.relu(self.layer_5(torch.relu(self.layer_4(joint)))))

    return q_first, q_second

## 4 - 15. Training

In [6]:
#Select GPU or CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#Build training process in one class

class TD3(object):
  """TD3 agent: an actor and a twin critic, each paired with a slowly updated target copy.

  Args:
    state_dim: size of the state vector fed to the actor and the critics.
    action_dim: size of the action vector produced by the actor.
    max_action: bound used to scale the actor output and to clamp target actions.
  """
  def __init__(self, state_dim, action_dim, max_action):
    self.actor = Actor(state_dim, action_dim, max_action).to(device)
    self.actor_target = Actor(state_dim, action_dim, max_action).to(device)
    # Targets start as exact copies of the trained networks
    self.actor_target.load_state_dict(self.actor.state_dict())
    self.actor_optimizer = torch.optim.Adam(self.actor.parameters())
    self.critic = Critic(state_dim, action_dim).to(device)
    self.critic_target = Critic(state_dim, action_dim).to(device)
    self.critic_target.load_state_dict(self.critic.state_dict())
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters())
    self.max_action = max_action

  def select_action(self, state):
    """Return the actor's action for a single state as a flat numpy array."""
    state = torch.Tensor(state.reshape(1, -1)).to(device)
    # Inference only, so no autograd graph is needed
    with torch.no_grad():
      return self.actor(state).cpu().numpy().flatten()

  def _soft_update(self, source, target, tau):
    """Polyak-average the parameters of source into target: target <- tau*source + (1-tau)*target."""
    for param, target_param in zip(source.parameters(), target.parameters()):
      target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

  def train(self, replay_buffer, iterations, batch_size = 100, discount = 0.99, tau = 0.005, policy_noise = 0.2, noise_clipping = 0.5, policy_freq = 2):
    """Run `iterations` critic updates, updating the actor and the targets every `policy_freq` of them.

    Args:
      replay_buffer: object whose sample(batch_size) returns
        (states, next_states, actions, rewards, dones) arrays.
      iterations: number of mini-batch updates to perform.
      batch_size: number of transitions per mini-batch.
      discount: discount factor applied to the target Q-value.
      tau: Polyak averaging rate for the target networks.
      policy_noise: standard deviation of the Gaussian noise added to target actions.
      noise_clipping: absolute bound applied to that noise.
      policy_freq: the actor and the target networks are updated once every
        policy_freq iterations.
    """
    for it in range(iterations):
      #Sample a mini-batch of transitions from the replay memory
      batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = replay_buffer.sample(batch_size)
      state = torch.Tensor(batch_states).to(device)
      next_state = torch.Tensor(batch_next_states).to(device)
      action = torch.Tensor(batch_actions).to(device)
      reward = torch.Tensor(batch_rewards).to(device)
      done = torch.Tensor(batch_dones).to(device)
      #The target is a constant for the critic loss, so build it without tracking gradients
      with torch.no_grad():
        #Clipped Gaussian noise, sampled directly on the device the actions live on
        noise = torch.empty_like(action).normal_(0, policy_noise).clamp(-noise_clipping, noise_clipping)
        #Target actor proposes the next action; noise is added and the result clamped to the valid range
        next_action = (self.actor_target(next_state) + noise).clamp(-self.max_action, self.max_action)
        #Both target critics score (next_state, next_action); keep the smaller estimate
        target_Q1, target_Q2 = self.critic_target(next_state, next_action)
        target_Q = torch.min(target_Q1, target_Q2)
        #Q target = r + discount * min(Q1', Q2'), with the bootstrap term dropped when done == 1
        target_Q = reward + (1 - done) * discount * target_Q
      #Current estimates of both critics for the sampled (state, action) pairs
      current_Q1, current_Q2 = self.critic(state, action)
      #Critic loss = MSE(Q1(s, a), target) + MSE(Q2(s, a), target)
      critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)
      #Backpropagate the critic loss and take one optimizer step
      self.critic_optimizer.zero_grad()
      critic_loss.backward()
      self.critic_optimizer.step()
      #Delayed update: every policy_freq iterations, update the actor and both target networks
      if it % policy_freq == 0:
        #Gradient ascent on the first critic's output, written as descent on its negative
        actor_loss = - self.critic(state, self.actor(state))[0].mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        #Polyak-average the trained weights into the targets
        self._soft_update(self.actor, self.actor_target, tau)
        self._soft_update(self.critic, self.critic_target, tau)

  def save(self, filename, directory):
    """Write the actor and critic weights to <directory>/<filename>_actor.pth and _critic.pth."""
    torch.save(self.actor.state_dict(), "%s/%s_actor.pth" % (directory, filename))
    torch.save(self.critic.state_dict(), "%s/%s_critic.pth" % (directory, filename))

  def load(self, filename, directory):
    """Load actor and critic weights saved by save() and re-synchronise the target networks."""
    # map_location lets a checkpoint written on a GPU be loaded on a CPU-only machine
    self.actor.load_state_dict(torch.load("%s/%s_actor.pth" % (directory, filename), map_location=device))
    self.critic.load_state_dict(torch.load("%s/%s_critic.pth" % (directory, filename), map_location=device))
    # Only the trained networks are saved, so the targets are refreshed from them
    self.actor_target.load_state_dict(self.actor.state_dict())
    self.critic_target.load_state_dict(self.critic.state_dict())





## Implement a function that evaluates the policy by computing the mean reward over 10 episodes

In [7]:
def evaluate_policy(policy, eval_episodes=10, eval_env=None):
  """Run the policy without exploration noise and return the mean episode reward.

  Args:
    policy: object exposing select_action(state) -> action.
    eval_episodes: number of complete episodes to run.
    eval_env: environment with reset() and a step(action) that returns
      (obs, reward, done, info). Defaults to the notebook-level `env`.

  Returns:
    Sum of all rewards collected divided by eval_episodes.
  """
  # Fall back to the notebook's global environment so existing calls keep working
  if eval_env is None:
    eval_env = env
  total_reward = 0.
  for _ in range(eval_episodes):
    obs = eval_env.reset()
    done = False
    while not done:
      action = policy.select_action(np.array(obs))
      obs, reward, done, _ = eval_env.step(action)
      total_reward += reward
  avg_reward = total_reward / eval_episodes
  print("----------------------------------------")
  print("Mean reward in the evaluation step: %f" % (avg_reward))
  print("----------------------------------------")
  return avg_reward
  

## Configuration of parameters

In [8]:
env_name = "AntBulletEnv-v0" # Name of the Gym environment to create
seed = 0 # Random seed value
start_timesteps = 1e4 # Number of initial timesteps during which actions are sampled at random instead of from the policy
eval_freq = 5e3 # Evaluate the policy once at least this many timesteps have passed since the last evaluation
max_timesteps = 5e5 # Total number of environment timesteps to train for
save_models = True # Whether the trained model should be saved to disk
expl_noise = 0.1 # Standard deviation of the Gaussian exploration noise added to the policy's actions
batch_size = 100 # Number of transitions per training mini-batch
discount = 0.99 # Discount factor (gamma) used when computing the target Q-value
tau = 0.005 # Polyak averaging rate used to update the target networks
policy_noise = 0.2 # Standard deviation of the Gaussian noise added to the target actor's actions during training
noise_clip = 0.5 # Absolute bound applied to that Gaussian noise
policy_freq = 2 # Number of training iterations between updates of the actor and target networks

## Create a file name for the two saved models: Actor and Critic

In [9]:
# Base name shared by the saved model files and the results file
file_name = "%s_%s_%s" % ("TD3", env_name, seed)
# Print the configuration banner in a single call, one item per line
print(
  "----------------------------------------",
  "Configuration: %s" % (file_name),
  "----------------------------------------",
  sep="\n",
)

----------------------------------------
Configuration: TD3_AntBulletEnv-v0_0
----------------------------------------


## Create the folders where the results and the trained models will be saved

In [10]:
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race
os.makedirs("./results", exist_ok=True)
# The model folder is only needed when models are going to be saved
if save_models:
  os.makedirs("./pytorch_models", exist_ok=True)

## Create an environment of PyBullet

In [11]:
# Create the environment named by env_name (presumably registered with gym by the pybullet_envs import -- confirm)
env = gym.make(env_name)



## Fix the seeds and obtain the state and action information of the selected environment

In [12]:
# Seed every random source used by the notebook so runs are repeatable
env.seed(seed)
# The action space is seeded too, since the warm-up phase draws actions with env.action_space.sample()
env.action_space.seed(seed)
random.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)
# Dimensions and bound needed to build the actor and critic networks
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
# Only the first component of the upper bound is read -- assumes every action dimension shares it
max_action = float(env.action_space.high[0])


## Create a policy net (model actor)

In [13]:
# Build the TD3 agent from the environment's state/action dimensions and action bound
policy = TD3(state_dim, action_dim, max_action)

## Create the experience replay memory

In [14]:
# Replay memory with the default capacity (max_size = 1e6)
replay_buffer = ReplayBuffer()

## Define a list where the evaluation results (mean reward over 10 episodes) will be stored

In [15]:
# First entry is the score of the still-untrained policy; later evaluations are appended during training
evaluations = [evaluate_policy(policy)]

----------------------------------------
Mean reward in the evaluation step: 9.804960
----------------------------------------


## Create the folders used to store the final results (videos of the agent)

In [16]:
def mkdir(base, name):
  """Return os.path.join(base, name), creating that directory first if the path does not exist."""
  target = os.path.join(base, name)
  # Nothing to create when the path is already present
  if os.path.exists(target):
    return target
  os.makedirs(target)
  return target
# Output folders: exp/brs and, inside it, the folder handed to the Monitor wrapper
work_dir = mkdir('exp', 'brs')
monitor_dir = mkdir(work_dir, 'monitor')
# Read the episode step limit before the environment is optionally wrapped below
max_episode_steps = env._max_episode_steps
# Set to True to wrap the environment in a Monitor that writes into monitor_dir
save_env_vid = False
if save_env_vid:
  # NOTE(review): force = True presumably allows existing contents of monitor_dir to be overwritten -- confirm in the gym docs
  env = wrappers.Monitor(env, monitor_dir, force = True)
  env.reset()

## Initialize variables

In [17]:
total_timesteps = 0 # environment steps taken so far, across all episodes
timesteps_since_eval = 0 # environment steps taken since the last evaluation
episode_num = 0 # number of episodes started so far
done = True # starts as True so the first training-loop iteration resets the environment and the per-episode counters
t0 = time.time() # wall-clock timestamp taken before training starts

## Training

In [18]:
#Main training loop: runs until max_timesteps environment steps have been taken
while total_timesteps < max_timesteps:

  #If the episode has ended
  if done:

    #Unless this is the very first iteration, train the model once per step of the episode just finished
    if total_timesteps != 0:
      print("Total Timesteps: {} Episode num: {} Reward: {}".format(total_timesteps, episode_num, episode_reward))
      policy.train(replay_buffer, episode_timesteps, batch_size, discount, tau, policy_noise, noise_clip, policy_freq)

    #Evaluate the policy and store the results once enough timesteps have passed since the last evaluation
    if timesteps_since_eval >= eval_freq:
      timesteps_since_eval %= eval_freq
      evaluations.append(evaluate_policy(policy))
      #Only write model files when saving is enabled (the model folder is created only in that case)
      if save_models: policy.save(file_name, directory = "./pytorch_models")
      np.save("./results/%s" % (file_name), evaluations)

    #Start a new episode
    obs = env.reset()

    #Mark the new episode as not finished
    done = False

    #Reset the per-episode reward and step counter
    episode_reward = 0
    episode_timesteps = 0
    episode_num += 1

  #During the first start_timesteps steps, act at random
  if total_timesteps < start_timesteps:
    action = env.action_space.sample()
  #Afterwards, act with the policy
  else:
    action = policy.select_action(np.array(obs))
    #If exploration noise is enabled, add it to the action and clip to the valid action range
    if expl_noise != 0:
      action = (action + np.random.normal(0, expl_noise, size=env.action_space.shape[0])).clip(env.action_space.low, env.action_space.high)
  
  #The agent executes the action and observes the next state and the reward
  new_obs, reward, done,_ = env.step(action)

  #Store done as 0 when the episode ended only because the step limit was reached, so the
  #critic still bootstraps from the next state. The limit captured earlier in max_episode_steps
  #is used instead of the private env._max_episode_steps, which may not be reachable once env is wrapped.
  done_bool = 0 if episode_timesteps + 1 == max_episode_steps else float(done)

  #Accumulate the episode reward
  episode_reward += reward

  #Store the new transition in the replay buffer
  replay_buffer.add((obs, new_obs, action, reward, done_bool))

  #Advance the state and the step counters
  obs = new_obs
  episode_timesteps += 1
  total_timesteps += 1
  timesteps_since_eval += 1

#Append a final evaluation and save the model and the evaluation history
evaluations.append(evaluate_policy(policy))
if save_models: policy.save("%s" % (file_name), directory="./pytorch_models")
np.save("./results/%s" % (file_name), evaluations)

Total Timesteps: 1000 Episode num: 1 Reward: 479.01462369570976
Total Timesteps: 2000 Episode num: 2 Reward: 511.8013945921352
Total Timesteps: 3000 Episode num: 3 Reward: 342.17752612482985
Total Timesteps: 3636 Episode num: 4 Reward: 278.35681274317875
Total Timesteps: 3904 Episode num: 5 Reward: 128.59998217105306
Total Timesteps: 4904 Episode num: 6 Reward: 529.640584528355
Total Timesteps: 5904 Episode num: 7 Reward: 480.20927893245386
----------------------------------------
Mean reward in the evaluation step: 100.385933
----------------------------------------
Total Timesteps: 6904 Episode num: 8 Reward: 481.91313816534637
Total Timesteps: 7353 Episode num: 9 Reward: 211.21894645876594
Total Timesteps: 8353 Episode num: 10 Reward: 305.81699516075116
Total Timesteps: 8550 Episode num: 11 Reward: 90.90798523643461
Total Timesteps: 9550 Episode num: 12 Reward: 515.0580299115522
Total Timesteps: 10550 Episode num: 13 Reward: 481.93879527458245
---------------------------------------

## Inference

In [21]:
class Actor(nn.Module):
  """Policy network mapping a batch of states to actions scaled by max_action.

  Three fully connected layers (state_dim -> 400 -> 300 -> action_dim) with ReLU
  after the first two and tanh on the output.
  """
  def __init__(self, state_dim, action_dim, max_action):
    super().__init__()
    self.layer_1 = nn.Linear(state_dim, 400)
    self.layer_2 = nn.Linear(400, 300)
    self.layer_3 = nn.Linear(300, action_dim)
    self.max_action = max_action

  def forward(self, x):
    """Return max_action * tanh(layer_3(...)) for the given states."""
    hidden = torch.relu(self.layer_1(x))
    hidden = torch.relu(self.layer_2(hidden))
    squashed = torch.tanh(self.layer_3(hidden))
    return self.max_action * squashed

class Critic(nn.Module):
  """Twin Q-networks evaluated on the same (state, action) input.

  layer_1..layer_3 form the first critic and layer_4..layer_6 the second; both
  map state_dim + action_dim features to a single value through 400 and 300
  hidden units.
  """
  def __init__(self, state_dim, action_dim):
    super().__init__()
    joint_dim = state_dim + action_dim

    # First twin
    self.layer_1 = nn.Linear(joint_dim, 400)
    self.layer_2 = nn.Linear(400, 300)
    self.layer_3 = nn.Linear(300, 1)

    # Second twin
    self.layer_4 = nn.Linear(joint_dim, 400)
    self.layer_5 = nn.Linear(400, 300)
    self.layer_6 = nn.Linear(300, 1)

  def forward(self, x, u):
    """Return the pair (Q1, Q2) for states x and actions u."""
    # Join state and action features side by side (along dimension 1)
    joint = torch.cat((x, u), dim=1)

    q_first = self.layer_3(torch.relu(self.layer_2(torch.relu(self.layer_1(joint)))))
    q_second = self.layer_6(torch.relu(self.layer_5(torch.relu(self.layer_4(joint)))))

    return q_first, q_second

#Select GPU or CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#Build training process in one class

class TD3(object):
  """TD3 agent: an actor and a twin critic, each paired with a slowly updated target copy.

  Args:
    state_dim: size of the state vector fed to the actor and the critics.
    action_dim: size of the action vector produced by the actor.
    max_action: bound used to scale the actor output and to clamp target actions.
  """
  def __init__(self, state_dim, action_dim, max_action):
    self.actor = Actor(state_dim, action_dim, max_action).to(device)
    self.actor_target = Actor(state_dim, action_dim, max_action).to(device)
    # Targets start as exact copies of the trained networks
    self.actor_target.load_state_dict(self.actor.state_dict())
    self.actor_optimizer = torch.optim.Adam(self.actor.parameters())
    self.critic = Critic(state_dim, action_dim).to(device)
    self.critic_target = Critic(state_dim, action_dim).to(device)
    self.critic_target.load_state_dict(self.critic.state_dict())
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters())
    self.max_action = max_action

  def select_action(self, state):
    """Return the actor's action for a single state as a flat numpy array."""
    state = torch.Tensor(state.reshape(1, -1)).to(device)
    # Inference only, so no autograd graph is needed
    with torch.no_grad():
      return self.actor(state).cpu().numpy().flatten()

  def _soft_update(self, source, target, tau):
    """Polyak-average the parameters of source into target: target <- tau*source + (1-tau)*target."""
    for param, target_param in zip(source.parameters(), target.parameters()):
      target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

  def train(self, replay_buffer, iterations, batch_size = 100, discount = 0.99, tau = 0.005, policy_noise = 0.2, noise_clipping = 0.5, policy_freq = 2, bach_size = None):
    """Run `iterations` critic updates, updating the actor and the targets every `policy_freq` of them.

    Args:
      replay_buffer: object whose sample(batch_size) returns
        (states, next_states, actions, rewards, dones) arrays.
      iterations: number of mini-batch updates to perform.
      batch_size: number of transitions per mini-batch.
      discount: discount factor applied to the target Q-value.
      tau: Polyak averaging rate for the target networks.
      policy_noise: standard deviation of the Gaussian noise added to target actions.
      noise_clipping: absolute bound applied to that noise.
      policy_freq: the actor and the target networks are updated once every
        policy_freq iterations.
      bach_size: misspelled former name of batch_size, still accepted as a keyword;
        when given it overrides batch_size.
    """
    # Honour the old misspelled keyword so existing keyword callers keep working
    if bach_size is not None:
      batch_size = bach_size
    for it in range(iterations):
      #Sample a mini-batch of transitions from the replay memory instance passed in
      batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = replay_buffer.sample(batch_size)
      state = torch.Tensor(batch_states).to(device)
      next_state = torch.Tensor(batch_next_states).to(device)
      action = torch.Tensor(batch_actions).to(device)
      reward = torch.Tensor(batch_rewards).to(device)
      done = torch.Tensor(batch_dones).to(device)
      #The target is a constant for the critic loss, so build it without tracking gradients
      with torch.no_grad():
        #Clipped Gaussian noise, sampled directly on the device the actions live on
        noise = torch.empty_like(action).normal_(0, policy_noise).clamp(-noise_clipping, noise_clipping)
        #Target actor proposes the next action; noise is added and the result clamped to the valid range
        next_action = (self.actor_target(next_state) + noise).clamp(-self.max_action, self.max_action)
        #Both target critics score (next_state, next_action); the critic returns a tuple, so it is unpacked directly
        target_Q1, target_Q2 = self.critic_target(next_state, next_action)
        target_Q = torch.min(target_Q1, target_Q2)
        #Q target = r + discount * min(Q1', Q2'), with the bootstrap term dropped when done == 1
        target_Q = reward + (1 - done) * discount * target_Q
      #Current estimates of both critics for the sampled (state, action) pairs
      current_Q1, current_Q2 = self.critic(state, action)
      #Critic loss = MSE(Q1(s, a), target) + MSE(Q2(s, a), target)
      critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)
      #Backpropagate the critic loss and take one optimizer step
      self.critic_optimizer.zero_grad()
      critic_loss.backward()
      self.critic_optimizer.step()
      #Delayed update: every policy_freq iterations, update the actor and both target networks
      if it % policy_freq == 0:
        #Gradient ascent on the first critic's output, written as descent on its negative
        actor_loss = - self.critic(state, self.actor(state))[0].mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        #Polyak-average the trained weights into the targets
        self._soft_update(self.actor, self.actor_target, tau)
        self._soft_update(self.critic, self.critic_target, tau)

  def save(self, filename, directory):
    """Write the actor and critic weights to <directory>/<filename>_actor.pth and _critic.pth."""
    torch.save(self.actor.state_dict(), "%s/%s_actor.pth" % (directory, filename))
    torch.save(self.critic.state_dict(), "%s/%s_critic.pth" % (directory, filename))

  def load(self, filename, directory):
    """Load actor and critic weights saved by save() and re-synchronise the target networks."""
    # map_location lets a checkpoint written on a GPU be loaded on a CPU-only machine
    self.actor.load_state_dict(torch.load("%s/%s_actor.pth" % (directory, filename), map_location=device))
    self.critic.load_state_dict(torch.load("%s/%s_critic.pth" % (directory, filename), map_location=device))
    # Only the trained networks are saved, so the targets are refreshed from them
    self.actor_target.load_state_dict(self.actor.state_dict())
    self.critic_target.load_state_dict(self.critic.state_dict())

def evaluate_policy(policy, eval_episodes=10, eval_env=None):
  """Run the policy without exploration noise and return the mean episode reward.

  Args:
    policy: object exposing select_action(state) -> action.
    eval_episodes: number of complete episodes to run.
    eval_env: environment with reset() and a step(action) that returns
      (obs, reward, done, info). Defaults to the notebook-level `env`.

  Returns:
    Sum of all rewards collected divided by eval_episodes.
  """
  # Fall back to the notebook's global environment so existing calls keep working
  if eval_env is None:
    eval_env = env
  total_reward = 0.
  for _ in range(eval_episodes):
    obs = eval_env.reset()
    done = False
    while not done:
      action = policy.select_action(np.array(obs))
      obs, reward, done, _ = eval_env.step(action)
      total_reward += reward
  avg_reward = total_reward / eval_episodes
  print("----------------------------------------")
  print("Mean reward in the evaluation step: %f" % (avg_reward))
  print("----------------------------------------")
  return avg_reward

env_name = "AntBulletEnv-v0" # Name of the Gym environment to create
seed = 0 # Random seed value

# Base name of the saved model files to load
file_name = "%s_%s_%s" % ("TD3", env_name, seed)
# Print the configuration banner in a single call, one item per line
print(
  "----------------------------------------",
  "Configuration: %s" % (file_name),
  "----------------------------------------",
  sep="\n",
)

eval_episodes = 10 # number of episodes to average over
save_env_vid = True # wrap the environment in a Monitor that writes into monitor_dir
env = gym.make(env_name)
# Read the episode step limit before the environment is wrapped below
max_episode_steps = env._max_episode_steps
if save_env_vid:
  # Build the monitor folder here (same path the training section uses) so this cell
  # also works in a fresh kernel where the training cells were not run
  monitor_dir = os.path.join('exp', 'brs', 'monitor')
  os.makedirs(monitor_dir, exist_ok=True)
  env = wrappers.Monitor(env, monitor_dir, force = True)
  env.reset()
# Seed the environment and the libraries
env.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)
# Dimensions and bound needed to rebuild the networks before loading their weights
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
# Only the first component of the upper bound is read -- assumes every action dimension shares it
max_action = float(env.action_space.high[0])

# Rebuild the agent with the same dimensions, then load the weights saved under file_name
policy = TD3(state_dim, action_dim, max_action)
policy.load(file_name, './pytorch_models/')
# The mean reward is printed by evaluate_policy; the return value is not needed here
_ = evaluate_policy(policy, eval_episodes=eval_episodes)


----------------------------------------
Configuration: TD3_AntBulletEnv-v0_0
----------------------------------------




----------------------------------------
Mean reward in the evaluation step: 2487.631804
----------------------------------------
