<a href="https://colab.research.google.com/github/Ashish-Tripathy/TD3-Twin-Delayed-DDPG/blob/master/TD3_Implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip3 install pybullet --upgrade

Collecting pybullet
[?25l  Downloading https://files.pythonhosted.org/packages/d8/ac/a422ab8d1c57ab3f43e573b5a5f532e6afd348d81308fe66a1ecb691548e/pybullet-2.7.1-cp36-cp36m-manylinux1_x86_64.whl (95.0MB)
[K     |████████████████████████████████| 95.0MB 40kB/s 
[?25hInstalling collected packages: pybullet
Successfully installed pybullet-2.7.1


In [0]:
import os
import time
import random
import numpy as np
import matplotlib.pyplot as plt
import pybullet_envs
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
from gym import wrappers
from torch.autograd import Variable
from collections import deque

# Step 1: Initialisation
Initialise the Experience Replay memory and populate it with transitions.

In [0]:
class ReplayBuffer(object):
  """Fixed-capacity store of transitions for experience replay.

  Each transition is a 5-tuple ``(state, next_state, action, reward, done)``.
  Until the buffer is full, new transitions are appended; after that the
  oldest entry is overwritten in ring-buffer fashion.
  """

  def __init__(self, max_size = 1e6):
    """Create an empty buffer.

    Args:
      max_size: maximum number of transitions kept. Accepts a float such as
        the default ``1e6``; it is converted to ``int`` so the write pointer
        is always a valid list index.
    """
    self.storage = []
    self.max_size = int(max_size)
    self.ptr = 0

  def add(self, transition):
    """Store one transition, overwriting the oldest one once the buffer is full.

    Args:
      transition: tuple ``(state, next_state, action, reward, done)``.
    """
    if len(self.storage) == self.max_size:
      # Buffer full: overwrite the slot at the write pointer, then advance it.
      self.storage[int(self.ptr)] = transition
      self.ptr = (self.ptr + 1) % self.max_size
    else:
      # Still filling up: grow the list (the original omitted this branch,
      # so nothing was ever stored).
      self.storage.append(transition)

  def sample(self, batch_size):
    """Draw ``batch_size`` transitions uniformly at random, with replacement.

    Returns:
      Tuple of five numpy arrays: states, next_states, actions,
      rewards reshaped to ``(batch_size, 1)`` and dones reshaped to
      ``(batch_size, 1)``.

    Raises:
      ValueError: if the buffer is empty.
    """
    if not self.storage:
      raise ValueError("Cannot sample from an empty ReplayBuffer")
    ind = np.random.randint(0, len(self.storage), batch_size)
    batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = [],[],[],[],[]
    for i in ind:
      state, next_state, action, reward, done = self.storage[i]
      # np.asarray avoids a copy when possible and, unlike
      # np.array(..., copy=False) on NumPy >= 2, does not raise when a copy
      # is unavoidable (e.g. Python scalars or lists).
      batch_states.append(np.asarray(state))
      batch_next_states.append(np.asarray(next_state))
      batch_actions.append(np.asarray(action))
      batch_rewards.append(np.asarray(reward))
      batch_dones.append(np.asarray(done))
    return np.array(batch_states),np.array(batch_next_states),np.array(batch_actions), \
    np.array(batch_rewards).reshape(-1,1),np.array(batch_dones).reshape(-1,1)


# Step 2: Define the model architecture
Build one DNN for the Actor model and one for the Actor Target.

In [0]:
class Actor(nn.Module):
  """Policy network: maps a batch of states to actions scaled into [-max_action, max_action].

  Three fully connected layers (state_dims -> 400 -> 300 -> action_dim) with
  ReLU on the hidden layers and a tanh on the output.
  """

  def __init__(self, state_dims, action_dim, max_action):
    """Build the layers and remember the action scale ``max_action``."""
    super().__init__()
    self.max_action = max_action
    self.layer_1 = nn.Linear(state_dims, 400)
    self.layer_2 = nn.Linear(400, 300)
    self.layer_3 = nn.Linear(300, action_dim)

  def forward(self, x):
    """Return the action for state batch ``x``."""
    hidden = F.relu(self.layer_1(x))
    hidden = F.relu(self.layer_2(hidden))
    # tanh squashes to [-1, 1]; multiplying by max_action rescales the range.
    squashed = torch.tanh(self.layer_3(hidden))
    return self.max_action * squashed



Build two DNNs for the two Critic models and two DNNs for the two Critic Targets

In [0]:
class Critic(nn.Module):
  """Twin critic: two independent Q-networks evaluated on the same (state, action) input.

  Each network is state_dims + action_dim -> 400 -> 300 -> 1 and returns one
  scalar Q-value per sample. ``layer_1..layer_3`` belong to the first critic,
  ``layer_4..layer_6`` to the second.
  """

  def __init__(self, state_dims, action_dim):
    """Build the layers of both Q-networks."""
    super(Critic, self).__init__()
    # First critic
    self.layer_1 = nn.Linear(state_dims + action_dim, 400)
    self.layer_2 = nn.Linear(400, 300)
    self.layer_3 = nn.Linear(300, 1)  # a Q-value is a scalar, not action_dim wide
    # Second critic: separate parameters, otherwise min(Q1, Q2) is a no-op
    self.layer_4 = nn.Linear(state_dims + action_dim, 400)
    self.layer_5 = nn.Linear(400, 300)
    self.layer_6 = nn.Linear(300, 1)

  def forward(self, x, u):
    """Return ``(Q1, Q2)`` for states ``x`` and actions ``u``.

    ``x`` and ``u`` are concatenated along dimension 1, so both must be
    2-D with the same number of rows. Each output has shape ``(rows, 1)``.
    """
    xu = torch.cat([x, u], 1)
    # Forward propagation for the first critic
    x1 = F.relu(self.layer_1(xu))
    x1 = F.relu(self.layer_2(x1))
    x1 = self.layer_3(x1)
    # Forward propagation for the second critic
    x2 = F.relu(self.layer_4(xu))
    x2 = F.relu(self.layer_5(x2))
    x2 = self.layer_6(x2)
    return x1, x2

  def Q1(self, x, u):
    """Return only the first critic's Q-value for states ``x`` and actions ``u``."""
    xu = torch.cat([x, u], 1)
    x1 = F.relu(self.layer_1(xu))
    x1 = F.relu(self.layer_2(x1))
    x1 = self.layer_3(x1)
    return x1



# Training Process
Create TD3 class, initialise variables

In [0]:
class TD3(object):
  """Twin Delayed DDPG agent: an actor, a twin critic, and a target copy of each.

  Expects a module-level ``device`` (a torch device) and the ``Actor`` and
  ``Critic`` classes to be defined before this class is instantiated.
  """

  def __init__(self, state_dims, action_dim, max_action):
    """Create the networks, sync the targets with them, and create the Adam optimizers."""
    self.actor = Actor(state_dims, action_dim, max_action).to(device)
    self.actor_target = Actor(state_dims, action_dim, max_action).to(device)
    # state_dict must be called; passing the bound method itself makes
    # load_state_dict fail.
    self.actor_target.load_state_dict(self.actor.state_dict())
    self.actor_optimizer = torch.optim.Adam(self.actor.parameters())

    self.critic = Critic(state_dims, action_dim).to(device)
    self.critic_target = Critic(state_dims, action_dim).to(device)
    self.critic_target.load_state_dict(self.critic.state_dict())
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters())

    self.max_action = max_action

  def select_action(self, state):
    """Return the actor's action for a single ``state`` as a flat numpy array.

    ``state`` must support ``reshape(1, -1)`` (e.g. a numpy array).
    """
    state = torch.Tensor(state.reshape(1,-1)).to(device)
    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
      return self.actor(state).cpu().numpy().flatten()

  @staticmethod
  def _soft_update(source, target, tau):
    """Polyak-average ``source`` parameters into ``target``: t <- tau*s + (1-tau)*t."""
    for param, target_param in zip(source.parameters(), target.parameters()):
      target_param.data.copy_(tau * param.data + (1-tau) * target_param.data)

  def train(self, replay_buffer, iterations, batch_size = 100, discount = 0.99, tau = 0.005, 
            policy_noise = 0.2, noise_clip = 0.5, policy_freq = 2):
    """Run TD3 updates on batches sampled from ``replay_buffer``.

    Args:
      replay_buffer: object whose ``sample(batch_size)`` returns
        ``(states, next_states, actions, rewards, dones)`` as arrays.
      iterations: number of update steps (an int). An iterable of int step
        indices is also accepted and iterated as-is.
      batch_size: number of transitions per update.
      discount: discount factor gamma.
      tau: Polyak averaging rate for the target networks.
      policy_noise: standard deviation of the Gaussian noise added to the
        target action.
      noise_clip: the target-action noise is clamped to
        ``[-noise_clip, noise_clip]``.
      policy_freq: the actor and the target networks are updated once every
        ``policy_freq`` steps.
    """
    # The original iterated directly over `iterations`, which fails for an int.
    steps = range(iterations) if isinstance(iterations, (int, np.integer)) else iterations
    for it in steps:
      # Sample a batch of transitions (s, s', a, r, done) from the memory
      batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = replay_buffer.sample(batch_size)
      state = torch.Tensor(batch_states).to(device)
      next_state = torch.Tensor(batch_next_states).to(device)
      action = torch.Tensor(batch_actions).to(device)
      reward = torch.Tensor(batch_rewards).to(device)
      done = torch.Tensor(batch_dones).to(device)

      # The whole target is a constant for the critic loss, so build it
      # without tracking gradients through the target networks.
      with torch.no_grad():
        # From the next state s', the actor target plays the next action a'
        next_action = self.actor_target(next_state)

        # Add clipped Gaussian noise to a' and clamp the result to the range
        # of actions supported by the environment
        noise = (torch.randn_like(action) * policy_noise).clamp(-noise_clip, noise_clip)
        next_action = (next_action + noise).clamp(-self.max_action, self.max_action)

        # The two critic targets take (s', a') and return Qt1(s', a') and Qt2(s', a')
        target_Q1, target_Q2 = self.critic_target(next_state, next_action)

        # Keep the minimum of the two Q-values.
        # Qt = r + gamma * min(Qt1, Qt2) * (1 - done): the bootstrap term is
        # dropped for terminal transitions (done == 1).
        target_Q = reward + (1-done) * discount * torch.min(target_Q1, target_Q2)

      # The two critic models take (s, a) and return two Q-values
      current_Q1, current_Q2 = self.critic(state, action)

      # Compute the critic loss
      critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

      # Backpropagate the critic loss and update the parameters of both critics
      self.critic_optimizer.zero_grad()
      critic_loss.backward()
      self.critic_optimizer.step()

      # Once every policy_freq iterations, update the actor by gradient
      # ASCENT on the output of the first critic (hence the minus sign)
      if it % policy_freq == 0:
        actor_loss = -(self.critic.Q1(state, self.actor(state)).mean())
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Delayed soft update of both target networks
        self._soft_update(self.actor, self.actor_target, tau)
        self._soft_update(self.critic, self.critic_target, tau)
      

