<a href="https://colab.research.google.com/github/Aspire-Mayank/EVA/blob/master/Phase2/Session9/P2S9_DDPG_T3D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Let's walk through the 15 steps of the Twin Delayed Deep Deterministic (TD3) policy gradient algorithm in code

##Initialization

In [0]:
import os
import time
import random
import numpy as np
import matplotlib.pyplot as plt
import pybullet_envs
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
from gym import wrappers
from torch.autograd import Variable
from collections import deque

## **STEP 1**: We initialize the Experience Replay Memory with a size of 1e6. Then we populate it with new transitions, each stored as a tuple (s, s', a, r, done).

![alt text](https://github.com/Aspire-Mayank/EVA/blob/master/Phase2/Session9/step1.PNG?raw=true)

In [0]:
class ReplayBuffer(object):
  """Fixed-capacity experience replay memory.

  Each transition is a tuple ``(state, next_state, action, reward, done)``.
  Until ``max_size`` transitions are held they are appended; after that,
  new transitions overwrite existing slots in ring-buffer order.
  """

  def __init__(self, max_size = 1e6):
    self.storage = []
    # Stored as an int so the write pointer always stays a valid list index.
    self.max_size = int(max_size)
    self.ptr = 0

  def add(self, transition):
    """Store one transition, overwriting the slot at ``ptr`` once the buffer is full."""
    if len(self.storage) >= self.max_size:
      self.storage[self.ptr] = transition
      # Wrap around so the next write replaces the next slot in order.
      self.ptr = (self.ptr + 1) % self.max_size
    else:
      self.storage.append(transition)

  def sample(self, batch_size):
    """Draw ``batch_size`` transitions uniformly at random (with replacement).

    Returns:
      A 5-tuple of NumPy arrays ``(states, next_states, actions, rewards,
      dones)``; rewards and dones are reshaped to column vectors of shape
      ``(batch_size, 1)``.

    Raises:
      ValueError: if the buffer holds no transitions yet.
    """
    if not self.storage:
      raise ValueError("cannot sample from an empty ReplayBuffer")
    ind = np.random.randint(0, len(self.storage), batch_size)
    batch_states, batch_next_states, batch_actions, batch_rewards, \
        batch_dones = [], [], [], [], []
    for i in ind:
      state, next_state, action, reward, done = self.storage[i]
      # np.asarray avoids a copy when possible and, unlike
      # np.array(..., copy=False), does not raise on NumPy 2 when one is needed.
      batch_states.append(np.asarray(state))
      batch_next_states.append(np.asarray(next_state))
      batch_actions.append(np.asarray(action))
      batch_rewards.append(np.asarray(reward))
      batch_dones.append(np.asarray(done))
    return np.array(batch_states), np.array(batch_next_states), \
        np.array(batch_actions), np.array(batch_rewards).reshape(-1, 1), \
            np.array(batch_dones).reshape(-1, 1)

##STEP 2 : Build one DNN for the Actor Model and one for Actor Target
![alt text](https://github.com/Aspire-Mayank/EVA/blob/master/Phase2/Session9/step2.PNG?raw=true)



In [0]:
class Actor(nn.Module):
  """Policy network: maps a state vector to an action vector.

  Architecture: state_dims -> 400 -> 300 -> action_dim, with ReLU on the two
  hidden layers and a tanh on the output that is scaled by ``max_action``,
  so every output component lies in ``[-max_action, max_action]``.
  """

  def __init__(self, state_dims, action_dim, max_action):
    super().__init__()  # set up nn.Module bookkeeping before registering layers
    self.layer_1 = nn.Linear(state_dims, 400)
    self.layer_2 = nn.Linear(400, 300)
    self.layer_3 = nn.Linear(300, action_dim)
    # Scale applied to the tanh output in forward().
    self.max_action = max_action

  def forward(self, x):
    """Return the action for state batch ``x``."""
    hidden = F.relu(self.layer_1(x))
    hidden = F.relu(self.layer_2(hidden))
    squashed = torch.tanh(self.layer_3(hidden))
    return self.max_action * squashed

## STEP 3 : Build two DNNs for the two Critic model and two DNNs for the two critic Targets
![alt text](https://github.com/Aspire-Mayank/EVA/blob/master/Phase2/Session9/step3.PNG?raw=true)

In [0]:
class Critic(nn.Module):
  """Twin critic networks, each estimating a scalar Q(s, a).

  Both critics share the same architecture
  (state_dims + action_dim -> 400 -> 300 -> 1) but have independent weights.
  """

  def __init__(self, state_dims, action_dim):
    super(Critic, self).__init__() #activate the inheritance
    # First Critic Network
    self.layer_1 = nn.Linear(state_dims + action_dim, 400)
    self.layer_2 = nn.Linear(400, 300)
    # A Q-value is a single scalar per (state, action) pair, so the head has
    # one output (this also matches the (batch, 1) reward/target shape).
    self.layer_3 = nn.Linear(300, 1)
    # Second Critic Network
    self.layer_4 = nn.Linear(state_dims + action_dim, 400)
    self.layer_5 = nn.Linear(400, 300)
    self.layer_6 = nn.Linear(300, 1)

  def forward(self, x, u):
    """Return ``(Q1, Q2)`` for state batch ``x`` and action batch ``u``.

    Both outputs have shape ``(batch, 1)``.
    """
    # dim=1 joins state and action features side by side for each sample.
    xu = torch.cat([x, u], 1)
    # forward propagation on first Critic
    x1 = F.relu(self.layer_1(xu))
    x1 = F.relu(self.layer_2(x1))
    x1 = self.layer_3(x1)
    # forward propagation on second Critic
    x2 = F.relu(self.layer_4(xu))
    x2 = F.relu(self.layer_5(x2))
    x2 = self.layer_6(x2)

    return x1, x2

  def Q1(self, x, u):
    """Return only the first critic's Q-value for ``(x, u)``.

    The actor update uses the first critic alone, so this skips the second
    network's forward pass.
    """
    xu = torch.cat([x, u], 1)
    x1 = F.relu(self.layer_1(xu))
    x1 = F.relu(self.layer_2(x1))
    x1 = self.layer_3(x1)
    return x1

## STEP 4-15 : Training process. Create a T3D class, initialize its variables, and get ready for Step 4


In [0]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Building the whole training Process into a class

class T3D(object):
  """Holds the actor/critic models, their target copies and their optimizers.

  Args:
    state_dims: size of the state vector fed to the actor and critics.
    action_dim: size of the action vector produced by the actor.
    max_action: bound passed to the actor and stored for clamping actions.
  """

  def __init__(self, state_dims, action_dim, max_action):
    # Actor trained by gradient descent, plus a target copy updated by Polyak averaging
    self.actor = Actor(state_dims, action_dim, max_action).to(device)
    self.actor_target = Actor(state_dims, action_dim, max_action).to(device)
    # Start the target with exactly the same weights as the model.
    # state_dict must be *called*: load_state_dict expects the mapping, not the method.
    self.actor_target.load_state_dict(self.actor.state_dict())
    self.actor_optimizer = torch.optim.Adam(self.actor.parameters())

    # Critic trained by gradient descent, plus a target copy updated by Polyak averaging
    self.critic = Critic(state_dims, action_dim).to(device)
    self.critic_target = Critic(state_dims, action_dim).to(device)
    self.critic_target.load_state_dict(self.critic.state_dict())
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters())

    # The training step reads self.max_action when clamping the noisy next action.
    self.max_action = max_action
    # Original attribute name, kept so existing code that reads it still works.
    self.max_action_action = max_action

  def select_action(self, state):
    """Return the actor's action for a single state as a flat NumPy array."""
    # Reshape to a batch of one and move it to the model's device
    state = torch.Tensor(state.reshape(1, -1)).to(device)
    # Run the actor, bring the result back to the CPU and flatten it to 1-D
    return self.actor(state).cpu().data.numpy().flatten()
    # need to convert to numpy, remember clipping?

## STEP 4 : Sample a batch of transitions (s, s', a, r) from the memory
![alt text](https://github.com/Aspire-Mayank/EVA/blob/master/Phase2/Session9/step4-15.PNG?raw=true)

In [0]:
def train(self, replay_buffer, iterations, batch_size=100, discount=0.99,
          tau = 0.005, policy_noise=0.2, noise_clip=0.5, policy_freq=2):
  """Run ``iterations`` training iterations; this cell covers Step 4 only.

  Each iteration samples ``batch_size`` transitions from ``replay_buffer``
  and converts the five sampled arrays to tensors on ``device``. The
  remaining keyword arguments are not used in this cell; the later step
  cells read them.
  """
  for it in range(iterations):
    # Step 4: sample a batch of transitions from the replay memory
    sampled = replay_buffer.sample(batch_size)
    batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = sampled
    # Convert every sampled array to a tensor on the training device,
    # in the same order the buffer returns them.
    state, next_state, action, rewards, done = (
        torch.Tensor(array).to(device) for array in sampled)

##STEP 5 : From the next state s', the actor target plays the next action a'
![alt text](https://github.com/Aspire-Mayank/EVA/blob/master/Phase2/Session9/step4.PNG?raw=true)

In [0]:
# Step 5: the actor target (not the actor model) picks the next action a' from the next state s'
next_action = self.actor_target.forward(next_state) 

##Step 6 : We add Gaussian noise to this next action a' and we clamp it in a range of values supported by the environment

![alt text](https://github.com/Aspire-Mayank/EVA/blob/master/Phase2/Session9/step5.PNG?raw=true)

In [0]:
# Step 6: draw Gaussian noise (mean 0, std policy_noise) with the same shape as the action batch
noise = torch.Tensor(batch_actions).data.normal_(0, policy_noise).to(device)
# Limit the noise magnitude to [-noise_clip, noise_clip]
noise = noise.clamp(-noise_clip, noise_clip)
# Add the noise to a' and keep the result within [-max_action, max_action]
next_action = (next_action + noise).clamp(-self.max_action, self.max_action)

##STEP 7 : The Two Critic targets take each the couple (s', a') as input and return two Q values, Qt1(s', a') and Qt2(s', a') as outputs.

![alt text](https://github.com/Aspire-Mayank/EVA/blob/master/Phase2/Session9/step6.PNG?raw=true)

In [0]:
# Step 7: the critic target scores the pair (s', a') and returns both Q estimates
target_Q1, target_Q2 = self.critic_target.forward(next_state, next_action)

##STEP 8 : Keep the minimum of these two Q-Values

![alt text](https://github.com/Aspire-Mayank/EVA/blob/master/Phase2/Session9/step7.PNG?raw=true)

In [0]:
# Step 8: element-wise minimum of the two target estimates
target_Q = torch.min(target_Q1, target_Q2)

This is not yet the final target Q-value: at this point `target_Q` only holds min(Qt1, Qt2). We reuse the same variable name for convenience and turn it into the actual target in Step 9.

##STEP 9 : We get the final target of the two Critic models, which is 

![alt text](https://github.com/Aspire-Mayank/EVA/blob/master/Phase2/Session9/step8.PNG?raw=true)
>  Target Qt = Reward + (gamma * min(Q1, Q2))

> -> We could define "target_Q" (Qt) simply as "reward + discount * torch.min(Qt1, Qt2)", but that won't work as is.
> 1. First, the discounted future value should only be added when the episode is *not* over, which means we need to integrate `done` via the factor (1 - done).

> 2. Second, target_Q would build its own backpropagation/computation graph. Without detaching Qt1/Qt2 from their graph we would be complicating things, i.e. we need to use `detach`. Let's look below:


In [0]:
## Step 9:
# Final target shared by the two critic models:
#   Qt = r + gamma * (1 - done) * min(Qt1, Qt2)
# done: 0 = episode not over, 1 = episode over, so the discounted
# future value is dropped for transitions that ended the episode.
# The bootstrapped term is detached so it is treated as a constant
# and excluded from the computation graph.
# Note: the reward tensor built in Step 4 is named `rewards`.
target_Q = rewards + ((1 - done) * discount * target_Q).detach()

##STEP 10 : Two critic models each take the couple(s, a) as input and return two Q-values

![alt text](https://github.com/Aspire-Mayank/EVA/blob/master/Phase2/Session9/step9.PNG?raw=true)

In [0]:
# Step 10: the critic model scores the sampled pair (s, a) and returns both Q estimates
current_Q1, current_Q2 = self.critic.forward(state, action)

##STEP 11 : Compute the critic loss
we compute the loss coming from the two critic models

![alt text](https://github.com/Aspire-Mayank/EVA/blob/master/Phase2/Session9/step10.PNG?raw=true)

In [0]:
# Step 11: sum of the two MSE losses; both Q estimates are regressed onto the same target_Q
critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

##STEP 12 : Backpropagate this critic loss and update the parameters of two critic models with Adam optimizer
![alt_text](https://github.com/Aspire-Mayank/EVA/blob/master/Phase2/Session9/step11.PNG?raw=true)

In [0]:
# Step 12: backpropagate the critic loss and take one optimizer step
self.critic_optimizer.zero_grad() # clear gradients left over from the previous step
critic_loss.backward() # compute gradients of the critic loss
self.critic_optimizer.step() # apply the weight update

##STEP 13 : once every two iterations, we update our Actor model by performing gradient Ascent on the output of the first Critic model.

![alt_text](https://github.com/Aspire-Mayank/EVA/blob/master/Phase2/Session9/step12.PNG?raw=true)

In [0]:
# Step 13: delayed policy update, run only once every `policy_freq` iterations
if it % policy_freq == 0:
  # Deterministic policy gradient: gradient ascent on Q1(s, actor(s)).
  # Optimizers minimise, so the loss is the negated batch mean of Q1.
  actor_loss = -(self.critic.Q1(state, self.actor(state)).mean())
  # Optimizer method is zero_grad(); grad_zero() does not exist.
  self.actor_optimizer.zero_grad()
  actor_loss.backward()
  self.actor_optimizer.step()

# With policy_freq=2 the critic step above runs twice for each actor step here.
# The Polyak-averaged target updates in the following steps use the same schedule.

##Step 14 : Still once every two iterations, we update the weights of the Actor target by Polyak Averaging
![alt_text](https://github.com/Aspire-Mayank/EVA/blob/master/Phase2/Session9/step13.PNG?raw=true)

In [0]:
# Step 14: Polyak averaging of the actor target: target <- tau * model + (1 - tau) * target
# NOTE(review): the Step 14 heading says this runs once every two iterations, so in the
# assembled train() it presumably sits inside `if it % policy_freq == 0:` - confirm.
for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
  target_param.data.copy_(tau*param.data + (1 -tau)*target_param.data)

## STEP 15 : Still once every two iterations, we update the weights of our Critic target by Polyak Averaging

![alt_true](https://github.com/Aspire-Mayank/EVA/blob/master/Phase2/Session9/step14.PNG?raw=true)

In [0]:
# Step 15: Polyak averaging of the critic target: target <- tau * model + (1 - tau) * target
for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
  target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

# T3D is done Now!