<a href="https://colab.research.google.com/github/nutov/DeepLearningOnComputationAccelerators-CS236781/blob/master/TweenDelayedDDPG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install PyBullet into the runtime; it is needed for the `pybullet_envs` import in the next cell.
!pip install pybullet

Collecting pybullet
[?25l  Downloading https://files.pythonhosted.org/packages/68/d7/d797ff65ad3fdfbcd4c2b8274429476c377b2d842ae6c0e405b35f8387c0/pybullet-2.6.4-cp36-cp36m-manylinux1_x86_64.whl (94.0MB)
[K     |████████████████████████████████| 94.0MB 47kB/s 
Installing collected packages: pybullet
Successfully installed pybullet-2.6.4


In [0]:
import os
import time
import random
import numpy as np
import matplotlib.pyplot as plt
import pybullet_envs
import torch
import gym
import torch.nn as nn
import torch.nn.functional as F
from gym import wrappers
from torch.autograd import Variable


In [0]:
# Creating a Buffer 
class ReplayBuffer(object):
  """Fixed-capacity experience replay buffer with ring-style overwrite.

  Transitions are stored as 5-tuples
  ``(state, next_state, action, reward, done)``. Until the buffer is full,
  new transitions are appended; afterwards they overwrite stored entries in
  insertion order, starting from the oldest.

  Args:
    max_size: Maximum number of transitions kept. Coerced to ``int`` so the
      float default (``1e6``) yields integer indices and a non-integral
      value cannot disable the "buffer is full" check.
  """

  def __init__(self,max_size = 1e6):
    self.storage = []
    # Stored as int: keeps `ptr` an integer and makes the equality test
    # against len(self.storage) in `add` reliable.
    self.max_size = int(max_size)
    # Index of the slot that the next overwrite will replace.
    self.ptr = 0

  def add(self,transition):
    """Store one transition, overwriting the oldest entry once full.

    Args:
      transition: Tuple ``(state, next_state, action, reward, done)``.
    """
    if len(self.storage) == self.max_size:
      self.storage[self.ptr] = transition
      self.ptr = (self.ptr + 1) % self.max_size
    else:
      self.storage.append(transition)

  def sample(self,batch_size):
    """Draw ``batch_size`` transitions uniformly at random, with replacement.

    Args:
      batch_size: Number of transitions to draw.

    Returns:
      Tuple of five NumPy arrays
      ``(states, next_states, actions, rewards, dones)``. Each array stacks
      the sampled values along a new leading axis; ``rewards`` and ``dones``
      are additionally reshaped to ``(-1, 1)``.

    Raises:
      ValueError: If the buffer is empty (raised by ``np.random.randint``).
    """
    ind = np.random.randint(0,len(self.storage),size = batch_size)
    batch_states,batch_next_states,batch_actions,batch_rewards,batch_dones = [],[],[],[],[]
    for i in ind:
      state,next_state,action,reward,done = self.storage[i]
      # np.asarray avoids a copy when possible. The former
      # np.array(..., copy=False) raises ValueError on NumPy >= 2.0 whenever
      # a copy is unavoidable (Python lists, floats, bools).
      batch_states.append(np.asarray(state))
      batch_next_states.append(np.asarray(next_state))
      batch_actions.append(np.asarray(action))
      batch_rewards.append(np.asarray(reward))
      batch_dones.append(np.asarray(done))

    return (
      np.array(batch_states),
      np.array(batch_next_states),
      np.array(batch_actions),
      np.array(batch_rewards).reshape(-1,1),
      np.array(batch_dones).reshape(-1,1),
    )



In [0]:
class Actor(nn.Module):
  """Deterministic policy network mapping a state to a bounded action.

  Three fully connected layers (state_dim -> 400 -> 300 -> action_dim) with
  ReLU on the two hidden layers. The output goes through tanh and is scaled
  by ``max_action``, so every action component lies in
  ``[-max_action, max_action]``.

  Args:
    state_dim: Size of the state vector.
    action_dim: Size of the action vector.
    max_action: Scale applied to the tanh output.
  """

  def __init__(self, state_dim, action_dim, max_action):
    super().__init__()
    self.layer_1 = nn.Linear(state_dim, 400)
    self.layer_2 = nn.Linear(400, 300)
    self.layer_3 = nn.Linear(300, action_dim)
    self.max_action = max_action

  def forward(self, x):
    """Return the action for state batch ``x``."""
    hidden = F.relu(self.layer_1(x))
    hidden = F.relu(self.layer_2(hidden))
    # tanh bounds the raw output to [-1, 1] before scaling.
    squashed = torch.tanh(self.layer_3(hidden))
    return self.max_action * squashed


In [0]:
class Critic(nn.Module):
  """Twin Q-networks that each score a (state, action) pair.

  Two independent heads share the same architecture
  (state_dim + action_dim -> 400 -> 300 -> 1, ReLU on the hidden layers):
  the first uses ``layer_1..layer_3``, the second ``layer_4..layer_6``.

  Args:
    state_dim: Size of the state vector.
    action_dim: Size of the action vector.
    max_action: Unused by the critic. It is optional so the class can be
      built both as ``Critic(state_dim, action_dim)`` and with the legacy
      three-argument form.
  """

  def __init__(self,state_dim,action_dim,max_action = None):
    super(Critic,self).__init__()

    # defining the first critic
    self.layer_1 = nn.Linear(state_dim + action_dim,400)
    self.layer_2 = nn.Linear(400,300)
    self.layer_3 = nn.Linear(300,1)
    # defining the second critic; it needs its own attribute names, since
    # reusing layer_1..layer_3 would replace the first critic's layers
    self.layer_4 = nn.Linear(state_dim + action_dim,400)
    self.layer_5 = nn.Linear(400,300)
    self.layer_6 = nn.Linear(300,1)

  def _evaluate_head(self,xu,hidden_a,hidden_b,out):
    """Run one Q-head (two ReLU hidden layers, linear output) on ``xu``."""
    h = F.relu(hidden_a(xu))
    h = F.relu(hidden_b(h))
    return out(h)

  def forward(self,x,u):
    """Return ``(Q1, Q2)`` for state batch ``x`` and action batch ``u``.

    ``x`` and ``u`` are concatenated along dim 1 before being fed to both
    heads; each returned tensor has a trailing dimension of size 1.
    """
    xu = torch.cat([x,u],dim = 1)
    # forward pass first network
    x_1 = self._evaluate_head(xu,self.layer_1,self.layer_2,self.layer_3)
    # forward pass second network
    x_2 = self._evaluate_head(xu,self.layer_4,self.layer_5,self.layer_6)
    return x_1,x_2

  def Q1(self,x,u):
    """Return only the first head's Q-value for ``(x, u)``."""
    xu = torch.cat([x,u],dim = 1)
    return self._evaluate_head(xu,self.layer_1,self.layer_2,self.layer_3)

In [0]:
# Pick the compute device: the first CUDA GPU if PyTorch can see one, else the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda:0")
else:
  device = torch.device("cpu")

# Building the training process into a class
class TD3(object):
  """Holds the TD3 networks, their target copies and their optimizers.

  Builds an actor and a critic, plus a target copy of each that is
  initialised with the same weights, and one Adam optimizer per trainable
  network (default Adam hyper-parameters).

  Args:
    state_dim: Size of the state vector.
    action_dim: Size of the action vector.
    max_action: Action scale forwarded to the networks.
    device: Device (string or ``torch.device``) the networks are moved to.
  """

  def __init__(self,state_dim,action_dim,max_action,device = 'cpu'):
    # Actor, its target copy (same initial weights) and its optimizer.
    self.actor = Actor(state_dim,action_dim,max_action).to(device)
    self.actor_target = Actor(state_dim,action_dim,max_action).to(device)
    self.actor_target.load_state_dict(self.actor.state_dict())
    self.actor_optimizer = torch.optim.Adam(self.actor.parameters())
    # Critic, its target copy (same initial weights) and its optimizer.
    # max_action is passed through to match the Critic constructor signature.
    self.critic = Critic(state_dim,action_dim,max_action).to(device)
    self.critic_target = Critic(state_dim,action_dim,max_action).to(device)
    self.critic_target.load_state_dict(self.critic.state_dict())
    # Kept under its own name so it does not replace actor_optimizer.
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters())

device(type='cuda', index=0)