<a href="https://colab.research.google.com/github/Curiousss/EVA/blob/master/Phase2/Seesion10/P2Session10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip3 install pybullet --upgrade

ERROR! Session/line number was not unique in database. History logging moved to new session 59
Collecting pybullet
[?25l  Downloading https://files.pythonhosted.org/packages/d8/ac/a422ab8d1c57ab3f43e573b5a5f532e6afd348d81308fe66a1ecb691548e/pybullet-2.7.1-cp36-cp36m-manylinux1_x86_64.whl (95.0MB)
[K     |████████████████████████████████| 95.0MB 47kB/s 
[?25hInstalling collected packages: pybullet
Successfully installed pybullet-2.7.1


In [0]:
import os
import time
import random 
import numpy as np 
import matplotlib.pyplot as plt 
import pybullet_envs 
import gym 
import torch 
import torch.nn as nn 
import torch.nn.functional as F 
from gym import wrappers
from torch.autograd import Variable 
from collections import deque


In [0]:
class ReplayBuffer(object):
  # Init is there for all classes to initialize an object
  # Self is pointer to the object of the class which is initialized
  def __init__(self, max_size = 1e6):
    self.storage =[] 
    self.max_size = max_size 
    self.ptr = 0

  def add(self, transition):
    if len(self.storage) == self.max_size:
      self.storage[int(self.ptr)] = transition 
      self.ptr = (self.ptr + 1) % self.max_size
    else:
      self.storage.append(transition)

  def sample(self, batch_size):
    ind = np.random.randint (e, len(self.storage), batch_size) 
    batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = [], [], [], [], [] 
    
    for i in ind:
      state, next_state, action, reward, done = self.storage[i] 
      batch_states.append(np.array(state, copy = False)) 
      batch_next_states.append(np.array(next_state, copy = False)) 
      batch_actions.append(np.array(action, copy - False)) 
      batch_rewards.append(np.array(reward, copy - False)) 
      batch_dones.append(np.array(done, copy - False)) 
    return np.array(batch_states), np.array(batch_next_states), np.array(batch_actions), np.array(batch_rewards).reshape(-1, 1), np.array(batch_dones).reshape(-1, 1)

In [0]:
class Actor(nn.Module):
  def __init__(self, state_dims, action_dim, max_action):
    #Max action is to clip in case we added too much noise
    super(Actor, self).__init__()# activate the inheritance 
    self.layer_1 = nn.Linear(state_dims, 400) 
    self.layer_2 = nn.Linear(400, 300) 
    self.layer_3 = nn.Linear(300, action_dim) 
    self.max_action = max_action

  def forward(self,x):
    x = F.relu(self.layer_1(x)) 
    x = F.relu(self.layer_2(x)) 
    x = self.max_action * torch.tanh(self.layer_3(x))
    return x

In [0]:
class Critic(nn.Module):
  def __init__(self, state_dims, action_dim):
    #max_action is to clip in case we need added too much noise
    super(Critic, self).__init__() #activate the inheritence
    #First Critic Network
    self.layer_1 = nn.Linear(state_dims + action_dim, 400)
    self.layer_2 = nn.Linear(400, 300)
    self.layer_3 = nn.Linear(300, action_dim) 
    #Second Critic Network 
    self.layer_4 = nn.Linear(state_dims + action_dim, 400) 
    self.layer_5 = nn.Linear(400, 300) 
    self.layer_6 = nn.Linear(300, action_dim)

  def forward(self, x, u): 
    # x is state, u is action 
    xu = torch.cat([x, u], 1) # 1 for vertical concatenation, 0 for Horizontal
    #forward propagation on first Critic
    x1 = F.relu(self.layer_1(xu))
    x1 = F.relu(self.layer_2(x1))
    x1 = self.layer_3(x1) 
    # forward propagation on second Critic 
    x2 = F.relu(self.layer_4(xu)) 
    x2 = F.relu(self.layer_5(x2))
    x2 = self.layer_6(x2)
    return x1, x2

  def Q1(self, x, u): 
    #x state, u- action This is used for updating the Q values 
    xu = torch.cat([x, u], 1) # 1 for vertical concatenation, 0 for horizontal
    x1 = F.relu(self.layer_1(xu))
    x1 = F.relu(self.layer_2(x1))
    x1 = self.layer_3(x1)
    return x1

In [0]:
# Select CPU or GPU
device = torch.device('cuda' if torch.cuda.is_available() else cpu)

# Building the whole Training Process into a class
class T3D(object):
  def __init__(self, state_dims, action_dim, max_action):
    # making sure out T3D can work with any environment
    self.actor = Actor(state_dims, action_dim, max_action).to(device) #GD
    self.actor_target = Actor(state_dims, action_dim, max_action).to(device) #Polyak Averaging
    self.actor_target.load_state_dict(self.actor.state_dict())
    #Initializing with model weights to keep them same
    self.actor_optimizer = torch.optim.Adam(self.actor.parameters())
    
    self.critic = Critic(state_dims, action_dim).to(device) #GD 
    self.critic_target = Critic(state_dims, action_dim).to(device) #Polyak Averaging 
    self.critic_target.load_state_dict(self.critic.state_dict()) 
    #Initializing with model weights to keep the same 
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters()) 
    self.max_action = max_action

  def select_action(self, state):
    state = torch.Tensor(state.reshape(1, -1)).to(device) 
    return self.actor(state).cpu().data.numpy().flatten() 
    # Need to convert to numpy remember clipping!
  # SAVING & LOADING FILES

  # making a save method to save a trained model 
  def save(self, filename, directory):
    torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, filename)) 
    torch.save (self.critic.state_dict(), "%s/%s_critic.pth" % (directory, filename))

  # Load method to load a pre-trained model
  def load(self, directory, filename):
    self.actor.load_state_dict(torch.load('%s/%s_actor.path' % (directory, filename))) 
    self.critic.load_state_dict(torch.load( '%s/%s critic.path' % (directory, filename)))


In [0]:
def train(self, replay_buffer, iterations, batch_size=100, discount=0.99, \
          tau = 0.005, policy_noise = 0.2, noise_clip=0.5, policy_freq=2):
  for it in range(iterations):
    # Step 4 We sample from a batch of transitions (s, s', a, r) from memory 
    batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = replay_buffer.sample(batch_size)
    state = torch. Tensor(batch_states).to(device) 
    next_state = torch.Tensor(batch_next_states).to(device) 
    action = torch.Tensor(batch_actions).to(device)
    reward = torch.Tensor(batch_rewards).to(device)
    done = torch.Tensor (batch_dones).to(device)

In [0]:
#self.actor_target.forward(next_state)

In [0]:
#noise = torch.Tensor(batch_actions).data.normal_(0, policy_noise).to(device) 
#noise = noise.clamp(-noise_clip, noise_clip) 
#next_action = (next_action + noise).clamp(-self.max_action, self.max_action)

In [0]:
#target_Q1, target_Q2 = self.critic_target.forward(next_state, next_action)

In [0]:
#target_Q = torch.min(target_Q1, target_Q2)

In [0]:
#target_Q = reward + ((1-done) * discount * target_Q).detach()

In [0]:
#current_Q1, current_Q2 = self.critic.forward(state, action)

In [0]:
#critic_loss = Mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

In [0]:
#selt.critic_optimizer.zero_grad() #Initialize the gradients to zero
#critic_loss.backward() #Computing the gradients
#self.critic_optimizer.step() #Performing the weight updates

In [239]:
'''
if it % policy_freq == 0:
  #This is DPG part 
  actor_loss = -(self.critic.Q1(state, self.actor(state)).mean())
  self.actor_optimizer.grad_zero() 
  actor_loss.backward() 
  self.actor_optimizer.step()
  '''

'\nif it % policy_freq == 0:\n  #This is DPG part \n  actor_loss = -(self.critic.Q1(state, self.actor(state)).mean())\n  self.actor_optimizer.grad_zero() \n  actor_loss.backward() \n  self.actor_optimizer.step()\n  '

In [0]:
#for param, target_param in zip(self.actor.parameters (), self.actor_target.parameters()):
#  target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data) 

In [0]:
#for param, target_param in zip(self.critical.parameters(), self.critic_target.parameters()):
#  target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)


Environment

In [0]:
def evaluate_policy(policy, eval_episodes=10):
  avg_reward = 0.
  for _ in range(eval_episodes):
    obs = env.reset()
    done = False 
    while not done:
      action = policy.select_action(np.array(obs)) 
      obs, reward, done, _ = env.step(action) 
      avg_reward += reward
      avg_reward /= eval_episodes

  print ("---------------------------------------------------------")
  print ("Average Reward over the Evaluation Step: %f" % (avg_reward)) 
  print ("---------------------------------------------------------")

  return avg_reward



In [0]:
env_name = "AntBulletEnv-v0"  # set environment you want
seed = 0 #random seed number 
start_timesteps = 1e4 # Number of iterations/timesteps,  
# before which the model randomly chooses an action, after which it stats using the policy network
eval_freq = 5e3 # After how many timestamps the evaluation step is performed
max_timesteps = 5e5 # Total number of iterations/timesteps 
save_models = True # Boolean checker whether or not to save the pre-trained model 
expl_noise = 0.1 # Exploration noise - STD value of exploration Gaussian noise 
batch_size = 100 # Size of the batch 
discount = 0.99 # Discount factor gamma 
tau = 0.005 # Target network update rate
policy_noise = 0.2 # STD of Gaussian noise added to the actions for the exploration and 
noise_clip = 0.5 # Maximum value of the Gaussian noise added to the actions for the explorations 
policy_freg = 2 # No. of iterations to wait before the policy/Actor model is updated

In [257]:
#We create a file name for the two saved models: The Actor and Critic
file_name = "%s_%s_%s" % ("TD3", env_name, str(seed))
print ("------------------------------------------ ")
print ("Settings: %s" % (file_name)) 
print ("------------------------------------------ ")

#we create a folder inside which will be saved the trained models 

if not os.path.exists("results"):
  os.makedirs("results") 
if save_models and not os.path.exists("pytorch_models"):
  os.makedirs("pytorch_models")

------------------------------------------ 
Settings: TD3_AntBulletEnv-v0_0
------------------------------------------ 


In [258]:
# We create the PyBullet environment 
env = gym.make (env_name)

# We set seeds and we get necessary information on the states and actions
# in the chosen environment 
env.seed(seed)
torch.manual_seed(seed) 
np.random.seed(seed) 
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float (env.action_space.high[0])



In [259]:
# We create a policy network the actor model
policy = T3D(state_dim, action_dim, max_action)

# We create the Experience Replay memory
replay_buffer = ReplayBuffer()

# We define a list where all evaluation results over 10 episodes are stored
evaluations = [evaluate_policy(policy)]

# We create a new folder directory in which the final results (videos of the agent) will be populated

def mkdir(base, name):
  path = os.path.join(base, name) 
  if not os.path.exists(path):
    os.makedirs (path) 
  return path 

workdir = mkdir('exp', 'brs')
monitor_dir = mkdir(workdir, 'monitor') 
max_episode_steps = env._max_episode_steps 
save_env_vid = False 
if save_env_vid:
  env = wrappers.Monitor(env, monitor_dir, force = True) 
  env.reset()

---------------------------------------------------------
Average Reward over the Evaluation Step: -0.135011
---------------------------------------------------------


In [0]:
total_timesteps = 0
timesteps_since_eval = 0
episode_num = 0
done = True 
t0 = time.time()

# Training
max_timesteps = 1#500000

In [0]:
while total_timesteps < max_timesteps:
  # If episode done 
  if done:
    # If we are not at the beginning we start the training process of the model
    if total_timesteps != 0:
      print("Total Timesteps: { Episode Num: {} Reward: (}".
            format(total_timesteps, episode_num, episode_reward)) 
      policy.train(replay_buffer, episode_timesteps, batch_size, discount, tau, policy_noise, noise_clip, policy_freq)

  # We evaluate the episode and we save the policy 
  if timesteps_since_eval >= eval_freq:
    timesteps_since_eval %= eval_freq 
    evaluations.append(evaluate_policy(policy)) 
    policy.save(file_name, directory="pytorch_models") 
    np.save("results/%as" % (file_name), evaluations)

  # When training step is done we reset the environment
  obs = env.reset()

  done = False

  # Set rewards and episode timesteps to 0
  episode_reward = 0 
  episode_timesteps=0
  episode_num += 1
  break # MEERA: REMOVE


In [0]:

if total_timesteps < start_timesteps:
  action = env.action_space.sample()
else: 
  #After 10000 timesteps, we switch to the model 
  action = policy.select_action(np.array(obs)) 
  #If the explore noise parameter is not 0, we add noise to the action and we clip it 
  if expl_noise != 0:
    action = (action + np.random. normal(0, expl_noise, size-env.action_space. shape[0])).clip(env.action_space.low, env.action_space.high)

# The agent performs the action in the environment, then reaches the next state
# and receives the reward 
new_obs, reward, done, _ = env.step(action)

# We check if the episode is done
done_bool = 0  if episode_timesteps + 1 == env._max_episode_steps else float(done)
# We increase the total reward 
episode_reward += reward

# We store the new transition into the Experience Replay memory 
replay_buffer.add((obs, new_obs, action, reward, done_bool))

# we update the state, the episode timestep, the total timesteps, and the tim the evaluation of the policy 
obs = new_obs 
episode_timesteps += 1 
total_timesteps += 1 
timesteps_since_eval += 1


In [263]:
#We add the last policy evaluation to our List of evaluations and he save our model 
evaluations.append(evaluate_policy(policy)) 
if save_models: policy.save("%s" % (file_name), directory="pytorch_models") 
np.save("results/%as" % (file_name), evaluations)

---------------------------------------------------------
Average Reward over the Evaluation Step: -0.160359
---------------------------------------------------------


In [269]:
!ls
#!ls results/
#!ls pytorch_models/
!ls exp/brs/monitor/

 exp  'pytorch models'	 pytorch_models   results   sample_data
