### **Installing Prerequisites**

In [None]:
pip install pybullet



### **Libraries**


In [None]:
import os
import time
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import gym
import pybullet_envs

import torch
import torch.nn as nn
import torch.nn.functional as F

from gym import wrappers
from torch.autograd import Variable
from collections import deque

### **GPU - Check for GPU**


In [None]:
# Choose the compute device once; every model and tensor below is moved to it.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
# The boolean is derived from DEVICE so the availability query runs only once.
print('GPU on:', DEVICE == 'cuda', '| Device:', DEVICE)

GPU on: True | Device: cuda


### **Connection to Google Drive - Google Drive Mount**

In [None]:
# Colab-only step: mount Google Drive at /content/drive.
# Later cells save models, results and videos under /content/drive/MyDrive/DRL.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### **Memory class - Memory definition**


In [None]:
class Memory_(object):
    """Fixed-capacity replay buffer of transitions.

    Each transition is a 5-tuple ``(state, next_state, action, reward, done)``.
    Once ``max_size`` transitions are stored, new ones overwrite the oldest
    entries in a circular fashion.
    """

    def __init__(self, max_size=1e6):
        """Create an empty buffer.

        Args:
            max_size: maximum number of stored transitions. Floats such as
                ``1e6`` are accepted and converted to ``int``.

        Raises:
            ValueError: if ``max_size`` is not positive.
        """
        max_size = int(max_size)
        if max_size <= 0:
            raise ValueError("max_size must be a positive number")
        self.memory = []
        # Kept as an int so it can be compared with len() and used in index arithmetic.
        self.max_size = max_size
        # Index of the slot that the next overwrite will replace.
        self.loc = 0

    def add(self, transition):
        """Store one transition, overwriting the oldest one when the buffer is full."""
        if len(self.memory) >= self.max_size:
            # Assignment (not comparison): actually replace the old transition.
            self.memory[self.loc] = transition
            self.loc = (self.loc + 1) % self.max_size
        else:
            self.memory.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` transitions uniformly at random, with replacement.

        Returns:
            Tuple of numpy arrays ``(states, next_states, actions, rewards, dones)``;
            ``rewards`` and ``dones`` are reshaped to column vectors ``(batch_size, 1)``.

        Raises:
            ValueError: if the buffer is empty.
        """
        if not self.memory:
            raise ValueError("cannot sample from an empty memory")

        picks = np.random.randint(0, len(self.memory), size=batch_size)
        batch = [self.memory[i] for i in picks]

        states_ = [transition[0] for transition in batch]
        next_states_ = [transition[1] for transition in batch]
        actions_ = [transition[2] for transition in batch]
        rewards_ = [transition[3] for transition in batch]
        dones_ = [transition[4] for transition in batch]

        return (
            np.array(states_),
            np.array(next_states_),
            np.array(actions_),
            np.array(rewards_).reshape(-1, 1),
            np.array(dones_).reshape(-1, 1),
        )

### **Actor Model - model definition**

In [None]:
# This class is instantiated twice:
# once as the Actor model (the network being trained)
# and once as the Actor target (its slowly updated copy).
class Actor(nn.Module):
    """Policy network mapping a state to an action.

    Args:
        input: size of the state vector fed to the network.
        action: number of action components the network outputs.
        limit: scale applied to the tanh output, so each action component
            is ``limit * tanh(...)``; intended to be the environment's
            maximum action value.
    """

    def __init__(self, input, action, limit):
        super().__init__()
        # Two hidden layers (400 and 300 units) followed by the action head.
        self.fully01 = nn.Linear(input, 400)
        self.fully02 = nn.Linear(400, 300)
        self.last = nn.Linear(300, action)
        self.limit = limit

    def forward(self, x):
        """Return the scaled action for a batch of states ``x``."""
        hidden = F.relu(self.fully01(x))
        hidden = F.relu(self.fully02(hidden))
        squashed = torch.tanh(self.last(hidden))
        return self.limit * squashed

### **Critic Model - model definition**


In [None]:
# Since we need two pairs of critics, one class is used for all of them.
# This model is instantiated 4 times: twice for the Critic models and twice
# for the Critic targets.
class Critic(nn.Module):
    """Q-network scoring a (state, action) pair with a single value.

    Args:
        input: size of the state vector.
        action: size of the action vector.
    """

    def __init__(self, input, action):
        super().__init__()
        # The first layer consumes the state and action concatenated together.
        self.fully01 = nn.Linear(input + action, 400)
        self.fully02 = nn.Linear(400, 300)
        self.fully03 = nn.Linear(300, 1)

    def forward(self, x, u):
        """Return the Q-value for states ``x`` and actions ``u`` (joined along dim 1)."""
        joint = torch.cat([x, u], 1)
        hidden = F.relu(self.fully01(joint))
        hidden = F.relu(self.fully02(hidden))
        return self.fully03(hidden)

### **Deep Reinforcement Learning Model - model definition**
#### **& Model Training Function**


In [None]:
#training
class MODEL_DRL(object):
    """Actor-critic agent with twin critics, target networks and delayed actor updates.

    Holds one actor and two critics ("left" and "right"), each with a target
    copy that is initialised to the same weights and then moved towards the
    trained network with Polyak averaging.

    Args:
        state_dim: size of the observation vector.
        action_dim: size of the action vector.
        max_action: bound used to scale the actor output and to clip target actions.
    """

    def __init__(self, state_dim, action_dim, max_action):
        # Actor and its target (target starts as an exact copy).
        self.Actor = Actor(state_dim, action_dim, max_action).to(DEVICE)
        self.Actor_Target = Actor(state_dim, action_dim, max_action).to(DEVICE)
        self.Actor_Target.load_state_dict(self.Actor.state_dict())

        # Actor optimizer
        self.Actor_optimizer = torch.optim.Adam(self.Actor.parameters())

        # Critic models
        self.Critic_left = Critic(state_dim, action_dim).to(DEVICE)
        self.Critic_right = Critic(state_dim, action_dim).to(DEVICE)
        # Critic targets
        self.Critic_Target_left = Critic(state_dim, action_dim).to(DEVICE)
        self.Critic_Target_right = Critic(state_dim, action_dim).to(DEVICE)

        # Each target starts as a copy of ITS OWN critic (right from right, not from left).
        self.Critic_Target_left.load_state_dict(self.Critic_left.state_dict())
        self.Critic_Target_right.load_state_dict(self.Critic_right.state_dict())

        # Critic optimizers
        self.Critic_optimizer_left = torch.optim.Adam(self.Critic_left.parameters())
        self.Critic_optimizer_right = torch.optim.Adam(self.Critic_right.parameters())

        # max_action is the clipping bound for actions.
        self.max_action = max_action

    @staticmethod
    def _soft_update(source, target, tau):
        """Polyak update: target <- tau * source + (1 - tau) * target, parameter by parameter."""
        for param, target_param in zip(source.parameters(), target.parameters()):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

    def get_action(self, state):
        """Return the actor's action for a single state as a flat numpy array."""
        state = torch.FloatTensor(state.reshape(1, -1)).to(DEVICE)
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            return self.Actor(state).cpu().numpy().flatten()

    def save(self, fname, directory):
        """Write the actor and both critics to ``{directory}/{fname}_*.pth``."""
        torch.save(self.Actor.state_dict(), f'{directory}/{fname}_Actor.pth')
        torch.save(self.Critic_left.state_dict(), f'{directory}/{fname}_Critic_left.pth')
        torch.save(self.Critic_right.state_dict(), f'{directory}/{fname}_Critic_right.pth')

    def load(self, fname, directory):
        """Load the actor and both critics from ``{directory}/{fname}_*.pth``.

        Weights are mapped onto ``DEVICE`` so a checkpoint saved on a GPU can be
        loaded on a CPU-only runtime. Target networks are re-synchronised with
        the loaded weights, since they are not stored in the checkpoint.
        """
        self.Actor.load_state_dict(torch.load(f'{directory}/{fname}_Actor.pth', map_location=DEVICE))
        self.Critic_left.load_state_dict(torch.load(f'{directory}/{fname}_Critic_left.pth', map_location=DEVICE))
        self.Critic_right.load_state_dict(torch.load(f'{directory}/{fname}_Critic_right.pth', map_location=DEVICE))

        self.Actor_Target.load_state_dict(self.Actor.state_dict())
        self.Critic_Target_left.load_state_dict(self.Critic_left.state_dict())
        self.Critic_Target_right.load_state_dict(self.Critic_right.state_dict())

    def train(self, Memory, iterations, batch_size=100, discount=0.99, tau=0.005, policy_noise=0.2, noise_clip=0.5, policy_freq=2):
        """Run ``iterations`` gradient steps on batches sampled from ``Memory``.

        Args:
            Memory: replay buffer exposing ``sample(batch_size)`` that returns
                ``(states, next_states, actions, rewards, dones)``.
            iterations: number of critic updates to perform.
            batch_size: transitions per update.
            discount: discount factor applied to the bootstrapped target.
            tau: Polyak averaging coefficient for the target networks.
            policy_noise: standard deviation of the noise added to target actions.
            noise_clip: absolute bound applied to that noise.
            policy_freq: the actor and all targets are updated every
                ``policy_freq`` iterations.
        """
        for i in range(iterations):
            # Sample a batch and move it to the training device.
            states_, next_states_, actions_, rewards_, dones_ = Memory.sample(batch_size)
            state = torch.Tensor(states_).to(DEVICE)
            next_state = torch.Tensor(next_states_).to(DEVICE)
            action = torch.Tensor(actions_).to(DEVICE)
            reward = torch.Tensor(rewards_).to(DEVICE)
            done = torch.Tensor(dones_).to(DEVICE)

            # The target is a constant for the critic loss, so build it without autograd.
            with torch.no_grad():
                # Clipped Gaussian noise on the target action, then clip to the action bounds.
                noise = (torch.randn_like(action) * policy_noise).clamp(-noise_clip, noise_clip)
                next_action = (self.Actor_Target(next_state) + noise).clamp(-self.max_action, self.max_action)

                # Take the smaller of the two target critics to avoid over-optimistic estimates.
                Target_Q1 = self.Critic_Target_left(next_state, next_action)
                Target_Q2 = self.Critic_Target_right(next_state, next_action)
                Target_Q = torch.min(Target_Q1, Target_Q2)

                # r + discount * Q'. done is 1 when the episode ended, so (1 - done)
                # zeroes the bootstrap term for terminal transitions.
                Target_Q = reward + discount * Target_Q * (1 - done)

            # Critic update: both critics regress onto the same target.
            Current_Q1 = self.Critic_left(state, action)
            Current_Q2 = self.Critic_right(state, action)
            critic_loss = F.mse_loss(Current_Q1, Target_Q) + F.mse_loss(Current_Q2, Target_Q)

            self.Critic_optimizer_left.zero_grad()
            self.Critic_optimizer_right.zero_grad()
            critic_loss.backward()
            self.Critic_optimizer_left.step()
            self.Critic_optimizer_right.step()

            # Delayed update: actor and targets change only every policy_freq iterations.
            if i % policy_freq == 0:
                # Gradient ascent on the left critic's Q-value (hence the minus sign).
                actor_loss = -self.Critic_left(state, self.Actor(state)).mean()
                self.Actor_optimizer.zero_grad()
                actor_loss.backward()
                self.Actor_optimizer.step()

                # Move every target a fraction tau towards its trained network.
                self._soft_update(self.Actor, self.Actor_Target, tau)
                self._soft_update(self.Critic_left, self.Critic_Target_left, tau)
                self._soft_update(self.Critic_right, self.Critic_Target_right, tau)



### **Evaluation of the model**
Average reward over a number of episodes

In [None]:
def evaluate(policy, episodes=10, eval_env=None):
    """Run the policy for several full episodes and return the average episode reward.

    Args:
        policy: object exposing ``get_action(state)``.
        episodes: number of episodes to average over; must be positive.
        eval_env: environment to evaluate on, exposing ``reset()`` and a
            ``step(action)`` that returns ``(state, reward, done, info)``.
            Defaults to the notebook-level ``env``.

    Returns:
        The sum of all rewards divided by ``episodes``.

    Raises:
        ValueError: if ``episodes`` is not positive.
    """
    if episodes <= 0:
        raise ValueError("episodes must be a positive integer")
    if eval_env is None:
        # Fall back to the global environment created earlier in the notebook.
        eval_env = env

    total_rewards = 0
    for _ in range(episodes):
        tmp_done = False
        tmp_state = eval_env.reset()
        while not tmp_done:
            tmp_action = policy.get_action(np.array(tmp_state))
            tmp_state, tmp_reward, tmp_done, _info = eval_env.step(tmp_action)
            total_rewards += tmp_reward
    total_rewards /= episodes
    print(f"Average award over {episodes} is:", total_rewards)
    return total_rewards

### **Hyper-Parameters and Parameters**

In [None]:
# Hyper-parameters
save_model = True
seed = 0

# Timestep budgets are counts, so they are kept as integers.
start_timesteps = 10_000   # random actions are played before this many timesteps
#max_timesteps= 6e5
max_timesteps = 15_000     # total environment steps for this run
eval_freq = 5_000          # evaluate (and checkpoint) about every this many timesteps

batch_size = 100
policy_freq = 2            # actor/targets are updated every policy_freq critic updates

discount = 0.99

expi_noise = 0.1           # std of the exploration noise added to policy actions
policy_noise = 0.2         # std of the noise added to target actions during training
noise_clip = 0.5           # absolute bound on the target-action noise
tau = 0.005                # Polyak averaging coefficient for the target networks

# env_name must be defined before file_name, which is built from it.
env_name = 'HopperBulletEnv-v0'
file_name = f"MODEL_DRL--{env_name}--seed({seed})"

### **Environment initialization**


In [None]:
# Create the training environment from the name chosen in the hyper-parameter cell.
# This global `env` is also the environment that evaluate() steps through.
env = gym.make(env_name)
print("ENV:",env_name)

ENV: HopperBulletEnv-v0


### **Environment Setting**
Setting all randoms to a fixed seed of 0

In [None]:
# Seed every source of randomness used in this notebook.
env.seed(seed)
# The action space has its own sampler, which drives the initial random-action phase.
env.action_space.seed(seed)
random.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)

# Problem dimensions read from the environment.
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
# Upper bound of the first action component, used as the bound for all of them.
max_action = float(env.action_space.high[0])

### **Model and Memory initialization**
Model: the Deep Reinforcement Learning agent (actor and twin critics).<br>
Memory: the replay buffer the model samples transitions from to train on.



In [None]:
# The agent (actor, twin critics and their targets) sized for this environment.
policy = MODEL_DRL(state_dim, action_dim, max_action)
# Replay buffer with the default capacity; filled during the training loop.
Memory = Memory_() 

### **Setting basemodel**
evaluation of the model before training

In [None]:
# Baseline: average reward of the untrained policy; later evaluations are appended to this list.
evaluation = [evaluate(policy)]

Average award over 10 is: 30.413425844018196


In [None]:
# Bookkeeping for the training loop: all counters start at zero.
total_timesteps = timesteps_since_eval = episode_num = 0
# Starting with done=True makes the first loop iteration reset the environment.
done = True

### **Training**

In [None]:
# Output locations on the mounted Drive, created up front so saving cannot fail on a missing folder.
MODELS_DIR = '/content/drive/MyDrive/DRL/models'
RESULTS_DIR = '/content/drive/MyDrive/DRL/results'
os.makedirs(MODELS_DIR, exist_ok=True)
os.makedirs(RESULTS_DIR, exist_ok=True)
RESULTS_PATH = '%s/%s' % (RESULTS_DIR, file_name)

while total_timesteps < max_timesteps:
    if done:
        # An episode just finished: train for as many iterations as it had timesteps.
        if total_timesteps != 0:
            print(f'Total timesteps:{total_timesteps} - Episode:{episode_num} - Reward:{episode_reward}')
            policy.train(Memory, episode_timesteps, batch_size, discount, tau, policy_noise, noise_clip, policy_freq)

        # Periodic evaluation; checkpoints honour save_model just like the final save below.
        if timesteps_since_eval >= eval_freq:
            timesteps_since_eval %= eval_freq
            evaluation.append(evaluate(policy))
            if save_model:
                policy.save(file_name, directory=MODELS_DIR)
            np.save(RESULTS_PATH, evaluation)

        # Start a new episode.
        obs = env.reset()
        done = False

        episode_reward = 0
        episode_timesteps = 0
        episode_num += 1

    # Before start_timesteps, play random actions to fill the memory.
    if total_timesteps < start_timesteps:
        action = env.action_space.sample()
    else:
        # Afterwards use the policy, with Gaussian exploration noise clipped to the action bounds.
        action = policy.get_action(np.array(obs))
        if expi_noise != 0:
            action = (action + np.random.normal(0, expi_noise, size=env.action_space.shape[0])).clip(env.action_space.low, env.action_space.high)

    new_obs, reward, done, _ = env.step(action)

    # Hitting the episode step limit is stored as not-done, so training still
    # bootstraps from the next state for that transition.
    done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done)

    episode_reward += reward

    Memory.add((obs, new_obs, action, reward, done_bool))

    obs = new_obs
    episode_timesteps += 1
    total_timesteps += 1
    timesteps_since_eval += 1

# Final evaluation and save once the timestep budget is used up.
evaluation.append(evaluate(policy))
if save_model:
    policy.save('%s' % (file_name), directory=MODELS_DIR)
np.save(RESULTS_PATH, evaluation)

Total timesteps:16 - Episode:1 - Reward:29.05637608106626
Total timesteps:24 - Episode:2 - Reward:20.01591295011167
Total timesteps:34 - Episode:3 - Reward:22.036458671405853
Total timesteps:51 - Episode:4 - Reward:19.803455461015982
Total timesteps:69 - Episode:5 - Reward:17.453622058752806
Total timesteps:81 - Episode:6 - Reward:19.276845316313846
Total timesteps:86 - Episode:7 - Reward:16.78463421813067
Total timesteps:94 - Episode:8 - Reward:18.264891544888087
Total timesteps:107 - Episode:9 - Reward:13.484286775598592
Total timesteps:116 - Episode:10 - Reward:20.494939125757078
Total timesteps:127 - Episode:11 - Reward:18.623737723080556
Total timesteps:134 - Episode:12 - Reward:19.33823231160495
Total timesteps:141 - Episode:13 - Reward:17.92542426261498
Total timesteps:149 - Episode:14 - Reward:17.802375661052064
Total timesteps:163 - Episode:15 - Reward:21.727406111369785
Total timesteps:169 - Episode:16 - Reward:17.151570225146134
Total timesteps:176 - Episode:17 - Reward:16.8

### **Visualizing  by testing**

In [None]:
# Re-declare the run identifiers so this cell locates the files saved by the training cell.
env_name = 'HopperBulletEnv-v0'
seed = 0

# Must match the file_name used when the model was saved.
file_name = f"MODEL_DRL--{env_name}--seed({seed})"
print(file_name)

eval_episodes = 10

env = gym.make(env_name)
# Read before wrapping; not used again in this cell.
max_episode_step = env._max_episode_steps

# NOTE(review): Monitor presumably records episodes into the vids folder; force=True
# presumably clears previous recordings there — confirm against the gym version in use.
env = wrappers.Monitor(env, '/content/drive/MyDrive/DRL/exp/vids/', force=True)
env.reset()
# Seed the environment, torch and numpy with the same seed as in training.
env.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])
# Build a fresh agent and restore the trained weights from Drive.
policy = MODEL_DRL(state_dim, action_dim, max_action)
policy.load(file_name, "/content/drive/MyDrive/DRL/models")
# evaluate() steps the global `env`, which is now the Monitor-wrapped environment.
_ = evaluate(policy, episodes=eval_episodes)

MODEL_DRL--HopperBulletEnv-v0--seed(0)
Average award over 10 is: 83.35262543137912


In [None]:
# Marker printed when the notebook has run through to the end.
print('Done')

Done


## **The End**