In [3]:
pip install pybullet

Collecting pybullet
  Downloading pybullet-3.2.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (90.8 MB)
[K     |████████████████████████████████| 90.8 MB 289 bytes/s 
[?25hInstalling collected packages: pybullet
Successfully installed pybullet-3.2.1


In [4]:
import os
import time
import random

import numpy as np
import matplotlib.pyplot as plt

import gym
import pybullet_envs

import torch
import torch.nn as nn
import torch.nn.functional as F

from gym import wrappers
from torch.autograd import Variable
from collections import deque

In [5]:
# Default to the CPU and switch to the GPU only when PyTorch reports one.
DEVICE = 'cpu'
if torch.cuda.is_available():
    DEVICE = 'cuda'
print('GPU on:', DEVICE == 'cuda', '| Device:', DEVICE)

GPU on: True | Device: cuda


In [6]:
class ReplayBuffer(object):
    """Fixed-capacity store of environment transitions.

    Each transition is a 5-tuple ``(state, next_state, action, reward, done)``.
    Once ``max_size`` transitions are held, every new one overwrites the
    oldest remaining entry (ring-buffer behaviour).
    """

    def __init__(self, max_size=1e6):
        """Create an empty buffer.

        Args:
            max_size: capacity in transitions. Floats such as the default
                ``1e6`` are accepted and truncated to an integer.

        Raises:
            ValueError: if the capacity is smaller than 1.
        """
        self.storage = []
        # Coerce to int: with a float capacity the modulo in add() would turn
        # the write pointer into a float, which cannot index a list.
        self.max_size = int(max_size)
        if self.max_size < 1:
            raise ValueError("max_size must be at least 1")
        self.ptr = 0

    def add(self, transition):
        """Store ``transition``, overwriting the oldest entry when full."""
        if len(self.storage) >= self.max_size:
            self.storage[self.ptr] = transition
            self.ptr = (self.ptr + 1) % self.max_size
        else:
            self.storage.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` transitions uniformly at random, with replacement.

        Returns:
            Tuple ``(states, next_states, actions, rewards, dones)`` of numpy
            arrays; ``rewards`` and ``dones`` are reshaped to column vectors.

        Raises:
            ValueError: if the buffer holds no transitions yet.
        """
        if not self.storage:
            raise ValueError("cannot sample from an empty ReplayBuffer")

        picks = np.random.randint(0, len(self.storage), size=batch_size)
        batch = [self.storage[i] for i in picks]

        states_ = np.array([t[0] for t in batch])
        next_states_ = np.array([t[1] for t in batch])
        actions_ = np.array([t[2] for t in batch])
        rewards_ = np.array([t[3] for t in batch]).reshape(-1, 1)
        dones_ = np.array([t[4] for t in batch]).reshape(-1, 1)
        return states_, next_states_, actions_, rewards_, dones_

In [7]:
class Actor(nn.Module):
    """Policy network mapping a state to an action scaled into [-cut, cut].

    Args:
        input: width of the state vector fed to the first linear layer.
        action: number of action components produced by the output layer.
        cut: factor multiplied onto the tanh output, stretching the (-1, 1)
            range of tanh to the desired action range.
    """

    def __init__(self, input, action, cut):
        super().__init__()
        self.fully01 = nn.Linear(input, 400)
        self.fully02 = nn.Linear(400, 300)
        self.last = nn.Linear(300, action)
        self.cut = cut

    def forward(self, x):
        """Return the scaled action for the state batch ``x``."""
        hidden = F.relu(self.fully01(x))
        hidden = F.relu(self.fully02(hidden))
        squashed = torch.tanh(self.last(hidden))
        return self.cut * squashed

In [8]:
# Two critics are needed, so both Q-networks are defined in the same class.
# A more descriptive class name would be DoubleCritic, PairCritic, etc.
class Critic(nn.Module):
    """Two independent Q-networks over the concatenated (state, action) input.

    Args:
        input: width of the state vector.
        action: width of the action vector.
    """

    def __init__(self, input, action):
        super().__init__()
        joint_dim = input + action

        # First Q-network.
        self.fully01 = nn.Linear(joint_dim, 400)
        self.fully02 = nn.Linear(400, 300)
        self.fully03 = nn.Linear(300, 1)

        # Second Q-network.
        self.fully11 = nn.Linear(joint_dim, 400)
        self.fully22 = nn.Linear(400, 300)
        self.fully33 = nn.Linear(300, 1)

    def forward(self, x, u):
        """Return the pair ``(q1, q2)`` for state batch ``x`` and action batch ``u``."""
        joint = torch.cat([x, u], 1)
        q_first = self.fully03(F.relu(self.fully02(F.relu(self.fully01(joint)))))
        q_second = self.fully33(F.relu(self.fully22(F.relu(self.fully11(joint)))))
        return q_first, q_second

    def Q1(self, x, u):
        """Return only the first network's estimate (same layers as ``forward``'s first output)."""
        joint = torch.cat([x, u], 1)
        return self.fully03(F.relu(self.fully02(F.relu(self.fully01(joint)))))

In [9]:
#training
class TD3(object):
    """TD3 agent: an actor, a twin critic, and a target copy of each.

    Args:
        state_dim: width of the state vector.
        action_dim: width of the action vector.
        max_action: bound used to scale the actor output and to clip target actions.
    """

    def __init__(self, state_dim, action_dim, max_action):
        # Actor and its target, started from identical weights.
        self.Actor = Actor(state_dim, action_dim, max_action).to(DEVICE)
        self.Actor_Target = Actor(state_dim, action_dim, max_action).to(DEVICE)
        self.Actor_Target.load_state_dict(self.Actor.state_dict())

        # Actor optimizer (Adam with its default settings).
        self.Actor_optimizer = torch.optim.Adam(self.Actor.parameters())

        # Critic and its target, started from identical weights.
        self.Critic = Critic(state_dim, action_dim).to(DEVICE)
        self.Critic_Target = Critic(state_dim, action_dim).to(DEVICE)
        self.Critic_Target.load_state_dict(self.Critic.state_dict())

        # Critic optimizer (Adam with its default settings).
        self.Critic_optimizer = torch.optim.Adam(self.Critic.parameters())

        # max_action is the cut/clip applied to actions.
        self.max_action = max_action

    def select_action(self, state):
        """Return the actor's action for one numpy ``state`` as a flat numpy array."""
        state = torch.FloatTensor(state.reshape(1, -1)).to(DEVICE)
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            return self.Actor(state).cpu().numpy().flatten()

    def save(self, filename, directory):
        """Write the actor and critic weights to ``<directory>/<filename>_{Actor,Critic}.pth``."""
        torch.save(self.Actor.state_dict(), '%s/%s_Actor.pth' % (directory, filename))
        torch.save(self.Critic.state_dict(), '%s/%s_Critic.pth' % (directory, filename))

    def load(self, filename, directory):
        """Load weights written by ``save`` and re-synchronise the target networks."""
        # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
        actor_state = torch.load('%s/%s_Actor.pth' % (directory, filename), map_location=DEVICE)
        critic_state = torch.load('%s/%s_Critic.pth' % (directory, filename), map_location=DEVICE)
        self.Actor.load_state_dict(actor_state)
        self.Critic.load_state_dict(critic_state)
        # save() does not checkpoint the targets; without this they would keep
        # their random initial weights if training resumes after a load.
        self.Actor_Target.load_state_dict(actor_state)
        self.Critic_Target.load_state_dict(critic_state)

    @staticmethod
    def _soft_update(source, target, tau):
        """Blend ``source`` parameters into ``target``: target = tau*source + (1-tau)*target."""
        for param, target_param in zip(source.parameters(), target.parameters()):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

    def train(self, replay_buffer, iterations, batch_size=100, discount=0.99, tau=0.005, policy_noise=0.2, noise_clip=0.5, policy_freq=2):
        """Run ``iterations`` gradient steps on batches sampled from ``replay_buffer``.

        The critic is updated every iteration; the actor and both target
        networks are updated on every ``policy_freq``-th iteration.
        """
        for i in range(iterations):
            states_, next_states_, actions_, rewards_, dones_ = replay_buffer.sample(batch_size)

            state = torch.Tensor(states_).to(DEVICE)
            next_state = torch.Tensor(next_states_).to(DEVICE)
            action = torch.Tensor(actions_).to(DEVICE)
            reward = torch.Tensor(rewards_).to(DEVICE)
            done = torch.Tensor(dones_).to(DEVICE)

            # The target value is a constant for the critic loss, so it is built
            # without recording an autograd graph through the target networks.
            with torch.no_grad():
                next_action = self.Actor_Target(next_state)

                # Clipped Gaussian noise added to the target action.
                noise = torch.Tensor(actions_).data.normal_(0, policy_noise).to(DEVICE)
                noise = noise.clamp(-noise_clip, noise_clip)
                next_action = (next_action + noise).clamp(-self.max_action, self.max_action)

                Target_Q1, Target_Q2 = self.Critic_Target(next_state, next_action)

                # done is 1 when the episode is over and 0 otherwise, so the
                # bootstrap term is dropped for terminal transitions.
                Target_Q = torch.min(Target_Q1, Target_Q2)
                Target_Q = reward + discount * Target_Q * (1 - done)

            Current_Q1, Current_Q2 = self.Critic(state, action)
            critic_loss = F.mse_loss(Current_Q1, Target_Q) + F.mse_loss(Current_Q2, Target_Q)

            self.Critic_optimizer.zero_grad()
            critic_loss.backward()
            self.Critic_optimizer.step()

            if not i % policy_freq:
                # Delayed policy update: ascend the first critic's estimate.
                actor_loss = -self.Critic.Q1(state, self.Actor(state)).mean()
                self.Actor_optimizer.zero_grad()
                actor_loss.backward()
                self.Actor_optimizer.step()

                self._soft_update(self.Actor, self.Actor_Target, tau)
                self._soft_update(self.Critic, self.Critic_Target, tau)

In [10]:
def evaluate_policy(policy, episodes=10, eval_env=None):
    """Run ``policy`` for ``episodes`` full episodes and return the mean episode reward.

    Args:
        policy: object exposing ``select_action(state)``.
        episodes: number of episodes to average over; must be at least 1.
        eval_env: environment to evaluate in. When omitted, the notebook-level
            ``env`` is used, as before.

    Returns:
        The total reward collected divided by ``episodes``.

    Raises:
        ValueError: if ``episodes`` is smaller than 1.
    """
    if episodes < 1:
        raise ValueError("episodes must be at least 1")
    if eval_env is None:
        eval_env = env

    total_reward = 0
    for _ in range(episodes):
        obs = eval_env.reset()
        done = False
        while not done:
            action = policy.select_action(np.array(obs))
            obs, reward, done, _ = eval_env.step(action)
            total_reward += reward
    avg_awards = total_reward / episodes
    print(f"Average award over {episodes} is:",avg_awards)
    return avg_awards

In [71]:
# Hyper-parameters
env_name = 'Walker2DBulletEnv-v0'
seed = 0
# Step counts are written as integers so the counters compared against them
# (and the `%=` applied with eval_freq) stay integers.
start_timesteps = 10_000   # steps of random actions before the policy is used
eval_freq = 5_000          # minimum steps between two evaluations
max_timesteps = 500_000    # total environment steps of the training loop

save_model = True
expi_noise = 0.1           # std of the Gaussian exploration noise added to policy actions
batch_size = 100
discount = 0.99
tau = 0.005                # soft-update rate of the target networks

policy_noise = 0.2         # std of the noise added to target actions
noise_clip = 0.5           # clip range of that noise
policy_freq = 2            # the actor is updated every policy_freq critic updates

In [72]:
# Instantiate the environment registered under env_name.
# NOTE(review): presumably the `import pybullet_envs` above is what registers the Bullet environments - confirm.
env = gym.make(env_name)

In [73]:
# Base name shared by the saved model checkpoints and the evaluation results file.
file_name = f"TD3--{env_name}--seed({seed})"
print(file_name)

TD3--Walker2DBulletEnv-v0--seed(0)


In [74]:
# Output folders: evaluation results always, model checkpoints only when saving is enabled.
# exist_ok=True makes creation idempotent without a separate existence check.
os.makedirs('./results', exist_ok=True)
if save_model:
    os.makedirs('./pytorch_models', exist_ok=True)

In [75]:
# Seed the environment and the numeric libraries.
env.seed(seed)
# action_space.sample() supplies every action of the random warm-up phase;
# the space is seeded explicitly so that phase is reproducible as well.
env.action_space.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)

# Network sizes and the action bound come from the environment's spaces.
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

In [76]:
# TD3 agent sized from the environment's observation and action spaces.
policy = TD3(state_dim, action_dim, max_action)

In [77]:
# Replay buffer with its default capacity (max_size=1e6).
replay_buffer = ReplayBuffer() 

In [78]:
# Score the still-untrained policy once; later evaluation scores are appended to this list.
evaluation = [evaluate_policy(policy)]

Average award over 10 is: 264.9672989036841


In [79]:
def mkdir(base, name):
    """Ensure the directory ``base/name`` exists and return its path.

    Missing parent directories are created too; an already existing
    directory is left untouched.
    """
    path = os.path.join(base, name)
    # exist_ok=True replaces the separate existence check.
    os.makedirs(path, exist_ok=True)
    return path
    
# Working directories: exp/brs and, inside it, a folder for the Monitor output.
work_dir = mkdir('exp','brs')
monitor_dir = mkdir(work_dir, 'monitor')
# Episode step limit, read from the environment before any wrapping.
max_episode_step = env._max_episode_steps
# Set to True to wrap the environment in gym's Monitor, writing into monitor_dir.
save_env_vid = False
if save_env_vid:
    env = wrappers.Monitor(env, monitor_dir, force=True)
    env.reset()

In [80]:
# Initialise the counters used by the training loop.
total_timesteps = 0
timesteps_since_eval = 0
episode_num = 0
# Starting with done=True makes the first loop iteration reset the environment
# and initialise the per-episode variables.
done = True
t0 = time.time()

In [None]:
while total_timesteps < max_timesteps:
    if done:
        # An episode just ended (or this is the first iteration, where done
        # starts as True): train on it, evaluate if due, then reset.
        if total_timesteps != 0:
            print(f'Total timesteps:{total_timesteps} - Episode num:{episode_num} - Reward:{episode_reward}')
            # One gradient iteration per step of the finished episode.
            policy.train(replay_buffer, episode_timesteps, batch_size, discount, tau, policy_noise, noise_clip, policy_freq)

        if timesteps_since_eval >= eval_freq:
            timesteps_since_eval %= eval_freq
            evaluation.append(evaluate_policy(policy))
            # Honour save_model here too: ./pytorch_models is only created
            # when that flag is set, matching the final save below.
            if save_model:
                policy.save(file_name, directory='./pytorch_models')
            np.save('./results/%s'%(file_name), evaluation)

        obs = env.reset()
        done = False

        episode_reward = 0
        episode_timesteps = 0
        episode_num += 1

    # Before start_timesteps steps, play random actions.
    if total_timesteps < start_timesteps:
        action = env.action_space.sample()
    else:
        # Afterwards use the policy, plus Gaussian exploration noise clipped to the action bounds.
        action = policy.select_action(np.array(obs))
        if expi_noise != 0:
            action = (action + np.random.normal(0, expi_noise, size=env.action_space.shape[0])).clip(env.action_space.low,env.action_space.high)

    new_obs, reward, done, _ = env.step(action)

    # An episode cut off by the step limit is stored as not-done so the critic
    # still bootstraps from its last state. The limit comes from the
    # max_episode_step captured before any wrapping, not from a private
    # attribute of the (possibly wrapped) env.
    done_bool = 0 if episode_timesteps + 1 == max_episode_step else float(done)

    episode_reward += reward

    replay_buffer.add((obs, new_obs, action, reward, done_bool))

    obs = new_obs
    episode_timesteps += 1
    total_timesteps += 1
    timesteps_since_eval += 1

# Final evaluation and save once the step budget is used up.
evaluation.append(evaluate_policy(policy))
if save_model:
    policy.save('%s'% (file_name), directory='./pytorch_models')
np.save("./results/%s" % (file_name),evaluation)

Total timesteps:11 - Episode num:1 - Reward:14.342351449678246
Total timesteps:31 - Episode num:2 - Reward:20.777396590379066
Total timesteps:48 - Episode num:3 - Reward:20.514735043872495
Total timesteps:55 - Episode num:4 - Reward:10.83502477268048
Total timesteps:71 - Episode num:5 - Reward:17.43650487842533
Total timesteps:110 - Episode num:6 - Reward:35.59045141547685
Total timesteps:120 - Episode num:7 - Reward:15.484733266291732
Total timesteps:141 - Episode num:8 - Reward:21.129919926942964
Total timesteps:154 - Episode num:9 - Reward:16.60799936141848
Total timesteps:163 - Episode num:10 - Reward:12.953738050516403
Total timesteps:181 - Episode num:11 - Reward:20.30876482272433
Total timesteps:190 - Episode num:12 - Reward:12.539863156058708
Total timesteps:203 - Episode num:13 - Reward:16.99671558473783
Total timesteps:216 - Episode num:14 - Reward:17.49615300779551
Total timesteps:226 - Episode num:15 - Reward:14.988500758878942
Total timesteps:236 - Episode num:16 - Reward:

In [65]:
# Test: reload the saved agent and evaluate it.

In [66]:
pip install pybullet



In [67]:
import os
import time
import random

import numpy as np
import matplotlib.pyplot as plt

import gym
import pybullet_envs

import torch
import torch.nn as nn
import torch.nn.functional as F

from gym import wrappers
from torch.autograd import Variable
from collections import deque

In [68]:
class ReplayBuffer(object):
    """Fixed-capacity store of environment transitions.

    Each transition is a 5-tuple ``(state, next_state, action, reward, done)``.
    Once ``max_size`` transitions are held, every new one overwrites the
    oldest remaining entry (ring-buffer behaviour).
    """

    def __init__(self, max_size=1e6):
        """Create an empty buffer.

        Args:
            max_size: capacity in transitions. Floats such as the default
                ``1e6`` are accepted and truncated to an integer.

        Raises:
            ValueError: if the capacity is smaller than 1.
        """
        self.storage = []
        # Coerce to int: with a float capacity the modulo in add() would turn
        # the write pointer into a float, which cannot index a list.
        self.max_size = int(max_size)
        if self.max_size < 1:
            raise ValueError("max_size must be at least 1")
        self.ptr = 0

    def add(self, transition):
        """Store ``transition``, overwriting the oldest entry when full."""
        if len(self.storage) >= self.max_size:
            self.storage[self.ptr] = transition
            self.ptr = (self.ptr + 1) % self.max_size
        else:
            self.storage.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` transitions uniformly at random, with replacement.

        Returns:
            Tuple ``(states, next_states, actions, rewards, dones)`` of numpy
            arrays; ``rewards`` and ``dones`` are reshaped to column vectors.

        Raises:
            ValueError: if the buffer holds no transitions yet.
        """
        if not self.storage:
            raise ValueError("cannot sample from an empty ReplayBuffer")

        picks = np.random.randint(0, len(self.storage), size=batch_size)
        batch = [self.storage[i] for i in picks]

        states_ = np.array([t[0] for t in batch])
        next_states_ = np.array([t[1] for t in batch])
        actions_ = np.array([t[2] for t in batch])
        rewards_ = np.array([t[3] for t in batch]).reshape(-1, 1)
        dones_ = np.array([t[4] for t in batch]).reshape(-1, 1)
        return states_, next_states_, actions_, rewards_, dones_

class Actor(nn.Module):
    """Policy network mapping a state to an action scaled into [-cut, cut].

    Args:
        input: width of the state vector fed to the first linear layer.
        action: number of action components produced by the output layer.
        cut: factor multiplied onto the tanh output, stretching the (-1, 1)
            range of tanh to the desired action range.
    """

    def __init__(self, input, action, cut):
        super().__init__()
        self.fully01 = nn.Linear(input, 400)
        self.fully02 = nn.Linear(400, 300)
        self.last = nn.Linear(300, action)
        self.cut = cut

    def forward(self, x):
        """Return the scaled action for the state batch ``x``."""
        hidden = F.relu(self.fully01(x))
        hidden = F.relu(self.fully02(hidden))
        squashed = torch.tanh(self.last(hidden))
        return self.cut * squashed

# Two critics are needed, so both Q-networks are defined in the same class.
# A more descriptive class name would be DoubleCritic, PairCritic, etc.
class Critic(nn.Module):
    """Two independent Q-networks over the concatenated (state, action) input.

    Args:
        input: width of the state vector.
        action: width of the action vector.
    """

    def __init__(self, input, action):
        super().__init__()
        joint_dim = input + action

        # First Q-network.
        self.fully01 = nn.Linear(joint_dim, 400)
        self.fully02 = nn.Linear(400, 300)
        self.fully03 = nn.Linear(300, 1)

        # Second Q-network.
        self.fully11 = nn.Linear(joint_dim, 400)
        self.fully22 = nn.Linear(400, 300)
        self.fully33 = nn.Linear(300, 1)

    def forward(self, x, u):
        """Return the pair ``(q1, q2)`` for state batch ``x`` and action batch ``u``."""
        joint = torch.cat([x, u], 1)
        q_first = self.fully03(F.relu(self.fully02(F.relu(self.fully01(joint)))))
        q_second = self.fully33(F.relu(self.fully22(F.relu(self.fully11(joint)))))
        return q_first, q_second

    def Q1(self, x, u):
        """Return only the first network's estimate (same layers as ``forward``'s first output)."""
        joint = torch.cat([x, u], 1)
        return self.fully03(F.relu(self.fully02(F.relu(self.fully01(joint)))))

#training
class TD3(object):
    """TD3 agent: an actor, a twin critic, and a target copy of each.

    Args:
        state_dim: width of the state vector.
        action_dim: width of the action vector.
        max_action: bound used to scale the actor output and to clip target actions.
    """

    def __init__(self, state_dim, action_dim, max_action):
        # Actor and its target, started from identical weights.
        self.Actor = Actor(state_dim, action_dim, max_action).to(DEVICE)
        self.Actor_Target = Actor(state_dim, action_dim, max_action).to(DEVICE)
        self.Actor_Target.load_state_dict(self.Actor.state_dict())

        # Actor optimizer (Adam with its default settings).
        self.Actor_optimizer = torch.optim.Adam(self.Actor.parameters())

        # Critic and its target, started from identical weights.
        self.Critic = Critic(state_dim, action_dim).to(DEVICE)
        self.Critic_Target = Critic(state_dim, action_dim).to(DEVICE)
        self.Critic_Target.load_state_dict(self.Critic.state_dict())

        # Critic optimizer (Adam with its default settings).
        self.Critic_optimizer = torch.optim.Adam(self.Critic.parameters())

        # max_action is the cut/clip applied to actions.
        self.max_action = max_action

    def select_action(self, state):
        """Return the actor's action for one numpy ``state`` as a flat numpy array."""
        state = torch.FloatTensor(state.reshape(1, -1)).to(DEVICE)
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            return self.Actor(state).cpu().numpy().flatten()

    def save(self, filename, directory):
        """Write the actor and critic weights to ``<directory>/<filename>_{Actor,Critic}.pth``."""
        torch.save(self.Actor.state_dict(), '%s/%s_Actor.pth' % (directory, filename))
        torch.save(self.Critic.state_dict(), '%s/%s_Critic.pth' % (directory, filename))

    def load(self, filename, directory):
        """Load weights written by ``save`` and re-synchronise the target networks."""
        # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
        actor_state = torch.load('%s/%s_Actor.pth' % (directory, filename), map_location=DEVICE)
        critic_state = torch.load('%s/%s_Critic.pth' % (directory, filename), map_location=DEVICE)
        self.Actor.load_state_dict(actor_state)
        self.Critic.load_state_dict(critic_state)
        # save() does not checkpoint the targets; without this they would keep
        # their random initial weights if training resumes after a load.
        self.Actor_Target.load_state_dict(actor_state)
        self.Critic_Target.load_state_dict(critic_state)

    @staticmethod
    def _soft_update(source, target, tau):
        """Blend ``source`` parameters into ``target``: target = tau*source + (1-tau)*target."""
        for param, target_param in zip(source.parameters(), target.parameters()):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

    def train(self, replay_buffer, iterations, batch_size=100, discount=0.99, tau=0.005, policy_noise=0.2, noise_clip=0.5, policy_freq=2):
        """Run ``iterations`` gradient steps on batches sampled from ``replay_buffer``.

        The critic is updated every iteration; the actor and both target
        networks are updated on every ``policy_freq``-th iteration.
        """
        for i in range(iterations):
            states_, next_states_, actions_, rewards_, dones_ = replay_buffer.sample(batch_size)

            state = torch.Tensor(states_).to(DEVICE)
            next_state = torch.Tensor(next_states_).to(DEVICE)
            action = torch.Tensor(actions_).to(DEVICE)
            reward = torch.Tensor(rewards_).to(DEVICE)
            done = torch.Tensor(dones_).to(DEVICE)

            # The target value is a constant for the critic loss, so it is built
            # without recording an autograd graph through the target networks.
            with torch.no_grad():
                next_action = self.Actor_Target(next_state)

                # Clipped Gaussian noise added to the target action.
                noise = torch.Tensor(actions_).data.normal_(0, policy_noise).to(DEVICE)
                noise = noise.clamp(-noise_clip, noise_clip)
                next_action = (next_action + noise).clamp(-self.max_action, self.max_action)

                Target_Q1, Target_Q2 = self.Critic_Target(next_state, next_action)

                # done is 1 when the episode is over and 0 otherwise, so the
                # bootstrap term is dropped for terminal transitions.
                Target_Q = torch.min(Target_Q1, Target_Q2)
                Target_Q = reward + discount * Target_Q * (1 - done)

            Current_Q1, Current_Q2 = self.Critic(state, action)
            critic_loss = F.mse_loss(Current_Q1, Target_Q) + F.mse_loss(Current_Q2, Target_Q)

            self.Critic_optimizer.zero_grad()
            critic_loss.backward()
            self.Critic_optimizer.step()

            if not i % policy_freq:
                # Delayed policy update: ascend the first critic's estimate.
                actor_loss = -self.Critic.Q1(state, self.Actor(state)).mean()
                self.Actor_optimizer.zero_grad()
                actor_loss.backward()
                self.Actor_optimizer.step()

                self._soft_update(self.Actor, self.Actor_Target, tau)
                self._soft_update(self.Critic, self.Critic_Target, tau)
                
def evaluate_policy(policy, episodes=10, eval_env=None):
    """Run ``policy`` for ``episodes`` full episodes and return the mean episode reward.

    Args:
        policy: object exposing ``select_action(state)``.
        episodes: number of episodes to average over; must be at least 1.
        eval_env: environment to evaluate in. When omitted, the notebook-level
            ``env`` is used, as before.

    Returns:
        The total reward collected divided by ``episodes``.

    Raises:
        ValueError: if ``episodes`` is smaller than 1.
    """
    if episodes < 1:
        raise ValueError("episodes must be at least 1")
    if eval_env is None:
        eval_env = env

    total_reward = 0
    for _ in range(episodes):
        obs = eval_env.reset()
        done = False
        while not done:
            action = policy.select_action(np.array(obs))
            obs, reward, done, _ = eval_env.step(action)
            total_reward += reward
    avg_awards = total_reward / episodes
    print(f"Average award over {episodes} is:",avg_awards)
    return avg_awards

def mkdir(base, name):
    """Ensure the directory ``base/name`` exists and return its path.

    Missing parent directories are created too; an already existing
    directory is left untouched.
    """
    path = os.path.join(base, name)
    # exist_ok=True replaces the separate existence check.
    os.makedirs(path, exist_ok=True)
    return path

In [70]:
env_name = 'Walker2DBulletEnv-v0'
seed = 0
# Must match the name used when the checkpoints were saved.
file_name = f"TD3--{env_name}--seed({seed})"
print(file_name)

# TD3 reads the notebook-level DEVICE; it is defined here so this test section
# does not depend on a cell from the training section having been run.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

eval_episodes = 10
save_env_vid = True
env = gym.make(env_name)

max_episode_step = env._max_episode_steps
if save_env_vid:
    # Same exp/brs/monitor folder as the training section, created here so the
    # name is defined on a fresh kernel.
    monitor_dir = mkdir(mkdir('exp', 'brs'), 'monitor')
    env = wrappers.Monitor(env, monitor_dir, force=True)
    env.reset()

# Seed the environment and the numeric libraries.
env.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

# Rebuild the agent, load the saved weights and evaluate it.
policy = TD3(state_dim, action_dim, max_action)
# No trailing slash: load() joins directory and file name with '/' itself.
policy.load(file_name, "./pytorch_models")
_ = evaluate_policy(policy, episodes=eval_episodes)

TD3--Walker2DBulletEnv-v0--seed(0)
Average award over 10 is: 74.56413918364066
