In [3]:
!git clone https://github.com/benelot/pybullet-gym.git

fatal: destination path 'pybullet-gym' already exists and is not an empty directory.


In [1]:
cd pybullet-gym/

C:\Users\GHOSH\OneDrive\Documents\MS\NEU\Assignmnets\CS5180\pybullet-gym


In [2]:
!pip install -e .

Obtaining file:///C:/Users/GHOSH/OneDrive/Documents/MS/NEU/Assignmnets/CS5180/pybullet-gym
Installing collected packages: pybulletgym
  Attempting uninstall: pybulletgym
    Found existing installation: pybulletgym 0.1
    Uninstalling pybulletgym-0.1:
      Successfully uninstalled pybulletgym-0.1
  Running setup.py develop for pybulletgym
Successfully installed pybulletgym-0.1


In [3]:
!pip3 install box2d-py
!pip install gym[box2d]
import gym
import pybulletgym 



In [4]:
import os
import torch
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn
import numpy as np
from torch.distributions import Normal


def weights_init_(m):
    if isinstance(m, nn.Linear):
        torch.nn.init.xavier_uniform_(m.weight, gain=1)
        torch.nn.init.constant_(m.bias, 0)

class Critic(nn.Module):
    def __init__(self,input_dims,fc1_dims,fc2_dims,action_dims,device='cpu'):
        super(Critic, self).__init__()

        self.fc1 = nn.Linear(input_dims,fc1_dims)
        self.ln1 = nn.LayerNorm(fc1_dims)

        self.fc2 = nn.Linear(fc1_dims,fc2_dims)
        self.ln2 = nn.LayerNorm(fc2_dims)

        self.action_value_layer = nn.Linear(action_dims,fc2_dims)

        self.q = nn.Linear(fc2_dims,1)

        self.device = device
        self.to(self.device)

        self.apply(weights_init_)

    def forward(self, state, action):

      state_value = F.relu(self.ln1(self.fc1(state)))
      state_value = self.ln2(self.fc2(state_value))

      action_value = F.relu(self.action_value_layer(action))

      state_action_value = F.relu(torch.add(state_value,action_value))
      state_action_value = self.q(state_action_value)

      return state_action_value


class GaussianPolicy(nn.Module):
    def __init__(self, input_dims,fc1_dims,fc2_dims,action_dims,device='cpu',log_std_min=-20,log_std_max=2,epsilon=1e-6):
        super(GaussianPolicy, self).__init__()

        self.linear1 = nn.Linear(input_dims,fc1_dims)
        self.ln1 = nn.LayerNorm(fc1_dims)

        self.linear2 = nn.Linear(fc1_dims,fc2_dims)
        self.ln2 = nn.LayerNorm(fc2_dims)

        self.mean_linear = nn.Linear(fc2_dims,action_dims)
        self.log_std_linear = nn.Linear(fc2_dims,action_dims)

        self.log_std_min = log_std_min
        self.log_std_max = log_std_max
        self.epsilon = epsilon

        self.device = torch.device(device)
        self.to(self.device)

        self.apply(weights_init_)


    def forward(self, state):
        x = F.relu(self.ln1(self.linear1(state)))
        x = F.relu(self.ln2(self.linear2(x)))
        mean = self.mean_linear(x)
        log_std = self.log_std_linear(x)

        log_std = torch.clamp(log_std, min=self.log_std_min, max=self.log_std_max)

        return mean, log_std


    def sample(self, state):
        mean, log_std = self.forward(state)
        std = log_std.exp()

        normal = Normal(mean, std)
        x_t = normal.rsample()
        action = torch.tanh(x_t)
        log_prob = normal.log_prob(x_t)

        # Enforcing Action Bound
        log_prob -= torch.log((1 - action.pow(2)) + self.epsilon)
        log_prob = log_prob.sum(1, keepdim=True)
        mean = torch.tanh(mean)
        return action, log_prob, mean



class Agent(object):
    def __init__(self,input_dims,fc1_dims,fc2_dims,action_dims,gamma,tau,SAC_alpha,actor_alpha,critic_alpha,SAC_alpha_lr,batch_size,n_update_iter,tune_alpha=True,buffer_size = 1000000,device='cpu'):

        self.gamma = gamma
        self.tau = tau
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.action_dims = action_dims
        self.tune_alpha = tune_alpha
        self.n_update_iter = n_update_iter

        self.device = torch.device(device)

        self.critic_1 = Critic(input_dims=input_dims,fc1_dims=fc1_dims,fc2_dims=fc2_dims,action_dims=action_dims,device=device)
        self.target_critic_1 = Critic(input_dims=input_dims,fc1_dims=fc1_dims,fc2_dims=fc2_dims,action_dims=action_dims,device=device)

        self.critic_2 = Critic(input_dims=input_dims,fc1_dims=fc1_dims,fc2_dims=fc2_dims,action_dims=action_dims,device=device)
        self.target_critic_2 = Critic(input_dims=input_dims,fc1_dims=fc1_dims,fc2_dims=fc2_dims,action_dims=action_dims,device=device)

        self.critic_optimizer_1 = optim.Adam(self.critic_1.parameters(), lr=critic_alpha)
        self.critic_optimizer_2 = optim.Adam(self.critic_2.parameters(), lr=critic_alpha)

        self.actor = GaussianPolicy(input_dims=input_dims,fc1_dims=fc1_dims,fc2_dims=fc2_dims,action_dims=action_dims,device=device)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_alpha)

        self.learn_counter = 0
        self.buffer_counter = 0

        self.state_buffer = np.zeros((self.buffer_size, input_dims))
        self.next_state_buffer = np.zeros((self.buffer_size, input_dims))
        self.action_buffer = np.zeros((self.buffer_size,self.action_dims))
        self.reward_buffer = np.zeros((self.buffer_size,))
        self.terminal_buffer = np.zeros((self.buffer_size,), dtype=np.float32)

        if self.tune_alpha:
            self.target_entropy = -torch.prod(torch.Tensor(action_dims).to(self.device)).item()
            self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
            self.alpha_optim = optim.Adam([self.log_alpha], lr=SAC_alpha_lr)
            self.alpha = self.log_alpha.exp().detach()
        else:
            self.alpha = SAC_alpha

        self.update_network_parameters(tau=1)

    def store_transitions(self,state,action,reward,next_state,terminal):
      idx = self.buffer_counter % self.buffer_size

      self.state_buffer[idx] = state
      self.action_buffer[idx] = action
      self.reward_buffer[idx] = reward
      self.next_state_buffer[idx] = next_state
      self.terminal_buffer[idx] = float(1 - terminal)

      self.buffer_counter += 1

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        critic_params_1 = self.critic_1.named_parameters()
        critic_params_2 = self.critic_2.named_parameters()
        target_critic_params_1 = self.target_critic_1.named_parameters()
        target_critic_params_2 = self.target_critic_2.named_parameters()

        critic_state_dict_1 = dict(critic_params_1)
        critic_state_dict_2 = dict(critic_params_2)
        target_critic_dict_1 = dict(target_critic_params_1)
        target_critic_dict_2 = dict(target_critic_params_2)

        for name in critic_state_dict_1:
            critic_state_dict_1[name] = tau*critic_state_dict_1[name].clone() +(1-tau)*target_critic_dict_1[name].clone()
        self.target_critic_1.load_state_dict(critic_state_dict_1)

        for name in critic_state_dict_2:
            critic_state_dict_2[name] = tau*critic_state_dict_2[name].clone() +(1-tau)*target_critic_dict_2[name].clone()
        self.target_critic_2.load_state_dict(critic_state_dict_2)


    def choose_action(self, state, evaluate=False):
        state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
        action, _, _ = self.actor.sample(state)

        return action.detach().cpu().numpy()[0]


    def update(self):

        if self.buffer_counter > self.batch_size:
            for n in range(self.n_update_iter):

                if self.buffer_counter > self.buffer_size:
                    max_mem = self.buffer_size
                else:
                    max_mem = self.buffer_counter


                batch_indices = np.random.choice(max_mem, self.batch_size)

                state_batch = torch.Tensor(self.state_buffer[batch_indices]).to(self.device)
                action_batch = torch.Tensor(self.action_buffer[batch_indices]).to(self.device)
                reward_batch = torch.Tensor(self.reward_buffer[batch_indices]).to(self.device).reshape((self.batch_size,1))
                next_state_batch = torch.Tensor(self.next_state_buffer[batch_indices]).to(self.device)
                terminal_batch = torch.Tensor(self.terminal_buffer[batch_indices]).to(self.device).reshape((self.batch_size,1))

                next_action_batch,next_logprobs,_ = self.actor.sample(next_state_batch)
                next_action_batch = next_action_batch.detach()
                next_logprobs = next_logprobs.detach()
                next_critic_value_1 = self.target_critic_1(next_state_batch,next_action_batch).detach()
                next_critic_value_2 = self.target_critic_2(next_state_batch,next_action_batch).detach()
                next_critic_value = torch.min(next_critic_value_1,next_critic_value_2) - self.alpha * next_logprobs

                critic_target = reward_batch + terminal_batch * self.gamma * (next_critic_value)

                critic_value_1 = self.critic_1(state_batch, action_batch)
                critic_value_2 = self.critic_2(state_batch, action_batch)
                critic_1_loss = F.mse_loss(critic_value_1, critic_target)
                critic_2_loss = F.mse_loss(critic_value_2, critic_target)

                self.critic_optimizer_1.zero_grad()
                critic_1_loss.backward()
                self.critic_optimizer_1.step()

                self.critic_optimizer_2.zero_grad()
                critic_2_loss.backward()
                self.critic_optimizer_2.step()

                mu,log_probs, _ = self.actor.sample(state_batch)

                critic_val_1 = self.critic_1(state_batch,mu)
                critic_val_2 = self.critic_2(state_batch,mu)
                min_critic_val = torch.min(critic_val_1,critic_val_2)

                if (n+1) % 2 == 0:
                  actor_loss = ((self.alpha * log_probs) - min_critic_val)
                  self.actor_optimizer.zero_grad()
                  actor_loss.mean().backward()
                  self.actor_optimizer.step()

                  self.update_network_parameters()

                if self.tune_alpha:
                    alpha_loss = -(self.log_alpha * (log_probs + self.target_entropy).detach()).mean()

                    self.alpha_optim.zero_grad()
                    alpha_loss.backward()
                    self.alpha_optim.step()
                    self.alpha = self.log_alpha.exp().detach()


In [5]:
env_name = "HalfCheetahPyBulletEnv-v0"
env = gym.make(env_name)
input_dims = env.observation_space.shape[0]
action_dims = env.action_space.shape[0]

num_episodes = 1000
fc1_dims=512
fc2_dims=256
batch_size=128
gamma = 0.99
tau = 0.005
buffer_size = 100000
SAC_alpha = 0.5
n_update_iter = 4
actor_alpha = 0.0003
critic_alpha = 0.0003
SAC_alpha_lr = 0.0003
tune_alpha = True
device='cuda'




SAC_agent = Agent(input_dims=input_dims,fc1_dims=fc1_dims,fc2_dims=fc2_dims,action_dims=action_dims,gamma=gamma,tau=tau,
                  SAC_alpha=SAC_alpha,actor_alpha=actor_alpha,critic_alpha=critic_alpha,SAC_alpha_lr=SAC_alpha_lr,
                  batch_size=batch_size,buffer_size=buffer_size,tune_alpha=tune_alpha,n_update_iter=n_update_iter,device=device)

scores = []
Scores_Array = np.zeros(num_episodes,)
for i in range(num_episodes):
  done = False
  score = 0
  observation = env.reset()

  while not done:
    action = SAC_agent.choose_action(observation)
    observation_,reward,done,_ = env.step(action)
    SAC_agent.store_transitions(observation,action,reward,observation_,done)
    score += reward
    observation = observation_
    
    SAC_agent.update()
  
  scores.append(score)

  avg_score = np.mean(scores[max(0, i-10):(i+1)])
  avg_score_100 = np.mean(scores[max(0, i-100):(i+1)])
  print('episode: ', i+1,'score: ', score,' average_score_10 %.3f' % avg_score,' average_score_100 %.3f' % avg_score_100)
  print()


Scores_Array = np.array(scores)
print(Scores_Array)

  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")
  deprecation(
  deprecation(


WalkerBase::__init__


  logger.warn(
  logger.warn(
  logger.warn(
  logger.deprecation(


episode:  1 score:  2.1644243959904985  average_score_10 2.164  average_score_100 2.164

episode:  2 score:  1.1792609014970359  average_score_10 1.672  average_score_100 1.672

episode:  3 score:  0.03184525761171386  average_score_10 1.125  average_score_100 1.125

episode:  4 score:  4.455743435889599  average_score_10 1.958  average_score_100 1.958

episode:  5 score:  -5.250475453259423  average_score_10 0.516  average_score_100 0.516

episode:  6 score:  9.298969254511757  average_score_10 1.980  average_score_100 1.980

episode:  7 score:  5.552340615825961  average_score_10 2.490  average_score_100 2.490

episode:  8 score:  -9.71846540044062  average_score_10 0.964  average_score_100 0.964

episode:  9 score:  1.9033256240276386  average_score_10 1.069  average_score_100 1.069

episode:  10 score:  -1.1882615869661097  average_score_10 0.843  average_score_100 0.843

episode:  11 score:  1.578041042450059  average_score_10 0.910  average_score_100 0.910

episode:  12 score:  -

episode:  93 score:  12.651339513256973  average_score_10 14.061  average_score_100 5.696

episode:  94 score:  23.052924360748143  average_score_10 15.364  average_score_100 5.881

episode:  95 score:  21.624975968924996  average_score_10 16.891  average_score_100 6.046

episode:  96 score:  31.18904237258103  average_score_10 19.222  average_score_100 6.308

episode:  97 score:  17.75320084610721  average_score_10 20.206  average_score_100 6.426

episode:  98 score:  10.51034548918833  average_score_10 18.651  average_score_100 6.468

episode:  99 score:  41.20081944827689  average_score_10 20.937  average_score_100 6.819

episode:  100 score:  9.980529333067535  average_score_10 20.708  average_score_100 6.850

episode:  101 score:  25.72830501275748  average_score_10 21.132  average_score_100 7.037

episode:  102 score:  54.03345133194817  average_score_10 24.627  average_score_100 7.551

episode:  103 score:  42.7448411759513  average_score_10 26.406  average_score_100 7.962

epis

episode:  181 score:  725.6034325924525  average_score_10 291.837  average_score_100 330.688

episode:  182 score:  119.44670421303012  average_score_10 295.742  average_score_100 331.787

episode:  183 score:  78.64544486064698  average_score_10 296.889  average_score_100 332.472

episode:  184 score:  82.15593540819977  average_score_10 300.317  average_score_100 333.199

episode:  185 score:  101.42630121624248  average_score_10 291.581  average_score_100 334.156

episode:  186 score:  111.35885022973531  average_score_10 286.681  average_score_100 335.203

episode:  187 score:  310.79216398261485  average_score_10 306.561  average_score_100 338.212

episode:  188 score:  48.00890854409664  average_score_10 299.493  average_score_100 338.414

episode:  189 score:  88.19968481968968  average_score_10 243.090  average_score_100 339.128

episode:  190 score:  89.2274833142669  average_score_10 223.677  average_score_100 339.888

episode:  191 score:  53.283434732782176  average_score_1

episode:  269 score:  752.4717417374594  average_score_10 472.373  average_score_100 404.835

episode:  270 score:  776.6475638440985  average_score_10 496.537  average_score_100 407.936

episode:  271 score:  785.7558839653675  average_score_10 500.577  average_score_100 413.686

episode:  272 score:  780.8665656869493  average_score_10 501.558  average_score_100 420.660

episode:  273 score:  782.8152637199336  average_score_10 504.231  average_score_100 427.757

episode:  274 score:  779.7758724339882  average_score_10 569.226  average_score_100 435.037

episode:  275 score:  779.0701169899149  average_score_10 569.492  average_score_100 440.795

episode:  276 score:  757.6598616444696  average_score_10 637.137  average_score_100 446.660

episode:  277 score:  727.3334286367683  average_score_10 633.302  average_score_100 452.950

episode:  278 score:  786.3386676358343  average_score_10 701.546  average_score_100 459.490

episode:  279 score:  804.8928835902504  average_score_10 77

episode:  357 score:  110.67473288920006  average_score_10 512.861  average_score_100 596.192

episode:  358 score:  110.30979986721654  average_score_10 500.876  average_score_100 589.875

episode:  359 score:  113.7336248044303  average_score_10 496.461  average_score_100 590.778

episode:  360 score:  162.98336740271037  average_score_10 453.290  average_score_100 587.334

episode:  361 score:  84.8008689812137  average_score_10 403.777  average_score_100 580.834

episode:  362 score:  872.7151082288042  average_score_10 425.297  average_score_100 581.850

episode:  363 score:  349.37164214612096  average_score_10 397.341  average_score_100 577.850

episode:  364 score:  868.6137598291164  average_score_10 418.658  average_score_100 585.808

episode:  365 score:  845.7436080735907  average_score_10 439.184  average_score_100 586.497

episode:  366 score:  839.992879979624  average_score_10 457.073  average_score_100 594.680

episode:  367 score:  817.9898252123011  average_score_10 

episode:  445 score:  670.207054467229  average_score_10 687.378  average_score_100 663.645

episode:  446 score:  660.6837718202966  average_score_10 712.015  average_score_100 667.602

episode:  447 score:  731.1107399285347  average_score_10 700.507  average_score_100 671.038

episode:  448 score:  669.0794155052363  average_score_10 760.255  average_score_100 675.265

episode:  449 score:  682.0587787401186  average_score_10 742.809  average_score_100 680.411

episode:  450 score:  700.1924513525197  average_score_10 727.376  average_score_100 681.029

episode:  451 score:  694.9511948855055  average_score_10 711.330  average_score_100 681.677

episode:  452 score:  884.4463299985589  average_score_10 712.128  average_score_100 684.137

episode:  453 score:  921.5580191312217  average_score_10 722.659  average_score_100 686.758

episode:  454 score:  893.2592253530394  average_score_10 743.383  average_score_100 689.323

episode:  455 score:  893.4970474979672  average_score_10 763

episode:  533 score:  874.0760088189118  average_score_10 717.080  average_score_100 518.421

episode:  534 score:  847.6627575870544  average_score_10 734.456  average_score_100 518.150

episode:  535 score:  856.617793193306  average_score_10 751.922  average_score_100 517.953

episode:  536 score:  861.8608635372823  average_score_10 769.393  average_score_100 522.628

episode:  537 score:  857.6531941521049  average_score_10 784.790  average_score_100 522.628

episode:  538 score:  742.9555636860607  average_score_10 791.016  average_score_100 529.866

episode:  539 score:  774.1891710913487  average_score_10 799.443  average_score_100 528.879

episode:  540 score:  884.1976702536151  average_score_10 811.354  average_score_100 529.019

episode:  541 score:  879.0477338059725  average_score_10 811.926  average_score_100 529.095

episode:  542 score:  837.5520875740705  average_score_10 809.833  average_score_100 528.717

episode:  543 score:  119.90335621621924  average_score_10 77

episode:  621 score:  521.7717508601255  average_score_10 564.291  average_score_100 503.371

episode:  622 score:  601.7447469910423  average_score_10 559.114  average_score_100 508.036

episode:  623 score:  692.8531649879378  average_score_10 568.880  average_score_100 508.575

episode:  624 score:  868.7972753906064  average_score_10 592.426  average_score_100 510.677

episode:  625 score:  853.3386579034587  average_score_10 613.857  average_score_100 512.547

episode:  626 score:  827.5740853912591  average_score_10 680.717  average_score_100 514.110

episode:  627 score:  234.74341946182824  average_score_10 649.071  average_score_100 509.620

episode:  628 score:  840.1933338145164  average_score_10 670.851  average_score_100 511.260

episode:  629 score:  628.0555720483388  average_score_10 661.928  average_score_100 510.731

episode:  630 score:  610.7485388740251  average_score_10 660.832  average_score_100 509.321

episode:  631 score:  637.7375828923535  average_score_10 6

episode:  709 score:  889.8920415784063  average_score_10 757.414  average_score_100 652.082

episode:  710 score:  876.4086258037161  average_score_10 755.886  average_score_100 654.705

episode:  711 score:  880.4396806301804  average_score_10 771.364  average_score_100 656.646

episode:  712 score:  878.9187647162714  average_score_10 843.527  average_score_100 658.827

episode:  713 score:  917.4352795393477  average_score_10 850.144  average_score_100 662.114

episode:  714 score:  888.9020184530829  average_score_10 863.333  average_score_100 664.878

episode:  715 score:  926.9055247320656  average_score_10 872.050  average_score_100 667.940

episode:  716 score:  909.8672916705841  average_score_10 896.976  average_score_100 676.037

episode:  717 score:  939.6996524540886  average_score_10 899.604  average_score_100 679.570

episode:  718 score:  939.1905815291128  average_score_10 902.389  average_score_100 682.922

episode:  719 score:  931.5984708992735  average_score_10 90

episode:  797 score:  22.18026774744067  average_score_10 334.078  average_score_100 606.926

episode:  798 score:  10.574140286153122  average_score_10 252.914  average_score_100 598.345

episode:  799 score:  11.369048985010883  average_score_10 172.940  average_score_100 589.882

episode:  800 score:  17.797915042150997  average_score_10 91.196  average_score_100 581.214

episode:  801 score:  20.60841891883174  average_score_10 13.423  average_score_100 574.387

episode:  802 score:  11.42130110550934  average_score_10 13.510  average_score_100 573.657

episode:  803 score:  12.595205823623111  average_score_10 13.803  average_score_100 565.419

episode:  804 score:  13.750702825325426  average_score_10 14.155  average_score_100 558.191

episode:  805 score:  40.513755219563606  average_score_10 16.708  average_score_100 550.364

episode:  806 score:  30.05566623830383  average_score_10 18.660  average_score_100 544.368

episode:  807 score:  59.429883719266236  average_score_10 22

episode:  885 score:  28.773506368711242  average_score_10 564.091  average_score_100 602.901

episode:  886 score:  100.15388619243456  average_score_10 519.296  average_score_100 594.929

episode:  887 score:  37.67791514574929  average_score_10 473.720  average_score_100 586.357

episode:  888 score:  29.285291347044403  average_score_10 419.736  average_score_100 577.703

episode:  889 score:  32.22386916104298  average_score_10 372.418  average_score_100 569.199

episode:  890 score:  35.918898425150836  average_score_10 319.820  average_score_100 560.476

episode:  891 score:  21.826656208198976  average_score_10 261.867  average_score_100 552.018

episode:  892 score:  45.351187864999524  average_score_10 213.270  average_score_100 552.363

episode:  893 score:  663.7051270250608  average_score_10 211.632  average_score_100 558.842

episode:  894 score:  158.66102302486843  average_score_10 164.941  average_score_100 560.315

episode:  895 score:  52.3198655600063  average_score

episode:  972 score:  489.8052978101638  average_score_10 180.765  average_score_100 599.037

episode:  973 score:  548.8363416991081  average_score_10 229.460  average_score_100 599.070

episode:  974 score:  555.9096797148616  average_score_10 278.708  average_score_100 601.656

episode:  975 score:  60.9424403416706  average_score_10 263.085  average_score_100 597.802

episode:  976 score:  475.15130368435985  average_score_10 304.752  average_score_100 596.636

episode:  977 score:  504.1822271942988  average_score_10 322.573  average_score_100 596.291

episode:  978 score:  571.4735313155905  average_score_10 363.872  average_score_100 595.780

episode:  979 score:  533.8620318354191  average_score_10 396.862  average_score_100 595.593

episode:  980 score:  570.3407903162149  average_score_10 447.500  average_score_100 595.156

episode:  981 score:  61.8437714408574  average_score_10 452.372  average_score_100 589.240

episode:  982 score:  99.197904096634  average_score_10 406.5

In [12]:
env_name = "HopperPyBulletEnv-v0"
env = gym.make(env_name)
input_dims = env.observation_space.shape[0]
action_dims = env.action_space.shape[0]

num_episodes = 1000
fc1_dims=512
fc2_dims=256
batch_size=128
gamma = 0.99
tau = 0.005
buffer_size = 100000
SAC_alpha = 0.5
n_update_iter = 4
actor_alpha = 0.0003
critic_alpha = 0.0003
SAC_alpha_lr = 0.0003
tune_alpha = True
device='cuda'




SAC_agent = Agent(input_dims=input_dims,fc1_dims=fc1_dims,fc2_dims=fc2_dims,action_dims=action_dims,gamma=gamma,tau=tau,
                  SAC_alpha=SAC_alpha,actor_alpha=actor_alpha,critic_alpha=critic_alpha,SAC_alpha_lr=SAC_alpha_lr,
                  batch_size=batch_size,buffer_size=buffer_size,tune_alpha=tune_alpha,n_update_iter=n_update_iter,device=device)

scores = []
Scores_Array = np.zeros(num_episodes,)
for i in range(num_episodes):
  done = False
  score = 0
  observation = env.reset()

  while not done:
    action = SAC_agent.choose_action(observation)
    observation_,reward,done,_ = env.step(action)
    SAC_agent.store_transitions(observation,action,reward,observation_,done)
    score += reward
    observation = observation_
    
    SAC_agent.update()
  
  scores.append(score)

  avg_score = np.mean(scores[max(0, i-10):(i+1)])
  avg_score_100 = np.mean(scores[max(0, i-100):(i+1)])
  print('episode: ', i+1,'score: ', score,' average_score_10 %.3f' % avg_score,' average_score_100 %.3f' % avg_score_100)
  print()


Scores_Array = np.array(scores)
print(Scores_Array)

WalkerBase::__init__
episode:  1 score:  14.236600896986783  average_score_10 14.237  average_score_100 14.237

episode:  2 score:  16.10401071917877  average_score_10 15.170  average_score_100 15.170

episode:  3 score:  10.828493853523105  average_score_10 13.723  average_score_100 13.723

episode:  4 score:  12.312081466676318  average_score_10 13.370  average_score_100 13.370

episode:  5 score:  12.593231376621407  average_score_10 13.215  average_score_100 13.215

episode:  6 score:  17.487590789745443  average_score_10 13.927  average_score_100 13.927

episode:  7 score:  10.659769825387047  average_score_10 13.460  average_score_100 13.460

episode:  8 score:  17.720899612088395  average_score_10 13.993  average_score_100 13.993

episode:  9 score:  15.42732980959263  average_score_10 14.152  average_score_100 14.152

episode:  10 score:  15.125462889121263  average_score_10 14.250  average_score_100 14.250

episode:  11 score:  19.25662523935025  average_score_10 14.705  avera

episode:  91 score:  18.261917296619504  average_score_10 18.080  average_score_100 17.424

episode:  92 score:  18.339268272796474  average_score_10 18.287  average_score_100 17.434

episode:  93 score:  17.96855632664083  average_score_10 18.342  average_score_100 17.440

episode:  94 score:  18.95080423950858  average_score_10 18.316  average_score_100 17.456

episode:  95 score:  20.21994231984136  average_score_10 18.601  average_score_100 17.485

episode:  96 score:  22.821617206918017  average_score_10 18.948  average_score_100 17.541

episode:  97 score:  19.266824234789237  average_score_10 19.081  average_score_100 17.558

episode:  98 score:  19.159506454119402  average_score_10 19.008  average_score_100 17.575

episode:  99 score:  20.335923640255352  average_score_10 19.299  average_score_100 17.603

episode:  100 score:  21.39339011858392  average_score_10 19.440  average_score_100 17.641

episode:  101 score:  33.23051946301566  average_score_10 20.904  average_score_100

episode:  180 score:  195.07130626654393  average_score_10 191.799  average_score_100 68.834

episode:  181 score:  215.74538185007268  average_score_10 201.445  average_score_100 70.807

episode:  182 score:  170.59642047882377  average_score_10 206.105  average_score_100 72.338

episode:  183 score:  236.21053622502805  average_score_10 216.958  average_score_100 74.504

episode:  184 score:  381.83029657510036  average_score_10 241.243  average_score_100 78.094

episode:  185 score:  213.68848175178582  average_score_10 247.742  average_score_100 80.041

episode:  186 score:  701.559190140554  average_score_10 297.848  average_score_100 86.799

episode:  187 score:  342.05693377690915  average_score_10 310.390  average_score_100 90.009

episode:  188 score:  206.87943017569202  average_score_10 282.571  average_score_100 91.860

episode:  189 score:  355.2712385330086  average_score_10 293.192  average_score_100 95.208

episode:  190 score:  174.07571198043155  average_score_10 290.

episode:  267 score:  1600.591974939924  average_score_10 1465.615  average_score_100 1064.902

episode:  268 score:  1573.7415611109334  average_score_10 1461.697  average_score_100 1079.618

episode:  269 score:  1548.3625379620414  average_score_10 1461.662  average_score_100 1093.870

episode:  270 score:  1565.9200274236182  average_score_10 1463.398  average_score_100 1108.491

episode:  271 score:  1548.8780354051453  average_score_10 1456.877  average_score_100 1122.741

episode:  272 score:  1620.2791474181302  average_score_10 1461.044  average_score_100 1137.601

episode:  273 score:  675.2540390506692  average_score_10 1377.084  average_score_100 1143.130

episode:  274 score:  1606.5311770555336  average_score_10 1381.325  average_score_100 1157.901

episode:  275 score:  1564.0817055616492  average_score_10 1381.256  average_score_100 1171.979

episode:  276 score:  1649.5706960617883  average_score_10 1382.769  average_score_100 1186.823

episode:  277 score:  1602.05390

episode:  352 score:  1666.4966481748827  average_score_10 1623.692  average_score_100 1474.986

episode:  353 score:  1733.5893401954024  average_score_10 1627.267  average_score_100 1476.370

episode:  354 score:  1701.8629455695761  average_score_10 1625.617  average_score_100 1477.824

episode:  355 score:  1676.389521941008  average_score_10 1619.384  average_score_100 1478.855

episode:  356 score:  402.15774918625544  average_score_10 1543.669  average_score_100 1467.050

episode:  357 score:  1710.5706673209324  average_score_10 1573.597  average_score_100 1468.592

episode:  358 score:  1675.5792809750167  average_score_10 1571.384  average_score_100 1469.174

episode:  359 score:  1677.6545675222337  average_score_10 1572.944  average_score_100 1470.450

episode:  360 score:  1699.7506406984194  average_score_10 1577.384  average_score_100 1471.964

episode:  361 score:  1719.5520483233524  average_score_10 1581.560  average_score_100 1472.944

episode:  362 score:  1767.0873

episode:  437 score:  1821.5799269031916  average_score_10 1701.738  average_score_100 1650.852

episode:  438 score:  1834.7085191130304  average_score_10 1709.690  average_score_100 1652.505

episode:  439 score:  1769.6778241471877  average_score_10 1708.778  average_score_100 1669.701

episode:  440 score:  1774.9520648299638  average_score_10 1705.723  average_score_100 1671.027

episode:  441 score:  1789.739577449236  average_score_10 1704.693  average_score_100 1675.232

episode:  442 score:  1807.7924023198943  average_score_10 1711.152  average_score_100 1678.030

episode:  443 score:  1795.8570382487344  average_score_10 1707.409  average_score_100 1679.035

episode:  444 score:  1820.9301645726293  average_score_10 1711.686  average_score_100 1680.035

episode:  445 score:  1813.3985282575138  average_score_10 1714.153  average_score_100 1680.712

episode:  446 score:  1829.554761346252  average_score_10 1804.974  average_score_100 1686.599

episode:  447 score:  1750.29424

episode:  522 score:  1849.8303144827369  average_score_10 1739.525  average_score_100 1729.399

episode:  523 score:  1178.286104787827  average_score_10 1679.574  average_score_100 1722.913

episode:  524 score:  1862.3247665390754  average_score_10 1680.756  average_score_100 1723.547

episode:  525 score:  1842.1703542850496  average_score_10 1684.477  average_score_100 1724.121

episode:  526 score:  235.6578066396804  average_score_10 1536.467  average_score_100 1708.498

episode:  527 score:  1836.9680711123099  average_score_10 1535.963  average_score_100 1708.579

episode:  528 score:  1845.9686429431051  average_score_10 1641.351  average_score_100 1709.557

episode:  529 score:  1767.979350365574  average_score_10 1632.210  average_score_100 1709.441

episode:  530 score:  1827.9721058072726  average_score_10 1634.006  average_score_100 1709.633

episode:  531 score:  1831.592855554923  average_score_10 1629.259  average_score_100 1709.935

episode:  532 score:  1808.1206070

episode:  607 score:  1822.3384588096735  average_score_10 1462.240  average_score_100 1727.635

episode:  608 score:  1816.9708069135693  average_score_10 1456.200  average_score_100 1727.623

episode:  609 score:  1845.9320813800032  average_score_10 1460.051  average_score_100 1727.922

episode:  610 score:  1881.6991889212127  average_score_10 1563.898  average_score_100 1727.700

episode:  611 score:  1865.276593217099  average_score_10 1568.740  average_score_100 1727.794

episode:  612 score:  1869.4636459301864  average_score_10 1699.357  average_score_100 1728.199

episode:  613 score:  1879.539982542347  average_score_10 1702.550  average_score_100 1728.613

episode:  614 score:  1861.1001928561766  average_score_10 1852.871  average_score_100 1728.729

episode:  615 score:  1848.9222442007754  average_score_10 1852.805  average_score_100 1729.201

episode:  616 score:  1857.756248274526  average_score_10 1851.274  average_score_100 1729.142

episode:  617 score:  1887.321800

episode:  692 score:  1846.8746390399233  average_score_10 1815.864  average_score_100 1801.344

episode:  693 score:  1913.052150856362  average_score_10 1868.635  average_score_100 1802.388

episode:  694 score:  1726.240489351251  average_score_10 1849.256  average_score_100 1801.206

episode:  695 score:  1895.1501116020393  average_score_10 1856.974  average_score_100 1801.816

episode:  696 score:  1903.4831292182514  average_score_10 1860.276  average_score_100 1802.633

episode:  697 score:  1881.8449827534264  average_score_10 1865.740  average_score_100 1803.362

episode:  698 score:  1907.5885611710764  average_score_10 1868.934  average_score_100 1803.601

episode:  699 score:  1956.8173253680125  average_score_10 1877.978  average_score_100 1805.119

episode:  700 score:  1928.0111744330586  average_score_10 1885.424  average_score_100 1816.887

episode:  701 score:  1911.687337634581  average_score_10 1889.719  average_score_100 1817.874

episode:  702 score:  1887.250669

episode:  777 score:  1942.090365106783  average_score_10 1943.976  average_score_100 1829.755

episode:  778 score:  1974.1024954322158  average_score_10 1948.327  average_score_100 1838.405

episode:  779 score:  1947.2669648707274  average_score_10 1947.742  average_score_100 1839.227

episode:  780 score:  1981.7338780202258  average_score_10 1950.632  average_score_100 1840.431

episode:  781 score:  1953.2052103987207  average_score_10 1952.032  average_score_100 1841.065

episode:  782 score:  1972.6939848015252  average_score_10 1956.677  average_score_100 1842.278

episode:  783 score:  1960.5201558530118  average_score_10 1957.606  average_score_100 1848.495

episode:  784 score:  1926.6937816595073  average_score_10 1955.223  average_score_100 1848.370

episode:  785 score:  1938.4684063019897  average_score_10 1952.868  average_score_100 1849.639

episode:  786 score:  1963.7382966601845  average_score_10 1954.327  average_score_100 1850.595

episode:  787 score:  1907.2400

episode:  862 score:  1063.4608510153842  average_score_10 1758.054  average_score_100 1842.356

episode:  863 score:  1617.0180925387103  average_score_10 1722.229  average_score_100 1839.594

episode:  864 score:  1955.5044677184444  average_score_10 1718.696  average_score_100 1839.836

episode:  865 score:  1926.1977114321169  average_score_10 1719.796  average_score_100 1840.111

episode:  866 score:  1941.7838255455076  average_score_10 1719.564  average_score_100 1839.662

episode:  867 score:  1914.3080255780235  average_score_10 1714.343  average_score_100 1839.619

episode:  868 score:  1895.12424973346  average_score_10 1704.371  average_score_100 1839.311

episode:  869 score:  1963.1154683296506  average_score_10 1705.397  average_score_100 1839.404

episode:  870 score:  1947.1905662368695  average_score_10 1711.373  average_score_100 1839.377

episode:  871 score:  1956.919191987074  average_score_10 1710.812  average_score_100 1839.566

episode:  872 score:  1896.536215

episode:  947 score:  1969.1149613205428  average_score_10 1964.708  average_score_100 1880.313

episode:  948 score:  1978.2895268413722  average_score_10 1967.274  average_score_100 1881.000

episode:  949 score:  2008.0488072388625  average_score_10 1979.756  average_score_100 1881.243

episode:  950 score:  1973.9376259943763  average_score_10 1977.002  average_score_100 1881.886

episode:  951 score:  1961.882771416171  average_score_10 1975.678  average_score_100 1882.219

episode:  952 score:  1988.0481121733062  average_score_10 1972.963  average_score_100 1882.955

episode:  953 score:  1987.1309767782186  average_score_10 1974.755  average_score_100 1882.718

episode:  954 score:  2035.1978912899428  average_score_10 1983.448  average_score_100 1883.122

episode:  955 score:  1993.805985322344  average_score_10 1982.710  average_score_100 1883.911

episode:  956 score:  1986.6933999578368  average_score_10 1985.085  average_score_100 1884.330

episode:  957 score:  2009.57290

In [None]:
env_name = "AntPyBulletEnv-v0"
env = gym.make(env_name)
input_dims = env.observation_space.shape[0]
action_dims = env.action_space.shape[0]

num_episodes = 5000
fc1_dims=512
fc2_dims=256
batch_size=128
gamma = 0.99
tau = 0.005
buffer_size = 100000
SAC_alpha = 0.5
n_update_iter = 4
actor_alpha = 0.0003
critic_alpha = 0.0003
SAC_alpha_lr = 0.0003
tune_alpha = True
device='cuda'




SAC_agent = Agent(input_dims=input_dims,fc1_dims=fc1_dims,fc2_dims=fc2_dims,action_dims=action_dims,gamma=gamma,tau=tau,
                  SAC_alpha=SAC_alpha,actor_alpha=actor_alpha,critic_alpha=critic_alpha,SAC_alpha_lr=SAC_alpha_lr,
                  batch_size=batch_size,buffer_size=buffer_size,tune_alpha=tune_alpha,n_update_iter=n_update_iter,device=device)

scores = []
Scores_Array = np.zeros(num_episodes,)
for i in range(num_episodes):
  done = False
  score = 0
  observation = env.reset()

  while not done:
    action = SAC_agent.choose_action(observation)
    observation_,reward,done,_ = env.step(action)
    SAC_agent.store_transitions(observation,action,reward,observation_,done)
    score += reward
    observation = observation_
    
    SAC_agent.update()
  
  scores.append(score)

  avg_score = np.mean(scores[max(0, i-10):(i+1)])
  avg_score_100 = np.mean(scores[max(0, i-100):(i+1)])
  print('episode: ', i+1,'score: ', score,' average_score_10 %.3f' % avg_score,' average_score_100 %.3f' % avg_score_100)
  print()


Scores_Array = np.array(scores)
print(Scores_Array)

WalkerBase::__init__
episode:  1 score:  594.9027326860905  average_score_10 594.903  average_score_100 594.903

episode:  2 score:  368.328223433452  average_score_10 481.615  average_score_100 481.615

episode:  3 score:  139.24775660847814  average_score_10 367.493  average_score_100 367.493

episode:  4 score:  460.906144970379  average_score_10 390.846  average_score_100 390.846

episode:  5 score:  39.42550103377871  average_score_10 320.562  average_score_100 320.562

episode:  6 score:  360.8938323707513  average_score_10 327.284  average_score_100 327.284

episode:  7 score:  554.507305674708  average_score_10 359.744  average_score_100 359.744

episode:  8 score:  213.19812524446888  average_score_10 341.426  average_score_100 341.426

episode:  9 score:  541.3307531394702  average_score_10 363.638  average_score_100 363.638

episode:  10 score:  43.40175623510004  average_score_10 331.614  average_score_100 331.614

episode:  11 score:  423.02089948555687  average_score_10 3