In [1]:
from gym import make
import pybullet_envs
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import copy
import time
from collections import deque
from torch.distributions.normal import Normal

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Actor

In [3]:
LOG_STD_MIN = -20
LOG_STD_MAX = 2

In [4]:
class Actor(nn.Module):
    def __init__(self, state_size, action_size, hidden_dim, high, low):
        super(Actor, self).__init__()
        self.linear1 = nn.Linear(state_size, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        
        self.mean = nn.Linear(hidden_dim, action_size)
        self.log_std = nn.Linear(hidden_dim, action_size)
        
        self.high = torch.tensor(high).to(device)
        self.low = torch.tensor(low).to(device)
        
        self.action_scale = torch.FloatTensor(
                (high - low) / 2.).to(device)
        self.action_bias = torch.FloatTensor(
                (high + low) / 2.).to(device)
    
    def forward(self, state):
        x = state
        x = F.relu(self.linear1(x))
        x = F.relu(self.linear2(x))
        m = self.mean(x)
        s = self.log_std(x)
        s = torch.clamp(s, min = LOG_STD_MIN, max = LOG_STD_MAX)
        return m, s
    
    def sample(self, state):
        m, s = self.forward(state) 
        std = s.exp()
        normal = Normal(m, std)
        a = normal.rsample()
        
        tanh = torch.tanh(a)
        action = tanh * self.action_scale + self.action_bias
        
        # we got a complicated distribution
        
        logp = normal.log_prob(a)
        logp -= torch.log(self.action_scale * (1 - tanh.pow(2)) + 1e-6)
        logp = logp.sum(1, keepdim=True)
        
        return action, logp

## Critic

In [5]:
class Critic(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim):
        super(Critic, self).__init__()

        # Q1 
        self.linear1 = nn.Linear(state_dim + action_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.linear3 = nn.Linear(hidden_dim, 1)

        # Q2 
        self.linear4 = nn.Linear(state_dim + action_dim, hidden_dim)
        self.linear5 = nn.Linear(hidden_dim, hidden_dim)
        self.linear6 = nn.Linear(hidden_dim, 1)


    def forward(self, state, action):
        state_action = torch.cat([state, action], 1)

        q1 = F.relu(self.linear1(state_action))
        q1 = F.relu(self.linear2(q1))
        q1 = self.linear3(q1)

        q2 = F.relu(self.linear4(state_action))
        q2 = F.relu(self.linear5(q2))
        q2 = self.linear6(q2)
        return q1, q2

## Buffer

In [6]:
class Memory:
    def __init__(self, buffer_size, batch_size):
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.memory = []
        self.position = 0

    def push(self, element):
        if len(self.memory) < self.buffer_size:
            self.memory.append(None)
        self.memory[self.position] = element
        self.position = (self.position + 1) % self.buffer_size

    def sample(self):
        return list(zip(*random.sample(self.memory, self.batch_size)))

    def __len__(self):
        return len(self.memory)

In [7]:
class Sac_agent:
    def __init__(self, state_size, action_size, hidden_dim, high, low, buffer_size, batch_size,
                 gamma, tau,num_updates, update_rate, alpha):
        
         # Actor Network and Target Network
        self.actor = Actor(state_size, action_size,hidden_dim, high, low).to(device)
        self.actor.apply(self.init_weights)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=1e-4)
        

        # Critic Network and Target Network
        self.critic = Critic(state_size, action_size, hidden_dim).to(device)   
        self.critic.apply(self.init_weights)
        self.critic_target = Critic(state_size, action_size, hidden_dim).to(device)        
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=1e-4)
        
        # copy weights
        self.hard_update(self.critic_target, self.critic)
        
        self.state_size = state_size
        self.action_size = action_size
        
        # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2)
        self.target_entropy = -float(self.action_size)
        self.log_alpha = torch.zeros(1, requires_grad=True, device = device)
        self.alpha_optimizer = optim.Adam([self.log_alpha], lr=1e-4)
        
        self.memory = Memory(buffer_size, batch_size)
        self.gamma = gamma
        self.tau = tau
        self.num_updates = num_updates
        self.update_rate = update_rate
        self.alpha = alpha
        
        self.iters = 0
        
    def init_weights(self, layer):
        if type(layer) == nn.Linear:
            nn.init.xavier_normal_(layer.weight)
            nn.init.constant_(layer.bias, 0)
        
        
    def hard_update(self, target, network):
        for target_param, param in zip(target.parameters(), network.parameters()):
            target_param.data.copy_(param.data)
            
    def soft_update(self, target, network):
        for target_param, param in zip(target.parameters(), network.parameters()):
            target_param.data.copy_(self.tau*param.data + (1-self.tau)*target_param.data)
            
    def learn(self, batch):
        for _ in range(self.num_updates):                
            state, action, reward, next_state, mask = batch

            state = torch.tensor(state).to(device).float()
            next_state = torch.tensor(next_state).to(device).float()
            reward = torch.tensor(reward).to(device).float().unsqueeze(1)
            action = torch.tensor(action).to(device).float()
            mask = torch.tensor(mask).to(device).int().unsqueeze(1)
            
            # compute target action
            with torch.no_grad():
                a, logp = self.actor.sample(next_state)
                
                # compute targets
                Q_target1, Q_target2 = self.critic_target(next_state, a) 
                min_Q = torch.min(Q_target1, Q_target2)
                Q_target = reward + self.gamma*mask*(min_Q - self.alpha*logp)

            # update critic       
            critic_1, critic_2 = self.critic(state, action)
            critic_loss1 = F.mse_loss(critic_1, Q_target)
            critic_loss2 = F.mse_loss(critic_2, Q_target)       

            # update actor 
            pi, log_pi = self.actor.sample(state)
            Q1_pi, Q2_pi = self.critic(state, pi)
            min_Q_pi = torch.min(Q1_pi, Q2_pi)
            actor_loss = (self.alpha*log_pi - min_Q_pi).mean()
            
            #gradient steps
            self.critic_optimizer.zero_grad()
            critic_loss1.backward()          
            self.critic_optimizer.step()
            
            self.critic_optimizer.zero_grad()
            critic_loss2.backward()          
            self.critic_optimizer.step()
            
            self.actor_optimizer.zero_grad()
            actor_loss.backward()            
            self.actor_optimizer.step()
            
            # update alpha
            alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()

            self.alpha_optimizer.zero_grad()
            alpha_loss.backward()
            self.alpha_optimizer.step()

            self.alpha = self.log_alpha.exp()

            # update critic_targe
            self.soft_update(self.critic_target, self.critic)
        
    def act(self, state):
        state =  torch.tensor(state).unsqueeze(0).to(device).float()
        action, logp = self.actor.sample(state)
        return action.cpu().data.numpy()[0]
    
    def step(self, state, action, reward, next_state, mask):
        self.iters += 1
        self.memory.push((state, action, reward, next_state, mask))
        if ( len(self.memory) >= self.memory.batch_size ) and ( self.iters % self.update_rate == 0 ):
            self.learn(self.memory.sample())
        
    def save(self):
        torch.save(self.actor.state_dict(), "ant_actor.pkl")
        torch.save(self.critic.state_dict(), "ant_critic.pkl")
        
    def test(self):
        new_env = make("AntBulletEnv-v0")
        new_env.seed(9)
        reward = []
        for i in range(10):
            state = new_env.reset()
            local_reward = 0
            done = False
            while not done:
                action = self.act(state)
                state, r, done, _ = new_env.step(action)
                local_reward += r
            reward.append(local_reward)
        return reward
            

## Ant

In [8]:
env = make("AntBulletEnv-v0")
np.random.seed(0)
env.seed(0)

action_size = env.action_space.shape[0]
print(f'size of each action = {action_size}')
state_size = env.observation_space.shape[0]
print(f'size of state = {state_size}')
low = env.action_space.low
high = env.action_space.high
print(f'low of each action = {low}')
print(f'high of each action = {high}')

size of each action = 8
size of state = 28
low of each action = [-1. -1. -1. -1. -1. -1. -1. -1.]
high of each action = [1. 1. 1. 1. 1. 1. 1. 1.]




## Train the Agent

In [9]:
BUFFER_SIZE = int(1e6)
BATCH_SIZE = 256
GAMMA = 0.99            
TAU = 0.005               
EPISODES = 1500
NUM_UPDATES = 1
UPDATE_RATE = 1
ENTROPY_COEFFICIENT = 0.2

In [None]:
def sac(episodes):
    agent = Sac_agent(state_size = state_size, action_size = action_size, hidden_dim = 256, high = high, low = low, 
                  buffer_size = BUFFER_SIZE, batch_size = BATCH_SIZE, gamma = GAMMA, tau = TAU, 
                  num_updates = NUM_UPDATES, update_rate = UPDATE_RATE, alpha = ENTROPY_COEFFICIENT)
    time_start = time.time()
    reward_list = []
    avg_score_deque = deque(maxlen = 100)
    avg_scores_list = []
    mean_reward = -20000
    for i in range(episodes):
        state = env.reset()
        total_reward = 0
        done = False
        episode_steps = 0
        while not done:
            episode_steps+=1    
            if i < 10:
                action = env.action_space.sample()
            else:
                action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            
            mask = 1 if episode_steps == env._max_episode_steps else float(not done)
            
            agent.step(state, action, reward, next_state, mask)
            total_reward += reward
            state = next_state
            
        reward_list.append(total_reward)
        avg_score_deque.append(total_reward)
        mean = np.mean(avg_score_deque)
        avg_scores_list.append(mean)
        
        if total_reward > 2000:
            r = agent.test()
            local_mean = np.mean(r)
            print(f"episode: {i+1}, steps:{episode_steps}, current reward: {total_reward}, max reward: {np.max(r)}, average reward on test: {local_mean}")
            if local_mean > mean_reward:
                mean_reward = local_mean
                agent.save()
                print("Saved")
        elif (i+1) % 1 == 0:
            s =  (int)(time.time() - time_start)            
            print("Ep.: {}, Ep.Steps: {}, Score: {:.2f}, Avg.Score: {:.2f}, Time: {:02}:{:02}:{:02}".\
            format(i+1, episode_steps, total_reward, mean, \
                  s//3600, s%3600//60, s%60))
            
    return reward_list, avg_scores_list

reward, avg_reward = sac(EPISODES)           

Ep.: 1, Ep.Steps: 1000, Score: 530.82, Avg.Score: 530.82, Time: 00:01:05
Ep.: 2, Ep.Steps: 1000, Score: 484.80, Avg.Score: 507.81, Time: 00:02:39
Ep.: 3, Ep.Steps: 1000, Score: 396.87, Avg.Score: 470.83, Time: 00:04:10
Ep.: 4, Ep.Steps: 447, Score: 217.66, Avg.Score: 407.54, Time: 00:04:47
Ep.: 5, Ep.Steps: 1000, Score: 510.66, Avg.Score: 428.16, Time: 00:06:14
Ep.: 6, Ep.Steps: 1000, Score: 522.79, Avg.Score: 443.93, Time: 00:07:41
Ep.: 7, Ep.Steps: 1000, Score: 508.87, Avg.Score: 453.21, Time: 00:09:25
Ep.: 8, Ep.Steps: 1000, Score: 526.13, Avg.Score: 462.33, Time: 00:11:03
Ep.: 9, Ep.Steps: 1000, Score: 451.80, Avg.Score: 461.16, Time: 00:12:30
Ep.: 10, Ep.Steps: 1000, Score: 485.92, Avg.Score: 463.63, Time: 00:13:56
Ep.: 11, Ep.Steps: 1000, Score: 251.85, Avg.Score: 444.38, Time: 00:15:35
Ep.: 12, Ep.Steps: 1000, Score: 419.57, Avg.Score: 442.31, Time: 00:17:12
Ep.: 13, Ep.Steps: 1000, Score: 282.28, Avg.Score: 430.00, Time: 00:18:48
Ep.: 14, Ep.Steps: 158, Score: 83.09, Avg.Score:

Ep.: 115, Ep.Steps: 20, Score: 1.88, Avg.Score: 183.39, Time: 01:31:43
Ep.: 116, Ep.Steps: 20, Score: 2.70, Avg.Score: 180.68, Time: 01:31:46
Ep.: 117, Ep.Steps: 20, Score: -0.47, Avg.Score: 177.29, Time: 01:31:49
Ep.: 118, Ep.Steps: 20, Score: 1.07, Avg.Score: 177.00, Time: 01:31:51
Ep.: 119, Ep.Steps: 20, Score: -0.69, Avg.Score: 176.39, Time: 01:31:54
Ep.: 120, Ep.Steps: 20, Score: -0.40, Avg.Score: 176.32, Time: 01:31:57
Ep.: 121, Ep.Steps: 20, Score: -0.57, Avg.Score: 172.78, Time: 01:32:00
Ep.: 122, Ep.Steps: 1000, Score: 394.75, Avg.Score: 176.68, Time: 01:33:37
Ep.: 123, Ep.Steps: 20, Score: 0.45, Avg.Score: 176.65, Time: 01:33:39
Ep.: 124, Ep.Steps: 20, Score: 0.38, Avg.Score: 176.61, Time: 01:33:42
Ep.: 125, Ep.Steps: 20, Score: 1.82, Avg.Score: 176.59, Time: 01:33:46
Ep.: 126, Ep.Steps: 20, Score: 2.00, Avg.Score: 176.58, Time: 01:33:48
Ep.: 127, Ep.Steps: 20, Score: 0.79, Avg.Score: 176.55, Time: 01:33:50
Ep.: 128, Ep.Steps: 20, Score: 1.25, Avg.Score: 176.53, Time: 01:33:5

Ep.: 230, Ep.Steps: 20, Score: 0.88, Avg.Score: 56.15, Time: 01:56:07
Ep.: 231, Ep.Steps: 20, Score: 0.55, Avg.Score: 56.16, Time: 01:56:09
Ep.: 232, Ep.Steps: 20, Score: 1.52, Avg.Score: 56.18, Time: 01:56:11
Ep.: 233, Ep.Steps: 20, Score: 0.52, Avg.Score: 56.19, Time: 01:56:13
Ep.: 234, Ep.Steps: 20, Score: -0.13, Avg.Score: 56.18, Time: 01:56:14
Ep.: 235, Ep.Steps: 20, Score: 0.10, Avg.Score: 56.18, Time: 01:56:16
Ep.: 236, Ep.Steps: 20, Score: 0.22, Avg.Score: 56.18, Time: 01:56:18
Ep.: 237, Ep.Steps: 20, Score: 1.03, Avg.Score: 56.20, Time: 01:56:19
Ep.: 238, Ep.Steps: 20, Score: -0.12, Avg.Score: 56.20, Time: 01:56:21
Ep.: 239, Ep.Steps: 20, Score: 0.22, Avg.Score: 56.20, Time: 01:56:23
Ep.: 240, Ep.Steps: 20, Score: 0.38, Avg.Score: 56.21, Time: 01:56:24
Ep.: 241, Ep.Steps: 20, Score: 0.49, Avg.Score: 56.21, Time: 01:56:26
Ep.: 242, Ep.Steps: 20, Score: 0.83, Avg.Score: 56.21, Time: 01:56:28
Ep.: 243, Ep.Steps: 20, Score: 0.42, Avg.Score: 56.21, Time: 01:56:29
Ep.: 244, Ep.Steps

Ep.: 343, Ep.Steps: 1000, Score: 339.00, Avg.Score: 294.96, Time: 03:19:59
Ep.: 344, Ep.Steps: 734, Score: 461.10, Avg.Score: 299.57, Time: 03:20:57
Ep.: 345, Ep.Steps: 1000, Score: 452.76, Avg.Score: 304.08, Time: 03:22:18
Ep.: 346, Ep.Steps: 20, Score: 3.74, Avg.Score: 304.12, Time: 03:22:19
Ep.: 347, Ep.Steps: 1000, Score: 617.23, Avg.Score: 310.29, Time: 03:23:38
Ep.: 348, Ep.Steps: 1000, Score: 358.58, Avg.Score: 313.86, Time: 03:24:57
Ep.: 349, Ep.Steps: 1000, Score: 328.23, Avg.Score: 317.11, Time: 03:26:18
Ep.: 350, Ep.Steps: 1000, Score: 315.59, Avg.Score: 315.81, Time: 03:27:39
Ep.: 351, Ep.Steps: 307, Score: 80.67, Avg.Score: 316.59, Time: 03:28:08
Ep.: 352, Ep.Steps: 1000, Score: 299.67, Avg.Score: 319.55, Time: 03:29:29
Ep.: 353, Ep.Steps: 1000, Score: 395.79, Avg.Score: 323.48, Time: 03:30:50
Ep.: 354, Ep.Steps: 1000, Score: 563.72, Avg.Score: 329.08, Time: 03:32:10
Ep.: 355, Ep.Steps: 1000, Score: 291.91, Avg.Score: 331.97, Time: 03:33:32
Ep.: 356, Ep.Steps: 1000, Score:

Ep.: 454, Ep.Steps: 20, Score: 5.59, Avg.Score: 383.86, Time: 05:20:28
Ep.: 455, Ep.Steps: 20, Score: 5.08, Avg.Score: 381.00, Time: 05:20:30
Ep.: 456, Ep.Steps: 20, Score: 4.20, Avg.Score: 376.75, Time: 05:20:32
Ep.: 457, Ep.Steps: 20, Score: 6.21, Avg.Score: 373.57, Time: 05:20:33
Ep.: 458, Ep.Steps: 20, Score: 3.31, Avg.Score: 367.16, Time: 05:20:35
Ep.: 459, Ep.Steps: 20, Score: 2.47, Avg.Score: 360.94, Time: 05:20:37
Ep.: 460, Ep.Steps: 20, Score: 2.31, Avg.Score: 354.90, Time: 05:20:38
Ep.: 461, Ep.Steps: 20, Score: 4.34, Avg.Score: 349.52, Time: 05:20:40
Ep.: 462, Ep.Steps: 20, Score: 5.41, Avg.Score: 343.28, Time: 05:20:42
Ep.: 463, Ep.Steps: 20, Score: 3.28, Avg.Score: 343.28, Time: 05:20:43
Ep.: 464, Ep.Steps: 20, Score: 4.33, Avg.Score: 343.27, Time: 05:20:45
Ep.: 465, Ep.Steps: 20, Score: 2.56, Avg.Score: 343.26, Time: 05:20:47
Ep.: 466, Ep.Steps: 20, Score: 4.68, Avg.Score: 343.29, Time: 05:20:49
Ep.: 467, Ep.Steps: 20, Score: 2.40, Avg.Score: 343.28, Time: 05:20:50
Ep.: 4

Ep.: 566, Ep.Steps: 484, Score: 213.95, Avg.Score: 302.41, Time: 06:40:35
Ep.: 567, Ep.Steps: 1000, Score: 703.40, Avg.Score: 309.42, Time: 06:42:01
Ep.: 568, Ep.Steps: 1000, Score: 627.49, Avg.Score: 315.63, Time: 06:43:29
Ep.: 569, Ep.Steps: 1000, Score: 602.74, Avg.Score: 321.62, Time: 06:44:59
Ep.: 570, Ep.Steps: 551, Score: 284.62, Avg.Score: 324.41, Time: 06:45:47
Ep.: 571, Ep.Steps: 1000, Score: 617.41, Avg.Score: 330.56, Time: 06:47:16
Ep.: 572, Ep.Steps: 694, Score: 461.74, Avg.Score: 335.16, Time: 06:48:16
Ep.: 573, Ep.Steps: 20, Score: 4.10, Avg.Score: 335.16, Time: 06:48:18
Ep.: 574, Ep.Steps: 20, Score: 3.65, Avg.Score: 335.16, Time: 06:48:20
Ep.: 575, Ep.Steps: 20, Score: 4.79, Avg.Score: 335.19, Time: 06:48:21
Ep.: 576, Ep.Steps: 21, Score: 3.10, Avg.Score: 335.17, Time: 06:48:23
Ep.: 577, Ep.Steps: 20, Score: 3.18, Avg.Score: 335.18, Time: 06:48:24
Ep.: 578, Ep.Steps: 1000, Score: 420.06, Avg.Score: 339.34, Time: 06:49:53
Ep.: 579, Ep.Steps: 1000, Score: 557.48, Avg.Sco

Ep.: 676, Ep.Steps: 1000, Score: 787.01, Avg.Score: 518.27, Time: 08:57:57
Ep.: 677, Ep.Steps: 1000, Score: 734.80, Avg.Score: 525.58, Time: 08:59:27
Ep.: 678, Ep.Steps: 1000, Score: 647.11, Avg.Score: 527.85, Time: 09:00:54
Ep.: 679, Ep.Steps: 1000, Score: 641.48, Avg.Score: 528.69, Time: 09:02:21
Ep.: 680, Ep.Steps: 1000, Score: 716.27, Avg.Score: 530.74, Time: 09:03:45
Ep.: 681, Ep.Steps: 1000, Score: 573.77, Avg.Score: 532.05, Time: 09:05:20
Ep.: 682, Ep.Steps: 1000, Score: 576.89, Avg.Score: 531.02, Time: 09:06:46
Ep.: 683, Ep.Steps: 1000, Score: 748.35, Avg.Score: 531.79, Time: 09:08:11
Ep.: 684, Ep.Steps: 1000, Score: 635.75, Avg.Score: 532.22, Time: 09:09:35
Ep.: 685, Ep.Steps: 1000, Score: 709.19, Avg.Score: 535.79, Time: 09:10:58
Ep.: 686, Ep.Steps: 1000, Score: 490.89, Avg.Score: 533.87, Time: 09:12:25
Ep.: 687, Ep.Steps: 1000, Score: 862.63, Avg.Score: 539.06, Time: 09:13:50
Ep.: 688, Ep.Steps: 1000, Score: 443.88, Avg.Score: 537.02, Time: 09:15:18
Ep.: 689, Ep.Steps: 1000,

Ep.: 786, Ep.Steps: 1000, Score: 758.19, Avg.Score: 713.00, Time: 11:27:43
Ep.: 787, Ep.Steps: 1000, Score: 873.21, Avg.Score: 713.11, Time: 11:29:05
Ep.: 788, Ep.Steps: 1000, Score: 701.98, Avg.Score: 715.69, Time: 11:30:28
Ep.: 789, Ep.Steps: 1000, Score: 475.00, Avg.Score: 712.98, Time: 11:31:49
Ep.: 790, Ep.Steps: 1000, Score: 643.26, Avg.Score: 711.79, Time: 11:33:10
Ep.: 791, Ep.Steps: 1000, Score: 634.95, Avg.Score: 713.67, Time: 11:34:33
Ep.: 792, Ep.Steps: 1000, Score: 569.64, Avg.Score: 712.10, Time: 11:35:54
Ep.: 793, Ep.Steps: 1000, Score: 861.39, Avg.Score: 714.45, Time: 11:37:16
Ep.: 794, Ep.Steps: 1000, Score: 573.96, Avg.Score: 712.38, Time: 11:38:39
Ep.: 795, Ep.Steps: 1000, Score: 913.00, Avg.Score: 715.11, Time: 11:39:59
Ep.: 796, Ep.Steps: 1000, Score: 787.56, Avg.Score: 716.97, Time: 11:41:19
Ep.: 797, Ep.Steps: 1000, Score: 681.45, Avg.Score: 716.52, Time: 11:42:41
Ep.: 798, Ep.Steps: 1000, Score: 769.70, Avg.Score: 717.01, Time: 11:44:04
Ep.: 799, Ep.Steps: 1000,

Ep.: 896, Ep.Steps: 1000, Score: 1057.31, Avg.Score: 798.01, Time: 13:58:11
Ep.: 897, Ep.Steps: 1000, Score: 918.53, Avg.Score: 800.38, Time: 13:59:37
Ep.: 898, Ep.Steps: 1000, Score: 680.79, Avg.Score: 799.49, Time: 14:01:05
Ep.: 899, Ep.Steps: 1000, Score: 874.24, Avg.Score: 799.98, Time: 14:02:30
Ep.: 900, Ep.Steps: 1000, Score: 1128.91, Avg.Score: 804.96, Time: 14:03:52
Ep.: 901, Ep.Steps: 1000, Score: 931.94, Avg.Score: 806.32, Time: 14:05:13
Ep.: 902, Ep.Steps: 1000, Score: 1022.85, Avg.Score: 808.03, Time: 14:06:34
Ep.: 903, Ep.Steps: 1000, Score: 860.06, Avg.Score: 809.13, Time: 14:07:54
Ep.: 904, Ep.Steps: 1000, Score: 919.76, Avg.Score: 810.58, Time: 14:09:21
Ep.: 905, Ep.Steps: 1000, Score: 856.23, Avg.Score: 811.51, Time: 14:10:46
Ep.: 906, Ep.Steps: 1000, Score: 818.90, Avg.Score: 813.08, Time: 14:12:23
Ep.: 907, Ep.Steps: 1000, Score: 1083.41, Avg.Score: 816.38, Time: 14:13:45
Ep.: 908, Ep.Steps: 1000, Score: 718.09, Avg.Score: 814.66, Time: 14:15:09
Ep.: 909, Ep.Steps: 1

Ep.: 1004, Ep.Steps: 1000, Score: 1425.15, Avg.Score: 1138.10, Time: 16:30:05
Ep.: 1005, Ep.Steps: 1000, Score: 1293.94, Avg.Score: 1142.48, Time: 16:31:19
Ep.: 1006, Ep.Steps: 1000, Score: 1300.91, Avg.Score: 1147.30, Time: 16:32:28
Ep.: 1007, Ep.Steps: 1000, Score: 1351.11, Avg.Score: 1149.98, Time: 16:33:38
Ep.: 1008, Ep.Steps: 1000, Score: 1460.07, Avg.Score: 1157.40, Time: 16:34:49
Ep.: 1009, Ep.Steps: 1000, Score: 1252.24, Avg.Score: 1161.41, Time: 16:36:03
Ep.: 1010, Ep.Steps: 1000, Score: 1289.54, Avg.Score: 1164.95, Time: 16:37:14
Ep.: 1011, Ep.Steps: 1000, Score: 1325.01, Avg.Score: 1168.17, Time: 16:38:24
Ep.: 1012, Ep.Steps: 1000, Score: 1325.93, Avg.Score: 1174.71, Time: 16:39:36
Ep.: 1013, Ep.Steps: 1000, Score: 1314.38, Avg.Score: 1176.48, Time: 16:40:46
Ep.: 1014, Ep.Steps: 1000, Score: 1459.93, Avg.Score: 1182.71, Time: 16:41:57
Ep.: 1015, Ep.Steps: 1000, Score: 1427.29, Avg.Score: 1187.15, Time: 16:43:10
Ep.: 1016, Ep.Steps: 1000, Score: 1488.50, Avg.Score: 1193.42, T

Ep.: 1110, Ep.Steps: 1000, Score: 1776.74, Avg.Score: 1601.24, Time: 18:56:49
Ep.: 1111, Ep.Steps: 1000, Score: 1812.37, Avg.Score: 1606.11, Time: 18:57:46
Ep.: 1112, Ep.Steps: 1000, Score: 1814.80, Avg.Score: 1611.00, Time: 18:58:44
Ep.: 1113, Ep.Steps: 1000, Score: 1838.53, Avg.Score: 1616.24, Time: 18:59:47
Ep.: 1114, Ep.Steps: 1000, Score: 1863.97, Avg.Score: 1620.28, Time: 19:00:49
Ep.: 1115, Ep.Steps: 1000, Score: 1720.30, Avg.Score: 1623.21, Time: 19:01:48
Ep.: 1116, Ep.Steps: 1000, Score: 1902.23, Avg.Score: 1627.35, Time: 19:02:52
Ep.: 1117, Ep.Steps: 1000, Score: 1834.25, Avg.Score: 1631.37, Time: 19:03:55
Ep.: 1118, Ep.Steps: 1000, Score: 1840.68, Avg.Score: 1634.60, Time: 19:04:56
Ep.: 1119, Ep.Steps: 1000, Score: 1873.42, Avg.Score: 1637.58, Time: 19:06:00
Ep.: 1120, Ep.Steps: 1000, Score: 1895.08, Avg.Score: 1641.12, Time: 19:06:58
Ep.: 1121, Ep.Steps: 1000, Score: 1938.68, Avg.Score: 1645.71, Time: 19:07:59
Ep.: 1122, Ep.Steps: 1000, Score: 1613.97, Avg.Score: 1645.68, T

episode: 1191, steps:1000, current reward: 2071.0404720924935, max reward: 2131.0869448291082, average reward on test: 2104.792386399624
episode: 1192, steps:1000, current reward: 2000.2464479798823, max reward: 2217.105262926898, average reward on test: 2168.4162026971667
episode: 1193, steps:1000, current reward: 2142.419553783604, max reward: 2193.565744709487, average reward on test: 2131.7834022174325
episode: 1194, steps:1000, current reward: 2162.1951351982243, max reward: 2146.2789981233554, average reward on test: 2110.247773213566
episode: 1195, steps:1000, current reward: 2125.8191518974186, max reward: 2070.5309288655503, average reward on test: 2025.5738083790031
episode: 1196, steps:1000, current reward: 2086.200220744159, max reward: 2152.7389393770445, average reward on test: 2092.7271738621175
episode: 1197, steps:1000, current reward: 2099.0560553619807, max reward: 2068.5362440481294, average reward on test: 2020.2511667526683
episode: 1198, steps:1000, current rewar

episode: 1251, steps:1000, current reward: 2216.404058646061, max reward: 2234.841984249258, average reward on test: 2199.764011637638
episode: 1252, steps:1000, current reward: 2129.861201506725, max reward: 2217.538034828894, average reward on test: 2155.44857305192
episode: 1253, steps:1000, current reward: 2098.5040842668755, max reward: 2198.6008955383654, average reward on test: 2121.644973420957
episode: 1254, steps:1000, current reward: 2056.489616544502, max reward: 2218.8455093950383, average reward on test: 2168.751057488256
episode: 1255, steps:1000, current reward: 2200.2176781132316, max reward: 2254.199737763092, average reward on test: 2216.33512782959
Saved
episode: 1256, steps:1000, current reward: 2183.7476750258784, max reward: 2240.1356473643855, average reward on test: 2211.308315889391
episode: 1257, steps:1000, current reward: 2212.3923127770663, max reward: 2204.2835115738662, average reward on test: 2163.295330168689
episode: 1258, steps:1000, current reward: 

episode: 1311, steps:1000, current reward: 2227.1719346024925, max reward: 2240.7695502444403, average reward on test: 2201.5210392185513
episode: 1312, steps:1000, current reward: 2237.9087030017413, max reward: 2288.4235444316696, average reward on test: 2242.4447599199407
episode: 1313, steps:1000, current reward: 2209.799255861388, max reward: 2283.2384487670106, average reward on test: 2237.8669215635946
episode: 1314, steps:1000, current reward: 2265.117015933977, max reward: 2262.179904886238, average reward on test: 2228.6334239626585
episode: 1315, steps:1000, current reward: 2184.5145310356797, max reward: 2310.634124883971, average reward on test: 2252.0770877511154
episode: 1316, steps:1000, current reward: 2133.5602775080206, max reward: 2259.1177296530395, average reward on test: 2213.687683389528
episode: 1317, steps:1000, current reward: 2248.6245430116423, max reward: 2333.757944217437, average reward on test: 2286.4959818284524
Saved
episode: 1318, steps:1000, current

In [21]:
best_actor = Actor(state_size, action_size, hidden_dim = 256, high = high, low = low)
best_actor.load_state_dict(torch.load("ant_actor.pkl"))

<All keys matched successfully>

In [22]:
best_actor.to(device)

Actor(
  (linear1): Linear(in_features=28, out_features=256, bias=True)
  (linear2): Linear(in_features=256, out_features=256, bias=True)
  (mean): Linear(in_features=256, out_features=8, bias=True)
  (log_std): Linear(in_features=256, out_features=8, bias=True)
)

In [49]:
new_env = make("AntBulletEnv-v0")
new_env.seed(9)
reward_test = []
for i in range(100):
    state = new_env.reset()
    local_reward = 0
    done = False
    while not done:
        state =  torch.tensor(state).to(device).float()
        action,logp = best_actor(state)        
        action = action.cpu().data.numpy()
        state, r, done, _ = new_env.step(action)
        local_reward += r
    reward_test.append(local_reward)

## Test results

In [50]:
import plotly.graph_objects as go

In [51]:
x = np.array(range(len(reward_test)))
m = np.mean(reward_test)

In [52]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=x, y=reward_test, name='test reward',
                                 line=dict(color="green", width=1)))

fig.add_trace(go.Scatter(x=x, y=[m]*len(reward_test), name='average reward',
                                 line=dict(color="red", width=1)))
    
fig.update_layout(title="SAC",
                           xaxis_title= "test",
                           yaxis_title= "reward")
fig.show()

In [56]:
print("average reward:", m)

average reward: 2069.692996878641
