In [10]:
import sys
sys.path.append('..')

import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Normal

from models.sac import SAC


In [27]:
class Actor(nn.Module):
    """Tanh-squashed diagonal Gaussian policy.

    One hidden layer feeds two linear heads: one yields the mean, the other
    (after a softplus) the standard deviation of a ``Normal``. ``forward``
    draws a reparameterised sample, squashes it with ``tanh`` and returns the
    squashed action together with its log-probability.

    Args:
        n_sts: Size of the input feature vector.
        n_acts: Number of action components produced.
        dim: Width of the hidden layer.
    """

    # Floor for the standard deviation. softplus underflows to exactly 0 for
    # strongly negative inputs, and Normal (with argument validation enabled)
    # raises on a zero scale; the floor keeps the distribution well defined.
    _MIN_STD = 1e-6

    def __init__(self, n_sts: int, n_acts: int, dim: int):
        super(Actor, self).__init__()

        self.fc1 = nn.Linear(n_sts, dim)
        self.fc_mu = nn.Linear(dim, n_acts)
        self.fc_std = nn.Linear(dim, n_acts)

        self.relu1 = nn.ReLU()
        # Not used by forward(); kept so the module's attribute set is unchanged.
        self.relu2 = nn.ReLU()

    def forward(self, x):
        """Sample a squashed action and its log-probability.

        Args:
            x: Input tensor whose last dimension has size ``n_sts``.

        Returns:
            A tuple ``(real_act, real_log_prob)``: the ``tanh``-squashed sample
            (same leading shape as ``x``, last dimension ``n_acts``) and the
            log-probability summed over the last dimension, which is kept with
            size 1.
        """
        x = self.relu1(self.fc1(x))
        mu = self.fc_mu(x)
        # Only degenerate (underflowed) values are affected by the clamp.
        std = F.softplus(self.fc_std(x)).clamp(min=self._MIN_STD)
        dist = Normal(mu, std)

        # rsample keeps the sample differentiable w.r.t. mu and std.
        act = dist.rsample()
        log_prob = dist.log_prob(act)

        real_act = torch.tanh(act)
        # Change-of-variables correction for the tanh squashing; the 1e-7 keeps
        # the log finite when |tanh| saturates at 1.
        real_log_prob = log_prob - torch.log(1 - real_act.pow(2) + 1e-7)

        return real_act, real_log_prob.sum(-1, keepdim=True)

In [28]:
class QCritic(nn.Module):
    """Q-value network over a (state, action) pair.

    The two inputs are concatenated along their last dimension and passed
    through one hidden ReLU layer followed by a linear layer with a single
    output.

    Args:
        n_sts: Size of the state feature vector.
        act_dim: Size of the action vector.
        dim: Width of the hidden layer.
    """

    def __init__(self, n_sts: int, act_dim: int, dim: int):
        super(QCritic, self).__init__()

        self.fc1 = nn.Linear(n_sts + act_dim, dim)
        self.fc2 = nn.Linear(dim, 1)

        self.relu1 = nn.ReLU()
        # Not used by forward(); kept so the module's attribute set is unchanged.
        self.relu2 = nn.ReLU()

    def forward(self, xs):
        """Evaluate the network on a state/action pair.

        Args:
            xs: A 2-item sequence ``(state, action)`` of tensors that share all
                leading dimensions; their last dimensions are ``n_sts`` and
                ``act_dim`` respectively.

        Returns:
            Tensor with the inputs' leading dimensions and a last dimension of
            size 1.
        """
        state, action = xs

        # Join on the feature axis. Using the last axis (not a fixed axis 1)
        # gives the same result for batched 2-D input and also accepts a
        # single unbatched pair.
        joined = torch.cat([state, action], dim=-1)
        hidden = self.relu1(self.fc1(joined))

        return self.fc2(hidden)

In [29]:
# --- Network sizes -----------------------------------------------------------
# n_sts is the input width of both the actor and the critics; n_acts is the
# actor's output width and act_dim the critics' action-input width.
# Presumably these mirror HalfCheetah-v4's observation/action sizes — verify
# against env.observation_space / env.action_space.
n_sts = 17
n_acts = act_dim = 6
dim = 256  # hidden-layer width shared by all three networks

# --- Optimisation ------------------------------------------------------------
lr = 3e-4     # learning rate handed to every Adam optimiser
tau = 5e-3    # forwarded to SAC as `tau`
eps = 0       # forwarded to SAC as `eps`
# NOTE(review): gamma is not referenced by any other visible cell — confirm
# whether SAC is meant to receive it.
gamma = 0.99

# --- Training schedule (positional arguments of model.train) ----------------
n_epis = 5000
n_epochs = 100
n_rollout = 10

In [30]:
# Policy network with its own Adam optimiser.
act = Actor(n_sts=n_sts, n_acts=n_acts, dim=dim)
act_opt = torch.optim.Adam(act.parameters(), lr=lr)

# Two separately initialised Q-networks (built in order: qcri1, then qcri2),
# each paired with a dedicated Adam optimiser.
qcri1, qcri2 = (
    QCritic(n_sts=n_sts, act_dim=act_dim, dim=dim) for _ in range(2)
)
qcri1_opt, qcri2_opt = (
    torch.optim.Adam(critic.parameters(), lr=lr) for critic in (qcri1, qcri2)
)

In [31]:
# HalfCheetah environment with on-screen rendering enabled.
env = gym.make("HalfCheetah-v4", render_mode="human")

# Wire the environment, the policy and both critics (with their optimisers)
# into the SAC trainer. The target entropy is set to minus the action count.
model = SAC(
    env,
    n_acts,
    act,
    act_opt,
    qcri1,
    qcri1_opt,
    qcri2,
    qcri2_opt,
    eps=eps,
    tau=tau,
    target_entropy=-n_acts,
)

In [32]:
# Start SAC training with the episode / epoch / rollout counts from the
# hyperparameter cell. How each count is interpreted is defined by SAC.train,
# which is not visible in this notebook.
model.train(n_epis, n_epochs, n_rollout)

  if not isinstance(terminated, (bool, np.bool8)):


epoch: 20, score: -2.1292862508162154, n_buffer: 210
epoch: 40, score: -5.94729659997092, n_buffer: 410
epoch: 60, score: -3.732358753816441, n_buffer: 610
epoch: 80, score: -3.7902938020746695, n_buffer: 810
epoch: 20, score: -3.316481201230931, n_buffer: 1210
epoch: 40, score: -6.555619447885772, n_buffer: 1410
epoch: 60, score: -2.6019912571142947, n_buffer: 1610
epoch: 80, score: -3.41632895235196, n_buffer: 1810
epoch: 20, score: -1.72434539773479, n_buffer: 2210
epoch: 40, score: -3.0421055459558763, n_buffer: 2410
epoch: 60, score: -3.207944931608279, n_buffer: 2610
epoch: 80, score: -2.6519699369114758, n_buffer: 2810
epoch: 20, score: -1.0499002497261052, n_buffer: 3210
epoch: 40, score: -0.968516668688728, n_buffer: 3410
epoch: 60, score: -1.9207387447339432, n_buffer: 3610
epoch: 80, score: -0.9427554077997489, n_buffer: 3810
epoch: 20, score: -0.31455067051975977, n_buffer: 4210
epoch: 40, score: -1.5771906261181894, n_buffer: 4410
epoch: 60, score: -1.4344724923549166, n_b

In [None]:
# Close the Gym environment created earlier once training is finished.
env.close()