In [1]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Normal

from models.sac import SAC


In [2]:
class Actor(nn.Module):
    """Tanh-squashed Gaussian policy network.

    One hidden layer feeds two linear heads that produce the mean and the
    softplus-activated standard deviation of a Normal distribution. A
    reparameterised sample is squashed with ``tanh`` and multiplied by
    ``act_scale``.

    Args:
        n_sts: Size of the state (observation) vector.
        n_acts: Number of action dimensions.
        dim: Width of the hidden layer.
        act_scale: Positive multiplier applied to the squashed sample, so the
            returned actions lie in ``[-act_scale, act_scale]``. Defaults to
            2.0.

    Raises:
        ValueError: If ``act_scale`` is not strictly positive.
    """

    # Lower bound on the standard deviation: softplus underflows to exactly 0
    # for very negative inputs, and Normal requires a strictly positive scale.
    min_std = 1e-6

    def __init__(self, n_sts: int, n_acts: int, dim: int, act_scale: float = 2.0):
        super(Actor, self).__init__()

        if act_scale <= 0:
            raise ValueError(f"act_scale must be positive, got {act_scale}")
        # Plain float (not a buffer) so the state_dict keys stay unchanged.
        self.act_scale = float(act_scale)

        self.fc1 = nn.Linear(n_sts, dim)
        self.fc_mu = nn.Linear(dim, n_acts)
        self.fc_std = nn.Linear(dim, n_acts)

        self.relu1 = nn.ReLU()

    def forward(self, x):
        """Sample an action and its log-probability for the given state(s).

        Args:
            x: State tensor whose last dimension is ``n_sts``.

        Returns:
            A tuple ``(real_act, real_log_prob)``:
            ``real_act`` is ``act_scale * tanh(u)`` for a reparameterised
            sample ``u``; ``real_log_prob`` is the per-dimension log-density
            of ``real_act``. Both have the same shape as the mean head output.
            NOTE(review): the log-probability is NOT summed over action
            dimensions here; for ``n_acts > 1`` confirm the consumer does so.
        """
        x = self.relu1(self.fc1(x))
        mu = self.fc_mu(x)
        # Clamp so a collapsed std head cannot produce an invalid Normal.
        std = F.softplus(self.fc_std(x)).clamp(min=self.min_std)
        dist = Normal(mu, std)
        act = dist.rsample()  # reparameterised, keeps gradients w.r.t. mu/std
        log_prob = dist.log_prob(act)

        squashed = torch.tanh(act)
        real_act = squashed * self.act_scale
        # Change of variables for a = act_scale * tanh(u):
        # |da/du| = act_scale * (1 - tanh(u)^2); 1e-7 guards against log(0).
        real_log_prob = log_prob - torch.log(
            self.act_scale * (1 - squashed.pow(2)) + 1e-7
        )

        return real_act, real_log_prob

In [3]:
class QCritic(nn.Module):
    """Q-value network mapping a (state, action) pair to a single scalar.

    The state and action are concatenated along dimension 1, passed through
    one ReLU hidden layer and projected to one output unit.

    Args:
        n_sts: Size of the state vector.
        act_dim: Size of the action vector.
        dim: Width of the hidden layer.
    """

    def __init__(self, n_sts: int, act_dim: int, dim: int):
        super().__init__()

        self.fc1 = nn.Linear(in_features=n_sts + act_dim, out_features=dim)
        self.fc2 = nn.Linear(in_features=dim, out_features=1)

        self.relu1 = nn.ReLU()
        # Not used by forward(); it holds no parameters.
        self.relu2 = nn.ReLU()

    def forward(self, xs):
        """Return Q(state, action).

        Args:
            xs: A two-element sequence ``(state, action)`` of 2-D tensors that
                share their first dimension.

        Returns:
            Tensor with one value per row of the inputs (last dimension 1).
        """
        state, action = xs

        joint = torch.cat((state, action), dim=1)
        hidden = self.relu1(self.fc1(joint))

        return self.fc2(hidden)

In [4]:
# --- Reproducibility -------------------------------------------------------
# Seed torch's global RNG so weight initialisation and policy sampling repeat
# across "Restart & Run All".
# NOTE(review): this seeds torch only; whether the environment and replay
# sampling are seeded depends on models.sac, which is not visible here.
seed = 0
torch.manual_seed(seed)

# --- Optimisation ----------------------------------------------------------
lr = 3e-4      # Adam learning rate shared by the actor and both critics
tau = 5e-3     # forwarded to SAC as `tau` (presumably the soft target-update rate - confirm)
eps = 0        # forwarded to SAC as `eps`
gamma = 0.99   # NOTE(review): defined but never passed to SAC in this notebook

# --- Problem dimensions ----------------------------------------------------
n_sts = 3          # size of the state vector fed to the networks
n_acts = 1         # number of action dimensions produced by the actor
act_dim = n_acts   # critic action-input size; must match what the actor emits

# --- Network size ----------------------------------------------------------
dim = 256      # hidden-layer width for the actor and critics

# --- Training schedule (passed positionally to model.train) ----------------
n_epis = 5000
n_epochs = 200
n_rollout = 10

In [5]:
# Policy network and its optimiser.
act = Actor(n_sts=n_sts, n_acts=n_acts, dim=dim)
act_opt = torch.optim.Adam(params=act.parameters(), lr=lr)

# Two independently initialised Q-networks, each with its own Adam optimiser.
qcri1 = QCritic(n_sts=n_sts, act_dim=act_dim, dim=dim)
qcri1_opt = torch.optim.Adam(params=qcri1.parameters(), lr=lr)

qcri2 = QCritic(n_sts=n_sts, act_dim=act_dim, dim=dim)
qcri2_opt = torch.optim.Adam(params=qcri2.parameters(), lr=lr)

In [6]:
# Create the Pendulum-v1 environment with on-screen ("human") rendering.
# NOTE(review): rendering every step presumably slows a long training run
# considerably - confirm that watching the rollout is intended here.
env = gym.make('Pendulum-v1', render_mode='human')
# Hand the environment, the actor, both critics and their optimisers to the
# project's SAC class. Only `eps` and `tau` are forwarded as keywords; the
# `gamma` value from the config cell is not passed.
model = SAC(env, n_acts, act, act_opt, qcri1, qcri1_opt, qcri2, qcri2_opt, eps=eps, tau=tau)

In [7]:
# Run training; the schedule values are passed positionally as
# (n_epis, n_epochs, n_rollout). The recorded output of this cell ends in a
# KeyboardInterrupt, i.e. the saved run was stopped manually.
model.train(n_epis, n_epochs, n_rollout)

  if not isinstance(terminated, (bool, np.bool8)):


step: 210, score: -79.35049449744875, n_buffer: 210
step: 410, score: -76.6402867006886, n_buffer: 410
step: 610, score: -73.03036679081903, n_buffer: 610
step: 810, score: -64.43652783446818, n_buffer: 810
step: 1010, score: -59.75235409520313, n_buffer: 1010
step: 1210, score: -46.13864840580496, n_buffer: 1210
step: 1410, score: -38.12823048642536, n_buffer: 1410
step: 1610, score: -50.43755788855604, n_buffer: 1610
step: 1810, score: -50.21407512671624, n_buffer: 1810
step: 2210, score: -63.233787729835704, n_buffer: 2210
step: 2410, score: -77.80509938994824, n_buffer: 2410
step: 2610, score: -75.88514561803673, n_buffer: 2610
step: 2810, score: -71.65761737424428, n_buffer: 2810
step: 3010, score: -70.03864588553242, n_buffer: 3010
step: 3210, score: -67.04478083468574, n_buffer: 3210
step: 3410, score: -62.28516347462689, n_buffer: 3410
step: 3610, score: -61.29148181227839, n_buffer: 3610
step: 3810, score: -57.00757985796953, n_buffer: 3810
step: 4210, score: -56.3469459899968

KeyboardInterrupt: 

In [None]:
# Close the environment once training is finished or has been interrupted.
env.close()
## TODO: decide the update interval and the train interval; fix the "TD exam" (presumably the TD example - confirm)