In [4]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Normal

from models.sac import SAC


In [5]:
class Actor(nn.Module):
    """Stochastic policy: a diagonal Gaussian squashed by tanh and rescaled.

    A single hidden ReLU layer feeds two linear heads that produce the mean
    and (via softplus) the standard deviation of a Normal distribution.
    A reparameterised sample is squashed with tanh and multiplied by
    ``act_scale`` to obtain the action.

    Args:
        n_sts: size of the last dimension of the state input.
        n_acts: number of action components produced per state.
        dim: width of the hidden layer.
        act_scale: factor applied to the tanh output, so actions lie in
            ``[-act_scale, act_scale]``. Defaults to 2.0, the value that was
            previously hard-coded in ``forward``.
    """

    # Floor for the standard deviation. softplus underflows to exactly 0 for
    # very negative inputs, and Normal rejects a scale that is not > 0.
    MIN_STD = 1e-6

    def __init__(self, n_sts: int, n_acts: int, dim: int, act_scale: float = 2.0):
        super(Actor, self).__init__()

        self.fc1 = nn.Linear(n_sts, dim)
        self.fc_mu = nn.Linear(dim, n_acts)
        self.fc_std = nn.Linear(dim, n_acts)

        self.relu1 = nn.ReLU()
        # Not used by forward(); kept so the module's attribute set is unchanged.
        self.relu2 = nn.ReLU()

        self.act_scale = act_scale

    def forward(self, x):
        """Sample an action and its log-probability for each input state.

        Args:
            x: tensor whose last dimension has size ``n_sts``.

        Returns:
            ``(real_act, real_log_prob)``; both have the same shape as the
            mean head's output (last dimension ``n_acts``). The sample is
            drawn with ``rsample`` so gradients flow back to the parameters.
        """
        x = self.relu1(self.fc1(x))
        mu = self.fc_mu(x)
        # Clamp keeps the scale strictly positive without changing any value
        # that was already >= MIN_STD.
        std = torch.clamp(F.softplus(self.fc_std(x)), min=self.MIN_STD)
        dist = Normal(mu, std)
        act = dist.rsample()
        log_prob = dist.log_prob(act)

        squashed = torch.tanh(act)
        real_act = squashed * self.act_scale
        # Change-of-variables correction for the tanh squash; 1e-7 guards
        # log(0) when tanh saturates.
        # NOTE(review): the constant log(act_scale) Jacobian term is not
        # subtracted and the per-component log-probs are not summed over the
        # action dimension; both are left as-is to preserve the values and
        # shapes the caller already receives.
        real_log_prob = log_prob - torch.log(1 - squashed.pow(2) + 1e-7)

        return real_act, real_log_prob

In [6]:
class QCritic(nn.Module):
    """Q-network: scores a (state, action) pair with one scalar per row.

    The state and action are concatenated along dimension 1, passed through
    one hidden ReLU layer of width ``dim`` and projected to a single output.

    Args:
        n_sts: number of state features (size of dimension 1 of the state).
        act_dim: number of action features (size of dimension 1 of the action).
        dim: width of the hidden layer.
    """

    def __init__(self, n_sts: int, act_dim: int, dim: int):
        super().__init__()

        in_features = n_sts + act_dim
        self.fc1 = nn.Linear(in_features, dim)
        self.fc2 = nn.Linear(dim, 1)

        self.relu1 = nn.ReLU()
        # Not used by forward().
        self.relu2 = nn.ReLU()

    def forward(self, xs):
        """Evaluate the critic.

        Args:
            xs: a pair ``(state, action)`` of tensors that can be
                concatenated along dimension 1.

        Returns:
            Tensor with one value per row (last dimension of size 1).
        """
        state, action = xs

        joint = torch.cat((state, action), dim=1)
        hidden = self.relu1(self.fc1(joint))

        return self.fc2(hidden)

In [7]:
# Optimiser / algorithm settings.
# lr is the Adam learning rate for the actor and both critics; tau and eps are
# forwarded to SAC. gamma is defined here but not referenced by the other
# cells shown in this notebook (presumably the discount factor — confirm it is
# picked up by SAC).
lr, tau = 3e-4, 5e-3
eps, gamma = 0, 0.99

# Network input/output sizes passed to Actor and QCritic.
n_sts, n_acts, act_dim = 3, 1, 1

# Hidden-layer width shared by the actor and the critics.
dim = 256

# Arguments handed to model.train(), in this order.
n_epis, n_epochs, n_rollout = 5000, 200, 10

In [8]:
# Build the policy first and then the two critics, in that order, so the
# parameter initialisation draws happen in the same sequence as before.
act = Actor(n_sts, n_acts, dim)
qcri1, qcri2 = (QCritic(n_sts, act_dim, dim) for _ in range(2))

# One Adam optimiser per network, all with the shared learning rate.
act_opt, qcri1_opt, qcri2_opt = (
    torch.optim.Adam(net.parameters(), lr=lr) for net in (act, qcri1, qcri2)
)

In [9]:
# NOTE(review): render_mode='human' asks gym to render the environment in a
# window; this presumably slows training considerably — confirm it is wanted.
env = gym.make('Pendulum-v1', render_mode='human')

# Wire the environment, the actor and the two critics (each with its
# optimiser) into the SAC trainer.
model = SAC(
    env,
    n_acts,
    act,
    act_opt,
    qcri1,
    qcri1_opt,
    qcri2,
    qcri2_opt,
    eps=eps,
    tau=tau,
)

In [10]:
# Start SAC training with the n_epis, n_epochs and n_rollout values from the
# hyperparameter cell (passed positionally, in that order).
model.train(n_epis, n_epochs, n_rollout)

  if not isinstance(terminated, (bool, np.bool8)):


epoch: 20, score: -56.10298844918289, n_buffer: 210
epoch: 40, score: -61.95893523914547, n_buffer: 410
epoch: 60, score: -53.00159818332821, n_buffer: 610
epoch: 80, score: -85.63001939505803, n_buffer: 810
epoch: 100, score: -90.17497053088728, n_buffer: 1010
epoch: 120, score: -80.88523486662467, n_buffer: 1210
epoch: 140, score: -76.63164429429561, n_buffer: 1410
epoch: 160, score: -76.59726817762666, n_buffer: 1610
epoch: 180, score: -76.74321542752982, n_buffer: 1810
epoch: 20, score: -76.14157654734154, n_buffer: 2210
epoch: 40, score: -81.00972252153288, n_buffer: 2410
epoch: 60, score: -83.4808207808425, n_buffer: 2610
epoch: 80, score: -78.42742285875218, n_buffer: 2810
epoch: 100, score: -78.02568635255966, n_buffer: 3010
epoch: 120, score: -77.39601792142585, n_buffer: 3210
epoch: 140, score: -76.28236015951241, n_buffer: 3410
epoch: 160, score: -71.90415285672267, n_buffer: 3610
epoch: 180, score: -70.31819484062588, n_buffer: 3810
epoch: 20, score: -50.561642016117375, n_

KeyboardInterrupt: 

In [None]:
# Close the environment created for training.
env.close()
## TODO: decide on the update interval and the train interval; fix the TD exam