In [1]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Normal

from models.sac import SAC


In [2]:
class Actor(nn.Module):
    """Tanh-squashed Gaussian policy network.

    One hidden layer feeds two linear heads: one gives the mean and the other
    (after a softplus) the standard deviation of an element-wise Normal
    distribution. Actions are drawn with the reparameterisation trick
    (``rsample``) and squashed with ``tanh``.

    Args:
        n_sts: Size of the input (state) vector.
        n_acts: Number of outputs of each head (action dimensions).
        dim: Width of the hidden layer.
    """

    # Floor for the standard deviation. softplus underflows to exactly 0 for
    # strongly negative inputs, and Normal rejects a scale that is not > 0.
    _MIN_STD = 1e-6
    # Added inside the log of the tanh correction so that log(0) cannot occur
    # when tanh saturates at +/-1.
    _LOG_EPS = 1e-7

    def __init__(self, n_sts: int, n_acts: int, dim: int):
        super(Actor, self).__init__()

        self.fc1 = nn.Linear(n_sts, dim)
        self.fc_mu = nn.Linear(dim, n_acts)
        self.fc_std = nn.Linear(dim, n_acts)

        self.relu1 = nn.ReLU()
        # Not used by forward(); kept so the module's attributes are unchanged.
        self.relu2 = nn.ReLU()

    def forward(self, x):
        """Sample a squashed action and its log-probability.

        Args:
            x: Input tensor whose last dimension has size ``n_sts``.

        Returns:
            A pair ``(real_act, real_log_prob)``: the tanh-squashed sample and
            its element-wise log-density including the tanh change-of-variables
            correction. The log-density is not summed over action dimensions.
        """
        x = self.relu1(self.fc1(x))
        mu = self.fc_mu(x)
        # Clamp (rather than add) so any std above the floor is left untouched.
        std = F.softplus(self.fc_std(x)).clamp(min=self._MIN_STD)
        dist = Normal(mu, std)
        act = dist.rsample()  # reparameterised: gradients flow to mu and std
        log_prob = dist.log_prob(act)
        real_act = torch.tanh(act)
        # Change of variables for a = tanh(u): log p(a) = log p(u) - log(1 - a^2).
        real_log_prob = log_prob - torch.log(1 - real_act.pow(2) + self._LOG_EPS)

        return real_act, real_log_prob

In [3]:
class QCritic(nn.Module):
    """State-action value network with a single hidden layer.

    Args:
        n_sts: Size of the state vector.
        act_dim: Size of the action vector.
        dim: Width of the hidden layer.
    """

    def __init__(self, n_sts: int, act_dim: int, dim: int):
        super().__init__()

        # The state and action are concatenated, so the first layer takes both.
        self.fc1 = nn.Linear(n_sts + act_dim, dim)
        self.fc2 = nn.Linear(dim, 1)

        self.relu1 = nn.ReLU()
        self.relu2 = nn.ReLU()

    def forward(self, xs):
        """Return one scalar value per row for a ``(state, action)`` pair.

        Args:
            xs: Two-element sequence ``(state, action)``; both tensors are
                concatenated along dimension 1.

        Returns:
            Tensor produced by the final linear layer (one column).
        """
        state, action = xs
        joint = torch.cat([state, action], 1)
        hidden = self.relu1(self.fc1(joint))
        return self.fc2(hidden)

In [4]:
# Hyperparameters for the SAC run; tune here rather than in the cells below.
lr = 3e-4  # learning rate used for the actor and both critic Adam optimisers
tau = 5e-3  # forwarded to SAC as `tau`; presumably the soft target-update rate — confirm in models.sac
eps = 0  # forwarded to SAC as `eps`; its meaning is defined in models.sac — TODO confirm
gamma = 0.99  # NOTE(review): not passed to SAC or train() in the cells shown here — confirm it is picked up

n_sts = 3  # input (state) size given to Actor and QCritic
n_acts = 1  # number of actor outputs; also passed to SAC
act_dim = 1  # action input width for QCritic

dim = 256  # hidden-layer width of the actor and critics

# The three values below are passed positionally to model.train().
n_epis = 5000
n_epochs = 200
n_rollout = 10

In [5]:
# Policy network and its optimiser.
act = Actor(n_sts, n_acts, dim)
act_opt = torch.optim.Adam(act.parameters(), lr=lr)

# Two independently initialised Q-networks, each with its own Adam optimiser.
qcri1, qcri2 = (QCritic(n_sts, act_dim, dim) for _ in range(2))
qcri1_opt, qcri2_opt = (
    torch.optim.Adam(critic.parameters(), lr=lr) for critic in (qcri1, qcri2)
)

In [6]:
# render_mode='human' opens a window and renders the environment while it runs.
env = gym.make('Pendulum-v1', render_mode='human')

# Bundle the environment, the networks and their optimisers into the SAC trainer.
model = SAC(
    env,
    n_acts,
    act,
    act_opt,
    qcri1,
    qcri1_opt,
    qcri2,
    qcri2_opt,
    eps=eps,
    tau=tau,
)

In [7]:
# Start training; the epoch / score / n_buffer lines logged below come from this call.
model.train(n_epis, n_epochs, n_rollout)

  if not isinstance(terminated, (bool, np.bool8)):


epoch: 20, score: -64.13648159973475, n_buffer: 210
epoch: 40, score: -52.92937834151445, n_buffer: 410
epoch: 60, score: -41.696200221659566, n_buffer: 610
epoch: 80, score: -58.82456159709818, n_buffer: 810
epoch: 100, score: -66.86227389648458, n_buffer: 1010
epoch: 120, score: -79.32069995866993, n_buffer: 1210
epoch: 140, score: -86.17706800487332, n_buffer: 1410
epoch: 160, score: -79.82899974138819, n_buffer: 1610
epoch: 180, score: -60.092811070502684, n_buffer: 1810
epoch: 20, score: -89.74770327858913, n_buffer: 2210
epoch: 40, score: -69.73131623657692, n_buffer: 2410
epoch: 60, score: -51.585045024356916, n_buffer: 2610
epoch: 80, score: -46.43415282824772, n_buffer: 2810
epoch: 100, score: -53.36373234258218, n_buffer: 3010
epoch: 120, score: -54.11443092429903, n_buffer: 3210
epoch: 140, score: -58.02501913033817, n_buffer: 3410
epoch: 160, score: -53.99338702117582, n_buffer: 3610
epoch: 180, score: -57.75669883516524, n_buffer: 3810
epoch: 20, score: -31.517025435513006

KeyboardInterrupt: 

In [62]:
# Release the environment (and its render window) once training has stopped.
env.close()
## TODO: decide on the update interval and the train interval; fix the "TD exam"