In [2]:
# Mount Google Drive at the relative path 'mnt' so notebooks stored on Drive are reachable.
from google.colab import drive
drive.mount('mnt')
# Copy the installer notebook from Drive into the working directory and execute it
# in this kernel. Per the recorded output it installs `pybulletgym` in develop mode
# from lib/pybullet-gym.
!cp "mnt/My Drive/Colab Notebooks/installer.ipynb" . 
%run 'installer.ipynb'

Drive already mounted at mnt; to attempt to forcibly remount, call drive.mount("mnt", force_remount=True).
fatal: destination path 'lib/pybullet-gym' already exists and is not an empty directory.
Obtaining file:///content/lib/pybullet-gym
Installing collected packages: pybulletgym
  Found existing installation: pybulletgym 0.1
    Can't uninstall 'pybulletgym'. No files were found to uninstall.
  Running setup.py develop for pybulletgym
Successfully installed pybulletgym


In [None]:
# NOTE(review): `restart_runtime` is not defined in this notebook; presumably it comes
# from installer.ipynb and restarts the Colab runtime so the fresh install is picked
# up — confirm. The cells after it must be re-run once the runtime is back.
restart_runtime()

In [2]:
# Copy the shared setup notebook from the mounted Drive and execute it in this kernel.
# NOTE(review): the cells below use names that are never imported or defined in this
# notebook (pyd, math, F, T, nn, np, os, gym, device, Network, Agent, Critic_Default,
# FIFO_Buffer, default_network_initialization, test_agent); presumably base_setup.ipynb
# provides all of them — confirm.
!cp "mnt/My Drive/Colab Notebooks/base_setup.ipynb" . 
%run 'base_setup.ipynb'

In [3]:
# https://github.com/denisyarats/pytorch_sac/blob/master/agent/actor.py

class TanhTransform(pyd.transforms.Transform):
    """Bijective transform ``y = tanh(x)`` from the real line onto (-1, 1).

    The forward pass is ``tanh``, the inverse is ``atanh`` and the
    log-determinant of the Jacobian is evaluated with a softplus-based
    formula instead of ``log(1 - tanh(x)**2)``.
    """

    bijective = True
    sign = +1
    domain = pyd.constraints.real
    codomain = pyd.constraints.interval(-1.0, 1.0)

    def __init__(self, cache_size=1):
        """Create the transform; ``cache_size`` is forwarded to the base ``Transform``."""
        super().__init__(cache_size=cache_size)

    @staticmethod
    def atanh(x):
        """Return the inverse hyperbolic tangent of ``x`` using ``log1p``."""
        log_one_plus = x.log1p()
        log_one_minus = (-x).log1p()
        return 0.5 * (log_one_plus - log_one_minus)

    def __eq__(self, other):
        """Any two ``TanhTransform`` instances compare equal."""
        return isinstance(other, TanhTransform)

    def _call(self, x):
        """Forward map: element-wise ``tanh``."""
        return x.tanh()

    def _inverse(self, y):
        """Inverse map: element-wise ``atanh``.

        The input is intentionally not clamped to the open interval, because
        clamping may degrade the performance of certain algorithms; rely on
        ``cache_size=1`` instead.
        """
        return self.atanh(y)

    def log_abs_det_jacobian(self, x, y):
        """Return ``log |dy/dx|`` for ``y = tanh(x)``; ``y`` is not used.

        Uses the numerically more stable form
        ``2 * (log(2) - x - softplus(-2x))``, see
        https://github.com/tensorflow/probability/commit/ef6bb176e0ebd1cf6e25c6b5cecdd2428c22963f#diff-e120f70e92e6741bca649f04fcd907b7
        """
        softplus_term = F.softplus(-2. * x)
        return 2. * (math.log(2.) - x - softplus_term)


class SquashedNormal(pyd.transformed_distribution.TransformedDistribution):
    """Normal distribution pushed through a ``TanhTransform``.

    ``loc`` and ``scale`` parameterise the underlying ``pyd.Normal``; they are
    also kept as attributes on the instance.
    """

    def __init__(self, loc, scale):
        self.loc, self.scale = loc, scale
        self.base_dist = pyd.Normal(loc, scale)
        super().__init__(self.base_dist, [TanhTransform()])

    @property
    def mean(self):
        """Return ``loc`` passed through every transform in order.

        With the single tanh transform this is ``tanh(loc)``, i.e. the
        squashed base mean rather than the exact mean of the squashed
        distribution.
        """
        squashed = self.loc
        for transform in self.transforms:
            squashed = transform(squashed)
        return squashed

In [4]:
class Actor_Network(Network):
    """Policy network that outputs a tanh-squashed Gaussian over actions.

    ``min_std`` / ``max_std`` are the lower and upper bounds applied to the
    predicted *log* standard deviation.

    NOTE(review): ``self.layers[0..2]`` are expected to be created by
    ``Network.__init__`` (not visible here) — presumably two hidden layers
    and an ``action_dims``-wide output; confirm.
    """

    def __init__(self, state_dims, fc1_dims, fc2_dims, action_dims, min_std = -5, max_std = 2, name='actor', chpt_dir='sac'):
        super().__init__(state_dims, fc1_dims, fc2_dims, action_dims, name, chpt_dir)
        # Bounds for the log standard deviation
        self.log_std_min, self.log_std_max = min_std, max_std
        # Extra head (index 3) producing the raw log-std from the second hidden layer
        log_std_head = nn.Linear(fc2_dims, action_dims)
        self.layers.append(log_std_head)

        self.to(device)

    def forward(self, input_data):
        """Return the ``SquashedNormal`` action distribution for ``input_data``."""
        hidden = F.relu(self.layers[0](input_data))
        hidden = F.relu(self.layers[1](hidden))

        mean = self.layers[2](hidden)

        # tanh bounds the raw head output to [-1, 1]; rescale it linearly
        # onto [log_std_min, log_std_max].
        squashed = T.tanh(self.layers[3](hidden))
        span = self.log_std_max - self.log_std_min
        bounded_log_std = self.log_std_min + 0.5 * span * (squashed + 1)

        return SquashedNormal(mean, bounded_log_std.exp())

In [5]:
class Critic_DDPG(Network):
    """Q-network that injects the action after the first hidden layer.

    The state goes through the first linear layer alone; its activation is
    concatenated with the action before the second layer, and a final linear
    layer produces a single Q-value per sample.
    """

    def __init__(self, state_dims, action_dims, fc1_dim, fc2_dim, name='critic', chpt_dir='ddpg'):
        # super(Network, self) starts the MRO lookup *after* Network, so
        # Network.__init__ is skipped on purpose and this class builds its own
        # layers (presumably the next class is nn.Module — confirm).
        super(Network, self).__init__()

        # Checkpoint location; create the directory on first use
        self.checkpoint_file = os.path.join(chpt_dir, name+'.h5')
        if not os.path.exists(chpt_dir):
            os.makedirs(chpt_dir)

        # state -> fc1, [fc1 ; action] -> fc2, fc2 -> Q
        self.layers = nn.ModuleList([
            nn.Linear(state_dims, fc1_dim),
            nn.Linear(fc1_dim + action_dims, fc2_dim),
            nn.Linear(fc2_dim, 1),
        ])
        self.to(device)

    def forward(self, state, action):
        """Return Q(state, action); inputs are concatenated along dimension 1."""
        state_features = F.relu(self.layers[0](state))
        joint_features = T.cat([state_features, action], 1)
        hidden = F.relu(self.layers[1](joint_features))
        return self.layers[2](hidden)

In [6]:
class SAC_Agent(Agent):
    """Soft Actor-Critic agent with twin critics and an optional learnable temperature.

    Args:
        state_dims: size of the observation vector.
        action_dims: size of the action vector; the entropy target is ``-action_dims``.
        name: prefix of the checkpoint directory (``name + "_sac"``).
        replay_size: capacity passed to ``FIFO_Buffer``.
        alpha: learning rate of the actor and of the temperature.
        beta: learning rate of both critics.
        gamma: discount factor.
        tau: interpolation factor of the soft target update.
        batch_size: number of transitions sampled per training step.
        fc1_dim, fc2_dim: hidden layer widths of all networks.
        init_temperature: initial value of the entropy temperature.
        learnable_temperature: when true the temperature is optimised in ``train``.
        env: environment whose ``action_space`` bounds are used to clamp actions.
            When omitted the bounds default to [-1, 1], the range of the
            tanh-squashed policy.

    NOTE(review): ``Agent.__init__`` is not invoked, exactly as in the original
    code — confirm the base class does not need it.
    """

    def __init__(self, state_dims, action_dims, name = "", replay_size = pow(10, 6),
                 alpha = 1e-3, beta= 1e-3, gamma = 0.99, tau = 0.005, batch_size = 256,
                 fc1_dim = 256, fc2_dim = 256,
                 init_temperature = 0.1, learnable_temperature = True,
                 env = None):
        # Store parameters
        self.memory = FIFO_Buffer(replay_size, state_dims, action_dims)
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.target_entropy = -action_dims
        self.learnable_temperature = learnable_temperature
        if env is None:
            # `env` is declared optional; fall back to the tanh output range
            # instead of failing on `None.action_space`.
            self.max_action, self.min_action = 1.0, -1.0
        else:
            self.max_action = float(env.action_space.high.max())
            self.min_action = float(env.action_space.low.min())

        # Checkpoints folder
        self.dir = name + "_sac"
        os.makedirs(self.dir, exist_ok=True)

        # Create networks
        self.actor = Actor_Network(
            state_dims, fc1_dim, fc2_dim, action_dims, name="actor", chpt_dir=self.dir
        )
        critic_input_dims = state_dims + action_dims
        self.critic1 = Critic_Default(critic_input_dims, fc1_dim, fc2_dim, 1, name="critic1", chpt_dir=self.dir)
        self.critic2 = Critic_Default(critic_input_dims, fc1_dim, fc2_dim, 1, name="critic2", chpt_dir=self.dir)
        self.target_critic1 = Critic_Default(
            critic_input_dims, fc1_dim, fc2_dim, 1, name="target_critic1", chpt_dir=self.dir
        )
        self.target_critic2 = Critic_Default(
            critic_input_dims, fc1_dim, fc2_dim, 1, name="target_critic2", chpt_dir=self.dir
        )

        # Temperature is stored as log(alpha) so that alpha stays positive
        self.log_alpha = T.tensor(np.log(init_temperature)).to(device)
        self.log_alpha.requires_grad = True

        # Initialise every network exactly once. The original initialised
        # critic1 twice and never touched critic2.
        for network in (self.actor, self.critic1, self.critic2,
                        self.target_critic1, self.target_critic2):
            default_network_initialization(network.layers)

        # Create optimizers
        self.actor_optimizer = T.optim.Adam(self.actor.parameters(), lr=alpha, betas = [0.9, 0.999])
        self.critic1_optimizer = T.optim.Adam(self.critic1.parameters(), lr=beta, betas = [0.9, 0.999])
        self.critic2_optimizer = T.optim.Adam(self.critic2.parameters(), lr=beta, betas = [0.9, 0.999])
        self.log_alpha_optimizer = T.optim.Adam([self.log_alpha], lr=alpha, betas = [0.9, 0.999])

        # Hard-copy the critics into their targets
        self.update_target_networks(tau = 1)

    @staticmethod
    def _soft_update(source, target, tau):
        """Blend ``source`` parameters into ``target``: ``tau * source + (1 - tau) * target``."""
        for param, target_param in zip(source.parameters(), target.parameters()):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

    def update_target_networks(self, tau=None):
        """Move both target critics towards their online critics.

        ``tau`` overrides ``self.tau`` for this call; ``tau=1`` is a hard copy.
        The original ignored the argument and always used ``self.tau``.
        """
        if tau is None:
            tau = self.tau

        self._soft_update(self.critic1, self.target_critic1, tau)
        self._soft_update(self.critic2, self.target_critic2, tau)

    def remember(self, state, action, reward, next_state, done):
        """Store one transition in the replay buffer."""
        self.memory.store(state, action, reward, next_state, done)

    def select_action(self, state, sample = False):
        """Return an action for a single ``state`` as a NumPy array.

        Draws from the policy when ``sample`` is true, otherwise uses the
        distribution's ``mean``; the result is clamped to the action bounds.
        """
        # Acting never needs gradients, so skip building the autograd graph.
        with T.no_grad():
            state = T.tensor(state, dtype=T.float).to(device).unsqueeze(0)
            dist = self.actor.forward(state)
            action = dist.sample() if sample else dist.mean
            action = action.clamp(self.min_action, self.max_action)

        return action.detach().cpu().numpy()[0]

    def train(self):
        """Run one SAC update: critics, actor, temperature, then soft target update."""
        # Sample from replay buffer (method name spelled as in the buffer API)
        state, action, reward, next_state, not_done = self.memory.get_sameples_tensor(self.batch_size)

        # Critic target: r + gamma * not_done * (min target Q - alpha * log pi)
        # NOTE(review): assumes reward / not_done broadcast against the
        # (batch, 1) critic output — confirm against FIFO_Buffer.
        with T.no_grad():
            dist = self.actor.forward(next_state)
            target_action = dist.rsample()
            log_prob = dist.log_prob(target_action).sum(-1, keepdim=True)
            target_critic1_val = self.target_critic1.forward(next_state, target_action)
            target_critic2_val = self.target_critic2.forward(next_state, target_action)
            target_value = T.min(target_critic1_val, target_critic2_val) - self.alpha.detach() * log_prob
            target = reward + self.gamma * target_value * not_done

        critic1_loss = F.mse_loss(self.critic1.forward(state, action), target)
        critic2_loss = F.mse_loss(self.critic2.forward(state, action), target)
        critic_loss = critic1_loss + critic2_loss

        # Optimise critics
        self.critic1_optimizer.zero_grad()
        self.critic2_optimizer.zero_grad()
        critic_loss.backward()
        self.critic1_optimizer.step()
        self.critic2_optimizer.step()

        # Actor loss: alpha * log pi(a|s) - min Q(s, a), with a re-sampled
        # through the reparameterisation trick.
        dist = self.actor.forward(state)
        action = dist.rsample()
        log_prob = dist.log_prob(action).sum(-1, keepdim = True)
        critic1_val = self.critic1.forward(state, action)
        critic2_val = self.critic2.forward(state, action)

        critic_val = T.min(critic1_val, critic2_val)
        actor_loss = (self.alpha.detach() * log_prob - critic_val).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        if self.learnable_temperature:
            # Only alpha receives a gradient here; the entropy gap is detached.
            alpha_loss = (self.alpha * (-log_prob - self.target_entropy).detach()).mean()

            self.log_alpha_optimizer.zero_grad()
            alpha_loss.backward()
            self.log_alpha_optimizer.step()

        # Soft update on target networks
        self.update_target_networks()

    def save_models(self):
        """Save all five networks and ``log_alpha`` under ``self.dir``."""
        self.actor.save()
        self.critic1.save()
        self.critic2.save()
        self.target_critic1.save()
        self.target_critic2.save()
        T.save(self.log_alpha, os.path.join(self.dir, 'log_alpha.h5'))

    def load_models(self):
        """Load all five networks and ``log_alpha`` from ``self.dir``."""
        self.actor.load()
        self.critic1.load()
        self.critic2.load()
        self.target_critic1.load()
        self.target_critic2.load()
        # Copy into the existing tensor rather than rebinding the attribute:
        # log_alpha_optimizer holds a reference to this very tensor and would
        # otherwise keep updating the stale one.
        loaded_log_alpha = T.load(os.path.join(self.dir, 'log_alpha.h5'), map_location=device)
        with T.no_grad():
            self.log_alpha.copy_(loaded_log_alpha)

    @property
    def alpha(self):
        """Current entropy temperature, ``exp(log_alpha)``."""
        return self.log_alpha.exp()

In [7]:
# Experiment configuration
ENV_NAME = 'InvertedPendulumPyBulletEnv-v0'
SEED = 0
# Third argument of test_agent; presumably the total number of environment
# steps (the log prints "Total T") — confirm against test_agent.
TOTAL_TIMESTEPS = 100000

# Environment
env = gym.make(ENV_NAME)
env.seed(SEED)
# Seed NumPy and PyTorch as well, before any network is created; seeding only
# the environment leaves weight initialisation and policy sampling unseeded.
np.random.seed(SEED)
T.manual_seed(SEED)
# Agent
agent = SAC_Agent(
    name = ENV_NAME,
    state_dims = env.observation_space.shape[0], env=env, action_dims=env.action_space.shape[0],
    fc1_dim = 64, fc2_dim = 64
)
# Run Test
test_agent(env, agent, TOTAL_TIMESTEPS, False)



2021-01-28 19:24:51.841880 Total T: 12 Episode Num: 1 Episode T: 12 Reward: 12.000
2021-01-28 19:24:52.133256 Total T: 18 Episode Num: 2 Episode T: 6 Reward: 6.000
2021-01-28 19:24:53.277339 Total T: 42 Episode Num: 3 Episode T: 24 Reward: 24.000
2021-01-28 19:24:53.801670 Total T: 53 Episode Num: 4 Episode T: 11 Reward: 11.000
2021-01-28 19:24:54.874156 Total T: 76 Episode Num: 5 Episode T: 23 Reward: 23.000
2021-01-28 19:24:55.716981 Total T: 94 Episode Num: 6 Episode T: 18 Reward: 18.000
2021-01-28 19:24:56.695604 Total T: 115 Episode Num: 7 Episode T: 21 Reward: 21.000
2021-01-28 19:24:57.577991 Total T: 134 Episode Num: 8 Episode T: 19 Reward: 19.000
2021-01-28 19:24:58.051598 Total T: 144 Episode Num: 9 Episode T: 10 Reward: 10.000
2021-01-28 19:24:59.006370 Total T: 164 Episode Num: 10 Episode T: 20 Reward: 20.000
2021-01-28 19:25:00.471050 Total T: 195 Episode Num: 11 Episode T: 31 Reward: 31.000
2021-01-28 19:25:01.639424 Total T: 220 Episode Num: 12 Episode T: 25 Reward: 25.0

In [8]:
# Send the run's performance log from the agent's checkpoint directory to the
# browser as a download (Colab only).
from google.colab import files

performance_log_path = agent.dir + '/performance.txt'
files.download(performance_log_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>