In [1]:
import numpy as np
import torch
import gym
from torch import nn
from torch.nn import functional as F
import matplotlib.pyplot as plt
from torch.utils.tensorboard import SummaryWriter


In [12]:
def mish(input):
    """Mish activation, applied element-wise: ``x * tanh(softplus(x))``."""
    gate = torch.tanh(F.softplus(input))
    return input * gate

class Mish(nn.Module):
    """Module wrapper around ``mish`` so the activation can be used as a layer."""

    def __init__(self):
        super().__init__()

    def forward(self, input):
        """Return the Mish activation of ``input``."""
        return mish(input)

In [13]:
# helper function to convert numpy arrays to tensors
def t(x):
    """Convert ``x`` to a float32 torch tensor.

    ``x`` is used as-is when it is already a numpy array; anything else is
    first passed through ``np.array``.
    """
    arr = x if isinstance(x, np.ndarray) else np.array(x)
    return torch.from_numpy(arr).float()

In [14]:
# Actor module: Gaussian policy over continuous actions (its forward() returns a
# torch.distributions.Normal, not a categorical distribution).
import math
# NOTE(review): neither `math` nor `leaky` is referenced in any visible cell.
leaky = torch.nn.LeakyReLU()

class Actor(nn.Module):
    """Gaussian policy network for continuous actions.

    A single hidden layer feeds two heads: a tanh-bounded mean head and a
    softplus (strictly positive) standard-deviation head.

    Args:
        state_dim: size of the observation vector.
        n_actions: size of the action vector.
        hidden_dim: width of the shared hidden layer (default 32).
        action_scale: factor applied to the tanh output, so means lie in
            ``[-action_scale, action_scale]`` (default 2.0).
        min_std: constant added to the softplus output to keep the standard
            deviation away from zero (default 1e-3).
    """

    def __init__(self, state_dim, n_actions, hidden_dim=32, action_scale=2.0, min_std=1e-3):
        super().__init__()
        self.n_actions = n_actions
        self.action_scale = action_scale
        self.min_std = min_std
        # Shared trunk.
        self.model = nn.Sequential(
            nn.Linear(state_dim, hidden_dim),
            Mish(),
        )
        # Mean head: tanh keeps the raw output in [-1, 1] before scaling.
        self.means_head = nn.Sequential(
            nn.Linear(hidden_dim, n_actions),
            nn.Tanh(),
        )
        # Std head: softplus keeps the output positive.
        self.stds_head = nn.Sequential(
            nn.Linear(hidden_dim, n_actions),
            nn.Softplus(),
        )

    def forward(self, X):
        """Return a ``torch.distributions.Normal`` over actions for states ``X``."""
        features = self.model(X)
        means = self.means_head(features) * self.action_scale
        stds = self.stds_head(features) + self.min_std
        return torch.distributions.Normal(means, stds)

In [15]:
## Critic module
class Critic(nn.Module):
    """State-value network: two Mish-activated hidden layers of width 32 and a scalar output."""

    def __init__(self, state_dim):
        super().__init__()
        hidden = 32
        layers = [
            nn.Linear(state_dim, hidden),
            Mish(),
            nn.Linear(hidden, hidden),
            Mish(),
            nn.Linear(hidden, 1),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, X):
        """Return the network output for states ``X`` (last dimension of size 1)."""
        value = self.model(X)
        return value

In [16]:

# TensorBoard writer used by train() and the runners to log loss/reward scalars.
writer = SummaryWriter()
# Environment used to read the observation/action dimensions and by the inspection cells.
env = gym.make("Pendulum-v0")

In [17]:
# config: problem dimensions, networks, optimizers and training hyper-parameters
state_dim = env.observation_space.shape[0]  # length of the observation vector
n_actions = env.action_space.shape[0]  # length of the action vector
actor = Actor(state_dim, n_actions)
critic = Critic(state_dim)
adam_actor = torch.optim.Adam(actor.parameters(), lr=1e-4)#, weight_decay=0.001)
adam_critic = torch.optim.Adam(critic.parameters(), lr=3e-4)#, weight_decay=0.001)
gamma = 0.98  # discount applied to the next-state value in train()
entropy_beta = 1e-2  # weight of the entropy term in the actor loss
memory = []  # transition buffer shared by all runners; cleared after every train() call

In [18]:
def train(memory, global_step=None):
    """Run one actor update and one critic update on a batch of transitions.

    Args:
        memory: iterable of ``(action, reward, state, next_state, done)``
            tuples, as appended by the runners.
        global_step: optional TensorBoard step for the logged losses. When
            omitted, the number of previous non-empty ``train`` calls is used
            so that successive points do not all land on the same step.

    Returns:
        None. Updates ``actor`` and ``critic`` in place through their
        optimizers and logs both losses to ``writer``.
    """
    transitions = list(memory)
    if not transitions:
        # Nothing to learn from; an empty batch cannot be fed to the networks.
        return

    calls = getattr(train, "_calls", 0)
    train._calls = calls + 1
    if global_step is None:
        global_step = calls

    actions, rewards, states, next_states, dones = map(list, zip(*transitions))
    batch_size = len(transitions)

    # One row per transition; the action width is inferred, so this also works
    # for action vectors with more than one component.
    actions = t(actions).view(batch_size, -1)
    states = t(states)
    next_states = t(next_states)
    rewards = t(rewards).view(-1, 1)
    dones = t(dones).view(-1, 1)

    # Targets and advantages are treated as constants for both losses.
    with torch.no_grad():
        td_target = rewards + gamma * critic(next_states) * (1 - dones)
        advantage = td_target - critic(states)

    norm_dists = actor(states)
    log_probs = norm_dists.log_prob(actions)
    # The entropy term must stay attached to the graph: detaching it would
    # leave the bonus without any gradient, i.e. no regularisation at all.
    entropy = norm_dists.entropy().mean()
    actor_loss = (-log_probs * advantage).mean() - entropy_beta * entropy
    writer.add_scalar("losses/actor", actor_loss.item(), global_step)
    adam_actor.zero_grad()
    actor_loss.backward()
    adam_actor.step()

    # `states` is already a tensor here, so it is passed to the critic directly.
    critic_loss = F.mse_loss(critic(states), td_target)
    writer.add_scalar("losses/critic", critic_loss.item(), global_step)
    adam_critic.zero_grad()
    critic_loss.backward()
    adam_critic.step()

In [19]:
def build_runner(env, steps, memory):
    """Create a closure that advances ``env`` by ``steps`` transitions per call.

    Args:
        env: environment exposing ``reset()``, ``step(action)`` and
            ``action_space.low`` / ``action_space.high``.
        steps: number of environment steps taken on each ``runner()`` call.
        memory: list that receives one
            ``(action, reward, state, next_state, done)`` tuple per step.

    Returns:
        A zero-argument ``runner`` function. It returns ``None``; its results
        are the tuples appended to ``memory``. Environment state, the running
        episode reward and the episode count persist between calls.
    """
    state = env.reset()
    total_reward = 0
    episodes = 0
    # The action bounds are looked up once instead of on every step.
    low = float(env.action_space.low.min())
    high = float(env.action_space.high.max())

    def runner():
        nonlocal state, total_reward, episodes
        for _ in range(steps):
            # Acting needs no gradients; train() recomputes the distribution.
            with torch.no_grad():
                dists = actor(t(state))
                actions = dists.sample()
            actions_clamped = torch.clamp(actions, low, high)

            next_state, reward, done, info = env.step(actions_clamped.numpy())
            # The unclamped sample is stored, matching what the policy drew.
            memory.append((actions, reward, state, next_state, done))
            state = next_state
            total_reward += reward

            if done:
                episodes += 1
                if episodes % 20 == 0:
                    print(f"episode #{episodes}, reward: {total_reward}")
                    # Log against the episode index so points are not all
                    # written at the same step.
                    writer.add_scalar("rewards/episode", total_reward, episodes)
                state = env.reset()
                total_reward = 0

    return runner

# Eight runners, each with its own Pendulum environment and 4 steps per call;
# all of them append to the shared `memory` list.
runners = [build_runner(gym.make("Pendulum-v0"), 4, memory) for _ in range(8)]

In [20]:
# Main loop: every runner collects its transitions into `memory`, then one
# update is performed on that batch and the buffer is emptied.
for i in range(5000):
    for runner in runners:
        # runner() returns None; its output is what it appends to `memory`.
        runner()
    train(memory)
    memory.clear()

KeyboardInterrupt: 

In [11]:
# NOTE(review): `episode_rewards` is never assigned in any visible cell (episode
# totals are only printed and sent to TensorBoard by the runners), so this cell
# raises NameError on a fresh kernel.
plt.scatter(np.arange(len(episode_rewards)), episode_rewards, s=2)
plt.title("Total reward per episode (online)")
plt.ylabel("reward")
plt.xlabel("episode")
plt.show()

NameError: name 'episode_rewards' is not defined

In [68]:
# NOTE(review): `dists` is only assigned as a local inside Actor.forward and
# runner() in the visible cells, so this cell depends on leftover kernel state.
dists.entropy()

tensor([-5.4888], grad_fn=<AddBackward0>)

In [69]:
# Reset the notebook-level environment to obtain a fresh observation.
state = env.reset()

In [70]:
# Run the observation through the actor's shared trunk only and display the activations.
m = actor.model(t(state))
m

tensor([-0.2877,  0.0264, -0.1438,  0.5027,  1.0066, -0.1544,  0.4802, -0.1561,
         0.2354, -0.0054,  0.3285,  0.1031,  0.7267, -0.0575, -0.0306,  0.7434,
         0.4232, -0.1967,  0.2230, -0.1707,  0.0087, -0.2611,  0.7855,  0.3474,
         0.8359,  0.1104, -0.2812, -0.2202,  0.3276,  0.1480, -0.3086, -0.1037],
       grad_fn=<MulBackward0>)

In [71]:
# Raw output of the std head for those activations; Actor.forward adds 1e-3 on top of this.
actor.stds_head(m)

tensor([0.0014], grad_fn=<SoftplusBackward>)

In [72]:
# Close the notebook-level environment.
# NOTE(review): the environments created for `runners` are not closed in any visible cell.
env.close()