In [116]:
import numpy as np
import torch
import gym
from torch import nn
from torch.nn import functional as F
import matplotlib.pyplot as plt
from torch.utils.tensorboard import SummaryWriter


In [320]:
def mish(input):
    return input * torch.tanh(F.softplus(input))

class Mish(nn.Module):
    def __init__(self): super().__init__()
    def forward(self, input): return mish(input)

In [328]:
# helper function to convert numpy arrays to tensors
def t(x):
    x = np.array(x) if not isinstance(x, np.ndarray) else x
    return torch.from_numpy(x).float()

In [334]:
class Actor(nn.Module):
    def __init__(self, state_dim, n_actions, activation=nn.Tanh):
        super().__init__()
        self.n_actions = n_actions
        self.model = nn.Sequential(
            nn.Linear(state_dim, 64),
            activation(),
            nn.Linear(64, 64),
            activation(),
            nn.Linear(64, n_actions)
        )
        
        logstds_param = nn.Parameter(torch.full((n_actions,), 0.1))
        self.register_parameter("logstds", logstds_param)
    
    def forward(self, X):
        means = self.model(X)
        stds = torch.clamp(self.logstds.exp(), 1e-3, 50)
        
        return torch.distributions.Normal(means, stds)

In [335]:
## Critic module
class Critic(nn.Module):
    def __init__(self, state_dim, activation=nn.Tanh):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(state_dim, 64),
            activation(),
            nn.Linear(64, 64),
            activation(),
            nn.Linear(64, 1),
        )
    
    def forward(self, X):
        return self.model(X)

In [409]:
env = gym.make("Pendulum-v0")
writer = SummaryWriter()

In [410]:
# config
state_dim = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]
actor = Actor(state_dim, n_actions, activation=Mish)
critic = Critic(state_dim, activation=Mish)

In [411]:
def discounted_rewards(rewards, dones, gamma):
    ret = 0
    discounted = []
    for reward, done in zip(rewards[::-1], dones[::-1]):
        ret = reward + ret * gamma * (1-done)
        discounted.append(ret)
    
    return discounted[::-1]

In [412]:
def process_memory(memory, last_value, discount_rewards=True):
    actions = []
    states = []
    next_states = []
    rewards = []
    dones = []

    for action, reward, state, next_state, done in memory:
        actions.append(action)
        rewards.append(reward)
        states.append(state)
        next_states.append(next_state)
        dones.append(done)
    
    if discount_rewards:
        if dones[-1] == 0:
            rewards = discounted_rewards(rewards + [last_value], dones + [0], gamma)[:-1]
        else:
            rewards = discounted_rewards(rewards, dones, gamma)

    actions = t(actions).view(-1, 1)
    states = t(states)
    next_states = t(next_states)
    rewards = t(rewards).view(-1, 1)
    dones = t(dones).view(-1, 1)
    return actions, rewards, states, next_states, dones

def clip_grad_norm_(module, max_grad_norm):
    nn.utils.clip_grad_norm_([p for g in module.param_groups for p in g["params"]], max_grad_norm)

In [413]:
class A2CLearner():
    def __init__(self, env, actor, critic, gamma=0.9, entropy_beta=0,
                 actor_lr=4e-4, critic_lr=4e-3, max_grad_norm=0.5):
        self.gamma = gamma
        self.max_grad_norm = max_grad_norm
        self.actor = actor
        self.critic = critic
        self.entropy_beta = entropy_beta
        self.actor_optim = torch.optim.Adam(actor.parameters(), lr=actor_lr)
        self.critic_optim = torch.optim.Adam(critic.parameters(), lr=critic_lr)
    
    def learn(self, memory, last_value, steps, discount_rewards=True):
        actions, rewards, states, next_states, dones = process_memory(memory, last_value, discount_rewards)

        if discount_rewards:
            td_target = rewards
        else:
            td_target = rewards + gamma*critic(next_states)*(1-dones)
        value = critic(states)
        advantage = td_target - value

        # actor
        norm_dists = self.actor(states)
        logs_probs = norm_dists.log_prob(actions)
        entropy = norm_dists.entropy().mean()

        actor_loss = (-logs_probs*advantage.detach()).mean() - entropy*entropy_beta
        self.actor_optim.zero_grad()
        actor_loss.backward()
        clip_grad_norm_(self.actor_optim, self.max_grad_norm)
        self.actor_optim.step()

        # critic
        critic_loss = F.mse_loss(td_target, value)
        self.critic_optim.zero_grad()
        critic_loss.backward()
        clip_grad_norm_(self.critic_optim, self.max_grad_norm)
        self.critic_optim.step()
        
        # reports
        writer.add_scalar("losses/log_probs", -logs_probs.mean(), global_step=steps)
        writer.add_scalar("losses/entropy", entropy, global_step=steps) 
        writer.add_scalar("losses/entropy_beta", entropy_beta, global_step=steps) 
        writer.add_scalar("losses/actor", actor_loss, global_step=steps)
        writer.add_scalar("losses/advantage", advantage.mean(), global_step=steps)
        writer.add_scalar("losses/critic", critic_loss, global_step=steps)

learner = A2CLearner(env, actor, critic)

In [414]:
episode_rewards = []
total_steps = 0
memory = []

for i in range(1000):
    done = False
    total_reward = 0
    state = env.reset()
    max_steps = 16
    steps = 0
    memory.clear()

    while not done:
        dists = actor(t(state))
        actions = dists.sample()
        actions_clamped = torch.clamp(actions, env.action_space.low.min(), env.action_space.high.max())
        
        next_state, reward, done, info = env.step(actions_clamped.detach().data.numpy().reshape(-1))
        
        memory.append((actions, reward, state, next_state, done))
        
        total_reward += reward
        state = next_state
        steps += 1
        total_steps += 1
        
        writer.add_scalar("dists/mean", dists.loc[0], global_step=total_steps)
        writer.add_scalar("dists/scale", dists.scale[0], global_step=total_steps)

        if done or (steps % max_steps == 0):
            last_value = critic(t(next_state)).data.numpy()
            learner.learn(memory, last_value, total_steps)
            memory.clear()
            
        # env.render()
    
    if len(episode_rewards) % 10 == 0:
        print("episode:", len(episode_rewards), ", episode reward:", total_reward)
    writer.add_scalar("episode_reward", total_reward, global_step=total_steps)
    episode_rewards.append(total_reward)

episode: 0 , episode reward: -1180.6139006557023
episode: 10 , episode reward: -1252.2319197901786
episode: 20 , episode reward: -1365.4321144410337
episode: 30 , episode reward: -1169.5860742509194
episode: 40 , episode reward: -1552.9231831087127
episode: 50 , episode reward: -980.5214001353957
episode: 60 , episode reward: -983.9049866974093
episode: 70 , episode reward: -1178.8119621594988
episode: 80 , episode reward: -1276.2545393392404
episode: 90 , episode reward: -1007.3665244333504
episode: 100 , episode reward: -1642.8551018817825
episode: 110 , episode reward: -1385.0084370249972
episode: 120 , episode reward: -717.6605853767742
episode: 130 , episode reward: -726.2617473177773
episode: 140 , episode reward: -712.6438170658791
episode: 150 , episode reward: -409.4135211926305
episode: 160 , episode reward: -545.0475985641392
episode: 170 , episode reward: -542.3393151405004
episode: 180 , episode reward: -536.5122622360361
episode: 190 , episode reward: -806.3975705036598
e

KeyboardInterrupt: 

In [None]:
%debug

In [286]:
np.mean(episode_rewards[-800:-700])

-397.219121377562

In [69]:
state = env.reset()

In [70]:
m = actor.model(t(state))
m

tensor([-0.2877,  0.0264, -0.1438,  0.5027,  1.0066, -0.1544,  0.4802, -0.1561,
         0.2354, -0.0054,  0.3285,  0.1031,  0.7267, -0.0575, -0.0306,  0.7434,
         0.4232, -0.1967,  0.2230, -0.1707,  0.0087, -0.2611,  0.7855,  0.3474,
         0.8359,  0.1104, -0.2812, -0.2202,  0.3276,  0.1480, -0.3086, -0.1037],
       grad_fn=<MulBackward0>)

In [71]:
actor.stds_head(m)

tensor([0.0014], grad_fn=<SoftplusBackward>)

In [72]:
env.close()

## diario

*
* architecture review
  all tanh, std out of the network, from 32 => 64 neuros
* gradient clip

scale going up always, changed lr (increased), nsteps (5 => 16), gamma (0.95 => 0.98)

In [89]:
class Network(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(3, 3),
            nn.Tanh(),
            nn.Linear(3, 2)
        )
    
    def forward(self, X):
        out = self.model(X)
        mean = out[:, :1]
        logstd = out[:, 1:]
        
        return mean, logstd.exp()

In [90]:
x_np = np.random.randint(1, 5, (10, 3))
x_t = torch.from_numpy(x_np).float()

network = Network()

def sample_from_network(network, X):
    mean, std = network(X)
    return np.random.normal(0, 1, (len(X), 1))*std.data.numpy() + mean.data.numpy()
    
sample(network, x_t)

array([[-0.09484435],
       [ 0.66010272],
       [ 0.28752064],
       [ 1.33446952],
       [ 0.74480515],
       [-0.60730657],
       [ 1.53686034],
       [ 1.13118125],
       [-0.60768608],
       [-1.03437055]])