In [1]:
import gym
import torch
from torch import nn
from collections import namedtuple, deque
import itertools
from copy import deepcopy
import random
import numpy as np
from torch.utils.tensorboard import SummaryWriter
import tqdm

  import distutils.spawn


In [2]:
RENDER = False

In [3]:
ALPHA = 0.3             # How much we value entropy / exploration, increasing this will increase exploration.
GAMMA = 1 - 0.01        # How much we value future rewards.
TAU = 0.01              # How much q_target is updated when polyak averaging (step 15).
POLICY_LR = 0.001       # Policy learning rate.
Q_LR = 0.001            # Q learning rate.

In [4]:
env = gym.make("LunarLander-v2")

In [5]:
SARS = namedtuple('SARS', 'state, action, reward, next_state, t, failed, limit')

In [6]:
# Sanity check: softmax over a 1-D tensor (dim=0) turns the three scores into
# probabilities; the final expression confirms that they add up to 1.
# NOTE(review): `input` shadows the Python builtin of the same name in this kernel.
softmax = nn.Softmax(dim=0)
input = torch.tensor([1, 2, 3], dtype=float)
display(input)
output = softmax(input)
display(output)
sum(output)

tensor([1., 2., 3.], dtype=torch.float64)

tensor([0.0900, 0.2447, 0.6652], dtype=torch.float64)

tensor(1.0000, dtype=torch.float64)

In [7]:
# Same check for a batch: dim=1 normalises every row independently.
# The builtin sum() iterates over the first dimension, so the last output is
# the sum of the rows (column totals), not the per-row totals of 1.
softmax = nn.Softmax(dim=1)
input = torch.tensor([[1, 2, 3], [1, 2, 3], [3, 3, 3]], dtype=float)
display(input)
output = softmax(input)
display(output)
sum(output)

tensor([[1., 2., 3.],
        [1., 2., 3.],
        [3., 3., 3.]], dtype=torch.float64)

tensor([[0.0900, 0.2447, 0.6652],
        [0.0900, 0.2447, 0.6652],
        [0.3333, 0.3333, 0.3333]], dtype=torch.float64)

tensor([0.5134, 0.8228, 1.6638], dtype=torch.float64)

In [8]:
class PolicyNetwork(nn.Module):
    """MLP that maps a batch of states to a probability distribution over actions.

    Two hidden ReLU layers (2000 and 1500 units) are followed by a linear
    output layer; ``forward`` applies a softmax across dimension 1, so the
    input is expected to be 2-D (one row per state).
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        first_hidden, second_hidden = 2000, 1500
        layers = [
            nn.Linear(input_size, first_hidden),
            nn.ReLU(),
            nn.Linear(first_hidden, second_hidden),
            nn.ReLU(),
            nn.Linear(second_hidden, output_size),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Return per-row action probabilities for the 2-D input ``x``."""
        logits = self.linear_relu_stack(x)
        return nn.functional.softmax(logits, dim=1)

    def __call__(self, x):
        # Calling the module directly is disabled on purpose; use .forward().
        raise RuntimeError("Use forward")

In [9]:
policy_network = PolicyNetwork(4, 2)
policy_network

PolicyNetwork(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=4, out_features=2000, bias=True)
    (1): ReLU()
    (2): Linear(in_features=2000, out_features=1500, bias=True)
    (3): ReLU()
    (4): Linear(in_features=1500, out_features=2, bias=True)
  )
)

In [10]:
torch.cuda.is_available()

True

In [11]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [12]:
mock_states = torch.rand(5, 4)
mock_states

tensor([[0.8173, 0.3151, 0.7515, 0.5986],
        [0.4009, 0.6616, 0.5826, 0.1513],
        [0.9163, 0.3828, 0.0130, 0.8971],
        [0.9868, 0.0035, 0.2382, 0.9287],
        [0.5092, 0.4537, 0.4599, 0.2654]])

In [13]:
policy_network.forward(mock_states)

tensor([[0.4749, 0.5251],
        [0.4902, 0.5098],
        [0.4454, 0.5546],
        [0.4506, 0.5494],
        [0.4820, 0.5180]], grad_fn=<SoftmaxBackward0>)

In [14]:
class QNetwork(nn.Module):
    """MLP critic: maps a batch of states to one raw Q-value per action.

    Same layout as the policy network (hidden widths 2000 and 1500 with ReLU)
    but the output layer is returned as-is, without any normalisation.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        widths = (input_size, 2000, 1500)
        layers = []
        # One Linear+ReLU pair per consecutive pair of widths, then the output head.
        for fan_in, fan_out in zip(widths, widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(widths[-1], output_size))
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Return the unnormalised Q-values for input ``x``."""
        return self.linear_relu_stack(x)

    def __call__(self, x):
        # Calling the module directly is disabled on purpose; use .forward().
        raise RuntimeError("Use forward")

In [15]:
q_network = QNetwork(4, 2)
q_network

QNetwork(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=4, out_features=2000, bias=True)
    (1): ReLU()
    (2): Linear(in_features=2000, out_features=1500, bias=True)
    (3): ReLU()
    (4): Linear(in_features=1500, out_features=2, bias=True)
  )
)

In [16]:
q_network.forward(mock_states)

tensor([[-0.0050, -0.0480],
        [ 0.0081, -0.0944],
        [-0.0104,  0.0221],
        [-0.0133,  0.0398],
        [-0.0079, -0.0611]], grad_fn=<AddmmBackward0>)

In [17]:
def run_episode(action_f, step_f, env, policy, fail_at_limit=False):
    """Play a single episode and return the sum of its rewards.

    Args:
        action_f: callable ``(policy, state) -> action`` used to pick each action.
        step_f: callback invoked once per step as
            ``step_f(state, action, reward, next_state, t, failed, limit)``.
        env: environment exposing ``reset()``, ``step(action)`` (returning a
            4-tuple) and ``_max_episode_steps``.
        policy: passed through untouched to ``action_f``.
        fail_at_limit: when false, the ``failed`` flag reported to ``step_f``
            is forced to ``False`` on the step that reaches the step limit;
            when true, the environment's flag is passed through unchanged.

    Returns:
        The accumulated reward of the episode.
    """
    total_reward = 0
    state = env.reset()
    t = 0  # 1-based step counter, incremented at the top of each iteration
    while True:
        t += 1
        chosen = action_f(policy, state)
        next_state, reward, failed, info = env.step(chosen)
        total_reward += reward
        assert t <= env._max_episode_steps
        limit = (t == env._max_episode_steps)
        # Reaching the step limit is only reported as a failure on request.
        if limit and not fail_at_limit:
            failed = False
        assert fail_at_limit or not (limit and failed)
        step_f(state, chosen, reward, next_state, t, failed, limit)
        if failed or limit:
            return total_reward
        state = next_state

In [18]:
class Policy:
    """Container for the SAC networks and their optimizers.

    Holds one policy network, two Q networks, and a deep-copied target for
    each Q network. All five modules are moved to ``device`` and plain SGD
    optimizers are created for the three trainable networks.
    """

    def __init__(self, env_state_size, env_action_space_size):
        sizes = (env_state_size, env_action_space_size)
        self.policy_network = PolicyNetwork(*sizes)
        self.q1_network = QNetwork(*sizes)
        self.q2_network = QNetwork(*sizes)
        # Targets start as exact copies of their online critics.
        self.q1_target_network = deepcopy(self.q1_network)
        self.q2_target_network = deepcopy(self.q2_network)
        every_network = (
            self.policy_network,
            self.q1_network,
            self.q2_network,
            self.q1_target_network,
            self.q2_target_network,
        )
        for network in every_network:
            network.to(device)
        self.reset_optimizers()

    def reset_optimizers(self):
        """Create fresh SGD optimizers for the policy and both Q networks."""

        def make_sgd(network, learning_rate):
            return torch.optim.SGD(network.parameters(), lr=learning_rate)

        self.policy_optimizer = make_sgd(self.policy_network, POLICY_LR)
        self.q1_optimizer = make_sgd(self.q1_network, Q_LR)
        self.q2_optimizer = make_sgd(self.q2_network, Q_LR)

In [19]:
replay_buffer = deque(maxlen=30_000)

In [20]:
env.action_space.n

4

In [21]:
env.observation_space.shape

(8,)

In [22]:
# Build the agent for this environment. Only flat (1-D) observation spaces are
# supported; the single dimension is the input width of every network.
oss = env.observation_space.shape
if len(oss) != 1:
    raise RuntimeError(f'Unknown observation_space.shape: {oss}')
os_len = oss[0]
policy = Policy(os_len, env.action_space.n)

In [23]:
s = env.reset()
s

array([-0.00391579,  1.4190695 , -0.39665326,  0.36218733,  0.00454432,
        0.08984792,  0.        ,  0.        ], dtype=float32)

In [24]:
s = torch.tensor(s).reshape((1, -1))
s

tensor([[-0.0039,  1.4191, -0.3967,  0.3622,  0.0045,  0.0898,  0.0000,  0.0000]])

In [25]:
policy_output = policy.policy_network.forward(s.to(device))
policy_output

tensor([[0.2533, 0.2375, 0.2402, 0.2690]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>)

In [26]:
action_weights = policy_output.reshape((-1,)).tolist()

In [27]:
list(range(4))

[0, 1, 2, 3]

In [28]:
action = random.choices(range(len(action_weights)), weights=action_weights)[0]
action

1

In [29]:
def action(policy, s):
    """Sample an action index for state ``s`` from the policy's distribution.

    Args:
        policy: object whose ``policy_network.forward`` returns one row of
            action probabilities for a ``(1, n)`` state tensor.
        s: a single observation (anything ``torch.tensor`` accepts); it is
            flattened into one row before being fed to the network.

    Returns:
        The sampled action as an ``int`` index into the probability row.
    """
    # Inference only: no_grad skips building an autograd graph that would be
    # thrown away immediately after the probabilities are read out.
    with torch.no_grad():
        tensor_s = torch.tensor(s).reshape((1, -1)).to(device)
        action_weights = policy.policy_network.forward(tensor_s).reshape((-1,)).tolist()
    return random.choices(range(len(action_weights)), weights=action_weights)[0]

def step(initial_s, a, r, next_s, t, failed, limit):
    """Per-step callback: store the transition in the global replay buffer.

    The environment is also rendered after each step when ``RENDER`` is set.
    """
    transition = SARS(
        state=initial_s,
        action=a,
        reward=r,
        next_state=next_s,
        t=t,
        failed=failed,
        limit=limit,
    )
    replay_buffer.append(transition)
    if RENDER:
        env.render()

In [30]:
run_episode(action, step, env, policy)

-274.3197699814156

In [31]:
len(replay_buffer)

83

In [32]:
replay_buffer

deque([SARS(state=array([-0.00770502,  1.4132539 , -0.78044456,  0.10368679,  0.00893491,
               0.17678225,  0.        ,  0.        ], dtype=float32), action=1, reward=-1.9920909988097992, next_state=array([-0.01549768,  1.415017  , -0.7903069 ,  0.07825594,  0.01985907,
               0.21850383,  0.        ,  0.        ], dtype=float32), t=1, failed=False, limit=False),
       SARS(state=array([-0.01549768,  1.415017  , -0.7903069 ,  0.07825594,  0.01985907,
               0.21850383,  0.        ,  0.        ], dtype=float32), action=0, reward=-1.0040721209665833, next_state=array([-0.02329063,  1.416182  , -0.7903408 ,  0.05159922,  0.03077668,
               0.21837242,  0.        ,  0.        ], dtype=float32), t=2, failed=False, limit=False),
       SARS(state=array([-0.02329063,  1.416182  , -0.7903408 ,  0.05159922,  0.03077668,
               0.21837242,  0.        ,  0.        ], dtype=float32), action=0, reward=-1.037461635782165, next_state=array([-0.03108406,  1.4

# Polyak Averaging

In [33]:
test_parameter_1 = next(policy.policy_network.named_parameters())[1]
test_parameter_1

Parameter containing:
tensor([[-0.2818, -0.1534, -0.0185,  ..., -0.2867, -0.2902, -0.1200],
        [-0.0851,  0.0455,  0.1821,  ...,  0.2526, -0.1581,  0.1606],
        [-0.1401,  0.2872, -0.3022,  ..., -0.2287, -0.0575, -0.2660],
        ...,
        [ 0.2737, -0.2306, -0.3291,  ...,  0.1814,  0.3081, -0.1547],
        [ 0.3150, -0.2740,  0.2353,  ...,  0.2184,  0.2793,  0.1959],
        [-0.3047, -0.1932, -0.2062,  ...,  0.2474, -0.3462,  0.2167]],
       device='cuda:0', requires_grad=True)

In [34]:
test_parameter_2 = test_parameter_1 * 0 + 0.0128
test_parameter_2

tensor([[0.0128, 0.0128, 0.0128,  ..., 0.0128, 0.0128, 0.0128],
        [0.0128, 0.0128, 0.0128,  ..., 0.0128, 0.0128, 0.0128],
        [0.0128, 0.0128, 0.0128,  ..., 0.0128, 0.0128, 0.0128],
        ...,
        [0.0128, 0.0128, 0.0128,  ..., 0.0128, 0.0128, 0.0128],
        [0.0128, 0.0128, 0.0128,  ..., 0.0128, 0.0128, 0.0128],
        [0.0128, 0.0128, 0.0128,  ..., 0.0128, 0.0128, 0.0128]],
       device='cuda:0', grad_fn=<AddBackward0>)

In [35]:
test_parameter_1 * 0.9 + test_parameter_2 * 0.1

tensor([[-0.2523, -0.1368, -0.0153,  ..., -0.2568, -0.2599, -0.1067],
        [-0.0753,  0.0423,  0.1652,  ...,  0.2287, -0.1410,  0.1458],
        [-0.1248,  0.2597, -0.2707,  ..., -0.2046, -0.0505, -0.2382],
        ...,
        [ 0.2476, -0.2063, -0.2949,  ...,  0.1646,  0.2786, -0.1379],
        [ 0.2848, -0.2453,  0.2131,  ...,  0.1978,  0.2526,  0.1776],
        [-0.2729, -0.1726, -0.1843,  ...,  0.2239, -0.3103,  0.1963]],
       device='cuda:0', grad_fn=<AddBackward0>)

In [36]:
def polyak_update(network_to_update, target_network, tau=0.001):
    """Move ``network_to_update`` a fraction ``tau`` towards ``target_network``.

    Every parameter ``p`` of ``network_to_update`` is overwritten in place with
    ``(1 - tau) * p + tau * q``, where ``q`` is the parameter at the same
    position in ``target_network``. ``target_network`` is only read.
    Parameters are paired positionally.
    """
    keep_fraction = 1 - tau
    paired = zip(network_to_update.parameters(), target_network.parameters())
    # In-place edits of leaf parameters must not be recorded by autograd.
    with torch.no_grad():
        for destination, source in paired:
            destination.mul_(keep_fraction)
            destination.add_(source * tau)

In [37]:
# Demonstrate polyak_update on two freshly initialised networks: the first
# argument (test_network_2) is modified in place and moves 10% of the way
# towards test_network_1, whose weights are displayed unchanged afterwards.
test_network_1 = QNetwork(5, 3)
test_network_2 = QNetwork(5, 3)
display(list(test_network_1.parameters())[0])
display(list(test_network_2.parameters())[0])
polyak_update(test_network_2, test_network_1, 0.1)
display(list(test_network_1.parameters())[0])
display(list(test_network_2.parameters())[0])

Parameter containing:
tensor([[ 0.0741, -0.1410, -0.1811, -0.3515, -0.3451],
        [-0.2800,  0.4027, -0.4254,  0.0194, -0.2672],
        [ 0.0916, -0.2298, -0.0562,  0.3056,  0.1465],
        ...,
        [ 0.3771, -0.3419, -0.0322,  0.2105,  0.4407],
        [-0.2593, -0.1618, -0.3870,  0.4451, -0.2519],
        [-0.2275,  0.2986,  0.3021,  0.0071,  0.2386]], requires_grad=True)

Parameter containing:
tensor([[ 0.1611,  0.0708,  0.0283,  0.3144,  0.3004],
        [ 0.1640,  0.0361,  0.2048,  0.4455, -0.3690],
        [-0.0309,  0.3177, -0.3345,  0.2461, -0.2685],
        ...,
        [ 0.1074, -0.0733, -0.2744, -0.0081,  0.0439],
        [ 0.4372,  0.0680,  0.2465,  0.1044,  0.3385],
        [ 0.0889,  0.4237,  0.0940, -0.2569,  0.1042]], requires_grad=True)

Parameter containing:
tensor([[ 0.0741, -0.1410, -0.1811, -0.3515, -0.3451],
        [-0.2800,  0.4027, -0.4254,  0.0194, -0.2672],
        [ 0.0916, -0.2298, -0.0562,  0.3056,  0.1465],
        ...,
        [ 0.3771, -0.3419, -0.0322,  0.2105,  0.4407],
        [-0.2593, -0.1618, -0.3870,  0.4451, -0.2519],
        [-0.2275,  0.2986,  0.3021,  0.0071,  0.2386]], requires_grad=True)

Parameter containing:
tensor([[ 0.1524,  0.0496,  0.0073,  0.2478,  0.2358],
        [ 0.1196,  0.0728,  0.1418,  0.4029, -0.3588],
        [-0.0186,  0.2630, -0.3067,  0.2520, -0.2270],
        ...,
        [ 0.1344, -0.1001, -0.2501,  0.0137,  0.0835],
        [ 0.3675,  0.0450,  0.1831,  0.1384,  0.2795],
        [ 0.0572,  0.4112,  0.1148, -0.2305,  0.1176]], requires_grad=True)

# Log Experiments

In [38]:
# For a two-outcome distribution (p, 1-p), show -log of each probability and
# their plain (unweighted) sum; the sum is smallest at p = 0.5 in the outputs.
for p in [0.99, 0.9, 0.8, 0.6, 0.5]:
    logs = -np.log([p, 1-p])
    display(p, logs, sum(logs))

0.99

array([0.01005034, 4.60517019])

4.615220521841592

0.9

array([0.10536052, 2.30258509])

2.4079456086518722

0.8

array([0.22314355, 1.60943791])

1.8325814637483102

0.6

array([0.51082562, 0.91629073])

1.4271163556401456

0.5

array([0.69314718, 0.69314718])

1.3862943611198906

In [39]:
random.sample(replay_buffer, k=4)

[SARS(state=array([-0.28285256,  1.2509098 , -0.86983234, -0.49966753,  0.40808973,
         0.24865513,  0.        ,  0.        ], dtype=float32), action=0, reward=-1.6441222857491766, next_state=array([-0.29143548,  1.2391429 , -0.8698246 , -0.5263462 ,  0.42052236,
         0.24865246,  0.        ,  0.        ], dtype=float32), t=35, failed=False, limit=False),
 SARS(state=array([-0.5792671 ,  0.6246547 , -1.1355448 , -1.1637185 ,  0.87443626,
         0.33244595,  0.        ,  0.        ], dtype=float32), action=0, reward=-2.4370339139325097, next_state=array([-0.59051526,  0.5980657 , -1.1355182 , -1.1904002 ,  0.89105827,
         0.33243972,  0.        ,  0.        ], dtype=float32), t=67, failed=False, limit=False),
 SARS(state=array([-0.70458233,  0.29871827, -1.2337439 , -1.4366765 ,  1.0936056 ,
         0.48716837,  0.        ,  0.        ], dtype=float32), action=3, reward=-3.520515716150014, next_state=array([-0.71677244,  0.26621062, -1.2291383 , -1.4581699 ,  1.1158035 

# Training

![Pseudocode](sac_psudocode.png)

Source: https://spinningup.openai.com/en/latest/algorithms/sac.html#pseudocode

In [40]:
def q_min(q1, q2, states):
    """Return the element-wise minimum of two critics' Q-values.

    Args:
        q1, q2: networks whose ``forward`` maps ``states`` to Q-values of
            identical shape.
        states: batch of states; moved to ``device`` before evaluation.

    Returns:
        A tensor with ``requires_grad == False`` holding
        ``min(q1(states), q2(states))`` element by element.
    """
    # One device transfer shared by both critics.
    states = states.to(device)
    # The result is only ever used as a constant, so skip graph construction
    # altogether instead of building two graphs and detaching them.
    with torch.no_grad():
        return torch.minimum(q1.forward(states), q2.forward(states))

In [41]:
    # Scratch run of ONE SAC update (pseudocode steps 11-15) executed inline on
    # the notebook globals `policy` and `replay_buffer`. The same statements are
    # wrapped in the `train` function defined in the following cell.
    stats = {}
    # Step 11
    training_batch = random.sample(replay_buffer, k=min(len(replay_buffer), 100))
    # Prep
    states = torch.tensor(np.array([sars.state for sars in training_batch]), requires_grad=False).to(device)
    actions = torch.tensor(np.array([sars.action for sars in training_batch]), requires_grad=False).to(device)
    actions_hot = nn.functional.one_hot(actions, env.action_space.n).to(device)
    # NOTE(review): the Q losses displayed below are float64 while the policy
    # loss is float32 - presumably because the rewards enter as float64; confirm.
    rewards = torch.tensor(np.array([sars.reward for sars in training_batch]), requires_grad=False).to(device)
    next_states = torch.tensor(np.array([sars.next_state for sars in training_batch]), requires_grad=False).to(device)
    fails = torch.tensor(np.array([sars.failed for sars in training_batch]), dtype=int, requires_grad=False).to(device)
    # Step 12
    # Build the regression target y; everything in this step is detached.
    next_action_probs = policy.policy_network.forward(next_states.to(device)).detach()
    assert not next_action_probs.requires_grad
    next_states_q_min = q_min(policy.q1_target_network, policy.q2_target_network, next_states)
    assert not next_states_q_min.requires_grad
    # Expectation of the min target Q-value under the current policy.
    next_actions_q_min = torch.sum(next_states_q_min * next_action_probs, 1)
    assert not next_actions_q_min.requires_grad
    # sum_a p(a) * log p(a) is the NEGATIVE entropy, so subtracting ALPHA times
    # it in y below adds an entropy bonus.
    next_actions_entropy = torch.sum(next_action_probs * torch.log(next_action_probs), 1)
    assert not next_actions_entropy.requires_grad
    # (1-fails) zeroes the bootstrap term for transitions flagged as failed.
    y = rewards + GAMMA * (1-fails) * (next_actions_q_min - ALPHA * next_actions_entropy)
    assert not y.requires_grad
    # Step 13
    # Regress each online critic's Q(s, a) for the taken action towards y.
    for qi, q, opt in ((1, policy.q1_network, policy.q1_optimizer),
                       (2, policy.q2_network, policy.q2_optimizer)):
        assert not states.requires_grad
        assert not actions_hot.requires_grad
        # The one-hot mask keeps only the Q-value of the action actually taken.
        q_state_action = torch.sum(q.forward(states.to(device)) * actions_hot, 1)
        assert q_state_action.requires_grad
        q_loss = torch.mean((q_state_action - y)**2)
        stats[f'train/q_loss_{qi}'] = q_loss.detach()
        assert q_loss.requires_grad
        opt.zero_grad()
        q_loss.backward()
        opt.step()
    # Step 14
    # Policy improvement: q_min returns detached values, so the gradient of the
    # loss flows only through action_probs into the policy network.
    action_probs = policy.policy_network.forward(states)
    assert action_probs.requires_grad
    states_q_min = q_min(policy.q1_network, policy.q2_network, states)
    assert not states_q_min.requires_grad
    actions_q_min = torch.sum(states_q_min * action_probs, 1)
    assert actions_q_min.requires_grad
    actions_entropy = torch.sum(action_probs * torch.log(action_probs), 1)
    assert actions_entropy.requires_grad
    # Negated because the optimizer minimises while the objective is maximised.
    policy_loss = -1 * torch.mean(actions_q_min - ALPHA * actions_entropy)
    stats['train/policy_loss'] = policy_loss.detach()
    policy.policy_optimizer.zero_grad()
    policy_loss.backward()
    policy.policy_optimizer.step()
    # Step 15
    # Move each target critic a fraction TAU towards its online critic.
    polyak_update(policy.q1_target_network, policy.q1_network, tau=TAU)
    polyak_update(policy.q2_target_network, policy.q2_network, tau=TAU)
    stats

{'train/q_loss_1': tensor(131.5877, device='cuda:0', dtype=torch.float64),
 'train/q_loss_2': tensor(131.0647, device='cuda:0', dtype=torch.float64),
 'train/policy_loss': tensor(-0.2658, device='cuda:0')}

In [42]:
def train(policy, replay_buffer, batch_size=100):
    """Run one discrete-action SAC update (pseudocode steps 11-15).

    Args:
        policy: ``Policy`` holding the policy network, both Q networks, their
            target copies and the three optimizers; updated in place.
        replay_buffer: non-empty sequence of ``SARS`` transitions.
        batch_size: number of transitions to sample; capped at the buffer size.

    Returns:
        dict with the detached scalar tensors ``'train/q_loss_1'``,
        ``'train/q_loss_2'`` and ``'train/policy_loss'``.

    Raises:
        ValueError: if ``replay_buffer`` is empty.
    """
    if len(replay_buffer) == 0:
        raise ValueError("train() needs at least one transition in replay_buffer")

    # Probabilities are floored before taking the log. Without the floor, a
    # probability that underflows to exactly 0 yields 0 * log(0) = NaN, which
    # would then spread to every weight through the optimizer step.
    prob_floor = 1e-8

    def expected_log_prob(probs):
        """Per-row sum_a p(a) * log p(a), i.e. the negative entropy."""
        return torch.sum(probs * torch.log(torch.clamp(probs, min=prob_floor)), 1)

    stats = {}
    # Step 11
    training_batch = random.sample(replay_buffer, k=min(len(replay_buffer), batch_size))
    # Prep
    states = torch.tensor(np.array([sars.state for sars in training_batch])).to(device)
    # Explicit int64: gather needs long indices regardless of the platform's
    # default NumPy integer width.
    actions = torch.tensor(np.array([sars.action for sars in training_batch]), dtype=torch.int64).to(device)
    # float32 keeps targets and losses in the networks' dtype; Python-float
    # rewards would otherwise promote the Q losses to float64.
    rewards = torch.tensor(np.array([sars.reward for sars in training_batch]), dtype=torch.float32).to(device)
    next_states = torch.tensor(np.array([sars.next_state for sars in training_batch])).to(device)
    fails = torch.tensor(np.array([sars.failed for sars in training_batch]), dtype=int).to(device)
    # Step 12
    # The regression target is a constant, so compute it without autograd.
    with torch.no_grad():
        next_action_probs = policy.policy_network.forward(next_states)
        next_states_q_min = q_min(policy.q1_target_network, policy.q2_target_network, next_states)
        # Expectation of the min target Q-value under the current policy.
        next_actions_q_min = torch.sum(next_states_q_min * next_action_probs, 1)
        next_actions_neg_entropy = expected_log_prob(next_action_probs)
        # (1 - fails) removes the bootstrap term for failed transitions;
        # subtracting ALPHA * negative-entropy adds the entropy bonus.
        y = rewards + GAMMA * (1 - fails) * (next_actions_q_min - ALPHA * next_actions_neg_entropy)
    assert not y.requires_grad
    # Step 13
    action_index = actions.unsqueeze(1)
    for qi, q, opt in ((1, policy.q1_network, policy.q1_optimizer),
                       (2, policy.q2_network, policy.q2_optimizer)):
        # Q-value of the action actually taken in each sampled transition.
        q_state_action = q.forward(states).gather(1, action_index).squeeze(1)
        assert q_state_action.requires_grad
        q_loss = torch.mean((q_state_action - y) ** 2)
        stats[f'train/q_loss_{qi}'] = q_loss.detach()
        opt.zero_grad()
        q_loss.backward()
        opt.step()
    # Step 14
    action_probs = policy.policy_network.forward(states)
    assert action_probs.requires_grad
    # q_min returns constants, so the gradient reaches only the policy network.
    states_q_min = q_min(policy.q1_network, policy.q2_network, states)
    assert not states_q_min.requires_grad
    actions_q_min = torch.sum(states_q_min * action_probs, 1)
    actions_neg_entropy = expected_log_prob(action_probs)
    # Negated because the optimizer minimises while the objective is maximised.
    policy_loss = -1 * torch.mean(actions_q_min - ALPHA * actions_neg_entropy)
    stats['train/policy_loss'] = policy_loss.detach()
    policy.policy_optimizer.zero_grad()
    policy_loss.backward()
    policy.policy_optimizer.step()
    # Step 15
    polyak_update(policy.q1_target_network, policy.q1_network, tau=TAU)
    polyak_update(policy.q2_target_network, policy.q2_network, tau=TAU)
    return stats

In [43]:
# New TensorBoard writer for this run (no log_dir given, so the library
# default location is used).
tb_writer = SummaryWriter()

# Re-create the agent so that training starts from freshly initialised
# networks rather than the ones touched by the scratch cells above.
oss = env.observation_space.shape
if len(oss) != 1:
    raise RuntimeError(f'Unknown observation_space.shape: {oss}')
os_len = oss[0]
policy = Policy(os_len, env.action_space.n)

# Rebinding replay_buffer drops every transition collected earlier.
replay_buffer = deque(maxlen=30_000)

def action(policy, s):
    """Sample an action index for state ``s`` from the policy's distribution.

    Args:
        policy: object whose ``policy_network.forward`` returns one row of
            action probabilities for a ``(1, n)`` state tensor.
        s: a single observation (anything ``torch.tensor`` accepts); it is
            flattened into one row before being fed to the network.

    Returns:
        The sampled action as an ``int`` index into the probability row.
    """
    # Inference only: no_grad skips building an autograd graph that would be
    # thrown away immediately after the probabilities are read out.
    with torch.no_grad():
        tensor_s = torch.tensor(s).reshape((1, -1))
        action_weights = policy.policy_network.forward(tensor_s.to(device)).reshape((-1,)).tolist()
    return random.choices(range(len(action_weights)), weights=action_weights)[0]

def step(initial_s, a, r, next_s, t, failed, limit):
    """Per-step callback: store the transition in the global replay buffer.

    The environment is also rendered after each step when ``RENDER`` is set.
    """
    transition = SARS(
        state=initial_s,
        action=a,
        reward=r,
        next_state=next_s,
        t=t,
        failed=failed,
        limit=limit,
    )
    replay_buffer.append(transition)
    if RENDER:
        env.render()

# Main loop: play one episode, then run a fixed number of SAC updates on the
# replay buffer. Episode-level metrics are logged against the episode number.
# Update-level metrics are logged against a running update counter, so the
# updates of one episode do not all land on the same TensorBoard step.
train_iterations_per_episode = 100
for episode in tqdm.tqdm(range(1, 3000+1)):
    # fail_at_limit=True: the environment's done flag is stored unchanged on
    # the step that reaches the step limit.
    episode_reward = run_episode(action, step, env, policy, fail_at_limit=True)
    tb_writer.add_scalar('main/episode_reward', episode_reward, episode)
    tb_writer.add_scalar('main/replay_buffer_length', len(replay_buffer), episode)
    policy.reset_optimizers()
    for training_iteration in range(1, train_iterations_per_episode + 1):
        stats = train(policy, replay_buffer)
        # 1-based index of this update across the whole run.
        train_step = (episode - 1) * train_iterations_per_episode + training_iteration
        for stat, value in stats.items():
            tb_writer.add_scalar(stat, value, train_step)
# Make sure buffered events reach disk once training is done.
tb_writer.flush()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3000/3000 [52:14<00:00,  1.04s/it]


In [None]:
# Watch the trained policy: a fresh environment is rendered for ten episodes
# and nothing is written to the replay buffer.
env = gym.make("LunarLander-v2")

def render_only(initial_s, a, r, next_s, t, failed, limit):
    """Step callback that only draws the current frame; arguments are ignored."""
    env.render()

try:
    for episode in tqdm.tqdm(range(1, 10+1)):
        run_episode(action, render_only, env, policy)
finally:
    # Close the environment even if the loop is interrupted or raises.
    env.close()

 90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▏           | 9/10 [02:34<00:19, 19.36s/it]