In [1]:
import gym
import random
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras import layers
from collections import deque

In [2]:
problem = 'LunarLanderContinuous-v2'
#problem = 'Pendulum-v0'
env = gym.make(problem)


num_states = env.observation_space.shape[0]
num_actions = env.action_space.shape[0]

upper_bound = env.action_space.high[0]
lower_bound = env.action_space.low[0]

print(num_states, num_actions, upper_bound, lower_bound)

8 2 1.0 -1.0


In [3]:
class OUActionNoise:
    def __init__(self, mean, std_deviation, theta=0.15, dt=1e-2, x_initial=None):
        self.theta = theta
        self.mean = mean
        self.std_dev = std_deviation
        self.dt = dt
        self.x_initial = x_initial
        self.reset()

    def __call__(self):
        x = (
            self.x_prev
            + self.theta * (self.mean - self.x_prev) * self.dt
            + self.std_dev * np.sqrt(self.dt) * np.random.normal(size=self.mean.shape)
        )
        self.x_prev = x
        return x

    def reset(self):
        if self.x_initial is not None:
            self.x_prev = self.x_initial
        else:
            self.x_prev = np.zeros_like(self.mean)

In [4]:
### This shit just doesn't work with deque
### So it needs to be this way

class Buffer:
    def __init__(self, buffer_capacity=100000, batch_size=64):
        self.buffer_capacity = buffer_capacity
        self.batch_size = batch_size

        self.buffer_counter = 0

        self.state_buffer = np.zeros((self.buffer_capacity, num_states))
        self.action_buffer = np.zeros((self.buffer_capacity, num_actions))
        self.reward_buffer = np.zeros((self.buffer_capacity, 1))
        self.next_state_buffer = np.zeros((self.buffer_capacity, num_states))

    def record(self, obs_tuple):
        index = self.buffer_counter % self.buffer_capacity

        self.state_buffer[index] = obs_tuple[0]
        self.action_buffer[index] = obs_tuple[1]
        self.reward_buffer[index] = obs_tuple[2]
        self.next_state_buffer[index] = obs_tuple[3]

        self.buffer_counter += 1
    
    @tf.function
    def update(self, states, actions, rewards, n_states):
        with tf.GradientTape() as tape:
            target_actions = target_actor(n_states, training=True)
            y = rewards + gamma * target_critic([n_states, target_actions], training=True)
            critic_value = critic_model([states, actions], training=True)
            critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))

        critic_grad = tape.gradient(critic_loss, critic_model.trainable_variables)
        critic_optimizer.apply_gradients(zip(critic_grad, critic_model.trainable_variables))

        with tf.GradientTape() as tape:
            acts = actor_model(states, training=True)
            critic_value = critic_model([states, acts], training=True)
            actor_loss = -tf.math.reduce_mean(critic_value)

        actor_grad = tape.gradient(actor_loss, actor_model.trainable_variables)
        actor_optimizer.apply_gradients(zip(actor_grad, actor_model.trainable_variables))
        
    def learn(self):
        record_range = min(self.buffer_counter, self.buffer_capacity)

        batch_indices = np.random.choice(record_range, self.batch_size)

        state_batch = tf.convert_to_tensor(self.state_buffer[batch_indices])
        action_batch = tf.convert_to_tensor(self.action_buffer[batch_indices])
        reward_batch = tf.convert_to_tensor(self.reward_buffer[batch_indices])
        reward_batch = tf.cast(reward_batch, dtype=tf.float32)
        next_state_batch = tf.convert_to_tensor(self.next_state_buffer[batch_indices])

        self.update(state_batch, action_batch, reward_batch, next_state_batch)

In [5]:
@tf.function
def update_target(target_weights, weights, tau):
    for (a, b) in zip(target_weights, weights):
        a.assign(b * tau + a * (1 - tau))

In [6]:
def actor():
    last_init = tf.random_uniform_initializer(minval=-0.003, maxval=0.003)

    inputs = layers.Input(shape=(num_states,))
    out = layers.Dense(256, activation="relu")(inputs)
    out = layers.Dense(256, activation="relu")(out)
    outputs = layers.Dense(2, activation="tanh", kernel_initializer=last_init)(out)

    outputs = outputs * upper_bound
    model = tf.keras.Model(inputs, outputs)
    return model

def critic():
    state_input = layers.Input(shape=(num_states))
    state_out = layers.Dense(16, activation="relu")(state_input)
    state_out = layers.Dense(32, activation="relu")(state_out)

    action_input = layers.Input(shape=(num_actions))
    action_out = layers.Dense(32, activation="relu")(action_input)

    concat = layers.Concatenate()([state_out, action_out])

    out = layers.Dense(256, activation="relu")(concat)
    out = layers.Dense(256, activation="relu")(out)
    outputs = layers.Dense(1)(out)

    model = tf.keras.Model([state_input, action_input], outputs)

    return model

In [7]:
def choose_action(state, noise_obj):
    state = tf.expand_dims(state, 0)
    sampled_actions = tf.squeeze(actor_model(state))
    noise = noise_obj()
    sampled_actions = sampled_actions.numpy() + noise

    legal_action = np.clip(sampled_actions, lower_bound, upper_bound)

    return [np.squeeze(legal_action)]

In [8]:
std_dev = 0.2
ou_noise = OUActionNoise(mean=np.zeros(1), std_deviation=float(std_dev) * np.ones(1))

actor_model = actor()
critic_model = critic()

target_actor = actor()
target_critic = critic()

target_actor.set_weights(actor_model.get_weights())
target_critic.set_weights(critic_model.get_weights())

critic_lr = 0.0005
actor_lr = 0.0005

critic_optimizer = tf.keras.optimizers.Adam(critic_lr)
actor_optimizer = tf.keras.optimizers.Adam(actor_lr)

total_episodes = 2000

gamma = 0.99

tau = 0.005

memory = Buffer(50000, 64)

In [None]:
score_hist = []
avg_hist = []

for i in range(total_episodes):
    state = env.reset()
    score = 0
    
    while True:
        
        state = tf.convert_to_tensor(state)
        
        action = choose_action(state, ou_noise)[0]
        
        n_state, reward, done, _ = env.step(action)
        
        memory.record((state, action, reward, n_state))
        
        score += reward
        
        memory.learn()
        
        update_target(target_actor.variables, actor_model.variables, tau)
        update_target(target_critic.variables, critic_model.variables, tau)
        
        if done:
            break
        
        state = n_state
    
    score_hist.append(score)
    
    avg = np.mean(score_hist[-40:])
    
    print('Episode', i, 'Avg score', avg)
    avg_hist.append(avg)

Episode 0 Avg score -450.6093964189201
Episode 1 Avg score -398.2833827378147
Episode 2 Avg score -507.59979983803777
Episode 3 Avg score -562.2027921499589
Episode 4 Avg score -569.624299392421
Episode 5 Avg score -586.7020341994127
Episode 6 Avg score -624.9890687880983
Episode 7 Avg score -643.0601275223013
Episode 8 Avg score -647.7114793869096
Episode 9 Avg score -644.667887296718
Episode 10 Avg score -655.5862183105421
Episode 11 Avg score -658.5156396861669
Episode 12 Avg score -668.7519162162496
Episode 13 Avg score -664.3272525351882
Episode 14 Avg score -627.1765433966134
Episode 15 Avg score -596.683019651146
Episode 16 Avg score -574.6843872689534
Episode 17 Avg score -551.622145143026
Episode 18 Avg score -529.6344502987391
Episode 19 Avg score -509.7768479413332
Episode 20 Avg score -492.2695289456448
Episode 21 Avg score -477.1609040225799
Episode 22 Avg score -461.0867557130995
Episode 23 Avg score -446.5700503501494
Episode 24 Avg score -439.87085854448776
Episode 25 A

Episode 201 Avg score -314.99959028859877
Episode 202 Avg score -308.51946348914447
Episode 203 Avg score -302.4559799994255
Episode 204 Avg score -292.6246484399209
Episode 205 Avg score -281.6724068218388
Episode 206 Avg score -275.3840854054074
Episode 207 Avg score -272.0502689624243
Episode 208 Avg score -261.1123994650237
Episode 209 Avg score -259.8560510077425
Episode 210 Avg score -259.27623191204094
Episode 211 Avg score -259.4944045626271
Episode 212 Avg score -244.3705581995167
Episode 213 Avg score -236.40312512696147
Episode 214 Avg score -237.18001002137538
Episode 215 Avg score -242.27250788115862
Episode 216 Avg score -239.7942524872762
Episode 217 Avg score -234.67237759406308
Episode 218 Avg score -227.57832860996808
Episode 219 Avg score -222.4436247573452
Episode 220 Avg score -215.08396207038976
Episode 221 Avg score -208.2982022445098
Episode 222 Avg score -208.0415922102815
Episode 223 Avg score -204.76948858024053
Episode 224 Avg score -198.97930313767844
Episo

In [None]:
plt.plot(avg_hist)
plt.grid()