In [63]:
import tensorflow as tf
from tensorflow.keras.layers import Concatenate,Dense
import gym
import numpy as np
from typing import List
import tqdm
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tensorflow.keras.optimizers import Adam

In [64]:
env = gym.make('BipedalWalker-v3')
print(env.action_space.high)
print(env.action_space.low)
state_dim = env.observation_space.shape
action_dim = env.action_space.shape
print(state_dim[0],action_dim[0])

[1. 1. 1. 1.]
[-1. -1. -1. -1.]
24 4


In [65]:
def env_step(action: np.ndarray) -> List[np.ndarray]:
    state, reward, done, _ = env.step(action)
    return (
        state.astype(np.float32),
        np.array(reward, np.float32),
        np.array(done, np.int32)
    )

def tf_env_step(action: tf.Tensor):
    return tf.numpy_function(
        env_step, [action], [tf.float32, tf.float32, tf.int32]
    )


In [66]:
last_weight_init = tf.keras.initializers.RandomUniform(minval=-.003, maxval=.003)

In [67]:
bound = tf.constant(env.action_space.high)

In [68]:
# func approx for deterministic policy 
class Actor(tf.keras.Model):
    def __init__(self,action_shape):
        super(Actor,self).__init__()
        self.fc1 = Dense(64,activation='relu')
        self.fc2 = Dense(32,activation='relu')
        #fed through tanh to bound actions between (-1,1) 
        self.fc3 = Dense(action_shape,activation='tanh',kernel_initializer=last_weight_init)
    def call(self,inputs):
        x = self.fc1(inputs)
        x = self.fc2(x)
        x = self.fc3(x)
        return x * bound
    
# func approx for Q(s,a)
class Critic(tf.keras.Model):
    def __init__(self,action_shape):
        super(Critic,self).__init__()
        self.state_fc1 = Dense(64,activation='relu')
        self.state_fc2 = Dense(32,activation='relu')
        
        self.action_fc1 = Dense(64,activation='relu')
        
        self.concat = Concatenate()

        self.out = Dense(1,kernel_initializer=last_weight_init)
    
    def call(self,inputs,training):
        [state,action] = inputs
        state_x = self.state_fc1(state)
        state_x = self.state_fc2(state_x)
        
        action_x = self.action_fc1(action)
        
        concat_x = self.concat([state_x,action_x])
        
        return self.out(concat_x)
        

In [83]:

data_spec =  (
    tf.TensorSpec([state_dim[0]], tf.float32, 'state'),
    tf.TensorSpec([action_dim[0]], tf.float32, 'action'),
    tf.TensorSpec([1], tf.float32, 'reward'),
    tf.TensorSpec([state_dim[0]], tf.float32, 'next_state'),
    tf.TensorSpec([1], tf.int32, 'done'),
)

batch_size = 1
max_length = int(1e6)

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(data_spec,batch_size=batch_size,max_length=max_length)

In [77]:
class ReplayMemory:	

    def __init__(self,capacity,state_shape,action_shape):
        self.capacity = capacity
        self.num_in_memory = 0
        self.n_idx = 0
        self.states = np.empty((capacity,)+state_shape,dtype=np.float32)
        self.actions = np.empty((capacity,)+action_shape)
        self.rewards = np.empty((capacity,))
        self.next_states = np.empty((capacity,)+state_shape,dtype=np.float32)
        self.dones = np.empty((capacity,))

    def store(self,state,action,reward,next_state,done):
        self.states[self.n_idx] = state
        self.actions[self.n_idx] = action
        self.rewards[self.n_idx] = reward
        self.next_states[self.n_idx] = next_state
        self.dones[self.n_idx] = done
        self.n_idx = (self.n_idx + 1) % self.capacity
        self.num_in_memory = min(self.num_in_memory + 1, self.capacity)

    def sample(self,batch_size):
        size = batch_size
        if size > self.num_in_memory:
            size = self.num_in_memory 
        idxs = np.random.choice(self.num_in_memory,size)
        
        return (
            tf.convert_to_tensor(self.states[idxs]),
            tf.convert_to_tensor(self.actions[idxs]),
            tf.convert_to_tensor(self.rewards[idxs]),
            tf.convert_to_tensor(self.next_states[idxs]),
            tf.convert_to_tensor(self.dones[idxs]),
        )

In [75]:
actor = Actor(env.action_space.shape[0])
critic = Critic(env.action_space.shape[0])

actor_t_net = Actor(env.action_space.shape[0])
critic_t_net = Critic(env.action_space.shape[0])

actor_t_net.set_weights(actor.get_weights())
critic_t_net.set_weights(critic.get_weights())

max_episodes = 100
gamma = .99
max_steps = 10000
actor_lr = .0001
critic_lr = .0001

actor_optim = Adam(learning_rate=actor_lr)
critic_optim = Adam(learning_rate=critic_lr)


capacity = int(1e6)
state_dim = env.observation_space.shape
action_dim = env.action_space.shape

In [79]:
replay_memory = ReplayMemory(capacity,state_dim,action_dim)

In [82]:
state = tf.constant(env.reset(),dtype=tf.float32)
while True:
    action = actor(tf.expand_dims(state,0))
    next_state,reward,done = tf_env_step(tf.squeeze(action))
    
    replay_memory.store(state,action,reward,next_state,done)

    if tf.cast(done,tf.bool):
        break

replay_memory.sample(2)

(<tf.Tensor: shape=(2, 24), dtype=float32, numpy=
 array([[ 2.7474794e-03, -7.9761385e-06,  6.2046666e-04, -1.5999949e-02,
          9.2039801e-02, -8.1880658e-04,  8.6022431e-01,  2.0082442e-03,
          1.0000000e+00,  3.2446746e-02, -8.1874535e-04,  8.5377127e-01,
          5.7906413e-04,  1.0000000e+00,  4.4081402e-01,  4.4582012e-01,
          4.6142277e-01,  4.8955020e-01,  5.3410280e-01,  6.0246104e-01,
          7.0914888e-01,  8.8593185e-01,  1.0000000e+00,  1.0000000e+00],
        [ 2.7474267e-03, -1.4054701e-05,  1.0933396e-03, -1.5999904e-02,
          9.1957770e-02, -1.4428418e-03,  8.6027026e-01,  2.4691357e-03,
          1.0000000e+00,  3.2363418e-02, -1.4427404e-03,  8.5381937e-01,
          1.0203881e-03,  1.0000000e+00,  4.4081411e-01,  4.4582021e-01,
          4.6142289e-01,  4.8955029e-01,  5.3410292e-01,  6.0246116e-01,
          7.0914906e-01,  8.8593203e-01,  1.0000000e+00,  1.0000000e+00]],
       dtype=float32)>,
 <tf.Tensor: shape=(2, 4), dtype=float64, numpy

In [84]:
def update_target(target_weights, weights, tau):
    for (a, b) in zip(target_weights, weights):
        a.assign(b * tau + a * (1 - tau))

In [85]:
def learn(states,actions,rewards,next_states,dones):
    with tf.GradientTape() as tape:
        target_actions = actor_t_net(next_states)
        td_target = rewards + gamma * critic_t_net([next_states,actions])
        td_pred = critic([states,target_actions])
        critic_loss = tf.reduce_mean(tf.math.square(td_pred-td_target))
    critic_grads = tape.gradient(critic_loss,critic.trainable_variables)
    critic_optim.apply_gradients(zip(critic_grads,critic.trainable_variables))

    with tf.GradientTape() as tape:
        actions = actor(states)
        action_values = critic([states,actions])
        actor_loss = -tf.math.reduce_mean(action_values)
    actor_grads = tape.gradient(actor_loss,actor.trainable_variables)
    actor_optim.apply_gradients(zip(actor_grads,actor.trainable_variables))

In [86]:
@tf.function
def train_step(initial_state: tf.Tensor, gamma: float, max_steps: int, batch_size: int, tau: float, transitions_stored: tf.Tensor) -> tf.Tensor:
    
    state = initial_state
    state_shape = initial_state.shape
    
    reward_shape = (1,)
    episode_reward = tf.constant([0],dtype=tf.float32)
    episode_reward.set_shape(reward_shape)
    
    batch_sz = tf.constant(batch_size)
    
    for t in tf.range(max_steps):
        action = actor(tf.expand_dims(state,0))
        action = tf.squeeze(action)
        
        next_state,reward,done = tf_env_step(action)
        
        reward,done = [
          tf.expand_dims(x,0)  for x in [reward,done]
        ]
                
        next_state.set_shape(state_shape)
        reward.set_shape(reward_shape)
        done.set_shape(reward_shape)

        transition = (state,action,reward,next_state,done)
        transition = tf.nest.map_structure(lambda t: tf.stack([t] * 1),
                                       transition)
        
        replay_buffer.add_batch(transition)
        transitions_stored += tf.constant(1)
        
        if tf.math.greater_equal(transitions_stored,batch_sz):
            sample = replay_buffer.as_dataset(sample_batch_size=128,num_steps=1)
            iterator = iter(sample)
            (states,actions,rewards,next_states,dones),_ = iterator.next()
            
            learn(states,actions,rewards,next_states,dones)
            
            update_target(actor_t_net.trainable_variables,actor.trainable_variables,tau)
            
            update_target(critic_t_net.trainable_variables,critic.trainable_variables,tau)
            
        
        state = next_state
        
        episode_reward += reward 
        episode_reward.set_shape(reward_shape)
    
        if tf.cast(done,tf.bool):
            break

    return episode_reward,transitions_stored


In [87]:
running_reward = 0
reward_threshold = 195
transitions_stored = tf.constant(0)

with tqdm.trange(max_episodes) as t:
    rewards = []
    for i in t:
        state = tf.constant(env.reset(),dtype=tf.float32)
        episode_reward,transitions_stored = train_step(state,gamma,max_steps,128,.005,transitions_stored)
        
        episode_reward = float(episode_reward)
        
        rewards.append(episode_reward)
                
        running_reward = np.mean(rewards[-100:])
    
        t.set_description(f"Episode {i}")
        t.set_postfix(episode_reward=episode_reward,running_reward=running_reward)
        
        if running_reward > reward_threshold:
            break

  0%|          | 0/100 [00:00<?, ?it/s]

(<tf.Tensor: shape=(1, 24), dtype=float32, numpy=
array([[ 2.7449911e-03,  1.5867983e-05, -2.0691280e-03, -1.6000107e-02,
         9.2661008e-02,  4.8033712e-03,  8.5960138e-01, -2.4167576e-03,
         1.0000000e+00,  3.2931868e-02,  4.8031555e-03,  8.5344452e-01,
        -3.2682212e-03,  1.0000000e+00,  4.4081321e-01,  4.4581932e-01,
         4.6142194e-01,  4.8954931e-01,  5.3410184e-01,  6.0245991e-01,
         7.0914757e-01,  8.8593018e-01,  1.0000000e+00,  1.0000000e+00]],
      dtype=float32)>, <tf.Tensor: shape=(1, 4), dtype=float32, numpy=
array([[-0.00569507, -0.00061952, -0.0041189 ,  0.00453697]],
      dtype=float32)>, <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[-0.08055206]], dtype=float32)>, <tf.Tensor: shape=(1, 24), dtype=float32, numpy=
array([[-5.0083953e-03,  4.5464060e-04, -5.2258424e-03, -1.1077368e-03,
         4.3538982e-01,  5.7960466e-02,  1.1553264e-01, -7.4642710e-02,
         1.0000000e+00,  3.3807853e-01,  5.1104367e-02,  1.1713135e-01,
        

(<tf.Tensor: shape=(1, 24), dtype=float32, numpy=
array([[-2.5518425e-03,  4.3699349e-04, -9.8409150e-03, -1.0743592e-01,
         5.2298146e-01,  3.4207505e-01, -6.0381651e-02, -4.4763181e-01,
         1.0000000e+00,  4.3342188e-01,  3.4333494e-01, -6.1032772e-02,
        -4.4712695e-01,  1.0000000e+00,  4.2929652e-01,  4.3417183e-01,
         4.4936684e-01,  4.7675931e-01,  5.2014786e-01,  5.8672005e-01,
         6.9062042e-01,  8.6278439e-01,  1.0000000e+00,  1.0000000e+00]],
      dtype=float32)>, <tf.Tensor: shape=(1, 4), dtype=float32, numpy=
array([[-0.00336604,  0.00172603, -0.00113456,  0.00654181]],
      dtype=float32)>, <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[-0.00618123]], dtype=float32)>, <tf.Tensor: shape=(1, 24), dtype=float32, numpy=
array([[-2.2734853e-03,  5.3724454e-04, -1.0076993e-02, -1.2298283e-01,
         5.5251133e-01,  3.7028050e-01, -1.1865997e-01, -4.8570451e-01,
         1.0000000e+00,  4.6319175e-01,  3.7181079e-01, -1.1937320e-01,
        

(<tf.Tensor: shape=(1, 24), dtype=float32, numpy=
array([[ 0.00270556,  0.00225368, -0.00823122, -0.23416653,  0.7992612 ,
         0.48336703, -0.60824347, -0.6521438 ,  1.        ,  0.71199065,
         0.4881641 , -0.6089184 , -0.651712  ,  1.        ,  0.36236835,
         0.3664836 ,  0.37930965,  0.4024316 ,  0.43905574,  0.49524924,
         0.5829513 ,  0.7282746 ,  0.9995369 ,  1.        ]],
      dtype=float32)>, <tf.Tensor: shape=(1, 4), dtype=float32, numpy=
array([[-0.00611022,  0.0077337 , -0.00147207,  0.00458095]],
      dtype=float32)>, <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[-0.02764099]], dtype=float32)>, <tf.Tensor: shape=(1, 24), dtype=float32, numpy=
array([[ 0.00951295,  0.01359569,  0.00978391, -0.00664199,  0.79114676,
        -0.1140463 , -0.6100557 , -0.00992183,  1.        ,  0.7037786 ,
        -0.10096624, -0.6102526 , -0.01929898,  1.        ,  0.36208385,
         0.36619586,  0.37901187,  0.40211567,  0.43871105,  0.49486044,
         0.5

(<tf.Tensor: shape=(1, 24), dtype=float32, numpy=
array([[ 7.4619032e-02,  2.2665614e-02,  1.9722503e-02, -2.1549726e-03,
         7.0428878e-01, -1.8849599e-01, -6.0785055e-01,  2.5542578e-04,
         1.0000000e+00,  6.1700827e-01, -1.8834859e-01, -6.0847354e-01,
        -4.6525648e-04,  1.0000000e+00,  3.6214706e-01,  3.6625978e-01,
         3.7907803e-01,  4.0218586e-01,  4.3878764e-01,  4.9494681e-01,
         5.8259535e-01,  7.2782987e-01,  9.9889618e-01,  1.0000000e+00]],
      dtype=float32)>, <tf.Tensor: shape=(1, 4), dtype=float32, numpy=
array([[-0.0102059 ,  0.00809904, -0.0018363 ,  0.0064831 ]],
      dtype=float32)>, <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[-0.04636537]], dtype=float32)>, <tf.Tensor: shape=(1, 24), dtype=float32, numpy=
array([[ 8.6757734e-02,  2.4276029e-02,  2.1392146e-02, -2.5538795e-03,
         6.8807483e-01, -2.0218827e-01, -6.0775685e-01,  2.7461597e-04,
         1.0000000e+00,  6.0083073e-01, -2.0203276e-01, -6.0848820e-01,
        

(<tf.Tensor: shape=(1, 24), dtype=float32, numpy=
array([[ 1.7968564e-01,  3.6394645e-02,  3.3643499e-02, -6.6606104e-03,
         5.6350917e-01, -3.0455214e-01, -6.0695338e-01,  4.1144589e-04,
         1.0000000e+00,  4.7653294e-01, -3.0435255e-01, -6.0851920e-01,
        -7.4487430e-04,  1.0000000e+00,  3.6108696e-01,  3.6518764e-01,
         3.7796837e-01,  4.0100855e-01,  4.3750319e-01,  4.9349797e-01,
         5.8088994e-01,  7.2569931e-01,  9.9594015e-01,  1.0000000e+00]],
      dtype=float32)>, <tf.Tensor: shape=(1, 4), dtype=float32, numpy=
array([[-0.01267665,  0.0111063 , -0.00506366,  0.00991366]],
      dtype=float32)>, <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[-0.07295515]], dtype=float32)>, <tf.Tensor: shape=(1, 24), dtype=float32, numpy=
array([[ 1.9912244e-01,  3.8870744e-02,  3.6115721e-02, -7.7438233e-03,
         5.3741246e-01, -3.2535422e-01, -6.0676718e-01,  4.3770173e-04,
         1.0000000e+00,  4.5048946e-01, -3.2514876e-01, -6.0850620e-01,
        

(<tf.Tensor: shape=(1, 24), dtype=float32, numpy=
array([[ 3.7576306e-01,  6.0173836e-02,  5.7704974e-02, -2.0196369e-02,
         3.0001247e-01, -5.0503206e-01, -6.0434341e-01,  2.3442705e-03,
         1.0000000e+00,  2.1370293e-01, -5.0355202e-01, -6.0769534e-01,
         1.9619863e-04,  1.0000000e+00,  3.5675520e-01,  3.6080670e-01,
         3.7343407e-01,  3.9619789e-01,  4.3225470e-01,  4.8757774e-01,
         5.7392132e-01,  7.1699351e-01,  9.8398525e-01,  1.0000000e+00]],
      dtype=float32)>, <tf.Tensor: shape=(1, 4), dtype=float32, numpy=
array([[-0.01791002,  0.01657849, -0.00595549,  0.01162715]],
      dtype=float32)>, <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[-0.11825524]], dtype=float32)>, <tf.Tensor: shape=(1, 24), dtype=float32, numpy=
array([[ 4.0765548e-01,  6.3781634e-02,  6.1468650e-02, -2.3030441e-02,
         2.5717568e-01, -5.3498483e-01, -6.0379934e-01,  2.4433434e-03,
         1.0000000e+00,  1.7099078e-01, -5.3344488e-01, -6.0744727e-01,
        

(<tf.Tensor: shape=(1, 24), dtype=float32, numpy=
array([[ 6.86707199e-01,  9.24331471e-02,  9.31447446e-02,
        -5.23540787e-02, -1.15865245e-01, -7.69233584e-01,
        -5.98902106e-01,  3.35023808e-03,  1.00000000e+00,
        -2.01017603e-01, -7.67323136e-01, -6.05111241e-01,
         1.90814331e-04,  1.00000000e+00,  3.43890518e-01,
         3.47795904e-01,  3.59967947e-01,  3.81910890e-01,
         4.16667491e-01,  4.69995588e-01,  5.53225577e-01,
         6.91138566e-01,  9.48629141e-01,  1.00000000e+00]], dtype=float32)>, <tf.Tensor: shape=(1, 4), dtype=float32, numpy=
array([[-0.02422138,  0.02239325, -0.0068843 ,  0.01557134]],
      dtype=float32)>, <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[-0.17653352]], dtype=float32)>, <tf.Tensor: shape=(1, 24), dtype=float32, numpy=
array([[ 7.3513985e-01,  9.6879609e-02,  9.8443784e-02, -5.7984892e-02,
        -1.8019463e-01, -8.0478144e-01, -5.9806991e-01,  3.4636061e-03,
         1.0000000e+00, -2.6517892e-01, -8.028

(<tf.Tensor: shape=(1, 24), dtype=float32, numpy=
array([[ 1.1351149e+00,  1.2508373e-01,  1.4030159e-01, -1.0257676e-01,
        -7.0553613e-01, -1.0233953e+00, -5.9252000e-01,  3.3623774e-03,
         1.0000000e+00, -7.8932232e-01, -1.0216221e+00, -6.0241747e-01,
        -5.3714711e-04,  1.0000000e+00,  3.1433839e-01,  3.1790817e-01,
         3.2903421e-01,  3.4909147e-01,  3.8086128e-01,  4.2960665e-01,
         5.0568426e-01,  6.3174576e-01,  8.6748081e-01,  1.0000000e+00]],
      dtype=float32)>, <tf.Tensor: shape=(1, 4), dtype=float32, numpy=
array([[-0.03314739,  0.03006643, -0.00659119,  0.02052067]],
      dtype=float32)>, <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[-0.22131509]], dtype=float32)>, <tf.Tensor: shape=(1, 24), dtype=float32, numpy=
array([[ 1.1989928e+00,  1.2780677e-01,  1.4700538e-01, -1.0823329e-01,
        -7.8873962e-01, -1.0446678e+00, -5.9182119e-01,  3.9820173e-03,
         1.0000000e+00, -8.7235242e-01, -1.0428272e+00, -6.0218775e-01,
        

(<tf.Tensor: shape=(1, 24), dtype=float32, numpy=
array([[ 1.1136305e+00, -2.6309311e-02,  1.2782806e-01, -2.5382308e-02,
        -7.9082650e-01, -5.3482130e-02, -5.8777213e-01,  2.6094088e-02,
         1.0000000e+00, -8.3353168e-01,  1.4305115e-06, -6.2718761e-01,
        -6.3175163e-03,  0.0000000e+00,  3.0266863e-01,  3.0610588e-01,
         3.1681889e-01,  3.3613151e-01,  3.6672190e-01,  4.1365758e-01,
         4.8691085e-01,  6.0829234e-01,  8.3527571e-01,  1.0000000e+00]],
      dtype=float32)>, <tf.Tensor: shape=(1, 4), dtype=float32, numpy=
array([[-0.03256854,  0.03013749, -0.0022546 ,  0.01963756]],
      dtype=float32)>, <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.16687249]], dtype=float32)>, <tf.Tensor: shape=(1, 24), dtype=float32, numpy=
array([[ 1.0995113e+00, -2.8225983e-02,  1.3482380e-01, -2.6189273e-02,
        -7.9609257e-01, -6.5697238e-02, -5.8297646e-01,  3.7571728e-02,
         1.0000000e+00, -8.3357328e-01,  3.5762787e-07, -6.2715352e-01,
        -

(<tf.Tensor: shape=(1, 24), dtype=float32, numpy=
array([[ 9.7733676e-01, -4.0111572e-02,  1.8339695e-01, -5.0455585e-02,
        -8.0268353e-01, -1.9669533e-06, -5.6666017e-01,  1.2539913e-02,
         1.0000000e+00, -8.3342922e-01,  0.0000000e+00, -6.2501585e-01,
         3.9666891e-04,  0.0000000e+00,  2.8870970e-01,  2.9198843e-01,
         3.0220735e-01,  3.2062930e-01,  3.4980884e-01,  3.9457989e-01,
         4.6445477e-01,  5.8023816e-01,  7.9675323e-01,  1.0000000e+00]],
      dtype=float32)>, <tf.Tensor: shape=(1, 4), dtype=float32, numpy=
array([[-0.04135492,  0.03903069,  0.00029432,  0.02402167]],
      dtype=float32)>, <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.24196954]], dtype=float32)>, <tf.Tensor: shape=(1, 24), dtype=float32, numpy=
array([[ 9.5628250e-01, -4.2087857e-02,  1.9081303e-01, -5.5431604e-02,
        -8.0269545e-01, -2.7418137e-06, -5.6475592e-01,  1.3148059e-02,
         1.0000000e+00, -8.3342999e-01,  0.0000000e+00, -6.2462509e-01,
         

  0%|          | 0/100 [00:10<?, ?it/s]


KeyboardInterrupt: 