Necessary imports.

In [4]:
from collections import deque
import keras.models, keras.layers, keras.optimizers, keras.backend
import numpy as np

Define the Experience Replay Memory buffer:

In [None]:
class ExperienceReplayBuffer:
    """Fixed-capacity FIFO store of environment transitions.

    Every stored item is a ``(state, action, reward, next_state)`` tuple.
    Once ``capacity`` items are held, appending a new one drops the oldest
    (``deque(maxlen=...)`` semantics).
    """

    def __init__(self, capacity, batch_size):
        """Create an empty buffer.

        Args:
            capacity: Maximum number of transitions kept in memory.
            batch_size: Number of transitions returned by ``sample_batch``.
        """
        self.batch_size = batch_size
        self.mem = deque(maxlen=capacity)

    def add_env_reaction(self, env_reaction):
        """Append one ``(St, At, Rt+1, St+1)`` transition tuple."""
        self.mem.append(env_reaction)

    def sample_batch(self):
        """Draw ``batch_size`` stored transitions uniformly at random.

        Sampling is done with replacement (the ``np.random.choice``
        default), so a batch may contain duplicates and may be larger than
        the number of transitions currently stored.

        Returns:
            Four arrays ``(states, actions, rewards, next_states)``, each
            with ``batch_size`` entries along the first axis.

        Raises:
            ValueError: If the buffer is empty and ``batch_size`` is
                non-zero (raised by ``np.random.choice``).
        """
        indexes = np.random.choice(len(self.mem), size=self.batch_size)
        batch = [self.mem[index] for index in indexes]
        # Transpose the list of transitions into one array per field.
        states, actions, rewards, next_states = (
            np.array([transition[field] for transition in batch])
            for field in range(4)
        )
        return states, actions, rewards, next_states


### Actor:
    Define the NN for policy approximation and its loss; backpropagate using the action gradients dQ/dA supplied by the Critic.

In [3]:
class Actor:
    """Deterministic policy network mapping a state to a bounded action.

    The network ends in a sigmoid layer whose output in [0, 1] is rescaled
    to ``[action_min, action_min + action_range]``. Training does not use
    ``compile``/``fit``: ``train_nn`` applies one Adam step on the loss
    ``mean(-action_gradient * action)``, where the action gradients are
    fed in from outside.
    """

    def __init__(self, state_space, action_space, action_range, action_min, hidden_units, name):
        """Build the policy model and its custom training function.

        Args:
            state_space: Number of state features (width of the input).
            action_space: Number of action components (width of the output).
            action_range: Width of the valid action interval (max - min).
            action_min: Lower bound of the valid action interval.
            hidden_units: Width of the first and third hidden layers; the
                middle layer is twice as wide.
            name: Label stored on the instance.
        """
        self.state_space = state_space
        self.action_space = action_space
        self.action_range = action_range
        self.action_min = action_min
        self.name = name

        # Network architecture: each layer is applied to the previous
        # tensor so that the model graph is actually connected.
        input_states = keras.layers.Input(shape=(self.state_space,), dtype='float32',
                                          name='input_states')
        fc1 = keras.layers.Dense(units=hidden_units, activation='relu', name='fc1')(input_states)
        fc2 = keras.layers.Dense(units=2 * hidden_units, activation='relu', name='fc2')(fc1)
        fc3 = keras.layers.Dense(units=hidden_units, activation='relu', name='fc3')(fc2)
        norm_action = keras.layers.Dense(units=self.action_space, activation='sigmoid',
                                         name='norm_action')(fc3)

        # Adapt actions for the range in which rotors work.
        actions = keras.layers.Lambda(lambda x: x * self.action_range + self.action_min,
                                      name='actions')(norm_action)
        self.actor_model = keras.models.Model(inputs=[input_states], outputs=[actions])

        # Loss: minimising mean(-grad * action) moves the action along the
        # supplied gradient direction.
        input_act_grad = keras.layers.Input(shape=(self.action_space,), dtype='float32',
                                            name='input_act_grad')
        loss = keras.backend.mean(-input_act_grad * actions)

        # Get trainable parameters and define backprop optimization.
        adam_optimizer = keras.optimizers.Adam()
        train_param = adam_optimizer.get_updates(params=self.actor_model.trainable_weights,
                                                 loss=loss)
        # keras.backend.learning_phase() gives a flag to be passed as input
        # to any Keras function that uses a different behavior at train time
        # and test time.
        self.train_nn = keras.backend.function(
            inputs=[input_states, input_act_grad, keras.backend.learning_phase()],
            outputs=[],
            updates=train_param)

    def get_weights(self):
        """Return the weights of the underlying Keras model."""
        return self.actor_model.get_weights()

    def set_weights(self, weights):
        """Load ``weights`` into the underlying Keras model."""
        self.actor_model.set_weights(weights)
        

### Critic:
    Define the NN for action-value approximation and the action gradients dQ/dA that are passed to the Actor.

In [None]:
class Critc:
    """Action-value network Q(s, a) plus a function returning dQ/da.

    States and actions go through separate two-layer pathways that are
    summed and mapped to a single Q value. ``get_action_gradients`` is a
    backend function taking ``[states, actions, learning_phase]`` and
    returning a one-element list holding the gradient of the Q output
    with respect to the action input.
    """

    def __init__(self, state_space, action_space, hidden_units):
        """Build and compile the critic model.

        Args:
            state_space: Number of state features.
            action_space: Number of action components.
            hidden_units: Width of the first layer of each pathway; the
                second layer is twice as wide.
        """
        self.state_space = state_space
        self.action_space = action_space
        self.hidden_units = hidden_units

        # State pathway.
        input_states = keras.layers.Input(shape=(self.state_space,), dtype='float32',
                                          name='input_states')
        fc_states1 = keras.layers.Dense(units=hidden_units, activation='relu')(input_states)
        fc_states2 = keras.layers.Dense(units=2 * hidden_units, activation='relu')(fc_states1)

        # Action pathway.
        input_actions = keras.layers.Input(shape=(self.action_space,), dtype='float32',
                                           name='input_actions')
        fc_actions1 = keras.layers.Dense(units=hidden_units, activation='relu')(input_actions)
        fc_actions2 = keras.layers.Dense(units=2 * hidden_units, activation='relu')(fc_actions1)

        # Merge both pathways into a joint state-action representation.
        fc_sa1 = keras.layers.Add()([fc_states2, fc_actions2])
        fc_sa2 = keras.layers.Activation('relu')(fc_sa1)
        # Linear output: a ReLU here would make negative action values
        # unrepresentable and zero the gradient whenever the unit is off.
        q_values = keras.layers.Dense(units=1, name='q_values')(fc_sa2)
        self.critic_model = keras.models.Model(inputs=[input_states, input_actions],
                                               outputs=[q_values])

        # Optimizer and Loss.
        adam_optimizer = keras.optimizers.Adam()
        self.critic_model.compile(loss='mean_squared_error', optimizer=adam_optimizer)

        # Function to get the gradients of Q with respect to the actions.
        # backend.gradients already returns a list, so it is passed as
        # ``outputs`` directly rather than nested in another list.
        action_gradients = keras.backend.gradients(loss=q_values, variables=[input_actions])
        self.get_action_gradients = keras.backend.function(
            inputs=[input_states, input_actions, keras.backend.learning_phase()],
            outputs=action_gradients)

    def get_weights(self):
        """Return the weights of the underlying Keras model."""
        return self.critic_model.get_weights()

    def set_weights(self, weights):
        """Load ``weights`` into the underlying Keras model."""
        self.critic_model.set_weights(weights)


# Correctly spelled alias; the original (misspelled) class name is kept so
# that existing references to ``Critc`` continue to work.
Critic = Critc

### Deep Deterministic Policy Gradient, DDPG Agent:
    Agent definition following DDPG

In [None]:
class DDPG_Agent:
    """Deep Deterministic Policy Gradient agent.

    Holds a local and a target copy of both the actor and the critic, a
    replay buffer, and the hyper-parameters of the update rule.
    """

    def __init__(self, task):
        """Create the networks, replay memory and hyper-parameters.

        Args:
            task: Environment description exposing ``action_low``,
                ``action_high``, ``state_size`` and ``action_size``.
        """
        self.task = task
        self.action_low = self.task.action_low
        self.action_high = self.task.action_high
        self.state_space = self.task.state_size
        self.action_space = self.task.action_size
        action_range = self.action_high - self.action_low

        # Instantiate Actors and Critics. The critic class is declared
        # under the name ``Critc`` in this file.
        self.actor = Actor(self.state_space, self.action_space, action_range,
                           self.action_low, hidden_units=32, name='actor')
        self.actor_target = Actor(self.state_space, self.action_space, action_range,
                                  self.action_low, hidden_units=32, name='actor_target')
        self.critic = Critc(self.state_space, self.action_space, hidden_units=32)
        self.critic_target = Critc(self.state_space, self.action_space, hidden_units=32)

        # Set same weights in target. The weights live on the wrapped
        # Keras models, not on the wrapper objects.
        self.actor_target.actor_model.set_weights(self.actor.actor_model.get_weights())
        self.critic_target.critic_model.set_weights(self.critic.critic_model.get_weights())

        # Noise for exploration: zero-mean Gaussian whose standard
        # deviation is 10% of the action range.
        # NOTE(review): simple placeholder; swap in an Ornstein-Uhlenbeck
        # process if temporally correlated exploration is wanted.
        self.noise_sigma = 0.1 * action_range

        # Experience Replay memory.
        self.capacity = 100000
        self.batch_size = 64
        self.er_buffer = ExperienceReplayBuffer(capacity=self.capacity,
                                                batch_size=self.batch_size)

        # RL parameters.
        self.gamma = 0.99  # Discount factor.
        self.t = 0.01      # Soft-update rate (tau) for the target networks.

        # Last state seen by store_learn.
        self.state = None

    def act(self, state):
        """Return a noisy action for a single ``state``.

        The policy output gets exploration noise added and is clipped back
        into ``[action_low, action_high]``.

        Returns:
            1-D array with ``action_space`` entries.
        """
        batch = np.reshape(state, (-1, self.state_space))
        # predict returns a batch; keep the single row.
        action = self.actor.actor_model.predict(batch)[0]
        noise = np.random.normal(loc=0.0, scale=self.noise_sigma, size=self.action_space)
        return np.clip(action + noise, self.action_low, self.action_high)

    def store_learn(self, state, action, reward, next_state):
        """Save one experience into memory and update the networks."""
        # Store experience into exp replay memory.
        self.er_buffer.add_env_reaction((state, action, reward, next_state))

        # Learn if agent has enough experiences.
        if len(self.er_buffer.mem) > self.batch_size:
            self.learn()

        # Update to the current state of the environment.
        self.state = next_state

    def learn(self):
        """Run one DDPG update on a batch sampled from replay memory.

        Updates the critic towards the bootstrapped target, updates the
        actor along dQ/da, then soft-updates both target networks.

        NOTE(review): stored transitions carry no terminal flag, so the
        target bootstraps through episode ends as well.
        """
        states, actions, rewards, next_states = self.er_buffer.sample_batch()
        states = np.asarray(states, dtype=np.float32).reshape(-1, self.state_space)
        actions = np.asarray(actions, dtype=np.float32).reshape(-1, self.action_space)
        rewards = np.asarray(rewards, dtype=np.float32).reshape(-1, 1)
        next_states = np.asarray(next_states, dtype=np.float32).reshape(-1, self.state_space)

        # Estimated target Q values: r + gamma * Q'(s', mu'(s')).
        next_actions = self.actor_target.actor_model.predict_on_batch(next_states)
        next_q = self.critic_target.critic_model.predict_on_batch([next_states, next_actions])
        q_targets = rewards + self.gamma * next_q

        # Critic update (MSE against the targets).
        self.critic.critic_model.train_on_batch(x=[states, actions], y=q_targets)

        # Actor update: dQ/da evaluated at the actor's current actions.
        # The trailing 0 / 1 is the Keras learning-phase flag (test / train).
        policy_actions = self.actor.actor_model.predict_on_batch(states)
        action_gradients = np.reshape(
            self.critic.get_action_gradients([states, policy_actions, 0]),
            (-1, self.action_space))
        self.actor.train_nn([states, action_gradients, 1])

        # Let the target networks slowly track the local ones.
        self._soft_update(self.actor.actor_model, self.actor_target.actor_model)
        self._soft_update(self.critic.critic_model, self.critic_target.critic_model)

    def _soft_update(self, local_model, target_model):
        """Move ``target_model`` weights a fraction ``self.t`` towards ``local_model``."""
        target_model.set_weights(self._blend_weights(local_model.get_weights(),
                                                     target_model.get_weights(),
                                                     self.t))

    @staticmethod
    def _blend_weights(local_weights, target_weights, tau):
        """Return ``tau * local + (1 - tau) * target`` for each weight array."""
        return [tau * local + (1.0 - tau) * target
                for local, target in zip(local_weights, target_weights)]
        
        
        

Run the agent on the environment: