Necessary imports.

In [3]:
from collections import deque
from task import Task
from keras import layers, models, optimizers
from keras import backend
import numpy as np
import copy
import sys
import pixiedust

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Pixiedust database opened successfully


Define the Experience Replay Memory buffer:

In [None]:
class ExperienceReplayBuffer:
    """Fixed-capacity FIFO store of environment transitions with uniform sampling.

    Every stored item is a 5-tuple ``(state, action, reward, done, next_state)``.
    The underlying ``deque`` is created with ``maxlen=capacity``, so once it is
    full each new transition evicts the oldest one.
    """

    def __init__(self, capacity, batch_size):
        """Create an empty buffer.

        Args:
            capacity: Maximum number of transitions kept in memory.
            batch_size: Number of transitions returned by ``sample_batch``.
        """
        self.batch_size = batch_size
        self.mem = deque(maxlen=capacity)

    def add_env_reaction(self, env_reaction):
        """Append one ``(St, At, Rt1, Dt, St1)`` transition to the buffer."""
        self.mem.append(env_reaction)

    def sample_batch(self):
        """Draw ``self.batch_size`` distinct transitions uniformly at random.

        Returns:
            A 5-tuple of NumPy arrays ``(states, actions, rewards, dones,
            next_states)``, each with the sampled transitions along axis 0.

        Raises:
            ValueError: If the buffer holds fewer than ``self.batch_size``
                transitions (sampling is done without replacement).
        """
        # Use the instance's batch size, not a same-named module-level global.
        indexes = np.random.choice(len(self.mem), size=self.batch_size, replace=False)
        batch = [self.mem[index] for index in indexes]
        # Regroup the list of transitions into one array per field.
        return tuple(
            np.array([transition[field] for transition in batch])
            for field in range(5)
        )


### Actor:
    Define NN for policy approximation and specify the loss; backprop uses the action gradients dQ/dA supplied by the Critic.

In [167]:
class Actor:
    """Deterministic policy network mapping a state to an action vector.

    The network is three ReLU dense layers (``hidden_units``,
    ``2*hidden_units``, ``hidden_units``) followed by a sigmoid layer of size
    ``action_space``. The sigmoid output is rescaled to
    ``action_min + action_range * sigmoid``.

    Attributes:
        actor_model: Keras model taking states and returning scaled actions.
        train_nn: Keras backend function called as
            ``train_nn([states, action_gradients, learning_phase])``; it has
            no outputs and applies one Adam update to the model weights.
    """

    def __init__(self, state_space, action_space, action_range, action_min, hidden_units, name):
        """Build the policy network and its custom training function.

        Args:
            state_space: Size of the state vector (input width).
            action_space: Size of the action vector (output width).
            action_range: Width of the action interval used to rescale the
                sigmoid output.
            action_min: Lower bound added after rescaling.
            hidden_units: Base width of the hidden layers.
            name: Label stored on the instance; it is not passed to Keras.
        """
        self.state_space = state_space
        self.action_space = action_space
        self.action_range = action_range
        self.action_min = action_min
        self.name = name

        # Network architecture.
        input_states = layers.Input(shape=(self.state_space,), dtype=np.float32, name='input_states')
        fc1 = layers.Dense(units=hidden_units, activation='relu', name='fc1')(input_states)
        fc2 = layers.Dense(units=2*hidden_units, activation='relu', name='fc2')(fc1)
        fc3 = layers.Dense(units=hidden_units, activation='relu', name='fc3')(fc2)
        norm_action = layers.Dense(self.action_space, activation='sigmoid', name='norm_action')(fc3)

        # Adapt the (0, 1) sigmoid output to the range in which the rotors work.
        actions = layers.Lambda(lambda x: x*self.action_range + self.action_min, name='actions')(norm_action)
        # `inputs=` / `outputs=` are the Keras 2 keyword names (same as Critic).
        self.actor_model = models.Model(inputs=[input_states], outputs=[actions])

        # Loss: minimizing mean(-dQ/dA * A) moves the actions along the
        # gradients supplied by the caller.
        input_act_grad = layers.Input(shape=(self.action_space,), dtype=np.float32, name='input_act_grad')
        loss = backend.mean(-input_act_grad*actions)

        # Get trainable parameters and define backprop optimization.
        adam_optimizer = optimizers.Adam()
        train_param = adam_optimizer.get_updates(params=self.actor_model.trainable_weights, loss=loss)
        # keras.backend.learning_phase() gives a flag to be passed as input
        # to any Keras function that uses a different behavior at train time and test time.
        self.train_nn = backend.function(
            inputs=[input_states, input_act_grad, backend.learning_phase()],
            outputs=[],
            updates=train_param)
        

### Critic:
    Define NN for action-value approximation and expose the action gradients dQ/dA to pass to the Actor.

In [168]:
class Critic:
    """Action-value network Q(s, a) with a helper returning dQ/dA.

    States and actions each pass through their own two ReLU dense layers
    (``hidden_units`` then ``2*hidden_units``). The two pathways are summed,
    passed through a ReLU, and projected to a single Q-value.

    Attributes:
        critic_model: Keras model taking ``[states, actions]`` and returning
            Q-values; compiled with mean-squared-error loss and Adam.
        get_action_gradients: Keras backend function called as
            ``get_action_gradients([states, actions, learning_phase])``; it
            returns a one-element list holding the gradient of the Q-values
            with respect to the action input.
    """

    def __init__(self, state_space, action_space, hidden_units):
        """Build the Q-network and the action-gradient function.

        Args:
            state_space: Size of the state vector.
            action_space: Size of the action vector.
            hidden_units: Base width of the hidden layers in both pathways.
        """
        self.state_space = state_space
        self.action_space = action_space
        self.hidden_units = hidden_units

        # State pathway.
        input_states = layers.Input(shape=(self.state_space,), dtype=np.float32, name='input_states')
        fc_states1 = layers.Dense(units=hidden_units, activation='relu')(input_states)
        fc_states2 = layers.Dense(units=2*hidden_units, activation='relu')(fc_states1)

        # Action pathway.
        input_actions = layers.Input(shape=(self.action_space,), dtype=np.float32, name='input_actions')
        fc_actions1 = layers.Dense(units=hidden_units, activation='relu')(input_actions)
        fc_actions2 = layers.Dense(units=2*hidden_units, activation='relu')(fc_actions1)

        # Merge the state and action pathways.
        fc_sa1 = layers.Add()([fc_states2, fc_actions2])
        fc_sa2 = layers.Activation('relu')(fc_sa1)

        # Linear output: a ReLU here would clamp Q-values at zero, making
        # negative returns unrepresentable and zeroing dQ/dA once clamped.
        q_values = layers.Dense(units=1, name='q_values')(fc_sa2)
        self.critic_model = models.Model(inputs=[input_states, input_actions], outputs=[q_values])

        # Optimizer and loss.
        adam_optimizer = optimizers.Adam()
        self.critic_model.compile(loss='mean_squared_error', optimizer=adam_optimizer)

        # Define function to get action gradients (dQ/dA) for the Actor update.
        action_gradients = backend.gradients(loss=q_values, variables=[input_actions])
        self.get_action_gradients = backend.function(
            inputs=[input_states, input_actions, backend.learning_phase()],
            outputs=action_gradients)

### Ornstein–Uhlenbeck process definition for exploration:

In [169]:
class OUNoise:
    """Temporally correlated exploration noise (Ornstein-Uhlenbeck style).

    The process keeps one value per action dimension. Each sample pulls the
    value back toward ``mean`` with strength ``theta`` and adds Gaussian
    noise scaled by ``sigma``.
    """

    def __init__(self, action_space, mean, sigma, theta):
        """Set up the process and start it at its mean.

        Args:
            action_space: Number of noise dimensions.
            mean: Scalar the process reverts to, broadcast to every dimension.
            sigma: Scale of the Gaussian perturbation added at each step.
            theta: Strength of the pull toward ``mean`` at each step.
        """
        self.mean = np.ones(action_space) * mean
        self.sigma = sigma
        self.theta = theta
        self.restart()

    def restart(self):
        """Reset the process state to a copy of the mean vector."""
        self.current = copy.copy(self.mean)

    def sample(self):
        """Advance the process one step and return the new value."""
        previous = self.current
        pull = self.theta * (self.mean - previous)
        jitter = self.sigma * np.random.randn(len(previous))
        step = pull + jitter
        self.current = previous + step
        # Return a separate array so callers never alias the internal state.
        return previous + step

### Deep Deterministic Policy Gradient, DDPG Agent:
    Agent definition following DDPG

In [None]:
%%pixie_debugger

class DDPG_Agent:
    """Deep Deterministic Policy Gradient agent.

    Holds a local and a target copy of both the Actor and the Critic, an
    Ornstein-Uhlenbeck noise process for exploration, and an experience
    replay buffer. The agent also tracks per-episode total rewards.
    """

    def __init__(self, task, noise, memory, rl_param, nn_hidden):
        """Build networks, noise process and replay buffer.

        Args:
            task: Environment object exposing ``action_low``, ``action_high``,
                ``state_size``, ``action_size`` and ``reset()``.
            noise: Sequence ``[mean, sigma, theta]`` for the OU process.
            memory: Sequence ``[capacity, batch_size]`` for the replay buffer.
            rl_param: Sequence ``[gamma, t]``: discount factor and soft-update
                mixing coefficient.
            nn_hidden: Sequence ``[actor_hidden, critic_hidden]`` giving the
                base hidden-layer width of the actor and critic networks.
        """
        self.task = task
        self.action_low = self.task.action_low
        self.action_high = self.task.action_high
        self.state_space = self.task.state_size
        self.action_space = self.task.action_size

        # Instantiate Actors and Critics. Each target network must share its
        # local network's architecture, otherwise set_weights() below fails.
        action_range = self.action_high - self.action_low
        actor_hidden, critic_hidden = nn_hidden[0], nn_hidden[1]
        self.actor = Actor(self.state_space, self.action_space, action_range, self.action_low,
                           hidden_units=actor_hidden, name='actor')
        self.actor_target = Actor(self.state_space, self.action_space, action_range, self.action_low,
                                  hidden_units=actor_hidden, name='actor_target')
        self.critic = Critic(self.state_space, self.action_space, hidden_units=critic_hidden)
        self.critic_target = Critic(self.state_space, self.action_space, hidden_units=critic_hidden)

        # Set same weights in target.
        self.actor_target.actor_model.set_weights(self.actor.actor_model.get_weights())
        self.critic_target.critic_model.set_weights(self.critic.critic_model.get_weights())

        # Noise for exploration.
        self.mean = noise[0]
        self.sigma = noise[1]
        self.theta = noise[2]
        self.ounoise = OUNoise(self.action_space, self.mean, self.sigma, self.theta)
        self.step_noise = None  # last noise sample added in act()

        # Experience Replay memory.
        self.capacity = memory[0]
        self.batch_size = memory[1]
        self.er_buffer = ExperienceReplayBuffer(capacity=self.capacity, batch_size=self.batch_size)

        # RL parameters.
        self.gamma = rl_param[0]
        self.t = rl_param[1]

        # Keeping track of learning.
        self.learning_rewards = list()
        self.total_reward = None
        # Start at -inf so the first finished episode always becomes the best,
        # even when every episode reward is negative.
        self.best_reward = float('-inf')

    def restart_task(self):
        """Start a new episode and return the initial state.

        If an episode was in progress, its total reward is appended to
        ``learning_rewards`` and ``best_reward`` is updated first.
        """
        if self.total_reward is not None:
            self.learning_rewards.append(self.total_reward)
            if self.total_reward > self.best_reward:
                self.best_reward = self.total_reward
        self.total_reward = 0
        self.state = self.task.reset()
        self.ounoise.restart()
        return self.state

    def act(self, state):
        """Return the policy action for ``state`` plus exploration noise.

        The noise sample is kept in ``self.step_noise``.
        """
        # NOTE(review): the noisy action is not clipped to
        # [action_low, action_high] here; confirm the task tolerates that.
        action = self.actor.actor_model.predict(np.reshape(state, newshape=(-1, self.state_space)))
        self.step_noise = self.ounoise.sample()
        action = action + self.step_noise
        return action[0]

    def store_learn(self, state, action, reward, done, next_state):
        """Save one experience into memory and update the actor-critic weights."""
        # Store experience into exp replay memory.
        self.er_buffer.add_env_reaction((state, action, reward, done, next_state))

        # Learn if agent has enough experiences.
        if len(self.er_buffer.mem) > self.batch_size:
            self.learn()

        self.total_reward += reward
        # Update to the current state of the environment.
        self.state = next_state

    @staticmethod
    def _blend(target_weights, current_weights, t):
        """Return ``(1 - t) * target + t * current`` for each weight array."""
        return [(1 - t) * target + t * current
                for target, current in zip(target_weights, current_weights)]

    def soft_update(self):
        """Move both target networks a fraction ``self.t`` toward the local ones."""
        # Blend layer by layer: the per-layer arrays have different shapes, so
        # they cannot be packed into a single NumPy array.
        actor_target_model = self.actor_target.actor_model
        critic_target_model = self.critic_target.critic_model
        actor_target_model.set_weights(self._blend(
            actor_target_model.get_weights(), self.actor.actor_model.get_weights(), self.t))
        critic_target_model.set_weights(self._blend(
            critic_target_model.get_weights(), self.critic.critic_model.get_weights(), self.t))

    def learn(self):
        """Run one learning step on a sampled batch.

        Trains the critic on the bootstrapped target, updates the actor with
        the critic's action gradients, then soft-updates both target networks.
        """
        states, actions, rewards, dones, next_states = self.er_buffer.sample_batch()

        # Get action for deterministic policy.
        next_actions = self.actor_target.actor_model.predict_on_batch(next_states)
        next_q_values = self.critic_target.critic_model.predict_on_batch([next_states, next_actions])
        next_q_values = next_q_values.reshape(-1)

        # Handle the done case: no bootstrapped value after a terminal step.
        # Cast explicitly so boolean `done` flags take part in float arithmetic.
        not_done = 1.0 - np.asarray(dones, dtype=np.float32)
        targets = rewards + self.gamma*next_q_values*not_done
        self.critic.critic_model.train_on_batch(x=[states, actions], y=[targets])

        # Learning Phase = 0 (Test), we just want the gradient, no update on weights.
        action_gradients = self.critic.get_action_gradients([states, actions, 0])
        self.actor.train_nn([states, action_gradients[0], 1])

        # Do soft update on weights.
        self.soft_update()
        
        

### Run agent on the environment:

In [163]:
# Configuration cell: builds the hyper-parameter lists, the Task instance and
# the DDPG agent used by the cells below.

# NN sizes: base hidden-layer widths, packed as [actor_hidden, critic_hidden]
# and handed to DDPG_Agent as `nn_hidden`.
actor_hidden = 32
critic_hidden = 32
nn_hidden = [actor_hidden, critic_hidden]

# Noise for exploration. DDPG_Agent unpacks this list positionally as
# (mean, sigma, theta) and forwards the values to OUNoise.
mean = 0
sigma = 0.15
theta = 0.2
noise = [mean, sigma, theta]

# RL parameters: `gamma` discounts the next Q-value in the critic target and
# `t` is the mixing coefficient used by DDPG_Agent.soft_update().
gamma = 0.99
t = 0.01
rl_param = [gamma, t]

# Experience Replay memory: maximum number of stored transitions and the
# number of transitions sampled per learning step.
capacity = 100000
batch_size = 2
memory = [capacity, batch_size]

# Task parameters and instance.
# Modify the values below to give the quadcopter a different starting position.
# NOTE(review): init_pose presumably holds (x, y, z, phi, theta, psi) - confirm
# against the Task class, which is not visible in this file.
runtime = 5.                                     # time limit of the episode
init_pose = np.array([0., 0., 10., 0., 0., 0.])  # initial pose
init_velocities = np.array([0., 0., 0.])         # initial velocities
init_angle_velocities = np.array([0., 0., 0.])   # initial angle velocities
file_output = 'data.txt'                         # file name for saved results
target_pos = np.array([0., 0., 10.])
task = Task(init_pose, init_velocities, init_angle_velocities, runtime, target_pos)


# Pending items.
# 4. Need to add batch norm to function approximation NN.
quadcopter_agent = DDPG_Agent(task, noise, memory, rl_param, nn_hidden)



In [4]:
%load_ext autoreload
%autoreload 2
import csv


def track_quad(task, results, rotor_speeds=None):
    """Record the simulator's current step and return it as a flat row.

    The row is ``[time] + pose + v + angular_v + rotor_speeds``, in the same
    order as the column labels below.

    Args:
        task: Object whose ``sim`` attribute exposes ``time``, ``pose``, ``v``
            and ``angular_v``.
        results: Dict mapping column label to a list of recorded values; each
            value of the row is appended under its label (missing labels are
            created). Pass ``None`` to skip recording.
        rotor_speeds: Rotor speeds applied at this step. When omitted, the
            module-level ``rotor_speeds`` variable is used.

    Returns:
        The row as a list, suitable for ``csv.writer.writerow``.
    """
    labels = ['time', 'x', 'y', 'z', 'phi', 'theta', 'psi', 'x_velocity',
              'y_velocity', 'z_velocity', 'phi_velocity', 'theta_velocity',
              'psi_velocity', 'rotor_speed1', 'rotor_speed2', 'rotor_speed3', 'rotor_speed4']
    if rotor_speeds is None:
        # Backward-compatible fallback: read the global set by the run loop.
        rotor_speeds = globals()['rotor_speeds']
    # Build the row before recording it, so every value appended below exists.
    line = [task.sim.time] + list(task.sim.pose) + list(task.sim.v) + list(task.sim.angular_v) + list(rotor_speeds)
    if results is not None:
        # Append into the caller's dict instead of replacing it.
        for label, value in zip(labels, line):
            results.setdefault(label, []).append(value)
    return line

# Run the simulation, and save the results.
# Column headers; their order matches the row assembled inside the loop.
labels = ['time', 'x', 'y', 'z', 'phi', 'theta', 'psi', 'x_velocity',
          'y_velocity', 'z_velocity', 'phi_velocity', 'theta_velocity',
          'psi_velocity', 'rotor_speed1', 'rotor_speed2', 'rotor_speed3', 'rotor_speed4']

# NOTE(review): `agent` is not defined in the visible part of this notebook,
# and `DDPG_Agent.act` requires a state argument - confirm which agent object
# this cell is meant to drive.
# newline='' lets the csv module control line endings itself.
with open(file_output, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(labels)
    while True:
        rotor_speeds = agent.act()
        _, _, done = task.step(rotor_speeds)
        # Assemble the row from the simulator state after this step.
        to_write = [task.sim.time] + list(task.sim.pose) + list(task.sim.v) + list(task.sim.angular_v) + list(rotor_speeds)
        writer.writerow(to_write)
        if done:
            break

In [None]:
# Train the agent: each episode resets the task, then alternates acting,
# stepping the environment and learning until the task reports `done`.
num_episodes = 1000

for episode in range(1, num_episodes+1):
    state = quadcopter_agent.restart_task()
    done = False
    while not done:
        action = quadcopter_agent.act(state)
        next_state, reward, done = task.step(action)
        quadcopter_agent.store_learn(state, action, reward, done, next_state)
        # Advance to the new state; without this the agent would keep acting
        # on, and storing, the episode's initial state.
        state = next_state
    # `noise_scale` shows the last exploration-noise sample of the episode.
    print("\rEpisode = {:4d}, score = {:7.3f} (best = {:7.3f}), noise_scale = {}".format(
                episode, quadcopter_agent.total_reward, quadcopter_agent.best_reward, \
                quadcopter_agent.step_noise), end="")
    sys.stdout.flush()