# Install Dependencies

In [2]:

!apt install swig cmake libopenmpi-dev zlib1g-dev
!pip install gym
!pip install box2d_py


Reading package lists... Done
Building dependency tree       
Reading state information... Done
zlib1g-dev is already the newest version (1:1.2.11.dfsg-0ubuntu2).
zlib1g-dev set to manually installed.
The following additional packages will be installed:
  autotools-dev cmake-data file ibverbs-providers libarchive13 libfabric1
  libhwloc-dev libhwloc-plugins libhwloc5 libibverbs-dev libibverbs1
  libjsoncpp1 libltdl-dev libltdl7 liblzo2-2 libmagic-mgc libmagic1
  libnl-3-200 libnl-route-3-200 libnuma-dev libnuma1 libopenmpi2 libpciaccess0
  libpsm-infinipath1 librdmacm1 librhash0 libtool libuv1 ocl-icd-libopencl1
  openmpi-bin openmpi-common swig3.0
Suggested packages:
  cmake-doc ninja-build lrzip libhwloc-contrib-plugins libtool-doc openmpi-doc
  pciutils autoconf automaken gcj-jdk swig-doc swig-examples swig3.0-examples
  swig3.0-doc
The following NEW packages will be installed:
  autotools-dev cmake cmake-data file ibverbs-providers libarchive13
  libfabric1 libhwloc-dev libhwloc-pl

# Check if we are allocated a GPU



In [3]:
import tensorflow as tf
tf.test.gpu_device_name()

'/device:GPU:0'

# Connect to Google Drive

In [4]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


# Deep Deterministic Policy Gradient

In this implementation, the actions of the BipedalWalker are discretized into 81 actions, each action being a permutation of {-1,0,1} for each of the four outputs.

## Import Modules

In [0]:
import keras
import gym
import os
from keras.models import Sequential, Model
from keras.layers import Dense, BatchNormalization, Input, Add, LeakyReLU, Subtract, Concatenate, Lambda
from keras.optimizers import Adam
import keras.backend as K
import tensorflow as tf

import pickle # for saving episodes -> rewards

import numpy as np
from collections import deque
import random
from tqdm import tqdm, trange, tnrange

## Build the Model

### Create Noise

In [0]:
# Taken from https://github.com/openai/baselines/blob/master/baselines/ddpg/noise.py, which is
# based on http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab
class OrnsteinUhlenbeckActionNoise:
    def __init__(self, mu, sigma=0.5, theta=.20, dt=1e-2, x0=None):
        self.theta = theta
        self.mu = mu
        self.sigma = sigma
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + \
                self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
        self.x_prev = x
        return x

    def reset(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)

    def __repr__(self):
        return 'OrnsteinUhlenbeckActionNoise(mu={}, sigma={})'.format(self.mu, self.sigma)


### Replay Buffer

In [0]:
class ReplayBuffer:
    """
    This class represents the experience replay buffer
    """
    def __init__(self, buffer_size):
        self.buffer = deque(maxlen=buffer_size)
        self.capacity = buffer_size
        self.len = 0
    
    def sample(self, n_samples):
        batch = []
        n_samples = min(self.len, n_samples)
        batch = random.sample(self.buffer, n_samples)
        
        curr_states = np.float32([arr[0] for arr in batch])
        actions = np.int32([arr[1] for arr in batch])
        rewards = np.float32([arr[2] for arr in batch])
        done = np.bool_([arr[3] for arr in batch])
        next_states = np.float32([arr[4] for arr in batch])
        
        return np.array(curr_states), np.array(actions), np.array(rewards), np.array(done), np.array(next_states)
    
    def add(self, curr_state, action, reward, done, next_state):
        self.buffer.append([curr_state, action, reward, done, next_state])
        self.len = self.len + 1
        if (self.len > self.capacity):
            self.len = self.capacity
        
        

### Actor Network

In [0]:
class Actor():
    """
    Input to the network is the state, output is the action
    under a deterministic policy.
    The output layer activation is a tanh to keep the action
    between -action_bound and action_bound
    """
    def __init__(self, state_dim, action_dim, learning_rate, tau):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.learning_rate = learning_rate
        self.tau = tau  # tau is the update rate for the target network - allowing it to slowly chase
        
        # Actor Network
        self.model = self.create_actor_network()
        self.actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_dim))
        
        print(self.model)
        # Target Network
        self.target = self.create_actor_network()

    def create_actor_network(self):
        model = Sequential()
        #print(model)
        model.add(BatchNormalization())
        model.add(Dense(256, input_dim=self.state_dim, activation='linear'))
        model.add(LeakyReLU(alpha=0.2))
        #print(model)
        model.add(Dense(256, activation='linear'))
        model.add(BatchNormalization())
        model.add(LeakyReLU(alpha=0.2))
        #print(model)
        model.add(Dense(256, activation='linear'))
        model.add(BatchNormalization())
        model.add(LeakyReLU(alpha=0.2))
        #print(model)
        # output of bipedal walker is between -1 and 1
        model.add(Dense(self.action_dim, activation='tanh'))
        #print(model)
        model.compile(
            optimizer=Adam(lr=self.learning_rate, ),
            loss="mse"
        )
        #model.summary()
        print(model)
        
        return model

    def fit(self, states, action_gradients):
        self.model.fit(states, action_gradients, epochs=5, verbose=0)
    
    def noisy_predict(self, inputs):
        return self.model.predict(inputs, verbose=0) + self.actor_noise() 
    
    def predict(self, states):
        return self.model.predict(states, verbose=0)

    def predict_target(self, states):
        return self.target.predict(states, verbose=0)
    
    def update_target(self):
        """
        Soft update of target network
        """
        self.target.set_weights(
            [x[0] * self.tau + x[1] * (1 - self.tau) for x in zip(self.model.get_weights(), self.target.get_weights())]
        )
    


## Critic Network

In [0]:
class Critic():
    """
    This network takes in 2 inputs the state and action, 
    the output is the Q(s,a)
    The input action is given by the Actor
    """
    
    def __init__(self, sess, state_dim, action_dim, 
                 learning_rate, tau, gamma):
        self.sess = sess
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.learning_rate = learning_rate
        self.tau = tau
        self.gamma = gamma
        
        # Session needed to grab action gradients
        K.set_session(sess)

        # Create networks
        self.model, self.state, self.action = self.create_critic_network()
        self.target, self.target_action, self.target_state = self.create_critic_network()
        
        self.action_grads = tf.gradients(self.model.output, self.action)
        self.sess.run(tf.initialize_all_variables())

    def create_critic_network(self):
        state_inputs = Input(shape=(self.state_dim,), name='state_input')
        action_inputs = Input(shape=(self.action_dim,), name='action_input')
        
        state_net = BatchNormalization()(state_inputs)
        
        state_net = Dense(256)(state_net)
        state_net = BatchNormalization()(state_net)
        state_net = LeakyReLU(alpha=0.2)(state_net)
        
        state_net = Dense(256)(state_net)
        state_net = BatchNormalization()(state_net)
        state_net = LeakyReLU(alpha=0.2)(state_net)
        
        state_net = Dense(256)(state_net)
        state_net = BatchNormalization()(state_net)
        state_net = LeakyReLU(alpha=0.2)(state_net)
        state_net = Dense(300)(state_net)
        
        # Actions do not need to be normalized - already between -1 and 1
        action_net = Dense(300)(action_inputs)
        
        # Combine state_net and action_net
        net = Add()([state_net, action_net])
        net = Dense(300)(net)
        net = BatchNormalization()(net)
        net = LeakyReLU(alpha=0.2)(net)
        net = Dense(1)(net)
        
        model = keras.models.Model(inputs=[state_inputs, action_inputs], outputs=net)
        model.compile(
            optimizer=Adam(lr=self.learning_rate),
            loss="mse"
        )
        model.summary()
        
        return model, state_inputs, action_inputs

    def fit(self, states, actions, predicted_q_values):
        self.model.fit({'state_input': states, 'action_input': actions}, predicted_q_values, verbose=0)

    def predict(self, states, action):
        return self.model.predict({'state_input': states, 'action_input': actions}, verbose=0)

    def predict_target(self, states, actions):
        return self.target.predict({'state_input': states, 'action_input': actions}, verbose=0)
    
    def get_action_gradients(self, states, actions):
        return self.sess.run(
            self.action_grads,
            feed_dict={
                self.state: states,
                self.action: actions
            })[0]
    
    def update_target(self):
        self.target.set_weights(
            [x[0] * self.tau + x[1] * (1 - self.tau) for x in zip(self.model.get_weights(), self.target.get_weights())]
        )


## Intrinsic Curiosity Module

In [0]:
class ICM:
    def __init__(self, state_dim, action_dim):
        self.state_dim = state_dim
        self.action_dim = action_dim        
        self.state_encoder, self.trainer, self.predictor = self.create_models()
    
    def create_models(self):
        feature_dim = 32
        state_input = Input(shape=(self.state_dim,), name='state_input')
        next_state_input = Input(shape=(self.state_dim,), name='next_state_input')
        action_input = Input(shape=(self.action_dim,), name='action_input')
        
        # Feature Encoder
        state_encoder_input = Input(shape=(self.state_dim,))
        state_encoder = Dense(128)(state_encoder_input)
        state_encoder = Dense(128)(state_encoder)
        encoded_state = Dense(feature_dim)(state_encoder)
        state_encoder_model = Model(input=state_encoder_input, output=encoded_state)
        
        encoded_state = state_encoder_model(state_input)
        encoded_next_state = state_encoder_model(next_state_input)
        
        # Inverse Model
        inverse_model = Concatenate()([encoded_state, encoded_next_state])
        inverse_model = Dense(128)(inverse_model)
        inverse_model = Dense(128)(inverse_model)
        action_output = Dense(self.action_dim, name='action_output')(inverse_model)
        
        # Forward Model
        forward_model = Concatenate()([action_input, encoded_state])
        forward_model = Dense(128)(forward_model)
        forward_model = Dense(128)(forward_model)
        next_state_output = Dense(feature_dim, name='next_state_output')(forward_model)
        
        # Reward Output
        reward_processor = Subtract()([next_state_output, encoded_next_state])
        reward_output = Lambda((lambda x: K.sum(x ** 2)), name='reward_output')(reward_processor)
        
        # All combined
        trainer = Model(input=[action_input, state_input, next_state_input], output=[next_state_output, action_output])
        trainer.compile(optimizer='adam', loss='mse')
        predictor = Model(input=[action_input, state_input, next_state_input], output=reward_output)
        
        return state_encoder_model, trainer, predictor
    
    def fit(self, states, actions, next_states):
        encoded_states = self.state_encoder.predict(states)
        self.trainer.fit({
            'state_input': states,
            'next_state_input': next_states,
            'action_input': actions,
        },{
            'next_state_output': encoded_states,
            'action_output': actions,
        }, verbose=0)
        
    def predict(self, states, actions, next_states):
        return self.predictor.predict({
            'state_input': states,
            'next_state_input': next_states,
            'action_input': actions,
        }, verbose=0)

## Create the Model

In [0]:
class DDPGAgent:
    def __init__(self, sess, state_dim, action_dim, buffer_size=30000, 
                 learning_rate=0.001, batch_size=64, gamma=0.9, 
                 epsilon=1.00, epsilon_decay=0.99999, epsilon_min=0.001,
                 name='DDPG', tau=0.001):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.batch_size = batch_size
        self.gamma = gamma
        self.epsilon = epsilon
        self.tau = tau
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        
        self.name = name;
        
        self.actor = Actor(state_dim, action_dim, learning_rate, tau)
        self.critic = Critic(sess, state_dim, action_dim, learning_rate, tau, gamma)
        self.icm = ICM(state_dim, action_dim)
        
        self.buffer = ReplayBuffer(buffer_size)
        
    
    def get_noisy_action(self, states):
        states = np.reshape(states, (1, self.state_dim))
        return self.actor.noisy_predict(states)
    
    
    def train_model(self):     
        if (self.buffer.len > self.batch_size):
            states, actions, rewards, done, next_states = self.buffer.sample(self.batch_size)
            self.icm.fit(states, actions, next_states)
            rewards_i = self.icm.predict(states, actions, next_states)
            target_q = self.critic.predict_target(next_states, actions)
            
            targets = []
            for i in range(self.batch_size):
                if (done[i]):
                    targets.append(rewards[i] + rewards_i[i])
                else:
                    targets.append(rewards[i] + rewards_i[i] + self.gamma + target_q[i])
            
            # Update critic
            self.critic.fit(states, actions, np.reshape(targets, (-1,1)))
            
            actor_actions = self.actor.predict(states)
            action_gradients = self.critic.get_action_gradients(states, actor_actions)
            self.actor.fit(states, action_gradients)
            
            self.actor.update_target()
            self.critic.update_target()
    
    def store_transition(self, state, action, reward, done, next_state):
        self.buffer.add(state, action, reward, done, next_state)
    
    def save_model(self, n_episodes):
        GOOGLE_DIR = '/content/gdrive/My Drive/cs4246_project/models/ddpg/trained_models/'
        HOME_DIR = './'
        self.actor.model.save(HOME_DIR + 'actor' + '_ep' + str(n_episodes) + '.h5')
        self.critic.model.save(HOME_DIR + 'critic' + '_ep' + str(n_episodes) + '.h5')
        pass
    
    def load_model(self, model_name):
        self.model = keras.models.load_model(model_name)
        pass
        
        

## Setup Gym Environment and Initialize Model

In [45]:
env = gym.make('BipedalWalker-v2')
n_state_params = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]

# allow GPU optimization
#config = tf.ConfigProto()
#config.gpu_options.allow_growth = True
#sess = tf.Session(config=config)
sess = tf.Session()

agent = DDPGAgent(sess, n_state_params, n_actions)
BATCH_SIZE = 64
MAX_EPISODES = 100000
MAX_REWARD = 300
MAX_STEPS = env._max_episode_steps


  result = entry_point.load(False)


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
<keras.engine.sequential.Sequential object at 0x7f60747c8358>
<keras.engine.sequential.Sequential object at 0x7f60747c8358>
<keras.engine.sequential.Sequential object at 0x7f60747c8cf8>
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
state_input (InputLayer)        (None, 24)           0                                            
__________________________________________________________________________________________________
batch_normalization_167 (BatchN (None, 24)           96          state_input[0][0]                
__________________________________________________________________________________________________
dens



## Run Model

In [0]:
for ep in range(MAX_EPISODES):
    state = env.reset()
    total_reward = 0
    for t in range(MAX_STEPS):
        prog = tf.keras.utils.Progbar(MAX_STEPS)
        state = np.reshape(state, [1, n_state_params])
        actions = agent.get_noisy_action(state)
        state = np.reshape(state, [n_state_params])
        next_state, reward, isDone, _ = env.step(actions[0])
        
        agent.store_transition(state, actions[0], reward, isDone, next_state)
        state = next_state
        total_reward += reward
        prog.update(t)
        if (isDone):
            prog.update(MAX_STEPS)
            print("episode: {}/{}, score: {}, e: {:.2}".format(ep, MAX_EPISODES, total_reward, agent.epsilon))
            break
    if (agent.buffer.len > BATCH_SIZE):
        agent.train_model()
    
    # record rewards dynamically
    GOOGLE_FILE = '/content/gdrive/My Drive/record.dat'
    HOME_FILE = './record.dat'
    record_filename = GOOGLE_FILE
    data = [ep, total_reward]
    with open(record_filename, "ab") as f:
        pickle.dump(data, f)
    
    if (total_reward > 200):
        agent.save_model(ep)
        break
    
    # save model every 10000 episodes
    if ((ep % 10000) == 0):
        agent.save_model(ep)
    
    if (total_reward > 200):
        agent.save_model(ep)
        break
    
    # save model every 100 episodes
    if ((ep % 100) == 0):
        agent.save_model(ep)
        
env.close()

episode: 0/100000, score: -101.02764031297515, e: 1.0
episode: 1/100000, score: -98.43909253419848, e: 1.0
episode: 2/100000, score: -118.87385683593364, e: 1.0
episode: 3/100000, score: -132.59563647794818, e: 1.0
episode: 4/100000, score: -111.25222020680071, e: 1.0
episode: 5/100000, score: -129.37717455128353, e: 1.0
episode: 6/100000, score: -129.45024779963697, e: 1.0
episode: 7/100000, score: -112.57614215879774, e: 1.0
episode: 8/100000, score: -134.00041068707057, e: 1.0
episode: 9/100000, score: -109.49361843645134, e: 1.0
episode: 10/100000, score: -111.88126622490496, e: 1.0
episode: 11/100000, score: -112.8188575200161, e: 1.0
episode: 12/100000, score: -152.66884828497433, e: 1.0
episode: 13/100000, score: -100.02996818271781, e: 1.0
episode: 14/100000, score: -98.46894151914535, e: 1.0
episode: 15/100000, score: -101.81608172539879, e: 1.0
episode: 16/100000, score: -103.52328051606153, e: 1.0
episode: 17/100000, score: -102.35437573552902, e: 1.0
episode: 18/100000, sco