# Install Dependencies

In [10]:

!apt install swig cmake libopenmpi-dev zlib1g-dev
!pip install gym
!pip install box2d_py


Reading package lists... Done
Building dependency tree       
Reading state information... Done
cmake is already the newest version (3.10.2-1ubuntu2).
zlib1g-dev is already the newest version (1:1.2.11.dfsg-0ubuntu2).
libopenmpi-dev is already the newest version (2.1.1-8).
swig is already the newest version (3.0.12-1).
0 upgraded, 0 newly installed, 0 to remove and 5 not upgraded.


# Check if we are allocated a GPU



In [49]:
import tensorflow as tf
tf.test.gpu_device_name()

'/device:GPU:0'

# Connect to Google Drive

In [11]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Deep Deterministic Policy Gradient

In this implementation, the actions of the BipedalWalker are discretized into 81 actions, each action being a permutation of {-1,0,1} for each of the four outputs.

## Import Modules

In [41]:
import keras
import gym
import os
from keras.models import Sequential, Model
from keras.layers import Dense, BatchNormalization, Input, Add, LeakyReLU
from keras.optimizers import Adam
import keras.backend as K
import tensorflow as tf

import pickle # for saving episodes -> rewards

import numpy as np
from collections import deque
import random

## Build the Model

### Create Noise

In [42]:
# Taken from https://github.com/openai/baselines/blob/master/baselines/ddpg/noise.py, which is
# based on http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab
class OrnsteinUhlenbeckActionNoise:
    def __init__(self, mu, sigma=0.5, theta=.20, dt=1e-2, x0=None):
        self.theta = theta
        self.mu = mu
        self.sigma = sigma
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + \
                self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
        self.x_prev = x
        return x

    def reset(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)

    def __repr__(self):
        return 'OrnsteinUhlenbeckActionNoise(mu={}, sigma={})'.format(self.mu, self.sigma)


### Replay Buffer

In [43]:
class ReplayBuffer:
    """
    This class represents the experience replay buffer
    """
    def __init__(self, buffer_size):
        self.buffer = deque(maxlen=buffer_size)
        self.capacity = buffer_size
        self.len = 0
    
    def sample(self, n_samples):
        batch = []
        n_samples = min(self.len, n_samples)
        batch = random.sample(self.buffer, n_samples)
        
        curr_states = np.float32([arr[0] for arr in batch])
        actions = np.int32([arr[1] for arr in batch])
        rewards = np.float32([arr[2] for arr in batch])
        done = np.bool_([arr[3] for arr in batch])
        next_states = np.float32([arr[4] for arr in batch])
        
        return np.array(curr_states), np.array(actions), np.array(rewards), np.array(done), np.array(next_states)
    
    def add(self, curr_state, action, reward, done, next_state):
        self.buffer.append([curr_state, action, reward, done, next_state])
        self.len = self.len + 1
        if (self.len > self.capacity):
            self.len = self.capacity
        
        

### Actor Network

In [44]:
class Actor():
    """
    Input to the network is the state, output is the action
    under a deterministic policy.
    The output layer activation is a tanh to keep the action
    between -action_bound and action_bound
    """
    def __init__(self, state_dim, action_dim, learning_rate, tau):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.learning_rate = learning_rate
        self.tau = tau  # tau is the update rate for the target network - allowing it to slowly chase
        
        # Actor Network
        self.model = self.create_actor_network()
        self.actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_dim))
        
        # Target Network
        self.target = self.create_actor_network()

    def create_actor_network(self):
        model = Sequential()
        model.add(BatchNormalization())
        model.add(Dense(256, input_dim=self.state_dim, activation='linear'))
        model.add(LeakyReLU(alpha=0.2))
        
        model.add(Dense(256, activation='linear'))
        model.add(BatchNormalization())
        model.add(LeakyReLU(alpha=0.2))
        
        model.add(Dense(256, activation='linear'))
        model.add(BatchNormalization())
        model.add(LeakyReLU(alpha=0.2))
        
        # output of bipedal walker is between -1 and 1
        model.add(Dense(self.action_dim, activation='tanh'))
        model = model.compile(
            optimizer=Adam(lr=self.learning_rate),
            loss="mse"
        )
        #model.summary()
        
        return model

    def fit(self, states, action_gradients):
        self.model.fit(states, action_gradients)
    
    def noisy_predict(self, inputs):
        return self.model.predict(inputs) + self.actor_noise() 
    
    def predict(self, states):
        return self.model.predict(states)

    def predict_target(self, states):
        return self.target.predict(states)
    
    def update_target(self):
        """
        Soft update of target network
        """
        self.target.set_weights(
            self.model.get_weights() * self.tau + self.target.get_weights() * (1 - self.tau)
        )
    


## Critic Network

In [54]:
class Critic():
    """
    This network takes in 2 inputs the state and action, 
    the output is the Q(s,a)
    The input action is given by the Actor
    """
    
    def __init__(self, sess, state_dim, action_dim, 
                 learning_rate, tau, gamma):
        self.sess = sess
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.learning_rate = learning_rate
        self.tau = tau
        self.gamma = gamma
        
        # Session needed to grab action gradients
        K.set_session(sess)

        # Create networks
        self.model, self.action, self.state = self.create_critic_network()
        self.target, self.target_action, self.target_state = self.create_critic_network()
        
        self.action_grads = tf.gradients(self.model.output, self.action)
        self.sess.run(tf.initialize_all_variables())

    def create_critic_network(self):
        state_inputs = Input(shape=(None,self.state_dim))
        action_inputs = Input(shape=(None,self.action_dim))
        
        state_net = BatchNormalization()(state_inputs)
        
        state_net = Dense(256)(state_net)
        state_net = BatchNormalization()(state_net)
        state_net = LeakyReLU(alpha=0.2)(state_net)
        
        state_net = Dense(256)(state_net)
        state_net = BatchNormalization()(state_net)
        state_net = LeakyReLU(alpha=0.2)(state_net)
        
        state_net = Dense(256)(state_net)
        state_net = BatchNormalization()(state_net)
        state_net = LeakyReLU(alpha=0.2)(state_net)
        state_net = Dense(300)(state_net)
        
        # Actions do not need to be normalized - already between -1 and 1
        action_net = Dense(300)(action_inputs)
        
        # Combine state_net and action_net
        net = Add()([state_net, action_net])
        net = Dense(300)(net)
        net = BatchNormalization()(net)
        net = LeakyReLU(alpha=0.2)(net)
        net = Dense(1)(net)
        
        model = keras.models.Model(inputs=[state_inputs, action_inputs], outputs=net)
        model.compile(
            optimizer=Adam(lr=self.learning_rate),
            loss="mse"
        )
        model.summary()
        
        return model, state_inputs, action_inputs

    def fit(self, states, actions, predicted_q_values):
        self.model.fit([states, actions], predicted_q_values)

    def predict(self, states, action):
        return self.model.predict([states, actions])

    def predict_target(self, states, action):
        return self.target.predict([states, actions])
    
    def get_action_gradients(self, states, actions):
        return self.sess.run(
            self.action_grads,
            feed_dict={
                self.state: states,
                elf.actions: actions
            })[0]
    
    def update_target(self):
        self.target.set_weights(
            self.model.get_weights() * self.tau + self.target.get_weights() * (1 - self.tau)
        )


## Create the Model

In [55]:
class DDPGAgent:
    def __init__(self, sess, state_dim, action_dim, buffer_size=30000, 
                 learning_rate=0.001, batch_size=64, gamma=0.9, 
                 epsilon=1.00, epsilon_decay=0.99999, epsilon_min=0.001,
                 name='DDPG', tau=0.001):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.batch_size = batch_size
        self.gamma = gamma
        self.epsilon = epsilon
        self.tau = tau
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        
        self.name = name;
        
        self.actor = Actor(state_dim, action_dim, learning_rate, tau)
        self.critic = Critic(sess, state_dim, action_dim, learning_rate, tau, gamma)
        
        self.buffer = ReplayBuffer(buffer_size)
        
    
    def get_noisy_action(self, states):
        states = np.reshape(states, (1, self.state_dim))
        return self.actor.noisy_predict(states)
    
    
    def train_model(self):     
        if (buffer.len > self.batch_size):
            states, actions, rewards, done, next_states = self.buffer.sample(self.batch_size)
            target_q = self.critic.predict_target(next_states)
            
            targets = []
            for i in range(self.batch_size):
                if (done[i]):
                    targets.append(rewards[i])
                else:
                    targets.append(rewards[i] + self.gamma + target_q[i])
            
            # Update critic
            self.critic.fit(states, actions, np.reshape(targets, (-1,1)))
            
            actor_actions = self.actor.predict(states)
            action_gradients = critic.get_action_gradients(states, actor_actions)
            actor.fit(states, action_gradients)
            
            actor.update_target()
            critic.update_target()
    
    def store_transition(self, state, action, reward, done, next_state):
        self.buffer.add(state, action, reward, done, next_state)
    
    def save_model(self, n_episodes):
        self.model.model.save('/content/gdrive/My Drive/cs4246_project/models/discrete_dqn/trained_models/' + self.name + '_ep' + str(n_episodes) + '.h5')
        pass
    
    def load_model(self, model_name):
        self.model = keras.models.load_model(model_name)
        pass
        
        

## Setup Gym Environment and Initialize Model

In [56]:
env = gym.make('BipedalWalker-v2')
n_state_params = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]

# allow GPU optimization
#config = tf.ConfigProto()
#config.gpu_options.allow_growth = True
#sess = tf.Session(config=config)
sess = tf.Session()

agent = DDPGAgent(sess, n_state_params, n_actions)
BATCH_SIZE = 64
MAX_EPISODES = 100000
MAX_REWARD = 300
MAX_STEPS = env._max_episode_steps


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_8 (InputLayer)            (None, None, 24)     0                                            
__________________________________________________________________________________________________
batch_normalization_56 (BatchNo (None, None, 24)     96          input_8[0][0]                    
__________________________________________________________________________________________________
dense_73 (Dense)                (None, None, 256)    6400        batch_normalization_56[0][0]     
___________________________________________________________________________________________

Instructions for updating:
Use `tf.global_variables_initializer` instead.


## Run Model

In [41]:
for ep in range(MAX_EPISODES):
    state = env.reset()
    total_reward = 0
    for t in range(MAX_STEPS):
        state = np.reshape(state, [1, n_state_params])
        action_idx = agent.get_action_idx(state)
        action = agent.get_action(action_idx)
        state = np.reshape(state, [n_state_params])
        next_state, reward, isDone, _ = env.step(action)
        
        agent.store_transition(state, action_idx, reward, next_state)
        state = next_state
        
        total_reward += reward
        if (isDone):
            print("episode: {}/{}, score: {}, e: {:.2}".format(ep, MAX_EPISODES, total_reward, agent.epsilon))
            break
        
        if (agent.buffer.len > BATCH_SIZE):
            agent.train_model()
    
    # record rewards dynamically
    record_filename = '/content/gdrive/My Drive/cs4246_project/models/discrete_dqn/record.dat'
    data = [ep, total_reward]
    with open(record_filename, "ab") as f:
        pickle.dump(data, f)
    
    if (total_reward > 200):
        agent.save_model(ep)
        break
    
    # save model every 100 episodes
    if ((ep % 100) == 0):
        agent.save_model(ep)
        
ienv.close()

episode: 0/100000, score: -107.25541092623709, e: 1.0
episode: 1/100000, score: -104.6333288865226, e: 1.0
episode: 2/100000, score: -107.81080988087629, e: 1.0
episode: 3/100000, score: -116.75870301603835, e: 1.0
episode: 4/100000, score: -115.614538464915, e: 1.0
episode: 5/100000, score: -115.05946592452437, e: 1.0
episode: 6/100000, score: -102.77113069122346, e: 1.0
episode: 7/100000, score: -112.58217402355774, e: 0.98
episode: 8/100000, score: -125.03426624044755, e: 0.97
episode: 9/100000, score: -104.38126264189493, e: 0.96
episode: 10/100000, score: -114.0125247704219, e: 0.96
episode: 11/100000, score: -123.11261205165212, e: 0.96
episode: 12/100000, score: -125.58434687942328, e: 0.96
episode: 13/100000, score: -115.78959571995928, e: 0.95
episode: 14/100000, score: -122.22588829351776, e: 0.95
episode: 15/100000, score: -123.67174794781975, e: 0.93
episode: 16/100000, score: -120.95582840502168, e: 0.92
episode: 17/100000, score: -120.90046272229527, e: 0.91
episode: 18/1

KeyboardInterrupt: ignored

In [46]:
import pandas as pd

data = []
with open(record_filename, 'rb') as fr:
    try:
        while True:
            data.append(pickle.load(fr))
    except EOFError:
        pass
data = pd.DataFrame(np.array(data))



                  1
count  21044.000000
mean    -104.284060
std        8.402799
min     -234.282691
25%     -107.400778
50%     -101.803275
75%     -101.135857
max        3.163485
