# Install Dependencies

In [1]:

# Install the system packages (apt) and Python packages (pip) this notebook uses.
# NOTE(review): nothing is version-pinned. Later cells rely on the
# 'BipedalWalker-v2' environment id and on tf.train.AdamOptimizer, which
# presumably require older gym / TensorFlow releases -- confirm and pin.
!apt install swig cmake libopenmpi-dev zlib1g-dev
# gym supplies the environment created with gym.make() further down.
!pip install gym
# Presumably the Box2D bindings that the BipedalWalker environment needs -- confirm.
!pip install box2d_py


Reading package lists... Done
Building dependency tree       
Reading state information... Done
zlib1g-dev is already the newest version (1:1.2.11.dfsg-0ubuntu2).
zlib1g-dev set to manually installed.
The following additional packages will be installed:
  autotools-dev cmake-data file ibverbs-providers libarchive13 libfabric1
  libhwloc-dev libhwloc-plugins libhwloc5 libibverbs-dev libibverbs1
  libjsoncpp1 libltdl-dev libltdl7 liblzo2-2 libmagic-mgc libmagic1
  libnl-3-200 libnl-route-3-200 libnuma-dev libnuma1 libopenmpi2 libpciaccess0
  libpsm-infinipath1 librdmacm1 librhash0 libtool libuv1 ocl-icd-libopencl1
  openmpi-bin openmpi-common swig3.0
Suggested packages:
  cmake-doc ninja-build lrzip libhwloc-contrib-plugins libtool-doc openmpi-doc
  pciutils autoconf automaken gcj-jdk swig-doc swig-examples swig3.0-examples
  swig3.0-doc
The following NEW packages will be installed:
  autotools-dev cmake cmake-data file ibverbs-providers libarchive13
  libfabric1 libhwloc-dev libhwloc-pl

# Connect to Google Drive

In [2]:
# Mount Google Drive at /content/gdrive. The training cell writes its reward
# log and model checkpoints to paths under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


# Discrete DQN

In this implementation, the continuous action space of the BipedalWalker is discretized into 3^4 = 81 actions: each discrete action assigns one value from {-1, 0, 1} to each of the four outputs.

## Import Modules

In [3]:
import keras
import gym
import os
from keras.models import Sequential
from keras.layers import Dense 
from keras.optimizers import Adam
import tensorflow as tf

import pickle # for saving episodes -> rewards

import numpy as np
from collections import deque
import random

Using TensorFlow backend.


## Build the Model

### Replay Buffer

In [0]:
class ReplayBuffer:
    """
    Bounded experience-replay memory.

    Transitions are kept as ``[curr_state, action, reward, next_state]`` lists
    in a deque with ``maxlen=buffer_size``, so once the buffer is full the
    oldest transition is dropped for every new one. ``len`` mirrors the number
    of stored transitions and is capped at ``capacity``.
    """
    def __init__(self, buffer_size):
        self.capacity = buffer_size
        self.buffer = deque(maxlen=buffer_size)
        self.len = 0

    def sample(self, n_samples):
        """
        Draw up to ``n_samples`` distinct transitions uniformly at random.

        Returns a 4-tuple of numpy arrays
        ``(curr_states, actions, rewards, next_states)`` with dtypes
        float32, int32, float32 and float32. Fewer rows are returned when
        the buffer holds fewer than ``n_samples`` transitions.
        """
        count = min(self.len, n_samples)
        picked = random.sample(self.buffer, count)

        def column(field):
            # Gather one field (0=state, 1=action, 2=reward, 3=next state).
            return [transition[field] for transition in picked]

        return (
            np.asarray(column(0), dtype=np.float32),
            np.asarray(column(1), dtype=np.int32),
            np.asarray(column(2), dtype=np.float32),
            np.asarray(column(3), dtype=np.float32),
        )

    def add(self, curr_state, action, reward, next_state):
        """Append one transition and update the (capped) element count."""
        self.buffer.append([curr_state, action, reward, next_state])
        self.len = min(self.len + 1, self.capacity)
        
        

## Q Network

In [0]:
class DQN:
    """
    Thin wrapper around a tf.keras feed-forward Q-network.

    The network takes a state vector of length ``n_inputs`` and outputs one
    Q-value for each of the ``3**n_output_dim`` discrete actions.
    """
    def __init__(self, n_inputs, n_output_dim, learning_rate):
        self.learning_rate = learning_rate
        self.model = self.get_model(n_inputs, n_output_dim)

    def get_model(self, n_input_dim, n_output_dim):
        """
        Build and compile the Q-network.

        Args:
            n_input_dim: size of the state vector.
            n_output_dim: number of continuous action outputs; the network
                has ``3**n_output_dim`` output units (one per discrete action).

        Returns:
            The compiled tf.keras Sequential model (its summary is printed).
        """
        model = tf.keras.models.Sequential()
        model.add(tf.keras.layers.Dense(32, input_dim=n_input_dim, activation='relu'))
        model.add(tf.keras.layers.Dense(64, activation='relu'))
        model.add(tf.keras.layers.Dense(128, activation='relu'))
        model.add(tf.keras.layers.Dense(256, activation='relu'))
        # The output layer is linear: Q-values are unbounded regression
        # targets (reward + gamma * max Q) and become negative whenever
        # rewards are negative. A ReLU here would clamp them at zero and make
        # such targets unreachable under the MSE loss.
        model.add(tf.keras.layers.Dense(3**n_output_dim, activation='linear'))
        model.summary()
        model.compile(
            optimizer=tf.train.AdamOptimizer(learning_rate=self.learning_rate),
            loss=tf.keras.losses.MSE
        )
        return model

    def predict(self, states):
        """Return the Q-values for a batch of states (one row per state)."""
        return self.model.predict(states)

    def fit(self, states, targets, epochs=1, verbose=0):
        """
        Run supervised updates of the network towards ``targets``.

        ``epochs`` and ``verbose`` are forwarded to the underlying Keras
        ``fit`` call (previously they were accepted but silently ignored).
        """
        self.model.fit(states, targets, epochs=epochs, verbose=verbose)

## Create the Model

In [0]:
class DQNAgent:
    """
    Epsilon-greedy DQN agent over a discretized action space.

    Each of the ``action_dim`` continuous outputs is restricted to
    {-1, 0, 1}, giving ``3**action_dim`` discrete actions. The agent owns a
    ``DQN`` Q-network wrapper (``self.model``) and a ``ReplayBuffer``
    (``self.buffer``).

    NOTE(review): transitions are stored without a terminal flag (see
    ``store_transition``), so ``train_model`` also bootstraps from the
    next-state value on episode-ending transitions.
    """
    # Default directory used by save_model() when no directory is given.
    MODEL_DIR = '/content/gdrive/My Drive/cs4246_project/models/discrete_dqn/trained_models/'

    def __init__(self, state_dim, action_dim, buffer_size=10000,
                 learning_rate=0.001, batch_size=64, gamma=0.95,
                 epsilon=1.00, epsilon_decay=0.999999, epsilon_min=0.001,
                 name='discreteDQN'):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.batch_size = batch_size
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min

        self.name = name
        self.n_actions = 3**action_dim

        self.model = DQN(state_dim, action_dim, learning_rate)
        self.buffer = ReplayBuffer(buffer_size)
        self.actions = self.init_actions()

    def init_actions(self):
        """
        Enumerate every discrete action.

        Action index ``i`` is read as a base-3 number with ``action_dim``
        digits (most significant digit first); each digit 0/1/2 is mapped to
        -1/0/1. Returns a list of ``n_actions`` lists of ints.
        """
        actions = []
        for action_idx in range(self.n_actions):
            digits = []
            remainder = action_idx
            # Integer arithmetic only: peel off base-3 digits from the least
            # significant end, then reverse to most-significant-first order.
            for _ in range(self.action_dim):
                remainder, digit = divmod(remainder, 3)
                digits.append(digit - 1)
            digits.reverse()
            actions.append(digits)
        return actions

    def get_action_idx(self, state):
        """
        Pick an action index epsilon-greedily.

        With probability ``epsilon`` a uniformly random index is returned;
        otherwise the argmax of the predicted Q-values for ``state`` (which
        is passed straight to ``self.model.predict``).
        """
        if np.random.rand() < self.epsilon:
            return random.randrange(self.n_actions)
        qvalues = self.model.predict(state)
        return int(np.argmax(qvalues))

    def get_action(self, action_idx):
        """Return the {-1, 0, 1} action vector for a discrete action index."""
        return self.actions[action_idx]

    def train_model(self):
        """
        Perform one Q-learning update on a sampled mini-batch.

        Targets are ``reward + gamma * max_a Q(next_state, a)`` for the action
        actually taken; Q-values of all other actions are left at the
        network's current prediction. Epsilon is decayed after the update and
        never drops below ``epsilon_min``.
        """
        states, actions, rewards, next_states = self.buffer.sample(self.batch_size)
        # The buffer may return fewer than batch_size rows (or none at all),
        # so size everything from what was actually sampled.
        n_sampled = len(states)
        if n_sampled == 0:
            return

        next_q_max = np.max(self.model.predict(next_states), axis=1)
        targets = rewards + self.gamma * next_q_max

        training_targets = self.model.predict(states)
        training_targets[np.arange(n_sampled), actions] = targets
        self.model.fit(states, training_targets, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def store_transition(self, state, action, reward, next_state):
        """Add one (state, action index, reward, next state) transition to the buffer."""
        self.buffer.add(state, action, reward, next_state)

    def save_model(self, n_episodes, model_dir=None):
        """
        Save the Keras model as ``<name>_ep<n_episodes>.h5``.

        Args:
            n_episodes: episode counter embedded in the file name.
            model_dir: target directory; defaults to ``MODEL_DIR``.
        """
        if model_dir is None:
            model_dir = self.MODEL_DIR
        filename = self.name + '_ep' + str(n_episodes) + '.h5'
        self.model.model.save(os.path.join(model_dir, filename))

    def load_model(self, model_name):
        """
        Load a previously saved Keras model from the path ``model_name``.

        The loaded network is installed inside the existing ``DQN`` wrapper
        so that ``save_model`` (which accesses ``self.model.model``) keeps
        working. It is loaded through ``tf.keras``, the same API that builds
        and saves the model.
        """
        self.model.model = tf.keras.models.load_model(model_name)
        
        

## Setup Gym Environment and Initialize Model

In [7]:
# Run configuration. Defined before the agent is built so that BATCH_SIZE can
# be handed to it: the training loop gates on BATCH_SIZE, and the agent samples
# mini-batches of its own batch_size, so the two must agree.
BATCH_SIZE = 64
MAX_EPISODES = 100000
MAX_REWARD = 300  # NOTE(review): not referenced by the cells visible here

env = gym.make('BipedalWalker-v2')
n_state_params = env.observation_space.shape[0]  # length of the observation vector
n_actions = env.action_space.shape[0]            # number of continuous action outputs
MAX_STEPS = env._max_episode_steps               # per-episode step cap taken from the env

agent = DQNAgent(n_state_params, n_actions, batch_size=BATCH_SIZE)


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 32)                800       
_________________________________________________________________
dense_1 (Dense)              (None, 64)                2112      
_________________________________________________________________
dense_2 (Dense)              (None, 128)               8320      
_________________________________________________________________
dense_3 (Dense)              (None, 256)               33024     
_________________________________________________________________
dense_4 (Dense)              (None, 81)                20817     
Total params: 65,073
Trainable params: 65,073
Non-trainabl

## Run Model

In [0]:
# File that accumulates one pickled [episode, total_reward] record per episode.
# Assigned once, before the loop, so the name exists for later cells even if
# no episode completes.
record_filename = '/content/gdrive/My Drive/cs4246_project/models/discrete_dqn/record.dat'

try:
    for ep in range(MAX_EPISODES):
        state = env.reset()
        total_reward = 0
        for t in range(MAX_STEPS):
            state = np.reshape(state, [n_state_params])
            # The Q-network predicts on batches, so add a leading batch axis
            # only for action selection; the flat state is what gets stored.
            action_idx = agent.get_action_idx(np.reshape(state, [1, n_state_params]))
            action = agent.get_action(action_idx)
            next_state, reward, isDone, _ = env.step(action)

            agent.store_transition(state, action_idx, reward, next_state)
            state = next_state

            total_reward += reward
            if isDone:
                print("episode: {}/{}, score: {}, e: {:.2}".format(ep, MAX_EPISODES, total_reward, agent.epsilon))
                break

            # Start training only once the buffer holds more than one batch.
            if agent.buffer.len > BATCH_SIZE:
                agent.train_model()

        # record rewards dynamically (appended, so earlier runs are preserved)
        data = [ep, total_reward]
        with open(record_filename, "ab") as f:
            pickle.dump(data, f)

        # save model every 100 episodes
        if (ep % 100) == 0:
            agent.save_model(ep)
finally:
    # Release the environment even when the run is interrupted or raises.
    env.close()

episode: 0/100000, score: -114.9139851437307, e: 1.0
episode: 1/100000, score: -105.73710150918241, e: 1.0
episode: 2/100000, score: -103.54016262707052, e: 1.0
episode: 3/100000, score: -98.96204675767073, e: 1.0
episode: 4/100000, score: -122.54733376328275, e: 1.0
episode: 5/100000, score: -234.2826911419794, e: 1.0
episode: 6/100000, score: -110.73131360655464, e: 1.0
episode: 7/100000, score: -109.05794114433725, e: 1.0
episode: 8/100000, score: -131.7664098660719, e: 0.99
episode: 9/100000, score: -103.5009322006988, e: 0.99
episode: 10/100000, score: -118.53856857313586, e: 0.99
episode: 11/100000, score: -118.6109447264069, e: 0.99
episode: 12/100000, score: -110.08297023956416, e: 0.99
episode: 13/100000, score: -116.91814636328071, e: 0.99
episode: 14/100000, score: -104.83499971356429, e: 0.99
episode: 15/100000, score: -101.37529747438741, e: 0.99
episode: 16/100000, score: -111.10149473803241, e: 0.99
episode: 17/100000, score: -114.00062381407557, e: 0.99
episode: 18/1000

In [0]:
# Read back every [episode, total_reward] record that the training loop
# appended to record_filename. The file is a sequence of independent pickles,
# so keep loading until the end of the file is reached.
data = []
with open(record_filename, 'rb') as fr:
    while True:
        try:
            record = pickle.load(fr)
        except EOFError:
            break
        data.append(record)
print(data)
