# Install Dependencies

In [1]:

!apt install swig cmake libopenmpi-dev zlib1g-dev
!pip install gym
!pip install box2d_py


Reading package lists... 0%Reading package lists... 0%Reading package lists... 0%Reading package lists... 9%Reading package lists... 9%Reading package lists... 9%Reading package lists... 9%Reading package lists... 83%Reading package lists... 83%Reading package lists... 84%Reading package lists... 84%Reading package lists... 88%Reading package lists... 89%Reading package lists... 89%Reading package lists... 89%Reading package lists... 89%Reading package lists... 95%Reading package lists... 95%Reading package lists... 95%Reading package lists... 95%Reading package lists... 95%Reading package lists... 95%Reading package lists... 97%Reading package lists... 97%Reading package lists... 98%Reading package lists... 98%Reading package lists... 98%Reading package lists... 98%Reading package lists... 98%Reading package lists... 98%Reading package lists... 99%Reading package lists... 99%Reading package lists... 99%Reading package 

# Check if we are allocated a GPU



In [10]:
# Ask TensorFlow which GPU device it can see; the cell output shows the
# device name (e.g. '/device:GPU:0' in the recorded run).
import tensorflow as tf
tf.test.gpu_device_name()

'/device:GPU:0'

# Connect to Google Drive

In [18]:
# Mount Google Drive at /content/gdrive; the model checkpoints and the reward
# record written later in this notebook are stored under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Discrete DQN

In this implementation, the continuous action space of the BipedalWalker is discretized into 81 ($3^4$) actions: each discrete action assigns one value from {-1, 0, 1} to each of the four outputs.

## Import Modules

In [0]:
import keras
import gym
import os
from keras.models import Sequential
from keras.layers import Dense, LeakyReLU
from keras.optimizers import Adam
import tensorflow as tf

import pickle # for saving episodes -> rewards

import numpy as np
from collections import deque
import random

## Build the Model

### Replay Buffer

In [0]:
class ReplayBuffer:
    """
    Fixed-capacity experience replay buffer.

    Transitions are kept in a ``deque`` bounded by ``buffer_size``; once the
    buffer is full, appending a new transition drops the oldest one.

    Attributes:
        buffer: the underlying ``deque`` of transitions.
        capacity: maximum number of transitions kept.
        len: number of transitions currently stored. It is refreshed from
            ``buffer`` by ``add``, ``processed_add`` and ``clear``.
    """
    def __init__(self, buffer_size):
        self.buffer = deque(maxlen=buffer_size)
        self.capacity = buffer_size
        self.len = 0

    def sample(self, n_samples):
        """
        Draw up to ``n_samples`` transitions uniformly without replacement.

        Fewer transitions are returned when the buffer holds fewer than
        ``n_samples`` entries (none at all when it is empty).

        Returns:
            Tuple ``(curr_states, actions, rewards, next_states)`` of numpy
            arrays with dtypes float32, int32, float32 and float32.
        """
        # Size the draw from the deque itself so a stale counter can never
        # make random.sample ask for more items than are stored.
        n_samples = min(len(self.buffer), n_samples)
        batch = random.sample(self.buffer, n_samples)

        curr_states = np.array([entry[0] for entry in batch], dtype=np.float32)
        actions = np.array([entry[1] for entry in batch], dtype=np.int32)
        rewards = np.array([entry[2] for entry in batch], dtype=np.float32)
        next_states = np.array([entry[3] for entry in batch], dtype=np.float32)

        return curr_states, actions, rewards, next_states

    def add(self, curr_state, action, reward, next_state):
        """Store one transition given as its four separate components."""
        self.processed_add([curr_state, action, reward, next_state])

    def processed_add(self, entry):
        """Store an already assembled ``[curr_state, action, reward, next_state]`` entry."""
        self.buffer.append(entry)
        # The deque enforces the capacity, so its length is the true count.
        self.len = len(self.buffer)

    def clear(self):
        """Remove every stored transition and reset the stored count."""
        self.buffer.clear()
        self.len = 0
        
        

## Q Network

In [0]:
class DQN:
    """
    Feed-forward Q-network that outputs one Q-value per discrete action.

    Args:
        n_inputs: size of the state vector fed to the first layer.
        n_output_dim: number of action outputs; the network emits
            ``3**n_output_dim`` Q-values.
        learning_rate: learning rate handed to the Adam optimizer.
    """
    def __init__(self, n_inputs, n_output_dim, learning_rate):
        self.learning_rate = learning_rate
        self.model = self.get_model(n_inputs, n_output_dim)

    def get_model(self, n_input_dim, n_output_dim):
        """Build and compile the Keras model (Dense layers of 16, 32, 64, 256 units, then the output layer)."""
        model = Sequential()
        model.add(Dense(16, input_dim=n_input_dim, activation='linear'))
        model.add(LeakyReLU(alpha=0.2))
        model.add(Dense(32, activation='linear'))
        model.add(LeakyReLU(alpha=0.2))
        model.add(Dense(64, activation='linear'))
        model.add(LeakyReLU(alpha=0.2))
        model.add(Dense(256, activation='linear'))
        model.add(LeakyReLU(alpha=0.2))
        # Output layer: one unit per discrete action.
        model.add(Dense(3**n_output_dim, activation='linear'))
        # NOTE(review): the Q-value outputs also pass through a LeakyReLU, so
        # negative values are scaled by 0.2. Kept as-is so the architecture
        # stays the same as in previously saved models; confirm it is intended.
        model.add(LeakyReLU(alpha=0.2))
        model.summary()
        model.compile(
            optimizer=Adam(lr=self.learning_rate, ),
            loss="mse"
        )

        return model

    def predict(self, states):
        """Return the model's Q-value predictions for ``states``."""
        return self.model.predict(states)

    def fit(self, states, targets, epochs=1, verbose=0):
        """
        Fit the model on ``states`` against ``targets``.

        ``epochs`` and ``verbose`` are forwarded to the Keras ``fit`` call.
        """
        self.model.fit(states, targets, epochs=epochs, verbose=verbose)

## Create the Model

In [0]:
class DQNAgent:
    """
    Epsilon-greedy DQN agent over a discretized action space.

    Each of the ``action_dim`` outputs takes a value in {-1, 0, 1}, giving
    ``3**action_dim`` discrete actions. Transitions of the running episode are
    collected in a local buffer and copied into the global replay buffer by
    ``add_local_experience``.
    """
    def __init__(self, state_dim, action_dim,
                 global_buffer_size=30000, local_buffer_size=10000,
                 learning_rate=0.001, batch_size=64, gamma=0.9,
                 epsilon=0.99, epsilon_decay=0.001, epsilon_min=0.001,
                 name='discreteDQN'):
        """
        Args:
            state_dim: size of the state vector.
            action_dim: number of action outputs.
            global_buffer_size: capacity of the replay buffer sampled for training.
            local_buffer_size: capacity of the per-episode buffer.
            learning_rate: learning rate passed to the Q-network.
            batch_size: number of transitions requested per training step.
            gamma: discount factor applied to the next-state Q-value.
            epsilon: initial probability of choosing a random action.
            epsilon_decay: amount subtracted from epsilon per training step.
            epsilon_min: value at which epsilon stops decaying.
            name: prefix used for saved model files.
        """
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.batch_size = batch_size
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min

        self.name = name
        self.n_actions = 3**action_dim

        self.model = DQN(state_dim, action_dim, learning_rate)
        self.buffer = ReplayBuffer(global_buffer_size)
        self.local_buffer = ReplayBuffer(local_buffer_size)

        # Scoreboard of 200 episode returns, initialised to -300;
        # add_local_experience overwrites the lowest entry whenever an
        # episode's return beats it.
        self.total_rewards = np.ones(200) * -300

    def get_action_idx(self, state):
        """
        Pick a discrete action index with an epsilon-greedy policy.

        Args:
            state: network input passed unchanged to the model's ``predict``.

        Returns:
            A random index in ``[0, n_actions)`` with probability ``epsilon``,
            otherwise the argmax of the predicted Q-values, as an ``int``.
        """
        if np.random.rand() < self.epsilon:
            return int(random.randrange(self.n_actions))
        qvalues = self.model.predict(state)
        return int(np.argmax(qvalues))

    def get_action(self, action_idx):
        """
        Decode a discrete action index into one value per action output.

        The index is read as a base-3 number with ``action_dim`` digits, most
        significant digit first; each digit (0, 1, 2) maps to -1, 0, 1.

        Returns:
            List of ``action_dim`` ints, each in {-1, 0, 1} for a valid index.
        """
        remainder = int(action_idx)
        action = []
        for power in range(self.action_dim - 1, -1, -1):
            digit, remainder = divmod(remainder, 3 ** power)
            action.append(digit - 1)
        return action

    def train_model(self):
        """
        Run one training step on a batch sampled from the global buffer.

        The target for each sampled action is
        ``reward + gamma * max(Q(next_state))``; the targets of all other
        actions are left at the current predictions. Epsilon is then decayed
        by ``epsilon_decay`` without dropping below ``epsilon_min``.
        """
        states, actions, rewards, next_states = self.buffer.sample(self.batch_size)
        if len(states) == 0:
            # Nothing to learn from yet.
            return

        # NOTE(review): transitions carry no terminal flag, so terminal states
        # are bootstrapped like any other state.
        next_qvalues = np.asarray(self.model.predict(next_states))
        targets = rewards + self.gamma * np.max(next_qvalues, axis=1)

        training_targets = np.asarray(self.model.predict(states))
        # Index by the number of rows actually sampled: the buffer may return
        # fewer than batch_size transitions.
        training_targets[np.arange(len(actions)), actions] = targets

        self.model.fit(states, training_targets, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon - self.epsilon_decay)

    def store_transition(self, state, action, reward, next_state):
        """Record one transition of the running episode in the local buffer."""
        self.local_buffer.add(state, action, reward, next_state)

    def add_local_experience(self, total_reward):
        """
        Decide whether the finished episode enters the global replay buffer.

        The episode is copied when ``total_reward`` beats the lowest entry of
        the scoreboard (which it then replaces), and independently with
        probability 0.01. An episode that satisfies both is copied twice.
        The local buffer is cleared afterwards in every case.
        """
        if np.min(self.total_rewards) < total_reward:
            idx = np.argmin(self.total_rewards)
            self.total_rewards[idx] = total_reward

            for x in self.local_buffer.buffer:
                self.buffer.processed_add(x)

        # Simulate regular experience replay
        if np.random.random() < 0.01:
            for x in self.local_buffer.buffer:
                self.buffer.processed_add(x)

        # Clear local memory
        self.local_buffer.clear()

    def save_model(self, n_episodes, save_dir=None):
        """
        Save the Keras model as ``<name>_ep<n_episodes>.h5``.

        Args:
            n_episodes: episode number embedded in the file name.
            save_dir: target directory; defaults to the Google Drive folder
                used by this notebook. It is created when missing.
        """
        GOOGLE_DIR = '/content/gdrive/My Drive/cs4246_project/models/improved_dqn/trained_models/'
        if save_dir is None:
            save_dir = GOOGLE_DIR
        os.makedirs(save_dir, exist_ok=True)
        filename = self.name + '_ep' + str(n_episodes) + '.h5'
        self.model.model.save(os.path.join(save_dir, filename))

    def load_model(self, model_name):
        """
        Load a saved Keras model from the path ``model_name``.

        The loaded model replaces the network inside the ``DQN`` wrapper, so
        ``save_model`` keeps working after a load.
        """
        self.model.model = keras.models.load_model(model_name)
        


## Setup Gym Environment and Initialize Model

In [24]:
# Training constants.
BATCH_SIZE = 64       # transitions per training step; also gates when training starts
MAX_EPISODES = 30000  # number of episodes to run
MAX_REWARD = 300      # not referenced by the cells visible in this notebook

env = gym.make('BipedalWalker-v2')
n_state_params = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]
MAX_STEPS = env._max_episode_steps  # per-episode step limit reported by the env

# Pass BATCH_SIZE explicitly so the agent samples with the same batch size
# that the training loop uses as its "enough data" threshold.
agent = DQNAgent(n_state_params, n_actions, batch_size=BATCH_SIZE)


  result = entry_point.load(False)


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_16 (Dense)             (None, 16)                400       
_________________________________________________________________
leaky_re_lu_16 (LeakyReLU)   (None, 16)                0         
_________________________________________________________________
dense_17 (Dense)             (None, 32)                544       
_________________________________________________________________
leaky_re_lu_17 (LeakyReLU)   (None, 32)                0         
_________________________________________________________________
dense_18 (Dense)             (None, 64)                2112      
__________________________________________________________

## Run Model

In [0]:
# Per-episode rewards are appended to this file as pickled [episode, reward] records.
GOOGLE_FILE = '/content/gdrive/My Drive/cs4246_project/models/improved_dqn/record.dat'
HOME_FILE = './record.dat'
record_filename = GOOGLE_FILE

for ep in range(MAX_EPISODES):
    state = env.reset()
    total_reward = 0
    for t in range(MAX_STEPS):
        # The network takes a batch of states, the buffer stores the flat vector.
        state = np.reshape(state, [n_state_params])
        action_idx = agent.get_action_idx(np.reshape(state, [1, n_state_params]))
        action = agent.get_action(action_idx)
        next_state, reward, isDone, _ = env.step(action)

        agent.store_transition(state, action_idx, reward, next_state)
        state = next_state

        total_reward += reward
        if isDone:
            print("episode: {}/{}, score: {}, e: {:.2}".format(ep, MAX_EPISODES, total_reward, agent.epsilon))
            break

    # Move the episode into the replay buffer (if selected), then train once.
    agent.add_local_experience(total_reward)
    if agent.buffer.len > BATCH_SIZE:
        agent.train_model()

    # record rewards dynamically
    with open(record_filename, "ab") as f:
        pickle.dump([ep, total_reward], f)

    # Save a checkpoint after a high-scoring episode and every 100 episodes.
    # Both conditions write to the same file name, so one save is enough.
    if total_reward > 200 or (ep % 100) == 0:
        agent.save_model(ep)

env.close()

In [0]:
  import pandas as pd

# Read back every pickled [episode, reward] record until the file is exhausted,
# then turn the collected rows into a DataFrame.
data = []
with open(record_filename, 'rb') as fr:
    while True:
        try:
            entry = pickle.load(fr)
        except EOFError:
            # End of file: every record has been read.
            break
        data.append(entry)
data = pd.DataFrame(np.array(data))

