# Discrete DQN

In this implementation, the continuous action space of the BipedalWalker is discretized into 81 actions: each action assigns one value from {-1, 0, 1} to each of the four outputs ($3^4 = 81$ combinations).

## Import Modules

In [2]:
import keras
import gym
from keras.models import Sequential
from keras.layers import Dense 
from keras.optimizers import Adam

import numpy as np
from collections import deque
import random

## Build the Model

### Replay Buffer

In [3]:
class ReplayBuffer:
    """
    Fixed-capacity experience replay buffer.

    Transitions are stored as ``[curr_state, action, reward, next_state]``
    in a ``deque`` bounded by ``buffer_size``; once the deque is full,
    appending a new transition discards the oldest one.

    Attributes:
        buffer: The underlying bounded ``deque`` of transitions.
        capacity: The ``buffer_size`` the buffer was created with.
        len: Number of transitions currently stored. Kept as a plain
            attribute because callers read ``buffer.len`` directly.
    """
    def __init__(self, buffer_size):
        self.buffer = deque(maxlen=buffer_size)
        self.capacity = buffer_size
        self.len = 0

    def sample(self, n_samples):
        """
        Draw up to ``n_samples`` distinct transitions uniformly at random.

        If fewer than ``n_samples`` transitions are stored, all of them are
        returned (in random order).

        Returns:
            A tuple ``(curr_states, actions, rewards, next_states)`` of NumPy
            arrays with dtypes float32, int32, float32 and float32, each
            having one entry per sampled transition.
        """
        n_samples = min(len(self.buffer), n_samples)
        batch = random.sample(self.buffer, n_samples)

        curr_states = np.array([t[0] for t in batch], dtype=np.float32)
        actions = np.array([t[1] for t in batch], dtype=np.int32)
        rewards = np.array([t[2] for t in batch], dtype=np.float32)
        next_states = np.array([t[3] for t in batch], dtype=np.float32)

        return curr_states, actions, rewards, next_states

    def add(self, curr_state, action, reward, next_state):
        """Append one transition, evicting the oldest if the buffer is full."""
        self.buffer.append([curr_state, action, reward, next_state])
        # The deque enforces the capacity itself, so its length is the single
        # source of truth for how many transitions are stored.
        self.len = len(self.buffer)
        
        

### Q Network

In [4]:
class DQN:
    """
    Thin wrapper around the Keras network that estimates Q-values.

    The network maps a state vector of length ``n_inputs`` to one Q-value
    per discrete action, i.e. ``3**n_output_dim`` outputs.

    Attributes:
        learning_rate: Learning rate passed to the Adam optimizer.
        model: The compiled Keras ``Sequential`` model.
    """
    def __init__(self, n_inputs, n_output_dim, learning_rate):
        self.learning_rate = learning_rate
        self.model = self.get_model(n_inputs, n_output_dim)

    def get_model(self, n_input_dim, n_output_dim):
        """Build and compile the fully connected Q-network."""
        model = Sequential()
        model.add(Dense(32, input_dim=n_input_dim, activation='relu'))
        model.add(Dense(64, activation='relu'))
        model.add(Dense(128, activation='relu'))
        model.add(Dense(256, activation='relu'))
        # The output layer is linear: Q-values are unbounded regression
        # targets and can be negative, so a relu (or sigmoid) output would
        # clip them and prevent the network from fitting the MSE targets.
        model.add(Dense(3**n_output_dim, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))

        return model

    def predict(self, states):
        """Return the Q-values predicted for a batch of states."""
        return self.model.predict(states)

    def fit(self, states, targets, epochs=1, verbose=0):
        """Fit the network on ``(states, targets)``, honoring ``epochs`` and ``verbose``."""
        self.model.fit(states, targets, epochs=epochs, verbose=verbose)

    def save(self, filepath):
        """Save the underlying Keras model to ``filepath``."""
        self.model.save(filepath)

## Create the Model

In [5]:
class DQNAgent:
    """
    Epsilon-greedy DQN agent over a discretized action space.

    Each of the ``action_dim`` outputs is restricted to a value in
    {-1, 0, 1}, giving ``3**action_dim`` discrete actions. The agent owns
    a ``DQN`` Q-network and a ``ReplayBuffer`` and trains the network on
    random minibatches drawn from the buffer.

    Attributes:
        model: The ``DQN`` wrapper around the Keras network.
        buffer: The ``ReplayBuffer`` of stored transitions.
        actions: Lookup table mapping an action index to its action vector.
        epsilon: Current exploration probability; multiplied by
            ``epsilon_decay`` after each training step until it reaches
            ``epsilon_min``.
    """
    def __init__(self, state_dim, action_dim, buffer_size=10000,
                 learning_rate=0.001, batch_size=64, gamma=0.95,
                 epsilon=1.00, epsilon_decay=0.999999, epsilon_min=0.001,
                 name='discreteDQN'):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.batch_size = batch_size
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min

        self.name = name
        self.n_actions = 3**action_dim

        self.model = DQN(state_dim, action_dim, learning_rate)
        self.buffer = ReplayBuffer(buffer_size)
        self.actions = self.init_actions()

    def init_actions(self):
        """
        Build the table of discrete actions.

        The action index is read as a base-3 number with ``action_dim``
        digits (most significant digit first); each digit 0/1/2 is shifted
        to -1/0/1.

        Returns:
            A list of ``n_actions`` lists, each of length ``action_dim``.
        """
        actions = []
        for action_idx in range(self.n_actions):
            action = []
            divisor = self.n_actions
            for _ in range(self.action_dim):
                # Integer arithmetic keeps the digit extraction exact.
                divisor //= 3
                action.append((action_idx // divisor) % 3 - 1)
            actions.append(action)
        return actions

    def get_action_idx(self, state):
        """
        Pick an action index with an epsilon-greedy policy.

        Args:
            state: A batch containing a single state, as accepted by the
                network's ``predict``.

        Returns:
            A random index with probability ``epsilon``, otherwise the index
            of the largest predicted Q-value.
        """
        if np.random.rand() < self.epsilon:
            return int(random.randrange(self.n_actions))
        qvalues = self.model.predict(state)
        return int(np.argmax(qvalues))

    def get_action(self, action_idx):
        """Return the action vector for ``action_idx``."""
        return self.actions[action_idx]

    def train_model(self):
        """
        Run one training step on a minibatch sampled from the buffer.

        The target for each sampled transition's taken action is
        ``reward + gamma * max_a Q(next_state, a)``; the targets of all other
        actions are left at the network's current predictions so they
        contribute no error. Afterwards epsilon is decayed towards
        ``epsilon_min``.
        """
        states, actions, rewards, next_states = self.buffer.sample(self.batch_size)
        if len(states) == 0:
            # Nothing stored yet, so there is nothing to train on.
            return

        # NOTE(review): the buffer does not record whether next_state was
        # terminal, so every target bootstraps from next_state.
        next_qvalues = self.model.predict(next_states)
        targets = rewards + self.gamma * np.max(next_qvalues, axis=1)

        training_targets = self.model.predict(states)
        # Index by the number of transitions actually sampled, which can be
        # smaller than batch_size while the buffer is still filling up.
        training_targets[np.arange(len(actions)), actions] = targets
        self.model.fit(states, training_targets, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def store_transition(self, state, action, reward, next_state):
        """Add one transition to the replay buffer."""
        self.buffer.add(state, action, reward, next_state)

    def save_model(self, n_episodes):
        """Save the Keras network to ``<name>_ep<n_episodes>.h5``."""
        # self.model is the DQN wrapper; the Keras model lives in its .model.
        self.model.model.save(self.name + '_ep' + str(n_episodes) + '.h5')

    def load_model(self, model_name):
        """Load a saved Keras network from ``model_name`` into the wrapper."""
        # Replace only the wrapped Keras model so self.model stays a DQN.
        self.model.model = keras.models.load_model(model_name)
        
        

## Setup Gym Environment and Initialize Model

In [6]:
# Run configuration.
BATCH_SIZE = 64
MAX_EPISODES = 100000
MAX_REWARD = 300

env = gym.make('BipedalWalker-v2')
n_state_params = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]
# Pass BATCH_SIZE explicitly so the agent's minibatch size and the
# training loop's "enough samples" threshold cannot drift apart.
agent = DQNAgent(n_state_params, n_actions, batch_size=BATCH_SIZE)
# Per-episode step limit as reported by the environment.
MAX_STEPS = env._max_episode_steps


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


## Run Model

In [7]:
# Main training loop: act epsilon-greedily, store each transition, and run
# one training step per environment step once the buffer holds more than a
# minibatch of samples.
try:
    for ep in range(MAX_EPISODES):
        state = env.reset()
        total_reward = 0
        for t in range(MAX_STEPS):
            # The network expects a batch, so add a leading batch axis.
            action_idx = agent.get_action_idx(np.reshape(state, [1, n_state_params]))
            action = agent.get_action(action_idx)
            next_state, reward, isDone, _ = env.step(action)

            # Store the flat (un-batched) state in the replay buffer.
            agent.store_transition(np.reshape(state, [n_state_params]),
                                   action_idx, reward, next_state)
            state = next_state

            total_reward += reward
            if isDone:
                print("episode: {}/{}, score: {}, e: {:.2}".format(ep, MAX_EPISODES, total_reward, agent.epsilon))
                break

            # Gate on the agent's own batch size so the threshold matches
            # what train_model actually samples.
            if agent.buffer.len > agent.batch_size:
                agent.train_model()
finally:
    # Release the environment even if training is interrupted
    # (e.g. by KeyboardInterrupt in the notebook).
    env.close()

episode: 0/100000, score: -111.86236645039543, e: 1.0
episode: 1/100000, score: -103.86318466452633, e: 1.0
episode: 2/100000, score: -133.32493511108493, e: 1.0
episode: 3/100000, score: -104.93018017781029, e: 1.0
episode: 4/100000, score: -107.67045454182848, e: 1.0
episode: 5/100000, score: -125.76934146014243, e: 1.0
episode: 6/100000, score: -114.23122017936221, e: 1.0
episode: 7/100000, score: -237.50671605550994, e: 1.0
episode: 8/100000, score: -112.81443813058263, e: 0.99
episode: 9/100000, score: -121.49121829751275, e: 0.99
episode: 10/100000, score: -106.20407351755165, e: 0.99
episode: 11/100000, score: -120.85944012957128, e: 0.99
episode: 12/100000, score: -109.5514186022884, e: 0.99
episode: 13/100000, score: -120.34705721405521, e: 0.99


KeyboardInterrupt: 