# Deep Q-learning with Doom
In this tutorial we will implement deep Q-learning to teach an agent to play Doom.

We will use Keras for the deep learning part, and vizdoom to run doom in python.

## Prerequisites
- python3.7
- pip install numpy pyplot gym tensorflow keras skimage
- vizdoom

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import vizdoom
import os
import time
import keras
import random
from keras.layers import Conv2D, Dense, Flatten, MaxPooling2D
from keras.models import Sequential
from keras.optimizers import SGD
from collections import deque
import numpy as np
from skimage import transform
from IPython.display import display, clear_output

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Initialize DoomGame
We will load the **defend_the_center** scenario.

Hese is the summary of this scenario from https://github.com/mwydmuch/ViZDoom/tree/master/scenarios:

> The purpose of this scenario is to teach the agent that killing the monsters is GOOD and when monsters kill you is BAD. In addition, wasting amunition is not very good either. Agent is rewarded only for killing monsters so he has to figure out the rest for himself.

> Map is a large circle. Player is spawned in the exact center. 5 melee-only, monsters are spawned along the wall. Monsters are killed after a single shot. After dying each monster is respawned after some time. Episode ends when the player dies (it's inevitable becuse of limitted ammo).

> REWARDS: +1 for killing a monster

> Further configuration:

> 3 available buttons: turn left, turn right, shoot (attack)

> death penalty = 1

In [2]:
game = vizdoom.DoomGame()
game.load_config("scenarios/defend_the_center.cfg")

# Visualize the game (set to False to train faster)
game.set_window_visible(True)

# Set screen format to greyscale
# Improves training time
game.set_screen_format(vizdoom.ScreenFormat.GRAY8)

# Make the game end after 2100 ticks (set to 0 to disable)
game.set_episode_timeout(2100)

# Init game
game.init()

# Setup Keras Model
## Let's Define some Hyperparameter

In [3]:
num_episodes       = 500
num_steps          = 100
num_actions        = game.get_available_buttons_size()
replay_buffer_size = 1000000
learning_rate      = 0.001
discount_factor    = 0.7
batch_size         = 64

In [4]:
model = Sequential()
model.add(Conv2D(32, (3, 3), activation='elu', padding="valid", input_shape=(84, 84, 4)))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(64, (3, 3), activation='elu', padding="valid"))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(128, (3, 3), activation='elu', padding="valid"))
model.add(Flatten())
model.add(Dense(512, activation='elu'))
model.add(Dense(128, activation='elu'))
model.add(Dense(game.get_available_buttons_size(), activation=None))
model.summary()
model.compile(loss="mse", optimizer=SGD(lr=learning_rate))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 82, 82, 32)        1184      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 41, 41, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 39, 39, 64)        18496     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 19, 19, 64)        0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 17, 17, 128)       73856     
_________________________________________________________________
flatten_1 (Flatten)          (None, 36992)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               18940416  
__________

In [5]:
def preprocess_frame(frame):
    cropped_frame = frame[30:-10, 30:-30]                             # Crop the screen
    normalized_frame = cropped_frame / 255.0                          # Normalize pixel values    
    preprocessed_frame = transform.resize(normalized_frame, [84, 84]) # Resize
    return preprocessed_frame

In [6]:
# Initialize replay deque
replay_buffer = deque(maxlen=replay_buffer_size)

# Initialize frame stack
frame_stack = deque(maxlen=4)

# For every episode
episode_loss = float("nan")
for episode in range(num_episodes):
    clear_output(wait=True)
    print("-- Episode {}/{} --\n".format(episode, num_episodes))
    print("Episode loss:", episode_loss)
    
    # Start new episode
    game.new_episode()
    
    # Initialize frame stack with the first frame of the game
    initial_frame = preprocess_frame(game.get_state().screen_buffer)
    for _ in range(4):
        frame_stack.append(initial_frame)
    state = np.stack(frame_stack, axis=2) # Stack the frames to setup the inital state
    
    episode_loss = 0
    while game.is_episode_finished():    
        # Get action with highest Q-value for current state
        action = np.argmax(model.predict_on_batch(np.expand_dims(state, axis=0)))
        action_one_hot = [False] * num_actions
        action_one_hot[action] = True
        
        # Take action and get reward
        reward = game.make_action(action_one_hot)
        
        # Break if the episode is finished
        if game.is_episode_finished():
            break
            
        # Episode is not fished
        # Get new state
        frame_stack.append(preprocess_frame(game.get_state().screen_buffer))
        new_state = np.stack(frame_stack, axis=2)

        # Store the experience
        replay_buffer.append((state, action, reward, new_state))
        state = new_state

        # Train network on a random sample of previous expreiences
        if len(replay_buffer) >= batch_size:
            # Get replay batch
            replay_batch      = random.sample(replay_buffer, batch_size)
            replay_state      = np.array([r[0] for r in replay_batch])
            replay_reward     = np.array([r[2] for r in replay_batch])
            replay_next_state = np.array([r[3] for r in replay_batch])

            # Q_target = reward + gamma * max_a' Q(s')
            Q_target = np.expand_dims(replay_reward, axis=1) + discount_factor * model.predict_on_batch(replay_next_state)

            # Run training pass
            loss += model.train_on_batch(replay_state, Q_target)

-- Episode 499/500 --

Episode loss: 0


# Let's Evaluate the Model

In [7]:
for episode in range(10):
    game.new_episode()
    while game.is_episode_finished():
        action_one_hot = [False] * game.get_available_buttons_size()
        #action_one_hot[action] = True
        
        # Take action and get reward
        reward = game.make_action(action_one_hot)