## Step 1: Import the libraries 📚

In [1]:
import tensorflow as tf      # Deep Learning library
import numpy as np           # Handle matrices
from vizdoom import *        # Doom Environment

import random                # Handling random number generation
import time                  # Handling time calculation
from skimage import transform# Help us to preprocess the frames

from collections import deque# Ordered collection with ends
import matplotlib.pyplot as plt # Display graphs

import warnings # This ignore all the warning messages that are normally printed during the training because of skiimage
warnings.filterwarnings('ignore') 

  from ._conv import register_converters as _register_converters


## Step 2: Create our environment 🎮
- Now that we imported the libraries/dependencies, we will create our environment.
- Doom environment takes:
    - A `configuration file` that **handle all the options** (size of the frame, possible actions...)
    - A `scenario file`: that **generates the correct scenario** (in our case basic **but you're invited to try other scenarios**).
- Note: We have 3 possible actions `[[0,0,1], [1,0,0], [0,1,0]]` so we don't need to do one hot encoding (thanks to < a href="https://stackoverflow.com/users/2237916/silgon">silgon</a> for figuring out. 

### Our environment
<img src="assets/doom.png" style="max-width:500px;" alt="Doom"/>
                                    
- A monster is spawned **randomly somewhere along the opposite wall**. 
- Player can only go **left/right and shoot**. 
- 1 hit is enough **to kill the monster**. 
- Episode finishes when **monster is killed or on timeout (300)**.
<br><br>
REWARDS:

- +101 for killing the monster 
- -5 for missing 
- Episode ends after killing the monster or on timeout.
- living reward = -1

In [2]:
"""
Here we create our environment
"""
def create_environment():
    game = DoomGame()
    
    # Load the correct configuration
    game.load_config("deadly_corridor.cfg")
    
    # Load the correct scenario (in our case basic scenario)
    game.set_doom_scenario_path("deadly_corridor.wad")
    
    possible_actions = np.identity(7,dtype=int).tolist()
    
    return game, possible_actions
       

In [3]:
game,possible_actions = create_environment()

In [4]:
#test_environment()

## Step 3: Define the preprocessing functions ⚙️
### preprocess_frame
Preprocessing is an important step, <b>because we want to reduce the complexity of our states to reduce the computation time needed for training.</b>
<br><br>
Our steps:
- Grayscale each of our frames (because <b> color does not add important information </b>). But this is already done by the config file.
- Crop the screen (in our case we remove the roof because it contains no information)
- We normalize pixel values
- Finally we resize the preprocessed frame

In [5]:
"""
    preprocess_frame:
    Take a frame.
    Resize it.
        __________________
        |                 |
        |                 |
        |                 |
        |                 |
        |_________________|
        
        to
        _____________
        |            |
        |            |
        |            |
        |____________|
    Normalize it.
    
    return preprocessed_frame
    
    """
def preprocess_frame(frame):
    # Greyscale frame already done in our vizdoom config
    # x = np.mean(frame,-1)
    # Crop the screen (remove the roof because it contains no information)
    cropped_frame = frame[15:-5,20:-20]
    
    # Normalize Pixel Values
    normalized_frame = cropped_frame/255.0
    
    # Resize
    preprocessed_frame = transform.resize(normalized_frame, [100,120])
    
    return preprocessed_frame

### stack_frames
👏 This part was made possible thanks to help of <a href="https://github.com/Miffyli">Anssi</a><br>

As explained in this really <a href="https://danieltakeshi.github.io/2016/11/25/frame-skipping-and-preprocessing-for-deep-q-networks-on-atari-2600-games/">  good article </a> we stack frames.

Stacking frames is really important because it helps us to **give have a sense of motion to our Neural Network.**

- First we preprocess frame
- Then we append the frame to the deque that automatically **removes the oldest frame**
- Finally we **build the stacked state**

This is how work stack:
- For the first frame, we feed 4 frames
- At each timestep, **we add the new frame to deque and then we stack them to form a new stacked frame**
- And so on
<img src="https://raw.githubusercontent.com/simoninithomas/Deep_reinforcement_learning_Course/master/DQN/Space%20Invaders/assets/stack_frames.png" alt="stack">
- If we're done, **we create a new stack with 4 new frames (because we are in a new episode)**.

In [6]:
stack_size = 4 # We stack 4 frames

# Initialize deque with zero-images one array for each image
stacked_frames  =  deque([np.zeros((100,120), dtype=np.int) for i in range(stack_size)], maxlen=4) 

def stack_frames(stacked_frames, state, is_new_episode):
    # Preprocess frame
    frame = preprocess_frame(state)
    
    if is_new_episode:
        # Clear our stacked_frames
        stacked_frames = deque([np.zeros((100,120), dtype=np.int) for i in range(stack_size)], maxlen=4)
        
        # Because we're in a new episode, copy the same frame 4x
        stacked_frames.append(frame)
        stacked_frames.append(frame)
        stacked_frames.append(frame)
        stacked_frames.append(frame)
        
        # Stack the frames
        stacked_state = np.stack(stacked_frames, axis=2)
        
    else:
        # Append frame to deque, automatically removes the oldest frame
        stacked_frames.append(frame)

        # Build the stacked state (first dimension specifies different frames)
        stacked_state = np.stack(stacked_frames, axis=2) 
    
    return stacked_state, stacked_frames

## Step 4: Set up our hyperparameters ⚗️
In this part we'll set up our different hyperparameters. But when you implement a Neural Network by yourself you will **not implement hyperparamaters at once but progressively**.

- First, you begin by defining the neural networks hyperparameters when you implement the model.
- Then, you'll add the training hyperparameters when you implement the training algorithm.

In [7]:
### MODEL HYPERPARAMETERS
state_size = [100,120,4]      # Our input is a stack of 4 frames hence 84x84x4 (Width, height, channels) 
action_size = game.get_available_buttons_size()              # 3 possible actions: left, right, shoot
learning_rate =  0.00025     # Alpha (aka learning rate)

### TRAINING HYPERPARAMETERS
total_episodes = 5000      # Total episodes for training
max_steps = 5000              # Max possible steps in an episode
batch_size = 64             

# Exploration parameters for epsilon greedy strategy
explore_start = 1.0            # exploration probability at start
explore_stop = 0.01            # minimum exploration probability 
decay_rate = 0.000005         # exponential decay rate for exploration prob

# Q learning hyperparameters
gamma = 0.95               # Discounting rate

#targetq hyperparameters
max_tau = 10000

### MEMORY HYPERPARAMETERS
pretrain_length = 100000  # Number of experiences stored in the Memory when initialized for the first time
memory_size = 100000         # Number of experiences the Memory can keep

### MODIFY THIS TO FALSE IF YOU JUST WANT TO SEE THE TRAINED AGENT
training = True

## TURN THIS TO TRUE IF YOU WANT TO RENDER THE ENVIRONMENT
episode_render = True

## Step 5: Create our Deep Q-learning Neural Network model 🧠
<img src="https://raw.githubusercontent.com/simoninithomas/Deep_reinforcement_learning_Course/master/DQN/doom/assets/model.png" alt="Model" />
This is our Deep Q-learning model:
- We take a stack of 4 frames as input
- It passes through 3 convnets
- Then it is flatened
- Finally it passes through 2 FC layers
- It outputs a Q value for each actions

In [8]:
class DQNetwork:
    def __init__(self, state_size, action_size, learning_rate, name):
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate
        self.name = name
        
        with tf.variable_scope(self.name):
            # We create the placeholders
            # *state_size means that we take each elements of state_size in tuple hence is like if we wrote
            # [None, 100,120,4]
            self.inputs_ = tf.placeholder(tf.float32, [None, *state_size], name="inputs")
            self.ISWeights_ = tf.placeholder(tf.float32, [None,1], name='IS_weights')
            self.actions_ = tf.placeholder(tf.float32, [None, action_size], name="actions_")
            
            # Remember that target_Q is the R(s,a) + ymax Qhat(s', a')
            self.target_Q = tf.placeholder(tf.float32, [None], name="target")
            
            """
            First convnet:
            CNN
            BatchNormalization
            ELU
            """
            
            self.conv1 = tf.layers.conv2d(inputs = self.inputs_,
                                         filters = 32,
                                         kernel_size = [8,8],
                                         strides = [4,4],
                                         padding = "VALID",
                                          kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                         name = "conv1")
            

            
            self.conv1_out = tf.nn.elu(self.conv1, name="conv1_out")

            
            
            """
            Second convnet:
            CNN
            BatchNormalization
            ELU
            """
            self.conv2 = tf.layers.conv2d(inputs = self.conv1_out,
                                 filters = 64,
                                 kernel_size = [4,4],
                                 strides = [2,2],
                                 padding = "VALID",
                                kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                 name = "conv2")

            self.conv2_out = tf.nn.elu(self.conv2, name="conv2_out")
            ## --> [9, 9, 64]
            
            
            """
            Third convnet:
            CNN
            BatchNormalization
            ELU
            """
            self.conv3 = tf.layers.conv2d(inputs = self.conv2_out,
                                 filters = 128,
                                 kernel_size = [4,4],
                                 strides = [2,2],
                                 padding = "VALID",
                                kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                 name = "conv3")
        
            self.conv3_out = tf.nn.elu(self.conv3, name="conv3_out")
            ## --> [3, 3, 128]
            
            
            self.flatten = tf.layers.flatten(self.conv3_out)
            ## --> [1152]
            
            ##This branch of the net calculates the value of the state, how good is to be at that state
            ##if you are about to die no matter what action you choose the state has low value
            ##--input: flatten layer, 1152
            ##--output: 512 neurons
            self.value_fc = tf.layers.dense(inputs = self.flatten,
                                            units = 512,
                                            activation = tf.nn.elu,
                                            kernel_initializer = tf.contrib.layers.xavier_initializer(),
                                            name = "value_fc")
            
            ##--input: flatten layer, 512
            ##--output: 1 neuron with the V value
            self.value = tf.layers.dense(inputs = self.value_fc,
                                        units = 1,
                                        activation = None,
                                        kernel_initializer = tf.contrib.layers.xavier_initializer(),
                                        name = "value")
            
            #calculates the advantage of an action over the rest for that state
            #input: flatten layer, 1152
            #output: 512 neurons
            self.advantage_fc = tf.layers.dense(inputs = self.flatten,
                                              units = 512,
                                              activation = tf.nn.elu,
                                              kernel_initializer= tf.contrib.layers.xavier_initializer(),
                                              name = "advantage_fc")
            
            self.advantage = tf.layers.dense(inputs = self.advantage_fc,
                                           units = self.action_size,
                                           activation = None,
                                           kernel_initializer = tf.contrib.layers.xavier_initializer(),
                                            name = "advantage")
            
        
            #aggregation layer
            self.output = self.value + tf.subtract(self.advantage, tf.reduce_mean(self.advantage,axis = 1, keepdims = True))

            # Q is our predicted Q value.
            self.Q = tf.reduce_sum(tf.multiply(self.output, self.actions_), axis=1)
            
            # The loss is modified because of PER 
            self.absolute_errors = tf.abs(self.target_Q - self.Q)# for updating Sumtree
              
            # The loss is the difference between our predicted Q_values and the Q_target
            # Sum(Qtarget - Q)^2
            self.loss = tf.reduce_mean(self.ISWeights_ * tf.squared_difference(self.target_Q,self.Q))
            
            self.optimizer = tf.train.RMSPropOptimizer(self.learning_rate).minimize(self.loss)

In [9]:
class SumTree(object):
    """
    This SumTree code is modified version of Morvan Zhou: 
    https://github.com/MorvanZhou/Reinforcement-learning-with-tensorflow/blob/master/contents/5.2_Prioritized_Replay_DQN/RL_brain.py
    """
    data_pointer = 0
    
    """
    Here we initialize the tree with all nodes = 0, and initialize the data with all values = 0
    """
    def __init__(self, capacity):
        self.capacity = capacity # Number of leaf nodes (final nodes) that contains experiences
        
        # Generate the tree with all nodes values = 0
        # To understand this calculation (2 * capacity - 1) look at the schema above
        # Remember we are in a binary node (each node has max 2 children) so 2x size of leaf (capacity) - 1 (root node)
        # Parent nodes = capacity - 1
        # Leaf nodes = capacity
        self.tree = np.zeros(2 * capacity - 1)
        
        """ tree:
            0
           / \
          0   0
         / \ / \
        0  0 0  0  [Size: capacity] it's at this line that there is the priorities score (aka pi)
        """
        
        # Contains the experiences (so the size of data is capacity)
        self.data = np.zeros(capacity, dtype=object)
    
    
    """
    Here we add our priority score in the sumtree leaf and add the experience in data
    """
    def add(self, priority, data):
        # Look at what index we want to put the experience
        tree_index = self.data_pointer + self.capacity - 1
        
        """ tree:
            0
           / \
          0   0
         / \ / \
tree_index  0 0  0  We fill the leaves from left to right
        """
        
        # Update data frame
        self.data[self.data_pointer] = data
        
        # Update the leaf
        self.update (tree_index, priority)
        
        # Add 1 to data_pointer
        self.data_pointer += 1
        
        if self.data_pointer >= self.capacity:  # If we're above the capacity, you go back to first index (we overwrite)
            self.data_pointer = 0
            
    
    """
    Update the leaf priority score and propagate the change through tree
    """
    def update(self, tree_index, priority):
        # Change = new priority score - former priority score
        change = priority - self.tree[tree_index]
        self.tree[tree_index] = priority
        
        # then propagate the change through tree
        while tree_index != 0:    # this method is faster than the recursive loop in the reference code
            
            """
            Here we want to access the line above
            THE NUMBERS IN THIS TREE ARE THE INDEXES NOT THE PRIORITY VALUES
            
                0
               / \
              1   2
             / \ / \
            3  4 5  [6] 
            
            If we are in leaf at index 6, we updated the priority score
            We need then to update index 2 node
            So tree_index = (tree_index - 1) // 2
            tree_index = (6-1)//2
            tree_index = 2 (because // round the result)
            """
            tree_index = (tree_index - 1) // 2
            self.tree[tree_index] += change
    
    
    """
    Here we get the leaf_index, priority value of that leaf and experience associated with that index
    """
    def get_leaf(self, v):
        """
        Tree structure and array storage:
        Tree index:
             0         -> storing priority sum
            / \
          1     2
         / \   / \
        3   4 5   6    -> storing priority for experiences
        Array type for storing:
        [0,1,2,3,4,5,6]
        """
        parent_index = 0
        
        while True: # the while loop is faster than the method in the reference code
            left_child_index = 2 * parent_index + 1
            right_child_index = left_child_index + 1
            
            # If we reach bottom, end the search
            if left_child_index >= len(self.tree):
                leaf_index = parent_index
                break
            
            else: # downward search, always search for a higher priority node
                
                if v <= self.tree[left_child_index]:
                    parent_index = left_child_index
                    
                else:
                    v -= self.tree[left_child_index]
                    parent_index = right_child_index
            
        data_index = leaf_index - self.capacity + 1

        return leaf_index, self.tree[leaf_index], self.data[data_index]
    
    @property
    def total_priority(self):
        return self.tree[0] # Returns the root node

In [10]:
# Reset the graph
tf.reset_default_graph()

# Instantiate the DQNetwork
DeepQNetwork = DQNetwork(state_size, action_size, learning_rate,"DeepQNetwork")

#create the QTargetNetwork
TargetQNetwork = DQNetwork(state_size,action_size, learning_rate, "TargetQNetwork")

In [11]:
def update_target_network():
    #every tau steps we copy the weights from our deep network to our TargetNetwok
    
    from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,"DQNetwork")
    to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,"TargetQNetwork")
    
    op_holder = []
    
    for from_var,to_var in zip(from_vars,to_vars):
        op_holder.append(to_var.assign(from_var))
    return op_holder

## Step 6: Experience Replay 🔁
Now that we create our Neural Network, **we need to implement the Experience Replay method.** <br><br>
Here we'll create the Memory object that creates a deque.A deque (double ended queue) is a data type that **removes the oldest element each time that you add a new element.**

This part was taken from Udacity : <a href="https://github.com/udacity/deep-learning/blob/master/reinforcement/Q-learning-cart.ipynb" Cartpole DQN</a>

In [12]:
class Memory(object):  # stored as ( s, a, r, s_ ) in SumTree
    """
    This SumTree code is modified version and the original code is from:
    https://github.com/jaara/AI-blog/blob/master/Seaquest-DDQN-PER.py
    """
    PER_e = 0.01  # Hyperparameter that we use to avoid some experiences to have 0 probability of being taken
    PER_a = 0.6  # Hyperparameter that we use to make a tradeoff between taking only exp with high priority and sampling randomly
    PER_b = 0.4  # importance-sampling, from initial value increasing to 1
    
    PER_b_increment_per_sampling = 0.001
    
    absolute_error_upper = 1.  # clipped abs error

    def __init__(self, capacity):
        # Making the tree 
        """
        Remember that our tree is composed of a sum tree that contains the priority scores at his leaf
        And also a data array
        We don't use deque because it means that at each timestep our experiences change index by one.
        We prefer to use a simple array and to overwrite when the memory is full.
        """
        self.tree = SumTree(capacity)
        
    """
    Store a new experience in our tree
    Each new experience have a score of max_prority (it will be then improved when we use this exp to train our DDQN)
    """
    def store(self, experience):
        # Find the max priority
        max_priority = np.max(self.tree.tree[-self.tree.capacity:])
        
        # If the max priority = 0 we can't put priority = 0 since this exp will never have a chance to be selected
        # So we use a minimum priority
        if max_priority == 0:
            max_priority = self.absolute_error_upper
        
        self.tree.add(max_priority, experience)   # set the max p for new p

        
    """
    - First, to sample a minibatch of k size, the range [0, priority_total] is / into k ranges.
    - Then a value is uniformly sampled from each range
    - We search in the sumtree, the experience where priority score correspond to sample values are retrieved from.
    - Then, we calculate IS weights for each minibatch element
    """
    def sample(self, n):
        # Create a sample array that will contains the minibatch
        memory_b = []
        
        b_idx, b_ISWeights = np.empty((n,), dtype=np.int32), np.empty((n, 1), dtype=np.float32)
        
        # Calculate the priority segment
        # Here, as explained in the paper, we divide the Range[0, ptotal] into n ranges
        priority_segment = self.tree.total_priority / n       # priority segment
    
        # Here we increasing the PER_b each time we sample a new minibatch
        self.PER_b = np.min([1., self.PER_b + self.PER_b_increment_per_sampling])  # max = 1
        
        # Calculating the max_weight
        p_min = np.min(self.tree.tree[-self.tree.capacity:]) / self.tree.total_priority
        max_weight = (p_min * n) ** (-self.PER_b)
        
        for i in range(n):
            """
            A value is uniformly sample from each range
            """
            a, b = priority_segment * i, priority_segment * (i + 1)
            value = np.random.uniform(a, b)
            
            """
            Experience that correspond to each value is retrieved
            """
            index, priority, data = self.tree.get_leaf(value)
            
            #P(j)
            sampling_probabilities = priority / self.tree.total_priority
            
            #  IS = (1/N * 1/P(i))**b /max wi == (N*P(i))**-b  /max wi
            b_ISWeights[i, 0] = np.power(n * sampling_probabilities, -self.PER_b)/ max_weight
                                   
            b_idx[i]= index
            
            experience = [data]
            
            memory_b.append(experience)
        
        return b_idx, memory_b, b_ISWeights
    
    """
    Update the priorities on the tree
    """
    def batch_update(self, tree_idx, abs_errors):
        abs_errors += self.PER_e  # convert to abs and avoid 0
        clipped_errors = np.minimum(abs_errors, self.absolute_error_upper)
        ps = np.power(clipped_errors, self.PER_a)

        for ti, p in zip(tree_idx, ps):
            self.tree.update(ti, p)

Here we'll **deal with the empty memory problem**: we pre-populate our memory by taking random actions and storing the experience (state, action, reward, new_state).

In [13]:

memory = Memory(memory_size)

game.init()

# Render the environment
game.new_episode()

for i in range(pretrain_length):
    # If it's the first step
    if i == 0:
        # First we need a state
        state = game.get_state().screen_buffer
        state, stacked_frames = stack_frames(stacked_frames, state, True)
    
    # Random action
    action = random.choice(possible_actions)
    
    # Get the rewards
    reward = game.make_action(action)
    
    # Look if the episode is finished
    done = game.is_episode_finished()

    # If we're dead
    if done:
        # We finished the episode
        next_state = np.zeros(state.shape)
        
        # Add experience to memory
        #experience = np.hstack((state, [action, reward], next_state, done))
        
        experience = state, action, reward, next_state, done
        memory.store(experience)
        
        # Start a new episode
        game.new_episode()
        
        # First we need a state
        state = game.get_state().screen_buffer
        
        # Stack the frames
        state, stacked_frames = stack_frames(stacked_frames, state, True)
        
    else:
        # Get the next state
        next_state = game.get_state().screen_buffer
        next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
        
        # Add experience to memory
        experience = state, action, reward, next_state, done
        memory.store(experience)
        
        # Our state is now the next_state
        state = next_state
        
game.close()

## Step 7: Set up Tensorboard 📊
For more information about tensorboard, please watch this <a href="https://www.youtube.com/embed/eBbEDRsCmv4">excellent 30min tutorial</a> <br><br>
To launch tensorboard : `tensorboard --logdir=/tensorboard/dqn/1`

In [14]:
# Setup TensorBoard Writer
writer = tf.summary.FileWriter("/tensorboard/dqn/1")

## Losses
tf.summary.scalar("Loss", DeepQNetwork.loss)

write_op = tf.summary.merge_all()

## Step 8: Train our Agent 🏃‍♂️

Our algorithm:
<br>
* Initialize the weights
* Init the environment
* Initialize the decay rate (that will use to reduce epsilon) 
<br><br>
* **For** episode to max_episode **do** 
    * Make new episode
    * Set step to 0
    * Observe the first state $s_0$
    <br><br>
    * **While** step < max_steps **do**:
        * Increase decay_rate
        * With $\epsilon$ select a random action $a_t$, otherwise select $a_t = \mathrm{argmax}_a Q(s_t,a)$
        * Execute action $a_t$ in simulator and observe reward $r_{t+1}$ and new state $s_{t+1}$
        * Store transition $<s_t, a_t, r_{t+1}, s_{t+1}>$ in memory $D$
        * Sample random mini-batch from $D$: $<s, a, r, s'>$
        * Set $\hat{Q} = r$ if the episode ends at $+1$, otherwise set $\hat{Q} = r + \gamma \max_{a'}{Q(s', a')}$
        * Make a gradient descent step with loss $(\hat{Q} - Q(s, a))^2$
    * **endfor**
    <br><br>
* **endfor**

    

In [None]:
"""
This function will do the part
With ϵ select a random action atat, otherwise select at=argmaxaQ(st,a)
"""
def predict_action(explore_start, explore_stop, decay_rate, decay_step, state, actions):
    ## EPSILON GREEDY STRATEGY
    # Choose action a from state s using epsilon greedy.
    ## First we randomize a number
    exp_exp_tradeoff = np.random.rand()

    # Here we'll use an improved version of our epsilon greedy strategy used in Q-learning notebook
    explore_probability = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * decay_step)
    
    if (explore_probability > exp_exp_tradeoff):
        # Make a random action (exploration)
        action = random.choice(possible_actions)
        
    else:
        # Get action from Q-network (exploitation)
        # Estimate the Qs values state
        Qs = sess.run(DeepQNetwork.output, feed_dict = {DeepQNetwork.inputs_: state.reshape((1, *state.shape))})
        
        # Take the biggest Q value (= the best action)
        choice = np.argmax(Qs)
        action = possible_actions[int(choice)]
                
    return action, explore_probability

In [None]:
# Saver will help us to save our model
saver = tf.train.Saver()

if training == True:
    with tf.Session() as sess:
        # Initialize the variables
        sess.run(tf.global_variables_initializer())
        writer.add_graph(sess.graph)
        tau = 0
        
        # Initialize the decay rate (that will use to reduce epsilon) 
        decay_step = 0

        # Init the game
        game.init()
        
                
        # Update the parameters of our TargetNetwork with DQN_weights
        update_target = update_target_network()
        sess.run(update_target)

        for episode in range(total_episodes):
            # Set step to 0
            step = 0
            
            # Initialize the rewards of the episode
            episode_rewards = []
            
            # Make a new episode and observe the first state
            game.new_episode()
            state = game.get_state().screen_buffer
            
            # Remember that stack frame function also call our preprocess function.
            state, stacked_frames = stack_frames(stacked_frames, state, True)

            while step < max_steps:
                step += 1
                tau += 1
                
                # Increase decay_step
                decay_step +=1
                
                # Predict the action to take and take it
                action, explore_probability = predict_action(explore_start, explore_stop, decay_rate, decay_step, state, possible_actions)

                # Do the action
                reward = game.make_action(action)

                # Look if the episode is finished
                done = game.is_episode_finished()
                
                # Add the reward to total reward
                episode_rewards.append(reward)

                # If the game is finished
                if done:
                    # the episode ends so no next state
                    next_state = np.zeros((84,84), dtype=np.int)
                    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)

                    # Set step = max_steps to end the episode
                    step = max_steps

                    # Get the total reward of the episode
                    total_reward = np.sum(episode_rewards)

                    print('Episode: {}'.format(episode),
                              'Total reward: {}'.format(total_reward),
                              'Training loss: {:.4f}'.format(loss),
                              'Explore P: {:.4f}'.format(explore_probability))

                    experience = state, action, reward, next_state, done
                    memory.store(experience)

                else:
                    # Get the next state
                    next_state = game.get_state().screen_buffer
                    
                    # Stack the frame of the next_state
                    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
                    

                    experience = state, action, reward, next_state, done
                    memory.store(experience)
                    
                    # st+1 is now our current state
                    state = next_state


                ### LEARNING PART            
                # Obtain random mini-batch from memory
                tree_idx, batch, ISWeights_mb = memory.sample(batch_size)
                
                states_mb = np.array([each[0][0] for each in batch], ndmin=3)
                actions_mb = np.array([each[0][1] for each in batch])
                rewards_mb = np.array([each[0][2] for each in batch]) 
                next_states_mb = np.array([each[0][3] for each in batch], ndmin=3)
                dones_mb = np.array([each[0][4] for each in batch])

                target_Qs_batch = []

                 # Get Q values for next_state 
                Qs_next_state = sess.run(DeepQNetwork.output, feed_dict = {DeepQNetwork.inputs_: next_states_mb})
                
                #calculate q target in the target network
                q_target = sess.run(TargetQNetwork.output, feed_dict={ TargetQNetwork.inputs_: next_states_mb})

                # Set Q_target = r if the episode ends at s+1, otherwise set Q_target = r + gamma*maxQ(s', a')
                for i in range(0, len(batch)):
                    terminal = dones_mb[i]

                    #a' for s'
                    action = np.argmax(Qs_next_state[i])
                    
                    # If we are in a terminal state, only equals reward
                    if terminal:
                        target_Qs_batch.append(rewards_mb[i])
                        
                    else:
                        target = rewards_mb[i] + gamma * q_target[i][action]
                        target_Qs_batch.append(target)
                        

                targets_mb = np.array([each for each in target_Qs_batch])

                _,loss,absolute_errors = sess.run([DeepQNetwork.optimizer, DeepQNetwork.loss, DeepQNetwork.absolute_errors],
                                    feed_dict={DeepQNetwork.inputs_: states_mb,
                                               DeepQNetwork.target_Q: targets_mb,
                                               DeepQNetwork.actions_: actions_mb,
                                              DeepQNetwork.ISWeights_ : ISWeights_mb})
                # Update priority
                memory.batch_update(tree_idx, absolute_errors)

                # Write TF Summaries
                summary = sess.run(write_op, feed_dict={DeepQNetwork.inputs_: states_mb,
                                                   DeepQNetwork.target_Q: targets_mb,
                                                   DeepQNetwork.actions_: actions_mb,
                                                       DeepQNetwork.ISWeights_ : ISWeights_mb})
                writer.add_summary(summary, episode)
                writer.flush()
                                
                #update the target weights by running the operations 
                if tau >= max_tau:
                    update_target_ops = update_target_network()
                    sess.run(update_target_ops)
                    tau = 0
                

            # Save model every 5 episodes
            if episode % 5 == 0:
                save_path = saver.save(sess, "./models/ddqn/model.ckpt")
                print("Model Saved")

Episode: 0 Total reward: -115.99858093261719 Training loss: 0.3062 Explore P: 0.9996
Model Saved
Episode: 1 Total reward: -103.89242553710938 Training loss: 0.6124 Explore P: 0.9992
Episode: 2 Total reward: -102.23280334472656 Training loss: 0.4451 Explore P: 0.9988
Episode: 3 Total reward: -106.90557861328125 Training loss: 20.7345 Explore P: 0.9984
Episode: 4 Total reward: -89.5968017578125 Training loss: 0.1705 Explore P: 0.9980
Episode: 5 Total reward: -76.04985046386719 Training loss: 0.1700 Explore P: 0.9976
Model Saved
Episode: 6 Total reward: -115.51789855957031 Training loss: 0.1713 Explore P: 0.9972
Episode: 7 Total reward: -104.93006896972656 Training loss: 9.9324 Explore P: 0.9967
Episode: 8 Total reward: -115.19496154785156 Training loss: 0.3095 Explore P: 0.9956
Episode: 9 Total reward: -72.86117553710938 Training loss: 20.8214 Explore P: 0.9952
Episode: 10 Total reward: -107.9078369140625 Training loss: 10.3431 Explore P: 0.9948
Model Saved
Episode: 11 Total reward: -95.

Episode: 94 Total reward: -82.12098693847656 Training loss: 11.7134 Explore P: 0.9564
Episode: 95 Total reward: -115.98101806640625 Training loss: 0.4596 Explore P: 0.9560
Model Saved
Episode: 96 Total reward: -94.08897399902344 Training loss: 0.5619 Explore P: 0.9556
Episode: 97 Total reward: -113.15425109863281 Training loss: 0.3889 Explore P: 0.9553
Episode: 98 Total reward: -97.69773864746094 Training loss: 12.4555 Explore P: 0.9542
Episode: 99 Total reward: -111.57856750488281 Training loss: 2.2992 Explore P: 0.9537
Episode: 100 Total reward: -84.38200378417969 Training loss: 0.8334 Explore P: 0.9531
Model Saved
Episode: 101 Total reward: -89.73367309570312 Training loss: 1.4960 Explore P: 0.9526
Episode: 102 Total reward: -89.63766479492188 Training loss: 10.5701 Explore P: 0.9522
Episode: 103 Total reward: -112.34382629394531 Training loss: 1.2647 Explore P: 0.9518
Episode: 104 Total reward: -93.84669494628906 Training loss: 0.3860 Explore P: 0.9515
Episode: 105 Total reward: -9

Episode: 187 Total reward: -103.77287292480469 Training loss: 0.4769 Explore P: 0.9183
Episode: 188 Total reward: -114.32127380371094 Training loss: 1.3570 Explore P: 0.9179
Episode: 189 Total reward: -114.76132202148438 Training loss: 2.4155 Explore P: 0.9176
Episode: 190 Total reward: -87.49594116210938 Training loss: 0.9952 Explore P: 0.9172
Model Saved
Episode: 191 Total reward: -115.83930969238281 Training loss: 10.9802 Explore P: 0.9168
Episode: 192 Total reward: -97.80282592773438 Training loss: 0.2595 Explore P: 0.9164
Episode: 193 Total reward: -115.73477172851562 Training loss: 0.8558 Explore P: 0.9161
Episode: 194 Total reward: -62.76287841796875 Training loss: 0.8721 Explore P: 0.9157
Episode: 195 Total reward: -105.42851257324219 Training loss: 14.9905 Explore P: 0.9152
Model Saved
Episode: 196 Total reward: -104.12919616699219 Training loss: 0.2220 Explore P: 0.9148
Episode: 197 Total reward: -40.43463134765625 Training loss: 14.4782 Explore P: 0.9144
Episode: 198 Total r

Episode: 280 Total reward: -66.16658020019531 Training loss: 0.4431 Explore P: 0.8788
Model Saved
Episode: 281 Total reward: -112.11479187011719 Training loss: 21.0753 Explore P: 0.8784
Episode: 282 Total reward: -114.21015930175781 Training loss: 0.1380 Explore P: 0.8782
Episode: 283 Total reward: -79.78514099121094 Training loss: 0.7057 Explore P: 0.8779
Episode: 284 Total reward: -75.12216186523438 Training loss: 0.2505 Explore P: 0.8775
Episode: 285 Total reward: -115.77424621582031 Training loss: 0.1796 Explore P: 0.8771
Model Saved
Episode: 286 Total reward: -106.8531494140625 Training loss: 0.2215 Explore P: 0.8766
Episode: 287 Total reward: -94.25498962402344 Training loss: 10.7350 Explore P: 0.8763
Episode: 288 Total reward: -70.81039428710938 Training loss: 0.9406 Explore P: 0.8757
Episode: 289 Total reward: -76.46037292480469 Training loss: 14.6402 Explore P: 0.8751
Episode: 290 Total reward: -105.51100158691406 Training loss: 0.3838 Explore P: 0.8747
Model Saved
Episode: 29

Episode: 373 Total reward: -83.20372009277344 Training loss: 1.6205 Explore P: 0.8451
Episode: 374 Total reward: -70.28773498535156 Training loss: 1.7852 Explore P: 0.8446
Episode: 375 Total reward: -95.43223571777344 Training loss: 0.2728 Explore P: 0.8442
Model Saved
Episode: 376 Total reward: -114.42521667480469 Training loss: 0.3101 Explore P: 0.8440
Episode: 377 Total reward: -86.51982116699219 Training loss: 0.3361 Explore P: 0.8435
Episode: 378 Total reward: -100.27497863769531 Training loss: 14.2838 Explore P: 0.8431
Episode: 379 Total reward: -73.62715148925781 Training loss: 0.3647 Explore P: 0.8428
Episode: 380 Total reward: -50.926483154296875 Training loss: 0.3366 Explore P: 0.8423
Model Saved
Episode: 381 Total reward: -114.95895385742188 Training loss: 15.1790 Explore P: 0.8419
Episode: 382 Total reward: -103.85812377929688 Training loss: 16.6373 Explore P: 0.8415
Episode: 383 Total reward: -114.78256225585938 Training loss: 0.2259 Explore P: 0.8408
Episode: 384 Total re

Model Saved
Episode: 466 Total reward: -104.73997497558594 Training loss: 1.1153 Explore P: 0.8096
Episode: 467 Total reward: -102.33126831054688 Training loss: 5.3575 Explore P: 0.8092
Episode: 468 Total reward: -73.35488891601562 Training loss: 0.2021 Explore P: 0.8089
Episode: 469 Total reward: -87.968017578125 Training loss: 0.2317 Explore P: 0.8086
Episode: 470 Total reward: -84.24415588378906 Training loss: 0.7397 Explore P: 0.8082
Model Saved
Episode: 471 Total reward: -114.18896484375 Training loss: 0.5833 Explore P: 0.8079
Episode: 472 Total reward: -104.57099914550781 Training loss: 1.3057 Explore P: 0.8073
Episode: 473 Total reward: -89.37989807128906 Training loss: 0.3035 Explore P: 0.8069
Episode: 474 Total reward: -75.82432556152344 Training loss: 9.3056 Explore P: 0.8066
Episode: 475 Total reward: -114.98403930664062 Training loss: 0.5117 Explore P: 0.8062
Model Saved
Episode: 476 Total reward: -73.32766723632812 Training loss: 0.1577 Explore P: 0.8058
Episode: 477 Total

Episode: 559 Total reward: -110.93797302246094 Training loss: 0.7693 Explore P: 0.7746
Episode: 560 Total reward: -80.13720703125 Training loss: 0.2032 Explore P: 0.7743
Model Saved
Episode: 561 Total reward: -85.03221130371094 Training loss: 0.2777 Explore P: 0.7740
Episode: 562 Total reward: -65.24063110351562 Training loss: 1.9034 Explore P: 0.7736
Episode: 563 Total reward: -102.83773803710938 Training loss: 0.3091 Explore P: 0.7733
Episode: 564 Total reward: -69.25889587402344 Training loss: 1.0537 Explore P: 0.7729
Episode: 565 Total reward: -87.15437316894531 Training loss: 0.3314 Explore P: 0.7723
Model Saved
Episode: 566 Total reward: 9.619552612304688 Training loss: 0.2638 Explore P: 0.7715
Episode: 567 Total reward: -64.29466247558594 Training loss: 15.8557 Explore P: 0.7711
Episode: 568 Total reward: -64.50265502929688 Training loss: 0.2238 Explore P: 0.7708
Episode: 569 Total reward: -22.763015747070312 Training loss: 0.9759 Explore P: 0.7705
Episode: 570 Total reward: -94

Episode: 652 Total reward: -58.33209228515625 Training loss: 0.2402 Explore P: 0.7425
Episode: 653 Total reward: -87.84193420410156 Training loss: 12.6414 Explore P: 0.7422
Episode: 654 Total reward: -115.12496948242188 Training loss: 5.1549 Explore P: 0.7418
Episode: 655 Total reward: -79.83497619628906 Training loss: 0.7295 Explore P: 0.7414
Model Saved
Episode: 656 Total reward: -42.673126220703125 Training loss: 9.6777 Explore P: 0.7411
Episode: 657 Total reward: -80.4168701171875 Training loss: 0.5837 Explore P: 0.7407
Episode: 658 Total reward: -92.43092346191406 Training loss: 11.1675 Explore P: 0.7402
Episode: 659 Total reward: -106.82963562011719 Training loss: 14.2496 Explore P: 0.7401
Episode: 660 Total reward: -101.1947021484375 Training loss: 0.2416 Explore P: 0.7399
Model Saved
Episode: 661 Total reward: -20.066726684570312 Training loss: 0.2575 Explore P: 0.7396
Episode: 662 Total reward: -102.18011474609375 Training loss: 0.1720 Explore P: 0.7394
Episode: 663 Total rewa

Episode: 745 Total reward: -115.89054870605469 Training loss: 13.9441 Explore P: 0.7148
Model Saved
Episode: 746 Total reward: -95.88581848144531 Training loss: 17.5176 Explore P: 0.7145
Episode: 747 Total reward: -54.07597351074219 Training loss: 0.5659 Explore P: 0.7136
Episode: 748 Total reward: -94.544921875 Training loss: 15.1746 Explore P: 0.7133
Episode: 749 Total reward: -106.07919311523438 Training loss: 1.3968 Explore P: 0.7130
Episode: 750 Total reward: -103.4560546875 Training loss: 0.2607 Explore P: 0.7129
Model Saved
Episode: 751 Total reward: -64.7669677734375 Training loss: 0.4064 Explore P: 0.7125
Episode: 752 Total reward: -38.39347839355469 Training loss: 0.1737 Explore P: 0.7121
Episode: 753 Total reward: -61.87028503417969 Training loss: 1.5442 Explore P: 0.7117
Episode: 754 Total reward: -54.92510986328125 Training loss: 0.3489 Explore P: 0.7114
Episode: 755 Total reward: -34.69401550292969 Training loss: 0.6586 Explore P: 0.7106
Model Saved
Episode: 756 Total rew

Episode: 838 Total reward: -109.33152770996094 Training loss: 3.4704 Explore P: 0.6856
Episode: 839 Total reward: -53.52496337890625 Training loss: 5.3110 Explore P: 0.6854
Episode: 840 Total reward: -70.88014221191406 Training loss: 16.5006 Explore P: 0.6851
Model Saved
Episode: 841 Total reward: -84.19572448730469 Training loss: 0.2683 Explore P: 0.6848
Episode: 842 Total reward: -81.27275085449219 Training loss: 0.6302 Explore P: 0.6845
Episode: 843 Total reward: -90.20562744140625 Training loss: 0.2012 Explore P: 0.6843
Episode: 844 Total reward: -27.384475708007812 Training loss: 0.2481 Explore P: 0.6840
Episode: 845 Total reward: -70.51200866699219 Training loss: 5.4617 Explore P: 0.6838
Model Saved
Episode: 846 Total reward: -77.88473510742188 Training loss: 0.6215 Explore P: 0.6835
Episode: 847 Total reward: -27.852554321289062 Training loss: 1.8137 Explore P: 0.6833
Episode: 848 Total reward: -86.81159973144531 Training loss: 0.4564 Explore P: 0.6829
Episode: 849 Total reward:

Model Saved
Episode: 931 Total reward: -61.2415771484375 Training loss: 18.6300 Explore P: 0.6594
Episode: 932 Total reward: -41.49824523925781 Training loss: 0.4449 Explore P: 0.6591
Episode: 933 Total reward: -72.8021240234375 Training loss: 0.2623 Explore P: 0.6589
Episode: 934 Total reward: -48.18354797363281 Training loss: 0.2983 Explore P: 0.6586
Episode: 935 Total reward: -102.05839538574219 Training loss: 11.7334 Explore P: 0.6585
Model Saved
Episode: 936 Total reward: -93.12200927734375 Training loss: 0.1802 Explore P: 0.6583
Episode: 937 Total reward: -69.50863647460938 Training loss: 0.4184 Explore P: 0.6581
Episode: 938 Total reward: -79.55982971191406 Training loss: 0.2851 Explore P: 0.6579
Episode: 939 Total reward: -43.74687194824219 Training loss: 2.7371 Explore P: 0.6576
Episode: 940 Total reward: -43.46736145019531 Training loss: 1.1369 Explore P: 0.6573
Model Saved
Episode: 941 Total reward: -68.41046142578125 Training loss: 1.5289 Explore P: 0.6571
Episode: 942 Tota

Episode: 1024 Total reward: -63.71565246582031 Training loss: 0.4083 Explore P: 0.6350
Episode: 1025 Total reward: -113.44839477539062 Training loss: 0.2332 Explore P: 0.6349
Model Saved
Episode: 1026 Total reward: -97.58724975585938 Training loss: 0.1821 Explore P: 0.6347
Episode: 1027 Total reward: -12.840606689453125 Training loss: 0.5584 Explore P: 0.6343
Episode: 1028 Total reward: -44.70875549316406 Training loss: 0.1758 Explore P: 0.6340
Episode: 1029 Total reward: -78.897216796875 Training loss: 0.4320 Explore P: 0.6338
Episode: 1030 Total reward: -87.35614013671875 Training loss: 0.5264 Explore P: 0.6335
Model Saved
Episode: 1031 Total reward: -80.76669311523438 Training loss: 0.8729 Explore P: 0.6333
Episode: 1032 Total reward: -7.77655029296875 Training loss: 0.2503 Explore P: 0.6330
Episode: 1033 Total reward: -90.92988586425781 Training loss: 0.6613 Explore P: 0.6328
Episode: 1034 Total reward: -53.602996826171875 Training loss: 0.2359 Explore P: 0.6325
Episode: 1035 Total

Model Saved
Episode: 1116 Total reward: -72.46185302734375 Training loss: 0.3030 Explore P: 0.6116
Episode: 1117 Total reward: -64.90573120117188 Training loss: 0.3943 Explore P: 0.6113
Episode: 1118 Total reward: -40.18852233886719 Training loss: 0.2559 Explore P: 0.6111
Episode: 1119 Total reward: -49.9923095703125 Training loss: 3.7441 Explore P: 0.6109
Episode: 1120 Total reward: -77.79727172851562 Training loss: 0.2959 Explore P: 0.6107
Model Saved
Episode: 1121 Total reward: -8.741943359375 Training loss: 5.9518 Explore P: 0.6104
Episode: 1122 Total reward: -70.56138610839844 Training loss: 11.0764 Explore P: 0.6102
Episode: 1123 Total reward: -75.44866943359375 Training loss: 0.2586 Explore P: 0.6099
Episode: 1124 Total reward: -60.54100036621094 Training loss: 0.2526 Explore P: 0.6097
Episode: 1125 Total reward: -52.11228942871094 Training loss: 0.4043 Explore P: 0.6093
Model Saved
Episode: 1126 Total reward: -54.481231689453125 Training loss: 0.4024 Explore P: 0.6090
Episode: 

Episode: 1208 Total reward: -59.593017578125 Training loss: 0.7538 Explore P: 0.5885
Episode: 1209 Total reward: -64.252685546875 Training loss: 0.4909 Explore P: 0.5883
Episode: 1210 Total reward: 0.830352783203125 Training loss: 0.9149 Explore P: 0.5881
Model Saved
Episode: 1211 Total reward: -82.36257934570312 Training loss: 3.0975 Explore P: 0.5877
Episode: 1212 Total reward: -79.88844299316406 Training loss: 0.2812 Explore P: 0.5875
Episode: 1213 Total reward: -16.02667236328125 Training loss: 0.3698 Explore P: 0.5872
Episode: 1214 Total reward: -70.24151611328125 Training loss: 6.4088 Explore P: 0.5869
Episode: 1215 Total reward: -106.83871459960938 Training loss: 13.3005 Explore P: 0.5867
Model Saved
Episode: 1216 Total reward: -43.06178283691406 Training loss: 0.3183 Explore P: 0.5865
Episode: 1217 Total reward: -81.78581237792969 Training loss: 13.4245 Explore P: 0.5862
Episode: 1218 Total reward: -62.252349853515625 Training loss: 1.2333 Explore P: 0.5860
Episode: 1219 Total 

Episode: 1300 Total reward: -55.2110595703125 Training loss: 0.3056 Explore P: 0.5670
Model Saved
Episode: 1301 Total reward: -100.72206115722656 Training loss: 0.6914 Explore P: 0.5667
Episode: 1302 Total reward: -70.04103088378906 Training loss: 0.3193 Explore P: 0.5666
Episode: 1303 Total reward: -49.95391845703125 Training loss: 0.5045 Explore P: 0.5664
Episode: 1304 Total reward: -54.73457336425781 Training loss: 0.6541 Explore P: 0.5661
Episode: 1305 Total reward: -93.03265380859375 Training loss: 0.2391 Explore P: 0.5659
Model Saved
Episode: 1306 Total reward: 12.479217529296875 Training loss: 0.2057 Explore P: 0.5657
Episode: 1307 Total reward: -75.61476135253906 Training loss: 14.0661 Explore P: 0.5656
Episode: 1308 Total reward: -71.11785888671875 Training loss: 12.0187 Explore P: 0.5654
Episode: 1309 Total reward: -99.31944274902344 Training loss: 0.9118 Explore P: 0.5651
Episode: 1310 Total reward: -46.74273681640625 Training loss: 0.1754 Explore P: 0.5649
Model Saved
Episo

Episode: 1392 Total reward: -95.25445556640625 Training loss: 0.2091 Explore P: 0.5469
Episode: 1393 Total reward: -69.1083984375 Training loss: 1.6935 Explore P: 0.5467
Episode: 1394 Total reward: -45.42927551269531 Training loss: 1.1948 Explore P: 0.5464
Episode: 1395 Total reward: -5.01141357421875 Training loss: 0.9202 Explore P: 0.5462
Model Saved
Episode: 1396 Total reward: -42.57720947265625 Training loss: 0.1965 Explore P: 0.5460
Episode: 1397 Total reward: -15.10198974609375 Training loss: 0.2679 Explore P: 0.5458
Episode: 1398 Total reward: 5.1365966796875 Training loss: 0.5015 Explore P: 0.5456
Episode: 1399 Total reward: -76.43162536621094 Training loss: 1.0853 Explore P: 0.5453
Episode: 1400 Total reward: -71.07650756835938 Training loss: 0.2530 Explore P: 0.5451
Model Saved
Episode: 1401 Total reward: -87.95417785644531 Training loss: 1.0493 Explore P: 0.5448
Episode: 1402 Total reward: -41.091217041015625 Training loss: 0.4042 Explore P: 0.5447
Episode: 1403 Total reward

Episode: 1484 Total reward: -45.91960144042969 Training loss: 14.1671 Explore P: 0.5270
Episode: 1485 Total reward: -95.59530639648438 Training loss: 0.5708 Explore P: 0.5268
Model Saved
Episode: 1486 Total reward: -112.10321044921875 Training loss: 0.3573 Explore P: 0.5266
Episode: 1487 Total reward: 0.5525665283203125 Training loss: 10.3311 Explore P: 0.5264
Episode: 1488 Total reward: -45.32438659667969 Training loss: 0.9388 Explore P: 0.5262
Episode: 1489 Total reward: -13.90594482421875 Training loss: 0.7263 Explore P: 0.5259
Episode: 1490 Total reward: -71.04579162597656 Training loss: 0.4714 Explore P: 0.5257
Model Saved
Episode: 1491 Total reward: -16.681304931640625 Training loss: 0.2321 Explore P: 0.5255
Episode: 1492 Total reward: -109.56668090820312 Training loss: 0.3792 Explore P: 0.5252
Episode: 1493 Total reward: -84.73527526855469 Training loss: 0.7702 Explore P: 0.5250
Episode: 1494 Total reward: -46.289093017578125 Training loss: 0.2015 Explore P: 0.5248
Episode: 1495

Model Saved
Episode: 1576 Total reward: -54.39631652832031 Training loss: 0.2631 Explore P: 0.5086
Episode: 1577 Total reward: -78.21131896972656 Training loss: 0.6128 Explore P: 0.5084
Episode: 1578 Total reward: -56.736907958984375 Training loss: 0.1944 Explore P: 0.5082
Episode: 1579 Total reward: -24.927291870117188 Training loss: 0.4871 Explore P: 0.5080
Episode: 1580 Total reward: -79.22634887695312 Training loss: 0.4020 Explore P: 0.5078
Model Saved
Episode: 1581 Total reward: -15.131027221679688 Training loss: 10.8184 Explore P: 0.5076
Episode: 1582 Total reward: -6.572357177734375 Training loss: 0.3250 Explore P: 0.5074
Episode: 1583 Total reward: -7.88897705078125 Training loss: 0.7160 Explore P: 0.5072
Episode: 1584 Total reward: -44.94921875 Training loss: 0.9963 Explore P: 0.5070
Episode: 1585 Total reward: -7.37774658203125 Training loss: 0.2452 Explore P: 0.5068
Model Saved
Episode: 1586 Total reward: 16.663970947265625 Training loss: 0.3529 Explore P: 0.5066
Episode: 15

Episode: 1668 Total reward: 19.347747802734375 Training loss: 0.5436 Explore P: 0.4911
Episode: 1669 Total reward: -63.18296813964844 Training loss: 0.2983 Explore P: 0.4909
Episode: 1670 Total reward: -23.006378173828125 Training loss: 0.4634 Explore P: 0.4907
Model Saved
Episode: 1671 Total reward: -27.243301391601562 Training loss: 0.2748 Explore P: 0.4905
Episode: 1672 Total reward: 33.71142578125 Training loss: 0.2876 Explore P: 0.4903
Episode: 1673 Total reward: -48.82524108886719 Training loss: 15.5684 Explore P: 0.4900
Episode: 1674 Total reward: -40.75927734375 Training loss: 1.6972 Explore P: 0.4898
Episode: 1675 Total reward: -81.02975463867188 Training loss: 10.4458 Explore P: 0.4897
Model Saved
Episode: 1676 Total reward: -48.34629821777344 Training loss: 0.5639 Explore P: 0.4895
Episode: 1677 Total reward: -45.465545654296875 Training loss: 10.3093 Explore P: 0.4893
Episode: 1678 Total reward: 48.04345703125 Training loss: 9.3864 Explore P: 0.4891
Episode: 1679 Total rewa

Episode: 1760 Total reward: -51.69517517089844 Training loss: 0.6215 Explore P: 0.4735
Model Saved
Episode: 1761 Total reward: -21.567184448242188 Training loss: 0.2613 Explore P: 0.4732
Episode: 1762 Total reward: -17.883743286132812 Training loss: 0.5506 Explore P: 0.4730
Episode: 1763 Total reward: -52.482269287109375 Training loss: 8.8640 Explore P: 0.4728
Episode: 1764 Total reward: -18.89813232421875 Training loss: 0.1503 Explore P: 0.4726
Episode: 1765 Total reward: -68.19035339355469 Training loss: 0.4583 Explore P: 0.4725
Model Saved
Episode: 1766 Total reward: -72.83242797851562 Training loss: 1.6843 Explore P: 0.4723
Episode: 1767 Total reward: 3.027252197265625 Training loss: 0.2805 Explore P: 0.4722
Episode: 1768 Total reward: -22.565216064453125 Training loss: 12.3534 Explore P: 0.4720
Episode: 1769 Total reward: -100.80328369140625 Training loss: 0.8054 Explore P: 0.4719
Episode: 1770 Total reward: -73.239990234375 Training loss: 0.8070 Explore P: 0.4717
Model Saved
Epis

Episode: 1852 Total reward: -49.05070495605469 Training loss: 2.0653 Explore P: 0.4578
Episode: 1853 Total reward: -4.7795562744140625 Training loss: 0.2440 Explore P: 0.4576
Episode: 1854 Total reward: -44.197509765625 Training loss: 2.6010 Explore P: 0.4575
Episode: 1855 Total reward: -29.551620483398438 Training loss: 0.1904 Explore P: 0.4573
Model Saved
Episode: 1856 Total reward: -36.472808837890625 Training loss: 0.2085 Explore P: 0.4572
Episode: 1857 Total reward: -61.59342956542969 Training loss: 1.0569 Explore P: 0.4570
Episode: 1858 Total reward: -31.966293334960938 Training loss: 0.2433 Explore P: 0.4568
Episode: 1859 Total reward: 89.83642578125 Training loss: 0.2323 Explore P: 0.4566
Episode: 1860 Total reward: -1.123992919921875 Training loss: 3.5730 Explore P: 0.4564
Model Saved
Episode: 1861 Total reward: 19.174758911132812 Training loss: 0.3544 Explore P: 0.4562
Episode: 1862 Total reward: -58.41009521484375 Training loss: 0.3496 Explore P: 0.4560
Episode: 1863 Total r

Episode: 1944 Total reward: -102.42630004882812 Training loss: 2.0444 Explore P: 0.4424
Episode: 1945 Total reward: -68.28244018554688 Training loss: 2.1989 Explore P: 0.4423
Model Saved
Episode: 1946 Total reward: -29.183944702148438 Training loss: 1.2509 Explore P: 0.4421
Episode: 1947 Total reward: 67.87394714355469 Training loss: 0.3161 Explore P: 0.4419
Episode: 1948 Total reward: -52.136810302734375 Training loss: 0.5287 Explore P: 0.4417
Episode: 1949 Total reward: 30.264846801757812 Training loss: 0.4431 Explore P: 0.4416
Episode: 1950 Total reward: -16.54229736328125 Training loss: 0.2437 Explore P: 0.4413
Model Saved
Episode: 1951 Total reward: -14.96466064453125 Training loss: 0.3447 Explore P: 0.4412
Episode: 1952 Total reward: 11.146514892578125 Training loss: 0.1776 Explore P: 0.4410
Episode: 1953 Total reward: -43.7984619140625 Training loss: 1.3639 Explore P: 0.4408
Episode: 1954 Total reward: -11.09429931640625 Training loss: 3.2047 Explore P: 0.4406
Episode: 1955 Tota

Model Saved
Episode: 2036 Total reward: 50.20768737792969 Training loss: 2.3406 Explore P: 0.4276
Episode: 2037 Total reward: 53.92460632324219 Training loss: 0.2144 Explore P: 0.4275
Episode: 2038 Total reward: -45.93121337890625 Training loss: 0.7778 Explore P: 0.4273
Episode: 2039 Total reward: -1.27935791015625 Training loss: 0.3799 Explore P: 0.4271
Episode: 2040 Total reward: -22.203811645507812 Training loss: 0.3063 Explore P: 0.4269
Model Saved
Episode: 2041 Total reward: -8.868194580078125 Training loss: 1.5385 Explore P: 0.4267
Episode: 2042 Total reward: -18.177886962890625 Training loss: 0.2550 Explore P: 0.4266
Episode: 2043 Total reward: -11.360580444335938 Training loss: 0.2113 Explore P: 0.4264
Episode: 2044 Total reward: -12.715667724609375 Training loss: 0.2579 Explore P: 0.4263
Episode: 2045 Total reward: -108.8477783203125 Training loss: 0.2521 Explore P: 0.4260
Model Saved
Episode: 2046 Total reward: -13.126678466796875 Training loss: 1.4490 Explore P: 0.4258
Episo

Episode: 2128 Total reward: 5.0299072265625 Training loss: 0.5323 Explore P: 0.4133
Episode: 2129 Total reward: -37.68975830078125 Training loss: 1.6776 Explore P: 0.4131
Episode: 2130 Total reward: -37.3697509765625 Training loss: 0.9828 Explore P: 0.4130
Model Saved
Episode: 2131 Total reward: -77.851806640625 Training loss: 0.6023 Explore P: 0.4128
Episode: 2132 Total reward: -78.85560607910156 Training loss: 0.2488 Explore P: 0.4127
Episode: 2133 Total reward: 4.5402374267578125 Training loss: 0.5364 Explore P: 0.4126
Episode: 2134 Total reward: -51.85444641113281 Training loss: 0.4414 Explore P: 0.4124
Episode: 2135 Total reward: -34.200836181640625 Training loss: 0.4838 Explore P: 0.4122
Model Saved
Episode: 2136 Total reward: 1.275604248046875 Training loss: 2.1285 Explore P: 0.4121
Episode: 2137 Total reward: -47.65159606933594 Training loss: 0.2472 Explore P: 0.4120
Episode: 2138 Total reward: -79.62344360351562 Training loss: 0.3357 Explore P: 0.4119
Episode: 2139 Total rewar

Episode: 2220 Total reward: -15.617446899414062 Training loss: 0.2769 Explore P: 0.3997
Model Saved
Episode: 2221 Total reward: -52.860321044921875 Training loss: 0.7155 Explore P: 0.3995
Episode: 2222 Total reward: -60.461456298828125 Training loss: 13.0680 Explore P: 0.3994
Episode: 2223 Total reward: -41.72447204589844 Training loss: 0.9248 Explore P: 0.3992
Episode: 2224 Total reward: 16.802932739257812 Training loss: 0.3574 Explore P: 0.3991
Episode: 2225 Total reward: -70.36886596679688 Training loss: 0.7280 Explore P: 0.3990
Model Saved
Episode: 2226 Total reward: -72.93392944335938 Training loss: 0.2537 Explore P: 0.3989
Episode: 2227 Total reward: 31.724380493164062 Training loss: 1.8645 Explore P: 0.3987
Episode: 2228 Total reward: -26.133712768554688 Training loss: 0.2843 Explore P: 0.3986
Episode: 2229 Total reward: 98.41883850097656 Training loss: 0.2528 Explore P: 0.3984
Episode: 2230 Total reward: -29.225784301757812 Training loss: 0.2457 Explore P: 0.3983
Model Saved
Ep

Episode: 2312 Total reward: -0.8868865966796875 Training loss: 1.0343 Explore P: 0.3863
Episode: 2313 Total reward: 46.5220947265625 Training loss: 0.3236 Explore P: 0.3861
Episode: 2314 Total reward: 89.11686706542969 Training loss: 2.6473 Explore P: 0.3860
Episode: 2315 Total reward: -27.362136840820312 Training loss: 0.3314 Explore P: 0.3858
Model Saved
Episode: 2316 Total reward: -15.900146484375 Training loss: 0.1906 Explore P: 0.3856
Episode: 2317 Total reward: 4.6962738037109375 Training loss: 0.2923 Explore P: 0.3855
Episode: 2318 Total reward: -32.70780944824219 Training loss: 1.2767 Explore P: 0.3853
Episode: 2319 Total reward: 35.5758056640625 Training loss: 1.0274 Explore P: 0.3852
Episode: 2320 Total reward: -80.88156127929688 Training loss: 7.3514 Explore P: 0.3850
Model Saved
Episode: 2321 Total reward: -97.81341552734375 Training loss: 0.3799 Explore P: 0.3849
Episode: 2322 Total reward: 53.28514099121094 Training loss: 0.3109 Explore P: 0.3847
Episode: 2323 Total rewar

Episode: 2404 Total reward: -73.1607666015625 Training loss: 14.5019 Explore P: 0.3736
Episode: 2405 Total reward: -48.47508239746094 Training loss: 0.2950 Explore P: 0.3734
Model Saved
Episode: 2406 Total reward: 28.433837890625 Training loss: 0.4802 Explore P: 0.3733
Episode: 2407 Total reward: 21.669540405273438 Training loss: 0.1783 Explore P: 0.3732
Episode: 2408 Total reward: -69.20222473144531 Training loss: 3.5737 Explore P: 0.3731
Episode: 2409 Total reward: 50.41981506347656 Training loss: 0.4433 Explore P: 0.3729
Episode: 2410 Total reward: -76.35707092285156 Training loss: 0.2116 Explore P: 0.3728
Model Saved
Episode: 2411 Total reward: -13.51739501953125 Training loss: 0.5485 Explore P: 0.3727
Episode: 2412 Total reward: -20.434173583984375 Training loss: 0.2331 Explore P: 0.3725
Episode: 2413 Total reward: -47.94427490234375 Training loss: 0.1609 Explore P: 0.3723
Episode: 2414 Total reward: -39.752716064453125 Training loss: 1.0870 Explore P: 0.3722
Episode: 2415 Total r

Episode: 2496 Total reward: 16.16583251953125 Training loss: 0.4060 Explore P: 0.3611
Episode: 2497 Total reward: -18.386627197265625 Training loss: 0.7756 Explore P: 0.3610
Episode: 2498 Total reward: 27.787063598632812 Training loss: 0.2315 Explore P: 0.3608
Episode: 2499 Total reward: -10.233535766601562 Training loss: 0.3797 Explore P: 0.3607
Episode: 2500 Total reward: 62.781646728515625 Training loss: 0.9558 Explore P: 0.3605
Model Saved
Episode: 2501 Total reward: 19.265884399414062 Training loss: 0.3156 Explore P: 0.3604
Episode: 2502 Total reward: 63.953460693359375 Training loss: 0.1904 Explore P: 0.3603
Episode: 2503 Total reward: -15.7781982421875 Training loss: 0.3280 Explore P: 0.3602
Episode: 2504 Total reward: 5.4593353271484375 Training loss: 0.1859 Explore P: 0.3600
Episode: 2505 Total reward: 2.973388671875 Training loss: 5.9975 Explore P: 0.3599
Model Saved
Episode: 2506 Total reward: -49.84480285644531 Training loss: 0.3077 Explore P: 0.3598
Episode: 2507 Total rew

Episode: 2588 Total reward: -79.22816467285156 Training loss: 2.0441 Explore P: 0.3495
Episode: 2589 Total reward: -82.268310546875 Training loss: 0.4489 Explore P: 0.3495
Episode: 2590 Total reward: -59.94526672363281 Training loss: 0.3531 Explore P: 0.3494
Model Saved
Episode: 2591 Total reward: -7.368743896484375 Training loss: 0.2666 Explore P: 0.3493
Episode: 2592 Total reward: -7.0311279296875 Training loss: 0.5991 Explore P: 0.3491
Episode: 2593 Total reward: -33.73478698730469 Training loss: 11.0655 Explore P: 0.3490
Episode: 2594 Total reward: 46.13853454589844 Training loss: 0.9330 Explore P: 0.3489
Episode: 2595 Total reward: 36.60444641113281 Training loss: 0.2919 Explore P: 0.3487
Model Saved
Episode: 2596 Total reward: -22.8087158203125 Training loss: 0.7663 Explore P: 0.3486
Episode: 2597 Total reward: -29.433746337890625 Training loss: 18.6878 Explore P: 0.3485
Episode: 2598 Total reward: -38.75300598144531 Training loss: 0.7402 Explore P: 0.3484
Episode: 2599 Total rew

Model Saved
Episode: 2681 Total reward: -16.786361694335938 Training loss: 0.2978 Explore P: 0.3382
Episode: 2682 Total reward: -59.24674987792969 Training loss: 0.2092 Explore P: 0.3381
Episode: 2683 Total reward: 11.713226318359375 Training loss: 0.2923 Explore P: 0.3380
Episode: 2684 Total reward: 48.07037353515625 Training loss: 1.0223 Explore P: 0.3378
Episode: 2685 Total reward: 33.00053405761719 Training loss: 0.1904 Explore P: 0.3377
Model Saved
Episode: 2686 Total reward: 29.101791381835938 Training loss: 0.2021 Explore P: 0.3376
Episode: 2687 Total reward: 195.07901000976562 Training loss: 0.6414 Explore P: 0.3373
Episode: 2688 Total reward: 3.1396331787109375 Training loss: 0.2422 Explore P: 0.3372
Episode: 2689 Total reward: 37.428863525390625 Training loss: 1.9466 Explore P: 0.3370
Episode: 2690 Total reward: -75.57559204101562 Training loss: 15.5575 Explore P: 0.3369
Model Saved
Episode: 2691 Total reward: -8.518218994140625 Training loss: 0.3398 Explore P: 0.3368
Episode

Episode: 2773 Total reward: -66.00131225585938 Training loss: 0.6607 Explore P: 0.3266
Episode: 2774 Total reward: -1.1996917724609375 Training loss: 0.8246 Explore P: 0.3265
Episode: 2775 Total reward: -11.212356567382812 Training loss: 0.1941 Explore P: 0.3263
Model Saved
Episode: 2776 Total reward: 25.559494018554688 Training loss: 0.5767 Explore P: 0.3262
Episode: 2777 Total reward: 32.73443603515625 Training loss: 0.4228 Explore P: 0.3261
Episode: 2778 Total reward: 47.29740905761719 Training loss: 0.5938 Explore P: 0.3259
Episode: 2779 Total reward: 36.61305236816406 Training loss: 1.3928 Explore P: 0.3258
Episode: 2780 Total reward: 27.707595825195312 Training loss: 11.7284 Explore P: 0.3257
Model Saved


## Step 9: Watch our Agent play 👀
Now that we trained our agent, we can test it

In [None]:

with tf.Session() as sess:
    game = DoomGame()
    
    # Load the correct configuration (TESTING)
    game.load_config("deadly_corridor_testing.cfg")
    
    # Load the correct scenario (in our case deadly_corridor scenario)
    game.set_doom_scenario_path("deadly_corridor.wad")
    game.init()
    
    totalScore = 0
    
   
    # Load the model
    saver = tf.train.import_meta_graph("./models/ddqn/model.ckpt.meta")
    saver.restore(sess, tf.train.latest_checkpoint("./models/ddqn"))
    game.init()
                  
    for i in range(200):
        
        game.new_episode()
        state = game.get_state().screen_buffer
        state, stacked_frames = stack_frames(stacked_frames, state, True)
        
        while not game.is_episode_finished():
            exp_exp_tradeoff = np.random.rand()
            

            explore_probability = 0.01
    
            if (explore_probability > exp_exp_tradeoff):
                # Make a random action (exploration)
                action = random.choice(possible_actions)
            else:
                # Take the biggest Q value (= the best action)
                Qs = sess.run(DeepQNetwork.output, feed_dict = {DeepQNetwork.inputs_: state.reshape((1, *state.shape))})
                
                choice = np.argmax(Qs)
                action = possible_actions[int(action)]

            game.make_action(action) 
            done = game.is_episode_finished()
            
            if done:
                break
            else:
                next_state = game.get_state().screen_buffer
                next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
                state = next_state
                
        score = game.get_total_reward()
        time.sleep(1)
        print("TOTAL_SCORE", score)
    game.close()