In [1]:
import tensorflow as tf
import numpy as np
import retro
from skimage import transform
from skimage.color import rgb2gray
import matplotlib.pyplot as plt
from collections import deque
import random
import warnings

env = retro.make(game='SpaceInvaders-Atari2600')
print("Frame size: ", env.observation_space)
print("Actions available: ", env.action_space.n)
possible_actions = np.array(np.identity(env.action_space.n, dtype=int).tolist())

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Frame size:  Box(210, 160, 3)
Actions available:  8


In [2]:
#Preprocessing params
stack_size = 4
stacked_frames = deque([np.zeros((110, 84), dtype=np.int) for i in range(stack_size)], maxlen=4) #clear stack 

def preprocess_frame(frame):
    gray = rgb2gray(frame)
    cropped_frame = gray[8:-12, 4:-12]
    normalized_frame = cropped_frame/255.0
    preprocessed_frame = transform.resize(normalized_frame, [110, 84])
    return preprocessed_frame

#Skip four frames each timestep and stack those frames into queue to provide network sense of position, velocity, acceleration
#Appending frame to deque removes oldest frame on stack; formulate state
def stack_frames(stacked_frames, state, is_new_eps):
    frame = preprocess_frame(state)
    if is_new_eps: #on new eps
        stacked_frames = deque([np.zeros((110, 84), dtype=np.int) for i in range(stack_size)], maxlen=4) #clear stack 
        stacked_frames.append(frame) #init stack on eps
        stacked_frames.append(frame)
        stacked_frames.append(frame)
        stacked_frames.append(frame)

        
        stacked_state = np.stack(stacked_frames, axis=2) #joins frames
    else:
        stacked_frames.append(frame)
        stacked_state = np.stack(stacked_frames, axis=2)
    return stacked_state, stacked_frames


In [3]:
#Model params defining MDP
state_size = [110, 84, 4] #provide stack 4 frames each 110 x 84; each action repeatedly performed for four frames of stack
action_size = env.action_space.n #8 possible actions
learning_rate = 0.00025 #alpha

#Training params defining how DQN will learn
total_episodes = 50
max_steps = 50000
batch_size = 64

#Exploration params for epsilon-greedy action selection
explore_start = 1.0 #max exploration prob
explore_stop = 0.01 #min exploration prob
decay_rate = 0.00001 #gamma param extremely low thus agent will value actions taken long ago

gamma = 0.9

#Q-learning params
pretrain_length = batch_size #experiences in mem at init
memory_size = 1000000 #experiences stored in mem to improve convergence time to optimal state-action values


training = True
episode_render = True


In [4]:
#Class to define architecture of DNN, no training/learning/init done 
class DQNetwork:
    def __init__(self, state_size, action_size, learning_rate, name='DQNetwork'):
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate
        
        with tf.variable_scope(name):
            self.inputs_ = tf.placeholder(tf.float32, [None, *state_size], name="inputs")
            self.actions_ = tf.placeholder(tf.float32, [None, self.action_size], name="actions_")
            
            #sample episilon-greedy action step into env, observe next states, rewards and provide to DNN upon init
            #purpose of stepping through env is to gain exp and iteratively train DNN by minimizing loss 
            #receive set of q-values per actions from DNN, select max and set as target_Q
            #store experience (reward, action, state, next_state) at time step in replay memory
            #using experiences, compute target Q-value which is desirable distribution DNN must approximate
            #iterate to following state
            #load up experiences and provide as vector of inputs to DNN and compute prediction per each experience
            #apply loss function that computes error between DNN output and optimal distribution from stored experiences in memory
            #backpropagate computations to tune weights & repeat; DNN is learning 
            self.target_Q = tf.placeholder(tf.float32, [None], name="target") 
            
            self.conv1 = tf.layers.conv2d(inputs=self.inputs_, 
                                          filters=32,
                                          kernel_size=[8,8],
                                          strides=[4,4],
                                          padding="VALID",
                                          kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                          name="conv1")
            self.conv1_out = tf.nn.elu(self.conv1, name="conv1_out")
            
            self.conv2 = tf.layers.conv2d(inputs=self.conv1_out, 
                                          filters=64,
                                          kernel_size=[4,4],
                                          strides=[2,2],
                                          padding="VALID",
                                          kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                          name="conv2")
            self.conv2_out = tf.nn.elu(self.conv2, name="conv2_out")

            
            self.conv3 = tf.layers.conv2d(inputs=self.conv2_out, 
                                          filters=64,
                                          kernel_size=[3,3],
                                          strides=[2,2],
                                          padding="VALID",
                                          kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                          name="conv3")
            self.conv3_out = tf.nn.elu(self.conv3, name="conv3_out")
            
            self.flatten = tf.contrib.layers.flatten(self.conv3_out)
            
            self.fc = tf.layers.dense(inputs=self.flatten,
                                      units=512,
                                      activation=tf.nn.elu,
                                      kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                      name="fc1")
            
            self.output = tf.layers.dense(inputs=self.fc,
                                          kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                          units=self.action_size,
                                          activation=None)
            
            self.Q = tf.reduce_sum(tf.multiply(self.output, self.actions_)) #predicted Q-value computed by DNN
            self.loss = tf.reduce_mean(tf.square(self.target_Q - self.Q))
            
            self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)


In [5]:
tf.reset_default_graph()
DQNetwork = DQNetwork(state_size, action_size, learning_rate)


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use keras.layers.conv2d instead.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Use keras.layers.dense instead.


In [6]:
#Apply experience replay to ensure agent correctly behaves to previously trained envs 
class Memory():
    def __init__(self, max_size):
        self.buffer = deque(maxlen = max_size)
    def add(self, experience):
        self.buffer.append(experience)
    def sample(self, batch_size):
        buffer_size = len(self.buffer)
        #start from beg, increment by 1 & end of range at buffer_size; sample num from buffer & apply to batch
        #buffer reps all experiences collected upto max_size, sample from it and repeat op over batch_size
        index = np.random.choice(np.arange(buffer_size),
                                 size = batch_size,
                                 replace = False)
        return [self.buffer[i] for i in index]
            
#Populate memory by taking rand actions, storing experience (tuple of: state, action, reward, next_state)
memory = Memory(max_size = memory_size)
for i in range(pretrain_length):
    if i == 0: #first step
        state = env.reset()
        state, stacked_frames = stack_frames(stacked_frames, state, True)
    choice = random.randint(1, len(possible_actions)) - 1
    action = possible_actions[choice]
    next_state, reward, done, info = env.step(action)
    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)

    
    if done:
        next_state = np.zeros(state.shape)
        memory.add((state, action, reward, next_state, done))
        state = env.reset()
        state, stacked_frames = stack_frames(stacked_frames, state, True)
    else:
        memory.add((state, action, reward, next_state, done))
        state = next_state

In [7]:
# Setup TensorBoard Writer; used to visualize graphical struc of NN 
writer = tf.summary.FileWriter("/tensorboard/dqn/1")

## Losses
tf.summary.scalar("Loss", DQNetwork.loss)

write_op = tf.summary.merge_all()

In [8]:
#Use epsilon-greedy action selection
def predict_action(explore_start, explore_stop, decay_rate, decay_step, state, actions):
    exp_tradeoff = np.random.rand()
    explore_probability = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * decay_step)
    #Explore random action
    if(explore_probability > exp_tradeoff):
        choice = random.randint(1, len(possible_actions)) - 1
        action = possible_actions[choice]
    #Choose most valuable greedy action using DQN
    else:
        Qs = sess.run(DQNetwork.output, feed_dict = {DQNetwork.inputs_: state.reshape((1, *state.shape))})
        choice = np.argmax(Qs)
        action = possible_actions[choice]
    return action, explore_probability

In [None]:
#Train DQN using stored experiences and estimated network output at each time-step until loss is minimal & func approx optimized
saver = tf.train.Saver()

if training == True:
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        decay_step = 0
        for episode in range(total_episodes):
            step = 0
            episode_rewards = []
            state = env.reset()
            state, stacked_frames = stack_frames(stacked_frames, state, True)
            while step < max_steps:
                step += 1
                decay_step += 1
                action, explore_probability = predict_action(explore_start, explore_stop, decay_rate, decay_step, state, possible_actions)
                next_state, reward, done, information = env.step(action)
                if episode_render:
                    env.render()
                episode_rewards.append(reward)
                
                if done:
                    next_state = np.zeros((110, 84), dtype=np.int)
                    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
                    step = max_steps
                    total_reward = np.sum(episode_rewards)
                    print('Episode: {}'.format(episode),'Total reward: {}'.format(total_reward),'Explore P: {:.4f}'.format(explore_probability), 'Training Loss {:.4f}'.format(loss))
                    memory.add((state, action, reward, next_state, done))
                else:
                    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
                    memory.add((state, action, reward, next_state, done))
                    state = next_state
                    
                #Retrieve information from memory 
                batch = memory.sample(batch_size)
                states_mb = np.array([each[0] for each in batch], ndmin = 3)
                actions_mb = np.array([each[1] for each in batch])
                rewards_mb = np.array([each[2] for each in batch])
                next_states_mb = np.array([each[3] for each in batch])
                dones_mb = np.array([each[4] for each in batch])
                
                target_Qs_batch = []
                
                #Obtain expected Q-vals per all following states to compute Q-val for current state: target_Q
                Qs_next_state = sess.run(DQNetwork.output, feed_dict = {DQNetwork.inputs_: next_states_mb})
                
                #Set Q_target = r if the episode ends at next-state, otherwise set Q_target = r + gamma*maxQ(s', a')
                for i in range(0, len(batch)):
                    terminal = dones_mb[i]
                    if terminal:
                        target_Qs_batch.append(rewards_mb[i])
                    else:
                        target = rewards_mb[i] + gamma * np.max(Qs_next_state[i])
                        target_Qs_batch.append(target)
                #Collect Q-vals of current states for entire batch of actions        
                targets_mb = np.array([each for each in target_Qs_batch])
                
                #Compute loss by comparing calculated Q_target val against predicted Q-val found via DQN
                #Feed vals to placeholders of tf graph for computation
                loss, _ = sess.run([DQNetwork.loss, DQNetwork.optimizer], feed_dict={DQNetwork.inputs_:states_mb, DQNetwork.target_Q: targets_mb, DQNetwork.actions_: actions_mb})
                summary = sess.run(write_op, feed_dict={DQNetwork.inputs_:states_mb, DQNetwork.target_Q: targets_mb, DQNetwork.actions_: actions_mb})
                writer.add_summary(summary, episode)
                writer.flush()
            if episode % 5 == 0:
                save_path = saver.save(sess, "./models/model.ckpt")
                print("Model Saved") 
                
                

In [9]:
with tf.Session() as sess:
    total_test_rewards = []
    saver.restore(sess, "./models/model.ckpt")
    
    for episode in range(1):
        total_rewards = 0
        
        state = env.reset()
        state, stacked_frames = stack_frames(stacked_frames, state, True)
        
        print("****************************************************")
        print("EPISODE ", episode)
        
        while True:
            # Reshape the state
            state = state.reshape((1, *state_size))

            Qs = sess.run(DQNetwork.output, feed_dict = {DQNetwork.inputs_: state})
            choice = np.argmax(Qs)
            action = possible_actions[choice]
            next_state, reward, done, _ = env.step(action)
            env.render()
            
            total_rewards += reward

            if done:
                print ("Score", total_rewards)
                total_test_rewards.append(total_rewards)
                break
                
                
            next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
            state = next_state
            
    env.close()

NameError: name 'saver' is not defined