In [17]:
%matplotlib inline
import numpy as np
import tensorflow as tf
import gym
from gym.wrappers import Monitor
import os
import random
import itertools
import sys
import psutil #library for retrieving information on running processes and system utilization
from collections import namedtuple

In [2]:
env = gym.make('Breakout-v0')

In [3]:
#ATARI Actions: 0 (noop), 1(fire), 2(left), 3(right) are valid actions
VALID_ACTIONS = [0, 1, 2, 3]

In [4]:
class StateProcessor():
    '''
    Processes raw atari images. First convert to grayscale and then resize
    '''
    def __init__(self):
        # Build the Tensorflow graph
        with tf.variable_scope('state_processor'):
            self.input_state = tf.placeholder(shape=[210, 160, 3], dtype=tf.uint8)
            self.output = tf.image.rgb_to_grayscale(self.input_state)
            self.output = tf.image.crop_to_bounding_box(self.output, 34, 0, 160, 160)# Why 34, 0 are needed?
            self.output = tf.image.resize_images(self.output,
                [84, 84], method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
            self.output = tf.squeeze(self.output)
    
    def process(self, sess, state):
        '''
        Args:
            sess: A TensorFLow session Object
            state: A [210, 160, 3] Atari RGB State
        
        Returns:
            A processed [84, 84] state representing grayscale values
        '''
        return sess.run(self.output, feed_dict = {self.input_state: state})

In [66]:
class Network():
    '''
    Q-Value Estimator Neural Network
    This Network will be used to create the Q-Network and the Target Network
    '''
    def __init__(self, scope='network', summaries_dir=None):
        self.scope = scope
        #Writes TensorBoard summaries to disk
        self.summary_writer = None
        with tf.variable_scope(scope):
            #Build the graph
            self.build_graph()
            if summaries_dir:
                summary_dir = os.path.join(summaries_dir, "summaries_{}".format(scope))
                if not os.path.exists(summary_dir):
                    os.makedirs(summary_dir)
                self.summary_writer = tf.summary.FileWriter(summary_dir)
    
    def build_graph(self):
        '''
        Builds the Tensorflow Graph
        '''
        
        # Placeholders for our inputs
        # Input is 4 RGB frames of shape [84, 84] each
        self.X_ph = tf.placeholder(shape=[None, 84, 84, 4], dtype=tf.uint8, name='X')
        
        #The TD Target value
        self.y_ph = tf.placeholder(shape=[None], dtype=tf.float32, name='y')
        
        #Integer id of which action was selected
        self.actions_ph = tf.placeholder(shape=[None], dtype=tf.int32, name='actions')

        batch_size = tf.shape(self.X_ph)[0]
        
        #Normalize the input
        X = tf.to_float(self.X_ph)/255.0
                
        # 3 convolutional layers
        conv1 = tf.layers.conv2d(inputs=X, filters=32, kernel_size=8, strides=4, activation=tf.nn.relu)
        conv2 = tf.layers.conv2d(inputs=conv1, filters=64, kernel_size=4, strides=2, activation=tf.nn.relu)
        conv3 = tf.layers.conv2d(inputs=conv2, filters=64, kernel_size=3, strides=1, activation=tf.nn.relu)
        
        # Fully connected layers
        flatten = tf.layers.flatten(conv3)
        fc1 = tf.layers.dense(flatten, 512, activation=tf.nn.relu)
        self.predictions = tf.layers.dense(fc1, len(VALID_ACTIONS), activation=tf.nn.relu)
        
        # Get the predictions for the chosen actions only
        gather_indices = tf.range(batch_size) * tf.shape(self.predictions)[1] + self.actions_ph
        self.action_predictions = tf.gather(tf.reshape(self.predictions, [-1]), gather_indices)
        
        # Calculate the loss
        self.losses = tf.squared_difference(self.y_ph, self.action_predictions)
        self.loss = tf.reduce_mean(self.losses)
        
        # Optimizer parameters from original paper 
        self.optimizer = tf.train.RMSPropOptimizer(learning_rate = 0.00025,
                                                   decay = 0.99,
                                                   momentum = 0.0,
                                                   epsilon = 1e-6)
        self.train_op = self.optimizer.minimize(self.loss, global_step=tf.train.get_global_step())
        
        # Summaries for Tensorboard
        self.summaries = tf.summary.merge([
            tf.summary.scalar('loss', self.loss),
            tf.summary.histogram('loss_hist', self.losses),
            tf.summary.histogram('q_values_hist', self.predictions),
            tf.summary.scalar('max_q_value', tf.reduce_max(self.predictions))
        ])
    
    def predict(self, sess, s):
        '''
        Predicts action values
        
        Args:
            sess: Tensorflow session object
            s: Input images of shape [batch_size, 84, 84, 4]
            
        Returns:
            A tensor of shape [batch_size, NUM_VALID_ACTIONS] containing the estimated action values
        '''
        Q_vals = sess.run(self.predictions, feed_dict={self.X_ph: s})
        return Q_vals
    
    def update(self, sess, s, a, y):
        '''
        Updates the model towards the given target
        
        Args:
            sess: Tensorflow session object
            s: Input images of shape [batch_size, 84, 84, 4]
            a: Chosen actions of shape [batch_size]
            y: Targets of shape [batch_size]
        
        Returns:
            The calculated loss on the batch
        '''
        feed_dict = {self.X_ph: s, self.actions_ph: a, self.y_ph: y}
        summaries, global_step, _, loss = sess.run(
            [self.summaries, tf.train.get_global_step(), self.train_op, self.loss],
            feed_dict = feed_dict)
        if self.summary_writer:
            self.summary_writer.add_summary(summaries, global_step)
        return loss

In [6]:
# For testing...

tf.reset_default_graph()
global_step = tf.Variable(0, name='global_step', trainable=False)

test_model = Network(scope='test')
sp = StateProcessor()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    # Example Observation Batch
    observation = env.reset()
    processed_obs = sp.process(sess, observation)
    stacked_obs = np.stack([processed_obs]*4, axis=2)
    observations = np.array([stacked_obs] * 2) # Assume a batch_size of 2
    
    # Test prediction
    predictions = test_model.predict(sess, observations)
    
    
    # Test training step
    y = np.array([10.0, 10.0])
    a = np.array([1, 3])
    loss = test_model.update(sess, observations, a, y)
    
    #gather_indices is calculated to account for change in action indices when the array is stretched out
    gather_indices = sess.run(tf.range(2) * tf.shape(test_model.predict(sess, observations)[1])) + a
    print("Loss:", loss)
    print("Q Values for all actions: \n", predictions)
    print("Q Values stretched out: \n", sess.run(tf.reshape(predictions, [-1])))
    print('Actions based on which Q Values are be selected from stretched out array:', gather_indices)
    print('Selected Q Values based on actions are:',
          sess.run(tf.gather(tf.reshape(predictions, [-1]), gather_indices)))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Loss: 100.0
Q Values for all actions: 
 [[0.04632431 0.         0.0071764  0.        ]
 [0.04632431 0.         0.0071764  0.        ]]
Q Values stretched out: 
 [0.04632431 0.         0.0071764  0.         0.04632431 0.
 0.0071764  0.        ]
Actions based on which Q Values are be selected from stretched out array: [1 7]
Selected Q Values based on actions are: [0. 0.]


In [7]:
class update_target_network():
    '''
    Copy Q Network paramters to Target Network
    '''
    def __init__(self, model1, model2):
        '''
        Defines the copy graph
        Args:
            model1: model to copy paramters from (DQNetwork)
            model2: model to copy paramters to (TargetNetwork)
        '''
        
        #Get parameters of the Q Network
        from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,model1.scope)
        
        # Get paramters of the Target Network
        to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, model2.scope)
        
        self.update_ops = []
        
        for from_var, to_var in zip(from_vars, to_vars):
            self.update_ops.append(to_var.assign(from_var))
    
    def make(self, sess):
        '''
        Makes the copy
        Args:
            sess: Tensorflow session object
        '''
        sess.run(self.update_ops)

In [64]:
def make_epsilon_greedy_policy(model, nA):
    '''
    Creates an spilon greedy policy based on the Q model and epsilon
    
    Args:
        model: The model that returns the Q values for a given state
        nA: Number of actions
    
    Returns:
        
    '''
    def policy_fn(sess, observation, epsilon):
        A = np.ones(nA, dtype=float) * epsilon / nA
        #print(np.shape(np.expand_dims(observation,0)[0]))
        q_values = model.predict(sess, np.expand_dims(observation, 0))[0]
        best_action = np.argmax(q_values)
        A[best_action] += (1.0 - epsilon)
        return A
    return policy_fn

In [63]:
def deep_q_learning(sess,
                   env,
                   q_model,
                   target_model,
                   state_processor,
                   num_episodes,
                   experiment_dir,
                   replay_memory_size = 500000,
                   replay_memory_init_size = 50000,
                   update_target_model_every = 10000,
                   gamma = 0.99,
                   epsilon_start = 1.0,
                   epsilon_end = 0.1,
                   epsilon_decay_steps = 500000,
                   batch_size = 32,
                   record_video_every = 50):
    '''
    Q Learning alg for off policy TD learning using function approximation
    Finds the optimal greedy policy while following an epsilon greedy policy
    
    Args:
    Returns:
        An EpisodeStats object with 2 numpy arrays for episode_lengths and episode_rewards
    '''
    EpisodeStats = namedtuple("Stats", ["episode_lengths", "episode_rewards"])
    Transition = namedtuple("Transition", ["state", "action", "reward", "next_state", "done"])
    
    # The replay memory
    replay_memory = []
    
    # Make model copier object 
    copy_params = update_target_network(q_model, target_model)
    
    # Keep track of useful statistics
    stats = EpisodeStats(episode_lengths=np.zeros(num_episodes),
                        episode_rewards=np.zeros(num_episodes))
    
    # For system summaries, useful to check if current process looks healthy
    current_process = psutil.Process()
    
    # Create directories for checkpoints and summaries
    checkpoint_dir = os.path.join(experiment_dir, 'checkpoints')
    checkpoint_path = os.path.join(checkpoint_dir, 'model')
    monitor_path = os.path.join(experiment_dir, 'monitor')
    
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    if not os.path.exists(monitor_path):
        os.makedirs(monitor_path)
        
    saver = tf.train.Saver()
    # Load a previous checkpoint if we find one
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_checkpoint:
        print('Loading model checkpoint {}...\n'.format(latest_checkpoint))
        saver.restore(sess, latest_checkpoint)
    
    # Get the current time step
    total_t = sess.run(tf.train.get_global_step())
    
    # The epsilon decay schedule
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)
    
    # The policy we are following
    policy = make_epsilon_greedy_policy(q_model, len(VALID_ACTIONS))
    
    # Populate the replay memory with initial experience
    print('Populating replay memory')
    state = env.reset()
    state = state_processor.process(sess, state)
    state = np.stack([state] * 4, axis=2)
    for i in range(replay_memory_init_size):
        #print(i)
        action_probs = policy(sess, state, epsilons[min(total_t, epsilon_decay_steps-1)])
        action = np.random.choice(np.arange(len(action_probs)), p= action_probs)
        next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
        next_state = state_processor.process(sess, next_state)
        next_state = np.append(state[:,:,1:], np.expand_dims(next_state, 2), axis=2)
        replay_memory.append(Transition(state, action, reward, next_state, done))
        if done:
            state = env.reset()
            state = state_processor.process(sess, state)
            state = np.stack([state] * 4, axis=2)
        else:
            state = next_state

    # Record videos
    # Add env Monitor wrapper
    env = Monitor(env, directory=monitor_path, video_callable=lambda count: count % record_video_every
                  == 0, resume= True)
        
    for i_episode in range(num_episodes):
            
        # Save the current checkpoint
        saver.save(tf.get_default_session(), checkpoint_path)
            
        # Reset the environment
        state = env.reset()
        state = state_processor.process(sess, state)
        state = np.stack([state] * 4, axis=2)
        loss = None
            
        # One step in the environment
        #itertools.counts() iterates infinitely, until break 
        for t in itertools.count(): 
               
            # Compute epsilon for this time step
            epsilon = epsilons[min(total_t, epsilon_decay_steps-1)]
                
            # Update target network
            if total_t % update_target_model_every == 0:
                copy_params.make(sess)
                print('\nTarget Network Updated')
                
            # Print out which step we r on, useful for debugging
            print('\rStep {} ({}) @ Episode {}/{}, loss: {}'.format(
                t, total_t, i_episode+1, num_episodes, loss), end="")
            sys.stdout.flush()
                
            #Take a step
            action_probs = policy(sess, state, epsilons[min(total_t, epsilon_decay_steps-1)])
            action = np.random.choice(np.arange(len(action_probs)), p= action_probs)
            next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
            next_state = state_processor.process(sess, next_state)
            next_state = np.append(state[:,:,1:], np.expand_dims(next_state, 2), axis=2)
                
            # If our reply memory is full, pop the first element
            if len(replay_memory) == replay_memory_size:
                replay_memory.pop(0)
                
            # Save transition to replay memory
            replay_memory.append(Transition(state, action, reward, next_state, done))
                
            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] += t
                
            # Sample a minibatch from the replay memory
            samples = random.sample(replay_memory, batch_size)
            states_batch, actions_batch, rewards_batch, next_states_batch, dones_batch = map(np.array, 
                                                                                                zip(*samples))
                
            # Calculate Q values and targets
            q_values_next = target_model.predict(sess, next_states_batch)
            targets_batch = rewards_batch + np.invert(dones_batch).astype(np.float32) * gamma * np.amax(q_values_next, axis=1)
                
            # Perform gradient descent update
            states_batch = np.array(states_batch)
            loss = q_model.update(sess, states_batch, actions_batch, targets_batch)
                
            if done:
                break
                
            state = next_state
            total_t += 1

        # Add summaries to tensorboard
        episode_summary = tf.Summary()
        episode_summary.value.add(simple_value=epsilon, tag="episode/epsilon")
        episode_summary.value.add(simple_value=stats.episode_rewards[i_episode], tag="episode/reward")
        episode_summary.value.add(simple_value=stats.episode_lengths[i_episode], tag="episode/length")
        episode_summary.value.add(simple_value=current_process.cpu_percent(), tag="system/cpu_usage_percent")
        episode_summary.value.add(simple_value=current_process.memory_percent(memtype="vms"), tag="system/v_memeory_usage_percent")
        q_model.summary_writer.add_summary(episode_summary, i_episode)
        q_model.summary_writer.flush()
        
        yield total_t, EpisodeStats(
            episode_lengths=stats.episode_lengths[:i_episode+1],
            episode_rewards=stats.episode_rewards[:i_episode+1])
        
    return stats

In [67]:
tf.reset_default_graph()

# Where we save our checkpoints and graphs
experiment_dir = os.path.abspath('./experiments/{}'.format(env.spec.id))

# Create a global step variable
global_step = tf.Variable(0, name='global_step', trainable=False)

# Create Models
q_model = Network(scope='q_model', summaries_dir=experiment_dir)
target_model = Network(scope='target_model')

# State Processor
state_processor = StateProcessor()

# Run it!
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for t, stats in deep_q_learning(sess,
                                   env,
                                   q_model=q_model,
                                   target_model=target_model,
                                   state_processor=state_processor,
                                   num_episodes = 10000,
                                   experiment_dir = experiment_dir,
                                   replay_memory_size = 500000,
                                   replay_memory_init_size = 50000,
                                   update_target_model_every = 10000,
                                   gamma = 0.99,
                                   epsilon_start = 1.0,
                                   epsilon_end = 0.1,
                                   epsilon_decay_steps = 500000,
                                   batch_size = 32,
                                   record_video_every = 50):
        print('\nEpisode Reward: {}'.format(stats.episode_rewards[-1]))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Populating replay memory

Target Network Updated
Step 228 (228) @ Episode 1/10000, loss: 4.7760153393028304e-05
Episode Reward: 1.0
Step 274 (502) @ Episode 2/10000, loss: 2.5199138690368272e-05
Episode Reward: 2.0
Step 234 (736) @ Episode 3/10000, loss: 4.92715735163074e-0555
Episode Reward: 2.0
Step 171 (907) @ Episode 4/10000, loss: 0.00025852100225165486
Episode Reward: 0.0
Step 281 (1188) @ Episode 5/10000, loss: 0.00030659703770652413
Episode Reward: 2.0
Step 284 (1472) @ Episode 6/10000, loss: 8.309585973620415e-054
Episode Reward: 2.0
Step 232 (1704) @ Episode 7/10000, loss: 1.1474141501821578e-05
Episode Reward: 1.0
Step 203 (1907) @ Episode 8/10000, loss: 2.3410702851833776e-05
Episode Reward: 1.0
Step 185 (2092) @ Episode 9/10000, loss: 1.382572554575745e-055
Episode Reward: 0.0
Step 381 (2473) @ Episode 10/10000, loss: 2.7261648938292637e-05
Episode Reward: 3.0
Step 249 (2722) @ Episode 11/10000, loss: 2.1620740881189704e-05
Episode Reward: 1.0
Step 176 (2898) @ Episode 12/

OSError: [Errno 12] Cannot allocate memory