# Modular Abstraction Transfer Suite

### A Deep Mind Based Experimental Platform For Research In Abstraction And Generalization

We set out to assemble a team of Algorithms to fight the evil of an unseen game. Unfortunately, this task became nearly insurmountable. The first iterations of the code were based on implementations using a deque for replay memory and a single-instance training loop. These systems did show promise in early experiments, but they were so inefficient that it would likely have taken months of training to reach a respectable level, let alone expert-level gameplay.

The solution, it seems, is multi-part, so we will spread the explanations out here.

|
Great References:

https://towardsdatascience.com/tutorial-double-deep-q-learning-with-dueling-network-architectures-4c1b3fb7f756

https://keras.io/examples/rl/deep_q_network_breakout/

https://stackoverflow.com/questions/15455048/releasing-memory-in-python


#### Interactive Playground (run after notebook)

In [1]:
# # # Run From Top (Hint: It must be loaded commented first)
# myPlayground()

#### Selector Menu

In [2]:
##   Game Selector
# Notebook-wide settings read by the cells below. The trailing `#@param`
# annotations appear to be Colab form-field markers -- keep them on the same
# line as the value they describe.
ENVIRONMENT_NAME = 'BreakoutNoFrameskip-v4' #@param ['Atlantis-v0', 'DemonAttack-v0', 'Phoenix-v0', 'Riverraid-v0', 'Solaris-v0', 'Asterix-v0', 'Breakout-v0', 'Boxing-v0', 'Pong-v0', 'BattleZone-v0', 'SpaceInvaders-v0', 'BeamRider-v0','AtlantisNoFrameskip-v4', 'DemonAttackNoFrameskip-v4', 'PhoenixNoFrameskip-v4', 'RiverraidNoFrameskip-v4', 'SolarisNoFrameskip-v4', 'AsterixNoFrameskip-v4', 'BreakoutNoFrameskip-v4', 'BoxingNoFrameskip-v4', 'PongNoFrameskip-v4', 'BattleZoneNoFrameskip-v4', 'SpaceInvadersNoFrameskip-v4', 'BeamRiderNoFrameskip-v4']

# Render gameplay in cell or viewer
RENDER = False #@param {type:"boolean"}

# Slow down game play and discontinue training for observation
OBSERVATION_MODE = False #@param {type:"boolean"}

# Display Text Output to Observe Activations and Q Updates
# (read by DQNAgent.train to print Q-values around each update)
DIAGNOSTIC_MODE = False #@param {type:"boolean"}

## Chippie the Progress Bot Settings       (Name courtesy of: Kylie Locker)
# NOTE(review): the name is misspelled ("PROGESS") but DQNAgent.train reads it
# under exactly this spelling -- rename both places together or not at all.
CHIPPIE_PROGESS_REPORTS = True #@param {type:"boolean"}
# Row for progress bot stacking ( 1, 8, 12 )
MAX_CHIPPIES_ROW = 8 #@param {type:"integer"}

# Provide GPU information
USE_GPU_SUPPORT = True #@param {type:"boolean"}


## Data Settings
# Import and Unzip Hosted Dataset
# Each IMPORT_* flag gates one download in the setup cell; the download is
# also skipped when the target directory already exists.
CREATE_MISSING_DIRECTORIES = True #@param {type:"boolean"}
IMPORT_MAIN_DATA = True #@param {type:"boolean"}
IMPORT_SENTIMENT_DATA = True #@param {type:"boolean"}
IMPORT_MODEL_DATA = True #@param {type:"boolean"}


## Agent Settings
# Parameters
# The agent_* values are copied onto the DQNAgent instance in its constructor.
n_episodes = 50000 #@param {type:"integer"}
BATCH_SIZE =  32#@param {type:"integer"}
action_steps =  4#@param {type:"integer"}
skip_start = 1 #@param {type:"integer"}
agent_gamma = 0.985 #@param {type:"slider", min:0, max:1, step:0.0001}
agent_epsilon = 1.0 #@param {type:"slider", min:0, max:1, step:0.0001}
agent_epsilon_decay = 0.9999 #@param {type:"slider", min:0.5, max:1.000, step:0.0001}
agent_epsilon_min = 0.1 #@param {type:"slider", min:0, max:1.000, step:0.0001}
agent_learning_rate = 0.00025 #@param {type:"number", min:0.0000, max:0.0200, step:0.00001}
# lr=0.00042 # lr=0.00125 #lr=0.00025
# agent_learning_rate = 0.00025 # works??
## Buffer Settings
STARTING_MEMORY_SIZE = 75000#@param {type:"integer"}
MAX_MEMORY_LENGTH = 100000 #@param {type:"integer"}
# Stored on the agent as `update_rate` (steps between target-network updates).
Q_UPDATE_FREQUENCY = 10000 #@param {type:"integer"}

# Pre Buffer Guidance
# (height, width) of the network input; DQNAgent builds its input layers with
# shape WINDOW_SIZE + (3,).
WINDOW_SIZE = (84, 84)

# Sub-Model Settings
# The *_NAME values are used both as Keras model names and as saved-model
# directory names under the model directory by DQNAgent's transfer builders.
SLICE_ONE_NAME = "s1_MAT_ImageClassifier_v5" #@param {type:"string"}
SLICE_ONE_TRAINABLE = True #@param {type:"boolean"}
SLICE_ONE_CHECKPOINT = True #@param {type:"boolean"}

SLICE_TWO_NAME = "s2_MAT_ImageClassifier_v5" #@param {type:"string"}
SLICE_TWO_TRAINABLE = True #@param {type:"boolean"}
SLICE_TWO_CHECKPOINT = True #@param {type:"boolean"}

SLICE_THREE_NAME = "s3_MAT_ImageClassifier_v2" #@param {type:"string"}
SLICE_THREE_TRAINABLE = True #@param {type:"boolean"}
SLICE_THREE_CHECKPOINT = True #@param {type:"boolean"}

FULL_MODEL_NAME = "s2_MAT_Agent_Testing_v2" #@param {type:"string"}
FULL_MODEL_TRAINABLE = True #@param {type:"boolean"}
FULL_MODEL_CHECKPOINT = True #@param {type:"boolean"}

CHECKPOINT_FREQUENCY =  1000#@param {type:"integer"}

SAVE_AGENT = True #@param {type:"boolean"}

## Game selector feedback
print("Selected Game: " + ENVIRONMENT_NAME)


Selected Game: BreakoutNoFrameskip-v4


#### Setup Model Output Directory

In [3]:
%%capture

import os

try:
  import google.colab
  !pip install baselines
  COLAB = True
  
except:
  !pip install gdown
  COLAB = False

model_dir = 'ISAR_Model_Data'
data_dir = 'ISAR_Main_Classification'
transfer_dir = 'ISAR_Sentiment_Transfer'


if IMPORT_MAIN_DATA :
    if not os.path.exists(data_dir): ## Make it if it doesn't exist
      !gdown https://drive.google.com/uc?id=1P7o1x4ZpPbd16VQDwaMzllbN-tlfqqIH
      !unzip ISAR_Main_Classification.zip
      !rm ISAR_Main_Classification.zip
if IMPORT_SENTIMENT_DATA:
    if not os.path.exists(transfer_dir): ## Make it if it doesn't exist
      !gdown https://drive.google.com/uc?id=1UDUNnw04q5cvms5ibM6pNn-wXtJphXxZ
      !unzip ISAR_Sentiment_Transfer.zip
      !rm ISAR_Sentiment_Transfer.zip
if IMPORT_MODEL_DATA:
    if not os.path.exists(model_dir): ## Make it if it doesn't exist
      !gdown https://drive.google.com/uc?id=1DIc_J6XyKzNDSMjSbGkt3vzQep4YUEYU
      !unzip ISAR_Model_Data.zip
      !rm ISAR_Model_Data.zip
      

In [4]:
# Ensure the three working directories exist, reporting for each one whether
# it was found or had to be created (main data, transfer data, model).
if CREATE_MISSING_DIRECTORIES:
  _directory_table = (
      (data_dir, "Creating Main Data Directory", "Main Data Directory Found"),
      (transfer_dir, "Creating Transfer Data Directory", "Transfer Data Directory Found"),
      (model_dir, "Creating Model Directory", "Model Directory Found"),
  )
  for _path, _creating_msg, _found_msg in _directory_table:
    if os.path.exists(_path):
      print(_found_msg)
    else:
      print(_creating_msg)
      os.makedirs(_path)
    

Main Data Directory Found
Transfer Data Directory Found
Model Directory Found


#### Setup Imports

In [5]:
## Begin by importing . . .  oh . . . everything!
# Import every dependency of the notebook in one go. If anything in the try
# body fails, the except branch installs the Colab dependencies and starts a
# virtual display.
# NOTE(review): the except branch does not re-run the imports, so names such
# as `tf`, `np` and `gym` may still be undefined after a first failed pass --
# re-running this cell after the installs looks necessary; confirm.
try:
  import math
  import random
  import numpy as np

  import glob
  import io
  import base64
  from time import sleep

  from collections import deque

  import gym
  import tensorflow as tf
  import tensorflow_hub as hub
  from tensorflow.keras.models import Sequential
  from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, Conv2D, MaxPooling2D, BatchNormalization
  from tensorflow.keras.optimizers import Nadam, Adam
  from tensorflow import keras

  import matplotlib
  import matplotlib.pyplot as plt
  %matplotlib inline

  from IPython.display import HTML
  from IPython import display as ipythondisplay
  from IPython.display import clear_output

# NOTE(review): a bare `except:` also catches errors unrelated to missing
# packages (including KeyboardInterrupt); `except ImportError:` is presumably
# what is meant -- confirm before narrowing.
except:
  # NOTE(review): `%%capture` is a cell magic and is normally only recognised
  # on the first line of a cell; verify that this line does what is intended.
  %%capture
  if COLAB:
    ## For colab we must install some dependancies
    !apt-get install -y xvfb x11-utils
    ## Next we will need to install a virtual display and correct Open AI installation
    !pip install gym[all]==0.17.3
    !pip install pyvirtualdisplay==0.2.* 
    !pip install PyOpenGL==3.1.* 
    !pip install PyOpenGL-accelerate==3.1.*
    !pip install pyglet
    # So let's setup the virtual display
    import pyvirtualdisplay
    # use False with Xvfb
    _display = pyvirtualdisplay.Display(visible=False, size=(1400, 900))
    _ = _display.start()
    # Now Check the Display
    !echo $DISPLAY

# Report the TensorFlow version and visible GPUs, then switch every GPU to
# on-demand memory growth.
if USE_GPU_SUPPORT:
  gpus = tf.config.experimental.list_physical_devices('GPU')
  print("TF version:", tf.__version__)
  print("Num GPUs Available: ", len(gpus))

  # Set Memory Growth
  if gpus:
    try:
      # Memory growth has to be configured identically on every GPU
      for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
        print(gpu)
      logical_gpus = tf.config.experimental.list_logical_devices('GPU')
      print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
      # Raised when the GPUs were already initialised before this point
      print(e)

TF version: 2.4.1
Num GPUs Available:  1
PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')
1 Physical GPUs, 1 Logical GPUs


#### Setup Frame PreProcessor

After managing to make batches acceptable to TensorFlow's .fit() method, we noticed a massive increase in training speed. The victory was short-lived, however, as memory usage skyrocketed beyond the limits of both Colaboratory and one local workstation (128GB). So, both the deque and the image pre-processors had to be re-examined.

In [6]:
### Custom Pre-Processor
# Down-samples each frame with frame[1:209:2, ::2] (a 210x160 frame becomes
# 104x80), optionally converts it to greyscale, and stacks the most recent
# frames on the channel axis to give one image that also encodes motion.
class TFramePreBuffer:
    """Frame pre-processor with an optional temporal (frame-stacking) buffer.

    Args:
        t_size: Number of greyscale frames stacked on the last axis for
            settings 1 and 2.
        setting: Processing mode.
            0 - down-sample only (colour channels are kept).
            1 - down-sample, greyscale, stack the last ``t_size`` frames.
            2 - as 1, but each stacked plane is the pixel-wise maximum of the
                two most recent greyscale frames (de-flicker).
        scaling: Output range. 0 maps pixels to [0, 1] (``x / 255``);
            1 maps them to roughly [-1, 1] (``(x - 128) / 128``).
    """

    def __init__(self, t_size=3, setting=0, scaling=1):
        self.un_flicker_memory = []
        self.temporal_memory = []
        self.temporal_size = t_size
        self.processor_setting = setting
        self.scale = scaling

    def _rescale(self, image):
        """Return ``image`` as float64 in the range selected by ``self.scale``."""
        # Convert to float BEFORE subtracting: ``uint8 - 128`` wraps around
        # (0 - 128 == 128), which used to map black pixels to +1.0.
        image = np.asarray(image, dtype=np.float64)
        if self.scale == 0:
            return image * (1 / 255.0)
        return (image - 128) * (1 / 128.0)

    def _deflicker(self, small):
        """Return the pixel-wise max of the two latest greyscale frames."""
        # Keep a rolling window of the last two raw frames. The first frame is
        # duplicated so the window is always full.
        if not self.un_flicker_memory:
            self.un_flicker_memory.append(small)
        self.un_flicker_memory.append(small)
        del self.un_flicker_memory[:-2]
        previous, current = self.un_flicker_memory
        return np.maximum(previous.mean(axis=2, keepdims=True),
                          current.mean(axis=2, keepdims=True))

    def _push(self, plane):
        """Append ``plane`` to the temporal buffer, dropping the oldest frame."""
        if len(self.temporal_memory) >= self.temporal_size:
            del self.temporal_memory[:1]
        # On the first call this fills every slot with the same frame.
        while len(self.temporal_memory) < self.temporal_size:
            self.temporal_memory.append(plane)

    def process_frame(self, frame):
        """Pre-process one frame and return the network-ready image.

        Args:
            frame: Array indexed as (rows, columns, channels).

        Returns:
            A float64 array: the down-sampled colour frame for setting 0, or
            the ``t_size`` most recent greyscale planes stacked on the last
            axis for settings 1 and 2. ``None`` when ``scaling`` or
            ``setting`` is not one of the supported values.
        """
        if self.scale not in (0, 1) or self.processor_setting not in (0, 1, 2):
            return None

        small = np.asarray(frame)[1:209:2, ::2]

        if self.processor_setting == 0:
            return self._rescale(small.astype(np.uint8))

        if self.processor_setting == 1:
            # Stored as uint8 to keep the buffered frames small.
            plane = small.mean(axis=2, keepdims=True).astype(np.uint8)
        else:
            plane = self._deflicker(small.astype(np.uint8))

        self._push(plane)
        # Stack every buffered plane, so t_size values other than 3 work too.
        return self._rescale(np.concatenate(self.temporal_memory, axis=-1))


#### Instantiate and call with
# tFrame.process_frame(frame)
# Exactly one of the three configurations below should be active; the other
# two are kept commented out as ready-made alternatives.

## Resize Only
# tFrame = TFramePreBuffer(t_size=3, setting=0, scaling=1)

## Resize Grayscale and Stack Temporally
# Active configuration: greyscale frames, three stacked, (x - 128) / 128 scaling.
tFrame = TFramePreBuffer(t_size=3, setting=1, scaling=1)

# ## Resize Grayscale De-Flicker and Stack Temporally
# tFrame = TFramePreBuffer(t_size=3, setting=2, scaling=1)



Implementing the image pre-processor buffer using numpy arrays saved a few extra operations, so I am confident this helped a lot.

#### Setup Progress Bot

In [7]:
# import gc
import sys

class ChippieProgressBot:
    """Console progress reporter ("Chippie") for the training loop.

    Prints a small face for score gains and Q-target updates, keeps a moving
    window of episode scores and a bounded log of losses, and prints an
    episode summary from :meth:`dead`.

    Args:
        window_size: Number of most recent episode scores kept for the
            running average.
        row_configuration: Number of faces printed before a line break.
        log_size: Maximum number of loss values kept in ``lossLog``.
    """

    def __init__(self, window_size=100, row_configuration=8, log_size=2000):
        self.rowCount = 0
        self.miniScore = 0
        self.movingAverage = []
        self.windowSize = window_size
        self.facelist1 = ["~{0_0}~","~{o_o}~","~{o_0}~","~{0_o}~"]
        self.row_configuration = row_configuration
        self.lossLog = []
        self.logSize = log_size
        # Quiet mode: no per-step faces, summary only every `lessesChippies` episodes.
        self.lessChippies = True
        self.lessesChippies = 250
        self.chippieCount = 0

    def _advance_row(self):
        """Count one printed face and break the line when the row is full."""
        self.rowCount += 1
        if self.rowCount >= self.row_configuration:
            print("\n")
            self.rowCount = 0

    def _loss_halves(self):
        """Return ``(leading, trailing)`` mean loss of the newer/older log half.

        Both values are NaN while fewer than three losses have been logged,
        so the summary can always be printed.
        """
        count = len(self.lossLog)
        if count <= 2:
            return float("nan"), float("nan")
        half = count // 2
        # Divide each half by its own length so odd-sized logs are not skewed.
        trailing = sum(self.lossLog[:half]) / half
        leading = sum(self.lossLog[half:]) / (count - half)
        return leading, trailing

    def training(self, score):
        """Print a face whenever ``score`` is ahead of the faces shown so far."""
        if self.lessChippies or not (self.miniScore < score):
            return
        self.miniScore += 1
        print(self.facelist1[int(self.miniScore % 4)] + str(int(score))+"   ", end='')
        self._advance_row()

    def q_update(self):
        """Print the "target network updated" face (verbose mode only)."""
        if self.lessChippies:
            return
        print("""  {-_-}    """, end='')
        # Use the instance's row width, consistent with training().
        self._advance_row()

    def dead(self, score, totalScore, episode, completion_target,  survived, experiance, memory, epsilon):
        """Record the end of an episode and print the episode summary.

        Args:
            score: Score of the episode that just ended.
            totalScore: Sum of all episode scores so far.
            episode: Zero-based episode index.
            completion_target: Total number of episodes planned.
            survived: Survival time of the episode.
            experiance: Total number of steps taken so far.
            memory: Current replay-memory size.
            epsilon: Current exploration rate.

        In quiet mode the summary is printed only every ``lessesChippies``
        episodes; in verbose mode it is printed after every episode.
        """
        if len(self.movingAverage) >= self.windowSize:
            del self.movingAverage[:1]
        self.movingAverage.append(score)

        self.rowCount = 0
        self.miniScore = 0

        verbose = not self.lessChippies
        if not verbose and (episode+1) % self.lessesChippies != 0:
            return

        leading, trailing = self._loss_halves()
        episode_line = ("Episode: {}/{}, Episode Score: {}, Avg Episode Score: {:.4}, Survival Time: {}"
                        .format(episode+1, completion_target, score, totalScore/(episode+1), survived))
        # "\\{" keeps the printed backslash without relying on an invalid escape.
        average_line = ("Your agent has a running average Score per 100 episodes of: \\{^,^}~"
                        + "{:.3}".format(sum(self.movingAverage)/len(self.movingAverage)))
        steps_line = ("Total Steps: {}, Memory Size: {}, Current Epsilon Value: {:.2}, Leading Loss: {:.4}/ Trailing Loss: {:.4}"
                      .format(experiance, memory, epsilon, leading, trailing))

        if verbose:
            print(""" `{x_X}~   """+"\n")
            print(episode_line+"\n")
            print(average_line+"\n")
            print(steps_line+"\n\n")
        else:
            print("\n"+episode_line)
            print(average_line)
            print(steps_line)

    def logLoss(self, loss):
        """Append ``loss`` to the log, discarding the oldest entry when full."""
        if len(self.lossLog) >= self.logSize:
            del self.lossLog[:1]
        self.lossLog.append(loss)

# Module-level progress reporter; DQNAgent.train logs each loss through it.
# Example call from a training loop: chippie.training(score)
chippie = ChippieProgressBot(window_size=100, row_configuration=MAX_CHIPPIES_ROW)

#### New Replay Memory

In [8]:
class ReplayMemory:
    """Fixed-capacity ring buffer of arbitrary Python objects.

    New items overwrite the oldest ones once ``max_size`` items are stored.

    Attributes:
        buffer: Object array holding the stored items.
        max_size: Capacity of the buffer.
        index: Position the next item will be written to.
        size: Number of items currently stored (at most ``max_size``).
    """

    def __init__(self, max_size):
        if max_size <= 0:
            raise ValueError("max_size must be a positive integer")
        # The builtin `object` replaces the `np.object` alias, which was
        # deprecated in NumPy 1.20 and removed in 1.24.
        self.buffer = np.empty(max_size, dtype=object)
        self.max_size = max_size
        self.index = 0
        self.size = 0

    def __len__(self):
        """Return the number of items currently stored."""
        return self.size

    def append(self, obj):
        """Store ``obj``, overwriting the oldest item when the buffer is full."""
        self.buffer[self.index] = obj
        self.size = min(self.size + 1, self.max_size)
        self.index = (self.index + 1) % self.max_size

    def sample(self, batch_size):
        """Return ``batch_size`` stored items drawn uniformly with replacement.

        Raises:
            ValueError: If the memory is empty.
        """
        if self.size == 0:
            raise ValueError("cannot sample from an empty ReplayMemory")
        indices = np.random.randint(self.size, size=batch_size)
        return self.buffer[indices]

#### Define the agent

In the search for optimization, we found a few pointers to use variance scaling in the layer declarations. I include the links for a deeper dive; however, essentially this means that the initial weights are drawn from a distribution whose spread is scaled to the size of the layer (here with a scale factor of 2.0), so they start out in a small, restricted range. This makes sense, since the Q-values we are hoping the network will yield need to start close to each other to promote further exploration.


Afterwards, careful examination of a much faster implementation that attempts to solve the traditional 4-frame 84x84 version was very helpful. So, we took the time to implement the network using GradientTape(), which made the code run about twice as fast.

In [9]:
class DQNAgent:
    """Deep Q-Network agent whose network is three stacked sub-models ("slices").

    Slice one is the convolutional front end, slice two the last convolution
    plus a dense layer, and slice three the linear Q-value head. Each builder
    below decides which slices are loaded from ``model_dir`` (transfer) and
    which are created fresh. The agent keeps an online model, a target model,
    and a replay memory of ``(state, action, reward, next_state, done)``
    tuples.
    """

    def __init__(self, state_size, action_size):
        """Create the replay memory, copy the hyper-parameters, build both networks.

        Args:
            state_size: Observation shape. Stored for reference only; the
                networks take their input shape from ``WINDOW_SIZE``.
            action_size: Number of discrete actions (width of the Q-value head).
        """
        self.state_size = state_size
        self.action_size = action_size

        # Legacy per-field memories, superseded by `replay_memory`; kept so
        # existing attribute access keeps working.
        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []
        self.next_state_memory = []
        self.done_memory = []
        # Capacity comes from the notebook setting instead of a literal.
        self.replay_memory = ReplayMemory(max_size=MAX_MEMORY_LENGTH)

        self.update_rate = Q_UPDATE_FREQUENCY # Number of steps until updating the target network

        self.gamma = agent_gamma # decay or discount rate

        self.epsilon = agent_epsilon # exploration rate
        self.epsilon_decay = agent_epsilon_decay # exploration decay
        self.epsilon_min = agent_epsilon_min # min exploration

        self.learning_rate = agent_learning_rate # SGD or Nadam rate param

        # Choose the architecture here. Alternatives: self._build_new_model,
        # self._build_1S_transfer_model, self._build_3S_transfer_model.
        build = self._build_2S_transfer_model
        self.model, self.sliceOne, self.sliceTwo, self.sliceThree = build()
        (self.target_model, self.target_sliceOne,
         self.target_sliceTwo, self.target_sliceThree) = build()

        self.target_model.set_weights(self.model.get_weights()) # create Q-target network

        self.model.summary()

    # ------------------------------------------------------------------
    # Shared building blocks for the four model builders
    # ------------------------------------------------------------------
    @staticmethod
    def _variance_scaling():
        """Return the kernel initializer used by every freshly built layer."""
        return tf.keras.initializers.VarianceScaling(scale=2.0)

    @staticmethod
    def _load_slice(name):
        """Load the saved sub-model ``name`` from ``model_dir`` and name it."""
        model = tf.keras.models.load_model(model_dir + '/' + name)
        model._name = name
        return model

    @staticmethod
    def _output_shape(model, fallback):
        """Return ``model``'s per-sample output shape, or ``fallback`` if unknown."""
        shape = getattr(model, 'output_shape', None)
        if (isinstance(shape, tuple) and len(shape) > 1
                and all(isinstance(dim, int) for dim in shape[1:])):
            return tuple(shape[1:])
        return fallback

    def _new_slice_one(self):
        """Build a fresh slice one: two strided convolutions over the input image."""
        inputs = tf.keras.Input(shape=WINDOW_SIZE + (3,))
        hidden = tf.keras.layers.Conv2D(32, kernel_size=(8, 8), strides=4, activation='relu', name='S1_Conv1',
                                        kernel_initializer=self._variance_scaling())(inputs)
        outputs = tf.keras.layers.Conv2D(64, kernel_size=(4, 4), strides=2, activation='relu', name='S1_Conv2',
                                         kernel_initializer=self._variance_scaling())(hidden)
        return tf.keras.Model(inputs=inputs, outputs=outputs)

    def _new_slice_two(self, input_shape):
        """Build a fresh slice two (conv -> flatten -> dense 512) for ``input_shape``."""
        inputs = tf.keras.Input(shape=input_shape)
        conv = tf.keras.layers.Conv2D(64, kernel_size=(3, 3), strides=1, activation='relu',
                                      kernel_initializer=self._variance_scaling())(inputs)
        flat = tf.keras.layers.Flatten(name='S2_Flat1')(conv)
        outputs = tf.keras.layers.Dense(512, activation='relu', name='s2_Dense_1',
                                        kernel_initializer=self._variance_scaling())(flat)
        model = tf.keras.Model(inputs=inputs, outputs=outputs)
        model._name = SLICE_TWO_NAME
        return model

    def _new_slice_three(self, input_shape):
        """Build a fresh slice three: the linear Q-value head."""
        inputs = tf.keras.Input(shape=input_shape)
        outputs = tf.keras.layers.Dense(self.action_size, activation='linear',
                                        kernel_initializer=self._variance_scaling())(inputs)
        model = tf.keras.Model(inputs=inputs, outputs=outputs)
        model._name = SLICE_THREE_NAME
        return model

    def _assemble(self, s1_model, s2_model, s3_model):
        """Chain the slices into the full model and (re)create optimizer and loss.

        Returns:
            ``(full_model, s1_model, s2_model, s3_model)``.
        """
        full_model_input = tf.keras.Input(shape=(WINDOW_SIZE + (3,)))
        q_values = s3_model(s2_model(s1_model(full_model_input)))

        full_model = tf.keras.Model(inputs=full_model_input, outputs=q_values)
        full_model._name = FULL_MODEL_NAME

        # Setting `trainable` on a model also sets it on every nested layer,
        # so apply the full-model flag first and the slice flags afterwards;
        # otherwise the full-model flag would undo any per-slice freeze.
        full_model.trainable = FULL_MODEL_TRAINABLE # ref param above
        s1_model.trainable = SLICE_ONE_TRAINABLE # ref param above
        s2_model.trainable = SLICE_TWO_TRAINABLE # ref param above
        s3_model.trainable = SLICE_THREE_TRAINABLE # ref param above

        # Using Nadam because its awesome
        self.optimizer = keras.optimizers.Nadam(learning_rate=self.learning_rate, clipnorm=1.0)
        # Using Huber loss for stability
        self.loss_function = keras.losses.Huber(reduction=tf.keras.losses.Reduction.SUM)

        return full_model, s1_model, s2_model, s3_model

    # ------------------------------------------------------------------
    # Model builders (each returns full_model, s1_model, s2_model, s3_model)
    # ------------------------------------------------------------------
    def _build_new_model(self): # private method
        """Build all three slices from scratch."""
        s1_model = self._new_slice_one()
        # Size slice two from slice one's real output instead of a fixed
        # (11, 8, 64), which only matches a 104x80 input.
        s2_model = self._new_slice_two(self._output_shape(s1_model, (11, 8, 64)))
        s3_model = self._new_slice_three(self._output_shape(s2_model, (512,)))
        return self._assemble(s1_model, s2_model, s3_model)

    def _build_1S_transfer_model(self): # private method
        """Load slice one; build slices two and three from scratch."""
        s1_model = self._load_slice(SLICE_ONE_NAME)
        s2_model = self._new_slice_two(self._output_shape(s1_model, (11, 8, 64)))
        s3_model = self._new_slice_three(self._output_shape(s2_model, (512,)))
        return self._assemble(s1_model, s2_model, s3_model)

    def _build_2S_transfer_model(self): # private method
        """Load slices one and two; build the Q-value head from scratch."""
        s1_model = self._load_slice(SLICE_ONE_NAME)
        s2_model = self._load_slice(SLICE_TWO_NAME)
        s3_model = self._new_slice_three(self._output_shape(s2_model, (512,)))
        return self._assemble(s1_model, s2_model, s3_model)

    def _build_3S_transfer_model(self): # private method
        """Load all three slices from ``model_dir``."""
        s1_model = self._load_slice(SLICE_ONE_NAME)
        s2_model = self._load_slice(SLICE_TWO_NAME)
        s3_model = self._load_slice(SLICE_THREE_NAME)
        return self._assemble(s1_model, s2_model, s3_model)

    # ------------------------------------------------------------------
    # Experience replay
    # ------------------------------------------------------------------
    def remember(self, state, action, reward, next_state, done):
        """Store one transition in the replay memory."""
        self.replay_memory.append((state, action, reward, next_state, done))

    def jit_sampler(self, batch_size):
        """Sample ``batch_size`` transitions and return them as five arrays.

        Returns:
            ``(state_batch, action_batch, reward_batch, next_state_batch,
            done_batch)``, one NumPy array per transition field.
        """
        batch = self.replay_memory.sample(batch_size)
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = [
                np.array([exp[fidx] for exp in batch])
                for fidx in range(5)]
        return state_batch, action_batch, reward_batch, next_state_batch, done_batch

    # ------------------------------------------------------------------
    # Learning and acting
    # ------------------------------------------------------------------
    def train(self, batch_size): # method that trains agent model
        """Run one gradient step on a sampled batch and decay epsilon."""
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.jit_sampler(batch_size)
        # Explicit float mask so the arithmetic with tensors below does not
        # depend on implicit bool conversion.
        done_batch = np.asarray(done_batch, dtype=np.float32)

        ## This is where all the magic happens
        qValueF = self.target_model.predict_on_batch(next_state_batch) # approximate future reward

        qValueUpdate = reward_batch + self.gamma * tf.reduce_max(qValueF, axis=1)

        # Terminal transitions get a fixed target of -1 instead of the bootstrap value.
        qValueUpdate = qValueUpdate * (1 - done_batch) - done_batch

        oneHotMask = tf.one_hot(action_batch, self.action_size)

        with tf.GradientTape() as tape:
            # Train the model on the states and updated Q-values
            OqValue = self.model(state_batch)

            # Apply the masks to the Q-values to get the Q-value for action taken
            OqValue_action = tf.reduce_sum(tf.multiply(OqValue, oneHotMask), axis=1)

            # Calculate loss between new Q-value and old Q-value
            loss = self.loss_function(qValueUpdate, OqValue_action)

        if CHIPPIE_PROGESS_REPORTS:
            chippie.logLoss(loss)

        if DIAGNOSTIC_MODE:
          print("Q-Values: ", qValueF[1])
          print("Q-Values Update: ", qValueUpdate[1])
          print("Masked Update:   ", oneHotMask[1])
          print("Old Activations: ", OqValue[1])

        # Backpropagation
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))

        if DIAGNOSTIC_MODE:
          newQ = self.model(state_batch, training=False)
          print("New Activations: ", newQ[1], "\n")

        # Decay exploration, never dropping below the configured floor.
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def act(self, state):
        """Return an action for ``state`` using an epsilon-greedy policy."""
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size) # Do something stupid! (i.e. take a random action)
        action_values = self.model(tf.expand_dims(tf.convert_to_tensor(state), 0), training=False)
        return tf.argmax(action_values[0]).numpy()

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def save(self, name):
        """Save the online model's weights to ``name``."""
        self.model.save_weights(name)

    def load(self, name):
        """Load the online model's weights from ``name``."""
        self.model.load_weights(name)



At the very end, in a last-ditch effort to get enough speed to succeed, the replay buffer was re-implemented using numpy and slicing notation. This change provided about ten times as much speed and allowed the first test to reach 2500 episodes before a timeout or crash!

#### The Wrappers
So here we will use some wrappers from the baselines package. This package broke the TensorFlow installation on my home server, so I avoided wrappers during the majority of this project; in the end, however, using the optimized implementations in the wrappers drove speed up by 1000% or so. So, if you are implementing your own version of this experiment, I cannot stress enough how important it is to study up on these wrappers from the start.

In [10]:
# Make Environment

from gym.wrappers.atari_preprocessing import AtariPreprocessing
# from gym.wrappers.frame_stack import FrameStack
from baselines.common.atari_wrappers import make_atari, EpisodicLifeEnv, WarpFrame, ScaledFloatFrame, ClipRewardEnv, FrameStack, FireResetEnv

# env = FrameStack(AtariPreprocessing(gym.make(ENVIRONMENT_NAME), frame_skip=4, 
#                                   screen_size=(84), 
#                                   terminal_on_life_loss=True, 
#                                   grayscale_obs=True),3)
# Build the Atari environment and stack the baselines wrappers on top of it.
# Each wrapper takes the previously wrapped env, so the order below is the
# order in which they are layered.
env = make_atari(ENVIRONMENT_NAME)
env = EpisodicLifeEnv(env)

# Only games that expose a FIRE action get the fire-on-reset wrapper.
if 'FIRE' in env.unwrapped.get_action_meanings():
    env = FireResetEnv(env)

env = WarpFrame(env)
env = ScaledFloatFrame(env)
env = ClipRewardEnv(env)
env = FrameStack(env, 3)

# Seed every random source the agent touches. The agent and the replay
# pre-fill loop draw random actions with Python's ``random`` module, so it
# has to be seeded alongside numpy and TensorFlow for runs to be repeatable.
env.seed(42)
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

# NOTE(review): state_size must match the wrapped observation shape; the last
# axis mirrors the FrameStack depth of 3 above. The 84x84 frame size is assumed
# to be WarpFrame's default -- confirm against the installed baselines version.
state_size = (84, 84, 3)
action_size = env.action_space.n

## Initialize Our Agent
agent = DQNAgent(state_size, action_size)

Model: "s2_MAT_Agent_Testing_v2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 84, 84, 3)]       0         
_________________________________________________________________
s1_MAT_ImageClassifier_v5 (F (None, 9, 9, 64)          39008     
_________________________________________________________________
s2_MAT_ImageClassifier_v5 (F (None, 512)               1643072   
_________________________________________________________________
s3_MAT_ImageClassifier_v2 (F (None, 4)                 2052      
Total params: 1,684,132
Trainable params: 1,684,132
Non-trainable params: 0
_________________________________________________________________


#### Define Training Loop Environment

Getting your reward function right is paramount. In this case we will clip rewards to +1 and -1, with -1 being awarded for reaching a terminal state.

In [11]:
def myPlayground():
    """Fill the replay memory with random play, then run the training loop.

    Phase 1 steps the environment with random actions until
    ``agent.replay_memory.size`` reaches ``STARTING_MEMORY_SIZE``.

    Phase 2 plays ``n_episodes`` episodes. On every step it optionally renders
    the game, copies the online weights into the target network every
    ``agent.update_rate`` steps, asks the agent for an action, stores the
    transition, and trains on a batch of ``BATCH_SIZE`` every ``action_steps``
    steps (skipped while ``OBSERVATION_MODE`` is on). Every
    ``CHECKPOINT_FREQUENCY`` episodes the enabled checkpoints are written
    under ``model_dir``.

    Rewards are stored as +1.0 / -1.0 / 0, and any transition that ends an
    episode is stored with a reward of -1.0.

    All configuration (``env``, ``agent``, ``chippie`` and the upper-case
    flags) is read from the notebook's global namespace. Returns ``None``.
    """
    all_rewards = 0
    total_time = 0

    # Display in Colab. The figure handle is discarded on purpose: unpacking it
    # into ``done`` (as before) overwrote the episode flag with a Figure.
    if COLAB and RENDER:
        _, ax = plt.subplots(1, 1)
        img = ax.imshow(env.render('rgb_array'))

    # Make some starting data: random play until the replay memory is primed.
    state = env.reset()
    while agent.replay_memory.size < STARTING_MEMORY_SIZE:
        action = random.randrange(agent.action_size)
        next_state, reward, done, _ = env.step(action)
        if reward > 0.0:  # Check and modify reward
            reward = 1.0
        if reward < 0.0:
            reward = -1.0
        if done:
            reward = -1.0
        agent.remember(state, action, reward, next_state, done)
        # Start a fresh episode after a terminal step, otherwise keep going.
        state = env.reset() if done else next_state
    print("Starting Frames Acquired")

    # Training Loop
    for e in range(n_episodes):  ## Go Eat Cherries
        done = False
        survived = 0
        game_score = 0

        state = env.reset()

        # Skip the start of each game. Keep the observation returned by the
        # last skipped frame so the agent's first action is chosen from the
        # frame the environment is actually showing, not the reset frame.
        for _ in range(skip_start):
            state, _, skip_done, _ = env.step(0)
            if skip_done:
                # The episode ended during the warm-up; start over rather than
                # stepping a finished environment.
                state = env.reset()
                break

        while not done:
            survived += 1
            total_time += 1

            # Display
            if RENDER:
                if COLAB:
                    img.set_data(env.render(mode='rgb_array'))
                    ax.axis('off')
                    ipythondisplay.display(plt.gcf())
                    ipythondisplay.clear_output(wait=True)
                else:
                    env.render()
                    if OBSERVATION_MODE:
                        sleep(0.02)

            # Update Target Network
            if total_time % agent.update_rate == 0:
                agent.update_target_model()
                if CHIPPIE_PROGESS_REPORTS:
                    chippie.q_update()

            # Transition Dynamics
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)

            # Sternly Validate Reward
            if reward > 0.0:
                reward = 1.0
                game_score += reward
            if reward < 0.0:
                reward = -1.0
            if done:
                reward = -1.0
                all_rewards += game_score
                # Report before storing, so the reported memory size excludes
                # the terminal transition.
                if CHIPPIE_PROGESS_REPORTS:
                    chippie.dead(score=game_score, totalScore=all_rewards, episode=e,
                                 completion_target=n_episodes, survived=survived,
                                 experiance=total_time, memory=agent.replay_memory.size,
                                 epsilon=agent.epsilon)
                agent.remember(state, action, reward, next_state, done)  # Store death in replay memory
                break

            agent.remember(state, action, reward, next_state, done)  # Store sequence in replay memory
            state = next_state

            # Train only once the memory is primed, never while just observing.
            if (agent.replay_memory.size > STARTING_MEMORY_SIZE
                    and not OBSERVATION_MODE
                    and total_time % action_steps == 0):
                agent.train(BATCH_SIZE)
                if CHIPPIE_PROGESS_REPORTS:
                    chippie.training(game_score)

        if e % CHECKPOINT_FREQUENCY == 0 and agent.replay_memory.size > STARTING_MEMORY_SIZE + 100:
            print("")
            checkpoint_root = model_dir + "/ISAR_Production_Models/" + ENVIRONMENT_NAME
            checkpoint_tag = '{:04d}'.format(int(e / CHECKPOINT_FREQUENCY))

            if SLICE_ONE_CHECKPOINT:
                saved_model_path = checkpoint_root + SLICE_ONE_NAME + checkpoint_tag
                tf.saved_model.save(agent.sliceOne, saved_model_path)

            if SLICE_TWO_CHECKPOINT:
                saved_model_path = checkpoint_root + SLICE_TWO_NAME + checkpoint_tag
                tf.saved_model.save(agent.sliceTwo, saved_model_path)

            if SLICE_THREE_CHECKPOINT:
                # This branch used to re-save slice two under slice two's name.
                # NOTE(review): assumes ``agent.sliceThree`` and
                # ``SLICE_THREE_NAME`` exist alongside their slice one/two
                # counterparts -- confirm against the agent and config cells.
                saved_model_path = checkpoint_root + SLICE_THREE_NAME + checkpoint_tag
                tf.saved_model.save(agent.sliceThree, saved_model_path)

            if FULL_MODEL_CHECKPOINT:
                saved_model_path = checkpoint_root + FULL_MODEL_NAME + checkpoint_tag
                tf.saved_model.save(agent.model, saved_model_path)

            if SAVE_AGENT:
                agent.save(checkpoint_root + FULL_MODEL_NAME + "agent_weights_" + checkpoint_tag + ".hdf5")
          


#### Interactive Playground (runs automatically)

As the agent finally blitzed through the code, we are getting close to the deadline, and what began as an awesome undertaking in abstraction might formally (and temporarily) become a pure comparison of how the transfer-learned weights from notebook 1 affect training speed. However, there is yet another problem: with only 9 days left to finish, and each test taking a full 12 hours to show results, the results come in and the model is still showing no progress. Luckily, one more day of debugging and preparation found a simple bug causing enormous amounts of pre-buffer data to be labelled terminal. After this fix, the agent has begun to train comparably to the only other working example we found (linked above). This implementation is still slower; however, that example uses the baselines wrapper provided by DeepMind, which contains significant speed and memory optimizations but, unfortunately, only appears to offer frame stacking at 4 frames. Since this would impair our transfer learning basis, this is currently the fastest implementation we can provide.

In [None]:
# #### Run From Bottom (Can be run with one click)
# Optional overrides, left disabled: uncomment to adjust the agent before the run.
# agent.epsilon = 0.999
# agent.learning_rate = 0.000125

## Gathering the starting data and printing the first feedback takes about
## 3 minutes in Colab. After that, expect limited chippieBot updates roughly
## every 5 minutes for the next 7-12 hours.
myPlayground()

Starting Frames Acquired

Episode: 250/50000, Episode Score: 0, Avg Episode Score: 0.224, Survival Time: 22
Your agent has a running average Score per 100 episodes of: \{^,^}~0.26
Total Steps: 7840, Memory Size: 82839, Current Epsilon Value: 0.83, Leading Loss: 0.08731/ Trailing Loss: 0.1067

Episode: 500/50000, Episode Score: 1.0, Avg Episode Score: 0.266, Survival Time: 50
Your agent has a running average Score per 100 episodes of: \{^,^}~0.25
Total Steps: 16389, Memory Size: 91388, Current Epsilon Value: 0.67, Leading Loss: 0.08261/ Trailing Loss: 0.08596

Episode: 750/50000, Episode Score: 0, Avg Episode Score: 0.2467, Survival Time: 22
Your agent has a running average Score per 100 episodes of: \{^,^}~0.25
Total Steps: 23820, Memory Size: 98819, Current Epsilon Value: 0.56, Leading Loss: 0.08357/ Trailing Loss: 0.08526

Episode: 1000/50000, Episode Score: 0, Avg Episode Score: 0.247, Survival Time: 22
Your agent has a running average Score per 100 episodes of: \{^,^}~0.22
Total St

#### Notes

When we are all done building, this last box will zip our progress for download so we can continue our work later.

In [None]:
# Set to True to have the zip cell in this section archive the model data.
ZIP_OUTPUT = False #@param {type:"boolean"}

In [None]:
%%capture
if ZIP_OUTPUT:
  zip = !zip -r ISAR_Model_Data_Output.zip ISAR_Model_Data

###### Reference Notes

In [None]:
# Executing (12h 4m 14s)

In [None]:
# import gc
# gc.collect()

In [None]:
# running reward: 0.38 at episode 293, frame count 10000
# running reward: 0.31 at episode 589, frame count 20000
# running reward: 0.19 at episode 905, frame count 30000
# running reward: 0.23 at episode 1214, frame count 40000
# running reward: 0.39 at episode 1512, frame count 50000
# running reward: 0.33 at episode 1802, frame count 60000
# running reward: 0.32 at episode 2087, frame count 70000
# running reward: 0.23 at episode 2390, frame count 80000
# running reward: 0.33 at episode 2680, frame count 90000
# running reward: 0.17 at episode 2977, frame count 100000
# running reward: 0.19 at episode 3274, frame count 110000
# running reward: 0.28 at episode 3558, frame count 120000
# running reward: 0.38 at episode 3839, frame count 130000
# running reward: 0.39 at episode 4110, frame count 140000
# running reward: 0.48 at episode 4354, frame count 150000
# running reward: 0.38 at episode 4600, frame count 160000
# running reward: 0.37 at episode 4858, frame count 170000
# running reward: 0.47 at episode 5099, frame count 180000
# running reward: 0.47 at episode 5315, frame count 190000
# running reward: 0.60 at episode 5529, frame count 200000
# running reward: 0.60 at episode 5744, frame count 210000
# running reward: 0.63 at episode 5956, frame count 220000
# running reward: 0.59 at episode 6181, frame count 230000
# running reward: 0.59 at episode 6407, frame count 240000
# running reward: 0.53 at episode 6625, frame count 250000
# running reward: 0.62 at episode 6842, frame count 260000
# running reward: 0.62 at episode 7057, frame count 270000
# running reward: 0.50 at episode 7285, frame count 280000
# running reward: 0.68 at episode 7505, frame count 290000
# running reward: 0.55 at episode 7726, frame count 300000
# running reward: 0.65 at episode 7930, frame count 310000
# running reward: 0.70 at episode 8136, frame count 320000
# running reward: 0.86 at episode 8321, frame count 330000
# running reward: 0.57 at episode 8535, frame count 340000
# running reward: 0.69 at episode 8732, frame count 350000
# running reward: 0.51 at episode 8953, frame count 360000
# running reward: 0.76 at episode 9144, frame count 370000
# running reward: 0.72 at episode 9357, frame count 380000
# running reward: 0.86 at episode 9550, frame count 390000
# running reward: 0.63 at episode 9750, frame count 400000
# running reward: 0.55 at episode 9989, frame count 410000
# running reward: 0.88 at episode 10175, frame count 420000
# running reward: 0.68 at episode 10371, frame count 430000
# running reward: 0.78 at episode 10569, frame count 440000
# running reward: 0.83 at episode 10736, frame count 450000
# running reward: 1.00 at episode 10900, frame count 460000
# running reward: 0.82 at episode 11090, frame count 470000
# running reward: 1.11 at episode 11237, frame count 480000
# running reward: 1.11 at episode 11393, frame count 490000
# running reward: 1.16 at episode 11543, frame count 500000
# running reward: 1.38 at episode 11686, frame count 510000
# running reward: 1.23 at episode 11833, frame count 520000
# running reward: 1.31 at episode 11976, frame count 530000
# running reward: 1.32 at episode 12122, frame count 540000
# running reward: 1.59 at episode 12246, frame count 550000
# running reward: 1.65 at episode 12371, frame count 560000
# running reward: 1.53 at episode 12500, frame count 570000
# running reward: 1.44 at episode 12643, frame count 580000
# running reward: 1.54 at episode 12776, frame count 590000
# running reward: 1.61 at episode 12900, frame count 600000
# running reward: 1.83 at episode 13012, frame count 610000
# running reward: 1.98 at episode 13123, frame count 620000
# running reward: 2.08 at episode 13230, frame count 630000
# running reward: 1.96 at episode 13336, frame count 640000
# running reward: 2.01 at episode 13449, frame count 650000
# running reward: 2.21 at episode 13548, frame count 660000
# running reward: 2.29 at episode 13648, frame count 670000
# running reward: 2.35 at episode 13744, frame count 680000
# running reward: 2.40 at episode 13837, frame count 690000
# running reward: 2.29 at episode 13934, frame count 700000
# running reward: 2.54 at episode 14026, frame count 710000
# running reward: 2.30 at episode 14121, frame count 720000
# running reward: 2.20 at episode 14218, frame count 730000
# running reward: 2.79 at episode 14301, frame count 740000
# running reward: 2.73 at episode 14387, frame count 750000
# running reward: 2.93 at episode 14465, frame count 760000
# running reward: 2.72 at episode 14553, frame count 770000
# running reward: 2.57 at episode 14640, frame count 780000
# running reward: 2.60 at episode 14719, frame count 790000
# running reward: 2.79 at episode 14797, frame count 800000
# running reward: 3.20 at episode 14865, frame count 810000
# running reward: 3.27 at episode 14938, frame count 820000
# running reward: 3.06 at episode 15006, frame count 830000
# running reward: 3.50 at episode 15073, frame count 840000
# running reward: 3.63 at episode 15139, frame count 850000
# running reward: 3.78 at episode 15203, frame count 860000
# running reward: 3.67 at episode 15270, frame count 870000
# running reward: 3.69 at episode 15333, frame count 880000
# running reward: 4.40 at episode 15384, frame count 890000
# running reward: 4.70 at episode 15439, frame count 900000
# running reward: 5.28 at episode 15482, frame count 910000
# running reward: 5.98 at episode 15525, frame count 920000
# running reward: 5.74 at episode 15575, frame count 930000
# running reward: 6.22 at episode 15612, frame count 940000
# running reward: 5.76 at episode 15662, frame count 950000
# running reward: 6.37 at episode 15695, frame count 960000
# running reward: 7.02 at episode 15736, frame count 970000
# running reward: 7.24 at episode 15777, frame count 980000
# running reward: 7.35 at episode 15816, frame count 990000
# running reward: 7.41 at episode 15852, frame count 1000000
# running reward: 8.06 at episode 15889, frame count 1010000
# running reward: 8.18 at episode 15928, frame count 1020000
# running reward: 8.00 at episode 15965, frame count 1030000
# running reward: 7.38 at episode 16008, frame count 1040000
# running reward: 7.24 at episode 16048, frame count 1050000
# running reward: 6.93 at episode 16091, frame count 1060000
# running reward: 7.73 at episode 16125, frame count 1070000
# running reward: 7.16 at episode 16172, frame count 1080000
# running reward: 7.08 at episode 16210, frame count 1090000
# running reward: 7.20 at episode 16246, frame count 1100000
# running reward: 7.89 at episode 16285, frame count 1110000
# running reward: 7.83 at episode 16321, frame count 1120000
# running reward: 7.19 at episode 16366, frame count 1130000
# running reward: 7.99 at episode 16399, frame count 1140000
# running reward: 7.90 at episode 16440, frame count 1150000
# running reward: 8.47 at episode 16474, frame count 1160000
# running reward: 8.40 at episode 16511, frame count 1170000
# running reward: 7.72 at episode 16555, frame count 1180000
# running reward: 8.15 at episode 16591, frame count 1190000
# running reward: 8.84 at episode 16628, frame count 1200000
# running reward: 8.93 at episode 16662, frame count 1210000
# running reward: 9.66 at episode 16695, frame count 1220000
# running reward: 9.07 at episode 16733, frame count 1230000
# running reward: 8.91 at episode 16769, frame count 1240000
# running reward: 8.84 at episode 16803, frame count 1250000
# running reward: 9.18 at episode 16838, frame count 1260000
# running reward: 8.79 at episode 16875, frame count 1270000
# running reward: 9.07 at episode 16908, frame count 1280000
# running reward: 9.27 at episode 16943, frame count 1290000
# running reward: 9.23 at episode 16979, frame count 1300000
# running reward: 8.59 at episode 17015, frame count 1310000
# running reward: 8.49 at episode 17055, frame count 1320000
# running reward: 8.39 at episode 17091, frame count 1330000
# running reward: 9.11 at episode 17128, frame count 1340000
# running reward: 9.36 at episode 17164, frame count 1350000
# running reward: 9.98 at episode 17198, frame count 1360000
# running reward: 9.91 at episode 17236, frame count 1370000
# running reward: 10.31 at episode 17272, frame count 1380000
# running reward: 10.09 at episode 17305, frame count 1390000
# running reward: 10.39 at episode 17337, frame count 1400000
# running reward: 9.91 at episode 17371, frame count 1410000
# running reward: 10.86 at episode 17402, frame count 1420000
# running reward: 10.22 at episode 17438, frame count 1430000
# running reward: 10.34 at episode 17474, frame count 1440000
# running reward: 10.38 at episode 17505, frame count 1450000
# running reward: 10.91 at episode 17538, frame count 1460000
# running reward: 10.25 at episode 17576, frame count 1470000
# running reward: 9.19 at episode 17613, frame count 1480000
# running reward: 9.26 at episode 17652, frame count 1490000
# running reward: 9.89 at episode 17685, frame count 1500000
# running reward: 9.43 at episode 17724, frame count 1510000
# running reward: 9.27 at episode 17758, frame count 1520000
# running reward: 9.21 at episode 17791, frame count 1530000
# running reward: 9.28 at episode 17831, frame count 1540000
# running reward: 9.60 at episode 17866, frame count 1550000
# running reward: 9.39 at episode 17902, frame count 1560000
# running reward: 10.39 at episode 17937, frame count 1570000
# running reward: 9.72 at episode 17975, frame count 1580000
# running reward: 10.11 at episode 18005, frame count 1590000
# running reward: 9.97 at episode 18040, frame count 1600000
# running reward: 10.07 at episode 18077, frame count 1610000
# running reward: 9.36 at episode 18114, frame count 1620000
# running reward: 9.33 at episode 18151, frame count 1630000
# running reward: 8.81 at episode 18191, frame count 1640000
# running reward: 7.91 at episode 18230, frame count 1650000
# running reward: 8.76 at episode 18264, frame count 1660000
# running reward: 9.51 at episode 18304, frame count 1670000
# running reward: 8.86 at episode 18346, frame count 1680000
# running reward: 8.52 at episode 18383, frame count 1690000
# running reward: 8.88 at episode 18416, frame count 1700000
# running reward: 9.83 at episode 18451, frame count 1710000
# running reward: 9.74 at episode 18487, frame count 1720000
# running reward: 10.33 at episode 18520, frame count 1730000
# running reward: 10.37 at episode 18551, frame count 1740000
# running reward: 10.67 at episode 18590, frame count 1750000
# running reward: 9.78 at episode 18624, frame count 1760000

