In [40]:
# Load the Drive helper and mount Google Drive at /content/drive so that the
# files used by the following cells are reachable from the notebook.
# NOTE: google.colab is only importable inside a Colab runtime.
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive') # Comment after mounting drive

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [41]:
# After executing the cell above, the Drive
# files will be present under "/content/drive/MyDrive".
# This cell lists the contents of the supporting files directory;
# the next cell changes into that directory.
!ls "/content/drive/MyDrive/ktd1g20_COMP6247/DQNDynamicMaze"

COMP6247Maze20212022.npy  DQNdynamic.ipynb  __pycache__  read_maze.py


In [42]:
cd "/content/drive/MyDrive/ktd1g20_COMP6247/DQNDynamicMaze" 

/content/drive/MyDrive/ktd1g20_COMP6247/DQNStaticMaze


In [43]:
# The libraries that are needed

from __future__ import print_function
import os, sys, time, datetime, json, random
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from tensorflow.keras.optimizers import SGD, Adam, RMSprop
from keras.layers.advanced_activations import PReLU 
from tensorflow.keras import initializers
import matplotlib.pyplot as plt

%matplotlib inline

In [44]:
# This imports the functions from the read_maze.py file.
from read_maze import *

In [45]:
# This will load the maze. load_maze comes from read_maze.py via the star import above.
# NOTE(review): presumably it reads COMP6247Maze20212022.npy from the working directory - confirm.
load_maze()

In [46]:
# Visualisation helper: for every tile (i, j) of the 10x10 top-left corner, store
# channel 0 of the centre cell ([1][1]) of the local 3x3 view returned for that tile.
example_array = np.zeros((10, 10), dtype=np.float64)
for i, j in np.ndindex(10, 10):
  example_array[i, j] = get_local_maze_information(i, j)[1][1][0]
    

In [47]:
def get_state(x,y): 
  """Build the observation vector for the tile at coordinates (x, y).

  The local 3x3 view is queried with int(x), int(y). The result is an array
  of shape (1, 20):
    * elements 0-8:   channel 0 of the 3x3 view (the spatial geometry), row by row
    * elements 9-17:  channel 1 of the 3x3 view (the fire geometry), row by row
    * elements 18-19: x and y exactly as they were passed in
  """
  local_view = np.asarray(get_local_maze_information(int(x), int(y)), dtype=float)
  geometry = local_view[:3, :3, 0].ravel().tolist()
  fire = local_view[:3, :3, 1].ravel().tolist()
  return np.array([geometry + fire + [x, y]])

In [48]:
def get_action_from_Qvalue(model, state):
  """Return the greedy action for `state` according to the Q-network `model`.

  `model.predict(state)` is evaluated and the index of the largest value in
  its first row is returned as a float.
  """
  q_values = model.predict(state)[0]
  best_index = np.argmax(q_values)
  return float(best_index)

In [49]:
def newBadState(state):
  """Register the tile described by `state` as a bad state if it is a new one.

  A tile counts as bad when the sum of its four cross elements (indices 1, 3,
  5 and 7 of the state vector) is at most 1. Returns True only when the tile
  was added to the global BadStates list by this call; returns False when it
  was already listed or does not qualify.
  """
  global BadStates
  row = state[0]
  position = [row[-2], row[-1]]

  # Already known: nothing new to report.
  if position in BadStates:
    return False

  cross_total = row[1] + row[3] + row[5] + row[7]
  if cross_total <= 1:
    BadStates.append(position)
    return True
  return False

In [50]:
# This is the function that enables the agent to move.

# Reward magnitudes used by act().
_REWARD_GOAL = 10000000        # reaching the target tile
_PENALTY_RESET = -1000000      # known bad intersection move, or stepping into a bad state
_PENALTY_INVALID = -1000000    # trying to walk into a wall or a fire
_STEP_COST = -100000           # every accepted step, including staying put
_NEW_TILE_BONUS = 95000        # added to the step cost when the tile was not visited before

# action code -> (x delta, y delta, index of the geometry flag, index of the fire flag)
# The flag indices address the flattened 3x3 views inside the state vector.
_MOVES = {
  0.0: (0, -1, 3, 12),   # left
  1.0: (-1, 0, 1, 10),   # up
  2.0: (0, 1, 5, 14),    # right
  3.0: (1, 0, 7, 16),    # down
}


def act(state, action):
  """Apply `action` to the agent described by `state`.

  Parameters:
    state:  observation as produced by get_state; the last two elements of
            state[0] are the agent's current x and y.
    action: 0 = left, 1 = up, 2 = right, 3 = down, 4 = stay put.

  Returns a tuple (new_state, reward, game_won, resetAgent):
    new_state:  observation after the move; for an invalid move the `state`
                object itself is returned and get_state is not called.
    reward:     immediate reward (see the constants above).
    game_won:   True when the target tile has been reached.
    resetAgent: True when the agent repeated a known bad intersection move or
                stepped onto a (known or newly detected) bad state.

  Reads the globals SetBadIntersectionMoves, BadStates, visited and target;
  BadStates may be extended through newBadState.
  """
  game_won = False
  resetAgent = False
  temp_action = float(action)
  start_x = state[0][-2]
  start_y = state[0][-1]

  # A move already known to be bad at this intersection is punished immediately.
  if [start_x, start_y, temp_action] in SetBadIntersectionMoves:
    return get_state(start_x, start_y), _PENALTY_RESET, game_won, True

  # Stay put: get_state still has to be run so that the fire gets updated.
  if temp_action == 4:
    return get_state(start_x, start_y), _STEP_COST, game_won, resetAgent

  move = _MOVES.get(temp_action)
  # The move is valid only when the geometry flag is 1 and the fire flag is 0.
  if move is None or state[0][move[2]] != 1 or state[0][move[3]] != 0:
    return state, _PENALTY_INVALID, game_won, resetAgent

  new_x = start_x + move[0]
  new_y = start_y + move[1]
  new_state = get_state(new_x, new_y)

  # 1) The winning tile has been reached.
  if new_x == target[0] and new_y == target[1]:
    return new_state, _REWARD_GOAL, True, resetAgent

  # 2) A previously discovered bad state, or a bad state discovered right now.
  if [new_x, new_y] in BadStates or newBadState(new_state) == True:
    return new_state, _PENALTY_RESET, game_won, True

  # 3) Ordinary step: cheaper when the tile has not been visited before.
  reward = _STEP_COST
  if [new_x, new_y] not in visited:
    reward += _NEW_TILE_BONUS
  return new_state, reward, game_won, resetAgent

In [51]:
def build_model(lr=0.001):
    """Create and compile the Q-network.

    Architecture: Dense(128) -> PReLU -> Dense(128) -> PReLU -> Dense(num_actions).
    The first layer accepts the state vector of size 20. Every Dense kernel is
    initialised uniformly in [-1, 1]. The model is compiled with Adam at
    learning rate `lr` and mean-squared-error loss.
    """
    def uniform_init():
        # A fresh initializer instance per layer, as in the layer-by-layer definition.
        return initializers.RandomUniform(minval=-1, maxval=1, seed=None)

    model = Sequential()
    model.add(Dense(128, kernel_initializer=uniform_init(), input_shape=(20,)))
    model.add(PReLU())
    model.add(Dense(128, kernel_initializer=uniform_init()))
    model.add(PReLU())
    model.add(Dense(num_actions, kernel_initializer=uniform_init()))
    model.compile(optimizer=Adam(learning_rate=lr), loss='mse')
    return model

In [52]:
def predict(model, state):
  """Return the first row of `model.predict(state)`, i.e. the network output for a single state."""
  network_output = model.predict(state)
  return network_output[0]

In [53]:
def get_data( data_size, memory, num_actions, updateModel, targetModel, discount):
  """Sample the experience buffer and build one training batch for the Q-network.

  Parameters:
    data_size:   desired number of samples; capped at len(memory).
    memory:      list of [state, action, reward, new_state, game_won] entries,
                 where every state has shape (1, state_size).
    num_actions: size of the network output.
    updateModel: network being trained; it selects the next action.
    targetModel: network that evaluates the Q-value of the selected next action.
    discount:    discount factor used for the expected reward of the next step.

  Returns (inputs, targets) with shapes (n, state_size) and (n, num_actions).
  Each target row is updateModel's current prediction for the state, with the
  entry of the action taken replaced by `reward` (when game_won) or by
  `reward + discount * Q_target(new_state, argmax_a Q_update(new_state, a))`.

  The predictions are computed with three batched predict() calls instead of
  three single-row calls per sampled transition.
  """
  state_size = memory[0][0].shape[1] # This is our flattened vector for the local "image" of the maze.
  mem_size = len(memory) # This is the running length for the memory.
  data_size = min(mem_size, data_size) # For the initial steps mem_size < data_size

  if data_size == 0:
    # Nothing requested: avoid calling predict() on an empty batch.
    return np.zeros((0, state_size)), np.zeros((0, num_actions))

  # Sample without replacement from the memory.
  picked = np.random.choice(range(mem_size), data_size, replace = False)
  batch = [memory[j] for j in picked]

  inputs = np.zeros((data_size, state_size))
  next_states = np.zeros((data_size, state_size))
  for i, entry in enumerate(batch):
    inputs[i] = entry[0]
    next_states[i] = entry[3]

  # Current answer of the network for every sampled state (copied so it can be edited).
  targets = np.array(updateModel.predict(inputs), dtype=float)

  # The update network picks the next action, the target network evaluates it (for stability).
  best_next_action = np.argmax(updateModel.predict(next_states), axis=1)
  next_q = np.asarray(targetModel.predict(next_states))[np.arange(data_size), best_next_action]

  for i, entry in enumerate(batch):
    action, reward, game_won = entry[1], entry[2], entry[4]
    if game_won:
      targets[i, int(action)] = reward
    else:
      targets[i, int(action)] = reward + discount * next_q[i]
  return inputs, targets

In [54]:
def checkIfIntersection(state):
  """Tell whether `state` describes an intersection and record it when it is new.

  A tile is an intersection when the sum of its four cross elements (indices
  1, 3, 5, 7 of the state vector) is at least 3 and the tile is not listed in
  BadStates. The first time such a tile is seen it is appended to
  listOfIntersections, and every direction whose cross element is 0 is added
  to SetBadIntersectionMoves as [x, y, action].

  Returns True for every intersection (new or already known), False otherwise.
  """
  global listOfIntersections
  global BadStates
  global SetBadIntersectionMoves

  row = state[0]
  here = [row[-2], row[-1]]
  open_sides = row[1] + row[3] + row[5] + row[7]

  if not (open_sides >= 3 and here not in BadStates):
    return False

  if here not in listOfIntersections:
    listOfIntersections.append(here)
    # (cross element index, action code): up, left, right, down
    for cross_index, action_code in ((1, 1), (3, 0), (5, 2), (7, 3)):
      if row[cross_index] == 0:
        SetBadIntersectionMoves.append([row[-2], row[-1], float(action_code)])
  return True

In [55]:
def isBadIntersection(x,y): 
  """Return True when at least three of the moves 0-3 from tile (x, y) are listed in SetBadIntersectionMoves."""
  bad_move_count = sum(
    [x, y, float(action_code)] in SetBadIntersectionMoves
    for action_code in range(4)
  )
  return bad_move_count >= 3

In [56]:
# The training function for our dqn implementation
def _fit_on_replay(data_size, num_actions, discount):
  """Sample the replay memory, fit updateModel on the batch and return the evaluated loss."""
  inputs, targets = get_data(data_size, memory, num_actions, updateModel, targetModel, discount)
  updateModel.fit(
    inputs,
    targets,
    epochs=10,
    batch_size=25,
    verbose=0,
  )
  return updateModel.evaluate(inputs, targets, verbose=0)


def _dump_json_files(objects_by_filename):
  """Write each object as JSON to the file named by its key, in dictionary order."""
  for filename, obj in objects_by_filename.items():
    with open(filename, "w") as fp:
      json.dump(obj, fp)


def qtrain( epsilon, n_epoch, max_memory, data_size, discount, num_actions, resumingTraining, TrainingFromStart, **opt):
  """Train updateModel with experience replay and a periodically synchronised targetModel.

  Parameters:
    epsilon:           probability of picking a random action instead of the greedy one.
    n_epoch:           maximum number of epochs; an epoch ends when the agent is reset or wins.
    max_memory:        maximum number of transitions kept in the replay memory.
    data_size:         number of transitions sampled for every network update.
    discount:          discount factor for delayed reward.
    num_actions:       number of available actions at each tile.
    resumingTraining:  when True, the first epoch starts from last_state_coordinates.
    TrainingFromStart: when True, the first epoch starts from tile (1.0, 1.0);
                       this takes precedence over resumingTraining.
    **opt:             accepted but not used.

  Works on the module-level state (visited, lastIntersection,
  SetBadIntersectionMoves, listOfIntersections, memory, BadStates,
  last_state_coordinates, updateModel, targetModel). Weights and bookkeeping
  lists are written to the working directory every 1000 steps and once more,
  under the "...winner" / "...Winner" names, when the target tile is reached.
  """
  global visited
  global lastIntersection
  global SetBadIntersectionMoves
  global listOfIntersections
  global memory
  global BadStates
  global last_state_coordinates
  global actions_dict
  global updateModel
  global targetModel

  bigReset = False # Set once an intersection turns out to be bad; switches the reset point
  start_time = datetime.datetime.now() # A counter for time

  # Auxiliary variables for the user interface
  random_move = ""
  model_update_counter = 0
  game_over = False

  for epoch in range(n_epoch):
    if game_over == True:
      break
    print("We are now in epoch #" , (epoch+1) , ".This means that the agent has been reset (epoch > 1) or started (epoch = 1)")
    loss = 0.0

    # listOfIntersections starts empty and entries are removed when an intersection
    # becomes bad, so it can be empty here; in that case fall back to lastIntersection
    # instead of raising IndexError on listOfIntersections[-1].
    if bigReset == True and listOfIntersections:
      print("We will BIG reset to the last valid new intersection we found: ", [listOfIntersections[-1][0],listOfIntersections[-1][1]])
      start_x = listOfIntersections[-1][0]
      start_y = listOfIntersections[-1][1]
    else:
      print("We will reset to the last valid intersection we crossed unless resumingTraining was True when calling the function qtrain: ", [lastIntersection[0],lastIntersection[1]])
      start_x = lastIntersection[0]
      start_y = lastIntersection[1]

    # Both flags only affect the first epoch of this call.
    if resumingTraining == True:
      resumingTraining = False
      start_x = last_state_coordinates[0]
      start_y = last_state_coordinates[1]

    if TrainingFromStart == True:
      TrainingFromStart = False
      start_x = 1.0
      start_y = 1.0

    game_over = False
    resetAgent = False

    # get initial state
    state = get_state(start_x, start_y)

    n_episodes = 0

    while not game_over and resetAgent == False:
      prev_state = state
      last_state_coordinates = [state[0][-2],state[0][-1]]

      # Get next action (epsilon-greedy)
      if np.random.rand() < epsilon:
        action = float(random.choice(actions_list))
        random_move = "random"
      else:
        action = float(np.argmax(predict(updateModel, prev_state)))
        random_move = "deterministic"

      # Remember the last intersection crossed; the start tile [1,1] is treated like one.
      if checkIfIntersection(state) == True or [state[0][-2], state[0][-1]] == [1,1]:
        lastIntersection = [state[0][-2], state[0][-1], float(action)]
        print("We reached a new intersection: [x,y,action] = ", lastIntersection)

      # Apply action, get reward and new envstate
      new_state, reward, game_won, resetAgent = act(state, action)

      # Keep track of the trajectory:
      if [new_state[0][-2], new_state[0][-1]] not in visited:
        visited.append([new_state[0][-2], new_state[0][-1]])

      # Ending up in a bad state marks the move taken at the last intersection as bad.
      if [new_state[0][-2], new_state[0][-1]] in BadStates:
        SetBadIntersectionMoves.append(lastIntersection)

      # An intersection with enough bad moves becomes a bad state itself.
      if isBadIntersection(lastIntersection[0], lastIntersection[1]) == True:
        if [lastIntersection[0], lastIntersection[1]] not in BadStates:
          BadStates.append([lastIntersection[0], lastIntersection[1]])
        if [lastIntersection[0], lastIntersection[1]] in listOfIntersections:
          listOfIntersections.remove([lastIntersection[0], lastIntersection[1]])
        bigReset = True

      # Checkpoint
      print([prev_state[0][-2], prev_state[0][-1]], " -> ", random_move , " ", actions_dict[int(action)], " -> ", [new_state[0][-2], new_state[0][-1]])

      if game_won == True:
        game_over = True
        print("Game over. We reached the (199,199) tile.")

      n_episodes += 1

      # Store episode for experience
      episode = [state, action, reward, new_state, game_won]
      if game_won == True:
        print("The last episode was: ", state,"(state), ", action, "(action), ", reward, "(reward), ", new_state, "(new_state), ", game_won, "(game_won).")
      memory.append(episode)
      if len(memory) > max_memory:
        del memory[0]

      # Train neural network model
      loss = _fit_on_replay(data_size, num_actions, discount)
      state = new_state

      if game_won == True:
        # Save the results of the winning run and stop.
        updateModel.save_weights('updateModelDQNwinner.h5')
        targetModel.save_weights('targetModelDQNwinner.h5')
        _dump_json_files({
          "VisitedTilesWinner": visited,
          "SetBadIntersectionMovesWinner": SetBadIntersectionMoves,
          "BadStatesWinner": BadStates,
          "listOfIntersectionsWinner": listOfIntersections,
          "lastIntersectionWinner": lastIntersection,
        })
        break

      last_state_coordinates = [new_state[0][-2],new_state[0][-1]]

      model_update_counter += 1

      # Update the network
      if (model_update_counter % 50) == 0:
        print("We reached ", model_update_counter, " tries, let's update the target network and continue")
        targetModel.set_weights(updateModel.get_weights()) # use that to copy the update's model weights to the target model

      # Save the results regularly
      if (model_update_counter % 1000) == 0:
        print("We reached ", model_update_counter, " tries, let's save the results and continue")
        updateModel.save_weights('updateModelDQN.h5')
        targetModel.save_weights('targetModelDQN.h5')
        _dump_json_files({
          "VisitedTiles": visited,
          "SetBadIntersectionMoves": SetBadIntersectionMoves,
          "BadStates": BadStates,
          "listOfIntersections": listOfIntersections,
          "lastIntersection": lastIntersection,
          "last_state_coordinates": last_state_coordinates,
        })

    dt = datetime.datetime.now() - start_time
    t = format_time(dt.total_seconds())
    template = "Epoch: {:03d}/{:d} | Episodes: {:d} | time: {}"
    print(template.format(epoch, n_epoch-1, n_episodes, t))




def format_time(seconds):
    """Render a duration given in seconds as a short readable string.

    Below 400 s the value is shown in seconds (one decimal), below 4000 s in
    minutes (two decimals), otherwise in hours (two decimals).
    """
    if seconds < 400:
        return "{:.1f} seconds".format(float(seconds))
    if seconds < 4000:
        return "{:.2f} minutes".format(seconds / 60.0)
    return "{:.2f} hours".format(seconds / 3600.0)

In [57]:
# Action encoding used throughout the notebook: the integer code of an action
# is also the index of its Q-value in the network output.
LEFT, UP, RIGHT, DOWN, REMAIN = range(5)

# Readable label for every action code (used when printing the trajectory).
actions_dict = dict(zip(
    (LEFT, UP, RIGHT, DOWN, REMAIN),
    ('left', 'up', 'right', 'down', 'remain'),
))

num_actions = len(actions_dict)
actions_list = [LEFT, UP, RIGHT, DOWN, REMAIN]

# Exploration factor: probability of picking a random move during training.
epsilon = 0.1

In [58]:
# --- Hyper-parameters -------------------------------------------------------
target = (199, 199)  # The tile the agent has to reach; set it as you wish
max_memory = 1000    # The maximum amount of tuples in the experience buffer
discount = 0.999     # The discount in the Bellman equation. Alter it accordingly

# --- Bookkeeping shared by act() and qtrain() -------------------------------
memory = []                                   # experience buffer of the moves made so far
SetBadIntersectionMoves = [[1.0, 1.0, 1.0]]   # [x, y, action] triples; starts with one entry for tile (1, 1)
BadStates = []
visited = []
listOfIntersections = []
lastIntersection = [1.0, 1.0]
last_state_coordinates = []

# --- Networks ---------------------------------------------------------------
# Both networks are created here; previously saved weights can optionally be
# loaded afterwards with load_weights.
updateModel = build_model(lr = 0.001)
targetModel = build_model(lr = 0.001)

# It is better if both models are the same from the start.
targetModel.set_weights(updateModel.get_weights())



# Comment or uncomment this section depending on whether you read previous files from your hard drive or want to start fresh
'''
with open("SetBadIntersectionMoves", "r") as fp:
  SetBadIntersectionMoves = json.load(fp)
with open("BadStates", "r") as fp:
  BadStates = json.load(fp)
with open("listOfIntersections", "r") as fp:
  listOfIntersections = json.load(fp)
with open("lastIntersection", "r") as fp:
  lastIntersection = json.load(fp)
'''

'''
with open("VisitedTiles", "r") as fp:
  visited = json.load(fp)
with open("last_state_coordinates", "r") as fp:
  last_state_coordinates = json.load(fp)

updateModel.load_weights("updateModelDQN.h5")
targetModel.load_weights("targetModelDQN.h5")
'''

'\nwith open("VisitedTiles", "r") as fp:\n  visited = json.load(fp)\nwith open("last_state_coordinates", "r") as fp:\n  last_state_coordinates = json.load(fp)\n\nupdateModel.load_weights("updateModelDQN.h5")\ntargetModel.load_weights("targetModelDQN.h5")\n'

In [59]:
# Positional arguments, in order: epsilon, n_epoch, max_memory, data_size, discount, num_actions.
# TrainingFromStart = True makes the first epoch start from tile (1.0, 1.0).

qtrain(epsilon, 100000, 1000 , 500, discount, num_actions, resumingTraining = False,   TrainingFromStart = True)

We are now in epoch # 1 .This means that the agent has been reset (epoch > 1) or started (epoch = 1)
We will reset to the last valid intersection we crossed unless resumingTraining was True when calling the function qtrain:  [1.0, 1.0]
We reached a new intersection: [x,y,action] =  [1.0, 1.0, 1.0]
[1.0, 1.0]  ->  deterministic   up  ->  [1.0, 1.0]
Epoch: 000/99999 | Episodes: 1 | time: 1.0 seconds
We are now in epoch # 2 .This means that the agent has been reset (epoch > 1) or started (epoch = 1)
We will reset to the last valid intersection we crossed unless resumingTraining was True when calling the function qtrain:  [1.0, 1.0]
We reached a new intersection: [x,y,action] =  [1.0, 1.0, 4.0]
[1.0, 1.0]  ->  deterministic   remain  ->  [1.0, 1.0]
We reached a new intersection: [x,y,action] =  [1.0, 1.0, 4.0]
[1.0, 1.0]  ->  deterministic   remain  ->  [1.0, 1.0]
We reached a new intersection: [x,y,action] =  [1.0, 1.0, 0.0]
[1.0, 1.0]  ->  deterministic   left  ->  [1.0, 1.0]
We reached 

KeyboardInterrupt: ignored

In [None]:
# After the initial training phase has reached the (199,199) tile, retrain until sufficient results have been produced.
# Every round starts again from the state saved by the winning run.
# NOTE: the weight files are written by qtrain() as '...DQNwinner.h5' (lower-case "w");
# the names below must match exactly because the file system is case-sensitive.

for i in range(1000):
  target = (199, 199)
  epsilon = 0.1
  discount = 0.999

  # Per-round bookkeeping that is not restored from disk.
  listOfIntersections = list()
  lastIntersection = [1.0,1.0]
  last_state_coordinates = list()

  # Rebuild both networks and restore the weights of the winning run.
  updateModel = build_model(lr = 0.001)
  targetModel = build_model(lr = 0.001)

  updateModel.load_weights("updateModelDQNwinner.h5")
  targetModel.load_weights("targetModelDQNwinner.h5")

  # Load the previous results to facilitate training
  with open("VisitedTilesWinner", "r") as fp:
    visited = json.load(fp)

  with open("SetBadIntersectionMovesWinner", "r") as fp:
    SetBadIntersectionMoves = json.load(fp)

  with open("BadStatesWinner", "r") as fp:
    BadStates = json.load(fp)

  qtrain(epsilon, 100000, 1000 , 500, discount, num_actions, resumingTraining = False,   TrainingFromStart = True)