In [None]:
# Load the Google Colab Drive helper and mount the Drive at /content/drive.
from google.colab import drive

# This call prompts for authorization.
drive.mount('/content/drive') # Comment this line out once the drive has been mounted

Mounted at /content/drive


In [None]:
# After executing the cell above, Drive
# files will be present in "/content/drive/MyDrive".
# This cell lists the contents of the supporting files directory;
# the next cell changes into that directory.
!ls "/content/drive/MyDrive/ktd1g20_COMP6247/QLearningDynamicMaze"

COMP6247Maze20212022.npy  QLearningDynamicMaze.ipynb  read_maze.py


In [None]:
cd "/content/drive/MyDrive/ktd1g20_COMP6247/QLearningDynamicMaze" 

/content/drive/MyDrive/ktd1g20_COMP6247/QLearningDynamicMaze


In [None]:
# The libraries that are needed

from __future__ import print_function
import os, sys, time, datetime, json, random
import numpy as np
import pandas as pd                                                  
import matplotlib.pyplot as plt
import json

%matplotlib inline

In [None]:
# This imports the functions from the read_maze.py file.
from read_maze import *

In [None]:
# Load the maze by calling load_maze, which is brought in by the star import from read_maze.py.
# NOTE(review): presumably this must run before get_local_maze_information is used - confirm in read_maze.py.
load_maze()

In [None]:
# We need a function that gives us the state the agent is in.
# The function takes as input the coordinates (x,y) of the agent's current location.
# The output is a NumPy array of shape (1, 20), i.e. a single row of 20 elements
# The first 9 elements correspond to the walls surrounding the agent
# The second 9 elements correspond to the fire surrounding the agent
# The last two elements correspond to (x,y) coordinates of the agent's position (similar to the input)

def get_state(x,y): 
  """Build the observation row for the tile at (x, y).

  Calls ``get_local_maze_information(int(x), int(y))`` and reads its 3x3
  neighbourhood in row-major order.

  Returns a NumPy array of shape (1, 20):
    * entries 0-8   : first value of every neighbouring cell (the wall layout)
    * entries 9-17  : second value of every neighbouring cell (the fire layout)
    * entries 18-19 : the ``x`` and ``y`` that were passed in, unchanged
  """
  local_info = get_local_maze_information(int(x), int(y))

  # Visit the neighbourhood once, row by row, and split it into its two channels.
  cells = [local_info[row][col] for row in range(3) for col in range(3)]
  walls = [float(cell[0]) for cell in cells]
  fires = [float(cell[1]) for cell in cells]

  return np.array([walls + fires + [x, y]])

In [None]:
# This function simplifies the representation of the state regarding the fire information
def simplifyFiresArray(state):
  """Reduce the fire part of a state to four 0/1 flags.

  Only the four direct neighbours are inspected: entries 10 (up), 12 (left),
  14 (right) and 16 (down) of ``state[0]``. A flag is 1 when the
  corresponding entry is non-zero, otherwise 0.

  Returns the tuple ``(left_fire, up_fire, right_fire, down_fire)``.
  """
  row = state[0]
  up_fire, left_fire, right_fire, down_fire = (
      1 if row[position] != 0 else 0 for position in (10, 12, 14, 16)
  )
  return left_fire, up_fire, right_fire, down_fire

In [None]:
# This function gets the action from the Q-table by comparing the corresponding Q-values.
# The inputs are the Q-table and the state the agent is in.
# The output is the action the agent should take, represented as a number

def get_action_from_Qtable(Qtable, state): 
  """Return the greedy action for ``state`` according to ``Qtable``.

  The table is indexed by the integer tile coordinates stored in the last
  two entries of ``state[0]`` followed by the four fire flags produced by
  ``simplifyFiresArray``. The index of the largest Q-value is returned as a
  float; on ties ``np.argmax`` picks the lowest index.
  """
  row = state[0]
  lookup_key = (int(row[-2]), int(row[-1])) + tuple(simplifyFiresArray(state))

  # Walk down the table one axis at a time until only the action values remain.
  q_values = Qtable
  for index in lookup_key:
    q_values = q_values[index]

  return float(np.argmax(q_values))

In [None]:
# This function is used to check whether we ended up in a new bad state and updates the set of BadStates accordingly.
def newBadState(state):
  """Register the tile of ``state`` as a bad state if it is a newly found dead end.

  A tile counts as a dead end when at most one of its four direct
  neighbours (entries 1, 3, 5 and 7 of ``state[0]``) is marked 1.

  Returns True only when the tile was appended to the global ``BadStates``
  by this call; returns False when it was already listed or is not a dead end.
  """
  global BadStates
  row = state[0]
  tile = [row[-2], row[-1]]

  # Already known: nothing new to report.
  if tile in BadStates:
    return False

  open_neighbours = row[1] + row[3] + row[5] + row[7]
  is_dead_end = bool(open_neighbours <= 1)
  if is_dead_end:
    BadStates.append(tile)
  return is_dead_end

In [None]:
# This is the function that enables the agent to move. 
# The inputs are the state the agent is in and the action it will take based on the Q-table.
# The outputs are the new_state, the immediate reward
# and two Boolean variables: the first one checks whether the maze has been successfully completed
# while the second one records whether the agent should be reset according to the proposed framework.

def act(state, action):
  """Apply ``action`` from ``state`` and report what happened.

  Action codes: 0 = left, 1 = up, 2 = right, 3 = down, 4 = stay put.
  A directional move is only carried out when the neighbour's wall entry in
  ``state[0]`` equals 1 and its fire entry equals 0.

  Reads the globals ``SetBadIntersectionMoves``, ``BadStates``, ``visited``
  and ``target``; ``newBadState`` may extend ``BadStates``.

  Returns ``(new_state, reward, game_won, resetAgent)``:
    * a move already recorded as a bad intersection move: state re-read at the
      current tile, reward -1000000, resetAgent True
    * reaching ``target``: reward 10000000, game_won True
    * entering a known or newly detected bad state: reward -1000000, resetAgent True
    * any other successful move: reward -100000, plus 95000 if the tile is not in ``visited``
    * staying put: state re-read at the current tile, reward -100000
    * anything else (wall, fire or unknown action code): the *original* state
      object is returned without calling ``get_state``, reward -1000000
  """
  # action code -> (wall index, fire index, coordinate changed (0 = x, 1 = y), step)
  directions = {
      0: (3, 12, 1, -1),   # left
      1: (1, 10, 0, -1),   # up
      2: (5, 14, 1, 1),    # right
      3: (7, 16, 0, 1),    # down
  }

  row = state[0]
  here_x, here_y = row[-2], row[-1]
  chosen = float(action)

  # A move already known to be bad from this intersection is refused outright.
  if [here_x, here_y, chosen] in SetBadIntersectionMoves:
    return get_state(here_x, here_y), -1000000, False, True

  move = directions.get(chosen)
  if move is not None:
    wall_idx, fire_idx, axis, step = move
    if not ((row[wall_idx] == 1) and (row[fire_idx] == 0)):
      move = None  # blocked by a wall or by fire

  if move is None:
    if chosen == 4:
      # Staying put still re-reads the state so that the fire information is refreshed.
      return get_state(here_x, here_y), -100000, False, False
    # Invalid move: the agent tried to walk into a wall or a fire.
    return state, -1000000, False, False

  position = [here_x, here_y]
  position[axis] += step
  next_x, next_y = position
  new_state = get_state(next_x, next_y)

  # The winning tile takes precedence over every other check.
  if next_x == target[0] and next_y == target[1]:
    return new_state, 10000000, True, False

  # Previously discovered bad state.
  if [next_x, next_y] in BadStates:
    return new_state, -1000000, False, True

  # Newly discovered bad state (newBadState records it as a side effect).
  if newBadState(new_state) == True:
    return new_state, -1000000, False, True

  # Ordinary step: a small net cost for new tiles, the full cost for revisits.
  reward = -100000
  if [next_x, next_y] not in visited:
    reward += 95000
  return new_state, reward, False, False

In [None]:
# This function checks whether a state is an intersection and, if it is, adds it to
# listOfIntersections when it is not already there. For a newly added intersection it also
# records every [x, y, action] triplet that points at a closed side as a bad intersection move.

def checkIfIntersection(state):
  """Tell whether the tile of ``state`` is a usable intersection and record it.

  A tile qualifies when at least three of its four direct neighbours
  (entries 1, 3, 5, 7 of ``state[0]``) are marked 1 and the tile is not in
  the global ``BadStates``.

  The first time a qualifying tile is seen it is appended to
  ``listOfIntersections``, and for every direct neighbour marked 0 the
  triplet ``[x, y, action]`` pointing at it is appended to
  ``SetBadIntersectionMoves``.

  Returns True for every qualifying tile (new or already known), else False.
  """
  global listOfIntersections
  global BadStates
  global SetBadIntersectionMoves
  row = state[0]
  tile = [row[-2], row[-1]]
  open_sides = row[1] + row[3] + row[5] + row[7]

  if not (open_sides >= 3) or tile in BadStates:
    return False

  if tile not in listOfIntersections:
    listOfIntersections.append(tile)
    # (neighbour index, action code): up, left, right, down - the order entries are recorded in.
    for side_idx, action_code in ((1, 1), (3, 0), (5, 2), (7, 3)):
      if row[side_idx] == 0:
        SetBadIntersectionMoves.append([row[-2], row[-1], float(action_code)])
  return True

In [None]:
# Check if an intersection is bad or not

def isBadIntersection(x,y): 
  """Return True when at least three of the four moves from (x, y) are bad.

  A move counts when ``[x, y, float(action)]`` is present in the global
  ``SetBadIntersectionMoves`` for action codes 0-3; duplicates in that list
  are counted once.
  """
  bad_directions = sum(
      1 for action_code in range(4)
      if [x, y, float(action_code)] in SetBadIntersectionMoves
  )
  return bad_directions >= 3

In [None]:
# The training function for our implementation
def qtrain( epsilon, n_epoch, discount, num_actions, resumingTraining, TrainingFromStart, **opt):
  """Train the global QtableA with epsilon-greedy tabular Q-learning.

  Each epoch places the agent on a start tile and lets it act until either the
  target is reached or act() asks for a reset. All learning state lives in
  module-level globals (Q-tables, visit counters, BadStates, intersections, ...),
  which this function reads and mutates.

  Parameters:
    epsilon           -- probability of picking a random action from actions_list.
    n_epoch           -- maximum number of epochs (agent resets) to run.
    discount          -- discount factor applied to the best next-state Q-value.
    num_actions       -- accepted but not used in the body below.
    resumingTraining  -- if True, the first epoch starts from last_state_coordinates.
    TrainingFromStart -- if True, the first epoch starts from (1.0, 1.0); this
                         overrides resumingTraining when both are True.
    **opt             -- accepted but not used in the body below.

  Side effects:
    * On reaching the target: writes the "...Winner" Excel/JSON files and stops training.
    * Every 20000 updates: writes the regular checkpoint Excel/JSON files.
    * Prints one line per step plus a summary line per epoch.

  Returns None.
  """
  global visited
  global lastIntersection
  global SetBadIntersectionMoves
  global listOfIntersections
  global BadStates
  global last_state_coordinates
  global actions_dict
  global QtableA
  global QtableB
  global lr
  global s_a_learning_rate

  # NOTE(review): bigReset is only ever set to True below and never cleared inside this
  # function, so once triggered every later epoch starts from listOfIntersections[-1] - confirm intended.
  bigReset = False # This Boolean value handles resetting the agent in the case of reaching a bad intersection
  n_epoch = n_epoch # The number of epochs you wish to train the models

  # Auxiliary variables to help with training on Google Colab
  resumingTraining = resumingTraining
  TrainingFromStart = TrainingFromStart

  start_time = datetime.datetime.now() # A counter for time
                
  discount = discount # The discount factor for delayed reward.
  num_actions = num_actions # This is simply the number of available actions at each tile of the maze

  # Auxiliary variables for the user interface
  random_move = ""
  model_update_counter = 0
  game_over = False

  for epoch in range(n_epoch):
    # A win in the previous epoch ends training altogether.
    if game_over == True:
      break
    print("We are now in epoch #" , (epoch+1) , ".This means that the agent has been reset (epoch > 1) or started (epoch = 1)")
    loss = 0.0
    # Choose the start tile for this epoch.
    # NOTE(review): listOfIntersections[-1] raises IndexError if the list is empty at this point.
    if bigReset == True:
      print("We will BIG reset to the last valid new intersection we found: ", [listOfIntersections[-1][0],listOfIntersections[-1][1]])
      start_x = listOfIntersections[-1][0] # This is initialised  at [[1,1]]
      start_y = listOfIntersections[-1][1]
    else: 
      print("We will reset to the last valid intersection we crossed unless resumingTraining was True when calling the function qtrain: ", [lastIntersection[0],lastIntersection[1]])
      start_x = lastIntersection[0] 
      start_y = lastIntersection[1]

    # The two flags below only affect the first epoch: each is cleared once it has been honoured.
    if resumingTraining == True:
      resumingTraining = False
      start_x = last_state_coordinates[0] 
      start_y = last_state_coordinates[1]
    
    if TrainingFromStart == True:
      TrainingFromStart = False
      start_x = 1.0
      start_y = 1.0

    game_over = False

    resetAgent = False

    # get initial state
    state = get_state(start_x, start_y)
    
    # Despite its name this counts the steps taken within the current epoch.
    n_episodes = 0
    while not game_over and resetAgent == False:
      prev_state = state
      prev_state_x = int(prev_state[0][-2])
      prev_state_y = int(prev_state[0][-1])
      prev_left_fire, prev_up_fire, prev_right_fire, prev_down_fire = simplifyFiresArray(prev_state)
      last_state_coordinates = [state[0][-2],state[0][-1]]
      # Get next action (epsilon-greedy; ties between equally good actions are broken at random)
      if np.random.rand() < epsilon:
        action = float(random.choice(actions_list))
        random_move = "random"
      else:
        # If you use double Q-learning decomment the command below and comment the one lower
        # q_table = QtableA[prev_state_x][prev_state_y][prev_left_fire][prev_up_fire][prev_right_fire][prev_down_fire] + QtableB[prev_state_x][prev_state_y][prev_left_fire][prev_up_fire][prev_right_fire][prev_down_fire]
        q_table = QtableA[prev_state_x][prev_state_y][prev_left_fire][prev_up_fire][prev_right_fire][prev_down_fire] 
        max_q = np.where(np.max(q_table) == q_table)[0]
        action = float(np.random.choice(max_q))
        random_move = "deterministic"

      # Remember the intersection (and the action taken from it) the agent most recently left.
      if checkIfIntersection(state) == True:
        lastIntersection = [state[0][-2], state[0][-1], float(action)]
        print("We reached a new intersection: [x,y,action] = ", lastIntersection)
      elif [state[0][-2], state[0][-1]] == [1,1]: # Take into account the first try from [1,1] as well
        lastIntersection = [state[0][-2], state[0][-1], float(action)]
        print("We reached a new intersection: [x,y,action] = ", lastIntersection)

      # Apply action, get reward and new state
      new_state, reward, game_won, resetAgent = act(state, action)
      new_left_fire, new_up_fire, new_right_fire, new_down_fire = simplifyFiresArray(new_state)

      new_state_x = int(new_state[0][-2])
      new_state_y = int(new_state[0][-1])

      # Keep track of the trajectory:
      if [new_state[0][-2], new_state[0][-1]] not in visited:
        visited.append([new_state[0][-2], new_state[0][-1]])

      # Landing in a bad state blames the move taken at the last intersection.
      if [new_state[0][-2], new_state[0][-1]] in BadStates:
        SetBadIntersectionMoves.append(lastIntersection)

      # An intersection with three or more bad moves becomes a bad state itself and is
      # dropped from the list of intersections the agent may be reset to.
      if isBadIntersection(lastIntersection[0], lastIntersection[1]) == True:
        if [lastIntersection[0], lastIntersection[1]] not in BadStates:
          BadStates.append([lastIntersection[0], lastIntersection[1]])
        if [lastIntersection[0], lastIntersection[1]] in listOfIntersections:
          listOfIntersections.remove([lastIntersection[0], lastIntersection[1]])
        bigReset = True
    
      # Checkpoint
      print([state[0][-2], state[0][-1]], " -> ", random_move , " ", actions_dict[int(action)], " -> ", [new_state[0][-2], new_state[0][-1]])

      if game_won == True:
        game_over = True
        print("Game over. We reached the (199,199) tile.")
        n_episodes += 1
        
        # These are helpful only once the model is sufficiently trained, in order to accelerate convergence. Simply change lr to alpha in the below if statement.
        # NOTE(review): alpha is computed here but the updates below use lr, so it currently has no effect.
        s_a_learning_rate[prev_state_x][prev_state_y][prev_left_fire][prev_up_fire][prev_right_fire][prev_down_fire][int(action)] += 1
        alpha = (float(1.0)/s_a_learning_rate[prev_state_x][prev_state_y][prev_left_fire][prev_up_fire][prev_right_fire][prev_down_fire][int(action)])**1000

        # Depending on whether you wish to use double Q-learning you can use the appropriate formulas by commenting and decommenting the relevant parts. By default a single Q-table is used.
        # With the default single table both branches of the coin flip apply the same QtableA update.
        if np.random.rand() < 0.5:
          # If Update(A) in double Q-learning: (decomment the below line and comment the lower one)
          # QtableA[prev_state_x][prev_state_y][prev_left_fire][prev_up_fire][prev_right_fire][prev_down_fire][int(action)] += lr * (reward + discount * QtableB[new_state_x][new_state_y][new_left_fire][new_up_fire][new_right_fire][new_down_fire][np.argmax(QtableA[new_state_x][new_state_y][new_left_fire][new_up_fire][new_right_fire][new_down_fire])] - QtableA[prev_state_x][prev_state_y][prev_left_fire][prev_up_fire][prev_right_fire][prev_down_fire][int(action)]) # Double Q-learning

          
          QtableA[prev_state_x][prev_state_y][prev_left_fire][prev_up_fire][prev_right_fire][prev_down_fire][int(action)] += lr * (reward + discount * QtableA[new_state_x][new_state_y][new_left_fire][new_up_fire][new_right_fire][new_down_fire][np.argmax(QtableA[new_state_x][new_state_y][new_left_fire][new_up_fire][new_right_fire][new_down_fire])] - QtableA[prev_state_x][prev_state_y][prev_left_fire][prev_up_fire][prev_right_fire][prev_down_fire][int(action)])
        
        
        # Depending on whether you wish to use double Q-learning you can use the appropriate formulas by commenting and decommenting the relevant parts. By default a single Q-table is used.
        else:
          # If Update(B) in double Q-learning: (decomment the below line and comment the lower one)
          # QtableB[prev_state_x][prev_state_y][prev_left_fire][prev_up_fire][prev_right_fire][prev_down_fire][int(action)] += lr * (reward + discount * QtableA[new_state_x][new_state_y][new_left_fire][new_up_fire][new_right_fire][new_down_fire][np.argmax(QtableB[new_state_x][new_state_y][new_left_fire][new_up_fire][new_right_fire][new_down_fire])] - QtableB[prev_state_x][prev_state_y][prev_left_fire][prev_up_fire][prev_right_fire][prev_down_fire][int(action)]) # Double Q-learning
          
          QtableA[prev_state_x][prev_state_y][prev_left_fire][prev_up_fire][prev_right_fire][prev_down_fire][int(action)] += lr * (reward + discount * QtableA[new_state_x][new_state_y][new_left_fire][new_up_fire][new_right_fire][new_down_fire][np.argmax(QtableA[new_state_x][new_state_y][new_left_fire][new_up_fire][new_right_fire][new_down_fire])] - QtableA[prev_state_x][prev_state_y][prev_left_fire][prev_up_fire][prev_right_fire][prev_down_fire][int(action)])

        state = new_state 

        # Save the results
        # Each table is flattened to 2-D (first axis kept, the rest merged) so it fits a spreadsheet.
        dfA = pd.DataFrame(QtableA.reshape(QtableA.shape[0], -1))
        dfB = pd.DataFrame(QtableB.reshape(QtableB.shape[0], -1))
        dflearning_rate = pd.DataFrame(s_a_learning_rate.reshape(s_a_learning_rate.shape[0], -1))
        dfA.to_excel("QtableA_reshapedWinner.xlsx")
        dfB.to_excel("QtableB_reshapedWinner.xlsx") 
        dflearning_rate.to_excel("s_a_learning_rateWinner.xlsx") 
        model_update_counter += 1
        with open("VisitedTilesWinner", "w") as fp:
          json.dump(visited, fp)
        with open("SetBadIntersectionMovesWinner", "w") as fp:
          json.dump(SetBadIntersectionMoves, fp)
        with open("BadStatesWinner", "w") as fp:
          json.dump(BadStates, fp)
        with open("listOfIntersectionsWinner", "w") as fp:
          json.dump(listOfIntersections, fp)
        with open("lastIntersectionWinner", "w") as fp:
          json.dump(lastIntersection, fp)
        break

      else:
        game_over = False

        n_episodes += 1
        
        # These are helpful only once the model is sufficiently trained, in order to accelerate convergence. Simply change lr to alpha in the below if statement.
        # NOTE(review): as above, alpha is computed but not used by the updates below.
        s_a_learning_rate[prev_state_x][prev_state_y][prev_left_fire][prev_up_fire][prev_right_fire][prev_down_fire][int(action)] += 1
        alpha = (float(1.0)/s_a_learning_rate[prev_state_x][prev_state_y][prev_left_fire][prev_up_fire][prev_right_fire][prev_down_fire][int(action)])**1000

        # With the default single table both branches of the coin flip apply the same QtableA update.
        if np.random.rand() < 0.5:
          # If Update(A) in double Q-learning: (decomment the below line and comment the lower one)
          # QtableA[prev_state_x][prev_state_y][prev_left_fire][prev_up_fire][prev_right_fire][prev_down_fire][int(action)] += lr * (reward + discount * QtableB[new_state_x][new_state_y][new_left_fire][new_up_fire][new_right_fire][new_down_fire][np.argmax(QtableA[new_state_x][new_state_y][new_left_fire][new_up_fire][new_right_fire][new_down_fire])] - QtableA[prev_state_x][prev_state_y][prev_left_fire][prev_up_fire][prev_right_fire][prev_down_fire][int(action)]) # Double Q-learning

          
          QtableA[prev_state_x][prev_state_y][prev_left_fire][prev_up_fire][prev_right_fire][prev_down_fire][int(action)] += lr * (reward + discount * QtableA[new_state_x][new_state_y][new_left_fire][new_up_fire][new_right_fire][new_down_fire][np.argmax(QtableA[new_state_x][new_state_y][new_left_fire][new_up_fire][new_right_fire][new_down_fire])] - QtableA[prev_state_x][prev_state_y][prev_left_fire][prev_up_fire][prev_right_fire][prev_down_fire][int(action)])
        
        else:
          # If Update(b) in double Q-learning: (decomment the below line and comment the lower one)
          # QtableB[prev_state_x][prev_state_y][prev_left_fire][prev_up_fire][prev_right_fire][prev_down_fire][int(action)] += lr * (reward + discount * QtableA[new_state_x][new_state_y][new_left_fire][new_up_fire][new_right_fire][new_down_fire][np.argmax(QtableB[new_state_x][new_state_y][new_left_fire][new_up_fire][new_right_fire][new_down_fire])] - QtableB[prev_state_x][prev_state_y][prev_left_fire][prev_up_fire][prev_right_fire][prev_down_fire][int(action)]) # Double Q-learning
          
          
          
          QtableA[prev_state_x][prev_state_y][prev_left_fire][prev_up_fire][prev_right_fire][prev_down_fire][int(action)] += lr * (reward + discount * QtableA[new_state_x][new_state_y][new_left_fire][new_up_fire][new_right_fire][new_down_fire][np.argmax(QtableA[new_state_x][new_state_y][new_left_fire][new_up_fire][new_right_fire][new_down_fire])] - QtableA[prev_state_x][prev_state_y][prev_left_fire][prev_up_fire][prev_right_fire][prev_down_fire][int(action)])
          
          

        state = new_state 
        last_state_coordinates = [new_state[0][-2],new_state[0][-1]]

      model_update_counter += 1
      
      # Save the results regularly (every 20000 updates) so an interrupted run can be resumed
      if (model_update_counter % 20000) == 0:
        print("We reached ", model_update_counter, " tries, let's save the results and continue")
        dfA = pd.DataFrame(QtableA.reshape(QtableA.shape[0], -1))
        dfB = pd.DataFrame(QtableB.reshape(QtableB.shape[0], -1))
        dflearning_rate = pd.DataFrame(s_a_learning_rate.reshape(s_a_learning_rate.shape[0], -1))
        dfA.to_excel("QtableA_reshaped.xlsx")
        dfB.to_excel("QtableB_reshaped.xlsx")   
        dflearning_rate.to_excel("s_a_learning_rate.xlsx")  
        with open("VisitedTiles", "w") as fp:
          json.dump(visited, fp)
        with open("SetBadIntersectionMoves", "w") as fp:
          json.dump(SetBadIntersectionMoves, fp)
        with open("BadStates", "w") as fp:
          json.dump(BadStates, fp)
        with open("listOfIntersections", "w") as fp:
          json.dump(listOfIntersections, fp)
        with open("lastIntersection", "w") as fp:
          json.dump(lastIntersection, fp)
        with open("last_state_coordinates", "w") as fp:
          json.dump(last_state_coordinates, fp)

        
    
    # Per-epoch summary: elapsed wall-clock time since qtrain started and steps taken this epoch.
    dt = datetime.datetime.now() - start_time
    t = format_time(dt.total_seconds())
    template = "Epoch: {:03d}/{:d} | Episodes: {:d} | time: {}"
    print(template.format(epoch, n_epoch-1, n_episodes, t))




# This is a small utility for printing readable time strings:
def format_time(seconds):
    """Render a duration given in seconds as a short human-readable string.

    Below 400 s the value is shown in seconds (one decimal), below 4000 s in
    minutes (two decimals), otherwise in hours (two decimals).
    """
    if seconds < 400:
        return "{:.1f} seconds".format(float(seconds))
    if seconds < 4000:
        return "{:.2f} minutes".format(seconds / 60.0)
    return "{:.2f} hours".format(seconds / 3600.0)

In [None]:

# Integer codes for the five actions available to the agent.
LEFT, UP, RIGHT, DOWN, REMAIN = range(5)

# Human-readable label of every action code (used when logging moves).
actions_dict = dict(zip(
    (LEFT, UP, RIGHT, DOWN, REMAIN),
    ('left', 'up', 'right', 'down', 'remain'),
))

num_actions = len(actions_dict)
actions_list = list(actions_dict)  # the action codes, in the order defined above

# Exploration factor in order to pick random moves as well during training.
epsilon = 0.1

In [None]:
# --- Problem definition and learning hyper-parameters ---
target = (199, 199)   # Goal tile; set it as you wish.
discount = 0.999      # Discount factor in the Bellman equation; alter it accordingly.
lr = 0.1              # Fixed learning rate used by the Q-table updates.

# --- Bookkeeping shared (as globals) with the functions defined above ---
SetBadIntersectionMoves = [[1.0, 1.0, 1.0]]   # seeded with one [x, y, action] triplet: action 1 (up) at (1, 1)
BadStates = []
visited = []
listOfIntersections = []
lastIntersection = [1.0, 1.0]
last_state_coordinates = []

# --- Q-tables and visit counters ---
# All three are indexed as [x][y][left_fire][up_fire][right_fire][down_fire][action].
QtableA, QtableB, s_a_learning_rate = (
    np.zeros((201, 201, 2, 2, 2, 2, num_actions)) for _ in range(3)
)


# Only uncomment the block below if you have changed lr to alpha in order to use the past information to make the Q-table converge.
'''
with open("SetBadIntersectionMovesWinner", "r") as fp:   # Your most important file
  SetBadIntersectionMoves = json.load(fp)

with open("BadStatesWinner", "r") as fp:  # Your second most important file
  BadStates = json.load(fp)
'''

'\nwith open("SetBadIntersectionMovesWinner", "r") as fp:   # Your most important file\n  SetBadIntersectionMoves = json.load(fp)\n\nwith open("BadStatesWinner", "r") as fp:  # Your second most important file\n  BadStates = json.load(fp)\n'

In [None]:
# Train the model, starting the first epoch from tile (1, 1).
qtrain(
    epsilon,
    n_epoch=10000000,
    discount=discount,
    num_actions=num_actions,
    resumingTraining=False,
    TrainingFromStart=True,
)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
We reached a new intersection: [x,y,action] =  [11.0, 7.0, 4.0]
[11.0, 7.0]  ->  deterministic   remain  ->  [11.0, 7.0]
We reached a new intersection: [x,y,action] =  [11.0, 7.0, 3.0]
[11.0, 7.0]  ->  deterministic   down  ->  [11.0, 7.0]
Epoch: 030/99999 | Episodes: 2 | time: 55.2 seconds
We are now in epoch # 32 .This means that the agent has been reset (epoch > 1) or started (epoch = 1)
We will reset to the last valid intersection we crossed unless resumingTraining was True when calling the function qtrain:  [11.0, 7.0]
We reached a new intersection: [x,y,action] =  [11.0, 7.0, 2.0]
[11.0, 7.0]  ->  deterministic   right  ->  [11.0, 8.0]
[11.0, 8.0]  ->  deterministic   remain  ->  [11.0, 8.0]
[11.0, 8.0]  ->  deterministic   right  ->  [11.0, 9.0]
[11.0, 9.0]  ->  deterministic   down  ->  [11.0, 9.0]
[11.0, 9.0]  ->  deterministic   right  ->  [11.0, 10.0]
[11.0, 10.0]  ->  deterministic   left  ->  [11.0, 9.0]
[11.

KeyboardInterrupt: ignored

In [None]:
# After the initial training phase has reached the (199,199) tile, retrain until sufficient results have been produced.
# Every round reloads the "...Winner" files written by the previous successful run and trains again from tile (1, 1).

def _load_table(path, shape):
  """Read a table that qtrain saved as a 2-D Excel sheet and restore its full shape.

  qtrain writes each table with every axis after the first merged into one, so a
  single C-order reshape to ``shape`` restores the original layout.

  The array is cast to float because read_excel can return an integer array when
  every stored value is a whole number (for example an all-zero table), and
  in-place Q-value updates on an integer array would silently truncate.
  """
  return pd.read_excel(path, index_col=0).to_numpy().astype(float).reshape(shape)


# Shape shared by the Q-tables and the visit counters:
# [x][y][left_fire][up_fire][right_fire][down_fire][action]
_TABLE_SHAPE = (201, 201, 2, 2, 2, 2, num_actions)

# These settings do not change between rounds, so they are set once.
target = (199, 199)
epsilon = 0.1
discount = 0.999
lr = 0.1

for round_index in range(1000):
  # Intersection bookkeeping starts fresh every round.
  listOfIntersections = list()
  lastIntersection = [1.0, 1.0]
  last_state_coordinates = list()

  # Load the previous results to facilitate training
  with open("VisitedTilesWinner", "r") as fp:
    visited = json.load(fp)

  with open("SetBadIntersectionMovesWinner", "r") as fp:
    SetBadIntersectionMoves = json.load(fp)

  with open("BadStatesWinner", "r") as fp:
    BadStates = json.load(fp)

  QtableA = _load_table('QtableA_reshapedWinner.xlsx', _TABLE_SHAPE)
  QtableB = _load_table('QtableB_reshapedWinner.xlsx', _TABLE_SHAPE)

  # The following is useful only in the case that the set of bad intersection moves has been finalised. In that case replace lr with alpha in the qtrain function.
  s_a_learning_rate = _load_table('s_a_learning_rateWinner.xlsx', _TABLE_SHAPE)

  qtrain( epsilon, 100000, discount, num_actions, resumingTraining = False, TrainingFromStart = True)

We are now in epoch # 1 .This means that the agent has been reset (epoch > 1) or started (epoch = 1)
We will reset to the last valid intersection we crossed unless resumingTraining was True when calling the function qtrain:  [1.0, 1.0]
We reached a new intersection: [x,y,action] =  [1.0, 1.0, 4.0]
[1.0, 1.0]  ->  deterministic   remain  ->  [1.0, 1.0]
We reached a new intersection: [x,y,action] =  [1.0, 1.0, 3.0]
[1.0, 1.0]  ->  deterministic   down  ->  [2.0, 1.0]
[2.0, 1.0]  ->  deterministic   up  ->  [1.0, 1.0]
We reached a new intersection: [x,y,action] =  [1.0, 1.0, 2.0]
[1.0, 1.0]  ->  deterministic   right  ->  [1.0, 1.0]
Epoch: 000/99999 | Episodes: 4 | time: 0.1 seconds
We are now in epoch # 2 .This means that the agent has been reset (epoch > 1) or started (epoch = 1)
We will reset to the last valid intersection we crossed unless resumingTraining was True when calling the function qtrain:  [1.0, 1.0]
We reached a new intersection: [x,y,action] =  [1.0, 1.0, 4.0]
[1.0, 1.0]  

KeyboardInterrupt: ignored

In [None]:
# Check whether the model has been sufficiently trained. After a certain point following the proposed framework the agent should reach tile (199,199)
# consistently in about 4000-4500 steps. However, there is no way to tell beforehand when to stop training.
#
# The rollout follows the greedy policy of QtableA. An insufficiently trained table can keep the
# agent on the same tile forever, so the walk is capped at MAX_EVAL_STEPS instead of
# looping until it is interrupted by hand.
MAX_EVAL_STEPS = 100000

starting_point_x = 1.0
starting_point_y = 1.0
number_steps = 0

state = get_state(starting_point_x, starting_point_y)

while ((starting_point_x != 199.0) or (starting_point_y != 199.0)) and (number_steps < MAX_EVAL_STEPS):
  
  action = get_action_from_Qtable(QtableA, state)
  print(action)
  new_state, reward, game_won, resetAgent  = act(state, action)

  print([state[0][-2], state[0][-1]], " -> " , actions_dict[int(action)] , " -> ", [new_state[0][-2], new_state[0][-1]])
  print("Game won: ", game_won)
  number_steps += 1
  starting_point_x = new_state[0][-2]
  starting_point_y = new_state[0][-1]
  state = new_state

# Report honestly whether the walk ended on the goal tile or was cut off by the cap.
if (starting_point_x == 199.0) and (starting_point_y == 199.0):
  print("It took us: ", number_steps, " steps to reach the (199,199) tile.")
else:
  print("Stopped after ", number_steps, " steps without reaching the (199,199) tile.")

4.0
[1.0, 1.0]  ->  remain  ->  [1.0, 1.0]
Game won:  False
4.0
[1.0, 1.0]  ->  remain  ->  [1.0, 1.0]
Game won:  False
3.0
[1.0, 1.0]  ->  down  ->  [2.0, 1.0]
Game won:  False
4.0
[2.0, 1.0]  ->  remain  ->  [2.0, 1.0]
Game won:  False
4.0
[2.0, 1.0]  ->  remain  ->  [2.0, 1.0]
Game won:  False
4.0
[2.0, 1.0]  ->  remain  ->  [2.0, 1.0]
Game won:  False
4.0
[2.0, 1.0]  ->  remain  ->  [2.0, 1.0]
Game won:  False
4.0
[2.0, 1.0]  ->  remain  ->  [2.0, 1.0]
Game won:  False
4.0
[2.0, 1.0]  ->  remain  ->  [2.0, 1.0]
Game won:  False
4.0
[2.0, 1.0]  ->  remain  ->  [2.0, 1.0]
Game won:  False
4.0
[2.0, 1.0]  ->  remain  ->  [2.0, 1.0]
Game won:  False
4.0
[2.0, 1.0]  ->  remain  ->  [2.0, 1.0]
Game won:  False
4.0
[2.0, 1.0]  ->  remain  ->  [2.0, 1.0]
Game won:  False
4.0
[2.0, 1.0]  ->  remain  ->  [2.0, 1.0]
Game won:  False
4.0
[2.0, 1.0]  ->  remain  ->  [2.0, 1.0]
Game won:  False
4.0
[2.0, 1.0]  ->  remain  ->  [2.0, 1.0]
Game won:  False
4.0
[2.0, 1.0]  ->  remain  ->  [2.0, 1.0]

KeyboardInterrupt: ignored