## Experiment: Type-1

In [4]:
import numpy as np
import tensorflow as tf

# Define the maze
maze = np.array([
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
    [1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1],
    [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1],
    [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1],
    [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1],
    [1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1],
    [1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1],
    [1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1],
    [1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1],
    [1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1],
    [1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1],
    [1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1],
    [1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1],
    [1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1],
    [1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1],
    [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1],
    [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1],
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
])

# Define the Q-network
class QNetwork(tf.keras.Model):
    def __init__(self, state_size, action_size):
        super(QNetwork, self).__init__()
        self.dense1 = tf.keras.layers.Dense(32, activation='relu', input_shape=(state_size,))
        self.dense2 = tf.keras.layers.Dense(32, activation='relu')
        self.output_layer = tf.keras.layers.Dense(action_size, activation='linear')

    def call(self, state):
        x = self.dense1(state)
        x = self.dense2(x)
        return self.output_layer(x)

# Hyperparameters
state_size = 3  # [0, 0, 1]
action_size = 3  # 4 possible actions: left, right, up, down
learning_rate = 0.001
discount_factor = 0.95

# Instantiate the Q-network
model = QNetwork(state_size, action_size)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate), loss='mse')

# Q-learning training loop
epochs = 1000
for epoch in range(epochs):
    current_state = [0, 0, 1]  # Start at the top-left corner of the maze
    done = False
    total_reward = 0

    while not done:
        # Choose an action using epsilon-greedy strategy
        epsilon = 0.1
        if np.random.rand() < epsilon:
            action = np.random.randint(action_size)
        else:
            q_values = model.predict(np.array([current_state]))
            action = np.argmax(q_values)

        # Take the chosen action and observe the next state and reward
        next_state = np.copy(current_state)
        if action == 0:  # Move left
            next_state[0] = max(0, current_state[0] - 1)
        elif action == 1:  # Move right
            next_state[1] = min(1, current_state[1] + 1)
        elif action == 2:  # Move up
            next_state[2] = max(0, current_state[2] - 1)
        elif action == 3:  # Move down
            next_state[2] = min(1, current_state[2] + 1)

        reward = -1 if maze[next_state[2], next_state[1]] == 0 else -100  # -1 for each step, -100 for hitting an obstacle

        # Update the Q-value using the Bellman equation
        q_values = model.predict(np.array([current_state]))
        next_q_values = model.predict(np.array([next_state]))
        q_values[0][action] = reward + discount_factor * np.max(next_q_values)

        # Train the model on the current transition
        model.train_on_batch(np.array([current_state]), q_values)

        total_reward += reward
        current_state = next_state

        # Check if the episode is done
        if maze[current_state[2], current_state[1]] == 1 or current_state == [1, 10, 1]:
            done = True

    print(f"Epoch: {epoch + 1}, Total Reward: {total_reward}")


Epoch: 1, Total Reward: -100
Epoch: 2, Total Reward: -100
Epoch: 3, Total Reward: -100
Epoch: 4, Total Reward: -100
Epoch: 5, Total Reward: -100
Epoch: 6, Total Reward: -100
Epoch: 7, Total Reward: -100
Epoch: 8, Total Reward: -100
Epoch: 9, Total Reward: -100
Epoch: 10, Total Reward: -100
Epoch: 11, Total Reward: -100
Epoch: 12, Total Reward: -100
Epoch: 13, Total Reward: -100
Epoch: 14, Total Reward: -100
Epoch: 15, Total Reward: -100
Epoch: 16, Total Reward: -100
Epoch: 17, Total Reward: -100
Epoch: 18, Total Reward: -100
Epoch: 19, Total Reward: -100
Epoch: 20, Total Reward: -100
Epoch: 21, Total Reward: -100
Epoch: 22, Total Reward: -100
Epoch: 23, Total Reward: -100
Epoch: 24, Total Reward: -100
Epoch: 25, Total Reward: -100
Epoch: 26, Total Reward: -100
Epoch: 27, Total Reward: -100
Epoch: 28, Total Reward: -100
Epoch: 29, Total Reward: -100
Epoch: 30, Total Reward: -100
Epoch: 31, Total Reward: -100
Epoch: 32, Total Reward: -100
Epoch: 33, Total Reward: -100
Epoch: 34, Total Re

## Our Code

In [41]:
# Define the maze
maze = [
    [1, 1, 1, 1, 1, 1, 1, 1, 1],
    [1, 1, 1, 1, 1, 0, 1, 1, 1],
    [1, 1, 1, 1, 1, 0, 0, 0, 0],
    [1, 1, 1, 1, 1, 0, 1, 1, 1],
    [1, 1, 1, 1, 1, 0, 1, 1, 1],
    [1, 1, 1, 1, 1, 0, 1, 1, 1],
    [1, 1, 1, 1, 1, 0, 1, 1, 1],
    [1, 1, 1, 1, 1, 0, 1, 1, 1],
    [1, 1, 1, 1, 1, 0, 1, 1, 1],
    [1, 1, 1, 0, 0, 1, 1, 1, 1],
    [1, 1, 1, 0, 0, 1, 1, 1, 1],
    [1, 1, 1, 0, 0, 1, 1, 1, 1],
    [1, 1, 1, 0, 0, 1, 1, 1, 1],
    [1, 1, 1, 0, 0, 0, 0, 0, 0],
    [1, 1, 1, 1, 1, 1, 1, 1, 1]
]
source = (2,8)
destination = (13,8)

# Define the maze
maze = np.array([
    [1, 1, 0, 1],
    [1, 1, 0, 0],
    [1, 1, 0, 1],
    [1, 0, 0, 1],
    [1, 0, 0, 0]
])
source = (1,3)
destination = (4,3)

In [42]:
def binary_to_decimal(binary):
    decimal = int(binary, 2)
    return decimal

# Example usage:
binary_number = "101"
decimal_value = binary_to_decimal(binary_number)
print("Binary:", binary_number)
print("Decimal:", decimal_value)


Binary: 101
Decimal: 5


In [43]:

def get_current_state(maze, current_location, facing):
    row, col = current_location
    # Calculate the next location based on the facing direction
    if facing == 'up':
        front_row, front_col = row - 1, col
        left_row, left_col = row, col - 1
        right_row, right_col = row, col + 1
        # Check if there is a path to the left
        left_value = 0 if left_col > 0 and maze[left_row][left_col] == 0 else 1
        # Check if there is a path to the right
        right_value = 0 if right_col < len(maze[0]) and maze[right_row][right_col] == 0 else 1
        # Check if there is a path to the front
        front_value = 0 if front_row > 0 and maze[front_row][front_col] == 0 else 1
        # Check if there is an obstacle in front
    elif facing == 'down':
        front_row, front_col = row + 1, col
        left_row, left_col = row, col + 1
        right_row, right_col = row, col - 1
        # Check if there is a path to the left
        left_value = 0 if left_col < len(maze[0]) and maze[left_row][left_col] == 0 else 1
        # Check if there is a path to the right
        right_value = 0 if right_col > 0 and maze[right_row][right_col] == 0 else 1
        # Check if there is a path to the front
        front_value = 0 if front_row < len(maze) and maze[front_row][front_col] == 0 else 1
    elif facing == 'left':
        front_row, front_col = row, col - 1
        left_row, left_col = row + 1, col
        right_row, right_col = row - 1, col
        # Check if there is a path to the left
        left_value = 0 if left_row < len(maze) and maze[left_row][left_col] == 0 else 1
        # Check if there is a path to the right
        right_value = 0 if right_row > 0 and maze[right_row][right_col] == 0 else 1
        # Check if there is a path to the front
        front_value = 0 if front_col > 0 and maze[front_row][front_col] == 0 else 1
    elif facing == 'right':
        front_row, front_col = row, col + 1
        left_row, left_col = row - 1, col
        right_row, right_col = row + 1, col
        # Check if there is a path to the left
        left_value = 0 if left_row >0 and maze[left_row][left_col] == 0 else 1
        # Check if there is a path to the right
        right_value = 0 if right_row <  len(maze) and maze[right_row][right_col] == 0 else 1
        # Check if there is a path to the front
        front_value = 0 if front_col <  len(maze[0]) and maze[front_row][front_col] == 0 else 1
    return (left_value, right_value, front_value), binary_to_decimal(str(left_value) + str(right_value) + str(front_value))

# Example usage:
# current_location = (2, 8)  # Example starting location
current_location = (1, 3)  # Example starting location
# facing_direction = 'up'    # Example starting facing direction
get_current_state(maze, current_location, facing='up') # output: [0, 1, 1]<binary>, 3 <decimal>
get_current_state(maze, current_location, facing='left') # output: [0, 1, 1]<binary>, 6 <decimal>
get_current_state(maze, current_location, facing='right') # output: [1, 1, 1]<binary>, 7 <decimal>
get_current_state(maze, current_location, facing='down') # output: [1, 0, 1] <binary> 5 <decimal>


((1, 0, 1), 5)

In [44]:
def is_valid_coords(maze, coordinates):
    x, y = coordinates
    if (x>0) and (x < len(maze)) and (y > 0) and (y < len(maze[0])):
        return True
    else:
        return False

def move(maze, facing, action, current_coordinates):
    x, y = current_coordinates
    new_facing = facing
    new_coordinates = current_coordinates

    # Perform the chosen action
    if action == 0:
      action = "left"
    elif action == 1:
      action = "right"
    elif action == 2:
      action = "forward"
    elif action not in ["left", "right", "forward"]:
      raise ValueError(f"Amigo! action is not supposed to be {action}")  # Raises a ValueError

    if action == 'left':
        if facing == 'up':
            new_facing = 'left'
        elif facing == 'down':
            new_facing = 'right'
        elif facing == 'left':
            new_facing = 'down'
        elif facing == 'right':
            new_facing = 'up'
        # new_coordinates = current_coordinates
    elif action == 'right':
        if facing == 'up':
            new_facing = 'right'
        elif facing == 'down':
            new_facing = 'left'
        elif facing == 'left':
            new_facing = 'up'
        elif facing == 'right':
            new_facing = 'down'
        # new_coordinates = current_coordinates
    elif action == "forward":
        if facing == 'up':
            new_coordinates = (x - 1, y)
        elif facing == 'down':
            new_coordinates = (x + 1, y)
        elif facing == 'left':
            new_coordinates = (x, y - 1)
        elif facing == 'right':
            new_coordinates = (x, y + 1)
        if not is_valid_coords(maze, new_coordinates):
            new_coordinates = current_coordinates
        # new_facing = facing
    return new_coordinates, new_facing

# Example usage:
current_facing = 'up'      # Example current facing action
action = 'left'  # Example rotation action ('left' or 'right')

current_coordinates = (0, 0)  # Example starting coordinates
new_coordinates, new_facing = move(maze, current_facing, action, current_coordinates)

print("New Coordinates:", new_coordinates)
print("New Facing Direction:", new_facing)

New Coordinates: (0, 0)
New Facing Direction: left


In [49]:
from IPython.display import clear_output
def display(maze, source, destination, current_position, path=None):
    clear_output()
    for i in range(len(maze)):
        for j in range(len(maze[0])):
            if (i, j) == source:
                print('S', end=' ')
            elif (i, j) == destination:
                print('D', end=' ')
            elif (i, j) == current_position:
                print('C', end=' ')
            elif path and (i, j) in path:
                print('*', end=' ')  # Mark the path with '*'
            elif maze[i][j] == 1:
                print('#', end=' ')
            else:
                print('.', end=' ')
        print()


## Training Q-network

In [72]:
import numpy as np

# Define Q-learning parameters
num_actions = 4  # Up, Down, Left, Right
learning_rate = 0.1
discount_factor = 0.9
exploration_rate = 0.1
exploration_rate_decay = 0.87
num_episodes = 100000

# Define the number of states and actions
num_states = 2 ** 3  # 3 binary inputs
num_actions = 3      # left, right, forward

# Initialize Q-table with random values
q_values = np.zeros((num_states, num_actions))

for episode in range(num_episodes):
    # state = (0, 0)  # Starting position
    current_coordinates = source
    facing = "left"
    state_binary, state = get_current_state(maze, current_coordinates, facing)
    path = [state]

    # while state != (maze.shape[0] - 1, maze.shape[1] - 1):  # Continue until reaching the goal
    while current_coordinates != destination:
        # Choose an action using epsilon-greedy strategy
        if np.random.rand() < exploration_rate:
            action = np.random.choice(num_actions)
        else:
            action = np.argmax(q_values[state])

        new_coordinates, new_facing = move(maze, facing, action, current_coordinates)
        new_state_binary, new_state = get_current_state(maze, new_coordinates, new_facing)


        # Update Q-value using the Bellman equation
        reward = -1 if maze[new_coordinates[0], new_coordinates[1]] == 0 else -5  # Penalize hitting a wall
        reward = +30 if new_coordinates == destination else reward # Reward for reaching destination
        q_values[state, action] += learning_rate * (
                reward + discount_factor * np.max(q_values[new_state]) - q_values[state, action])

        # Move to the new state
        facing = new_facing
        current_coordinates = new_coordinates
        state = new_state
        path.append(current_coordinates)

    # Decrease the exploration rate
    exploration_rate *= exploration_rate_decay
    if episode % 100 == 0:
        display(maze=maze, path=path, source = source, destination= (maze.shape[0] - 1, maze.shape[1] - 1), current_position=(maze.shape[0] - 1, maze.shape[1] - 1))
        print(f'path: len.{len(path)} {path}')

print("Training complete!")
# visualize_path(path)
display(maze=maze, path=path, source = source, destination= (maze.shape[0] - 1, maze.shape[1] - 1), current_position=(maze.shape[0] - 1, maze.shape[1] - 1))
print(f'path: len.{len(path)} {path}')

# # . # 
# # * S 
# # * # 
# . * # 
# . * D 
path: len.8 [6, (1, 2), (1, 2), (2, 2), (3, 2), (4, 2), (4, 2), (4, 3)]


# Using Trained Q-Network

In [74]:
source  = (1,3)
destination  = (4,3)

current_coordinates = source
facing = "left"
state_binary, state = get_current_state(maze, current_coordinates, facing)
path = [state]

# while state != (maze.shape[0] - 1, maze.shape[1] - 1):  # Continue until reaching the goal
while current_coordinates != destination:
    # Choose an action using epsilon-greedy strategy
    # if np.random.rand() < exploration_rate:
    #     action = np.random.choice(num_actions)
    # else:
    action = np.argmax(q_values[state])

    new_coordinates, new_facing = move(maze, facing, action, current_coordinates)
    new_state_binary, new_state = get_current_state(maze, new_coordinates, new_facing)


    # Update Q-value using the Bellman equation
    # reward = -1 if maze[new_coordinates[0], new_coordinates[1]] == 0 else -5  # Penalize hitting a wall
    # reward = +5 if new_coordinates == destination else reward # Reward for reaching destination
    # q_values[state, action] += learning_rate * (
    #         reward + discount_factor * np.max(q_values[new_state]) - q_values[state, action])

    # Move to the new state
    facing = new_facing
    current_coordinates = new_coordinates
    state = new_state
    path.append(current_coordinates)

# Decrease the exploration rate
exploration_rate *= exploration_rate_decay
if episode % 100 == 0:
    display(maze=maze, path=path, source = (0,0), destination= (maze.shape[0] - 1, maze.shape[1] - 1), current_position=(maze.shape[0] - 1, maze.shape[1] - 1))

print("Training complete!")
# visualize_path(path)
display(maze=maze, path=path, source = source, destination= (maze.shape[0] - 1, maze.shape[1] - 1), current_position=(maze.shape[0] - 1, maze.shape[1] - 1))
print(f'path: len.{len(path)} {path}')

# # . # 
# # * S 
# # * # 
# . * # 
# . * D 
path: len.8 [6, (1, 2), (1, 2), (2, 2), (3, 2), (4, 2), (4, 2), (4, 3)]


In [73]:
q_values

array([[-0.14567543,  0.02899212, -0.26580594],
       [15.50059371, -0.15679   , -0.57513834],
       [-0.5       , -0.76106002, 19.25654426],
       [16.33088984, -2.59484078, -2.47422423],
       [-0.85218008, -0.70064572, 12.95053434],
       [-2.28421257, -2.77678508, -2.6859972 ],
       [-1.61504568, -1.84850008, 12.09657983],
       [-2.71166051, -3.13316787, -3.18291114]])

## Flatten Q-Table

In [77]:
# Flatten the Q-table for copying to Arduino code
flattened_q_values = q_values.flatten()

# Print the flattened Q-table
print("Flattened Q-table:")
print(list(flattened_q_values))

# '''
# flattened_q_values = [-9.18164711, -9.17507953, -7.068638, -9.17692409, -9.18298807 , -9.77915116, -9.39796754, -9.38007022, -6.2751935, -6.75351522 , -10.24702078, -10.24567276, -9.40852981, -7.3617742, -9.44444133 , -9.94720034, -9.25595313, -9.86207442, -9.46110306, -9.51361719 , -7.36628637, -10.18411287, -10.44989276, -10.24705776]
# '''

Flattened Q-table:
[-0.14567542786524684, 0.028992119894766844, -0.2658059392757915, 15.500593710459835, -0.15679, -0.5751383360655641, -0.5, -0.7610600150158306, 19.256544264254092, 16.330889837828664, -2.594840781380889, -2.474224227054556, -0.852180077055561, -0.7006457235470309, 12.950534339413846, -2.2842125693118476, -2.776785077874974, -2.6859971955040907, -1.6150456834794744, -1.8485000844551536, 12.096579828480882, -2.7116605133971876, -3.133167873327339, -3.182911142097555]


In [79]:
f=[-0.14567542786524684, 0.028992119894766844, -0.2658059392757915, 15.500593710459835, -0.15679, -0.5751383360655641, -0.5, -0.7610600150158306, 19.256544264254092, 16.330889837828664, -2.594840781380889, -2.474224227054556, -0.852180077055561, -0.7006457235470309, 12.950534339413846, -2.2842125693118476, -2.776785077874974, -2.6859971955040907, -1.6150456834794744, -1.8485000844551536, 12.096579828480882, -2.7116605133971876, -3.133167873327339, -3.182911142097555]
f

[-0.14567542786524684,
 0.028992119894766844,
 -0.2658059392757915,
 15.500593710459835,
 -0.15679,
 -0.5751383360655641,
 -0.5,
 -0.7610600150158306,
 19.256544264254092,
 16.330889837828664,
 -2.594840781380889,
 -2.474224227054556,
 -0.852180077055561,
 -0.7006457235470309,
 12.950534339413846,
 -2.2842125693118476,
 -2.776785077874974,
 -2.6859971955040907,
 -1.6150456834794744,
 -1.8485000844551536,
 12.096579828480882,
 -2.7116605133971876,
 -3.133167873327339,
 -3.182911142097555]