In [None]:
import agentpy as ap
import numpy as np
import random, json
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import seaborn as sns, IPython
from matplotlib import pyplot as plt, cm
from enum import Enum
import matplotlib.colors as mcolors

class DirectionEnum(Enum):
    """Heading of an agent on the grid.

    Members are numbered clockwise starting at UP (UP=0, RIGHT=1, DOWN=2,
    LEFT=3), so a right turn is "value + 1" and a left turn is "value - 1",
    with wrap-around between LEFT and UP handled by the caller.
    """

    UP, RIGHT, DOWN, LEFT = range(4)

class MazeAgent(ap.Agent):
    """Q-learning agent that walks the maze by turning in place or moving forward.

    A state is the pair ``(grid position, heading)``. Three actions are
    available: ``'turn_left'``, ``'turn_right'`` and ``'move'``. The Q-table
    and the learning parameters (epsilon, alpha, gamma) are read from the
    model parameters.
    """

    def setup(self):
        """Initialise the action set, heading, Q-table and learning parameters."""
        # Turn actions are a signed step through DirectionEnum's clockwise
        # numbering; 'move' leaves the heading unchanged.
        self.actions = {'turn_left': -1, 'turn_right': 1, 'move': 0}
        # (row, column) offset applied by one forward move for each heading.
        self.directions = {
            DirectionEnum.UP: (-1, 0),
            DirectionEnum.DOWN: (1, 0),
            DirectionEnum.LEFT: (0, -1),
            DirectionEnum.RIGHT: (0, 1),
        }
        self.direction = DirectionEnum.UP
        self.env = self.model.env
        self.reward = 0
        # The table is the very object stored in the parameters (no copy),
        # so training updates are visible through ``p.Q`` as well.
        self.Q = self.p.Q
        # Learning policies
        self.epsilon = self.p.epsilon
        self.alpha = self.p.alpha
        self.gamma = self.p.gamma
        # When True, choose_action prints the Q-values of the current state.
        self.prt = False

    def execute(self, action):
        """Apply ``action`` to the agent and return it.

        Turn actions rotate the heading by one step; ``'move'`` advances one
        cell along the current heading and adds the reward of the cell the
        agent lands on to ``self.reward``.

        Raises:
            KeyError: if ``action`` is not one of the known actions.
        """
        if action != 'move':
            # Modulo gives the UP <-> LEFT wrap-around without special cases.
            turned = (self.direction.value + self.actions[action]) % len(DirectionEnum)
            self.direction = DirectionEnum(turned)
        else:
            self.env.move_by(self, self.directions[self.direction])
            # Score the cell that was entered. Scoring the cell being left
            # (as before) could never credit the goal reward, because no
            # move is ever executed while standing on the goal.
            self.reward += self.env.get_reward(self.get_position())
        return action

    def get_position(self):
        """Return the agent's current position in the environment."""
        return self.env.positions[self]

    def get_state(self):
        """Return the current state as ``(position, direction)``."""
        return (self.get_position(), self.direction)

    def _reset_to_start(self):
        """Put the agent back on the start cell, facing UP."""
        self.env.move_to(self, self.p.start)
        self.direction = DirectionEnum.UP

    def train(self, episodes=0):
        """Run ``episodes`` training episodes, updating the Q-table in place.

        An episode lasts until the agent stands on the goal cell; there is no
        step limit. The agent is returned to the start cell facing UP after
        every episode and once more when training finishes (so also when
        ``episodes`` is 0).
        """
        for _ in range(episodes):
            self.direction = DirectionEnum.UP
            state = self.get_state()
            visited = set()
            while state[0] != self.p.goal:
                # The current cell counts as visited before the action is
                # taken, which is what the training reward is checked against.
                visited.add(state[0])
                action = self.choose_action()
                self.execute(action)
                new_state = self.get_state()
                reward = self.env.get_reward_training(state[0], new_state[0], visited)
                self.update_Q(state, action, reward, new_state)
                state = new_state
            self._reset_to_start()
            # Re-run the environment's setup hook between episodes.
            self.env.setup()
        self._reset_to_start()

    def choose_action(self):
        """Pick an action for the current state with an epsilon-greedy policy.

        With probability ``epsilon`` a random action is returned; otherwise
        the action with the highest Q-value (first one wins on ties).
        """
        state = self.get_state()
        if random.uniform(0, 1) < self.epsilon:
            return random.choice(list(self.actions))
        if self.prt:
            print(self.Q[state])
        return max(self.Q[state], key=self.Q[state].get)

    def update_Q(self, state, action, reward, new_state):
        """Apply the Q-learning update for one observed transition.

        Q(s, a) += alpha * (reward + gamma * max_a' Q(s', a') - Q(s, a))
        """
        max_Q_new_state = max(self.Q[new_state].values())
        self.Q[state][action] += self.alpha * (
            reward + self.gamma * max_Q_new_state - self.Q[state][action]
        )


class Maze(ap.Grid):
    """Maze environment: a grid whose rewards come from the model parameters."""

    def setup(self):
        """Nothing to initialise for the maze environment."""
        pass

    def get_reward_training(self, state, new_state, visited):
        """Return the shaped reward used to update Q-values during training.

        ``state`` and ``new_state`` are the positions before and after the
        action; ``visited`` holds the positions already seen this episode.
        Checked in order: goal reached, wall entered, cell revisited, then
        whether the Manhattan distance to the goal shrank.
        """
        goal = self.p.goal

        def steps_to_goal(cell):
            # Manhattan distance from ``cell`` to the goal.
            return abs(cell[0] - goal[0]) + abs(cell[1] - goal[1])

        dist_before = steps_to_goal(state)
        dist_after = steps_to_goal(new_state)

        if new_state == goal:
            # Reward for reaching the goal
            return self.p.reward
        if self.p.maze[new_state] == -1:
            # Penalty for hitting a wall
            return self.p.penalty
        if new_state in visited:
            # Penalty for revisiting a cell
            return -5
        # +1 for getting closer to the goal, -1 otherwise
        return 1 if dist_after < dist_before else -1

    def get_reward(self, state):
        """Return the plain reward of standing on ``state``.

        Goal reward on the goal cell, wall penalty on a wall cell (-1 in the
        maze array) and -1 on any other cell.
        """
        if state == self.p.goal:
            return self.p.reward
        return self.p.penalty if self.p.maze[state] == -1 else -1

class MazeModel(ap.Model):
    """Model that trains one MazeAgent and then replays its greedy policy.

    Training happens entirely inside ``setup``; every subsequent model step
    executes one greedy action until the agent reaches the goal.
    """

    def setup(self):
        """Build the grid, place the agent on the start cell and train it."""
        # Size the grid from the maze given in the parameters instead of
        # relying on a module-level ``maze`` variable being defined.
        self.env = Maze(self, shape=self.p.maze.shape)
        self.agent = MazeAgent(self)
        self.env.add_agents([self.agent], positions=[self.p.start])
        self.agent.train(self.p.episodes)
        # After training: act greedily and start the reward count from zero.
        self.agent.epsilon = 0
        self.agent.reward = 0

    def step(self):
        """Let the agent choose and execute one action, printing the action."""
        action = self.agent.choose_action()
        self.agent.execute(action)
        print(action)

    def update(self):
        """Stop the simulation once the agent stands on the goal cell."""
        if self.agent.get_position() == self.p.goal:
            self.stop()

    def end(self):
        """Report the agent's Q-table under the key 'Q-Table'."""
        self.report('Q-Table', self.agent.Q)




def animation_plot(model, ax):
    """Draw one animation frame: the maze, the goal and the agent on ``ax``.

    Cell codes and their colours: 0 free (white), -1 wall (black),
    1 goal (green), 2 agent (blue). The agent is painted last, so it hides
    the goal when standing on it.
    """
    color_dict = {0:'#ffffff', -1:'#000000', 2:'#0000ff', 1:'#00ff00'}

    grid = np.zeros(model.p.maze.shape)
    grid[model.p.maze == -1] = -1
    grid[model.p.goal] = 1

    agent = list(model.env.agents)[0]
    grid[model.env.positions[agent]] = 2

    # Draw once, after the agent has been marked; an earlier draw without the
    # agent would only be painted over by this one.
    ap.gridplot(grid, ax=ax, color_dict=color_dict, convert=True)

    ax.set_title(f"Agent Q-Learning\nReward: {agent.reward} | Direction: {agent.direction}")
    



# Action names; must match the keys of MazeAgent.actions.
actions = ['turn_left', 'turn_right', 'move']

# Maze layout: -1 marks a wall cell, 0 a free cell.
maze = np.array([[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1],
                 [-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, -1],
                 [-1, -1, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, -1, 0, -1],
                 [-1, 0, 0, -1, -1, -1, -1, -1, 0, 0, 0, 0, -1, 0, 0, -1, -1, -1, 0, -1],
                 [-1, 0, 0, 0, 0, -1, -1, -1, 0, -1, -1, -1, -1, 0, 0, 0, -1, 0, 0, -1],
                 [-1, -1, -1, 0, -1, -1, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, -1],
                 [-1, 0, 0, -1, -1, 0, 0, -1, 0, 0, 0, 0, -1, -1, 0, 0, -1, -1, 0, -1],
                 [-1, -1, 0, -1, 0, 0, 0, 0, -1, -1, -1, 0, 0, -1, -1, 0, 0, 0, 0, -1],
                 [-1, 0, -1, -1, 0, 0, 0, 0, 0, -1, 0, 0, -1, -1, 0, 0, 0, 0, 0, -1],
                 [-1, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1],
                 [-1, 0, 0, 0, 0, -1, -1, 0, 0, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, -1],
                 [-1, -1, 0, -1, -1, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, -1, -1],
                 [-1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, -1, -1, 0, -1],
                 [-1, 0, 0, -1, -1, -1, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, -1, 0, -1],
                 [-1, 0, -1, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1],
                 [-1, 0, -1, -1, 0, 0, -1, 0, 0, 0, 0, 0, -1, -1, 0, 0, -1, -1, -1, -1],
                 [-1, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, -1],
                 [-1, 0, -1, 0, -1, 0, -1, 0, -1, -1, 0, 0, 0, 0, 0, 0, -1, -1, 0, -1],
                 [-1, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1],
                 [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1]])

#maze = np.zeros((5, 5))
n = len(maze)
# Use both dimensions so the Q-table also covers non-square mazes.
n_rows, n_cols = maze.shape

# Initialize Q-values with 0: one entry per (cell, heading) state, each
# holding a value for every action.
Q = {}
for x in range(n_rows):
    for y in range(n_cols):
        for direction in DirectionEnum:
            Q[((x, y), direction)] = {action: 0 for action in actions}

parameters = {
    'maze': maze,
    'start': (18, 17),
    'goal': (8, 10), 
    'epsilon': 0.7, 
    'alpha': 0.4,
    'gamma': 0.5,
    'episodes': 1000,
    'steps': 100,
    'reward': 1000,
    'penalty': -1000,
    'Q': Q
}


# Train the agent (done in MazeModel.setup) and animate its greedy run.
# NOTE: the name `animation` below rebinds the `matplotlib.animation` module
# alias imported at the top of the file; it is kept for compatibility.

fig = plt.figure(figsize=(7,7))
ax = fig.add_subplot(111)
mazeModel = MazeModel(parameters)
animation = ap.animate(mazeModel, fig, ax, animation_plot)
IPython.display.HTML(animation.to_jshtml())




turn_left
move
move
move
move
move
move
move
turn_right
move
move
move
move
move
move
turn_left
move
move
turn_right
move
move
turn_left
move
turn_left
move
turn_right
move
move
turn_left
move
turn_right
move
move
move
turn_right
move
move
turn_right
move
move
turn_left
move
move
move
turn_right
move
move
turn_left
move
move
turn_right
move
move
turn_right
move
turn_left
move
move
move
turn_right
move
move
turn_right
move
