In [107]:
import numpy as np
import matplotlib.pyplot as plt

In [102]:
# Setup
# Seed NumPy's global RNG before anything draws from it (random Q-table
# initialisation, epsilon-greedy exploration, env.action_space_sample), so a
# fresh "Restart & Run All" reproduces the same training run.
SEED = 0
np.random.seed(SEED)

# 5x5 maze; MiniMazeEnv must already be defined when this cell runs.
env = MiniMazeEnv(size=5)

# Hyperparameters
episodes = 10   # number of training episodes run by the training loop
epsilon = 0.9   # probability that the agent picks a random action instead of the greedy one
lr = 0.01       # learning rate: fraction of the TD error applied on each update
gamma = 0.8     # discount factor applied to the best next-state value


In [112]:
# Q-learning - Off-Policy
class QLearning():
    """Tabular Q-learning agent with an epsilon-greedy behaviour policy.

    Parameters
    ----------
    env : object
        Environment exposing ``state_space_n``, ``action_space_n`` and
        ``action_space_sample()``. If it also exposes ``terminal_states``
        (an iterable of state indices), the Q-values of those states are
        initialised to zero.
    epsilon : float
        Probability of taking a random action in :meth:`policy`.
    lr : float
        Learning rate (step size) used in :meth:`update`.
    gamma : float
        Discount factor applied to the bootstrapped next-state value.
    """

    def __init__(self, env, epsilon, lr, gamma):
        self.env = env
        # self.Q_table = np.zeros((env.state_space_n, env.action_space_n))
        self.Q_table = np.random.rand(env.state_space_n, env.action_space_n)

        # No action is ever taken from a terminal state, so its value is 0 by
        # definition. Leaving random values there would leak noise into the
        # target of every transition that ends an episode.
        for terminal_state in getattr(env, 'terminal_states', ()):
            self.Q_table[terminal_state, :] = 0.0

        self.epsilon = epsilon
        self.lr = lr
        self.gamma = gamma

    def policy(self, state):
        """Return an action for ``state`` using epsilon-greedy selection.

        With probability ``epsilon`` a random action is sampled from the
        environment; otherwise the action with the highest Q-value is chosen.
        """
        if np.random.uniform(0, 1) < self.epsilon:
            return self.env.action_space_sample()
        return np.argmax(self.Q_table[state, :])

    def update(self, old_state, old_action, new_state, reward, done=False):
        """Apply one Q-learning update for an observed transition.

        Parameters
        ----------
        old_state, old_action : int
            State the agent was in and the action it took.
        new_state : int
            State reached after taking ``old_action``.
        reward : float
            Reward received for the transition.
        done : bool, optional
            Set to True when ``new_state`` ended the episode; the target then
            uses the reward alone and does not bootstrap from ``new_state``.
            Defaults to False so existing four-argument calls keep working.
        """
        prediction = self.Q_table[old_state, old_action]
        # Off-policy target: best value available from the next state,
        # or nothing at all when the episode is over.
        future_value = 0.0 if done else np.max(self.Q_table[new_state, :])
        target = reward + self.gamma * future_value
        self.Q_table[old_state, old_action] = prediction + self.lr * (target - prediction)

# Build the agent from the environment and hyperparameters defined in the setup cell.
agent_qlearning = QLearning(env, epsilon, lr, gamma)

In [227]:
# Grid shape is taken from the environment so the printout stays correct
# if the maze size in the setup cell is changed.
maze_shape = (env.size, env.size)

print('Policy Before Training')
maze_policy = visualize_policy(agent_qlearning.Q_table, maze_shape)
for row in maze_policy:
    print(' '.join(row))

# Undiscounted sum of rewards collected in each episode, in training order.
total_rewards_per_episode = []

# Training Loop
for i in range(episodes):
    state = env.reset()
    episode_reward = 0.0

    done = False
    while not done:
        # Act epsilon-greedily, observe the transition, learn from it.
        action = agent_qlearning.policy(state)

        new_state, reward, done, info = env.step(action)

        agent_qlearning.update(state, action, new_state, reward)

        episode_reward += float(reward)
        state = new_state

    total_rewards_per_episode.append(episode_reward)


# Policy After Training
print('Policy After Training')
maze_policy = visualize_policy(agent_qlearning.Q_table, maze_shape)
for row in maze_policy:
    print(' '.join(row))

Policy Before Training
→ → → → G
→ → → → ↑
→ → → ↑ ↑
→ → → ↑ ↑
S ↑ ↑ ↑ ↑
Policy After Training
→ → → → G
→ → → → ↑
→ → → ↑ ↑
→ → → ↑ ↑
S ↑ ↑ ↑ ↑


In [70]:
# Gym-like Maze Environment
class MiniMazeEnv:
    """
    Square grid maze of configurable side length with 4 discrete actions.

    Action codes: 0 = Left, 1 = Down, 2 = Right, 3 = Up.
    States are numbered ``row * size + col`` with row 0 drawn at the bottom,
    so state 0 (the start) is the bottom-left cell and the last state
    (the goal) is the top-right cell. Entering the goal yields a reward
    of 1 and ends the episode; every other cell yields 0.
    """

    def __init__(self, size=5):
        n_states = size * size

        self.size = size
        self.state_space_n = n_states
        self.action_space_n = 4

        # Agent begins in the bottom-left cell; goal is the last state index
        self.start_state = 0
        self.goal_state = n_states - 1
        self.state = self.start_state

        # Per-state reward table: zero everywhere except the goal
        reward_table = np.zeros(n_states)
        reward_table[self.goal_state] = 1
        self.rewards = reward_table

        # transitions[state][action] -> next state, precomputed for every pair
        self.transitions = {}
        for s in range(n_states):
            outcomes = {}
            for a in range(self.action_space_n):
                outcomes[a] = self._get_next_state(s, a)
            self.transitions[s] = outcomes

        self.terminal_states = [self.goal_state]

    def _get_next_state(self, state, action):
        """Return the state reached from ``state`` by ``action``.

        Moves that would leave the grid keep the agent on the border cell;
        an unrecognised action code leaves the position unchanged.
        """
        row, col = state // self.size, state % self.size
        last = self.size - 1

        if action == 0:  # Left
            col = col - 1 if col > 0 else 0
        elif action == 2:  # Right
            col = col + 1 if col < last else last
        elif action == 1:  # Down
            row = row - 1 if row > 0 else 0
        elif action == 3:  # Up
            row = row + 1 if row < last else last

        return row * self.size + col

    def reset(self):
        """Move the agent back to the start state and return that state."""
        self.state = self.start_state
        return self.state

    def step(self, action):
        """Apply ``action`` and return ``(next_state, reward, done, info)``.

        If the agent already sits on a terminal state nothing moves: the
        current state and its reward are returned again with ``done=True``.
        """
        current = self.state
        if current in self.terminal_states:
            return current, self.rewards[current], True, {}

        landed = self.transitions[current][action]
        reward = self.rewards[landed]
        self.state = landed
        return landed, reward, landed in self.terminal_states, {}

    def render(self):
        """Print the grid: 'A' marks the agent, 'R' the top-right reward cell."""
        n = self.size
        cells = [['.'] * n for _ in range(n)]

        # Row 0 of the maze is the bottom line of the printout
        agent_row, agent_col = divmod(self.state, n)
        cells[n - 1 - agent_row][agent_col] = 'A'

        # Top-right corner shows the reward unless the agent is standing on the goal
        cells[0][n - 1] = 'A' if self.state == self.goal_state else 'R'

        print("\n".join(" ".join(line) for line in cells))
        print('\n')

    def action_space_sample(self):
        """Return one action code drawn uniformly at random."""
        n_actions = self.action_space_n
        return np.random.choice(n_actions)


# env = MiniMazeEnv(size=3)
# state = env.reset()
# env.render()
# env.step(2)
# env.render()


In [91]:
import numpy as np

def visualize_policy(Q_table, shape):
    """Render the greedy policy of a Q-table as a grid of symbols.

    Parameters
    ----------
    Q_table : array-like, one row per state and one column per action
        (column order: left, down, right, up).
    shape : sequence whose first two items are (rows, cols) of the maze.

    Returns
    -------
    list of list of str
        ``rows`` lists of ``cols`` symbols. State 0 is shown as 'S' and the
        last state as 'G'; every other cell shows the arrow of its
        highest-valued action. State numbering starts at the bottom-left,
        so the first returned row is the top of the maze.
    """
    arrows = ('←', '↓', '→', '↑')
    greedy_actions = np.argmax(Q_table, axis=1)

    n_rows, n_cols = shape[0], shape[1]
    last_state = n_rows * n_cols - 1

    def symbol_for(state):
        # Start marker takes precedence over the goal marker
        if state == 0:
            return 'S'
        if state == last_state:
            return 'G'
        return arrows[greedy_actions[state]]

    # Walk maze rows from the highest to the lowest so the output reads top-down
    return [
        [symbol_for(maze_row * n_cols + col) for col in range(n_cols)]
        for maze_row in range(n_rows - 1, -1, -1)
    ]

# Seed NumPy's global RNG so the dummy table below is the same on every run
np.random.seed(0)
# 9 states x 4 actions of uniform random values, standing in for a learned Q-table
dummy_Q_table = np.random.random((9, 4))

# Visualize the policy
maze_policy = visualize_policy(dummy_Q_table, (3, 3))
# Print the grid one row per line, symbols separated by spaces
for row in maze_policy:
    print(' '.join(row))


↑ ↑ G
↓ ↑ ←
S ↑ ←
