In [3]:
import os

import numpy as np
import gymnasium as gym

import gym_env

In [4]:
# Construct the environment
env = gym.make("simple-5x5")
# reset() returns an (observation, info) pair; as the cell's last expression it is displayed below
env.reset()

({'agent': array([0, 0]), 'target': array([4, 4])}, {'distance': 8.0})

In [5]:
# Maze layout held by the underlying (unwrapped) environment, and its total cell count
maze = env.unwrapped.maze
size = maze.size

# Start and goal locations as stored on the unwrapped environment
start_loc = env.unwrapped.start_loc
target_loc = env.unwrapped.target_loc
target_locs = [target_loc]

# Integer ids 0 .. n-1 of the discrete actions
actions = np.arange(env.action_space.n, dtype=int)

In [6]:
# Sanity check: show the values just read from the environment
print(f"actions: {actions}, start loc: {start_loc}, target loc: {target_loc}, size: {size}")


actions: [0 1 2 3], start loc: [0 0], target loc: [4 4], size: 25


In [7]:
def get_blocked_states(maze):
    """
    Collects the coordinates of every wall cell in the maze.

    Scans the 2-D array `maze` in row-major order and returns a list of
    (row, col) tuples for the cells whose value equals the string "1".
    """
    n_rows, n_cols = maze.shape[0], maze.shape[1]
    return [
        (r, c)
        for r in range(n_rows)
        for c in range(n_cols)
        if maze[r, c] == "1"
    ]

In [8]:
def create_transition_matrix_mapping(maze):
    """
    Creates a mapping from maze state indices to transition matrix indices
    Returns for both the full maze (mapping_b), and the maze with no borders (mapping_nb)

    Parameters
    ----------
    maze : 2-D array of str
        Maze layout; a cell equal to "1" is a wall (the same test used by
        `get_blocked_states`). The maze does not have to be square.

    Returns
    -------
    (mapping_nb, mapping_b) : tuple of dict
        mapping_nb maps each non-wall (row, col) to a consecutive index,
        mapping_b maps every (row, col) to its row-major index.
    """
    # Use both dimensions of the array rather than len(maze) for each axis,
    # so rectangular mazes are indexed correctly.
    n_rows, n_cols = maze.shape[0], maze.shape[1]

    mapping_nb, mapping_b = {}, {}

    for i in range(n_rows):
        for j in range(n_cols):
            # Every cell, wall or not, gets an index in the full mapping
            mapping_b[(i, j)] = len(mapping_b)
            # Only cells that are NOT walls get an index in the reduced mapping;
            # checking the cell directly avoids scanning a list of walls per cell
            if maze[i, j] != "1":
                mapping_nb[(i, j)] = len(mapping_nb)

    return mapping_nb, mapping_b

In [9]:
# Forward lookups: (row, col) -> matrix index
mapping_nb, mapping_b = create_transition_matrix_mapping(maze)

# Inverse lookups: matrix index -> (row, col)
reverse_mapping_nb = {idx: cell for cell, idx in mapping_nb.items()}
reverse_mapping_b = {idx: cell for cell, idx in mapping_b.items()}

In [10]:
def get_transition_matrix(size, mapping, grid=None, successor_fn=None):
    """
    Builds a (size x size) matrix marking which states can follow which.

    Parameters
    ----------
    size : int
        Number of rows/columns of the returned matrix.
    mapping : dict
        Maps (row, col) maze coordinates to matrix indices.
    grid : 2-D array of str, optional
        Maze layout ("1" = wall, "G" = goal). Defaults to the notebook-level
        `maze`, which keeps existing two-argument calls working.
    successor_fn : callable, optional
        Called with a (row, col) tuple; must return an iterable whose items
        carry the successor coordinates in their first element. Defaults to
        `env.unwrapped.get_successor_states`.

    Returns
    -------
    numpy.ndarray
        Matrix T with T[i, j] == 1 when j is a listed successor of i. Goal
        cells get a single self-loop; wall cells keep an all-zero row.

    NOTE(review): entries are 0/1 flags, not normalised probabilities, and any
    non-goal cell that lists itself as a successor also gets a 1 on the
    diagonal. Code that detects terminal states via `np.diag(T) == 1` would
    treat such a cell as terminal - confirm this is intended.
    """
    # Fall back to the notebook globals only when the caller did not pass them,
    # so the function can also be used (and tested) without hidden state.
    if grid is None:
        grid = maze
    if successor_fn is None:
        successor_fn = env.unwrapped.get_successor_states

    T = np.zeros(shape=(size, size))

    for row in range(grid.shape[0]):
        for col in range(grid.shape[1]):
            cell = grid[row, col]

            # Walls have no outgoing transitions
            if cell == '1':
                continue

            idx_cur = mapping[row, col]

            # The goal is terminal: it only transitions to itself
            if cell == 'G':
                T[idx_cur, idx_cur] = 1
                continue

            for successor_state in successor_fn((row, col)):
                next_loc = successor_state[0]
                T[idx_cur, mapping[next_loc[0], next_loc[1]]] = 1

    return T

In [11]:
def select_action(policy, action_values=None, epsilon=0.2, temperature=1.0, rng=None):
    """
    Picks an action index from per-action values using the named policy.

    The earlier version referenced the undefined names `random_action` and
    `max_action` and returned None for "softmax"; this version computes both
    from `action_values`.

    Parameters
    ----------
    policy : str
        "e-greedy" or "softmax".
    action_values : 1-D sequence of float
        One value per action; higher is better. Required.
    epsilon : float, optional
        Exploration probability for "e-greedy" (default 0.2, the value that
        was previously hard-coded in the function).
    temperature : float, optional
        Softmax temperature; must be positive.
    rng : numpy.random.Generator, optional
        Source of randomness; a fresh default generator is used when omitted.

    Returns
    -------
    int
        Index of the chosen action.

    Raises
    ------
    ValueError
        If `action_values` is missing, empty or not 1-D, if `temperature`
        is not positive, or if `policy` is not recognised.
    """
    if action_values is None:
        raise ValueError("action_values is required to select an action")

    values = np.asarray(action_values, dtype=float)
    if values.ndim != 1 or values.size == 0:
        raise ValueError("action_values must be a non-empty 1-D sequence")

    if rng is None:
        rng = np.random.default_rng()

    if policy == "softmax":
        if temperature <= 0:
            raise ValueError("temperature must be positive")
        # Subtract the max before exponentiating for numerical stability
        weights = np.exp((values - values.max()) / temperature)
        probs = weights / weights.sum()
        return int(rng.choice(values.size, p=probs))

    if policy == "e-greedy":
        # Explore with probability epsilon, otherwise exploit the best value
        if rng.uniform(low=0, high=1) < epsilon:
            return int(rng.integers(values.size))
        return int(np.argmax(values))

    raise ValueError(f"unknown policy: {policy!r}")

In [12]:
def one_hot_row(s_idx, size):
    """
    Returns a float vector of length `size` that is zero everywhere
    except at position `s_idx`, where it is one.
    """
    indicator = np.zeros(size)
    indicator[s_idx] = 1
    return indicator

In [13]:
# Transition matrix over every maze cell (walls included), indexed through mapping_b
T = get_transition_matrix(size=size, mapping=mapping_b)

In [14]:
# Boolean mask of terminal states: those with a 1 on the diagonal of T
terminals = np.diagonal(T) == 1

# P = T_{NT}: rows are non-terminal states, columns are terminal states
P = T[np.ix_(~terminals, terminals)]

# Reward vector: 10 at terminal states, -1 everywhere else
r = np.where(terminals, 10, -1)
# Cost is the negated reward
c = -r

In [15]:
# Hyperparameters
alpha, gamma, epsilon = 0.1, 0.9, 0.2

# Number of environment steps to run
num_steps = 25000

# Per-state vectors covering every cell of the maze
Z = np.zeros(size)
V = np.zeros(size)

# Identity matrices with one row per non-terminal state:
# DR is the initial default representation, one_hot supplies the indicator rows
DR = np.eye(size - sum(terminals))
one_hot = np.eye(DR.shape[0])

In [None]:
# Set Z at the terminal states to exp(r / gamma); only the masked entries are written
Z[terminals] = np.exp(r[terminals] / gamma)

In [16]:
# Exponentiated terminal rewards, exp(r / gamma): one entry per terminal state
expr = np.exp(r[terminals] / gamma)

In [17]:
def importance_sampling(state, s_new_idx, z_values=None, successor_states=None, index_map=None):
    """
    Importance weight of the transition from `state` to the state at `s_new_idx`.

    With a uniform probability p over the successors of `state`, the weight is
    p * Z[s_new] divided by the sum of p * Z[s] over all successors s.

    Parameters
    ----------
    state : (row, col)
        Current location; only used to look up successors when
        `successor_states` is not given.
    s_new_idx : int
        Index into the Z vector of the state that was actually reached.
    z_values : array-like, optional
        Z vector to read from. Defaults to the notebook-level `Z`.
    successor_states : sequence, optional
        Successors of `state`, each carrying its (row, col) in its first
        element. Defaults to `env.unwrapped.get_successor_states(state)`.
    index_map : dict, optional
        Maps (row, col) to an index into the Z vector. Defaults to `mapping_b`.

    Returns
    -------
    float
        The weight, or 1.0 (a neutral weight) when there are no successors or
        all successor Z-values sum to zero. Previously the zero-sum case
        produced 0/0 = nan plus a RuntimeWarning; the learning loop already
        replaces nan by 1, so the downstream result is unchanged.
    """
    # Fall back to the notebook globals only when the caller did not pass them
    if successor_states is None:
        successor_states = env.unwrapped.get_successor_states(state)
    if z_values is None:
        z_values = Z
    if index_map is None:
        index_map = mapping_b

    if len(successor_states) == 0:
        return 1.0

    p = 1 / len(successor_states)
    denominator = sum(p * z_values[index_map[(s[0][0], s[0][1])]] for s in successor_states)

    # Guard the division: Z is still zero for states that were never updated
    if denominator == 0:
        return 1.0

    return (p * z_values[s_new_idx]) / denominator

In [18]:
env.reset()

# DR and one_hot only have rows for non-terminal states, whereas mapping_b
# indexes every cell of the maze. Translate full-maze indices into
# non-terminal row numbers instead of relying on the terminal state
# happening to be the last index.
nonterminal_row = np.cumsum(~terminals) - 1

# Iterate through number of steps
for _ in range(num_steps):
    # Current state. Take a snapshot as a plain tuple in case the environment
    # updates agent_loc in place during step() (not visible from here).
    state = tuple(int(v) for v in env.unwrapped.agent_loc)
    state_idx = mapping_b[(state[0], state[1])]

    # Choose action (random for now)
    action = env.unwrapped.random_action()

    # Take action; Gymnasium's step returns (obs, reward, terminated, truncated, info)
    obs, _, terminated, truncated, _ = env.step(action)

    # An episode ends on termination OR truncation (e.g. a time limit);
    # either way start a new episode before learning from further steps
    if terminated or truncated:
        env.reset()
        continue

    # Unpack observation to get new state
    next_state = obs["agent"]
    next_state_idx = mapping_b[(next_state[0], next_state[1])]

    # Importance weight; fall back to a neutral weight when it is undefined or zero
    w = importance_sampling(state, next_state_idx)
    w = 1 if np.isnan(w) or w == 0 else w

    # Update Default Representation (rows addressed in non-terminal coordinates)
    row = nonterminal_row[state_idx]
    next_row = nonterminal_row[next_state_idx]
    DR[row] = (1 - alpha) * DR[row] + alpha * (one_hot[row] + gamma * DR[next_row]) * w

    # Update Z-Values (Z covers the full maze, so it keeps the full-maze index)
    Z[state_idx] = DR[row] @ P @ expr

  w = (p * Z[s_new_idx]) / sum(p * Z[mapping_b[(s[0][0],s[0][1])]] for s in successor_states)


In [19]:
# Z[terminals] = np.exp(r[terminals] / gamma)

In [20]:
# Display the learned Z-values (one entry per maze cell, ordered by mapping_b)
Z

array([2.36024521e+01, 4.52458533e+01, 2.10018597e+02, 7.73747532e+02,
       2.25510102e+03, 4.76807472e+01, 0.00000000e+00, 0.00000000e+00,
       2.06496369e+03, 1.11944567e+04, 1.79777390e+02, 0.00000000e+00,
       0.00000000e+00, 1.24145593e+04, 2.46253958e+04, 5.13745977e+02,
       2.32883238e+03, 8.67252001e+03, 0.00000000e+00, 9.18049503e+04,
       1.45105254e+03, 5.80505024e+03, 1.92796082e+04, 9.26949158e+04,
       6.69104951e+04])

In [21]:
# Maze-shaped grid holding log(Z) per cell, rounded to 2 decimals; walls are labelled "BAR".
# v_maze inherits the dtype of `maze`, so the numbers are stored as text.
v_maze = np.zeros_like(maze)
for cell in np.ndindex(*v_maze.shape):
    if maze[cell] == "1":
        v_maze[cell] = "BAR"
    else:
        v_maze[cell] = round(np.log(Z[mapping_b[cell]]), 2)

In [22]:
# Display the log(Z) grid laid out like the maze
v_maze

array([['3.16', '3.81', '5.35', '6.65', '7.72'],
       ['3.86', 'BAR', 'BAR', '7.63', '9.32'],
       ['5.19', 'BAR', 'BAR', '9.43', '10.11'],
       ['6.24', '7.75', '9.07', 'BAR', '11.43'],
       ['7.28', '8.67', '9.87', '11.44', '11.11']], dtype='<U21')