In [1]:
import os

import numpy as np
import importlib
import gymnasium as gym
import matplotlib.pyplot as plt
from matplotlib import patches
import matplotlib.animation as manimation
from sklearn.decomposition import PCA


import gym_env
import utils
from utils import create_mapping, get_transition_matrix, create_mapping_nb, get_full_maze_values

## Helper Functions

In [2]:
def policy_reval(agent, r_new):
    """
    Recompute state values for a new reward vector while reusing the DR the agent already holds.

    Only the entries of `r_new` at terminal states are read; the agent's DR,
    `P` matrix and `_lambda` are used unchanged and the agent is not modified.

    Args:
    agent (LinearRL class) : The LinearRL agent (provides terminals, DR, P and _lambda)
    r_new (array) : Updated reward, one entry per state

    Returns:
    V_new (array) : New value of each state, rounded to 2 decimals
    """
    terminal_mask = agent.terminals
    nonterminal_mask = ~terminal_mask

    # exp(r / lambda) evaluated at the terminal states under the new reward
    exp_reward_terminal = np.exp(r_new[terminal_mask] / agent._lambda)

    # Non-terminal x non-terminal sub-block of the DR
    dr_nonterminal = agent.DR[nonterminal_mask][:, nonterminal_mask]

    Z_new = np.zeros(len(r_new))
    Z_new[nonterminal_mask] = dr_nonterminal @ agent.P @ exp_reward_terminal
    Z_new[terminal_mask] = exp_reward_terminal

    return np.round(np.log(Z_new), 2)

## LinearRL-TD Model

In [3]:
class LinearRL:
    """
    Agent that learns a default representation (DR) matrix with temporal-difference
    updates while acting in a gym maze environment, and derives Z-values and state
    values V = log(Z) from that DR.
    """
    def __init__(self, env_name, alpha=0.1, beta=1, gamma=0.904, _lambda=1.0, epsilon=0.4, num_steps=25000, policy="random", imp_samp=True):
        """
        Build the environment, the state mapping / transition matrix and the initial model.

        Args:
        env_name (str) : Id passed to gym.make
        alpha (float) : Learning rate of the DR update
        beta (float) : Temperature used by the "softmax" policy
        gamma (float) : Accepted for backward compatibility but NOT used; self.gamma is
                        always set to exp(reward_nt / _lambda)
        _lambda (float) : Divides the rewards inside the exponentials
        epsilon (float) : Probability of a random action under the "egreedy" policy
        num_steps (int) : Number of environment steps performed by learn()
        policy (str) : Decision policy, one of [random, softmax, egreedy, test]
        imp_samp (bool) : Whether the DR update is scaled by the importance-sampling weight
        """
        self.env = gym.make(env_name)
        self.start_loc = self.env.unwrapped.start_loc
        self.target_locs = self.env.unwrapped.target_locs
        self.maze = self.env.unwrapped.maze
        self.walls = self.env.unwrapped.get_walls()
        self.size = self.maze.size - len(self.walls)   # Size of the state space = size of maze - number of blocked states
        self.height, self.width = self.maze.shape

        # Create mapping and Transition matrix
        self.mapping = create_mapping_nb(self.maze, self.walls)
        self.reverse_mapping = {index: (i, j) for (i, j), index in self.mapping.items()}
        self.T = get_transition_matrix(self.env, self.mapping)

        # Terminal states are the ones whose self-transition probability is 1
        self.terminals = np.diag(self.T) == 1
        # Calculate P = T_{NT} (rows: non-terminal states, columns: terminal states)
        self.P = self.T[~self.terminals][:,self.terminals]
        # Set reward
        self.reward_nt = -1   # Non-terminal state reward
        self.reward_t = -1    # Terminal state reward
        self.r = np.full(len(self.T), self.reward_nt)
        self.r[self.terminals] = self.reward_t
        self.expr_t = np.exp(self.r[self.terminals] / _lambda)
        # Precalculate exp(r) for use with LinearRL equations
        self.expr_nt = np.exp(self.reward_nt / _lambda)

        # Params
        self.alpha = alpha
        self.beta = beta
        # NOTE: the `gamma` argument is intentionally left unused here (original behavior);
        # the discount is tied to the non-terminal reward instead.
        self.gamma = self.expr_nt
        self._lambda = _lambda
        self.epsilon = epsilon
        self.num_steps = num_steps
        self.policy = policy
        self.imp_samp = imp_samp

        # Model
        self.DR = self.get_DR()
        self.Z = np.full(self.size, 0.01)

        self.V = np.zeros(self.size)
        self.one_hot = np.eye(self.size)

    def get_states(self):
        """
        Returns all non-blocked states as well as a mapping of each state (i,j) -> to an index (k)
        """
        states = []
        index_mapping = {}
        index = 0
        for i in range(len(self.maze)):
            for j in range(len(self.maze[i])):
                if self.maze[i][j] in ['0', 'S', 'G']:
                    states.append((i, j))
                    index_mapping[(i, j)] = index
                    index += 1

        return states, index_mapping

    def get_DR(self):
        """
        Returns the DR initialization based on what decision policy we are using.

        "softmax" starts from a matrix filled with 0.01 (to avoid div by zero) with ones on
        the diagonal; every other policy starts from the identity. In both cases the diagonal
        entries of terminal states are set to 1 / (1 - gamma).

        Previously only "random" and "softmax" were handled, so constructing an agent with
        the "egreedy" or "test" policy failed with UnboundLocalError.
        """
        if self.policy == "softmax":
            DR = np.full((self.size, self.size), 0.01)
            np.fill_diagonal(DR, 1)
        else:
            DR = np.eye(self.size)

        terminal_idx = np.where(self.terminals)[0]
        DR[terminal_idx, terminal_idx] = (1/(1-self.gamma))

        return DR

    def update_V(self):
        """
        Recomputes Z from the current DR (non-terminal block @ P @ exp(r_terminal)), sets Z at
        terminal states to exp(r_terminal) and stores V = log(Z) rounded to 2 decimals.
        """
        self.Z[~self.terminals] = self.DR[~self.terminals][:,~self.terminals] @ self.P @ self.expr_t
        self.Z[self.terminals] = self.expr_t
        self.V = np.round(np.log(self.Z), 2)

    def importance_sampling(self, state, s_prob):
        """
        Performs importance sampling P(x'|x)/u(x'|x). P(.) is the default policy, u(.) is the decision policy

        The default policy is taken to be uniform over the successor states of `state`;
        `s_prob` is the probability with which the decision policy chose its action.
        """
        successor_states = self.env.unwrapped.get_successor_states(state)
        p = 1/len(successor_states)
        w = p/s_prob

        return w

    def select_action(self, state, beta=0.5, target_loc=None):
        """
        Action selection based on our policy
        Options are: [random, softmax, egreedy, test]

        Args:
        state (array) : Current agent location (i, j); assumed to support `state + direction`
        beta (float) : Unused, kept for interface compatibility (self.beta is the temperature)
        target_loc (array) : Only read by the "test" policy; a neighbouring state equal to it is chosen immediately

        Returns:
        (action, s_prob) for the "softmax" policy, where s_prob is the probability of the chosen action;
        action alone for every other policy

        Raises:
        ValueError : If self.policy is not one of the supported options
        """
        if self.policy == "random":
            return self.env.unwrapped.random_action()

        elif self.policy == "softmax":
            successor_states = self.env.unwrapped.get_successor_states(state)      # successor_states = [(state, terminated), ...]
            action_probs = np.full(self.env.action_space.n, 0.0)

            v_sum = sum(
                        np.exp((np.log(self.Z[self.mapping[(s[0][0],s[0][1])]] + 1e-20)) / self.beta) for s in successor_states
                        )

            # if we don't have enough info, random action
            if v_sum == 0:
                # learn() unpacks (action, s_prob) for this policy, so the fallback has to return a pair too.
                # The reported probability is uniform over successors, which makes the importance weight 1.
                # NOTE(review): assumes random_action() behaves like the uniform default policy - confirm against the env.
                s_prob = 1.0 / max(len(successor_states), 1)
                return self.env.unwrapped.random_action(), s_prob

            for action in self.env.unwrapped.get_available_actions(state):
                direction = self.env.unwrapped._action_to_direction[action]
                new_state = state + direction

                action_probs[action] = np.exp((np.log(self.Z[self.mapping[(new_state[0], new_state[1])]] + 1e-20)) / self.beta ) / v_sum

            action = np.random.choice(self.env.action_space.n, p=action_probs)
            s_prob = action_probs[action]

            return action, s_prob

        elif self.policy == "egreedy":
            if np.random.uniform(low=0, high=1) < self.epsilon:
                return self.env.unwrapped.random_action()
            else:
                # Unavailable / blocked actions keep -inf so they are never the argmax
                action_values = np.full(self.env.action_space.n, -np.inf)
                for action in self.env.unwrapped.get_available_actions(state):
                    direction = self.env.unwrapped._action_to_direction[action]
                    new_state = state + direction

                    if self.maze[new_state[0], new_state[1]] == "1":
                        continue

                    action_values[action] = round(np.log(self.Z[self.mapping[(new_state[0],new_state[1])]]), 2)

                return np.argmax(action_values)

        elif self.policy == "test":
            action_values = np.full(self.env.action_space.n, -np.inf)
            for action in self.env.unwrapped.get_available_actions(state):
                direction = self.env.unwrapped._action_to_direction[action]
                new_state = state + direction

                # Need this to make it work for now
                if np.array_equal(new_state, target_loc):
                    return action

                if self.maze[new_state[0], new_state[1]] == "1":
                    continue
                action_values[action] = round(np.log(self.Z[self.mapping[(new_state[0],new_state[1])]]), 2)

            return np.nanargmax(action_values)

        # Fail loudly instead of silently returning None for a misspelled policy
        raise ValueError(f"Unknown policy: {self.policy!r}")

    def get_D_inv(self):
        """
        Calculates the DR directly using matrix inversion, used for testing
        """
        I = np.eye(self.size)
        D_inv = np.linalg.inv(I-self.gamma*self.T)

        return D_inv

    def learn(self):
        """
        Agent explores the maze according to its decision policy and updates its DR as it goes

        Runs self.num_steps environment steps (resetting the environment whenever an episode ends),
        applies a TD update to the DR row of the visited state after every step, recomputes Z from
        the DR, and finally stores V = log(Z) rounded to 2 decimals.
        """
        print(f"Decision Policy: {self.policy}, Number of Iterations: {self.num_steps}, lr={self.alpha}, temperature={self.beta}")
        self.env.reset()

        # Iterate through number of steps
        for _ in range(self.num_steps):
            # Current state (always read back from the environment)
            state = self.env.unwrapped.agent_loc
            state_idx = self.mapping[(state[0], state[1])]

            # Choose action; only the softmax policy also reports the probability of its choice
            if self.policy == "softmax":
                action, s_prob = self.select_action(state)
            else:
                action = self.select_action(state)

            # Take action
            obs, _, done, _, _ = self.env.step(action)

            # Unpack observation to get new state
            next_state = obs["agent"]
            next_state_idx = self.mapping[(next_state[0], next_state[1])]

            # Importance sampling
            if self.policy == "softmax":
                w = self.importance_sampling(state, s_prob)
                # Fall back to a neutral weight when the ratio is unusable
                w = 1 if np.isnan(w) or w == 0 else w
            else:
                w = 1

            ## Update default representation
            target = self.one_hot[state_idx] + self.gamma * self.DR[next_state_idx]
            # If we are using importance sampling
            if self.imp_samp:
                self.DR[state_idx] = (1 - self.alpha) * self.DR[state_idx] + self.alpha * target * w
            else:
                self.DR[state_idx] = (1 - self.alpha) * self.DR[state_idx] + self.alpha * target

            ## Update Z-Values
            self.Z = self.DR[:,~self.terminals] @ self.P @ self.expr_t

            # Start a new episode once the current one has terminated
            if done:
                self.env.reset()

        # Set Z at terminal states directly from their reward
        self.Z[self.terminals] = np.exp(self.r[self.terminals] / self._lambda)
        self.V = np.round(np.log(self.Z), 2)


## Train Agents

### D_inv agent

In [4]:
# Reference agent: its DR is not learned but replaced below by a closed-form matrix inverse
agent = LinearRL(
    env_name="simple-5x5-2",
    _lambda=1.0,
    alpha=0.001,
    beta=1.0,
    num_steps=500000,
    policy="softmax",
    imp_samp=True,
)

# Closed-form DR: inverse of (diag(exp(-r / lambda)) - T)
agent.DR = D_inv = np.linalg.inv(np.diag(np.exp(-agent.r / agent._lambda)) - agent.T)
agent.update_V()

maze_values = get_full_maze_values(agent)
print(maze_values)

[[-7.95 -6.36 -4.7  -3.04 -1.  ]
 [-8.66  -inf  -inf -4.42 -3.05]
 [-7.43  -inf  -inf -5.67 -5.01]
 [-5.79 -4.21 -2.65  -inf -5.59]
 [-4.67 -3.03 -1.   -2.66 -4.3 ]]


### Train agent with importance sampling

In [5]:
# Softmax agent whose DR updates are scaled by the importance-sampling weight (imp_samp=True)
agent_with_imp = LinearRL(
    env_name="simple-5x5-2",
    _lambda=1.0,
    alpha=0.001,
    beta=0.2,
    num_steps=500000,
    policy="softmax",
    imp_samp=True,
)

In [6]:
# Train the agent: learn() steps through the environment num_steps times with the softmax
# policy and, because imp_samp=True, scales each DR update by the importance-sampling weight
agent_with_imp.learn()

Decision Policy: softmax, Number of Iterations: 500000, lr=0.001, temperature=0.2


  logger.deprecation(
  logger.warn(f"{pre} is not within the observation space.")
  logger.warn(f"{pre} is not within the observation space.")


In [7]:
# Print out the maze layout and the values the importance-sampling agent learned
maze_values = get_full_maze_values(agent_with_imp)
print("Maze:")
# Use this agent's own maze so the cell does not depend on the separate `agent` object
print(agent_with_imp.maze)
print("-------------------------------------")
print("Values:")
print(maze_values)

Maze:
[['S' '0' '0' '0' 'G']
 ['0' '1' '1' '0' '0']
 ['0' '1' '1' '0' '0']
 ['0' '0' '0' '1' '0']
 ['0' '0' 'G' '0' '0']]
-------------------------------------
Values:
[[-5.25 -5.06 -3.92 -1.77 -1.  ]
 [-5.12  -inf  -inf -3.65 -2.43]
 [-5.05  -inf  -inf -4.91 -4.79]
 [-4.99 -3.57 -1.65  -inf -4.91]
 [-4.66 -2.44 -1.   -1.66 -4.91]]


### Train agent without importance sampling

In [8]:
# Same configuration as the importance-sampling agent, but with imp_samp=False
agent_no_imp = LinearRL(
    env_name="simple-5x5-2",
    _lambda=1.0,
    alpha=0.001,
    beta=0.2,
    num_steps=500000,
    policy="softmax",
    imp_samp=False,
)

In [9]:
# Train agent without importance sampling: same softmax exploration, but the DR update
# uses the plain TD target (imp_samp=False skips the weight)
agent_no_imp.learn()

Decision Policy: softmax, Number of Iterations: 500000, lr=0.001, temperature=0.2


In [10]:
# Print out the maze layout and the values learned without importance sampling
maze_values = get_full_maze_values(agent_no_imp)
print("Maze:")
# Use this agent's own maze so the cell does not depend on the separate `agent` object
print(agent_no_imp.maze)
print("-------------------------------------")
print("Values:")
print(maze_values)

Maze:
[['S' '0' '0' '0' 'G']
 ['0' '1' '1' '0' '0']
 ['0' '1' '1' '0' '0']
 ['0' '0' '0' '1' '0']
 ['0' '0' 'G' '0' '0']]
-------------------------------------
Values:
[[-5.01 -4.94 -2.95 -1.95 -1.  ]
 [-4.97  -inf  -inf -2.95 -1.95]
 [-4.94  -inf  -inf -4.91 -4.75]
 [-4.93 -2.6  -1.55  -inf -4.91]
 [-4.8  -1.9  -1.   -1.66 -4.91]]


## Policy Revaluation: Update Agents' Values with a New Reward

In [11]:
# Index of the state whose reward is changed.
# NOTE(review): presumably the goal at maze position (4, 2) - the revalued outputs below show 1. there; confirm against agent.mapping
loc = 17
# One reward per state; sized from the agent instead of a hard-coded 20 so it follows the maze
r_new = np.full(agent.size, -1)
r_new[loc] = 1
print(r_new)

[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1]


### Update agent with D_inv, to verify working

In [12]:
# Revalue the closed-form (D_inv) agent under the new reward vector r_new
V_new = policy_reval(agent=agent, r_new=r_new)

In [13]:
# Compute the revalued state values for this agent here rather than reading the shared
# global V_new, which other cells overwrite with a different agent's values
agent.V = policy_reval(agent=agent, r_new=r_new)
maze_values = get_full_maze_values(agent)
print(maze_values)

[[-7.63 -6.34 -4.7  -3.04 -1.  ]
 [-7.03  -inf  -inf -4.41 -3.04]
 [-5.45  -inf  -inf -5.56 -4.74]
 [-3.79 -2.21 -0.65  -inf -3.92]
 [-2.67 -1.03  1.   -0.66 -2.31]]


### Update agent with importance sampling

In [15]:
# Revalue the agent trained with importance sampling under the new reward vector r_new
V_new = policy_reval(agent=agent_with_imp, r_new=r_new)

In [17]:
# Compute the revalued state values for this agent here rather than reading the shared
# global V_new, which other cells overwrite with a different agent's values
agent_with_imp.V = policy_reval(agent=agent_with_imp, r_new=r_new)
maze_values = get_full_maze_values(agent_with_imp)
print(maze_values)

[[-3.59 -3.47 -3.9  -1.73 -1.  ]
 [-3.46  -inf  -inf -3.62 -2.39]
 [-3.39  -inf  -inf -3.25 -3.32]
 [-3.31 -1.57  0.35  -inf -3.25]
 [-2.89 -0.44  1.    0.33 -3.25]]


### Update agent without importance sampling

In [18]:
# Revalue the agent trained without importance sampling under the new reward vector r_new
V_new = policy_reval(agent=agent_no_imp, r_new=r_new)

In [19]:
# Compute the revalued state values for this agent here rather than reading the shared
# global V_new, which other cells overwrite with a different agent's values
agent_no_imp.V = policy_reval(agent=agent_no_imp, r_new=r_new)
maze_values = get_full_maze_values(agent_no_imp)
print(maze_values)

[[-3.35 -3.3  -2.95 -1.95 -1.  ]
 [-3.31  -inf  -inf -2.95 -1.95]
 [-3.28  -inf  -inf -3.25 -3.24]
 [-3.27 -0.6   0.45  -inf -3.25]
 [-3.09  0.1   1.    0.33 -3.25]]


## Test Agents