# Testing SR-IS vs SR on 4-room replanning

In [1]:
import os
import pickle
from copy import deepcopy
from itertools import islice

import numpy as np
import matplotlib.pyplot as plt
from matplotlib import patches
import gymnasium as gym

import gym_env
from utils import test_agent, get_transition_matrix, create_mapping_nb, woodbury
from models import SR_IS, SR_TD

In [2]:
# Set the seed
seed = 26
np.random.seed(seed)
seeds = np.arange(10)

# Save dir
save_dir = os.path.join('..', 'figures/')

In [13]:
# Model Hyperparams
reward = -0.8
alpha = 0.05
beta = 1.0
_lambda = 1.0
num_steps = 80000
num_simulations = 20

# New term states
term_states = {
    "bottom-right" : [(4,6), (6,4)],
    "top-right" : [(0,4), (0,6), (2,6)],
    "bottom-left" : [(4,0), (6,0), (6,2)]
}

In [4]:
# Transition matrix with no goal
env = gym.make("four-room-ng")
maze = env.unwrapped.maze
# Get the transition matrix of the new environment
mapping = create_mapping_nb(env.unwrapped.maze, env.unwrapped.get_walls())
T_no_goal = get_transition_matrix(env, mapping)

In [16]:
avgs = {
    'Random':{'mean':[],'median':[]},
    'SR':{'mean':[],'median':[]},
    'SR-IS':{'mean':[],'median':[]},
}

In [5]:
SR_IS_agent = SR_IS(env_name="four-room-br", reward=reward, term_reward=reward, _lambda=_lambda, beta=beta, alpha=alpha, num_steps=num_steps, policy="softmax", imp_samp=True)
Random_agent = SR_IS(env_name="four-room-br", reward=reward, term_reward=reward, _lambda=_lambda, beta=beta, num_steps=num_steps, policy="random", imp_samp=False)
SR_agent = SR_TD(env_name="four-room-br", reward=1,  term_reward=5, beta=beta, alpha=alpha, num_steps=num_steps, policy="softmax")

In [6]:
# Train agents that need to be trained
SR_IS_agent.learn(seed=int(seed))
SR_agent.learn(seed=int(seed))

In [10]:
new_term_idx = SR_IS_agent.mapping[term_states["top-right"][0]]
new_term_loc = term_states["top-right"][0]

In [11]:
# Construct a new transition matrix and new environment
T_new = np.copy(T_no_goal)
T_new[new_term_idx] = 0
T_new[new_term_idx, new_term_idx] = 1
new_terminals = np.diag(T_new) == 1
new_target_locs = np.array([list(new_term_loc)])
env_new = env
env_new.unwrapped.target_locs = [np.array((new_term_loc[0], new_term_loc[1]))]

In [12]:
# Update the env of agents
SR_IS_agent.env = env_new
Random_agent.env = env_new
SR_agent.env = env_new

# Update P
SR_IS_agent.P = T_new[~new_terminals][:,new_terminals]

# Use Woodbury update to get a new DR
DR_new = woodbury(SR_IS_agent, T_new, inv=False)

# Update terminals
SR_IS_agent.terminals = new_terminals
Random_agent.terminals = new_terminals
SR_agent.terminals = new_terminals

# Set the DR to updated DR for SR-IS agent and update the values
SR_IS_agent.DR = DR_new
SR_IS_agent.update_V()

# Get new reward for SR agent
r_new = np.ones(len(SR_agent.r))
r_new[new_term_idx] = 5
SR_agent.r = r_new
SR_agent.update_V()

In [15]:
# Initialize holders and run simulations
SR_IS_avg, Random_avg, SR_td_avg = [],[],[]
for i in range(num_simulations):
    SR_IS_avg.append(len(test_agent(agent=SR_IS_agent, policy=SR_IS_agent.policy, seed=int(seed))))
    Random_avg.append(len(test_agent(agent=Random_agent, policy=Random_agent.policy, seed=int(seed))))
    SR_td_avg.append(len(test_agent(agent=SR_agent, policy=SR_agent.policy, seed=int(seed))))

  logger.warn(
  gym.logger.warn("Casting input x to numpy array.")
  logger.warn(f"{pre} is not within the observation space.")
  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")


In [17]:
# Append mean and median to main holder
avgs['SR-IS']['mean'].append(np.mean(SR_IS_avg))
avgs['SR-IS']['median'].append(np.median(SR_IS_avg))
avgs['Random']['mean'].append(np.mean(Random_avg))
avgs['Random']['median'].append(np.median(Random_avg))
avgs['SR']['mean'].append(np.mean(SR_td_avg))
avgs['SR']['median'].append(np.median(SR_td_avg))

In [18]:
avgs

{'Random': {'mean': [162.0], 'median': [162.0]},
 'SR': {'mean': [86.0], 'median': [86.0]},
 'SR-IS': {'mean': [6.0], 'median': [6.0]}}