In [1]:
import os

import numpy as np
import importlib
import seaborn as sns
import gymnasium as gym
import matplotlib.pyplot as plt

import gym_env
from models import LinearRL
from utils import get_full_maze_values, policy_reval, decision_policy, update_terminal_reward
from utils_render import plot_decision_prob

In [2]:
# Reproducibility: seed NumPy's legacy global RNG once, up front
seed = 24
np.random.seed(seed)

# Directory (relative to the current working directory) where figures are written
save_dir = os.path.join(os.pardir, 'figures/')

# Plotting constants:
#   idx       - row of the decision-policy matrix that is inspected
#   prob_locs - the two columns of that row that are compared
#   colors    - values forwarded as `colors=` to plot_decision_prob
idx, prob_locs, colors = 4, [3, 5], [1, 9]

## DR Complete Agent

In [3]:
# "Complete" agent: its DR is set directly from D_inv (below) instead of being
# learned with agent.learn() as in the later sections.
agent = LinearRL(env_name="tolman-latent", reward=-1, term_reward=5)
# NOTE: the two calls below are disabled. As written they would set both
# terminal states (loc=0 and loc=1) to r=5, i.e. they would not bias the DR
# towards either terminal state.
# update_terminal_reward(agent, loc=0, r=5)
# update_terminal_reward(agent, loc=1, r=5)

# Overwrite the agent's DR with gamma * D_inv.
agent.DR = agent.gamma * agent.get_D_inv()
# presumably recomputes the agent's value estimates from the new DR — confirm in models.LinearRL
agent.update_V()
# NOTE(review): maze_values is not referenced again in the visible cells.
maze_values = get_full_maze_values(agent)

In [4]:
# Baseline decision policy computed from the agent's current Z, before the
# terminal reward is changed in a later cell.
pii_old = decision_policy(agent, agent.Z)

In [5]:
# Report the baseline policy at row `idx`: column prob_locs[0] is printed as
# "left" and column prob_locs[1] as "right". The indices come from the config
# cell rather than being repeated as literals here.
print(f"Prob left: {pii_old[idx][prob_locs[0]]} | Prob right: {pii_old[idx][prob_locs[1]]}")

Prob left: 0.4883122491710901 | Prob right: 0.4883122491710901


In [6]:
# Make the reward for the first terminal state negative and recalculate policy
# NOTE(review): update_terminal_reward's return value is not captured, so it
# presumably mutates `agent` in place — confirm in utils.
update_terminal_reward(agent, loc=0, r=-5)
# Z_new is what the next cell passes to decision_policy; V_new is not
# referenced again in the visible cells.
V_new, Z_new = policy_reval(agent)

In [7]:
# Decision policy after the reward change, computed from the revalued Z_new
# (instead of agent.Z, which was used for pii_old).
pii_new = decision_policy(agent, Z_new)

In [8]:
# Report the post-revaluation policy at row `idx` for the two columns in
# `prob_locs` (config-cell constants, not repeated literals).
print(f"Prob left: {pii_new[idx][prob_locs[0]]} | Prob right: {pii_new[idx][prob_locs[1]]}")

Prob left: 0.02341663522531297 | Prob right: 0.9532078631168673


## With Importance Sampling

In [44]:
# Hyperparameters shared by the two learned agents below.
# `reward` / `terminal_reward` are passed to LinearRL as reward= / term_reward=;
# the remaining names are passed through under the same keyword.
reward, terminal_reward = -1, 5
alpha, beta = 0.2, 1.0
_lambda = 1.0
num_steps = 15_000

In [45]:
# Learned agent with importance sampling enabled (imp_samp=True).
# This rebinds `agent`, replacing the agent from the previous section.
agent = LinearRL(
    env_name="tolman-latent",
    reward=reward,
    term_reward=terminal_reward,
    _lambda=_lambda,
    beta=beta,
    alpha=alpha,
    num_steps=num_steps,
    policy="softmax",
    imp_samp=True,
)
agent.learn()

In [46]:
# Baseline policy of the learned agent (from agent.Z), reported at row `idx`
# for the two columns in `prob_locs` (config-cell constants, not literals).
pii_old = decision_policy(agent, agent.Z)
print(f"Prob left: {pii_old[idx][prob_locs[0]]} | Prob right: {pii_old[idx][prob_locs[1]]}")

Prob left: 0.40777478735745515 | Prob right: 0.5697173749040657


In [47]:
# Make the reward for the first terminal state negative and recalculate policy
# NOTE(review): update_terminal_reward's return value is not captured, so it
# presumably mutates `agent` in place — confirm in utils.
update_terminal_reward(agent, loc=0, r=-5)
# Z_new is what the next cell passes to decision_policy; V_new is not
# referenced again in the visible cells.
V_new, Z_new = policy_reval(agent)

In [48]:
# Policy after the reward change (from the revalued Z_new), reported at row
# `idx` for the two columns in `prob_locs` (config-cell constants, not literals).
pii_new = decision_policy(agent, Z_new)
print(f"Prob left: {pii_new[idx][prob_locs[0]]} | Prob right: {pii_new[idx][prob_locs[1]]}")

Prob left: 0.027922241155506775 | Prob right: 0.9565873078648339


In [56]:
# Target file for the "with importance sampling" figure. os.path.join is used
# so the result does not depend on save_dir ending with a path separator.
save_path = os.path.join(save_dir, "latent_with_is.png")
# Plot call is currently disabled. As written it passes save_path=None, so the
# save_path computed above would not be used.
# plot_decision_prob(probs_train=pii_old[idx][prob_locs], probs_test=pii_new[idx][prob_locs], colors=colors, leg_loc="upper center", save_path=None)

## Without Importance Sampling

In [50]:
# Learned agent with importance sampling disabled (imp_samp=False); all other
# arguments match the importance-sampling agent. This rebinds `agent`.
agent = LinearRL(
    env_name="tolman-latent",
    reward=reward,
    term_reward=terminal_reward,
    _lambda=_lambda,
    beta=beta,
    alpha=alpha,
    num_steps=num_steps,
    policy="softmax",
    imp_samp=False,
)
agent.learn()

In [51]:
# Baseline policy of the learned agent (from agent.Z), reported at row `idx`
# for the two columns in `prob_locs` (config-cell constants, not literals).
pii_old = decision_policy(agent, agent.Z)
print(f"Prob left: {pii_old[idx][prob_locs[0]]} | Prob right: {pii_old[idx][prob_locs[1]]}")

Prob left: 0.37068442440716126 | Prob right: 0.5737825654058131


In [52]:
# Make the reward for the first terminal state negative and recalculate policy
# NOTE(review): update_terminal_reward's return value is not captured, so it
# presumably mutates `agent` in place — confirm in utils.
update_terminal_reward(agent, loc=0, r=-5)
# Z_new is what the next cell passes to decision_policy; V_new is not
# referenced again in the visible cells.
V_new, Z_new = policy_reval(agent)

In [53]:
# Policy after the reward change (from the revalued Z_new), reported at row
# `idx` for the two columns in `prob_locs` (config-cell constants, not literals).
pii_new = decision_policy(agent, Z_new)
print(f"Prob left: {pii_new[idx][prob_locs[0]]} | Prob right: {pii_new[idx][prob_locs[1]]}")

Prob left: 0.021453238332517242 | Prob right: 0.9211461609669758


In [57]:
# Target file for the "without importance sampling" figure. os.path.join is
# used so the result does not depend on save_dir ending with a path separator.
save_path = os.path.join(save_dir, "latent_without_is.png")
# Plot call is currently disabled. As written it passes save_path=None, so the
# save_path computed above would not be used.
# plot_decision_prob(probs_train=pii_old[idx][prob_locs], probs_test=pii_new[idx][prob_locs], colors=colors, leg_loc="upper center", save_path=None)