In [146]:
import os

import matplotlib.pyplot as plt
import numpy as np
import gymnasium as gym

import gym_env
from models import LinearRL_NHB

### Make simple environment and transition policies

In [147]:
## Toy environment: a two-level decision tree over states 0-4
# envstep[s, a] = [s', done] for the three non-terminal states s in {0, 1, 2}.
# Actions are 0=left, 1=right.
envstep = np.zeros((3, 2, 2), dtype=int)
envstep[0] = [[1, 0], [2, 0]]   # State 0 -> 1 (left) or 2 (right), not done
envstep[1] = [[3, 1], [3, 1]]   # State 1 -> 3 on either action, done
envstep[2] = [[4, 1], [4, 1]]   # State 2 -> 4 on either action, done

## Biased transition policy (row = current state, column = next state)
# State 3 is more rewarding than state 4, so the policy prefers the branch
# through state 1 (which leads to 3) over the branch through state 2.
T_b = np.array([
    [0.0, 0.8, 0.2, 0.0, 0.0],
    [0.0, 0.0, 0.0, 1.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 1.0],
    [0.0, 0.0, 0.0, 1.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 1.0],
])

## Unbiased transition policy: both branches out of state 0 are equally likely
T = np.array([
    [0.0, 0.5, 0.5, 0.0, 0.0],
    [0.0, 0.0, 0.0, 1.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 1.0],
    [0.0, 0.0, 0.0, 1.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 1.0],
])

### Train SR agents

In [206]:
def softmax(state, T, V, envstep, temperature=1.0):
    """Sample an action from a softmax over the values of the reachable states.

    Parameters
    ----------
    state : int
        Index of the current state.
    T : array, indexed as T[state, next_state]
        Transition matrix; the non-zero entries of row ``state`` define the
        successor states used for the softmax normaliser.
    V : array
        Value of each state, indexed by state.
    envstep : array, indexed as envstep[state, action] -> [next_state, done]
        Deterministic environment lookup table. Exactly two actions (0 and 1)
        are assumed for every state.
    temperature : float, optional
        Softmax temperature dividing the values before exponentiation.
        Defaults to 1.0, the value that was previously hard-coded.

    Returns
    -------
    (action, s_prob) : tuple
        The sampled action (0 or 1) and the probability with which it was
        selected. Always a 2-tuple, so callers can unpack it unconditionally.

    Raises
    ------
    ValueError
        If ``temperature`` is not strictly positive.

    Notes
    -----
    The normaliser is summed over the successors listed in ``T`` while the
    numerators come from ``envstep``; the two tables must agree (each action
    leading to a distinct successor) for the probabilities to sum to one,
    otherwise ``np.random.choice`` will reject them.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    successor_states = np.where(T[state, :] != 0)[0]
    action_probs = np.full(2, 0.0)   # We can hardcode this because every state has 2 actions

    v_sum = sum(np.exp(V[s] / temperature) for s in successor_states)

    # If we don't have enough info (no successors, so the normaliser is zero),
    # pick uniformly at random. The selection probability of a uniform choice
    # between two actions is 0.5; returning it keeps the return shape
    # consistent with every other path (the previous bare-action return broke
    # callers that unpack ``action, s_prob``).
    if v_sum == 0:
        return np.random.choice([0, 1]), 0.5

    for action in [0, 1]:
        new_state, done = envstep[state, action]

        # If we hit a done state our action doesn't matter: report the
        # selection probability as 1 so no reweighting takes place.
        if done:
            action = np.random.choice([0, 1])
            return action, 1
        action_probs[action] = np.exp(V[new_state] / temperature) / v_sum

    action = np.random.choice([0, 1], p=action_probs)
    s_prob = action_probs[action]

    return action, s_prob

def imp_sampling(T, state, s_prob):
    """Return the importance-sampling weight for the action just taken.

    The numerator is a uniform probability over the successors of ``state``
    (the non-zero entries of ``T[state, :]``); the denominator is ``s_prob``,
    the probability with which the action was actually selected.
    """
    n_successors = np.count_nonzero(T[state, :])
    uniform_prob = 1 / n_successors
    return uniform_prob / s_prob

def train_SR(num_steps, alpha, gamma, r, imp_samp):
    """Learn a 5x5 successor representation (SR) with a TD-style update.

    Uses the module-level ``T`` and ``envstep`` tables. The agent starts in
    state 0, picks actions with ``softmax`` over ``V = M @ r``, and is sent
    back to state 0 after every transition flagged as done.

    Parameters
    ----------
    num_steps : int
        Number of environment steps (and SR updates) to perform.
    alpha : float
        Learning rate mixing the old row with the new target.
    gamma : float
        Discount applied to the successor row of the next state.
    r : array of length 5
        Reward per state, used only to derive the values driving the policy.
    imp_samp : bool
        If true, the update target is scaled by the weight returned from
        ``imp_sampling``; otherwise the weight is 1.

    Returns
    -------
    The learned 5x5 SR matrix.
    """
    n_states = 5
    sr = np.eye(n_states)
    identity = np.eye(n_states)
    current = 0

    for _ in range(num_steps):
        # Values under the current SR estimate drive the action choice.
        values = sr @ r
        action, s_prob = softmax(current, T, values, envstep)
        weight = imp_sampling(T, current, s_prob) if imp_samp else 1

        # Take action
        successor, done = envstep[current, action]

        # Update SR: blend the old row with the (optionally reweighted) target.
        td_target = identity[current] + gamma * sr[successor]
        sr[current] = (1 - alpha) * sr[current] + alpha * td_target * weight

        # A done transition restarts the episode from state 0.
        current = 0 if done else successor

    return sr

In [213]:
# Experiment configuration
imp_samp = True                         # importance-sampling flag (train_SR takes this as a keyword)
num_steps, num_simulations = 150, 400   # updates per training run, number of runs to average
alpha, gamma = 0.25, 0.3                # learning rate, discount factor
r = np.asarray([0, 0, 0, 10, 2])        # reward per state: state 3 pays 10, state 4 pays 2

In [214]:
# Average the learned SR over many independent training runs, once without
# and once with importance sampling.
M_avg_without = np.zeros((5, 5))
M_avg_with = np.zeros_like(M_avg_without)
for _ in range(num_simulations):
    # The plain run goes first and the importance-sampled run second, so the
    # global NumPy random stream is consumed in a fixed order per iteration.
    M_without = train_SR(num_steps, alpha, gamma, r, imp_samp=False)
    M_avg_without += M_without
    M_with = train_SR(num_steps, alpha, gamma, r, imp_samp=True)
    M_avg_with += M_with

# Turn the running sums into means.
M_avg_without = M_avg_without / num_simulations
M_avg_with = M_avg_with / num_simulations

In [215]:
# Row 0 of each averaged SR: expected discounted occupancy starting from state 0
# (first line: without importance sampling, second line: with).
print(M_avg_without[0], M_avg_with[0], sep="\n")

[1.         0.28177132 0.01822868 0.08453139 0.00449958]
[0.99052883 0.15030017 0.14685848 0.04509005 0.03502307]
