In [1]:
from env.windy_gridworld import WindyGridWorld
import numpy as np

In [2]:
# Grid dimensions and size of the action space
ROWS, COLUMNS = 7, 10
NUM_ACTIONS = 4

# Locations are given as [row, column]
INIT_LOCATION = [3, 0]
TARGET_LOC = [3, 7]

# One wind value per grid column (presumably the upward push applied in
# that column -- confirm against WindyGridWorld)
WIND_LOC = [0, 0, 0, 1, 1, 1, 2, 2, 1, 0]

# Build the environment, then overwrite its target and wind attributes
env = WindyGridWorld(
    rows=ROWS,
    columns=COLUMNS,
    init_location=np.array(INIT_LOCATION),
)
env.target_location = np.array(TARGET_LOC)
env.wind_location = np.array(WIND_LOC)

## Off-policy n-step SARSA

In [3]:
def off_policy_n_step_sarsa(
        Q_init:np.ndarray,
        policy:np.ndarray,
        env:WindyGridWorld,
        behaviour_policy:np.ndarray=None,
        n:int=1,
        episodes:int=1,
        alpha:float=0.1,
        gamma:float=1.0,
        epsilon:float=0.1,
):
    """Off-policy n-step Sarsa control with importance sampling.

    Actions are drawn from a behaviour policy while a target policy, kept
    greedy with respect to the current action values, is learned.

    Args:
        Q_init: Initial action values indexed as ``Q[*state, action]``. It is
            copied, never modified in place.
        policy: Initial target policy with the same layout as ``Q_init``
            (action probabilities on the last axis). It is only used until the
            first update; after every update the target policy is replaced by
            the greedy policy for the current ``Q``.
        env: Environment. ``env.reset()`` must return a mapping whose
            ``'agent'`` entry is an integer array holding the state index, and
            ``env.step(action)`` must return a sequence whose first three items
            are such a mapping, the reward and a termination flag.
        behaviour_policy: Fixed behaviour policy with the same layout as
            ``policy``. When ``None`` an epsilon-greedy policy derived from the
            current ``Q`` is used and refreshed after every update.
        n: Number of steps in the n-step return (must be >= 1).
        episodes: Number of episodes to run.
        alpha: Step size.
        gamma: Discount factor.
        epsilon: Exploration rate of the generated epsilon-greedy behaviour
            policy; ignored when ``behaviour_policy`` is supplied.

    Returns:
        Tuple ``(Q, policy)`` with the learned action values and the final
        target policy.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n}")

    def generate_greedy(Q:np.ndarray):
        # Uniform distribution over all (numerically tied) maximising actions.
        best = np.isclose(Q, Q.max(axis=-1, keepdims=True)).astype(np.float32)
        return best / best.sum(axis=-1, keepdims=True)

    def generate_epsilon_greedy(Q:np.ndarray):
        # Greedy policy mixed with a uniform distribution of total mass epsilon.
        b = generate_greedy(Q)
        b = b * (1 - epsilon)
        b = b + (epsilon / Q.shape[-1])
        return np.clip(b, 0, 1)

    make_behaviour_epsilon_greedy = behaviour_policy is None
    Q = Q_init.copy()
    policy = policy.copy()
    if make_behaviour_epsilon_greedy:
        behaviour_policy = generate_epsilon_greedy(Q)
    num_actions = policy.shape[-1]
    assert num_actions == behaviour_policy.shape[-1], "Number of actions should be equal for both target and behaviour policy"

    # Ring buffer size: an update at time tau needs the entries for
    # time steps tau .. tau + n, i.e. n + 1 slots.
    slots = n + 1

    for episode in range(episodes):
        # Fresh buffer per episode so no entry of a previous episode survives.
        memory = [{} for _ in range(slots)]

        state = env.reset()['agent'].tolist()
        T = np.inf
        t = 0
        tau = 0
        action = np.random.choice(num_actions, p=behaviour_policy[tuple(state)])
        memory[0]['state'] = state
        memory[0]['action'] = int(action)

        while tau < T - 1:
            if t < T:
                # Take the stored action A_t and record S_{t+1}, R_{t+1}.
                action = memory[t % slots]['action']

                feedback = env.step(action)
                new_state = feedback[0]['agent'].tolist()
                reward = feedback[1]
                terminated = feedback[2]

                upcoming = memory[(t + 1) % slots]
                upcoming['state'] = new_state
                upcoming['reward'] = reward

                if terminated:
                    T = t + 1
                else:
                    # A_{t+1} is sampled from the behaviour policy.
                    new_action = np.random.choice(num_actions, p=behaviour_policy[tuple(new_state)])
                    upcoming['action'] = int(new_action)

            # Time step whose estimate is updated in this iteration.
            tau = t + 1 - n

            if tau >= 0:
                # Importance-sampling ratio rho_{tau+1 : tau+n}, truncated at
                # T - 1. The factor for A_{tau+n} is required because the
                # return bootstraps from the sampled Q(S_{tau+n}, A_{tau+n}),
                # and that action was chosen by the behaviour policy.
                rho = 1.0
                for k in range(tau + 1, min(tau + n, T - 1) + 1):
                    step = memory[k % slots]
                    index = tuple(step['state']) + (step['action'],)
                    rho *= policy[index] / behaviour_policy[index]

                # Discounted rewards R_{tau+1} .. R_{min(tau+n, T)}.
                G = 0
                for k in range(tau + 1, min(tau + n, T) + 1):
                    G += (gamma ** (k - tau - 1)) * memory[k % slots]['reward']

                # Bootstrap from Q when the episode has not ended within n steps.
                if tau + n < T:
                    step = memory[(tau + n) % slots]
                    index = tuple(step['state']) + (step['action'],)
                    G += (gamma ** n) * Q[index]

                step = memory[tau % slots]
                index = tuple(step['state']) + (step['action'],)
                Q[index] += alpha * rho * (G - Q[index])

                # Make policy greedy with respect to Q
                policy = generate_greedy(Q)
                # Q only changes here, so this is the only place where the
                # derived behaviour policy can become outdated.
                if make_behaviour_epsilon_greedy:
                    behaviour_policy = generate_epsilon_greedy(Q)

            t += 1

        print(f'\rEpisode {episode + 1:>5}', end='')
    return Q, policy

In [4]:
# Run settings: n-step horizon and number of training episodes
n = 3
episodes = 1000

# Action values start at zero for every (row, column, action) entry
Q_init = np.zeros(shape=(ROWS, COLUMNS, NUM_ACTIONS), dtype=np.float32)

# Initial target policy: all probability mass on action index 0 in every cell
policy = np.zeros_like(Q_init)
policy[..., 0] = 1.0

# Alternative: a fixed uniform-random behaviour policy instead of None
# behaviour_policy = np.ones((ROWS, COLUMNS, NUM_ACTIONS), dtype=np.float32) / NUM_ACTIONS

# Train; `policy` is rebound to the learned target policy returned by the call
Q, policy = off_policy_n_step_sarsa(
    Q_init=Q_init,
    policy=policy,
    env=env,
    behaviour_policy=None,
    n=n,
    episodes=episodes,
    alpha=0.1,
)

Episode  1000