In [1]:
from env.windy_gridworld import WindyGridWorld
import numpy as np

In [2]:
ROWS = 7
COLUMNS = 10
NUM_ACTIONS = 4
WIND_LOC = [0, 0, 0, 1, 1, 1, 2, 2, 1, 0]
# Tuples are row column
TARGET_LOC = [3, 7]
INIT_LOCATION = [3, 0]

# Set up the environment
env = WindyGridWorld(rows=ROWS, columns=COLUMNS, init_location=np.array(INIT_LOCATION))
env.target_location = np.array(TARGET_LOC)
env.wind_location = np.array(WIND_LOC)

## $Q(\sigma)$ for estimating $Q \approx q_*$ or $q_\pi$

In [3]:
def n_step_tree_backup(
        Q_init:np.ndarray,
        sigma:np.ndarray,
        policy:np.ndarray,
        behaviour_policy: np.ndarray,
        env:WindyGridWorld,
        n:int=1,
        episodes:int=1,
        alpha=0.1,
        gamma:float=1.0,
        epsilon: float=0.1
):
    Q = Q_init.copy()
    policy = policy.copy()
    memory = [{} for _ in range(n + 1)]
    num_actions = Q.shape[-1]

    for episode in range(episodes):
        state = env.reset()['agent'].tolist()
        action = np.random.choice(num_actions, p=behaviour_policy[*state])
        action = int(action)

        memory[0]['state'] = state
        memory[0]['action'] = action

        t = 0
        T = np.inf
        tau = 0

        while tau < T - 1:
            if t < T:
                # recall state and action
                state = memory[t % (n + 1)]['state']
                action = memory[t % (n + 1)]['action']

                feedback = env.step(action)
                new_state = feedback[0]['agent'].tolist()
                reward = feedback[1]
                terminated = feedback[2]

                # Store new state and reward
                memory[(t + 1) % (n + 1)]['state'] = new_state
                memory[(t + 1) % (n + 1)]['reward'] = reward

                if terminated:
                    T = t + 1
                else:
                    new_action = np.random.choice(num_actions, p=behaviour_policy[*new_state])
                    new_action = int(new_action)

                    state_action_pair = new_state + [new_action]
                    rho = policy[*state_action_pair] / behaviour_policy[*state_action_pair]
                    memory[(t + 1) % (n + 1)]['action'] = new_action
                    memory[(t + 1) % (n + 1)]['rho'] = rho
                
            tau = t + 1 - n
            if tau >= 0:
                G = 0
                if t + 1 < T:
                    state = memory[(t + 1) % (n + 1)]['state']
                    action = memory[(t + 1) % (n + 1)]['action']
                    G = Q[*(state + [action])]
                
                for k in reversed(range(tau + 1, min(t + 1, T) + 1)):
                    state = memory[k % (n + 1)]['state']
                    action = memory[k % (n + 1)]['action']
                    state_action_pair = state + [action]
                    reward = memory[k % (n + 1)]['reward']
                    rho = memory[k % (n + 1)]['rho']
                    sigma_k = sigma[*state]

                    if k == T:
                        G = memory[T % (n + 1)]['reward']
                    else:
                        V_bar = np.sum(Q[*state] * policy[*state])
                        G = reward + gamma * (rho * sigma_k + (1 - sigma_k) * policy[*state_action_pair]) * (G - Q[*state_action_pair])
                        G += gamma * V_bar
                
                state = memory[tau % (n + 1)]['state']
                action = memory[tau % (n + 1)]['action']
                Q[*state_action_pair] += alpha * (G - Q[*state_action_pair])

                # Make the policy greedy
                policy = Q.argmax(axis=-1)
                policy = np.eye(num_actions)[policy].astype(np.float32)
                behaviour_policy = (1 - policy) * (epsilon / num_actions) + policy * (1 - epsilon + epsilon / num_actions)
            t = t + 1

        print(f"\rEpisode {episode + 1:<5}", end='')

    return Q, policy

In [4]:
Q_init = np.zeros(shape=(ROWS, COLUMNS, NUM_ACTIONS), dtype=np.float32)
n = 3
episodes = 500
policy = np.zeros((ROWS, COLUMNS, NUM_ACTIONS), dtype=np.float32)
policy[:, :, :] = 1.0 / 4
behaviour_policy = policy.copy()
sigma = np.zeros((ROWS, COLUMNS))
sigma[::2, ::2] = 1
Q, policy = n_step_tree_backup(
    Q_init=Q_init, 
    behaviour_policy=behaviour_policy,
    sigma=sigma,
    n=n, 
    episodes=episodes, 
    env=env, 
    policy=policy,
    gamma=1.0,
    alpha=0.1
)

Episode 500  

In [5]:
state = env.reset()['agent'].tolist()
step = 0
while not env.is_terminated():
    action = policy[*state].argmax()
    feedback = env.step(action)
    state = feedback[0]['agent']
    step += 1
    print(step)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
