In [11]:
import numpy as np
from tqdm import tqdm
import random
from system import (
    states,
    s_state,
    actions,
    t_state, 
    wind_col,
    num_states,
    num_actions,
    init_mdp,
    init_reward,
    to_idx,
    get_valid_actions,
    rows,
    cols
)
from model import Model
from policy import Policy, PolicyInit
from agent import Agent

In [7]:
def ch_egreedy(epsilon, Q, s):
    p = np.zeros(num_actions)
    valid_actions = get_valid_actions(s, idx=True)
    va = len(valid_actions)
    # print(va)
    p[valid_actions] = epsilon / va

    validqsa = Q[to_idx(s)][valid_actions]
    p[valid_actions[np.argmax(validqsa)]] += 1 - epsilon
    choice = np.random.choice(np.arange(num_actions), p=p)
    return choice

In [4]:
mdp = init_mdp(num_states, num_actions, wind_col)
reward = init_reward(num_states, t_reward=100)

model = Model(mdp, reward)

policy = Policy(num_states, num_actions, PolicyInit.RANDOM)

agent = Agent(model, policy)

In [8]:
def Egreedy_geneps(T: int, eps, Q_val: np.ndarray, start_state: np.ndarray, model: Model):
        estate = [start_state]
        start_action = ch_egreedy(eps, Q_val, start_state)
        eaction = [start_action]
        ereward = [
            model.get_reward(
                start_state, model.gen_next(start_state, start_action)
            )
        ]
        for i in range(T):
            estate.append(model.gen_next(estate[-1], eaction[-1], astuple=True))
            eaction.append(ch_egreedy(eps, Q_val, estate[-1]))
            ereward.append(model.get_reward(estate[-2], estate[-1]))
            if ereward[-1] == model.get_reward(estate[-2], t_state):
                break
        return estate, eaction, ereward

In [15]:
Q_val = np.random.uniform(-10, -1, (num_states, num_actions))
for a in range(num_actions):
    Q_val[37][a] = 0
# Q_val = np.zeros((num_states, num_actions))
returns = [[[] for a in range(num_actions)] for s in range(num_states)]
# valf_list = [(np.max(Q_val, axis=1))]


T = 10000
gamma = 0.9
alpha = 0.1
epsilon = 0.1
lambda_ = 0.9
for i in tqdm(range(1000)):
    # start_state = 30
    state = s_state 
    action = ch_egreedy(epsilon, Q_val, state)
    estates, eactions, erewards = Egreedy_geneps(T, epsilon, Q_val, state, model)
    T = len(erewards)
    
    sr = 0
    sv = 0
    g = 0
    pwl = lambda_
    for j in range(T-1):
        # s = tuple(estates[T - j - 1])
        s = estates[T - j - 1]
        a = eactions[T - j - 1]
        r = erewards[T - j - 1]
        sr = sr * lambda_ + r*(1 - pwl)
        sv = sv * gamma * lambda_ + Q_val[to_idx(s)][a]
        g = sr + gamma*sv
        Q_val[to_idx(s)][a] = Q_val[to_idx(s)][a] + alpha * (g - Q_val[to_idx(s)][a])
        pwl = pwl * lambda_

  sv = sv * gamma * lambda_ + Q_val[to_idx(s)][a]
  Q_val[to_idx(s)][a] = Q_val[to_idx(s)][a] + alpha * (g - Q_val[to_idx(s)][a])
 12%|█▏        | 122/1000 [02:15<16:18,  1.11s/it]


KeyboardInterrupt: 

In [None]:
valf = valf_list[-1]

In [None]:
import matplotlib.pyplot as plt
valf_toplot = valf.reshape(rows, cols)
plt.gca().invert_yaxis()
heatmap = plt.imshow(valf_toplot)
plt.colorbar(heatmap)
plt.show()

In [None]:
# Run agent
score = 0
steps = 0
s = np.array([3,0])

path = [s]

while(steps < 1000):
    a = policy.get_action(s)

    s_ = model.gen_next(s, a)
    path.append(s_)

    r = model.get_reward(s, s_)
    score += r
    print(f"State: {s}, Action: {actions[a]}, Next State: {s_}, Reward: {r}")

    if(r!= -1):
        print(f"Game Over - Score: {score}")
        break
    
    s = s_
    steps += 1

In [None]:
path_plt = np.array(path) + 0.5
y, x = path_plt.T

ax = plt.gca()
ax.grid()

ax.set_xlim(0, cols)
ax.set_xticks(np.arange(0, cols, 1), minor=False)
ax.set_xticklabels([])
ax.set_xticks(np.arange(0.5, cols, 1), minor=True)
ax.set_xticklabels([str(x) for x in wind_col], minor=True)

ax.set_ylim(rows, 0)
ax.set_yticks(np.arange(0, rows, 1), minor=False)
ax.set_yticklabels([])

ax.plot(x, y)
plt.show()