In [53]:
import numpy as np

from system import (
    states,
    actions, 
    wind_col,
    num_states,
    num_actions,
    init_mdp,
    init_reward,
    to_idx,
    to_state,
    get_valid_actions,
    gen_random_sa,
    rows,
    cols
)
from model import Model
from policy import Policy, PolicyInit
from agent import Agent

In [54]:
mdp = init_mdp(num_states, num_actions, wind_col)
reward = init_reward(num_states, t_reward=100)
model = Model(mdp, reward)

policy = Policy(num_states, num_actions, PolicyInit.RANDOM)

agent = Agent(model, policy)

In [55]:
def ch_egreedy(epsilon, Q, s):
    p = np.zeros(num_actions)
    valid_actions_idx = get_valid_actions(to_state(s), idx=True)
    va = len(valid_actions_idx)
    # print(va)
    p[valid_actions_idx] = epsilon / va
    validqsa = Q[s][valid_actions_idx]
    p[valid_actions_idx[np.argmax(validqsa)]] += 1 - epsilon
    choice = np.random.choice(np.arange(num_actions), p=p)
    return choice

In [56]:
Q_val = np.random.uniform(-10, -1, (num_states, num_actions))
for a in range(num_actions):
    Q_val[37][a] = 0
# Q_val = np.zeros((num_states, num_actions))
returns = [[[] for a in range(num_actions)] for s in range(num_states)]
valf_list = [(np.max(Q_val, axis=1))]

i = 0
T = 1000
gamma = 0.9
alpha = 0.1
while i < 1000:
    start_state = 30
    state = start_state
    while state != 37:
        action = ch_egreedy(0.1, Q_val, state)
        # print(action)
        nstate = model.gen_next(to_state(state), action)
        r = model.get_reward(to_state(state), to_state(nstate))
        print(r)
        valid_actions_idx = get_valid_actions(nstate, idx=True)
        validqsa = Q_val[to_idx(nstate)][valid_actions_idx]
        nmax= np.max(validqsa)
        print(nmax)
        Q_val[state, action] = Q_val[state, action] + alpha * (r + gamma * nmax - Q_val[state, action])
        state = to_idx(nstate)
    i += 1

[-1 -1]
-2.5043749363922023


ValueError: setting an array element with a sequence.

In [None]:
valf = valf_list[-1]

In [None]:
import matplotlib.pyplot as plt
valf_toplot = valf.reshape(rows, cols)
plt.gca().invert_yaxis()
heatmap = plt.imshow(valf_toplot)
plt.colorbar(heatmap)
plt.show()

In [None]:
# Run agent
score = 0
steps = 0
s = np.array([3,0])

path = [s]

while(steps < 1000):
    a = policy.get_action(s)

    s_ = model.gen_next(s, a)
    path.append(s_)

    r = model.get_reward(s, s_)
    score += r
    print(f"State: {s}, Action: {actions[a]}, Next State: {s_}, Reward: {r}")

    if(r!= -1):
        print(f"Game Over - Score: {score}")
        break
    
    s = s_
    steps += 1

In [None]:
path_plt = np.array(path) + 0.5
y, x = path_plt.T

ax = plt.gca()
ax.grid()

ax.set_xlim(0, cols)
ax.set_xticks(np.arange(0, cols, 1), minor=False)
ax.set_xticklabels([])
ax.set_xticks(np.arange(0.5, cols, 1), minor=True)
ax.set_xticklabels([str(x) for x in wind_col], minor=True)

ax.set_ylim(rows, 0)
ax.set_yticks(np.arange(0, rows, 1), minor=False)
ax.set_yticklabels([])

ax.plot(x, y)
plt.show()