In [1]:
import numpy as np

from system import (
    states,
    actions, 
    wind_col,
    num_states,
    num_actions,
    init_mdp,
    init_reward,
    to_idx,
    to_state,
    get_valid_actions,
    gen_random_sa,
    rows,
    cols
)
from model import Model
from policy import Policy, PolicyInit
from agent import Agent

ModuleNotFoundError: No module named 'requests'

In [None]:
def ch_egreedy(epsilon, Q, s):
    """Pick an action for state ``s`` using an epsilon-greedy rule over ``Q``.

    One uniform sample is drawn; if it falls below ``epsilon`` the action is
    sampled uniformly from ``get_valid_actions(to_state(s))``, otherwise the
    index of the largest entry of ``Q[s]`` is returned.

    Args:
        epsilon: Exploration probability threshold.
        Q: Action-value table indexed as ``Q[s]`` -> per-action values.
        s: State index used to look up ``Q`` and passed through ``to_state``.

    Returns:
        The chosen action (an element of the valid-action collection when
        exploring, or the ``np.argmax`` index when exploiting).
    """
    exploring = np.random.rand() < epsilon
    if not exploring:
        # NOTE(review): the greedy choice ranges over every column of Q[s],
        # not only the actions reported valid for this state — confirm that
        # is intended.
        return np.argmax(Q[s])

    candidates = get_valid_actions(to_state(s))
    return np.random.choice(candidates)

In [None]:
# Build the environment pieces from the helpers imported from `system`.
# NOTE(review): presumably `mdp` holds the transition structure (with the
# per-column wind from `wind_col`) and `reward` the reward table, with
# t_reward=100 as the terminal reward — confirm against system.py.
mdp = init_mdp(num_states, num_actions, wind_col)
reward = init_reward(num_states, t_reward=100)
# The Model wraps both; later cells query it for rewards and next states.
model = Model(mdp, reward)

# Policy created with the RANDOM initialisation mode.
policy = Policy(num_states, num_actions, PolicyInit.RANDOM)

# NOTE(review): `agent` is not referenced by any other cell visible in this
# notebook — confirm whether it is still needed.
agent = Agent(model, policy)

In [None]:
# Tabular SARSA: each update bootstraps from the action-value of the next
# state/action pair actually selected by the same epsilon-greedy rule.
start_state = 30  # state index every episode starts from
goal_state = 37   # terminal state index; an episode ends on reaching it
T = 1000          # number of training episodes
gamma = 0.9       # discount factor
alpha = 0.1       # step size
epsilon = 0.1     # exploration rate passed to ch_egreedy

# Random negative initial action-values; the terminal row is zeroed so that
# bootstrapping from the goal state contributes nothing to the target.
Q_val = np.random.uniform(-10, -1, (num_states, num_actions))
Q_val[goal_state, :] = 0

# Not used by the SARSA loop below; kept in case other cells still refer to it.
returns = [[[] for a in range(num_actions)] for s in range(num_states)]

# History of greedy state values (max over actions). Entry 0 is the
# pre-training snapshot; one more snapshot is appended after every episode so
# that valf_list[-1] reflects the learned values rather than the initial ones.
valf_list = [np.max(Q_val, axis=1)]

i = 0
while i < T:
    state = start_state
    action = ch_egreedy(epsilon, Q_val, state)
    while state != goal_state:
        r = model.reward(state, action)
        nstate = model.gen_next(state, action)
        naction = ch_egreedy(epsilon, Q_val, nstate)
        # SARSA update: move Q(s, a) toward r + gamma * Q(s', a').
        td_target = r + gamma * Q_val[nstate, naction]
        Q_val[state, action] += alpha * (td_target - Q_val[state, action])
        state, action = nstate, naction
    # np.max returns a fresh array, so each snapshot is independent of
    # later in-place updates to Q_val.
    valf_list.append(np.max(Q_val, axis=1))
    i += 1

In [None]:
# Take the most recent entry of valf_list as the value function to visualise.
valf = valf_list[-1]

In [None]:
import matplotlib.pyplot as plt

# Arrange the per-state values on the (rows x cols) grid for display.
valf_toplot = valf.reshape(rows, cols)

# Same sequence of calls as the pyplot state-machine version, issued on the
# current Axes explicitly: flip the y axis, draw the image, attach a colorbar.
ax = plt.gca()
ax.invert_yaxis()
heatmap = ax.imshow(valf_toplot)
ax.figure.colorbar(heatmap, ax=ax)
plt.show()

In [None]:
# Roll out the policy from the start cell, logging every transition and
# recording the visited states in `path` for plotting.
# NOTE(review): this follows `policy` (created with PolicyInit.RANDOM), not the
# Q_val table trained above, and it calls model.get_reward(s, s_) with array
# states whereas training calls model.reward(state, action) with integer
# states — confirm both against the Model/Policy implementations.
score = steps = 0
s = np.array([3, 0])

path = [s]

# At most 1000 transitions; stop early on the first reward other than -1.
while steps < 1000:
    a = policy.get_action(s)

    s_ = model.gen_next(s, a)
    path.append(s_)

    r = model.get_reward(s, s_)
    score += r
    print(f"State: {s}, Action: {actions[a]}, Next State: {s_}, Reward: {r}")

    if r == -1:
        # Ordinary step: advance to the next state and keep going.
        s = s_
        steps += 1
        continue

    print(f"Game Over - Score: {score}")
    break

In [None]:
# Draw the recorded trajectory on a grid whose major ticks sit on integer
# coordinates; the +0.5 shift places each visited point midway between them.
path_plt = np.array(path) + 0.5
y, x = path_plt.T  # first component plotted vertically, second horizontally

ax = plt.gca()
ax.grid()

# Tick positions: major ticks on integers (unlabelled), minor x ticks at the
# half-integers carrying the per-column wind values as labels.
col_major = np.arange(0, cols, 1)
col_minor = np.arange(0.5, cols, 1)
row_major = np.arange(0, rows, 1)
wind_labels = list(map(str, wind_col))

ax.set_xlim(0, cols)
ax.set_xticks(col_major, minor=False)
ax.set_xticklabels([])
ax.set_xticks(col_minor, minor=True)
ax.set_xticklabels(wind_labels, minor=True)

# Limits given as (rows, 0) so that row 0 is drawn at the top.
ax.set_ylim(rows, 0)
ax.set_yticks(row_major, minor=False)
ax.set_yticklabels([])

ax.plot(x, y)
plt.show()