In [63]:
from random import *

import numpy as np

from gridworld import gridworld_agent

In [64]:
def select_action(action_values, action_size, state, eps, decay=0.995):
    """Choose an action epsilon-greedily and return it with the decayed epsilon.

    Epsilon is multiplied by ``decay`` *before* the exploration draw, so the
    caller is expected to feed the returned epsilon back into the next call
    (the GLIE-style shrinking-exploration schedule).

    Args:
        action_values: Array indexed as ``[row, col, action]``.
        action_size: Number of actions; random picks come from
            ``0 .. action_size - 1`` (``randint`` bounds are inclusive).
        state: Indexable whose first two items are the row and the column.
        eps: Exploration probability before this call's decay is applied.
        decay: Multiplicative factor applied to ``eps`` on every call.
            Defaults to 0.995, the value that used to be hard-coded.

    Returns:
        Tuple ``(action, eps)``: the chosen action index and the decayed
        epsilon.
    """
    eps = decay * eps  # GLIE
    # Exploit when the uniform draw lands above epsilon, otherwise explore.
    if random() > eps:
        return np.argmax(action_values[state[0], state[1], :]), eps
    return randint(0, action_size - 1), eps

In [65]:
# Environment setup: grid size, wind profile, start/goal cells, action set.
# NOTE(review): these values look like the classic windy-gridworld layout — confirm.
rows, cols = 7, 10
winds = [0, 0, 0, 1, 1, 1, 2, 2, 1, 0]  # one entry per column (len(winds) == cols)
start, goal = [3, 0], [3, 7]  # [row, col]; agrees with the '*' and 'g' cells in the rendered grid
action_mode = "std"  # presumably the standard move set — verify against gridworld_agent
env = gridworld_agent(rows, cols, winds, start, goal, action_mode)

In [66]:
# Tabular storage with one entry per (row, col, action).
# NOTE(review): eligibility_trace is allocated for backward-view SARSA, but the
# training loop in this notebook never reads or writes it.
table_shape = (env.rows, env.cols, env.action_size)
action_values = np.zeros(table_shape)  # Q-table, starts at zero everywhere
eligibility_trace = np.zeros(table_shape)

In [67]:
# Training hyper-parameters.
max_episodes, max_steps = 500, 100  # number of episodes and per-episode step cap
gamma, lr = 0.9, 0.1  # discount on the bootstrapped value and TD step size
epsilon = 0.7  # For Epsilon-Greedy exploration (initial value; decayed by select_action)

In [68]:
# SARSA control: one on-policy TD update per environment step.
# NOTE(review): the bootstrap term is applied on the terminal step as well; that
# is only harmless while the Q-values of the terminal state are never updated
# away from zero — confirm against the environment's `done` semantics.
for i in range(max_episodes):
    print("Episode: {}".format(i), end=' ')
    state = env.reset()
    action, epsilon = select_action(action_values, env.action_size, state, epsilon)
    tot_err = 0
    done = False
    steps = 0

    # At most max_steps transitions; leave early once the env reports done.
    for steps in range(1, max_steps + 1):
        new_state, reward, done = env.step(action)
        # The next action is chosen before the update because SARSA bootstraps
        # from the action that will actually be taken.
        new_action, epsilon = select_action(action_values, env.action_size, new_state, epsilon)

        # TD error: r + gamma * Q(s', a') - Q(s, a)
        q_next = action_values[new_state[0], new_state[1], new_action]
        q_now = action_values[state[0], state[1], action]
        err = reward + gamma*q_next - q_now
        action_values[state[0], state[1], action] += lr*err

        tot_err += err  # signed sum of TD errors over the episode
        state, action = new_state, new_action

        if done:
            break

    print("Total Error: {}".format(tot_err))

Episode: 0 Total Error: -98.49465220000005
Episode: 1 Total Error: -96.94781127074036
Episode: 2 Total Error: -97.70211348687302
Episode: 3 Total Error: -95.87674748071525
Episode: 4 Total Error: -96.59891071516564
Episode: 5 Total Error: -92.78980710665931
Episode: 6 Total Error: -96.08032284014092
Episode: 7 Total Error: -91.05720285410186
Episode: 8 Total Error: -92.92945887239603
Episode: 9 Total Error: -92.33670148346087
Episode: 10 Total Error: -91.54358291221361
Episode: 11 Total Error: -90.8864578350846
Episode: 12 Total Error: -91.31405422361685
Episode: 13 Total Error: -89.57628114394976
Episode: 14 Total Error: -88.53940068706434
Episode: 15 Total Error: -87.46304131054269
Episode: 16 Total Error: -88.07434822516532
Episode: 17 Total Error: -90.50847808859508
Episode: 18 Total Error: -86.77891973973702
Episode: 19 Total Error: -87.27539971421977
Episode: 20 Total Error: -83.88270629999757
Episode: 21 Total Error: -86.09284561918415
Episode: 22 Total Error: -82.25571450969778

In [69]:
# Dump the full Q-table; it is indexed [row, col, action], matching the shape it was allocated with.
print(action_values)

[[[ -5.57525298  -5.58908527  -5.60094878  -5.6285968 ]
  [ -5.56645454  -5.59834186  -5.56442615  -5.59805485]
  [ -5.61141739  -5.64540585  -5.61914478  -5.64763757]
  [ -5.86273259  -5.1367488   -5.85180587  -5.84012845]
  [ -5.936787     1.87098554  -5.90726402  -5.89524535]
  [ -5.74515489  13.40953151  -5.6795923   -5.67422249]
  [ -5.34370948  28.35462838  -5.29121408  -5.28465684]
  [ -4.882909    32.61625378  -4.89183246  -4.91936941]
  [ -4.49933014  37.3513931   -4.52353589  -4.47373355]
  [ -3.90437503  -3.88695695  42.612659    -3.88364434]]

 [[ -5.58134177  -5.60564807  -5.57994137  -5.60838818]
  [ -5.54498478  -5.52216284  -5.54243433  -5.53015401]
  [ -5.51927839  -5.46412391  -5.47150499  -5.49548284]
  [ -5.38071997  -4.03142316  -5.43467347  -5.40151093]
  [ -5.22663345  -1.73733789  -5.16468564  -5.23571438]
  [ -0.38743155  -4.76443475  -4.82367593  -4.85809095]
  [ -5.10028526  -3.02283093  -5.06064917  -5.14112563]
  [ -4.74887428   5.85987814  -4.70892215  -4.

In [70]:
# Roll out the learned policy from the start state, rendering the grid after
# every move. Stops when the environment reports done or after max_steps moves.
steps = 0
# reset() returns the start state; the first action must be chosen for it,
# not for whatever `state` an earlier cell left behind.
state = env.reset()
env.printenv()
done = False

while steps < max_steps and not done:
    steps += 1
    # The action is re-selected from the current state on every iteration, so
    # nothing needs to be carried over between steps.
    action, epsilon = select_action(action_values, env.action_size, state, epsilon)
    new_state, reward, done = env.step(action)
    state = new_state
    env.printenv()

---------------------
| | | | | | | | | | |
---------------------
| | | | | | | | | | |
---------------------
| | | | | | | | | | |
---------------------
|*| | | | | | |g| | |
---------------------
| | | | | | | | | | |
---------------------
| | | | | | | | | | |
---------------------
| | | | | | | | | | |
---------------------
 0 0 0 1 1 1 2 2 1 0
---------------------
| | | | | | | | | | |
---------------------
| | | | | | | | | | |
---------------------
|*| | | | | | | | | |
---------------------
| | | | | | | |g| | |
---------------------
| | | | | | | | | | |
---------------------
| | | | | | | | | | |
---------------------
| | | | | | | | | | |
---------------------
 0 0 0 1 1 1 2 2 1 0
---------------------
| | | | | | | | | | |
---------------------
| | | | | | | | | | |
---------------------
|*| | | | | | | | | |
---------------------
| | | | | | | |g| | |
---------------------
| | | | | | | | | | |
---------------------
| | | | | | | | | | |
---------------------
| | | | | | 

---------------------
|*| | | | | | | | | |
---------------------
| | | | | | | |g| | |
---------------------
| | | | | | | | | | |
---------------------
| | | | | | | | | | |
---------------------
| | | | | | | | | | |
---------------------
 0 0 0 1 1 1 2 2 1 0
---------------------
| | | | | | | | | | |
---------------------
| | | | | | | | | | |
---------------------
|*| | | | | | | | | |
---------------------
| | | | | | | |g| | |
---------------------
| | | | | | | | | | |
---------------------
| | | | | | | | | | |
---------------------
| | | | | | | | | | |
---------------------
 0 0 0 1 1 1 2 2 1 0
---------------------
| | | | | | | | | | |
---------------------
| | | | | | | | | | |
---------------------
|*| | | | | | | | | |
---------------------
| | | | | | | |g| | |
---------------------
| | | | | | | | | | |
---------------------
| | | | | | | | | | |
---------------------
| | | | | | | | | | |
---------------------
 0 0 0 1 1 1 2 2 1 0
---------------------
| | | | | | |

---------------------
| | | | | | | | | | |
---------------------
| | | | | | | | | | |
---------------------
 0 0 0 1 1 1 2 2 1 0
---------------------
| | | | | | | | | | |
---------------------
| | | | | | | | | | |
---------------------
|*| | | | | | | | | |
---------------------
| | | | | | | |g| | |
---------------------
| | | | | | | | | | |
---------------------
| | | | | | | | | | |
---------------------
| | | | | | | | | | |
---------------------
 0 0 0 1 1 1 2 2 1 0
---------------------
| | | | | | | | | | |
---------------------
| | | | | | | | | | |
---------------------
|*| | | | | | | | | |
---------------------
| | | | | | | |g| | |
---------------------
| | | | | | | | | | |
---------------------
| | | | | | | | | | |
---------------------
| | | | | | | | | | |
---------------------
 0 0 0 1 1 1 2 2 1 0
---------------------
| | | | | | | | | | |
---------------------
| | | | | | | | | | |
---------------------
|*| | | | | | | | | |
---------------------
| | | | | | |

---------------------
| | | | | | | | | | |
---------------------
| | | | | | | | | | |
---------------------
| | | | | | | | | | |
---------------------
 0 0 0 1 1 1 2 2 1 0
---------------------
| | | | | | | | | | |
---------------------
| | | | | | | | | | |
---------------------
|*| | | | | | | | | |
---------------------
| | | | | | | |g| | |
---------------------
| | | | | | | | | | |
---------------------
| | | | | | | | | | |
---------------------
| | | | | | | | | | |
---------------------
 0 0 0 1 1 1 2 2 1 0
---------------------
| | | | | | | | | | |
---------------------
| | | | | | | | | | |
---------------------
|*| | | | | | | | | |
---------------------
| | | | | | | |g| | |
---------------------
| | | | | | | | | | |
---------------------
| | | | | | | | | | |
---------------------
| | | | | | | | | | |
---------------------
 0 0 0 1 1 1 2 2 1 0
---------------------
| | | | | | | | | | |
---------------------
| | | | | | | | | | |
---------------------
|*| | | | | |