In [1]:
from numpy.random.mtrand import rand
from agent import Agent
from maze import Maze
from policy import Policy

from util import plot_matrix, transform_policy_to_matrix_values

In [2]:
# Shared settings for every learning run in this notebook.
# They are forwarded as `max_episodes`, `threshold` and `converged_threshold`
# to the Agent's prediction/control methods in the cells below.
MAX_EPISODES = 100_000       # hard cap on episodes per run
THRESHOLD = 1e-2             # passed as `threshold` — presumably the max value change that counts as "stable"; confirm in agent.py
CONVERGED_THRESHOLD = 100    # passed as `converged_threshold` — presumably how many stable episodes in a row are required; confirm in agent.py

In [3]:
# Build the 4x4 grid world shared by all experiments in this notebook.
# Positions appear to be (x, y) with y = 0 being the top row of the printed
# value tables (inferred from the simulation/value outputs) — confirm in maze.py.
# NOTE: `lenght` is the keyword spelling this call relies on; renaming it
# here alone would break the call.
maze = Maze(
    lenght=4,
    height=4,
    all_rewards=-1,  # default reward for cells without an override below
    special_rewards={
        # per-cell overrides of the default reward
        (3, 0): 40,
        (2, 1): -10,
        (3, 1): -10,
        (0, 3): 10,
        (1, 3): -2},
    # terminal cells: the simulation ends on reaching one of these
    end_positions=[(3, 0), (0, 3)]
)

In [4]:
# Two policies over the same 4x4 grid as the maze.
# `random_policy` is created with greedy=False and is used for the sections
# labelled "random" below; `optimal_policy` uses the Policy defaults and is
# handed to the agent for value iteration in the next cell.
random_policy = Policy(lenght=4, height=4, greedy=False)
optimal_policy = Policy(lenght=4, height=4)

In [5]:
# Create the agent on the maze with `optimal_policy`, starting at (2, 3).
# The last argument (1) is presumably the discount factor — later cells
# overwrite `agent.discount`; confirm against Agent's signature.
agent = Agent(maze, optimal_policy, (2, 3), 1)
# Runs value iteration; prints the value table after every sweep until done.
agent.value_iteration()
# Presumably makes the attached policy deterministic with respect to the
# computed values — confirm in agent.py.
agent.update_policy_to_deterministic()


Sweep 0: 
[[0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]]
Sweep 1: 
[[-1 -1 40  0]
 [-1 -1 -1 40]
 [10 -1 -1 -1]
 [ 0 10 -1 -1]]
Sweep 2: 
[[-2 39 40  0]
 [ 9 -2 39 40]
 [10  9 -2 30]
 [ 0 10  8 -2]]
Sweep 3: 
[[38 39 40  0]
 [ 9 38 39 40]
 [10  9 29 30]
 [ 0 10  8 29]]
Sweep 4: 
[[38 39 40  0]
 [37 38 39 40]
 [10 37 29 30]
 [ 0 10 28 29]]
Sweep 5: 
[[38 39 40  0]
 [37 38 39 40]
 [36 37 36 30]
 [ 0 36 28 29]]
Sweep 6: 
[[38 39 40  0]
 [37 38 39 40]
 [36 37 36 35]
 [ 0 36 35 29]]
Sweep 7: 
[[38 39 40  0]
 [37 38 39 40]
 [36 37 36 35]
 [ 0 36 35 34]]
Done after 7 sweeps!



In [6]:
# Roll out one episode from the agent's start position under its current
# policy, printing every move until a terminal cell is reached.
agent.simulate()


Simulating agent starting on (2, 3)
Moving from (2, 3) to (2, 2) ↑
Moving from (2, 2) to (1, 2) ←
Moving from (1, 2) to (1, 1) ↑
Moving from (1, 1) to (1, 0) ↑
Moving from (1, 0) to (2, 0) →
Moving from (2, 0) to (3, 0) →
Finished simulation om (3, 0)



In [7]:
# Print the value table side by side with the policy as arrows
# (terminal cells are drawn as ⦾ in the output).
agent.visualize()


Values:         Policy:
 38 39 40  0    →  →  →  ⦾   
 37 38 39 40    ↑  ↑  ↑  ↑   
 36 37 36 35    →  ↑  ←  ←   
  0 36 35 34    ⦾  ↑  ↑  ↑   



## First-visit Monte Carlo prediction
Policy: random<br>
discount: 1

In [8]:
# Evaluate the random policy with first-visit Monte Carlo prediction,
# undiscounted (discount = 1). The agent object from above is reused; only
# its policy and discount are swapped.
agent.policy = random_policy
agent.discount = 1
agent.first_visit_mc_prediction(max_episodes=MAX_EPISODES, threshold=THRESHOLD, converged_threshold=CONVERGED_THRESHOLD)

Stopped after 21367 episodes.
[[-14.79173865 -11.62117973   0.62211702   0.        ]
 [-13.69616742 -16.5120146  -12.9500684   -4.62133689]
 [ -7.15541355 -14.31973191 -19.27225715 -18.9606053 ]
 [  0.          -8.58581595 -17.63779725 -20.16162396]]
Stopped after 26365 episodes.
[[-14.64752324 -11.62494871   0.64302694   0.        ]
 [-13.72757033 -16.51489151 -12.93836107  -4.33137658]
 [ -7.00332876 -14.28140222 -19.20275169 -18.77369008]
 [  0.          -8.39286413 -17.42157659 -19.92416824]]
Stopped after 26881 episodes.
[[-14.71598151 -11.63024027   0.64112277   0.        ]
 [-13.7919598  -16.57028405 -12.99343391  -4.29888804]
 [ -6.97263037 -14.31979616 -19.25337647 -18.77845084]
 [  0.          -8.40744293 -17.49621212 -19.95067651]]
Stopped after 28060 episodes.
[[-14.73763208 -11.63844746   0.60820236   0.        ]
 [-13.90343762 -16.6060183  -13.07756876  -4.26190258]
 [ -7.02047404 -14.30737428 -19.25669071 -18.73905596]
 [  0.          -8.30794672 -17.47360367 -19.9748731

## First-visit Monte Carlo prediction
Policy: random<br>
discount: 0.9

In [9]:
# Evaluate the random policy with first-visit Monte Carlo prediction,
# discount = 0.9.
# The policy is assigned explicitly so this cell gives the same result no
# matter which cell was executed before it.
agent.policy = random_policy
agent.discount = 0.9
agent.first_visit_mc_prediction(max_episodes=MAX_EPISODES, threshold=THRESHOLD, converged_threshold=CONVERGED_THRESHOLD)

Stopped after 11203 episodes.
[[-5.35076408 -2.80615243  6.9337374   0.        ]
 [-5.35077886 -7.49461577 -4.84748692  2.00704227]
 [-0.95004493 -5.6816785  -9.11282673 -8.59378062]
 [ 0.         -1.67871343 -7.04017552 -7.93406589]]
Stopped after 11983 episodes.
[[-5.38909194 -2.85447576  6.87153732  0.        ]
 [-5.43643089 -7.5979144  -4.85954905  2.05825681]
 [-0.99998081 -5.67743183 -9.07010331 -8.49302364]
 [ 0.         -1.6879289  -7.03429181 -7.85132583]]
Stopped after 12294 episodes.
[[-5.34140994 -2.81395649  6.94488906  0.        ]
 [-5.41144706 -7.54974909 -4.82572839  2.0324291 ]
 [-1.00690215 -5.69079462 -9.1186011  -8.52867686]
 [ 0.         -1.65139323 -7.07552253 -7.93691638]]
Stopped after 12529 episodes.
[[-5.36603578 -2.84999122  6.97267211  0.        ]
 [-5.42998297 -7.56552146 -4.88007717  1.94036492]
 [-1.03439844 -5.68211092 -9.12026586 -8.55671651]
 [ 0.         -1.65292234 -7.05448391 -7.91568575]]
Stopped after 13025 episodes.
[[-5.32721496 -2.84035151  6.8

## First-visit Monte Carlo prediction
Policy: optimal<br>
discount: 1

In [10]:
# Evaluate the policy obtained from value iteration with first-visit
# Monte Carlo prediction, undiscounted (discount = 1).
agent.policy = optimal_policy
agent.discount = 1
agent.first_visit_mc_prediction(max_episodes=MAX_EPISODES, threshold=THRESHOLD, converged_threshold=CONVERGED_THRESHOLD)

Stopped after 28 episodes.
[[38. 39. 40.  0.]
 [37. 38. 39. 40.]
 [36. 37. 36. 35.]
 [ 0. 36. 35. 34.]]
Did not converge within 100000 episodes.
[[38. 39. 40.  0.]
 [37. 38. 39. 40.]
 [36. 37. 36. 35.]
 [ 0. 36. 35. 34.]]


## First-visit Monte Carlo prediction
Policy: optimal<br>
discount: 0.9

In [11]:
# Evaluate the policy obtained from value iteration with first-visit
# Monte Carlo prediction, discount = 0.9.
# The policy is assigned explicitly so this cell gives the same result no
# matter which cell was executed before it.
agent.policy = optimal_policy
agent.discount = 0.9
agent.first_visit_mc_prediction(max_episodes=MAX_EPISODES, threshold=THRESHOLD, converged_threshold=CONVERGED_THRESHOLD)

Stopped after 28 episodes.
[[30.5     35.      40.       0.     ]
 [26.45    30.5     35.      40.     ]
 [22.805   26.45    22.805   19.5245 ]
 [ 0.      22.805   19.5245  16.57205]]
Did not converge within 100000 episodes.
[[30.5     35.      40.       0.     ]
 [26.45    30.5     35.      40.     ]
 [22.805   26.45    22.805   19.5245 ]
 [ 0.      22.805   19.5245  16.57205]]


## Tabular TD
Policy: random<br>
discount: 1

In [12]:
# Evaluate the random policy with tabular TD, undiscounted (discount = 1).
agent.policy = random_policy
agent.discount = 1
agent.tabular_td(max_episodes=MAX_EPISODES, threshold=THRESHOLD, converged_threshold=CONVERGED_THRESHOLD)

Did not converge within 100000 episodes
[[-16.15783286 -12.57325547   8.54155602   0.        ]
 [-12.76991875 -15.92285117 -15.53829893  -9.01108913]
 [ -3.86861403 -12.65860024 -21.26816648 -22.32287058]
 [  0.         -12.76450266 -18.99529999 -21.25862187]]


## Tabular TD
Policy: random<br>
discount: 0.9

In [13]:
# Evaluate the random policy with tabular TD, discount = 0.9.
# The policy is re-assigned here, which keeps the cell self-contained.
agent.policy = random_policy
agent.discount = 0.9
agent.tabular_td(max_episodes=MAX_EPISODES, threshold=THRESHOLD, converged_threshold=CONVERGED_THRESHOLD)

Did not converge within 100000 episodes
[[-5.36922723 -4.27181215 -0.67740676  0.        ]
 [-4.13486295 -6.86293041 -6.85720912  0.78812473]
 [-0.36778336 -5.6544217  -8.61722106 -8.68706355]
 [ 0.         -1.56648372 -6.97502013 -8.2393819 ]]


## Tabular TD
Policy: optimal<br>
discount: 1

In [14]:
# Evaluate the policy obtained from value iteration with tabular TD,
# undiscounted (discount = 1).
agent.policy = optimal_policy
agent.discount = 1
agent.tabular_td(max_episodes=MAX_EPISODES, threshold=THRESHOLD, converged_threshold=CONVERGED_THRESHOLD)

Stopped after 1091 episodes.
[[37.99994877 39.         40.          0.        ]
 [36.99935304 38.         38.98942528 39.99483972]
 [35.95975071 37.         36.         34.99643057]
 [ 0.         35.95932567 34.99957454 33.94765431]]


## Tabular TD
Policy: optimal<br>
discount: 0.9

In [15]:
# Evaluate the policy obtained from value iteration with tabular TD,
# discount = 0.9.
# The policy is assigned explicitly so this cell gives the same result no
# matter which cell was executed before it.
agent.policy = optimal_policy
agent.discount = 0.9
agent.tabular_td(max_episodes=MAX_EPISODES, threshold=THRESHOLD, converged_threshold=CONVERGED_THRESHOLD)

Stopped after 1288 episodes.
[[30.49998539 35.         40.          0.        ]
 [26.44968034 30.5        34.99526078 39.99937263]
 [22.78812193 26.45       22.805      19.52438238]
 [ 0.         22.80375543 19.52423947 16.51482171]]


## On-policy first-visit Monte Carlo control
policy: random ε-soft<br>
discount: 1

In [16]:
# On-policy first-visit Monte Carlo control starting from the random policy,
# undiscounted (discount = 1). The return value is kept in `qf` for plotting.
# NOTE(review): the saved run of this cell failed with
# "UnboundLocalError: local variable 'converged_count' referenced before
# assignment". The error is raised in the called code, not in this cell, so
# the fix belongs in the agent module. Until then `qf` is not defined for
# the plotting cell that follows.
agent.policy = random_policy
agent.discount = 1
qf = agent.on_policy_first_vist_mc(max_episodes=MAX_EPISODES, threshold=THRESHOLD, converged_threshold=CONVERGED_THRESHOLD)

UnboundLocalError: local variable 'converged_count' referenced before assignment

In [None]:
# Plot the agent's current policy matrix and the `qf` returned by the
# preceding control run (presumably the learned Q-function) as 4x4 grids.
# Requires the preceding cell to have completed so that `qf` exists.
plot_matrix(4, 4, transform_policy_to_matrix_values(agent.policy.policy_matrix))
plot_matrix(4, 4, transform_policy_to_matrix_values(qf))

## On-policy first-visit Monte Carlo control
policy: random ε-soft<br>
discount: 0.9

In [None]:
# On-policy first-visit Monte Carlo control, discount = 0.9.
# Bind the random policy explicitly before resetting it: `reset_policy()`
# acts on whatever policy is currently attached, so without this an
# out-of-order run could reset `optimal_policy` instead.
agent.policy = random_policy
agent.policy.reset_policy()
agent.discount = 0.9
qf = agent.on_policy_first_vist_mc(max_episodes=MAX_EPISODES, threshold=THRESHOLD, converged_threshold=CONVERGED_THRESHOLD)

In [None]:
# Plot the agent's current policy matrix and the `qf` returned by the
# preceding control run (presumably the learned Q-function) as 4x4 grids.
plot_matrix(4, 4, transform_policy_to_matrix_values(agent.policy.policy_matrix))
plot_matrix(4, 4, transform_policy_to_matrix_values(qf))

## Sarsa (on-policy TD control)
policy: derived from Q<br>
discount: 1

In [None]:
# Sarsa (on-policy TD control), undiscounted (discount = 1).
# The return value is kept in `qf` for plotting.
# NOTE(review): unlike the discount-0.9 variant of this section, this cell
# does not call agent.policy.reset_policy(). If an earlier control run
# modified `random_policy`, this run starts from that state — confirm
# whether that is intended.
agent.policy = random_policy
agent.discount = 1
qf = agent.sarsa(max_episodes=MAX_EPISODES, threshold=THRESHOLD, converged_threshold=CONVERGED_THRESHOLD)

In [None]:
# Plot the agent's current policy matrix and the `qf` returned by the
# preceding Sarsa run (presumably the learned Q-function) as 4x4 grids.
plot_matrix(4, 4, transform_policy_to_matrix_values(agent.policy.policy_matrix))
plot_matrix(4, 4, transform_policy_to_matrix_values(qf))

## Sarsa (on-policy TD control)
policy: derived from Q<br>
discount: 0.9

In [None]:
# Sarsa (on-policy TD control), discount = 0.9.
# Bind the random policy explicitly before resetting it: `reset_policy()`
# acts on whatever policy is currently attached, so without this an
# out-of-order run could reset `optimal_policy` instead.
agent.policy = random_policy
agent.policy.reset_policy()
agent.discount = 0.9
qf = agent.sarsa(max_episodes=MAX_EPISODES, threshold=THRESHOLD, converged_threshold=CONVERGED_THRESHOLD)

In [None]:
# Plot the agent's current policy matrix and the `qf` returned by the
# preceding Sarsa run (presumably the learned Q-function) as 4x4 grids.
plot_matrix(4, 4, transform_policy_to_matrix_values(agent.policy.policy_matrix))
plot_matrix(4, 4, transform_policy_to_matrix_values(qf))

## Q-learning (off-policy TD control)
policy: derived from Q<br>
discount: 1

In [None]:
# Q-learning (off-policy TD control), undiscounted (discount = 1).
# The return value is kept in `qf` for plotting.
# NOTE(review): unlike the discount-0.9 variant of this section, this cell
# does not call agent.policy.reset_policy(). If an earlier control run
# modified `random_policy`, this run starts from that state — confirm
# whether that is intended.
agent.policy = random_policy
agent.discount = 1
qf = agent.q_learning(max_episodes=MAX_EPISODES, threshold=THRESHOLD, converged_threshold=CONVERGED_THRESHOLD)

In [None]:
# Plot the agent's current policy matrix and the `qf` returned by the
# preceding Q-learning run (presumably the learned Q-function) as 4x4 grids.
plot_matrix(4, 4, transform_policy_to_matrix_values(agent.policy.policy_matrix))
plot_matrix(4, 4, transform_policy_to_matrix_values(qf))

## Q-learning (off-policy TD control)
policy: derived from Q<br>
discount: 0.9

In [None]:
# Q-learning (off-policy TD control), discount = 0.9.
# Bind the random policy explicitly before resetting it: `reset_policy()`
# acts on whatever policy is currently attached, so without this an
# out-of-order run could reset `optimal_policy` instead.
agent.policy = random_policy
agent.policy.reset_policy()
agent.discount = 0.9
qf = agent.q_learning(max_episodes=MAX_EPISODES, threshold=THRESHOLD, converged_threshold=CONVERGED_THRESHOLD)

In [None]:
# Plot the agent's current policy matrix and the `qf` returned by the
# preceding Q-learning run (presumably the learned Q-function) as 4x4 grids.
plot_matrix(4, 4, transform_policy_to_matrix_values(agent.policy.policy_matrix))
plot_matrix(4, 4, transform_policy_to_matrix_values(qf))

## Double Q-learning
policy: derived from Q<br>
discount: 1

In [None]:
# Double Q-learning, undiscounted (discount = 1).
# The return value is kept in `qf`; the plotting cell indexes it as qf[0]
# and qf[1].
# NOTE(review): unlike the discount-0.9 variant of this section, this cell
# does not call agent.policy.reset_policy(). If an earlier control run
# modified `random_policy`, this run starts from that state — confirm
# whether that is intended.
agent.policy = random_policy
agent.discount = 1
qf = agent.double_q_learning(max_episodes=MAX_EPISODES, threshold=THRESHOLD, converged_threshold=CONVERGED_THRESHOLD)

In [None]:
# Plot the agent's current policy matrix, then both elements of `qf`
# (presumably the two Q-tables maintained by double Q-learning) as 4x4 grids.
plot_matrix(4, 4, transform_policy_to_matrix_values(agent.policy.policy_matrix))
plot_matrix(4, 4, transform_policy_to_matrix_values(qf[0]))
plot_matrix(4, 4, transform_policy_to_matrix_values(qf[1]))

## Double Q-learning
policy: derived from Q<br>
discount: 0.9

In [None]:
# Double Q-learning, discount = 0.9.
# Bind the random policy explicitly before resetting it: `reset_policy()`
# acts on whatever policy is currently attached, so without this an
# out-of-order run could reset `optimal_policy` instead.
agent.policy = random_policy
agent.policy.reset_policy()
agent.discount = 0.9
qf = agent.double_q_learning(max_episodes=MAX_EPISODES, threshold=THRESHOLD, converged_threshold=CONVERGED_THRESHOLD)

In [None]:
# Plot the agent's current policy matrix, then both elements of `qf`
# (presumably the two Q-tables maintained by double Q-learning) as 4x4 grids.
plot_matrix(4, 4, transform_policy_to_matrix_values(agent.policy.policy_matrix))
plot_matrix(4, 4, transform_policy_to_matrix_values(qf[0]))
plot_matrix(4, 4, transform_policy_to_matrix_values(qf[1]))