### **Code Description**
This code focuses on solving the N-puzzle problem using Reinforcement Learning methods.
- **Methods**
    - `async_value_iteration`: Asynchronous value iteration where state values are updated when the agent visits those states. The agent follows an epsilon-greedy policy while taking actions.
    - `async_value_iteration_with_stack`: Same as `async_value_iteration`, except that the agent also keeps a stack of the states it visited during an episode and, at the end of each episode, updates the values of those states in reverse order.
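    - `n_step_TD`: n-step temporal-difference (TD) learning. It can optionally be combined with a one-step Bellman update (`plus_value_iteration`) and with a repeat of that update, in reverse order, over the states visited in the episode (`plus_value_iteration_with_stack`).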

In [87]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [88]:
import utils
from environment import Environment
from agent import Agent

In [None]:
# Experiment: asynchronous value iteration with a per-episode state stack.

# Puzzle size and training budget
N = 5
max_episodes, max_steps = 200, 10000

# Value-update settings
learning_rate = 1.0
default_state_value = 0.0
update_on_increase = True

# Exploration settings (start and end values are equal in this run)
epsilon_start = epsilon_end = 0.6
epsilon_decay_type = "linear"   # Should be "linear" or "exponential"

# Build the environment and agent, then draw a random start state.
env = Environment(N)
agent = Agent(env)
initial_state_string = utils.get_random_state_string(N)

agent.async_value_iteration_with_stack(
    initial_state_string,
    max_episodes,
    max_steps,
    learning_rate,
    epsilon_start,
    epsilon_end,
    default_state_value,
    epsilon_decay_type,
    update_on_increase,
)

In [60]:
# Experiment: plain asynchronous value iteration on the 3x3 puzzle.
#
# NOTE(review): the output saved with this cell is
#   AttributeError: 'function' object has no attribute 'max_potential'
# The cause is not visible from this notebook; check the `agent` module
# before relying on this cell.

# Puzzle size and training budget
N = 3
max_episodes, max_steps = 3000, 2000

# Value-update settings
learning_rate = 1.0
default_state_value = 0.0
theta = 1e-3

# Exploration settings (start and end values are equal in this run)
epsilon_start = epsilon_end = 0.1

# Build the environment and agent, then draw a random start state.
env = Environment(N)
agent = Agent(env)
initial_state_string = utils.get_random_state_string(N)

agent.async_value_iteration(
    initial_state_string,
    max_episodes,
    max_steps,
    learning_rate,
    epsilon_start,
    epsilon_end,
    default_state_value,
    theta,
)

AttributeError: 'function' object has no attribute 'max_potential'

In [89]:
# Experiment: n-step TD on the 5x5 puzzle.

# Puzzle size and training budget
N = 5                  # side length of the puzzle
n = 10000              # the "n" in n-step TD
max_episodes = 50
max_steps = 12000      # upper bound on steps within one episode

# Exploration settings (start and end values are equal in this run)
epsilon_start = epsilon_end = 0.45
epsilon_decay_type = "linear"

# Value-update settings
default_state_value = 0.0
update_on_increase = True                 # only overwrite a state value when the new estimate is larger
plus_value_iteration = True               # also apply the one-step Bellman update used in value iteration
plus_value_iteration_with_stack = True    # replay that one-step update over the episode's states, last visited first

# Build the environment and agent, then draw a random start state.
env = Environment(N)
agent = Agent(env)
initial_state_string = utils.get_random_state_string(N)

agent.n_step_TD(
    initial_state_string,
    n,
    max_episodes,
    max_steps,
    epsilon_start,
    epsilon_end,
    default_state_value,
    update_on_increase,
    epsilon_decay_type,
    plus_value_iteration,
    plus_value_iteration_with_stack,
)

Higher potential reached: -90
10 4  0  24 22 
7  21 19 23 3  
18 16 6  20 17 
12 5  9  8  13 
15 1  2  14 11 
Higher potential reached: -89
10 0  4  24 22 
7  21 19 23 3  
18 16 6  20 17 
12 5  9  8  13 
15 1  2  14 11 
Higher potential reached: -88
0  10 4  24 22 
7  21 19 23 3  
18 16 6  20 17 
12 5  9  8  13 
15 1  2  14 11 
Higher potential reached: -87
7  0  4  24 22 
21 10 19 23 3  
18 16 6  20 17 
12 5  9  8  13 
15 1  2  14 11 
Higher potential reached: -86
0  7  4  24 22 
21 10 19 23 3  
18 16 6  20 17 
12 5  9  8  13 
15 1  2  14 11 
Higher potential reached: -85
10 21 23 22 3  
7  19 6  4  0  
18 16 9  24 17 
12 5  2  20 11 
15 1  14 13 8  
Higher potential reached: -84
10 21 23 3  0  
7  19 6  22 4  
18 16 9  24 17 
12 5  2  20 11 
15 1  14 13 8  
Higher potential reached: -83
10 21 23 3  4  
7  19 6  22 0  
18 16 9  24 17 
12 5  2  20 11 
15 1  14 13 8  
Higher potential reached: -82
10 21 3  6  4  
7  19 23 0  17 
18 16 9  22 24 
12 5  2  20 11 
15 1  14 13 8  
Higher pot

KeyboardInterrupt: 

In [None]:
# Experiment: n-step TD on the 3x3 puzzle, dispatched through
# utils.parallel_processing.
# The __main__ guard is kept as written; presumably the helper starts worker
# processes that re-import this code -- confirm in the `utils` module.
if __name__ == "__main__":
    # Puzzle size and training budget
    N = 3
    n = 1000
    max_episodes, max_steps = 80, 1000

    # Exploration settings
    epsilon_start, epsilon_end = 0.4, 0.01
    epsilon_decay_type = "linear"

    # Value-update settings
    default_state_value = 0.0
    update_on_increase = True
    plus_value_iteration = True
    plus_value_iteration_with_stack = True

    # Passed as the first argument to utils.parallel_processing
    process_count = 10

    utils.parallel_processing(
        process_count,
        N,
        n,
        max_episodes,
        max_steps,
        epsilon_start,
        epsilon_end,
        default_state_value,
        update_on_increase,
        epsilon_decay_type,
        plus_value_iteration,
        plus_value_iteration_with_stack,
    )

In [None]:
# Experiment: the `n_step_TD_2` variant on the 3x3 puzzle.

# Puzzle size and training budget
N = 3
n = 50
max_episodes, max_steps = 300, 1000

# Exploration settings
epsilon_start, epsilon_end = 0.5, 0.01

# Value assigned to states that have no estimate yet
# (presumably -- confirm in the `agent` module)
default_state_value = 0.0

# Build the environment and agent, then draw a random start state.
env = Environment(N)
agent = Agent(env)
initial_state_string = utils.get_random_state_string(N)

agent.n_step_TD_2(
    initial_state_string,
    n,
    max_episodes,
    max_steps,
    epsilon_start,
    epsilon_end,
    default_state_value,
)

In [39]:
# Number of entries in the agent's value table `V`
# (keyed by state string in the recorded outputs of this notebook).
len(agent.V)

21378

In [21]:
# Show only the first few entries of the value table instead of the whole
# mapping: a recorded run reports len(agent.V) == 21378, so displaying all of
# `agent.V` floods the notebook and the output gets cut off.
# Assumes `agent.V` is a dict-like mapping of state string -> value, as the
# recorded output suggests; entries appear in insertion order.
dict(list(agent.V.items())[:15])

{'01020304050607080910111213141516171819202122232400': 0.0,
 '20090416050815121410012306112413071822021903211700': 9999645798.0,
 '20090416050815121410012306112413071822001903211702': 9999645919.0,
 '20090416050815121410012306110013071822241903211702': -359936.0,
 '20090416050815121400012306111013071822241903211702': -359945.0,
 '20090416050815121410012306001113071822241903211702': -359936.0,
 '20090416050815120010012306141113071822241903211702': -359948.0,
 '20090400050815121610012306141113071822241903211702': -359949.0,
 '20090004050815121610012306141113071822241903211702': -359950.0,
 '20000904050815121610012306141113071822241903211702': -359951.0,
 '00200904050815121610012306141113071822241903211702': -359952.0,
 '08200904050015121610012306141113071822241903211702': -359951.0,
 '08200904051500121610012306141113071822241903211702': -359952.0,
 '08000904051520121610012306141113071822241903211702': -359953.0,
 '00080904051520121610012306141113071822241903211702': -359954.0,
 '15080904

In [40]:
# Run the agent's `exploit` routine from the sampled start state.
# In the recorded output it prints the board after each step; presumably it
# follows the learned state values greedily -- confirm in the `agent` module.
agent.exploit(initial_state_string)

2  20 22 1  12 
3  19 15 11 23 
9  18 16 5  6  
8  21 0  10 4  
14 13 7  24 17 
- step: 1 -
2  20 22 1  12 
3  19 15 11 23 
9  18 16 5  6  
8  21 7  10 4  
14 13 0  24 17 
- step: 2 -
2  20 22 1  12 
3  19 15 11 23 
9  18 16 5  6  
8  21 7  10 4  
14 0  13 24 17 
- step: 3 -
2  20 22 1  12 
3  19 15 11 23 
9  18 16 5  6  
8  21 7  10 4  
0  14 13 24 17 
- step: 4 -
2  20 22 1  12 
3  19 15 11 23 
9  18 16 5  6  
0  21 7  10 4  
8  14 13 24 17 
- step: 5 -
2  20 22 1  12 
3  19 15 11 23 
9  18 16 5  6  
21 0  7  10 4  
8  14 13 24 17 
- step: 6 -
2  20 22 1  12 
3  19 15 11 23 
9  0  16 5  6  
21 18 7  10 4  
8  14 13 24 17 
- step: 7 -
2  20 22 1  12 
3  0  15 11 23 
9  19 16 5  6  
21 18 7  10 4  
8  14 13 24 17 
- step: 8 -
2  0  22 1  12 
3  20 15 11 23 
9  19 16 5  6  
21 18 7  10 4  
8  14 13 24 17 
- step: 9 -
2  22 0  1  12 
3  20 15 11 23 
9  19 16 5  6  
21 18 7  10 4  
8  14 13 24 17 
- step: 10 -
2  22 1  0  12 
3  20 15 11 23 
9  19 16 5  6  
21 18 7  10 4  
8  14 13 24 17 