# Final Project - Reinforcement Learning
Hello dear students,<br> this is the template notebook. Please click on the "File" tab and then on "Save a copy in Drive".

---
<br>

### Name and ID:
Student 1: Avraham Raviv, 204355390
<br>
Student 2: Yevgeni Berkovitch, 317079234
<br><br>
<img src="https://play-lh.googleusercontent.com/e_oKlKPISbgdzut1H9opevS7-LTB8-8lsmpCdMkhlnqFenZhpjxbLmx7l158-xQQCIY">

### https://github.com/mpSchrader/gym-sokoban

# Installs

In [1]:
%%capture
!sudo apt-get update
!sudo apt-get install -y xvfb ffmpeg freeglut3-dev
!pip install 'imageio==2.4.0'
!pip install gym
!pip install pygame
!apt-get install python-opengl -y
!apt install xvfb -y
!pip install pyvirtualdisplay
!pip install piglet
!pip install gym
!apt-get install python-opengl -y
!apt install xvfb -y
!pip install gym_sokoban

!imageio_download_bin ffmpeg

# Imports

In [2]:
import random

import numpy as np
import matplotlib.pyplot as plt

import base64
import imageio
from pyvirtualdisplay import Display
from IPython.display import HTML

import gym
from gym import error, spaces, utils
from soko_pap import *

from collections import deque

from keras.models import Sequential
from keras.layers import Dense, Conv2D, Flatten

import tensorflow
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import ExponentialDecay

from tqdm.notebook import tqdm
from collections import defaultdict

In [3]:
%matplotlib inline

In [4]:
# Presumably fetches the ffmpeg binary through imageio's ffmpeg plugin; this call
# depends on the imageio version pinned in the install cell - confirm before upgrading.
imageio.plugins.ffmpeg.download()

In [5]:
# Raise gym's logger threshold so that only errors are reported.
from gym import logger as gymlogger
gymlogger.set_level(40) # error only

In [6]:
# Silence every Python warning for the rest of the notebook session.
import warnings
warnings.filterwarnings('ignore')

# Display utils
The cell below contains the video display configuration. No need to make changes here.

In [7]:
def embed_mp4(filename):
    """Embeds an mp4 file in the notebook.

    Args:
        filename: Path of the mp4 file to embed.

    Returns:
        An ``HTML`` display object holding a ``<video>`` tag whose source is
        the file content inlined as a base64 data URI.
    """
    # Read through a context manager so the file handle is always closed,
    # instead of relying on garbage collection of an anonymous file object.
    with open(filename, 'rb') as video_file:
        video = video_file.read()
    b64 = base64.b64encode(video)
    tag = '''
    <video width="640" height="480" controls>
    <source src="data:video/mp4;base64,{0}" type="video/mp4">
    Your browser does not support the video tag.
    </video>'''.format(b64.decode())

    return HTML(tag)

# Utils

In [8]:
def get_distances_for_target(room_state, target):
    """Breadth-first search distances from ``target`` over non-wall cells.

    A cell is walkable when its value in ``room_state`` is not 0. Moves are
    restricted to the four axis-aligned neighbours.

    Args:
        room_state: 2-D array describing the room; 0 marks a blocked cell.
        target: ``(row, col)`` pair of the cell the search starts from.

    Returns:
        A float array with the shape of ``room_state``. Each reachable cell
        holds its step distance to ``target``; the target itself, blocked
        cells and unreachable cells hold 0.
    """
    n_rows, n_cols = room_state.shape[0], room_state.shape[1]
    # Normalise so that lists / arrays are accepted and stay hashable.
    start = (int(target[0]), int(target[1]))

    distances = np.zeros(shape=room_state.shape)
    visited_cells = {start}
    cell_queue = deque([start])

    while cell_queue:
        row, col = cell_queue.popleft()
        next_distance = distances[row][col] + 1
        for d_row, d_col in ((1, 0), (-1, 0), (0, 1), (0, -1)):
            next_row, next_col = row + d_row, col + d_col
            # Without this guard a target on the border would either wrap
            # around through negative indices or raise IndexError.
            if not (0 <= next_row < n_rows and 0 <= next_col < n_cols):
                continue
            if room_state[next_row][next_col] != 0 and (next_row, next_col) not in visited_cells:
                distances[next_row][next_col] = next_distance
                visited_cells.add((next_row, next_col))
                cell_queue.append((next_row, next_col))

    return distances

def get_maze_info(room_state):
    """Collect the two target cells of a room and their distance maps.

    Cells whose value is 2 are treated as targets and are gathered in
    row-major order; the first two found are used.

    Args:
        room_state: 2-D array describing the room.

    Returns:
        A dict with the keys ``'target0'``, ``'target1'`` (``(row, col)``
        tuples), ``'distances0'``, ``'distances1'`` (one distance map per
        target) and ``'coomon_distances'`` (element-wise minimum of both maps).
    """
    n_rows, n_cols = room_state.shape[0], room_state.shape[1]
    target_cells = [
        (row, col)
        for row in range(n_rows)
        for col in range(n_cols)
        if room_state[row][col] == 2
    ]

    first_target, second_target = target_cells[0], target_cells[1]
    first_map = get_distances_for_target(room_state, first_target)
    second_map = get_distances_for_target(room_state, second_target)

    # The misspelled key is part of the contract with the reward function.
    return {
        'target0': first_target,
        'target1': second_target,
        'distances0': first_map,
        'distances1': second_map,
        'coomon_distances': np.minimum(first_map, second_map),
    }

def calc_distances(room_state, distances):
    """Sum the distance-map values under every box that is not on a target.

    Cells whose value is 4 are treated as boxes. The previous implementation
    raised IndexError when no such cell existed and silently ignored every
    box but the first when more than two were present; this version handles
    any number of boxes (including none) uniformly.

    Args:
        room_state: 2-D array describing the room.
        distances: 2-D distance map with the same shape as ``room_state``.

    Returns:
        The sum of ``distances`` over all cells of ``room_state`` equal to 4
        (zero when there is no such cell).
    """
    box_mask = np.asarray(room_state) == 4
    return np.asarray(distances)[box_mask].sum()

def box2target_change_reward(room_state, next_room_state, maze_info):
    """Shaping reward based on how a step changed the box-to-target distance.

    Args:
        room_state: Room array before the step.
        next_room_state: Room array after the step.
        maze_info: Dict produced by ``get_maze_info`` (targets and distance maps).

    Returns:
        ``-1.0`` when the step left the room unchanged or increased the summed
        box distance, ``1.0`` when it decreased it, ``0.0`` otherwise.
    """
    # A step that changes nothing in the room is penalised outright.
    if np.array_equal(room_state, next_room_state):
        return -1.0

    first_target = maze_info['target0']
    second_target = maze_info['target1']
    to_first = maze_info['distances0']
    to_second = maze_info['distances1']
    to_nearest = maze_info['coomon_distances']

    # When a target cell holds the value 3 before the step, distances are
    # measured against the other target only; otherwise against the nearest.
    if room_state[first_target[0]][first_target[1]] == 3:
        distance_map = to_second
    elif room_state[second_target[0]][second_target[1]] == 3:
        distance_map = to_first
    else:
        distance_map = to_nearest

    before = calc_distances(room_state, distance_map)
    after = calc_distances(next_room_state, distance_map)

    if after < before:
        return 1.0
    if after > before:
        return -1.0
    return 0.0

# Solution

In [9]:
class SOK_Agent:
    """Double-DQN agent with rotation augmentation and two replay buffers.

    The online network (``model``) picks the greedy next action and a slowly
    tracking copy (``target_model``) evaluates it. Transitions are stored in
    ``replay_buffer``; callers may additionally copy recent transitions into
    ``prioritized_replay_buffer``, from which half of every minibatch is drawn
    once it holds enough samples.
    """

    def __init__(self):
        # Construct DQN models
        self.state_size = (112,112,1)
        self.action_size = 8
        self.model = self._build_model()
        self.target_model = self._build_model()
        self.target_model.set_weights(self.model.get_weights())
        self.batch_size = 8

        # Replay buffers
        self.replay_buffer = deque(maxlen=50000)
        self.prioritized_replay_buffer = deque(maxlen=1000)

        # Hyperparameters
        self.gamma = 0.9             # discount factor
        self.epsilon = 1.0           # current exploration rate
        self.epsilon_min = 0.2       # exploration floor
        self.epsilon_decay = 0.9995  # multiplicative decay applied per replay() call
        self.replay_rate = 10        # read by the training loop: steps between replay() calls
        self.update_beta = 0.999     # weight kept from the target network on each soft update

        # Action index to use after the frame has been rotated once with
        # np.rot90 (see replay()).
        # NOTE(review): presumably 0-3 / 4-7 are two groups of up, down, left,
        # right moves - verify against the environment's action encoding.
        self.action_rotation_map = {
            0: 2,
            1: 3,
            2: 1,
            3: 0,
            4: 6,
            5: 7,
            6: 5,
            7: 4
        }

    def _build_model(self):
        """Build and compile the Q-network (one linear output per action)."""
        model = Sequential()
        # 16x16 kernel with stride 16: the 112x112 frame is reduced to a 7x7 grid
        # of non-overlapping patches.
        model.add(Conv2D(32, (16,16), strides=(16,16), input_shape=self.state_size, activation='relu'))
        model.add(Conv2D(64, (3,3), activation='relu'))
        model.add(Conv2D(64, (3,3), activation='relu'))
        model.add(Flatten())
        model.add(Dense(512, activation='relu'))
        model.add(Dense(128, activation='relu'))
        model.add(Dense(32, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))

        lr_schedule = ExponentialDecay(0.001, decay_steps=2000, decay_rate=0.99, staircase=False)
        model.compile(optimizer=Adam(learning_rate=lr_schedule), loss='mse')
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the main replay buffer."""
        self.replay_buffer.append([state, action, reward, next_state, done])

    def copy_to_prioritized_buffer(self, n):
        """Copy the ``n`` most recent transitions (newest first) to the prioritized buffer.

        ``n`` is clamped to the number of stored transitions so that asking for
        more than is available no longer raises IndexError.
        """
        for i in range(min(n, len(self.replay_buffer))):
            self.prioritized_replay_buffer.append(self.replay_buffer[-1-i])

    def act(self, state, stochastic=False):
        """Choose an action index for ``state``.

        With probability ``epsilon`` a uniformly random action is returned.
        Otherwise the online network is queried: when ``stochastic`` is true the
        action is sampled from a softmax over the predicted Q-values, else the
        arg-max action is returned.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)

        act_values = self.model.predict(state, verbose=0)[0]

        if stochastic:
            # Shift by the maximum (and use float64) before exponentiating:
            # the plain softmax overflows to inf/inf = NaN for large Q-values,
            # which makes np.random.choice raise.
            shifted = np.asarray(act_values, dtype=np.float64)
            shifted = shifted - shifted.max()
            exp_values = np.exp(shifted)
            act_probs = exp_values / exp_values.sum()
            return np.random.choice(np.arange(self.action_size), size=1, p=act_probs)[0]

        return np.argmax(act_values)

    def replay(self):
        """Run one training round on a rotation-augmented minibatch.

        Does nothing until the main buffer holds ``batch_size`` transitions.
        Every sampled transition is used in its four 90-degree rotations, the
        online network is fitted to Double-DQN targets, the target network is
        soft-updated and ``epsilon`` is decayed.
        """
        if len(self.replay_buffer) < self.batch_size:
            return

        half_batch = self.batch_size // 2
        if half_batch == 0 or len(self.prioritized_replay_buffer) < half_batch:
            minibatch = random.sample(self.replay_buffer, self.batch_size)
        else:
            minibatch = random.sample(self.replay_buffer, half_batch)
            minibatch.extend(random.sample(self.prioritized_replay_buffer, half_batch))

        # Size the arrays from the actual minibatch so an odd batch_size does
        # not leave all-zero padding rows in the training data.
        n_samples = len(minibatch) * 4
        height, width = self.state_size[0], self.state_size[1]
        states = np.zeros((n_samples, height, width))
        actions = np.zeros(n_samples, dtype=int)
        rewards = np.zeros(n_samples)
        next_states = np.zeros((n_samples, height, width))
        statuses = np.zeros(n_samples)

        for i, (state, action, reward, next_state, done) in enumerate(minibatch):
            for rot in range(4):
                ind = i*4+rot
                if rot != 0:
                    # Rotations accumulate: each pass turns the previous frame
                    # by another quarter turn and remaps the action accordingly.
                    state = np.rot90(state, axes=(1,2))
                    next_state = np.rot90(next_state, axes=(1,2))
                    action = self.action_rotation_map.get(action)

                states[ind] = state
                actions[ind] = action
                rewards[ind] = reward
                next_states[ind] = next_state
                statuses[ind] = 1 if done else 0

        # Start from the current predictions so only the taken action's
        # Q-value contributes to the loss.
        targets = self.model.predict(states, verbose=0)
        max_actions = np.argmax(self.model.predict(next_states, verbose=0), axis=1)
        next_rewards = self.target_model.predict(next_states, verbose=0)

        for ind, (action, reward, next_reward, max_action, done) in enumerate(
                zip(actions, rewards, next_rewards, max_actions, statuses)):
            if not done:
                # Online network selects the action, target network evaluates it.
                reward += self.gamma * next_reward[max_action]
            targets[ind][action] = reward

        self.model.fit(states, targets, epochs=10, verbose=0)

        self.update_target_model()

        if self.epsilon > self.epsilon_min:
            self.epsilon = self.epsilon * self.epsilon_decay

    def update_target_model(self):
        """Soft-update: target = beta * target + (1 - beta) * online, per weight tensor."""
        beta = self.update_beta
        blended = [
            beta*target_w + (1-beta)*online_w
            for online_w, target_w in zip(self.model.get_weights(), self.target_model.get_weights())
        ]
        self.target_model.set_weights(blended)

    def load(self, name):
        """Load online-network weights from ``name`` and sync the target network."""
        self.model.load_weights(name)
        # Keep the target network consistent with the restored weights so that
        # training resumed after a load does not bootstrap from random values.
        self.target_model.set_weights(self.model.get_weights())

    def save(self, name):
        """Save the online-network weights to ``name``."""
        self.model.save_weights(name)

In [10]:
def process_frame(frame):
    """Convert a frame to a single-channel array scaled by 1/255 with a leading batch axis.

    The last axis of ``frame`` (axis 2) is averaged away, the result is divided
    by 255 and a new axis of length 1 is inserted at position 0.
    """
    grayscale = frame.mean(axis=2) / 255
    return grayscale[np.newaxis, ...]

## Training

#### Test Suite

In [11]:
# Number of training episodes run by the training loop.
max_episodes = 50000
# Step budget per episode, passed to the environment in init_sok().
max_steps = 40

def init_sok(r):
    """Create a 7x7, two-box push-and-pull Sokoban environment.

    Python's ``random`` module is seeded with ``r`` before the environment is
    constructed, and the environment's step limit is set to ``max_steps``.
    """
    random.seed(r)
    env = PushAndPullSokobanEnv(num_boxes=2, dim_room=(7, 7))
    env.set_maxsteps(max_steps)
    return env

In [12]:
def test_agent(e, cur_record, stochastic=False, num_puzzles=100):
    """Evaluate the global ``agent`` on a fixed set of puzzles.

    Exploration is switched off (``epsilon = 0``) for the evaluation and
    restored afterwards, even if an episode raises. Puzzle ``t`` is built with
    ``init_sok(t)`` for ``t`` in ``range(num_puzzles)``.

    Args:
        e: Zero-based index of the current training episode (used for logging).
        cur_record: Best number of solved puzzles seen so far.
        stochastic: Forwarded to ``agent.act``.
        num_puzzles: Number of evaluation puzzles (default 100, as before).

    Returns:
        ``(num_solved, cur_record)`` where ``cur_record`` is updated when a new
        best is reached; in that case the weights are also saved under
        ``models/``.
    """
    import os  # local import: keeps this block self-contained

    current_epsilon = agent.epsilon
    agent.epsilon = 0.0
    num_solved = 0

    try:
        for t in tqdm(range(num_puzzles)):
            sok = init_sok(t)

            state = sok.get_image('rgb_array')
            done = False
            while not done:
                action = agent.act(process_frame(state), stochastic)
                # Agent actions 0-3 map to environment actions 1-4 and
                # agent actions 4-7 map to environment actions 9-12.
                if action < 4:
                    action += 1
                else:
                    action += 5
                state, reward, done, info = sok.step(action)

            if sok.boxes_on_target == 2:
                num_solved += 1
    finally:
        # Never leave the agent stuck in greedy mode if evaluation fails.
        agent.epsilon = current_epsilon

    print("Episode %d Epsilon %.3f Learning Rate %.6f Solved: %d" % (
        e+1,
        agent.epsilon,
        # NOTE(review): _decayed_lr is a private optimizer method; it may be
        # missing in other TensorFlow releases - confirm before upgrading.
        agent.model.optimizer._decayed_lr(tensorflow.float32).numpy(),
        num_solved))

    if num_solved > cur_record:
        # The previous literal "models\Q3_05A_%d.h5" relied on an invalid "\Q"
        # escape and a Windows-only separator; build the path portably and
        # make sure the directory exists.
        os.makedirs("models", exist_ok=True)
        agent.save(os.path.join("models", "Q3_05A_%d.h5" % (num_solved)))
        cur_record = num_solved

    return num_solved, cur_record

In [13]:
# Training driver: one Sokoban puzzle per episode, periodic replay, and an
# evaluation run (test_agent) after every 100 episodes.
agent = SOK_Agent()

# Counters for the current window of (up to) 100 training episodes.
running_puzzles = 0
running_solved = 0
# Solved counts per 100-episode window (training) and per evaluation run (test).
solved_in_train = []
solved_in_test = []
test_record = 0

for e in range(max_episodes):
    # Training puzzles use seeds from 100 upwards; test_agent uses seeds 0-99.
    sok = init_sok(e+100)
    # Re-seed Python's random module after the environment has been created.
    random.seed(e)
    running_puzzles += 1
    
    state = process_frame(sok.get_image('rgb_array'))
    room_state = sok.room_state.copy() 
    maze_info = get_maze_info(room_state)
    
    for step in range(sok.max_steps):
        action = agent.act(state, stochastic=True)
        # Agent actions 0-3 map to environment actions 1-4, 4-7 map to 9-12.
        if action < 4:
            next_state, reward, done, _ = sok.step(action+1) 
        else:
            next_state, reward, done, _ = sok.step(action+5)         
        
        next_state = process_frame(next_state)        
        next_room_state = sok.room_state
        
        # Add the distance-based shaping term on non-terminal steps only.
        if not done:
            reward += box2target_change_reward(room_state, next_room_state, maze_info)
        
        agent.remember(state, action, reward, next_state, done)
        
        state = next_state.copy() 
        room_state = next_room_state.copy()                
        
        # Train every agent.replay_rate environment steps.
        if (step+1) % agent.replay_rate == 0:
            agent.replay()            
        
        if done: 
            if sok.boxes_on_target == 2:  
                # Solved: copy this episode's step+1 transitions to the prioritized buffer.
                agent.copy_to_prioritized_buffer(step+1)  
                running_solved += 1
                
            # Progress line "<solved> | <puzzles>" every 10 episodes.
            if (e+1) % 10 == 0 and e > 0:
                print(f"{running_solved} | {running_puzzles}") 

                # Close the 100-episode window and reset its counters.
                if (e+1) % 100 == 0:
                    solved_in_train.append(running_solved)
                    running_puzzles = 0
                    running_solved = 0
                    
            break
            
    # Evaluate after every 100 episodes; test_agent saves weights on a new record.
    if (e+1) % 100 == 0 and e > 0:
        solved_tests, test_record = test_agent(e, test_record, stochastic=False) 
        solved_in_test.append(solved_tests)

1 | 10
1 | 20
1 | 30
1 | 40
1 | 50
2 | 60
3 | 70
3 | 80
4 | 90
5 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

[SOKOBAN] Retry . . .

Episode 100 Epsilon 0.822 Learning Rate 0.000980 Solved: 0
2 | 10
2 | 20
3 | 30
4 | 40
5 | 50
6 | 60
7 | 70
7 | 80
8 | 90
[SOKOBAN] Retry . . .
8 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

[SOKOBAN] Retry . . .

Episode 200 Epsilon 0.679 Learning Rate 0.000962 Solved: 5
0 | 10
3 | 20
3 | 30
3 | 40
3 | 50
[SOKOBAN] Retry . . .
5 | 60
[SOKOBAN] Retry . . .
6 | 70
6 | 80
6 | 90
7 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

[SOKOBAN] Retry . . .

Episode 300 Epsilon 0.561 Learning Rate 0.000944 Solved: 4
1 | 10
2 | 20
3 | 30
3 | 40
5 | 50
7 | 60
7 | 70
9 | 80
11 | 90
12 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

[SOKOBAN] Retry . . .

Episode 400 Epsilon 0.467 Learning Rate 0.000926 Solved: 5
1 | 10
3 | 20
4 | 30
5 | 40
5 | 50
[SOKOBAN] Retry . . .
7 | 60
[SOKOBAN] Retry . . .
8 | 70
10 | 80
11 | 90
15 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

[SOKOBAN] Retry . . .

Episode 500 Epsilon 0.391 Learning Rate 0.000910 Solved: 13
3 | 10
[SOKOBAN] Retry . . .
4 | 20
5 | 30
7 | 40
[SOKOBAN] Retry . . .
7 | 50
7 | 60
10 | 70
[SOKOBAN] Retry . . .
10 | 80
12 | 90
13 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

[SOKOBAN] Retry . . .

Episode 600 Epsilon 0.327 Learning Rate 0.000894 Solved: 17
1 | 10
1 | 20
2 | 30
5 | 40
7 | 50
[SOKOBAN] Retry . . .
8 | 60
10 | 70
16 | 80
16 | 90
19 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

[SOKOBAN] Retry . . .

Episode 700 Epsilon 0.275 Learning Rate 0.000878 Solved: 10
2 | 10
[SOKOBAN] Retry . . .
3 | 20
4 | 30
5 | 40
7 | 50
10 | 60
15 | 70
16 | 80
[SOKOBAN] Retry . . .
17 | 90
18 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

[SOKOBAN] Retry . . .

Episode 800 Epsilon 0.232 Learning Rate 0.000863 Solved: 20
1 | 10
3 | 20
[SOKOBAN] Retry . . .
6 | 30
7 | 40
8 | 50
12 | 60
12 | 70
16 | 80
19 | 90
21 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

[SOKOBAN] Retry . . .

Episode 900 Epsilon 0.200 Learning Rate 0.000849 Solved: 15
1 | 10
[SOKOBAN] Retry . . .
2 | 20
4 | 30
4 | 40
4 | 50
5 | 60
9 | 70
10 | 80
13 | 90
13 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

[SOKOBAN] Retry . . .

Episode 1000 Epsilon 0.200 Learning Rate 0.000834 Solved: 12
2 | 10
[SOKOBAN] Retry . . .
[SOKOBAN] Retry . . .
[SOKOBAN] Retry . . .
7 | 20
10 | 30
13 | 40
16 | 50
17 | 60
18 | 70
22 | 80
23 | 90
24 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

[SOKOBAN] Retry . . .

Episode 1100 Epsilon 0.200 Learning Rate 0.000820 Solved: 6
1 | 10
3 | 20
4 | 30
6 | 40
7 | 50
9 | 60
10 | 70
12 | 80
13 | 90
14 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

[SOKOBAN] Retry . . .

Episode 1200 Epsilon 0.200 Learning Rate 0.000806 Solved: 11
0 | 10
6 | 20
8 | 30
9 | 40
[SOKOBAN] Retry . . .
9 | 50
11 | 60
13 | 70
15 | 80
17 | 90
[SOKOBAN] Retry . . .
21 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

[SOKOBAN] Retry . . .

Episode 1300 Epsilon 0.200 Learning Rate 0.000792 Solved: 17
2 | 10
3 | 20
[SOKOBAN] Retry . . .
6 | 30
[SOKOBAN] Retry . . .
9 | 40
13 | 50
13 | 60
15 | 70
17 | 80
18 | 90
19 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

[SOKOBAN] Retry . . .

Episode 1400 Epsilon 0.200 Learning Rate 0.000779 Solved: 17
4 | 10
6 | 20
9 | 30
[SOKOBAN] Retry . . .
11 | 40
13 | 50
15 | 60
16 | 70
17 | 80
[SOKOBAN] Retry . . .
18 | 90
21 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

[SOKOBAN] Retry . . .

Episode 1500 Epsilon 0.200 Learning Rate 0.000766 Solved: 11
2 | 10
3 | 20
4 | 30
[SOKOBAN] Retry . . .
4 | 40
7 | 50
[SOKOBAN] Retry . . .
10 | 60
14 | 70
15 | 80
16 | 90
17 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

[SOKOBAN] Retry . . .

Episode 1600 Epsilon 0.200 Learning Rate 0.000753 Solved: 17
5 | 10
6 | 20
9 | 30
12 | 40
[SOKOBAN] Retry . . .
15 | 50
16 | 60
18 | 70
19 | 80
22 | 90
[SOKOBAN] Retry . . .
25 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

[SOKOBAN] Retry . . .

Episode 1700 Epsilon 0.200 Learning Rate 0.000741 Solved: 17
[SOKOBAN] Retry . . .
2 | 10
3 | 20
4 | 30
6 | 40
8 | 50
[SOKOBAN] Retry . . .
10 | 60
11 | 70
[SOKOBAN] Retry . . .
13 | 80
[SOKOBAN] Retry . . .
16 | 90
18 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

[SOKOBAN] Retry . . .

Episode 1800 Epsilon 0.200 Learning Rate 0.000728 Solved: 18
2 | 10
[SOKOBAN] Retry . . .
5 | 20
[SOKOBAN] Retry . . .
8 | 30
[SOKOBAN] Retry . . .
13 | 40
[SOKOBAN] Retry . . .
14 | 50
16 | 60
17 | 70
[SOKOBAN] Retry . . .
21 | 80
22 | 90
24 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

[SOKOBAN] Retry . . .

Episode 1900 Epsilon 0.200 Learning Rate 0.000717 Solved: 21
2 | 10
4 | 20
[SOKOBAN] Retry . . .
5 | 30
[SOKOBAN] Retry . . .
7 | 40
9 | 50
11 | 60
12 | 70
[SOKOBAN] Retry . . .
14 | 80
[SOKOBAN] Retry . . .
15 | 90
16 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

[SOKOBAN] Retry . . .

Episode 2000 Epsilon 0.200 Learning Rate 0.000704 Solved: 13
4 | 10
[SOKOBAN] Retry . . .
5 | 20
[SOKOBAN] Retry . . .
6 | 30
8 | 40
12 | 50
15 | 60
18 | 70
20 | 80
21 | 90
21 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

[SOKOBAN] Retry . . .

Episode 2100 Epsilon 0.200 Learning Rate 0.000693 Solved: 18
0 | 10
2 | 20
[SOKOBAN] Retry . . .
5 | 30
10 | 40
11 | 50
13 | 60
14 | 70
17 | 80
20 | 90
21 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

[SOKOBAN] Retry . . .

Episode 2200 Epsilon 0.200 Learning Rate 0.000681 Solved: 18
2 | 10
5 | 20
6 | 30
[SOKOBAN] Retry . . .
6 | 40
8 | 50
12 | 60
15 | 70
19 | 80
21 | 90
23 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

[SOKOBAN] Retry . . .

Episode 2300 Epsilon 0.200 Learning Rate 0.000671 Solved: 11
1 | 10
1 | 20
3 | 30
6 | 40
6 | 50
[SOKOBAN] Retry . . .
6 | 60
[SOKOBAN] Retry . . .
7 | 70
10 | 80
12 | 90
15 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

[SOKOBAN] Retry . . .

Episode 2400 Epsilon 0.200 Learning Rate 0.000659 Solved: 15
1 | 10
3 | 20
5 | 30
8 | 40
9 | 50
12 | 60
12 | 70
14 | 80
17 | 90
[SOKOBAN] Retry . . .
21 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

[SOKOBAN] Retry . . .

Episode 2500 Epsilon 0.200 Learning Rate 0.000648 Solved: 18
1 | 10
1 | 20
[SOKOBAN] Retry . . .
4 | 30
5 | 40
7 | 50
8 | 60
10 | 70
12 | 80
16 | 90
17 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

[SOKOBAN] Retry . . .

Episode 2600 Epsilon 0.200 Learning Rate 0.000637 Solved: 17
2 | 10
4 | 20
7 | 30
8 | 40
9 | 50
12 | 60
13 | 70
15 | 80
17 | 90
20 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

[SOKOBAN] Retry . . .

Episode 2700 Epsilon 0.200 Learning Rate 0.000626 Solved: 19
1 | 10
2 | 20
6 | 30
9 | 40
10 | 50
[SOKOBAN] Retry . . .
12 | 60
17 | 70
19 | 80
20 | 90
[SOKOBAN] Retry . . .
20 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

[SOKOBAN] Retry . . .

Episode 2800 Epsilon 0.200 Learning Rate 0.000616 Solved: 17
2 | 10
6 | 20
8 | 30
[SOKOBAN] Retry . . .
9 | 40
10 | 50
13 | 60
15 | 70
18 | 80
21 | 90
24 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

[SOKOBAN] Retry . . .

Episode 2900 Epsilon 0.200 Learning Rate 0.000606 Solved: 17
0 | 10
2 | 20
5 | 30
7 | 40
9 | 50
10 | 60
14 | 70
17 | 80
18 | 90
19 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

[SOKOBAN] Retry . . .

Episode 3000 Epsilon 0.200 Learning Rate 0.000596 Solved: 17
5 | 10
7 | 20
11 | 30
[SOKOBAN] Retry . . .
13 | 40
17 | 50
21 | 60
23 | 70
26 | 80
[SOKOBAN] Retry . . .
29 | 90
33 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

[SOKOBAN] Retry . . .

Episode 3100 Epsilon 0.200 Learning Rate 0.000587 Solved: 21
2 | 10
4 | 20
6 | 30
9 | 40
11 | 50
13 | 60
16 | 70
19 | 80
22 | 90
[SOKOBAN] Retry . . .
24 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

[SOKOBAN] Retry . . .

Episode 3200 Epsilon 0.200 Learning Rate 0.000578 Solved: 15
1 | 10
2 | 20
4 | 30
7 | 40
[SOKOBAN] Retry . . .
9 | 50
11 | 60
13 | 70
15 | 80
17 | 90
[SOKOBAN] Retry . . .
18 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

[SOKOBAN] Retry . . .

Episode 3300 Epsilon 0.200 Learning Rate 0.000568 Solved: 15
5 | 10
6 | 20
8 | 30
10 | 40
12 | 50
14 | 60
17 | 70
18 | 80
20 | 90
20 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

[SOKOBAN] Retry . . .

Episode 3400 Epsilon 0.200 Learning Rate 0.000559 Solved: 14
4 | 10
8 | 20
11 | 30
12 | 40
14 | 50
[SOKOBAN] Retry . . .
15 | 60
18 | 70
23 | 80
27 | 90
27 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

[SOKOBAN] Retry . . .

Episode 3500 Epsilon 0.200 Learning Rate 0.000550 Solved: 18
2 | 10
4 | 20


KeyboardInterrupt: 