# Final Project - Reinforcements Learning 
Hello dear students,<br> this is the template notebook. Please click on the "File" tab and then on "Save a copy into drive".

---
<br>

### Name and ID:
Student 1: Avraham Raviv, 204355390
<br>
Student 2: Yevgeni Berkovitch, 317079234
<br><br>
<img src="https://play-lh.googleusercontent.com/e_oKlKPISbgdzut1H9opevS7-LTB8-8lsmpCdMkhlnqFenZhpjxbLmx7l158-xQQCIY">

### https://github.com/mpSchrader/gym-sokoban

# Installs

In [1]:
%%capture
!sudo apt-get update
!sudo apt-get install -y xvfb ffmpeg freeglut3-dev
!pip install 'imageio==2.4.0'
!pip install gym
!pip install pygame
!apt-get install python-opengl -y
!apt install xvfb -y
!pip install pyvirtualdisplay
!pip install piglet
!pip install gym
!apt-get install python-opengl -y
!apt install xvfb -y
!pip install gym_sokoban

!imageio_download_bin ffmpeg

# Imports

In [4]:
import random
import time

import numpy as np
import matplotlib.pyplot as plt

import base64
import imageio
from pyvirtualdisplay import Display
from IPython.display import HTML

import gym
from gym import error, spaces, utils
from soko_pap import *

from collections import deque
from queue import PriorityQueue

from keras.models import Sequential
from keras.layers import Dense

In [5]:
%matplotlib inline

In [6]:
imageio.plugins.ffmpeg.download()

In [7]:
from gym import logger as gymlogger
gymlogger.set_level(40) # error only

# Display utils
The cell below contains the video display configuration. No need to make changes here.

In [8]:
def embed_mp4(filename):
    """Embeds an mp4 file in the notebook."""
    video = open(filename,'rb').read()
    b64 = base64.b64encode(video)
    tag = '''
    <video width="640" height="480" controls>
    <source src="data:video/mp4;base64,{0}" type="video/mp4">
    Your browser does not support the video tag.
    </video>'''.format(b64.decode())

    return HTML(tag)

# Utils

In [9]:
def get_distances(room_state):
    for i in range(room_state.shape[0]):
        for j in range(room_state.shape[1]):
            if room_state[i][j] == 2:
                target = (i, j)

    distances = np.zeros(shape=room_state.shape)
    visited_cells = set()
    cell_queue = deque()

    visited_cells.add(target)
    cell_queue.appendleft(target)

    while len(cell_queue) != 0:
        cell = cell_queue.pop()
        distance = distances[cell[0]][cell[1]]
        for x,y in ((1,0), (-1,-0), (0,1), (0,-1)):
            next_cell_x, next_cell_y = cell[0]+x, cell[1]+y
            if room_state[next_cell_x][next_cell_y] != 0 and not (next_cell_x, next_cell_y) in visited_cells:
                distances[next_cell_x][next_cell_y] = distance + 1
                visited_cells.add((next_cell_x, next_cell_y))
                cell_queue.appendleft((next_cell_x, next_cell_y))
                
    return distances   

def calc_distances(room_state, distances):
    box = None
    mover = None
    for i in range(room_state.shape[0]):
        for j in range(room_state.shape[1]):            
            if room_state[i][j] == 4:
                box = (i,j)
            
            if room_state[i][j] == 5:
                mover = (i,j)
    
    return mover, box, distances[box[0]][box[1]]   

def box2target_change_reward(room_state, next_room_state, distances):
    if np.array_equal(room_state, next_room_state):
        return -1.0
    
    mover, box, t2b = calc_distances(room_state, distances)
    n_mover, n_box, n_t2b = calc_distances(next_room_state, distances)
    
    change_reward = 0.0
    if n_t2b < t2b:
        change_reward += 5.0
    elif n_t2b > t2b:
        change_reward -= 5.0
        
    m2b = np.sqrt((mover[0]-box[0])**2 + (mover[1]-box[1])**2)
    n_m2b = np.sqrt((n_mover[0]-n_box[0])**2 + (n_mover[1]-n_box[1])**2)
    
    if n_m2b < m2b and m2b >= 2:
        change_reward += 1.0
    elif n_m2b > m2b and n_m2b >= 2:
        change_reward -= 1.0
        
    return change_reward   

# Solution

In [10]:
class SOK_Agent:
    def __init__(self):
        # Construct DQN models
        self.state_size = (25,) 
        self.action_size = 8
        self.model = self._build_model()
        self.target_model = self._build_model()
        self.target_model.set_weights(self.model.get_weights())
        self.batch_size = 8
        
        # Replay buffers
        self.replay_buffer = deque(maxlen=5000)
        self.prioritized_replay_buffer = deque(maxlen=500)
        self.prioritized_replay_batch = 50        
        
        # Hyperparameters
        self.gamma = 0.9
        self.epsilon = 1.0   
        self.epsilon_min = 0.3
        self.epsilon_decay = 0.995
        self.replay_rate = 10
        self.update_beta = 0.999

        self.verbosity = 100 

    def _build_model(self):
        model = Sequential()
        model.add(Dense(512, input_shape=self.state_size, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer="adam")        
        return model

    def remember(self, state, action, reward, next_state, done):
        self.replay_buffer.append([state, action, reward, next_state, done])    
        
    def copy_to_prioritized_buffer(self, n):
        for i in range(n):
            self.prioritized_replay_buffer.append(self.replay_buffer[-1-i])  

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        
        act_values = self.model.predict(state, verbose=0)
        return np.argmax(act_values[0]) 

    def replay(self):        
        minibatch = random.sample(self.replay_buffer, self.batch_size//2) 
        minibatch.extend(random.sample(self.prioritized_replay_buffer, self.batch_size//2))
        
        states = np.zeros((self.batch_size, self.state_size[0]))
        actions = np.zeros(self.batch_size, dtype=int)
        rewards = np.zeros(self.batch_size)
        next_states = np.zeros((self.batch_size, self.state_size[0]))
        statuses = np.zeros(self.batch_size)
        targets = np.zeros((self.batch_size, self.action_size)) 
        
        for i, (state, action, reward, next_state, done) in enumerate(minibatch): 
            states[i] = state.copy()
            actions[i] = action
            rewards[i] = reward
            next_states[i] = next_state.copy()
            statuses[i] = 1 if done else 0    
        
        targets = self.model.predict(states) 
        max_actions = np.argmax(self.model.predict(next_states), axis=1)
        next_rewards = self.target_model.predict(next_states)
        
        ind = 0
        for action, reward, next_reward, max_action, done in zip(actions, rewards, next_rewards, max_actions, statuses):  
            if not done:
                reward += self.gamma * next_reward[max_action]
            targets[ind][action] = reward
            ind += 1
        
        self.model.fit(states, targets, epochs=10, verbose=0) 
        
    def update_epsilon(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon = self.epsilon * self.epsilon_decay
        
    def update_target_model(self):
        model_w = self.model.get_weights()
        target_model_w = self.target_model.get_weights()
        updated_target_model_w = []
        for i in range(len(model_w)):
            updated_target_model_w.append(self.update_beta*target_model_w[i] + (1-self.update_beta)*model_w[i])
        self.target_model.set_weights(updated_target_model_w)    
            
    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)

In [11]:
def process_frame(frame):
    f = frame[16:96, 16:96, 0]   
    f = f.reshape(5, 16, 5, 16).max(axis=(1, 3))
    f = f.flatten()
    f = f / 255
    return np.expand_dims(f, axis=0)

## Training

In [12]:
max_episodes = 10000
max_steps = 500

def init_sok(r):
    random.seed(r%10)
    sok = PushAndPullSokobanEnv(dim_room=(7, 7), num_boxes=1)
    sok.set_maxsteps(max_steps)
    return sok

In [11]:
agent = SOK_Agent()
successes_before_train = 20
successful_episodes = 0
continuous_successes_goal = 100
continuous_successes = 0

steps_per_episode = []

for e in range(max_episodes):
    if continuous_successes >= continuous_successes_goal:
        print("Agent training finished!")
        break
    
    print("Episode: %d" % (e))
    
    sok = init_sok(e)
    state = process_frame(sok.get_image('rgb_array'))
    random.seed(e)
    
    room_state = sok.room_state.copy() 
    distances = get_distances(room_state)
    
    for step in range(sok.max_steps):
        action = agent.act(state)
        if action < 4:
            next_state, reward, done, _ = sok.step(action+1) 
        else:
            next_state, reward, done, _ = sok.step(action+5)         
        
        next_state = process_frame(next_state)        
        next_room_state = sok.room_state
        
        if not done:
            reward += box2target_change_reward(room_state, next_room_state, distances)
        
        agent.remember(state, action, reward, next_state, done)
        
        state = next_state.copy() 
        room_state = next_room_state.copy()
                
        if successful_episodes >= successes_before_train:
            if (step+1) % agent.replay_rate == 0:
                agent.replay() 
                agent.update_target_model()
                agent.update_epsilon()
        
        if done:            
            if 3 in sok.room_state:
                successful_episodes += 1
                continuous_successes += 1
                print("SOLVED! Episode %d Steps: %d Epsilon %.4f" % (e, step+1, agent.epsilon)) 
                agent.copy_to_prioritized_buffer(min(agent.prioritized_replay_batch, step+1))
            else:
                continuous_successes = 0
                
            steps_per_episode.append(step+1)
            #agent.save("exp1_episode%d.h5" % (e))
            
            break

Episode: 0
Episode: 1
SOLVED! Episode 1 Steps: 172 Epsilon 1.0000
Episode: 2
Episode: 3
SOLVED! Episode 3 Steps: 54 Epsilon 1.0000
Episode: 4
SOLVED! Episode 4 Steps: 1 Epsilon 1.0000
Episode: 5
SOLVED! Episode 5 Steps: 28 Epsilon 1.0000
Episode: 6
Episode: 7
SOLVED! Episode 7 Steps: 25 Epsilon 1.0000
Episode: 8
SOLVED! Episode 8 Steps: 352 Epsilon 1.0000
Episode: 9
SOLVED! Episode 9 Steps: 74 Epsilon 1.0000
Episode: 10
Episode: 11
SOLVED! Episode 11 Steps: 53 Epsilon 1.0000
Episode: 12
Episode: 13
Episode: 14
SOLVED! Episode 14 Steps: 2 Epsilon 1.0000
Episode: 15
Episode: 16
SOLVED! Episode 16 Steps: 138 Epsilon 1.0000
Episode: 17
SOLVED! Episode 17 Steps: 449 Epsilon 1.0000
Episode: 18
SOLVED! Episode 18 Steps: 6 Epsilon 1.0000
Episode: 19
Episode: 20
Episode: 21
SOLVED! Episode 21 Steps: 134 Epsilon 1.0000
Episode: 22
SOLVED! Episode 22 Steps: 438 Epsilon 1.0000
Episode: 23
SOLVED! Episode 23 Steps: 2 Epsilon 1.0000
Episode: 24
SOLVED! Episode 24 Steps: 3 Epsilon 1.0000
Episode: 25


SOLVED! Episode 188 Steps: 2 Epsilon 0.2988
Episode: 189
SOLVED! Episode 189 Steps: 350 Epsilon 0.2988
Episode: 190
SOLVED! Episode 190 Steps: 3 Epsilon 0.2988
Episode: 191
SOLVED! Episode 191 Steps: 3 Epsilon 0.2988
Episode: 192
Episode: 193
SOLVED! Episode 193 Steps: 1 Epsilon 0.2988
Episode: 194
SOLVED! Episode 194 Steps: 1 Epsilon 0.2988
Episode: 195
SOLVED! Episode 195 Steps: 7 Epsilon 0.2988
Episode: 196
SOLVED! Episode 196 Steps: 6 Epsilon 0.2988
Episode: 197
SOLVED! Episode 197 Steps: 3 Epsilon 0.2988
Episode: 198
SOLVED! Episode 198 Steps: 2 Epsilon 0.2988
Episode: 199
Episode: 200
SOLVED! Episode 200 Steps: 3 Epsilon 0.2988
Episode: 201
SOLVED! Episode 201 Steps: 3 Epsilon 0.2988
Episode: 202
Episode: 203
SOLVED! Episode 203 Steps: 1 Epsilon 0.2988
Episode: 204
SOLVED! Episode 204 Steps: 1 Epsilon 0.2988
Episode: 205
SOLVED! Episode 205 Steps: 7 Epsilon 0.2988
Episode: 206
SOLVED! Episode 206 Steps: 6 Epsilon 0.2988
Episode: 207
SOLVED! Episode 207 Steps: 3 Epsilon 0.2988
Epi

SOLVED! Episode 358 Steps: 2 Epsilon 0.2988
Episode: 359
Episode: 360
SOLVED! Episode 360 Steps: 3 Epsilon 0.2988
Episode: 361
SOLVED! Episode 361 Steps: 3 Epsilon 0.2988
Episode: 362
Episode: 363
SOLVED! Episode 363 Steps: 1 Epsilon 0.2988
Episode: 364
SOLVED! Episode 364 Steps: 1 Epsilon 0.2988
Episode: 365
SOLVED! Episode 365 Steps: 7 Epsilon 0.2988
Episode: 366
SOLVED! Episode 366 Steps: 6 Epsilon 0.2988
Episode: 367
SOLVED! Episode 367 Steps: 3 Epsilon 0.2988
Episode: 368
SOLVED! Episode 368 Steps: 2 Epsilon 0.2988
Episode: 369
Episode: 370
SOLVED! Episode 370 Steps: 3 Epsilon 0.2988
Episode: 371
SOLVED! Episode 371 Steps: 3 Epsilon 0.2988
Episode: 372
Episode: 373
SOLVED! Episode 373 Steps: 1 Epsilon 0.2988
Episode: 374
SOLVED! Episode 374 Steps: 1 Epsilon 0.2988
Episode: 375
SOLVED! Episode 375 Steps: 7 Epsilon 0.2988
Episode: 376
SOLVED! Episode 376 Steps: 6 Epsilon 0.2988
Episode: 377
SOLVED! Episode 377 Steps: 3 Epsilon 0.2988
Episode: 378
SOLVED! Episode 378 Steps: 2 Epsilon

SOLVED! Episode 525 Steps: 7 Epsilon 0.2988
Episode: 526
SOLVED! Episode 526 Steps: 6 Epsilon 0.2988
Episode: 527
SOLVED! Episode 527 Steps: 3 Epsilon 0.2988
Episode: 528
SOLVED! Episode 528 Steps: 2 Epsilon 0.2988
Episode: 529
Episode: 530
SOLVED! Episode 530 Steps: 3 Epsilon 0.2988
Episode: 531
SOLVED! Episode 531 Steps: 3 Epsilon 0.2988
Episode: 532
Episode: 533
SOLVED! Episode 533 Steps: 1 Epsilon 0.2988
Episode: 534
SOLVED! Episode 534 Steps: 1 Epsilon 0.2988
Episode: 535
SOLVED! Episode 535 Steps: 7 Epsilon 0.2988
Episode: 536
SOLVED! Episode 536 Steps: 6 Epsilon 0.2988
Episode: 537
SOLVED! Episode 537 Steps: 3 Epsilon 0.2988
Episode: 538
SOLVED! Episode 538 Steps: 2 Epsilon 0.2988
Episode: 539
SOLVED! Episode 539 Steps: 7 Epsilon 0.2988
Episode: 540
SOLVED! Episode 540 Steps: 3 Epsilon 0.2988
Episode: 541
SOLVED! Episode 541 Steps: 3 Epsilon 0.2988
Episode: 542
SOLVED! Episode 542 Steps: 395 Epsilon 0.2988
Episode: 543
SOLVED! Episode 543 Steps: 1 Epsilon 0.2988
Episode: 544
SOL

SOLVED! Episode 688 Steps: 2 Epsilon 0.2988
Episode: 689
SOLVED! Episode 689 Steps: 9 Epsilon 0.2988
Episode: 690
SOLVED! Episode 690 Steps: 3 Epsilon 0.2988
Episode: 691
SOLVED! Episode 691 Steps: 3 Epsilon 0.2988
Episode: 692
Episode: 693
SOLVED! Episode 693 Steps: 1 Epsilon 0.2988
Episode: 694
SOLVED! Episode 694 Steps: 1 Epsilon 0.2988
Episode: 695
SOLVED! Episode 695 Steps: 7 Epsilon 0.2988
Episode: 696
SOLVED! Episode 696 Steps: 6 Epsilon 0.2988
Episode: 697
SOLVED! Episode 697 Steps: 3 Epsilon 0.2988
Episode: 698
SOLVED! Episode 698 Steps: 2 Epsilon 0.2988
Episode: 699
Episode: 700
SOLVED! Episode 700 Steps: 3 Epsilon 0.2988
Episode: 701
SOLVED! Episode 701 Steps: 3 Epsilon 0.2988
Episode: 702
Episode: 703
SOLVED! Episode 703 Steps: 1 Epsilon 0.2988
Episode: 704
SOLVED! Episode 704 Steps: 1 Epsilon 0.2988
Episode: 705
SOLVED! Episode 705 Steps: 205 Epsilon 0.2988
Episode: 706
SOLVED! Episode 706 Steps: 6 Epsilon 0.2988
Episode: 707
SOLVED! Episode 707 Steps: 3 Epsilon 0.2988
Epi

SOLVED! Episode 848 Steps: 2 Epsilon 0.2988
Episode: 849
Episode: 850
SOLVED! Episode 850 Steps: 3 Epsilon 0.2988
Episode: 851
SOLVED! Episode 851 Steps: 3 Epsilon 0.2988
Episode: 852
Episode: 853
SOLVED! Episode 853 Steps: 1 Epsilon 0.2988
Episode: 854
SOLVED! Episode 854 Steps: 1 Epsilon 0.2988
Episode: 855
SOLVED! Episode 855 Steps: 7 Epsilon 0.2988
Episode: 856
SOLVED! Episode 856 Steps: 6 Epsilon 0.2988
Episode: 857
SOLVED! Episode 857 Steps: 3 Epsilon 0.2988
Episode: 858
SOLVED! Episode 858 Steps: 2 Epsilon 0.2988
Episode: 859
Episode: 860
SOLVED! Episode 860 Steps: 3 Epsilon 0.2988
Episode: 861
SOLVED! Episode 861 Steps: 3 Epsilon 0.2988
Episode: 862
SOLVED! Episode 862 Steps: 155 Epsilon 0.2988
Episode: 863
SOLVED! Episode 863 Steps: 1 Epsilon 0.2988
Episode: 864
SOLVED! Episode 864 Steps: 1 Epsilon 0.2988
Episode: 865
SOLVED! Episode 865 Steps: 7 Epsilon 0.2988
Episode: 866
SOLVED! Episode 866 Steps: 6 Epsilon 0.2988
Episode: 867
SOLVED! Episode 867 Steps: 13 Epsilon 0.2988
Ep

SOLVED! Episode 1002 Steps: 37 Epsilon 0.2988
Episode: 1003
SOLVED! Episode 1003 Steps: 1 Epsilon 0.2988
Episode: 1004
SOLVED! Episode 1004 Steps: 1 Epsilon 0.2988
Episode: 1005
SOLVED! Episode 1005 Steps: 7 Epsilon 0.2988
Episode: 1006
SOLVED! Episode 1006 Steps: 6 Epsilon 0.2988
Episode: 1007
SOLVED! Episode 1007 Steps: 3 Epsilon 0.2988
Episode: 1008
SOLVED! Episode 1008 Steps: 2 Epsilon 0.2988
Episode: 1009
SOLVED! Episode 1009 Steps: 378 Epsilon 0.2988
Episode: 1010
SOLVED! Episode 1010 Steps: 3 Epsilon 0.2988
Episode: 1011
SOLVED! Episode 1011 Steps: 3 Epsilon 0.2988
Episode: 1012
SOLVED! Episode 1012 Steps: 380 Epsilon 0.2988
Episode: 1013
SOLVED! Episode 1013 Steps: 1 Epsilon 0.2988
Episode: 1014
SOLVED! Episode 1014 Steps: 1 Epsilon 0.2988
Episode: 1015
SOLVED! Episode 1015 Steps: 7 Epsilon 0.2988
Episode: 1016
SOLVED! Episode 1016 Steps: 6 Epsilon 0.2988
Episode: 1017
SOLVED! Episode 1017 Steps: 11 Epsilon 0.2988
Episode: 1018
SOLVED! Episode 1018 Steps: 2 Epsilon 0.2988
Episo

SOLVED! Episode 1146 Steps: 6 Epsilon 0.2988
Episode: 1147
SOLVED! Episode 1147 Steps: 3 Epsilon 0.2988
Episode: 1148
SOLVED! Episode 1148 Steps: 2 Epsilon 0.2988
Episode: 1149
SOLVED! Episode 1149 Steps: 5 Epsilon 0.2988
Episode: 1150
SOLVED! Episode 1150 Steps: 3 Epsilon 0.2988
Episode: 1151
SOLVED! Episode 1151 Steps: 3 Epsilon 0.2988
Episode: 1152
SOLVED! Episode 1152 Steps: 92 Epsilon 0.2988
Episode: 1153
SOLVED! Episode 1153 Steps: 1 Epsilon 0.2988
Episode: 1154
SOLVED! Episode 1154 Steps: 1 Epsilon 0.2988
Episode: 1155
SOLVED! Episode 1155 Steps: 7 Epsilon 0.2988
Episode: 1156
SOLVED! Episode 1156 Steps: 6 Epsilon 0.2988
Episode: 1157
SOLVED! Episode 1157 Steps: 3 Epsilon 0.2988
Episode: 1158
SOLVED! Episode 1158 Steps: 2 Epsilon 0.2988
Episode: 1159
SOLVED! Episode 1159 Steps: 5 Epsilon 0.2988
Episode: 1160
SOLVED! Episode 1160 Steps: 3 Epsilon 0.2988
Episode: 1161
SOLVED! Episode 1161 Steps: 3 Epsilon 0.2988
Episode: 1162
SOLVED! Episode 1162 Steps: 21 Epsilon 0.2988
Episode: 

SOLVED! Episode 1290 Steps: 3 Epsilon 0.2988
Episode: 1291
SOLVED! Episode 1291 Steps: 3 Epsilon 0.2988
Episode: 1292
SOLVED! Episode 1292 Steps: 110 Epsilon 0.2988
Episode: 1293
SOLVED! Episode 1293 Steps: 1 Epsilon 0.2988
Episode: 1294
SOLVED! Episode 1294 Steps: 1 Epsilon 0.2988
Episode: 1295
SOLVED! Episode 1295 Steps: 7 Epsilon 0.2988
Episode: 1296
SOLVED! Episode 1296 Steps: 6 Epsilon 0.2988
Episode: 1297
SOLVED! Episode 1297 Steps: 3 Epsilon 0.2988
Episode: 1298
SOLVED! Episode 1298 Steps: 2 Epsilon 0.2988
Episode: 1299
SOLVED! Episode 1299 Steps: 5 Epsilon 0.2988
Episode: 1300
SOLVED! Episode 1300 Steps: 3 Epsilon 0.2988
Episode: 1301
SOLVED! Episode 1301 Steps: 3 Epsilon 0.2988
Episode: 1302
SOLVED! Episode 1302 Steps: 11 Epsilon 0.2988
Episode: 1303
SOLVED! Episode 1303 Steps: 1 Epsilon 0.2988
Episode: 1304
SOLVED! Episode 1304 Steps: 1 Epsilon 0.2988
Episode: 1305
SOLVED! Episode 1305 Steps: 7 Epsilon 0.2988
Episode: 1306
SOLVED! Episode 1306 Steps: 6 Epsilon 0.2988
Episode:

## Test Generalization

#### Learned Policy

In [12]:
agent.epsilon = 0.0
num_solved = 0

for t in range(100):    
    random.seed(t+10)
    sok = PushAndPullSokobanEnv(dim_room=(7, 7), num_boxes=1)
    sok.set_maxsteps(100)
    
    steps = 0
        
    state = sok.get_image('rgb_array')
    done = False
    while not done:
        steps += 1
        action = agent.act(process_frame(state))

        if action < 4:
            action += 1
        else:
            action += 5

        state, reward, done, info = sok.step(action)
        
    solved = False
    if 3 in sok.room_state:
        solved = True
        num_solved += 1
    
    print("Puzzle # %d: %d (%s) [%d/%d]" % ((t+1), steps, "YES" if solved else "NOT", num_solved, (t+1)))

Puzzle # 1: 100 (NOT) [0/1]
Puzzle # 2: 100 (NOT) [0/2]
Puzzle # 3: 100 (NOT) [0/3]
Puzzle # 4: 100 (NOT) [0/4]
Puzzle # 5: 100 (NOT) [0/5]
Puzzle # 6: 100 (NOT) [0/6]
Puzzle # 7: 100 (NOT) [0/7]
Puzzle # 8: 100 (NOT) [0/8]
Puzzle # 9: 100 (NOT) [0/9]
Puzzle # 10: 100 (NOT) [0/10]
Puzzle # 11: 100 (NOT) [0/11]
Puzzle # 12: 100 (NOT) [0/12]
Puzzle # 13: 100 (NOT) [0/13]
Puzzle # 14: 100 (NOT) [0/14]
Puzzle # 15: 100 (NOT) [0/15]
Puzzle # 16: 100 (NOT) [0/16]
Puzzle # 17: 100 (NOT) [0/17]
Puzzle # 18: 100 (NOT) [0/18]
Puzzle # 19: 100 (NOT) [0/19]
Puzzle # 20: 100 (NOT) [0/20]
Puzzle # 21: 100 (NOT) [0/21]
Puzzle # 22: 100 (NOT) [0/22]
Puzzle # 23: 100 (NOT) [0/23]
Puzzle # 24: 100 (NOT) [0/24]
Puzzle # 25: 100 (NOT) [0/25]
Puzzle # 26: 2 (YES) [1/26]
Puzzle # 27: 100 (NOT) [1/27]
Puzzle # 28: 1 (YES) [2/28]
Puzzle # 29: 100 (NOT) [2/29]
Puzzle # 30: 100 (NOT) [2/30]
Puzzle # 31: 100 (NOT) [2/31]
Puzzle # 32: 100 (NOT) [2/32]
Puzzle # 33: 100 (NOT) [2/33]
Puzzle # 34: 100 (NOT) [2/34]
Pu

#### Random Policy

In [13]:
agent.epsilon = 1.0
num_solved = 0

for t in range(100):    
    random.seed(t+10)
    sok = PushAndPullSokobanEnv(dim_room=(7, 7), num_boxes=1)
    sok.set_maxsteps(100)
    
    steps = 0
        
    state = sok.get_image('rgb_array')
    done = False
    while not done:
        steps += 1
        action = agent.act(process_frame(state))

        if action < 4:
            action += 1
        else:
            action += 5

        state, reward, done, info = sok.step(action)
        
    solved = False
    if 3 in sok.room_state:
        solved = True
        num_solved += 1
    
    print("Puzzle # %d: %d (%s) [%d/%d]" % ((t+1), steps, "YES" if solved else "NOT", num_solved, (t+1)))

Puzzle # 1: 100 (NOT) [0/1]
Puzzle # 2: 100 (NOT) [0/2]
Puzzle # 3: 17 (YES) [1/3]
Puzzle # 4: 100 (NOT) [1/4]
Puzzle # 5: 100 (NOT) [1/5]
Puzzle # 6: 100 (NOT) [1/6]
Puzzle # 7: 16 (YES) [2/7]
Puzzle # 8: 1 (YES) [3/8]
Puzzle # 9: 100 (NOT) [3/9]
Puzzle # 10: 100 (NOT) [3/10]
Puzzle # 11: 100 (NOT) [3/11]
Puzzle # 12: 12 (YES) [4/12]
Puzzle # 13: 100 (NOT) [4/13]
Puzzle # 14: 100 (NOT) [4/14]
Puzzle # 15: 100 (NOT) [4/15]
Puzzle # 16: 100 (NOT) [4/16]
Puzzle # 17: 43 (YES) [5/17]
Puzzle # 18: 100 (NOT) [5/18]
Puzzle # 19: 100 (NOT) [5/19]
Puzzle # 20: 17 (YES) [6/20]
Puzzle # 21: 100 (NOT) [6/21]
Puzzle # 22: 100 (NOT) [6/22]
Puzzle # 23: 100 (NOT) [6/23]
Puzzle # 24: 100 (NOT) [6/24]
Puzzle # 25: 100 (NOT) [6/25]
Puzzle # 26: 100 (NOT) [6/26]
Puzzle # 27: 100 (NOT) [6/27]
Puzzle # 28: 100 (NOT) [6/28]
Puzzle # 29: 100 (NOT) [6/29]
Puzzle # 30: 3 (YES) [7/30]
Puzzle # 31: 100 (NOT) [7/31]
Puzzle # 32: 100 (NOT) [7/32]
Puzzle # 33: 100 (NOT) [7/33]
Puzzle # 34: 56 (YES) [8/34]
Puzzle #