# Final Project - Reinforcement Learning
Hello dear students,<br> this is the template notebook. Please click on the "File" tab and then on "Save a copy into drive".

---
<br>

### Name and ID:
Student 1: Avraham Raviv, 204355390
<br>
Student 2: Yevgeni Berkovitch, 317079234
<br><br>
<img src="https://play-lh.googleusercontent.com/e_oKlKPISbgdzut1H9opevS7-LTB8-8lsmpCdMkhlnqFenZhpjxbLmx7l158-xQQCIY">

### https://github.com/mpSchrader/gym-sokoban

# Installs

In [1]:
%%capture
!sudo apt-get update
!sudo apt-get install -y xvfb ffmpeg freeglut3-dev
!pip install 'imageio==2.4.0'
!pip install gym
!pip install pygame
!apt-get install python-opengl -y
!apt install xvfb -y
!pip install pyvirtualdisplay
!pip install piglet
!pip install gym
!apt-get install python-opengl -y
!apt install xvfb -y
!pip install gym_sokoban

!imageio_download_bin ffmpeg

# Imports

In [2]:
import random
import time

import numpy as np
import scipy as scp
import matplotlib.pyplot as plt

import base64
import imageio
from pyvirtualdisplay import Display
from IPython.display import HTML

import gym
from gym import error, spaces, utils
from soko_pap import *

from collections import deque
from queue import PriorityQueue

from keras.models import Sequential
from keras.layers import Dense, Conv2D, Flatten

from tqdm.notebook import tqdm
from collections import defaultdict

In [3]:
%matplotlib inline

In [4]:
# Ask imageio's ffmpeg plugin to download its ffmpeg binary.
# NOTE(review): presumably this only works with the imageio==2.4.0 pin from the
# install cell; newer imageio releases are believed to have dropped this helper - confirm.
imageio.plugins.ffmpeg.download()

In [5]:
# Quieten gym's logger so that warnings do not flood the training output.
from gym import logger as gymlogger
gymlogger.set_level(40) # 40 is presumably gym's ERROR level: errors only

# Display utils
The cell below contains the video display configuration. No need to make changes here.

In [6]:
def embed_mp4(filename):
    """Embed an mp4 file in the notebook as an inline HTML5 video player.

    Args:
        filename: Path of the mp4 file to read.

    Returns:
        An ``HTML`` display object wrapping a ``<video>`` tag whose source is
        the file content encoded as a base64 ``data:`` URI.
    """
    # Read through a context manager so the file handle is closed right away
    # instead of lingering until garbage collection.
    with open(filename, 'rb') as video_file:
        video = video_file.read()
    b64 = base64.b64encode(video)
    tag = '''
    <video width="640" height="480" controls>
    <source src="data:video/mp4;base64,{0}" type="video/mp4">
    Your browser does not support the video tag.
    </video>'''.format(b64.decode())

    return HTML(tag)

# Utils

In [7]:
def get_distances(room_state):
    """Breadth-first walking distance from the target cell to every cell.

    Args:
        room_state: 2-D numpy array of cell codes. Cells equal to 0 are
            impassable and the cell equal to 2 is the target (the BFS source).
            If several cells hold 2, the last one in row-major order is used.

    Returns:
        Float array with the shape of ``room_state``. Every non-zero cell that
        is reachable from the target holds its number of 4-connected steps to
        the target. The target itself, impassable cells and unreachable cells
        all hold 0.

    Raises:
        ValueError: If ``room_state`` contains no cell equal to 2.
    """
    target_cells = np.argwhere(room_state == 2)
    if len(target_cells) == 0:
        raise ValueError("room_state contains no target cell (value 2)")
    target = tuple(int(v) for v in target_cells[-1])

    n_rows, n_cols = room_state.shape
    distances = np.zeros(shape=room_state.shape)
    visited_cells = {target}
    cell_queue = deque([target])

    while cell_queue:
        row, col = cell_queue.popleft()
        for d_row, d_col in ((1, 0), (-1, 0), (0, 1), (0, -1)):
            next_cell = (row + d_row, col + d_col)
            # Stay inside the grid: without this check a negative index would
            # silently wrap to the opposite edge and an overshoot would raise.
            if not (0 <= next_cell[0] < n_rows and 0 <= next_cell[1] < n_cols):
                continue
            if room_state[next_cell] == 0 or next_cell in visited_cells:
                continue
            distances[next_cell] = distances[row, col] + 1
            visited_cells.add(next_cell)
            cell_queue.append(next_cell)

    return distances

def calc_distances(room_state, distances):
    """Locate the mover and the box and look up the box's distance value.

    Args:
        room_state: 2-D numpy array of cell codes; 4 marks the box and 5 the
            mover. When a code occurs more than once, the last occurrence in
            row-major order wins.
        distances: 2-D array indexed like ``room_state`` (for example the
            output of ``get_distances``).

    Returns:
        Tuple ``(mover, box, box_distance)`` where ``mover`` and ``box`` are
        ``(row, col)`` tuples (``None`` for the mover if no cell holds 5) and
        ``box_distance`` is ``distances`` at the box position.

    Raises:
        TypeError: If no cell holds 4, because the box position is ``None``.
    """
    def _last_position(code):
        # np.argwhere lists matches in row-major order, so the final row is
        # the same cell a full nested scan would end up remembering.
        matches = np.argwhere(room_state == code)
        if len(matches) == 0:
            return None
        return tuple(int(coord) for coord in matches[-1])

    box = _last_position(4)
    mover = _last_position(5)
    box_distance = distances[box[0]][box[1]]
    return mover, box, box_distance

def box2target_change_reward(room_state, next_room_state, distances):
    """Shaping reward for one transition, based on box and mover movement.

    Args:
        room_state: Room grid before the step.
        next_room_state: Room grid after the step.
        distances: Distance grid passed through to ``calc_distances``.

    Returns:
        ``-5.0`` when the two grids are identical. Otherwise the sum of:
        ``+5.0`` / ``-5.0`` if the box's distance value decreased / increased,
        and ``+1.0`` if the mover got closer to the box from at least 2 cells
        away, or ``-1.0`` if the mover moved away and ends at least 2 cells
        away (straight-line distance).
    """
    # A step that changed nothing is penalised outright.
    if np.array_equal(room_state, next_room_state):
        return -5.0

    def _straight_line(cell_a, cell_b):
        return np.sqrt((cell_a[0]-cell_b[0])**2 + (cell_a[1]-cell_b[1])**2)

    player, crate, crate_dist = calc_distances(room_state, distances)
    next_player, next_crate, next_crate_dist = calc_distances(next_room_state, distances)

    shaping = 0.0

    # Box progress relative to the target.
    if crate_dist > next_crate_dist:
        shaping += 5.0
    elif crate_dist < next_crate_dist:
        shaping -= 5.0

    # Mover progress relative to the box.
    gap_before = _straight_line(player, crate)
    gap_after = _straight_line(next_player, next_crate)
    if gap_before > gap_after and gap_before >= 2:
        shaping += 1.0
    elif gap_before < gap_after and gap_after >= 2:
        shaping -= 1.0

    return shaping

# Solution

In [13]:
class SOK_Agent:
    """DQN agent with an online network, a softly-updated target network and
    two replay buffers (a general one and a "prioritized" one).

    States handed to ``act`` / ``remember`` are expected to have a leading
    axis of size 1 followed by two image axes, because ``replay`` rotates
    them with ``np.rot90(..., axes=(1, 2))``.
    """

    def __init__(self):
        # Construct DQN models
        self.state_size = (112,112,1)
        self.action_size = 8
        self.model = self._build_model()
        self.target_model = self._build_model()
        self.target_model.set_weights(self.model.get_weights())
        self.batch_size = 8

        # Replay buffers
        self.replay_buffer = deque(maxlen=10000)
        self.prioritized_replay_buffer = deque(maxlen=5000)

        # Hyperparameters
        self.gamma = 0.9             # discount factor of the bootstrap term
        self.epsilon = 1.0           # current exploration rate
        self.epsilon_min = 0.3       # decay stops once epsilon is not above this
        self.epsilon_decay = 0.995   # multiplicative decay applied per replay()
        self.update_beta = 0.99      # share of the old target weights kept per update

        self.replay_rate = 10
        self.puzzle_replay_rate = 500

        # Action that corresponds to the original one after a single
        # np.rot90 of the frame (used for augmentation in replay()).
        # NOTE(review): 0-3 and 4-7 are presumably the push and pull variants
        # of the four directions - confirm against the environment.
        self.action_rotation_map = {
            0: 2,
            1: 3,
            2: 1,
            3: 0,
            4: 6,
            5: 7,
            6: 5,
            7: 4
        }

    def _build_model(self):
        """Build and compile the convolutional Q-network.

        Returns:
            A compiled Keras ``Sequential`` model mapping a ``state_size``
            input to ``action_size`` linear outputs (MSE loss, Adam).
        """
        model = Sequential()
        # 16x16 kernel with stride 16 turns the 112x112 frame into a 7x7 grid.
        model.add(Conv2D(32, (16,16), strides=(16,16), input_shape=self.state_size, activation='relu'))
        model.add(Conv2D(64, (3,3), activation='relu'))
        model.add(Conv2D(64, (3,3), padding='same', activation='relu'))
        model.add(Flatten())
        model.add(Dense(512, activation='relu'))
        model.add(Dense(128, activation='relu'))
        model.add(Dense(32, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer="adam")
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the general replay buffer."""
        self.replay_buffer.append([state, action, reward, next_state, done])

    def copy_to_prioritized_buffer(self, n):
        """Copy the ``n`` most recent transitions into the prioritized buffer.

        Transitions are copied newest first. ``n`` is clamped to the number of
        stored transitions so an over-large request cannot raise IndexError.
        """
        for i in range(min(n, len(self.replay_buffer))):
            self.prioritized_replay_buffer.append(self.replay_buffer[-1-i])

    def act(self, state, stochastic=False):
        """Choose an action for ``state`` with epsilon-greedy exploration.

        Args:
            state: Input passed to ``self.model.predict``.
            stochastic: When True, sample from the softmax of the predicted
                values instead of taking their argmax.

        Returns:
            Index of the chosen action in ``range(self.action_size)``.
        """
        # Strict comparison: np.random.rand() lies in [0, 1), so epsilon == 0
        # never explores while epsilon == 1 always does.
        if np.random.rand() < self.epsilon:
            return random.randrange(self.action_size)

        act_values = self.model.predict(state, verbose=0)[0]

        if stochastic:
            # Subtract the maximum before exponentiating so that large values
            # cannot overflow to inf and turn the probabilities into NaN.
            shifted = np.asarray(act_values, dtype=np.float64)
            shifted = shifted - shifted.max()
            exp_values = np.exp(shifted)
            act_probs = exp_values / exp_values.sum()
            return np.random.choice(np.arange(self.action_size), size=1, p=act_probs)[0]

        return np.argmax(act_values)

    def replay(self):
        """Run one training round on a sampled, rotation-augmented minibatch.

        Does nothing until the general buffer holds ``batch_size`` transitions.
        The minibatch is drawn half from each buffer once the prioritized
        buffer holds at least half a batch, otherwise fully from the general
        buffer. Every transition is used in four orientations. Afterwards the
        target network is softly updated and epsilon is decayed.
        """
        if len(self.replay_buffer) < self.batch_size:
            return

        half_batch = self.batch_size // 2
        if half_batch == 0 or len(self.prioritized_replay_buffer) < half_batch:
            minibatch = random.sample(self.replay_buffer, self.batch_size)
        else:
            minibatch = random.sample(self.replay_buffer, half_batch)
            minibatch.extend(random.sample(self.prioritized_replay_buffer, half_batch))

        # Size the arrays from the actual minibatch so that an odd batch_size
        # does not leave all-zero rows that would be trained on.
        n_samples = len(minibatch) * 4
        states = np.zeros((n_samples, self.state_size[0], self.state_size[1]))
        actions = np.zeros(n_samples, dtype=int)
        rewards = np.zeros(n_samples)
        next_states = np.zeros((n_samples, self.state_size[0], self.state_size[1]))
        statuses = np.zeros(n_samples)

        for i, (state, action, reward, next_state, done) in enumerate(minibatch):
            for rot in range(4):
                ind = i*4+rot
                if rot != 0:
                    # Rotations accumulate: each pass turns the frames once
                    # more and remaps the action accordingly.
                    state = np.rot90(state, axes=(1,2))
                    next_state = np.rot90(next_state, axes=(1,2))
                    action = self.action_rotation_map.get(action)

                states[ind] = state
                actions[ind] = action
                rewards[ind] = reward
                next_states[ind] = next_state
                statuses[ind] = 1 if done else 0

        # The online network picks the next action, the target network
        # evaluates it.
        targets = self.model.predict(states, verbose=0)
        max_actions = np.argmax(self.model.predict(next_states, verbose=0), axis=1)
        next_rewards = self.target_model.predict(next_states, verbose=0)

        samples = zip(actions, rewards, next_rewards, max_actions, statuses)
        for ind, (action, reward, next_reward, max_action, done) in enumerate(samples):
            if not done:
                reward += self.gamma * next_reward[max_action]
            targets[ind][action] = reward

        self.model.fit(states, targets, epochs=10, verbose=0)

        self.update_target_model()

        if self.epsilon > self.epsilon_min:
            self.epsilon = self.epsilon * self.epsilon_decay

    def update_target_model(self):
        """Blend the online weights into the target network.

        Each target tensor becomes
        ``update_beta * target + (1 - update_beta) * online``.
        """
        model_w = self.model.get_weights()
        target_model_w = self.target_model.get_weights()
        updated_target_model_w = [
            self.update_beta*target_w + (1-self.update_beta)*online_w
            for online_w, target_w in zip(model_w, target_model_w)
        ]
        self.target_model.set_weights(updated_target_model_w)

    def load(self, name):
        """Load the online network's weights from ``name``."""
        self.model.load_weights(name)

    def save(self, name):
        """Save the online network's weights to ``name``."""
        self.model.save_weights(name)

In [14]:
def process_frame(frame):
    """Collapse a frame's last axis and add a leading axis of size 1.

    Args:
        frame: Array with at least three axes; axis 2 is averaged away
            (presumably the RGB channels of an image with 0-255 values).

    Returns:
        The channel mean divided by 255, with shape ``(1, H, W)`` for an
        ``(H, W, C)`` input.
    """
    scaled = frame.mean(axis=2) / 255
    return scaled[np.newaxis, ...]

## Training

#### Test Suite

In [15]:
def test_agent(e, stochastic=False):
    """Evaluate the global ``agent`` on the 100 puzzles seeded 0..99.

    Exploration is switched off (epsilon = 0) for the evaluation and the
    previous epsilon is restored afterwards, even if the evaluation raises.

    Args:
        e: Zero-based training episode index; only used in the printed summary.
        stochastic: Forwarded to ``agent.act``.

    Returns:
        Tuple ``(num_solved, solved_in_steps)``: the number of solved puzzles
        and a dict mapping the step count of a solved episode to how many
        puzzles were solved in exactly that many steps.
    """
    current_epsilon = agent.epsilon
    agent.epsilon = 0.0
    num_solved = 0
    solved_in_steps = defaultdict(int)

    try:
        for t in tqdm(range(100)):
            # The seed fixes the generated room, so every call evaluates the
            # same 100 puzzles.
            random.seed(t)
            sok = PushAndPullSokobanEnv(dim_room=(7, 7), num_boxes=1)
            sok.set_maxsteps(20)
            steps = 0

            state = sok.get_image('rgb_array')
            done = False
            while not done:
                steps += 1
                action = agent.act(process_frame(state), stochastic)
                # Agent actions 0-3 map to env actions 1-4, 4-7 to 9-12.
                if action < 4:
                    action += 1
                else:
                    action += 5
                state, reward, done, info = sok.step(action)

            # A cell holding 3 is taken as "solved"
            # (presumably a box standing on its target - confirm).
            if 3 in sok.room_state:
                num_solved += 1
                solved_in_steps[steps] += 1
    finally:
        # Never leave the agent stuck in greedy mode if evaluation fails.
        agent.epsilon = current_epsilon

    print("Episode %d Solved: %d" % (e+1, num_solved))
    return num_solved, dict(solved_in_steps)

In [20]:
class PuzzleGenerator:
    """Hands out seeded Sokoban puzzles, with an optional replay pass over
    previously unsolved ones.

    Callers append the indices of failed puzzles to ``unsolved_puzzles`` and
    set ``replay_mode = True`` to start a replay pass; the pass ends by itself.
    """

    def __init__(self):
        # Fresh puzzles start at seed 100.
        # NOTE(review): presumably so they never overlap the seeds 0..99 that
        # test_agent evaluates on - confirm.
        self.puzzle_index = 100
        self.replay_index = 0
        self.replay_mode = False
        self.unsolved_puzzles = deque(maxlen=100)
        # Frozen copy of unsolved_puzzles taken when a replay pass starts.
        self._replay_snapshot = []

    def get_puzzle(self):
        """Create the next puzzle environment.

        In replay mode the puzzles that were unsolved when the pass started
        are served once each with a 50-step budget; replay mode then switches
        itself off. Otherwise, or when there is nothing to replay, a fresh
        puzzle with a 20-step budget is served.

        Returns:
            Tuple ``(puzzle_index, sok)``: the seed used and the environment.
        """
        max_steps = 20
        puzzle_index = None

        if self.replay_mode:
            if self.replay_index == 0:
                # Iterate over a snapshot: the live deque is appended to while
                # replaying, and once it is full every append drops the
                # leftmost entry, which would shift entries past replay_index.
                self._replay_snapshot = list(self.unsolved_puzzles)

            if self.replay_index < len(self._replay_snapshot):
                max_steps = 50
                puzzle_index = self._replay_snapshot[self.replay_index]
                self.replay_index += 1

            if self.replay_index >= len(self._replay_snapshot):
                # Pass finished (or nothing to replay): back to fresh puzzles.
                self.replay_index = 0
                self.replay_mode = False

        if puzzle_index is None:
            puzzle_index = self.puzzle_index
            self.puzzle_index += 1

        # Seeding makes the generated room reproducible for a given index.
        random.seed(puzzle_index)
        sok = PushAndPullSokobanEnv(dim_room=(7, 7), num_boxes=1)
        sok.set_maxsteps(max_steps)
        return puzzle_index, sok

In [None]:
# Training driver: plays MAX_EPISODES puzzles, storing every transition and
# training the agent every `agent.replay_rate` steps.
MAX_EPISODES = 50_000

agent = SOK_Agent()
puzzle_generator = PuzzleGenerator()

# Running counters for the progress print-outs (reset every 100 episodes).
running_puzzles = 0
running_solved = 0

for e in range(MAX_EPISODES):
    puzzle_index, sok = puzzle_generator.get_puzzle()
    # get_puzzle() seeded `random` with the puzzle index; re-seed with the
    # episode number so the agent's `random`-based exploration varies per episode.
    random.seed(e)
    running_puzzles += 1
    
    state = process_frame(sok.get_image('rgb_array'))
    room_state = sok.room_state.copy() 
    # Distance grid from the target, computed once per puzzle for reward shaping.
    distances = get_distances(room_state)
    
    for step in range(sok.max_steps):
        action = agent.act(state)
        # Agent actions 0-3 map to env actions 1-4, 4-7 to env actions 9-12.
        if action < 4:
            next_state, reward, done, _ = sok.step(action+1) 
        else:
            next_state, reward, done, _ = sok.step(action+5)         
        
        next_state = process_frame(next_state)        
        next_room_state = sok.room_state
        
        # Shaping is only added on non-terminal steps; terminal steps keep the
        # environment's own reward.
        if not done:
            reward += box2target_change_reward(room_state, next_room_state, distances)
        
        agent.remember(state, action, reward, next_state, done)
        
        state = next_state.copy() 
        room_state = next_room_state.copy()                
        
        if (step+1) % agent.replay_rate == 0:
            agent.replay()            
        
        if done:
            # A cell holding 3 is treated as "puzzle solved"
            # (presumably a box on its target - confirm).
            if 3 in sok.room_state:  
                # Keep the step+1 transitions of this successful episode in the
                # prioritized buffer as well.
                agent.copy_to_prioritized_buffer(step+1)  
                running_solved += 1
            else:
                puzzle_generator.unsolved_puzzles.append(puzzle_index)    
                
            # Progress line "solved | played" every 20 episodes.
            if (e+1) % 20 == 0 and e > 0:
                print(f"{running_solved} | {running_puzzles}") 
                
                if (e+1) % 100 == 0:
                    running_puzzles = 0
                    running_solved = 0
                    
            break
            
    # Evaluate on the fixed test puzzles every 100 episodes.
    if (e+1) % 100 == 0 and e > 0:
        test_agent(e, stochastic=False) 
        
    # Every `puzzle_replay_rate` episodes: start replaying unsolved puzzles and
    # discard all stored transitions from both buffers.
    if (e+1) % agent.puzzle_replay_rate  == 0 and e > 0:
        puzzle_generator.replay_mode = True
        agent.replay_buffer.clear()
        agent.prioritized_replay_buffer.clear()
        print("PUZZLE REPLAY MODE")

4 | 20
7 | 40
10 | 60
16 | 80
22 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 100 Solved: 36
6 | 20
13 | 40
21 | 60
27 | 80
38 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 200 Solved: 60
10 | 20
24 | 40
35 | 60
44 | 80
53 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 300 Solved: 63
10 | 20
23 | 40
34 | 60
42 | 80
52 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 400 Solved: 59
10 | 20
20 | 40
32 | 60
43 | 80
55 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 500 Solved: 69
PUZZLE REPLAY MODE
6 | 20
14 | 40
18 | 60
22 | 80
22 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 600 Solved: 74
14 | 20
28 | 40
38 | 60
50 | 80
61 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 700 Solved: 71
13 | 20
25 | 40
42 | 60
57 | 80
65 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 800 Solved: 74
11 | 20
20 | 40
36 | 60
51 | 80
64 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 900 Solved: 70
10 | 20
21 | 40
37 | 60
49 | 80
60 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 1000 Solved: 73
PUZZLE REPLAY MODE
4 | 20
12 | 40
17 | 60
22 | 80
24 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 1100 Solved: 79
12 | 20
26 | 40
41 | 60
55 | 80
72 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 1200 Solved: 77
15 | 20
31 | 40
44 | 60
56 | 80
70 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 1300 Solved: 73
15 | 20
29 | 40
44 | 60
60 | 80
74 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 1400 Solved: 77
17 | 20
31 | 40
48 | 60
63 | 80
77 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 1500 Solved: 76
PUZZLE REPLAY MODE
7 | 20
11 | 40
13 | 60
20 | 80
23 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 1600 Solved: 67
13 | 20
29 | 40
40 | 60
55 | 80
73 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 1700 Solved: 79
14 | 20
25 | 40
38 | 60
50 | 80
65 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 1800 Solved: 76
11 | 20
23 | 40
39 | 60
49 | 80
62 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 1900 Solved: 75
17 | 20
33 | 40
49 | 60
64 | 80
78 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 2000 Solved: 77
PUZZLE REPLAY MODE
8 | 20
12 | 40
17 | 60
20 | 80
24 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 2100 Solved: 59
11 | 20
22 | 40
33 | 60
45 | 80
59 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 2200 Solved: 59
12 | 20
26 | 40
40 | 60
53 | 80
69 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 2300 Solved: 71
15 | 20
27 | 40
39 | 60
51 | 80
64 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 2400 Solved: 68
12 | 20
24 | 40
38 | 60
54 | 80
68 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 2500 Solved: 72
PUZZLE REPLAY MODE
11 | 20
17 | 40
25 | 60
28 | 80
34 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 2600 Solved: 80
15 | 20
28 | 40
42 | 60
57 | 80
73 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 2700 Solved: 77
13 | 20
29 | 40
43 | 60
59 | 80
72 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 2800 Solved: 77
15 | 20
31 | 40
46 | 60
57 | 80
74 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 2900 Solved: 75
14 | 20
31 | 40
48 | 60
61 | 80
77 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 3000 Solved: 88
PUZZLE REPLAY MODE
7 | 20
13 | 40
19 | 60
28 | 80
34 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 3100 Solved: 82
16 | 20
29 | 40
42 | 60
57 | 80
75 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 3200 Solved: 86
17 | 20
29 | 40
45 | 60
58 | 80
73 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 3300 Solved: 73
11 | 20
23 | 40
39 | 60
51 | 80
66 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 3400 Solved: 84
14 | 20
25 | 40
40 | 60
54 | 80
69 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 3500 Solved: 78
PUZZLE REPLAY MODE
11 | 20
20 | 40
30 | 60
36 | 80
40 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 3600 Solved: 78
14 | 20
24 | 40
37 | 60
52 | 80
66 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 3700 Solved: 83
17 | 20
32 | 40
44 | 60
60 | 80
73 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 3800 Solved: 85
15 | 20
28 | 40
45 | 60
57 | 80
75 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 3900 Solved: 84
17 | 20
36 | 40
53 | 60
70 | 80
86 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 4000 Solved: 75
PUZZLE REPLAY MODE
10 | 20
22 | 40
31 | 60
35 | 80
37 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 4100 Solved: 73
15 | 20
27 | 40
41 | 60
50 | 80
58 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 4200 Solved: 75
18 | 20
34 | 40
48 | 60
59 | 80
78 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 4300 Solved: 72
15 | 20
31 | 40
46 | 60
58 | 80
73 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 4400 Solved: 80
16 | 20
33 | 40
48 | 60
65 | 80
80 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 4500 Solved: 75
PUZZLE REPLAY MODE
14 | 20
22 | 40
34 | 60
45 | 80
51 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Episode 4600 Solved: 68
15 | 20
28 | 40
42 | 60
55 | 80
72 | 100


HBox(children=(FloatProgress(value=0.0), HTML(value='')))