# Final Project - Reinforcements Learning 
Hello dear students,<br> this is the template notebook. Please click on the "File" tab and then on "Save a copy into drive".

---
<br>

### Name and ID:
Student 1: Avraham Raviv, 204355390
<br>
Student 2: Yevgeni Berkovitch, 317079234
<br><br>
<img src="https://play-lh.googleusercontent.com/e_oKlKPISbgdzut1H9opevS7-LTB8-8lsmpCdMkhlnqFenZhpjxbLmx7l158-xQQCIY">

### https://github.com/mpSchrader/gym-sokoban

# Installs

In [1]:
%%capture
!sudo apt-get update
!sudo apt-get install -y xvfb ffmpeg freeglut3-dev
!pip install 'imageio==2.4.0'
!pip install gym
!pip install pygame
!apt-get install python-opengl -y
!apt install xvfb -y
!pip install pyvirtualdisplay
!pip install piglet
!pip install gym
!apt-get install python-opengl -y
!apt install xvfb -y
!pip install gym_sokoban

!imageio_download_bin ffmpeg

# Imports

In [12]:
import random
import time

import numpy as np
import matplotlib.pyplot as plt

import base64
import imageio
from pyvirtualdisplay import Display
from IPython.display import HTML

import gym
from gym import error, spaces, utils
from soko_pap import *

from collections import deque
from queue import PriorityQueue

from tensorflow.keras.optimizers import RMSprop
from keras.models import Model, Sequential
from keras.layers import Input, Dense

In [3]:
%matplotlib inline

In [4]:
imageio.plugins.ffmpeg.download()

In [5]:
from gym import logger as gymlogger
gymlogger.set_level(40) # error only

# Display utils
The cell below contains the video display configuration. No need to make changes here.

In [6]:
def embed_mp4(filename):
    """Embeds an mp4 file in the notebook."""
    video = open(filename,'rb').read()
    b64 = base64.b64encode(video)
    tag = '''
    <video width="640" height="480" controls>
    <source src="data:video/mp4;base64,{0}" type="video/mp4">
    Your browser does not support the video tag.
    </video>'''.format(b64.decode())

    return HTML(tag)

# Utils

In [7]:
def get_distances(room_state):
    for i in range(room_state.shape[0]):
        for j in range(room_state.shape[1]):
            if room_state[i][j] == 2:
                target = (i, j)

    distances = np.zeros(shape=room_state.shape)
    visited_cells = set()
    cell_queue = deque()

    visited_cells.add(target)
    cell_queue.appendleft(target)

    while len(cell_queue) != 0:
        cell = cell_queue.pop()
        distance = distances[cell[0]][cell[1]]
        for x,y in ((1,0), (-1,-0), (0,1), (0,-1)):
            next_cell_x, next_cell_y = cell[0]+x, cell[1]+y
            if room_state[next_cell_x][next_cell_y] != 0 and not (next_cell_x, next_cell_y) in visited_cells:
                distances[next_cell_x][next_cell_y] = distance + 1
                visited_cells.add((next_cell_x, next_cell_y))
                cell_queue.appendleft((next_cell_x, next_cell_y))
                
    return distances   

def calc_distances(room_state, distances):
    box = None
    mover = None
    for i in range(room_state.shape[0]):
        for j in range(room_state.shape[1]):            
            if room_state[i][j] == 4:
                box = (i,j)
            
            if room_state[i][j] == 5:
                mover = (i,j)
    
    return mover, box, distances[box[0]][box[1]]   

def box2target_change_reward(room_state, next_room_state, distances):
    if np.array_equal(room_state, next_room_state):
        return -0.5
    
    mover, box, t2b = calc_distances(room_state, distances)
    n_mover, n_box, n_t2b = calc_distances(next_room_state, distances)
    
    change_reward = 0.0
    if n_t2b < t2b:
        change_reward += 1.0
    elif n_t2b > t2b:
        change_reward -= 1.0
        
    m2b = np.sqrt((mover[0]-box[0])**2 + (mover[1]-box[1])**2)
    n_m2b = np.sqrt((n_mover[0]-n_box[0])**2 + (n_mover[1]-n_box[1])**2)
    
    if n_m2b < m2b and m2b >= 2:
        change_reward += 0.25
    elif n_m2b > m2b and n_m2b >= 2:
        change_reward -= 0.25
        
    return change_reward   

# Solution

In [29]:
class SOK_Agent:
    def __init__(self):
        # Construct DQN models
        self.state_size = (25,) 
        self.action_size = 8        
        self.batch_size = 8
        
        # Replay buffers
        self.unsolved_buffer = deque(maxlen=5000)
        self.randomly_solved_buffer = deque(maxlen=3000)      
        self.solved_buffer = deque(maxlen=1000)
        
        # Hyperparameters
        self.gamma = 0.9
        self.epsilon = 1.0   
        self.epsilon_min = 0.1
        self.epsilon_decay = 0.9995
        self.epsilon_update_rate = 10
        self.replay_rate = 10

        self.verbosity = 100 
        
        # Actor-Critic
        self.build_actor_critic()

    def build_actor_critic(self):    
        X_input = Input(self.state_size)
        X = Dense(512, activation="elu", kernel_initializer='he_uniform')(X_input)
        
        action = Dense(self.action_size, activation="softmax", kernel_initializer='he_uniform')(X)
        value = Dense(1, kernel_initializer='he_uniform')(X)

        self.actor = Model(inputs = X_input, outputs = action)
        self.actor.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=0.000025))

        self.critic = Model(inputs = X_input, outputs = value)
        self.critic.compile(loss='mse', optimizer=RMSprop(lr=0.000025))    
    
    def remember(self, state, action, reward, next_state, done, status='unsolved'):
        if status == 'unsolved':
            self.unsolved_buffer.append([state, action, reward, next_state, done])
        elif status == 'solved':
            self.solved_buffer.append([state, action, reward, next_state, done])
        else:
            self.randomly_solved_buffer.append([state, action, reward, next_state, done])    

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        
        act_values = self.actor.predict(state, verbose=0)[0]
        return np.argmax(act_values)

    def replay(self):  
        minibatch = random.sample(self.unsolved_buffer, self.batch_size//4) 
        minibatch.extend(random.sample(self.randomly_solved_buffer, self.batch_size//2))
        minibatch.extend(random.sample(self.solved_buffer, self.batch_size//4))
        
        states = np.zeros((self.batch_size, self.state_size[0]))
        actions = np.zeros((self.batch_size, self.action_size))
        rewards = np.zeros(self.batch_size)
        
        for i, (state, action, reward, next_state, done) in enumerate(minibatch): 
            states[i] = state.copy()
            actions[i][action] = 1.0
            rewards[i] = reward
            
        values = self.critic.predict(states)[:, 0]
        advantages = rewards - values
        
        self.actor.fit(states, actions, sample_weight=advantages, epochs=1, verbose=0)
        self.critic.fit(states, rewards, epochs=1, verbose=0)
        
    def update_epsilon(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon = self.epsilon * self.epsilon_decay
            
    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)

In [30]:
def process_frame(frame):
    f = frame[16:96, 16:96, 0]   
    f = f.reshape(5, 16, 5, 16).max(axis=(1, 3))
    f = f.flatten()
    f = f / 255
    return np.expand_dims(f, axis=0)

## Training

In [31]:
max_episodes = 10000
max_steps = 100

def init_sok(r):
    random.seed(r%10)
    sok = PushAndPullSokobanEnv(dim_room=(7, 7), num_boxes=1)
    sok.set_maxsteps(max_steps)
    return sok

In [32]:
agent = SOK_Agent()
successes_before_train = 10
successful_episodes = 0
continuous_successes_goal = 100
continuous_successes = 0
total_steps = 0

steps_per_episode = []

for e in range(max_episodes):
    if continuous_successes >= continuous_successes_goal:
        print("Agent training finished!")
        break
    
    print("Episode: %d" % (e))
    
    sok = init_sok(e)
    state = process_frame(sok.get_image('rgb_array'))
    random.seed(e)
    
    room_state = sok.room_state.copy() 
    distances = get_distances(room_state)
    
    episode_memory = []
    
    for step in range(sok.max_steps):
        total_steps += 1
        current_epsilon = agent.epsilon
        if current_epsilon < 0.2 and step >= 20:
            agent.epsilon = 0.8 
        action = agent.act(state)
        agent.epsilon = current_epsilon
        
        if action < 4:
            next_state, reward, done, _ = sok.step(action+1) 
        else:
            next_state, reward, done, _ = sok.step(action+5)         
        
        next_state = process_frame(next_state)        
        next_room_state = sok.room_state
        
        if not done:
            reward += box2target_change_reward(room_state, next_room_state, distances)
        
        episode_memory.append((state, room_state, action, reward, next_state, done))
        
        state = next_state.copy() 
        room_state = next_room_state.copy()
                
        if successful_episodes >= successes_before_train:
            if (total_steps+1) % agent.replay_rate == 0:
                agent.replay() 
                agent.update_epsilon()
        
        if done: 
            status = 'unsolved'
            if step < 20:
                status = 'solved'
            elif 3 in sok.room_state:
                status = 'randomly_solved'
                
            states_actions_rewards = {}
            running_reward = 0            
            for state, room_state, action, reward, next_state, was_done in episode_memory[::-1]: 
                state_action_key = tuple((tuple(room_state.flatten()), action))
                if state_action_key in states_actions_rewards:
                    running_reward = states_actions_rewards[state_action_key]
                    continue
                    
                if was_done:
                    running_reward = reward
                else:    
                    running_reward += agent.gamma * reward

                agent.remember(state, action, running_reward, next_state, done, status)
                states_actions_rewards[state_action_key] = running_reward
            
            if status != 'unsolved':
                successful_episodes += 1
                continuous_successes += 1
                print("SOLVED! Episode %d Steps: %d Epsilon %.4f" % (e, step+1, agent.epsilon))
            else:
                continuous_successes = 0
                
            steps_per_episode.append(step+1)
            #agent.save("exp1_episode%d.h5" % (e))
            
            break

Episode: 0
Episode: 1
Episode: 2
Episode: 3
SOLVED! Episode 3 Steps: 54 Epsilon 1.0000
Episode: 4
SOLVED! Episode 4 Steps: 1 Epsilon 1.0000
Episode: 5
SOLVED! Episode 5 Steps: 28 Epsilon 1.0000
Episode: 6
Episode: 7
SOLVED! Episode 7 Steps: 25 Epsilon 1.0000
Episode: 8
Episode: 9
SOLVED! Episode 9 Steps: 74 Epsilon 1.0000
Episode: 10
Episode: 11
SOLVED! Episode 11 Steps: 53 Epsilon 1.0000
Episode: 12
Episode: 13
Episode: 14
SOLVED! Episode 14 Steps: 2 Epsilon 1.0000
Episode: 15
Episode: 16
Episode: 17
Episode: 18
SOLVED! Episode 18 Steps: 6 Epsilon 1.0000
Episode: 19
Episode: 20
Episode: 21
Episode: 22
Episode: 23
SOLVED! Episode 23 Steps: 2 Epsilon 1.0000
Episode: 24
SOLVED! Episode 24 Steps: 3 Epsilon 1.0000
Episode: 25
Episode: 26
Episode: 27
Episode: 28
SOLVED! Episode 28 Steps: 7 Epsilon 0.9846
Episode: 29
Episode: 30
Episode: 31
SOLVED! Episode 31 Steps: 20 Epsilon 0.9738
Episode: 32
Episode: 33
SOLVED! Episode 33 Steps: 16 Epsilon 0.9680
Episode: 34
Episode: 35
Episode: 36
Episo

SOLVED! Episode 307 Steps: 3 Epsilon 0.3465
Episode: 308
SOLVED! Episode 308 Steps: 2 Epsilon 0.3465
Episode: 309
Episode: 310
Episode: 311
SOLVED! Episode 311 Steps: 3 Epsilon 0.3431
Episode: 312
Episode: 313
Episode: 314
Episode: 315
Episode: 316
Episode: 317
SOLVED! Episode 317 Steps: 3 Epsilon 0.3344
Episode: 318
SOLVED! Episode 318 Steps: 2 Epsilon 0.3344
Episode: 319
Episode: 320
Episode: 321
SOLVED! Episode 321 Steps: 3 Epsilon 0.3311
Episode: 322
Episode: 323
Episode: 324
Episode: 325
Episode: 326
Episode: 327
SOLVED! Episode 327 Steps: 3 Epsilon 0.3228
Episode: 328
SOLVED! Episode 328 Steps: 2 Epsilon 0.3228
Episode: 329
Episode: 330
Episode: 331
SOLVED! Episode 331 Steps: 3 Epsilon 0.3196
Episode: 332
Episode: 333
Episode: 334
Episode: 335
Episode: 336
Episode: 337
SOLVED! Episode 337 Steps: 3 Epsilon 0.3117
Episode: 338
SOLVED! Episode 338 Steps: 2 Epsilon 0.3115
Episode: 339
Episode: 340
Episode: 341
SOLVED! Episode 341 Steps: 3 Epsilon 0.3084
Episode: 342
Episode: 343
Epis

Episode: 585
Episode: 586
SOLVED! Episode 586 Steps: 37 Epsilon 0.1374
Episode: 587
SOLVED! Episode 587 Steps: 3 Epsilon 0.1374
Episode: 588
SOLVED! Episode 588 Steps: 2 Epsilon 0.1374
Episode: 589
Episode: 590
Episode: 591
SOLVED! Episode 591 Steps: 3 Epsilon 0.1359
Episode: 592
Episode: 593
SOLVED! Episode 593 Steps: 22 Epsilon 0.1351
Episode: 594
SOLVED! Episode 594 Steps: 60 Epsilon 0.1347
Episode: 595
Episode: 596
Episode: 597
SOLVED! Episode 597 Steps: 3 Epsilon 0.1334
Episode: 598
SOLVED! Episode 598 Steps: 2 Epsilon 0.1334
Episode: 599
Episode: 600
Episode: 601
SOLVED! Episode 601 Steps: 3 Epsilon 0.1320
Episode: 602
Episode: 603
SOLVED! Episode 603 Steps: 27 Epsilon 0.1312
Episode: 604
Episode: 605
Episode: 606
Episode: 607
SOLVED! Episode 607 Steps: 3 Epsilon 0.1292
Episode: 608
SOLVED! Episode 608 Steps: 2 Epsilon 0.1292
Episode: 609
SOLVED! Episode 609 Steps: 92 Epsilon 0.1286
Episode: 610
Episode: 611
SOLVED! Episode 611 Steps: 3 Epsilon 0.1280
Episode: 612
Episode: 613
SO

SOLVED! Episode 843 Steps: 22 Epsilon 0.1000
Episode: 844
SOLVED! Episode 844 Steps: 48 Epsilon 0.1000
Episode: 845
Episode: 846
Episode: 847
SOLVED! Episode 847 Steps: 3 Epsilon 0.1000
Episode: 848
SOLVED! Episode 848 Steps: 2 Epsilon 0.1000
Episode: 849
Episode: 850
Episode: 851
SOLVED! Episode 851 Steps: 3 Epsilon 0.1000
Episode: 852
Episode: 853
SOLVED! Episode 853 Steps: 7 Epsilon 0.1000
Episode: 854
SOLVED! Episode 854 Steps: 97 Epsilon 0.1000
Episode: 855
Episode: 856
Episode: 857
SOLVED! Episode 857 Steps: 3 Epsilon 0.1000
Episode: 858
SOLVED! Episode 858 Steps: 2 Epsilon 0.1000
Episode: 859
Episode: 860
Episode: 861
SOLVED! Episode 861 Steps: 3 Epsilon 0.1000
Episode: 862
Episode: 863
SOLVED! Episode 863 Steps: 34 Epsilon 0.1000
Episode: 864
Episode: 865
Episode: 866
Episode: 867
SOLVED! Episode 867 Steps: 3 Epsilon 0.1000
Episode: 868
SOLVED! Episode 868 Steps: 2 Epsilon 0.1000
Episode: 869
Episode: 870
Episode: 871
SOLVED! Episode 871 Steps: 3 Epsilon 0.1000
Episode: 872
Epi

SOLVED! Episode 1098 Steps: 2 Epsilon 0.1000
Episode: 1099
Episode: 1100
Episode: 1101
SOLVED! Episode 1101 Steps: 3 Epsilon 0.1000
Episode: 1102
Episode: 1103
SOLVED! Episode 1103 Steps: 32 Epsilon 0.1000
Episode: 1104
Episode: 1105
Episode: 1106
SOLVED! Episode 1106 Steps: 85 Epsilon 0.1000
Episode: 1107
SOLVED! Episode 1107 Steps: 3 Epsilon 0.1000
Episode: 1108
SOLVED! Episode 1108 Steps: 7 Epsilon 0.1000
Episode: 1109
Episode: 1110
Episode: 1111
SOLVED! Episode 1111 Steps: 3 Epsilon 0.1000
Episode: 1112
Episode: 1113
SOLVED! Episode 1113 Steps: 22 Epsilon 0.1000
Episode: 1114
Episode: 1115
Episode: 1116
Episode: 1117
SOLVED! Episode 1117 Steps: 3 Epsilon 0.1000
Episode: 1118
Episode: 1119
Episode: 1120
Episode: 1121
SOLVED! Episode 1121 Steps: 3 Epsilon 0.1000
Episode: 1122
Episode: 1123
SOLVED! Episode 1123 Steps: 39 Epsilon 0.1000
Episode: 1124
Episode: 1125
Episode: 1126
SOLVED! Episode 1126 Steps: 81 Epsilon 0.1000
Episode: 1127
SOLVED! Episode 1127 Steps: 3 Epsilon 0.1000
Epis

KeyboardInterrupt: 