# Final Project - Reinforcements Learning 
Hello dear students,<br> this is the template notebook. Please click on the "File" tab and then on "Save a copy into drive".

---
<br>

### Name and ID:
Student 1: Avraham Raviv, 204355390
<br>
Student 2: Yevgeni Berkovitch, 317079234
<br><br>
<img src="https://play-lh.googleusercontent.com/e_oKlKPISbgdzut1H9opevS7-LTB8-8lsmpCdMkhlnqFenZhpjxbLmx7l158-xQQCIY">

### https://github.com/mpSchrader/gym-sokoban

# Installs

In [1]:
%%capture
!sudo apt-get update
!sudo apt-get install -y xvfb ffmpeg freeglut3-dev
!pip install 'imageio==2.4.0'
!pip install gym
!pip install pygame
!apt-get install python-opengl -y
!apt install xvfb -y
!pip install pyvirtualdisplay
!pip install piglet
!pip install gym
!apt-get install python-opengl -y
!apt install xvfb -y
!pip install gym_sokoban

!imageio_download_bin ffmpeg

# Imports

In [2]:
from soko_pap import *

import random
from collections import defaultdict, deque

import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim

In [3]:
%matplotlib inline

In [4]:
imageio.plugins.ffmpeg.download()

In [5]:
from gym import logger as gymlogger
gymlogger.set_level(40) # error only

# Display utils
The cell below contains the video display configuration. No need to make changes here.

In [6]:
def embed_mp4(filename):
    """Embeds an mp4 file in the notebook."""
    video = open(filename,'rb').read()
    b64 = base64.b64encode(video)
    tag = '''
    <video width="640" height="480" controls>
    <source src="data:video/mp4;base64,{0}" type="video/mp4">
    Your browser does not support the video tag.
    </video>'''.format(b64.decode())

    return HTML(tag)

# Utils

In [7]:
def calc_distances(room_state):
    for i in range(room_state.shape[0]):
        for j in range(room_state.shape[1]):
            if room_state[i][j] == 2:
                target = (i, j)

    distances = np.zeros(shape=room_state.shape)
    visited_cells = set()
    cell_queue = deque()

    visited_cells.add(target)
    cell_queue.appendleft(target)

    while len(cell_queue) != 0:
        cell = cell_queue.pop()
        distance = distances[cell[0]][cell[1]]
        for x,y in ((1,0), (-1,-0), (0,1), (0,-1)):
            next_cell_x, next_cell_y = cell[0]+x, cell[1]+y
            if room_state[next_cell_x][next_cell_y] != 0 and not (next_cell_x, next_cell_y) in visited_cells:
                distances[next_cell_x][next_cell_y] = distance + 1
                visited_cells.add((next_cell_x, next_cell_y))
                cell_queue.appendleft((next_cell_x, next_cell_y))
                
    return distances   

def fetch_distances(room_state, distances):
    box = None
    mover = None
    for i in range(room_state.shape[0]):
        for j in range(room_state.shape[1]):            
            if room_state[i][j] == 4:
                box = (i,j)
            
            if room_state[i][j] == 5:
                mover = (i,j)
    
    return mover, box, distances[box[0]][box[1]]   

def box2target_change_reward(room_state, next_room_state, distances):
    if np.array_equal(room_state, next_room_state):
        return -1.0
    
    mover, box, t2b = fetch_distances(room_state, distances)
    n_mover, n_box, n_t2b = fetch_distances(next_room_state, distances)
    
    change_reward = 0.0
    if n_t2b < t2b:
        change_reward += 5.0
    elif n_t2b > t2b:
        change_reward -= 5.0
        
    m2b = np.sqrt((mover[0]-box[0])**2 + (mover[1]-box[1])**2)
    n_m2b = np.sqrt((n_mover[0]-n_box[0])**2 + (n_mover[1]-n_box[1])**2)
    
    if n_m2b < m2b and m2b >= 2:
        change_reward += 1.0
    elif n_m2b > m2b and n_m2b >= 2:
        change_reward -= 1.0
        
    return change_reward  

def process_frame(frame):
    f = frame.mean(axis=2)
    f = f / 255
    return np.expand_dims(f, axis=0)

action_rotation_map = {
    0: 2,
    1: 3,
    2: 1,
    3: 0,
    4: 6,
    5: 7,
    6: 5,
    7: 4
    }


def calc_rot_action(org_action, aug):
    # given an action and an augmentation, return the action that is equivalent to the augmentation
    # action is number from range 0-action_size, and aug is number from range 1-3
    # for single augmentation step, the action is change by using action_rotation_map
    action = org_action
    for i in range(aug):
        action = action_rotation_map[action]
    return action

# Solution

In [8]:
class Sokoban_DNN_Model(nn.Module):
    def __init__(self, state_size, action_size):
        super(Sokoban_DNN_Model, self).__init__()
        self.conv1 = nn.Conv2d(state_size[2], 32, kernel_size=16, stride=16)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3)
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(64 * 5 * 5, 512)
        self.fc2 = nn.Linear(512, 64)
        self.fc3 = nn.Linear(64, action_size)
        self.ReLU = nn.ReLU()

    def forward(self, x):
        x = self.ReLU(self.conv1(x))
        x = self.ReLU(self.conv2(x))
        x = self.flatten(x)
        x = self.ReLU(self.fc1(x))
        x = self.ReLU(self.fc2(x))
        x = self.fc3(x)
        return x

In [9]:
class SOK_Agent:
    def __init__(self):
        # Construct DQN models
        self.state_size = (112, 112, 1)
        self.action_size = 8
        self.model = self._build_model()
        self.target_model = self._build_model()
        self.target_model.load_state_dict(self.model.state_dict())
        self.batch_size = 8

        # Replay buffers
        self.replay_buffer = deque(maxlen=5000)
        self.prioritized_replay_buffer = deque(maxlen=500)

        # Hyper parameters
        self.gamma = 0.9
        self.epsilon = 1.0
        self.epsilon_min = 0.3
        self.epsilon_decay = 0.995
        self.replay_rate = 10
        self.update_beta = 0.9999
        self.optimizer = optim.Adam(self.model.parameters())
        
        # info
        self.max_steps = 20
        self.max_episodes = 50000
        self.solved = 0
        self.test_rate = 100

    def _build_model(self):
        model = Sokoban_DNN_Model(self.state_size, self.action_size)
        return model

    def remember(self, state, action, reward, next_state, done):
        self.replay_buffer.append([state, action, reward, next_state, done])

    def copy_to_prioritized_buffer(self, n):
        for i in range(n):
            self.prioritized_replay_buffer.append(self.replay_buffer[-1 - i])

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)

        state_tensor = torch.tensor(state, dtype=torch.float32)
        act_values = self.model(state_tensor.unsqueeze(0)).detach()[0]
        return act_values.argmax().item()

    def replay(self):
        if len(self.replay_buffer) < self.batch_size:
            return

        if len(self.prioritized_replay_buffer) < self.batch_size // 2:
            minibatch = random.sample(self.replay_buffer, self.batch_size)
        else:
            minibatch = random.sample(self.replay_buffer, self.batch_size // 2)
            minibatch.extend(random.sample(self.prioritized_replay_buffer, self.batch_size // 2))

        states = torch.zeros((self.batch_size*4, 1, self.state_size[0], self.state_size[1]))
        actions = torch.zeros(self.batch_size*4)
        rewards = torch.zeros(self.batch_size*4)
        next_states = torch.zeros((self.batch_size*4, 1, self.state_size[0], self.state_size[1]))
        statuses = torch.zeros(self.batch_size*4)        
        
        for i, (state, action, reward, next_state, done) in enumerate(minibatch):
            state_tensor = torch.tensor(state, dtype=torch.float32)
            next_state_tensor = torch.tensor(next_state, dtype=torch.float32)
            for aug in range(4):
                ind = i*4+aug               
                org_state = state_tensor.clone().detach()
                states[ind] = torch.rot90(org_state, k=aug, dims=[1, 2])
                actions[ind] = calc_rot_action(action, aug)
                rewards[ind] = reward
                org_next_state = next_state_tensor.clone().detach()
                next_states[ind] = torch.rot90(org_next_state, k=aug, dims=[1, 2])
                statuses[ind] = 1 if done else 0

        # targets = self.model.predict(states)
        # max_actions = np.argmax(self.model.predict(next_states), axis=1)
        # next_rewards = self.target_model.predict(next_states)
        targets = self.model(states).detach()
        max_actions = torch.argmax(self.model(next_states).detach(), dim=1)
        next_rewards = self.target_model(next_states).detach()

        ind = 0
        for action, reward, next_reward, max_action, done in zip(actions, rewards, next_rewards, max_actions, statuses):
            if not done:
                reward += self.gamma * next_reward[max_action]
            targets[ind][action.int().item()] = reward.long()
            ind += 1

        for _ in range(10):    
            self.optimizer.zero_grad()
            loss = torch.nn.functional.mse_loss(self.model(states), targets)
            loss.backward()
            self.optimizer.step()
        
        self.update_target_model()

        if self.epsilon > self.epsilon_min:
            self.epsilon = self.epsilon * self.epsilon_decay

        return loss

    def update_target_model(self):
        model_dict = self.model.state_dict()
        target_dict = self.target_model.state_dict()
        for key in model_dict:
            target_dict[key] = self.update_beta * target_dict[key] + (1 - self.update_beta) * model_dict[key]

    def test_agent(self, e, stochastic=False):
        current_epsilon = self.epsilon
        self.epsilon = 0.0
        num_solved = 0
        solved_in_steps = defaultdict(int)

        for t in range(100):
            random.seed(t)
            sok = PushAndPullSokobanEnv(dim_room=(7, 7), num_boxes=1)
            sok.set_maxsteps(self.max_steps)
            steps = 0

            state = sok.get_image('rgb_array')
            done = False
            while not done:
                steps += 1
                action = self.act(process_frame(state))
                if action < 4:
                    action += 1
                else:
                    action += 5
                state, reward, done, info = sok.step(action)

            if 3 in sok.room_state:
                num_solved += 1
                solved_in_steps[steps] += 1
        
        print("Episode %d Solved: %d (%s)" % (e, num_solved, "Stochastic" if stochastic else "Deterministic"))

        return num_solved

    def init_sok(self, r):
        random.seed(r + 100)
        Sok = PushAndPullSokobanEnv(dim_room=(7, 7), num_boxes=1)
        Sok.set_maxsteps(self.max_steps)
        return Sok

## Training

In [11]:
agent = SOK_Agent()
evaluation = []

print('Starting training')

for e in range(agent.max_episodes):
    if e % 100 == 0 and e > 0:
        num_s = agent.test_agent(e, stochastic=False)
        evaluation.append(num_s)
        if num_s > agent.solved:
            agent.solved = num_s           
            torch.save(agent.model.state_dict(), f"models/ddqn2_{e}_{num_s}.pth")

    sok = agent.init_sok(e)
    random.seed(e)

    state = process_frame(sok.get_image('rgb_array'))
    room_state = sok.room_state.copy()
    distances = calc_distances(room_state)

    for step in range(sok.max_steps):
        action = agent.act(state)
        if action < 4:
            next_state, reward, done, _ = sok.step(action + 1)
        else:
            next_state, reward, done, _ = sok.step(action + 5)

        next_state = process_frame(next_state)
        next_room_state = sok.room_state

        if not done:
            reward += box2target_change_reward(room_state, next_room_state, distances)

        agent.remember(state, action, reward, next_state, done)

        state = next_state.copy()
        room_state = next_room_state.copy()

        if (step + 1) % agent.replay_rate == 0 and step > 0:
            agent.replay()            

        if done:
            if 3 in sok.room_state:
                agent.copy_to_prioritized_buffer(step+1)
                
            break

Starting training
Episode 100 Solved: 39 (Deterministic)
Episode 200 Solved: 43 (Deterministic)
Episode 300 Solved: 47 (Deterministic)
Episode 400 Solved: 47 (Deterministic)
Episode 500 Solved: 49 (Deterministic)
Episode 600 Solved: 52 (Deterministic)
Episode 700 Solved: 61 (Deterministic)
Episode 800 Solved: 62 (Deterministic)
Episode 900 Solved: 61 (Deterministic)
Episode 1000 Solved: 60 (Deterministic)
Episode 1100 Solved: 60 (Deterministic)
Episode 1200 Solved: 60 (Deterministic)
Episode 1300 Solved: 65 (Deterministic)
Episode 1400 Solved: 65 (Deterministic)
Episode 1500 Solved: 59 (Deterministic)
Episode 1600 Solved: 62 (Deterministic)


KeyboardInterrupt: 