# **Imports for making a maze**

In [0]:
import math
import copy
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
from collections import namedtuple
from itertools import count
from IPython.display import HTML
from IPython import display as ipythondisplay
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
from torch.autograd import Variable

is_ipython = 'inline' in matplotlib.get_backend()

## **Maze**

In [0]:
class Maze(object):
    def __init__(self, size=10, blocks_rate=0.1):
        self.size = size if size > 3 else 10
        self.blocks = int((size ** 2) * blocks_rate) 
        self.s_list = []
        self.maze_list = []
        self.e_list = []
        self.start_point = None
        self.goal_point = None

    def create_mid_lines(self, k):
        if k == 0: self.maze_list.append(self.s_list)
        elif k == self.size - 1: self.maze_list.append(self.e_list)
        else:
            tmp_list = []
            for l in range(0,self.size):
                if l == 0: tmp_list.extend("#")
                elif l == self.size-1: tmp_list.extend("#")
                else:
                    a = -1
                    tmp_list.extend([a])
            self.maze_list.append(tmp_list)

    def insert_blocks(self, k, s_r, e_r):
        b_y = random.randint(1, self.size-2)
        b_x = random.randint(1, self.size-2)
        if [b_y, b_x] == [1, s_r] or [b_y, b_x] == [self.size - 2, e_r]:
          k = k-1
        else: 
          self.maze_list[b_y][b_x] = "#"
          if self.maze_list[b_y-1][b_x] == -1:
            self.maze_list[b_y-1][b_x] = -2
          if self.maze_list[b_y+1][b_x] == -1:
            self.maze_list[b_y+1][b_x] = -2
          if self.maze_list[b_y][b_x-1] == -1:
            self.maze_list[b_y][b_x-1] = -2
          if self.maze_list[b_y][b_x+1] == -1:
            self.maze_list[b_y][b_x+1] = -2
          if self.maze_list[b_y+1][b_x+1] == -1:
            self.maze_list[b_y+1][b_x+1] = -2
          if self.maze_list[b_y-1][b_x-1] == -1:
            self.maze_list[b_y-1][b_x-1] = -2
          if self.maze_list[b_y+1][b_x-1] == -1:
            self.maze_list[b_y+1][b_x-1] = -2
          if self.maze_list[b_y-1][b_x+1] == -1:
            self.maze_list[b_y-1][b_x+1] = -2
            
    def generate_maze(self): 
        s_r = random.randint(1, int((self.size / 2)) - 1)
        for i in range(0, self.size):
            # if i == s_r: self.s_list.extend("S")
            # else:
            self.s_list.extend("#")
        self.start_point = [1, s_r]

        e_r = random.randint(int((self.size / 2)) + 1, self.size - 2)
        for j in range(0, self.size):
            # if j == e_r: self.e_list.extend([50])
            # else: 
            self.e_list.extend("#")
        self.goal_point = [self.size - 2, e_r]

        for k in range(0, self.size):
            self.create_mid_lines(k)

        # self.maze_list[self.start_point[0]][self.start_point[1]] = "S"
        self.maze_list[self.goal_point[0]][self.goal_point[1]] = 10
        
        for k in range(self.blocks):
            self.insert_blocks(k, s_r, e_r)

        return self.maze_list, self.start_point, self.goal_point

# **Maze functions**

In [0]:
class Field(object):
    def __init__(self, maze, start_point, goal_point):
        self.maze = maze
        self.start_point = start_point
        self.goal_point = goal_point
        self.movable_vec = [[1,0],[-1,0],[0,1],[0,-1]]

    def display(self, point=None):
        field_data = copy.deepcopy(self.maze)
        if not point is None:
                y, x = point
                field_data[y][x] = "@@"
        else:
                point = ""
        for line in field_data:
                print ("\t" + "%3s " * len(line) % tuple(line))

    def get_val(self, state):
        y, x = state[0],state[1]
        if state[0] == self.start_point[0] and state[1] == self.start_point[1]:
            return 0.0, False
        elif not(0<=x<len(self.maze) or 0<=y<len(self.maze)):
            return -1.0,True
        elif self.maze[y][x] == "#":
            return -1.0,True
        else:
            v = float(self.maze[y][x])
            if state[0] == self.goal_point[0] and state[1] == self.goal_point[1]: 
                return v, True
            else: 
                return v/10, False

# **Generate Maze**

In [0]:
size = 15
barriar_rate = 0.1

maze_1 = Maze(size, barriar_rate)
maze, start_point, goal_point = maze_1.generate_maze()
maze_field = Field(maze, start_point, goal_point)

maze_field.display()

	  #   #   #   #   #   #   #   #   #   #   #   #   #   #   # 
	  #  -1  -1  -1  -2  -2  -2  -1  -1  -1  -2  -2   #  -2   # 
	  #  -2  -2  -2  -2   #  -2  -1  -1  -1  -2   #  -2  -2   # 
	  #  -2   #  -2   #  -2  -2  -2  -2  -2  -2   #   #  -2   # 
	  #  -2  -2  -2  -2  -2  -1  -2   #  -2  -2  -2  -2  -2   # 
	  #  -1  -2  -2  -2  -1  -1  -2  -2  -2  -1  -2  -2  -2   # 
	  #  -1  -2   #  -2  -1  -1  -1  -1  -1  -1  -2   #   #   # 
	  #  -2  -2  -2  -2  -1  -1  -1  -1  -1  -1  -2  -2  -2   # 
	  #   #  -2  -2  -2  -2  -1  -1  -1  -1  -1  -2  -2  -2   # 
	  #  -2  -2  -2   #  -2  -1  -1  -1  -1  -1  -2   #  -2   # 
	  #  -2   #  -2  -2  -2  -2  -2  -1  -1  -2  -2  -2   #   # 
	  #  -2  -2  -2  -2  -2   #  -2  -2  -2  -2   #  -2  -2   # 
	  #  -1  -1  -1  -2   #  -2  -2   #  -2  -2  -2  -2  -2   # 
	  #  -1  -1  -1  -2  -2  -2  -2  -2  10  -1  -1  -2   #   # 
	  #   #   #   #   #   #   #   #   #   #   #   #   #   #   # 


In [0]:
class DQN(nn.Module):
  def __init__(self):
    super().__init__()

    self.fc1 = nn.Linear(in_features=2,out_features=128)
    self.tanh1 = nn.Tanh()
    self.fc2 = nn.Linear(in_features=128,out_features=128)
    self.tanh2 = nn.Tanh()
    self.fc3 = nn.Linear(in_features=128,out_features=128)
    self.tanh3 = nn.Tanh()
    self.out = nn.Linear(in_features=128, out_features=4)

  def forward(self,t):
    t = self.tanh1(self.fc1(t))
    t = self.tanh2(self.fc2(t))
    t = self.tanh3(self.fc3(t))
    t = self.out(t)
    return t

In [0]:
Experience = namedtuple('Experience',('state','action','next_state','reward'))

In [0]:
class ReplayMemory():
  def __init__(self,capacity):
    self.capacity = capacity
    self.memory = []
    self.push_count = 0

  def push(self,experience):
    if len(self.memory)<self.capacity:
      self.memory.append(experience)
    else:
      self.memory[self.push_count%self.capacity] = experience
    self.push_count+=1

  def sample(self,batch_size):
    return random.sample(self.memory, batch_size)

  def can_provide_sample(self,batch_size):
    return len(self.memory)>=batch_size

In [0]:
class EpsilonGreedyStrategy():
  def __init__(self,start,end,decay):
    self.start = start
    self.end = end
    self.decay = decay

  def get_exploration_rate(self,current_step):
    return self.end+(self.start-self.end)*math.exp(-1*current_step*self.decay)

In [0]:
class Agent():
  def __init__(self,strategy,num_actions,device):
    self.current_step = 0
    self.strategy = strategy
    self.num_actions = num_actions
    self.device = device

  def select_action(self,state,policy_net):
    rate = strategy.get_exploration_rate(self.current_step)
    self.current_step+=1

    if rate>random.random():
        # print("in if")
        action = random.randrange(self.num_actions) # 0 means down; 1 means up; 2 means right; 3 means left
        return torch.tensor([action]).to(device)
    else:
        # print("in else")
        with torch.no_grad():
            state = torch.from_numpy(state).to(self.device)
            state = state.float()
            return policy_net(state).argmax(dim=0)

In [0]:
batch_size = 225
gamma = 0.999
eps_start = 1
eps_end = 0.01
eps_decay = 0.001
target_update = 10
memory_size = 100000
lr = 0.001
num_episodes = 10000
num_actions = 4

In [0]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
strategy = EpsilonGreedyStrategy(eps_start,eps_end,eps_decay)
agent = Agent(strategy,num_actions,device)
memory = ReplayMemory(memory_size)

policy_net = DQN().to(device)
target_net = DQN().to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()
optimizer = optim.Adam(params=policy_net.parameters(),lr=lr)

In [0]:
def extract_tensors(experiences):

    batch = Experience(*zip(*experiences))

    t1 = torch.stack(batch.state)
    t2 = torch.stack(batch.action)
    t3 = torch.stack(batch.reward)
    t4 = torch.stack(batch.next_state)

    return (t1,t2,t3,t4)

In [0]:
def plot(values,moving_avg_period):
  plt.figure(2)
  plt.clf()
  plt.title('Training..')
  plt.xlabel('Episode')
  plt.ylabel('Duration')
  plt.plot(values)
  moving_avg = get_moving_average(moving_avg_period,values)
  plt.plot(moving_avg)
  plt.pause(0.001)
  print("Episode",len(values),"\n",moving_avg_period,"episode moving avg:",moving_avg[-1])
  if is_ipython: ipythondisplay.clear_output(wait=True)

def get_moving_average(period,values):
  values = torch.tensor(values,dtype=torch.float)
  if len(values)>=period:
    moving_avg = values.unfold(dimension=0, size=period,step=1).mean(dim=1).flatten(start_dim=0)
    moving_avg = torch.cat((torch.zeros(period-1),moving_avg))
    return moving_avg.numpy()
  else:
    moving_avg = torch.zeros(len(values))
    return moving_avg.numpy()

In [0]:
episode_duration = []
movable_vec = [[1,0],[-1,0],[0,1],[0,-1]]
goal_count = 0
# Penalties:-
# out-of-bounce: -100
# on # : -100
# goal: 50

wrong = True

for episode in range(num_episodes):
    # cum_reward = 0
    while wrong:
      x = random.randrange(1,len(maze)-2)
      y = random.randrange(1,len(maze)-2)
      if maze[y][x]!="#":
        print("y",y,"x",x,"maze",maze[y][x])
        start_point = [y,x]
        state = np.asarray(start_point) 
        wrong = False

    for timestep in count():
        action = agent.select_action(state,policy_net)
        action = action.item()
        next_state = state+movable_vec[action]
        reward,done = maze_field.get_val(next_state)
        state_mem = torch.from_numpy(state)
        next_state_mem = torch.from_numpy(next_state)
        reward_mem = torch.from_numpy(np.asarray(reward))
        action_mem = torch.from_numpy(np.asarray(action))
        memory.push(Experience(state_mem,action_mem,next_state_mem,reward_mem))
        state = next_state
        # cum_reward +=reward

        if memory.can_provide_sample(batch_size):
            experiences = memory.sample(batch_size)
            states,actions,rewards,next_states = extract_tensors(experiences)
            # print("states")
            # print(states)
            # print("next states")
            # print(next_states)
            states = states.float()
            actions = actions.to(device)
            current_q_values = policy_net(states.to(device)).gather(dim=1,index=actions.unsqueeze(-1))
            final_state_locations = []
            
            for i in next_states.numpy():
                if (not(0<=i[0].item()<=len(maze)) and not(0<=i[0]<=len(maze)) and (maze[i[0]][i[1]] != "#" or maze[i[0]][i[1]] != maze[goal_point[0]][goal_point[1]])):
                    final_state_locations.append(True)
                else:
                    final_state_locations.append(False)
            # print(final_state_locations)
            non_final_state_locations = [final_state_location==False for final_state_location in final_state_locations]
            # print(non_final_state_locations)
            non_final_states = next_states[non_final_state_locations]
            # print(batch_size)
            
            values = torch.zeros(batch_size).to(device)
            non_final_states = non_final_states.float()
            # print(non_final_states)
            # print(len(non_final_state_locations))
            values[non_final_state_locations==True] = target_net(non_final_states.to(device)).max(dim=0)[0].detach()
            next_q_values = values.to(device)
            # print(type(next_q_values),type(gamma),type(rewards))
            target_q_values = (next_q_values.to(device)*gamma)+rewards.to(device)
            target_q_values = target_q_values.double()
            loss = F.mse_loss(current_q_values.double(),target_q_values.unsqueeze(1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # if cum_reward<=-50:
        #     break

        if done:
            print("episode:",episode)
            if state[0]==goal_point[0] and state[1]==goal_point[1]:
              # print(state)
              goal_count+=1
            episode_duration.append(timestep)
            # plot(episode_duration,100)
            break
    
    if episode%target_update==0:
        target_net.load_state_dict(policy_net.state_dict())
    
    wrong = True

print("number of times it reached the goal:",goal_count)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
y 11 x 4 maze -2
episode: 7501
y 10 x 4 maze -2
episode: 7502
y 5 x 3 maze -2
episode: 7503
y 8 x 7 maze -1
episode: 7504
y 1 x 4 maze -2
episode: 7505
y 11 x 4 maze -2
episode: 7506
y 11 x 8 maze -2
episode: 7507
y 12 x 11 maze -2
episode: 7508
y 4 x 12 maze -2
episode: 7509
y 6 x 8 maze -1
episode: 7510
y 8 x 12 maze -2
episode: 7511
y 2 x 7 maze -1
episode: 7512
y 3 x 6 maze -2
episode: 7513
y 3 x 1 maze -2
episode: 7514
y 5 x 6 maze -1
episode: 7515
y 2 x 1 maze -2
episode: 7516
y 4 x 12 maze -2
episode: 7517
y 2 x 3 maze -2
episode: 7518
y 8 x 10 maze -1
episode: 7519
y 1 x 5 maze -2
episode: 7520
y 8 x 8 maze -1
episode: 7521
y 11 x 4 maze -2
episode: 7522
y 2 x 9 maze -1
episode: 7523
y 1 x 9 maze -1
episode: 7524
y 4 x 11 maze -2
episode: 7525
y 2 x 3 maze -2
episode: 7526
y 12 x 9 maze -2
episode: 7527
y 7 x 8 maze -1
episode: 7528
y 2 x 2 maze -2
episode: 7529
y 7 x 6 maze -1
episode: 7530
y 10 x 1 maze -2
episo

In [0]:
# try the model
x = random.randrange(1,len(maze)-2)
y = random.randrange(1,len(maze)-2)
start_point = [y,x]
state = np.asarray(start_point)
score = 0
steps = 0
while True:
    steps += 1
    state_ = torch.from_numpy(state)
    state_ = state_.float()
    with torch.no_grad():
      action = policy_net(state_.to(device)).argmax(dim=0)
    print("current state: {0} -> action: {1} ".format(state, action.item()))
    reward, done = maze_field.get_val(state+movable_vec[int(action.item())])
    maze_field.display(state)
    score = score + reward
    state = state + movable_vec[int(action.item())]
    print("current step: {0} \t score: {1}\n".format(steps, score))
    # if steps==100:
    #   break
    if done and state[0]==goal_point[0] and state[1]==goal_point[1]:
        maze_field.display(state)
        print("goal!")
        break

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
	  #  -2  -2  -2  -2   #  -2  -1  -1  -1  -2   #  -2  -2   # 
	  #  -2   #  -2   #  -2  -2  -2  -2  -2  -2   #   #  -2   # 
	  #  -2  -2  -2  -2  -2  -1  -2   #  -2  -2  -2  -2  -2   # 
	  #  -1  -2  -2  -2  -1  -1  -2  -2  -2  -1  -2  -2  -2   # 
	  #  -1  -2   #  -2  -1  -1  -1  -1  -1  -1  -2   #   #   # 
	  #  -2  -2  -2  -2  -1  -1  -1  -1  -1  -1  -2  -2  -2   # 
	  #   #  -2  -2  -2  -2  -1  -1  -1  -1  -1  -2  -2  -2   # 
	  #  -2  -2  -2   #  -2  -1  -1  -1  -1  -1  -2   #  -2   # 
	  #  -2   #  -2  -2  -2  -2  -2  -1  -1  -2  -2  -2   #   # 
	  #  -2  -2  -2  -2  -2   #  -2  -2  -2  -2   #  -2  -2   # 
	  #  @@  -1  -1  -2   #  -2  -2   #  -2  -2  -2  -2  -2   # 
	  #  -1  -1  -1  -2  -2  -2  -2  -2  10  -1  -1  -2   #   # 
	  #   #   #   #   #   #   #   #   #   #   #   #   #   #   # 
current step: 1694490 	 score: -169449.00000537533

current state: [13  1] -> action: 1 
	  #   #   #   #   #   #   #   #   #   #

<class 'float'>
