#### Defining the agent

In [None]:
class Agent:
    """Agent that plays a maze environment with a Q-network and logs transitions.

    The environment object must provide ``observe(empty=True)``, ``act(action)``
    (returning ``(next_state, reward, status)``) and ``reset()``. A ``status``
    of 0 means the episode is still running; any other value ends it.

    Exploration is controlled by ``epsilon``:
      * ``use_softmax=True``  -> actions are sampled from
        ``softmax(Q / epsilon)``, i.e. epsilon acts as a temperature;
      * ``use_softmax=False`` -> classic epsilon-greedy.
    """

    def __init__(self, maze, experience_buffer,
                 epsilon, epsilon_end, epsilon_games, use_softmax=True, enable_decay=False):
        """Store the environment, the replay buffer and the exploration schedule.

        Args:
            maze: environment to act in.
            experience_buffer: object with an ``append`` method that receives
                one ``Experience`` per step.
            epsilon: initial exploration rate / softmax temperature.
            epsilon_end: value epsilon is decayed towards.
            epsilon_games: number of finished episodes over which epsilon is
                linearly decayed from ``epsilon`` to ``epsilon_end``.
            use_softmax: choose softmax sampling instead of epsilon-greedy.
            enable_decay: when True, epsilon is decreased after every episode.
        """
        self.env = maze
        self.buffer = experience_buffer
        self.epsilon = epsilon
        self.epsilon_end = epsilon_end
        self.n_actions = 4
        # Per-episode linear decay step.
        self.decrement = (epsilon - epsilon_end) / epsilon_games
        self.use_softmax = use_softmax
        # Current observation; refreshed after every step / reset.
        self.maze = maze.observe(empty=True)
        # Reward accumulated in the episode currently being played.
        self.tot_reward = 0
        self.enable_decay = enable_decay

    def play(self, net, epsilon=None, device='cuda'):
        """Perform one environment step and store the transition in the buffer.

        Args:
            net: callable mapping a ``(1, n_features)`` tensor to Q-values.
            epsilon: optional exploration override for this step only; when
                None the agent's own (possibly decaying) epsilon is used.
            device: torch device the state tensor is moved to before the
                forward pass.

        Returns:
            ``(total_reward, status)`` when the step finished the episode,
            otherwise ``(None, status)``.
        """
        eps = self.epsilon if epsilon is None else epsilon
        action = self.select_action(net, eps, device)
        next_state, reward, status = self.env.act(action)
        self.buffer.append(Experience(self.maze, action, reward, status, next_state))
        self.tot_reward += reward

        if status == 0:
            # Episode continues: just advance the current observation.
            self.maze = next_state
            return None, status

        # Episode finished: restart the environment we own (not a global one).
        self.env.reset()
        self.maze = self.env.observe(empty=True)
        episode_reward = self.tot_reward
        self.tot_reward = 0
        if self.enable_decay:
            if self.epsilon > self.epsilon_end:
                # Clamp so the last step cannot undershoot epsilon_end.
                self.epsilon = max(self.epsilon_end, self.epsilon - self.decrement)
            else:
                self.epsilon = self.epsilon_end
        return episode_reward, status

    def select_action(self, net, epsilon, device):
        """Choose an action for the current observation.

        Args:
            net: callable mapping a ``(1, n_features)`` tensor to Q-values.
            epsilon: exploration rate (epsilon-greedy) or temperature
                (softmax) to use for this decision.
            device: torch device used for the forward pass.

        Returns:
            The chosen action index as an ``int`` in ``[0, n_actions)``.
        """
        # Epsilon-greedy exploration needs no forward pass at all.
        if not self.use_softmax and np.random.random() < epsilon:
            return int(np.random.randint(self.n_actions))

        state = torch.Tensor(self.maze).to(device).view(1, -1)
        with torch.no_grad():  # inference only, no autograd graph needed
            q_vals = net(state).cpu().detach().numpy().squeeze()

        if self.use_softmax and epsilon > 0:
            probs = sp.softmax(q_vals / epsilon).squeeze()
            return int(np.random.choice(self.n_actions, p=probs))

        # Greedy choice; also the zero-temperature limit of the softmax policy,
        # which would otherwise divide by zero.
        return int(np.argmax(q_vals, axis=0))


# Next notebook cell (IPython magic, not valid in a plain Python module):
# %run Libraries.ipynb

#### Defining the buffer

In [None]:
# One transition: the observation before acting, the action taken, the reward
# and status received, and the observation that followed.
Experience = collections.namedtuple(
    'Experience',
    ('state', 'action', 'reward', 'status', 'new_state'),
)

class ExperienceBuffer:
    """Fixed-capacity store of transitions with uniform random sampling."""

    def __init__(self, capacity):
        """Create an empty buffer holding at most ``capacity`` transitions."""
        # A deque with maxlen discards the oldest entry once it is full.
        self.buffer = collections.deque(maxlen=capacity)

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

    def append(self, experience):
        """Add one transition (a 5-field tuple) to the buffer."""
        self.buffer.append(experience)

    def sample(self, batch_size, device='cuda'):
        """Draw ``batch_size`` distinct transitions and batch them as tensors.

        Args:
            batch_size: number of transitions to draw without replacement.
            device: torch device every returned tensor is moved to.

        Returns:
            Tuple ``(states, actions, rewards, status, next_states)`` with
            dtypes float32, int64, float32, int8 and float32 respectively.

        Raises:
            ValueError: if ``batch_size`` exceeds the number of stored
                transitions (raised by ``np.random.choice``).
        """
        indices = np.random.choice(len(self.buffer), batch_size, replace=False)
        states, actions, rewards, status, next_states = zip(
            *(self.buffer[idx] for idx in indices))

        def to_tensor(values, dtype):
            # Stack with NumPy first: building a tensor straight from a tuple
            # of arrays goes element by element and is very slow.
            return torch.as_tensor(np.asarray(values), dtype=dtype).to(device)

        # All five tensors, rewards included, live on the same device.
        return (to_tensor(states, torch.float),
                to_tensor(actions, torch.long),
                to_tensor(rewards, torch.float),
                to_tensor(status, torch.int8),
                to_tensor(next_states, torch.float))