## Método Q-Learning

In [1]:
import numpy as np
from collections import defaultdict
from pacman.env import PacmanEnv
from pacman.agents import Agent
from pacman.actions import Actions

In [3]:
class QLearning(Agent):
    """Tabular Q-learning agent with an epsilon-greedy behaviour policy.

    Q-values live in ``Qtable``, a mapping ``state -> {action: value}``.
    Entries are created lazily on first access, with every action listed in
    ``Actions.actions`` initialised to 0, so states must be hashable.
    """

    def __init__(self, alpha, gamma, env, eps):
        """Store the hyperparameters and create an empty Q-table.

        Args:
            alpha: Learning rate used in the Q-value update.
            gamma: Discount factor applied to the best next-state value.
            env: Environment used by ``training``. ``env.reset()`` is unpacked
                as ``(state, done)`` and ``env.step(action)`` as
                ``(next_state, reward, done, info)``.
            eps: Initial probability of taking a random action in ``act``.
        """
        self.alpha  = alpha
        self.gamma  = gamma
        self.env    = env
        self.eps    = eps
        self.Qtable = defaultdict(lambda: {action: 0 for action in Actions.actions})

    def training(self, n_episodes=1000, eps_decay=0.99):
        """Run Q-learning for ``n_episodes`` episodes on ``self.env``.

        Each step applies
        ``Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + gamma * max_a' Q(s', a'))``.

        Args:
            n_episodes: Number of episodes to run.
            eps_decay: Factor ``self.eps`` is multiplied by after every
                episode (0.99 by default).
        """
        for _ in range(n_episodes):
            state, done = self.env.reset()

            while not done:
                # The behaviour action is drawn from the current table at the
                # start of every step, so it always reflects the latest update
                # and no action is ever requested for the final state.
                action = self.act(state)
                next_state, reward, done, _info = self.env.step(action)

                old_value = self.Qtable[state][action]
                # Unvisited states default to all-zero values, so a state that
                # is never updated contributes 0 to the target.
                next_max = max(self.Qtable[next_state].values())

                # Hyperparameters come from the instance so the agent does not
                # depend on module-level variables being defined.
                target = reward + self.gamma * next_max
                self.Qtable[state][action] = (
                    (1 - self.alpha) * old_value + self.alpha * target
                )

                state = next_state

            # Reduce exploration a little after every episode.
            self.eps *= eps_decay

    def act(self, state):
        """Choose an action for ``state`` with an epsilon-greedy rule.

        With probability ``self.eps`` a random action from ``Actions.sample()``
        is returned; otherwise the action with the highest Q-value is returned
        (ties resolve to the first such action in the table's key order).

        Args:
            state: Hashable state; a zero-initialised table entry is created
                if the state has not been seen before.

        Returns:
            The selected action.
        """
        # ``rand()`` is drawn from [0, 1), so ``>=`` makes eps == 0 strictly
        # greedy while eps == 1 still always explores.
        if np.random.rand() >= self.eps:
            q_values = self.Qtable[state]
            return max(q_values, key=q_values.get)

        return Actions.sample()

    def eval(self):
        """Switch to greedy action selection by setting ``self.eps`` to 0.

        The previous value of ``self.eps`` is overwritten, not saved.
        """
        self.eps = 0

---

* Utilizamos o ambiente determinístico: **gridWorld**

---

In [5]:
# Environment: the 'gridWorld' layout (described above as the deterministic
# setting). An alternative layout, 'smallClassic', was tried previously with
# the same TIME_PENALTY config.
env = PacmanEnv.from_file(
    'gridWorld',
    render_mode=None,
    ghost_names='ImmobileGhost',
    config={'TIME_PENALTY': 1},
)

# Hyperparameters: learning rate, discount factor and initial exploration rate.
alpha, gamma, epsilon = 0.1, 0.6, 0.1

# Build the agent and train it with the default number of episodes.
q_learning = QLearning(alpha=alpha, gamma=gamma, env=env, eps=epsilon)
q_learning.training()

In [6]:
# Disable exploration (eps is set to 0) before replaying the learned policy.
q_learning.eval()
# Request the 'ansi' render mode so the episode is drawn as text in the output.
env.set_render('ansi')
# The positional arguments 0 and 0.5 are passed through unchanged; their meaning
# is defined by PacmanEnv.run_policy -- TODO confirm against its documentation.
env.run_policy(q_learning, 0, 0.5)

% % % % % % % % % % % % % % % % % % % % % % %
%                                           %
%                                           %
%                                           %
%         [31mG[0m [36mG[0m [35mG[0m [32mG[0m [31mG[0m [36mG[0m [35mG[0m [32mG[0m [31mG[0m [36mG[0m [35mG[0m [32mG[0m [31mG[0m         %
%       % % % % % % % % % % % % % % %       %
%       % % % % % % % % % % % % % % %   [33mv[0m   %
% % % % % % % % % % % % % % % % % % % % % % %
Score: 27


[(((2, 1),
   (0.0,),
   ((5, 3),
    (6, 3),
    (7, 3),
    (8, 3),
    (9, 3),
    (10, 3),
    (11, 3),
    (12, 3),
    (13, 3),
    (14, 3),
    (15, 3),
    (16, 3),
    (17, 3)),
   (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)),
  'Up',
  -1,
  ((2, 2),
   (0.0,),
   ((5, 3),
    (6, 3),
    (7, 3),
    (8, 3),
    (9, 3),
    (10, 3),
    (11, 3),
    (12, 3),
    (13, 3),
    (14, 3),
    (15, 3),
    (16, 3),
    (17, 3)),
   (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0))),
 (((2, 2),
   (0.0,),
   ((5, 3),
    (6, 3),
    (7, 3),
    (8, 3),
    (9, 3),
    (10, 3),
    (11, 3),
    (12, 3),
    (13, 3),
    (14, 3),
    (15, 3),
    (16, 3),
    (17, 3)),
   (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)),
  'Up',
  -1,
  ((2, 3),
   (0.0,),
   ((5, 3),
    (6, 3),
    (7, 3),
    (8, 3),
    (9, 3),
    (10, 3),
    (11, 3),
    (12, 3),
    (13, 3),
    (14, 3),
    (15, 3),
    (16, 3),
    (17, 3)),
   (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0))),
 (((2, 3),
   (0.0,),
   ((5, 3),
    (6, 

---
* Utilizamos o ambiente estocástico: **gridWorld**

---

In [None]:
# NOTE(review): the heading for this section says "estocástico" (stochastic),
# but this configuration is identical to the deterministic cell earlier in the
# notebook ('gridWorld' with 'ImmobileGhost') -- confirm the intended setup.
# An alternative layout, 'smallClassic', was tried previously with the same
# TIME_PENALTY config.
env = PacmanEnv.from_file(
    'gridWorld',
    render_mode=None,
    ghost_names='ImmobileGhost',
    config={'TIME_PENALTY': 1},
)

# Hyperparameters: learning rate, discount factor and initial exploration rate.
alpha, gamma, epsilon = 0.1, 0.6, 0.1

# Build a fresh agent (new, empty Q-table) and train it with the default
# number of episodes.
q_learning = QLearning(alpha=alpha, gamma=gamma, env=env, eps=epsilon)
q_learning.training()

In [None]:
# Disable exploration (eps is set to 0) before replaying the learned policy.
q_learning.eval()
# Request the 'ansi' render mode so the episode is drawn as text in the output.
env.set_render('ansi')
# The positional arguments 0 and 0.5 are passed through unchanged; their meaning
# is defined by PacmanEnv.run_policy -- TODO confirm against its documentation.
env.run_policy(q_learning, 0, 0.5)