In [11]:
# Monte-Carlo Control with Q-value function approximation
# Policy evaluation: Q(s, a) <- Q(s, a) + (G - Q(s, a)) / N(s, a)
# Policy improvement: epsilon-greedy exploration
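# Q-value function approximation: two-layer perceptron (input layer and output layer only)
# NOTE: the MonteCarlo class below stores Q-values in a lookup table (a defaultdict), not in a network.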


from pacman.actions import Actions
from pacman.agents import Agent
from pacman.env import PacmanEnv
import numpy as np
from collections import defaultdict
import random

In [12]:


#logger = getLogger(__name__)


class MonteCarlo(Agent):
    """Tabular every-visit Monte-Carlo control with epsilon-greedy exploration.

    Q-values are kept in ``self.Qtable``, a mapping ``state -> {action: value}``
    that lazily initialises every action of an unseen state to 0. States are
    used as dictionary keys and therefore must be hashable.

    Args:
        gamma: Discount factor applied to future rewards.
        env: Environment object. ``env.reset()`` must return ``(state, done)``
            and ``env.step(action)`` must return
            ``(next_state, reward, done, info)``.
        eps: Initial exploration probability for the epsilon-greedy policy.
        eps_decay: Multiplicative factor applied to ``eps`` after each
            training episode.
    """

    def __init__(self, gamma, env, eps, eps_decay=0.99):
        self.gamma     = gamma
        self.env       = env
        self.eps       = eps
        self.eps_decay = eps_decay
        self.Qtable    = defaultdict(lambda : {action: 0 for action in Actions.actions})

    def train(self, max_episodes=None, tol=1e-7):
        """Run Monte-Carlo control episodes, updating ``self.Qtable`` in place.

        Each episode is played to completion with the current epsilon-greedy
        policy. Afterwards the discounted return ``G`` following every visited
        (state, action) pair is computed by walking the episode backwards, and
        the running average ``Q <- Q + (G - Q) / N`` is applied for every visit.

        Args:
            max_episodes: Number of episodes to run. ``None`` means no limit,
                in which case this method only stops when interrupted.
            tol: Accepted for interface compatibility; not used.
        """
        if max_episodes is None:
            max_episodes = np.inf

        # Visit counts per (state, action); local to a single train() call.
        Ntable = defaultdict(lambda : {action: 0 for action in Actions.actions})
        curr_episode = 0

        while curr_episode < max_episodes:
            state, done = self.env.reset()
            trajectory = []

            # Roll out one full episode before touching the Q-table: the
            # return of a step is only known once the episode has ended.
            while not done:
                action = self.act(state)
                next_state, reward, done, info = self.env.step(action)
                trajectory.append((state, action, reward))
                state = next_state

            # Backward pass: G_t = r_{t+1} + gamma * G_{t+1}.
            G = 0
            for visited_state, taken_action, reward in reversed(trajectory):
                G = self.gamma * G + reward
                Ntable[visited_state][taken_action] += 1
                q_old = self.Qtable[visited_state][taken_action]
                self.Qtable[visited_state][taken_action] = (
                    q_old + (G - q_old) / Ntable[visited_state][taken_action]
                )

            # Explore less as the value estimates improve.
            self.eps = self.eps_decay * self.eps
            curr_episode += 1

    def act(self, state):
        """Return an epsilon-greedy action for ``state``.

        With probability ``self.eps`` a random action from ``Actions.sample()``
        is returned; otherwise the action with the highest Q-value is returned
        (ties resolve to the first such action in the table's iteration order).
        """
        # np.random.rand() lies in [0, 1), so eps == 0 never explores and
        # eps == 1 always explores.
        if np.random.rand() < self.eps:
            return Actions.sample()

        q_values = self.Qtable[state]
        return max(q_values, key=q_values.get)

    def eval(self):
        """Switch to purely greedy action selection by setting ``eps`` to 0."""
        self.eps = 0

# Seed the random number generators imported by this notebook so that runs
# are repeatable (assumes the agent/environment draw from these generators —
# TODO confirm for the pacman package).
SEED = 0
np.random.seed(SEED)
random.seed(SEED)

# Explicit training budget: train() treats max_episodes=None as "no limit",
# so always pass a finite number of episodes here.
N_EPISODES = 5000

env = PacmanEnv.contourDanger(7, ghost_name= "FollowGhost",render_mode=None, config={"TIME_PENALTY" : 1})
mc = MonteCarlo(env = env, gamma = 0.99, eps = 1)
mc.train(max_episodes=N_EPISODES)

KeyboardInterrupt: 

In [None]:
# Put the trained Monte-Carlo agent (`mc`) into greedy mode before rendering;
# no `sarsa` object is defined in this notebook.
mc.eval()
env.set_render('ansi')
env.run_policy(mc, 0, 1)