In [9]:
import numpy as np
import random
import os

# Pacman

In [10]:
class Pacman:
    """Tiny grid-world Pacman with a single randomly moving ghost.

    The outermost ring of the ``grid_size`` x ``grid_size`` board is wall;
    Pacman, the ghost and the food pellets live on the interior cells.
    An episode ends when Pacman walks into a wall, lands on the ghost, or
    swaps places with it; that step is rewarded with ``COLLISION_REWARD``.
    Eating a pellet is rewarded with ``FOOD_REWARD`` and any other step with 0.

    ``num_food_pellets`` must not exceed the number of interior cells minus
    one (Pacman's own cell); otherwise placing the pellets raises IndexError.
    """

    # Reward of the step that ends the episode (also used by is_end()).
    COLLISION_REWARD = -100
    # Reward for stepping onto a food pellet.
    FOOD_REWARD = 10

    def __init__(self, grid_size, num_food_pellets):
        """Store the board parameters and set up the first episode.

        Args:
            grid_size: side length of the square board, walls included.
            num_food_pellets: number of pellets present after every (re)fill.
        """
        self.grid_size = grid_size
        self.num_food_pellets = num_food_pellets
        # (d_row, d_col) moves: down, up, right, left and stay.
        self.action_space = [(1, 0), (-1, 0), (0, 1), (0, -1), (0, 0)]
        self.init()

    def _interior_cells(self):
        """Return every non-wall (row, col) cell of the board."""
        inner = range(1, self.grid_size - 1)
        return [(row, col) for row in inner for col in inner]

    def init(self,):
        """Start a new episode: place Pacman, food and ghost, zero the score."""
        self.pacman = random.choice(self._interior_cells())
        self.food_pellets = set()

        self.set_food()

        self.create_ghost()
        self.reward = 0
        self.score = 0

    def set_food(self,):
        """Drop ``num_food_pellets`` pellets on random interior cells.

        Pacman's current cell is excluded; the ghost's cell is not.
        """
        candidates = [cell for cell in self._interior_cells()
                      if cell != self.pacman]
        random.shuffle(candidates)
        for _ in range(self.num_food_pellets):
            self.food_pellets.add(candidates.pop())

        self.food_pellets_left = self.num_food_pellets

    def create_ghost(self,):
        """(Re)spawn the ghost in Pacman's column and pick its next move.

        The ghost goes to the top interior row, unless Pacman is in that row,
        in which case it goes to the bottom interior row.
        """
        (row, col) = self.pacman
        if row != 1:
            self.ghost = (1, col)
        else:
            self.ghost = (self.grid_size - 2, col)
        self.ghost_action = random.choice(self.action_space)

    def display(self,):
        """Print the score followed by an ASCII rendering of the board.

        Legend: ``P`` Pacman, ``G`` ghost, ``o`` food, ``*`` wall.
        """
        print("Current score: {}".format(self.score))

        for r in range(self.grid_size):
            row_chars = []
            for c in range(self.grid_size):
                cell = (r, c)
                # Order matters: Pacman hides the ghost, which hides food.
                if cell == self.pacman:
                    row_chars.append('P')
                elif cell == self.ghost:
                    row_chars.append('G')
                elif cell in self.food_pellets:
                    row_chars.append('o')
                elif self.is_boundary(cell):
                    row_chars.append('*')
                else:
                    row_chars.append(' ')
            print(''.join(row_chars))

    def is_end(self,):
        """Return True when the most recent step ended the episode."""
        return self.reward == self.COLLISION_REWARD

    def is_boundary(self, pos):
        """Return True if ``pos`` = (row, col) lies on the outer wall ring."""
        last = self.grid_size - 1
        return pos[0] in (0, last) or pos[1] in (0, last)

    def step(self, action):
        """Advance the game by one tick.

        Pacman moves according to ``action`` (an index into ``action_space``),
        the ghost moves according to the move it drew on the previous tick,
        and ``reward`` / ``score`` are updated.  A ghost that walks into a
        wall is respawned via ``create_ghost``.

        Args:
            action: index into ``self.action_space``.
        """
        prev_pacman, prev_ghost = self.pacman, self.ghost

        # Move Pacman.
        d_row, d_col = self.action_space[action]
        self.pacman = (prev_pacman[0] + d_row, prev_pacman[1] + d_col)

        # Move the ghost with the action drawn on the previous tick.
        g_row, g_col = self.ghost_action
        ghost_target = (prev_ghost[0] + g_row, prev_ghost[1] + g_col)
        self.ghost = ghost_target
        if self.is_boundary(self.ghost):
            self.create_ghost()

        # The two crossed if each ended up where the other started.  The
        # ghost's pre-respawn target is used so that a respawn landing on
        # Pacman's old cell is not mistaken for a crossing.
        swapped = self.pacman == prev_ghost and ghost_target == prev_pacman

        if self.pacman == self.ghost or swapped or self.is_boundary(self.pacman):
            self.reward = self.COLLISION_REWARD
        elif self.pacman in self.food_pellets:
            self.food_pellets_left -= 1
            self.food_pellets.remove(self.pacman)
            self.reward = self.FOOD_REWARD
            if self.food_pellets_left == 0:
                # Board cleared: refill so the episode can go on.
                self.set_food()
        else:
            self.reward = 0
        self.ghost_action = random.choice(self.action_space)
        self.score += self.reward


# SARSA Agent

In [11]:
class sarsa:
    """Tabular SARSA (on-policy TD control) agent.

    ``game`` must provide ``action_space``, ``init()``, ``step(action)``,
    ``is_end()``, ``display()`` and the attributes ``pacman``, ``ghost``,
    ``food_pellets`` and ``reward``.
    """

    def __init__(self,game,num_episodes, alpha, epsilon, gamma):
        """Store the hyper-parameters and start with an empty Q-table.

        Args:
            game: environment the agent plays.
            num_episodes: number of training episodes run by train_agent().
            alpha: learning rate.
            epsilon: exploration probability of the epsilon-greedy policy.
            gamma: discount factor.
        """
        self.num_episodes = num_episodes
        self.alpha = alpha
        self.epsilon = epsilon
        self.gamma = gamma
        self.game = game
        # Maps a state key (see get_state) to an array of per-action values.
        self.Q = {}

    def choose_action_epsilon_greedy(self,state):
        """Return an action index for ``state``.

        States without a Q row get a uniformly random action.  Otherwise the
        greedy action is drawn with probability ``1 - epsilon + epsilon/n``
        and each other action with probability ``epsilon/n``.
        """
        num_actions = len(self.game.action_space)
        if state not in self.Q:
            return random.randint(0, num_actions - 1)

        greedy_action_index = np.argmax(self.Q[state])
        probabilities = ([1.0 - self.epsilon*(1-(1/num_actions))]
                         + [self.epsilon/num_actions for _ in range(num_actions-1)])
        action_choices = ([greedy_action_index]
                          + [a for a in range(num_actions) if a != greedy_action_index])
        return np.random.choice(action_choices, p=probabilities)

    def checkQ(self,state):
        """Create an all-zero Q row for ``state`` if it has none yet."""
        if state not in self.Q:
            self.Q[state] = np.zeros(len(self.game.action_space))

    def get_state(self,pacman,ghost,food_pellets):
        """Build the Q-table key: positions plus the sorted pellet list."""
        return '{}, {}, {}'.format(pacman, ghost, sorted(food_pellets))

    def train_agent(self,):
        """Run ``num_episodes`` episodes of SARSA, printing progress.

        Update rule: ``Q[s][a] += alpha * (target - Q[s][a])`` with
        ``target = r + gamma * Q[s'][a']``, or just ``r`` when the step ended
        the episode (a terminal state has no future return).
        """
        episode_steps = 0
        total_steps = 0

        for episode in range(self.num_episodes):
            self.game.init()

            state = self.get_state(self.game.pacman, self.game.ghost,
                                   self.game.food_pellets)

            action = self.choose_action_epsilon_greedy(state)
            self.checkQ(state)
            # total_steps covers the episodes finished before this one.
            total_steps += episode_steps
            episode_steps = 0

            while True:
                episode_steps += 1
                self.game.step(action)
                reward = self.game.reward
                done = self.game.is_end()

                next_state = self.get_state(self.game.pacman, self.game.ghost,
                                            self.game.food_pellets)

                if done:
                    # Never bootstrap from a terminal state: its key may
                    # coincide with a live state that has learned values.
                    next_action = None
                    target = reward
                else:
                    next_action = self.choose_action_epsilon_greedy(next_state)
                    self.checkQ(next_state)
                    target = reward + self.gamma*self.Q[next_state][next_action]

                #SARSA
                self.Q[state][action] = self.Q[state][action] + self.alpha * (target - self.Q[state][action])

                if done:
                    break

                state = next_state
                action = next_action

            if episode!=0 and episode%1000==0:
                print('Episode: {} Steps this episode: {} Average steps: {}'.format(episode, episode_steps, round(total_steps/episode,2)))
                if episode%10000==0:
                    print('-----------------------------------------------------------------------')

    def test_agent(self,):
        """Play one episode with the epsilon-greedy policy, printing each board.

        No learning happens here; the Q-table is only read.
        """
        self.game.init()
        self.game.display()

        while True:
            state = self.get_state(self.game.pacman, self.game.ghost,
                                   self.game.food_pellets)

            action = self.choose_action_epsilon_greedy(state)
            self.game.step(action)
            self.game.display()

            if self.game.is_end():
                break

# Q-Learning Agent

In [15]:
class q_func:
    """Tabular Q-learning (off-policy TD control) agent.

    ``game`` must provide ``action_space``, ``init()``, ``step(action)``,
    ``is_end()``, ``display()`` and the attributes ``pacman``, ``ghost``,
    ``food_pellets`` and ``reward``.
    """

    def __init__(self,game,num_episodes, alpha, epsilon, gamma):
        """Store the hyper-parameters and start with an empty Q-table.

        Args:
            game: environment the agent plays.
            num_episodes: number of training episodes run by train_agent().
            alpha: learning rate.
            epsilon: exploration probability of the epsilon-greedy policy.
            gamma: discount factor.
        """
        self.num_episodes = num_episodes
        self.alpha = alpha
        self.epsilon = epsilon
        self.gamma = gamma
        self.game = game
        # Maps a state key (see get_state) to an array of per-action values.
        self.Q = {}

    def choose_action_epsilon_greedy(self,state):
        """Return an action index for ``state``.

        States without a Q row get a uniformly random action.  Otherwise the
        greedy action is drawn with probability ``1 - epsilon + epsilon/n``
        and each other action with probability ``epsilon/n``.
        """
        num_actions = len(self.game.action_space)
        if state not in self.Q:
            return random.randint(0, num_actions - 1)

        greedy_action_index = np.argmax(self.Q[state])
        probabilities = ([1.0 - self.epsilon*(1-(1/num_actions))]
                         + [self.epsilon/num_actions for _ in range(num_actions-1)])
        action_choices = ([greedy_action_index]
                          + [a for a in range(num_actions) if a != greedy_action_index])
        return np.random.choice(action_choices, p=probabilities)

    def checkQ(self,state):
        """Create an all-zero Q row for ``state`` if it has none yet."""
        if state not in self.Q:
            self.Q[state] = np.zeros(len(self.game.action_space))

    def get_state(self,pacman,ghost,food_pellets):
        """Build the Q-table key: positions plus the sorted pellet list."""
        return '{}, {}, {}'.format(pacman, ghost, sorted(food_pellets))

    def train_agent(self,):
        """Run ``num_episodes`` episodes of Q-learning, printing progress.

        Update rule: ``Q[s][a] += alpha * (target - Q[s][a])`` with
        ``target = r + gamma * max(Q[s'])``, or just ``r`` when the step ended
        the episode (a terminal state has no future return).
        """
        episode_steps = 0
        total_steps = 0

        for episode in range(self.num_episodes):
            self.game.init()

            state = self.get_state(self.game.pacman, self.game.ghost,
                                   self.game.food_pellets)

            action = self.choose_action_epsilon_greedy(state)
            self.checkQ(state)
            # total_steps covers the episodes finished before this one.
            total_steps += episode_steps
            episode_steps = 0

            while True:
                episode_steps += 1
                self.game.step(action)
                reward = self.game.reward
                done = self.game.is_end()

                next_state = self.get_state(self.game.pacman, self.game.ghost,
                                            self.game.food_pellets)

                if done:
                    # Never bootstrap from a terminal state: its key may
                    # coincide with a live state that has learned values.
                    next_action = None
                    target = reward
                else:
                    next_action = self.choose_action_epsilon_greedy(next_state)
                    self.checkQ(next_state)
                    target = reward + self.gamma*np.max(self.Q[next_state])

                #QFUNC
                self.Q[state][action] = self.Q[state][action] + self.alpha * (target - self.Q[state][action])

                if done:
                    break

                state = next_state
                action = next_action

            if episode!=0 and episode%1000==0:
                print('Episode: {} Steps this episode: {} Average steps: {}'.format(episode, episode_steps, round(total_steps/episode,2)))
                if episode%10000==0:
                    print('-----------------------------------------------------------------------')

    def test_agent(self,):
        """Play one episode with the epsilon-greedy policy, printing each board.

        No learning happens here; the Q-table is only read.
        """
        self.game.init()
        self.game.display()

        while True:
            state = self.get_state(self.game.pacman, self.game.ghost,
                                   self.game.food_pellets)

            action = self.choose_action_epsilon_greedy(state)
            self.game.step(action)
            self.game.display()

            if self.game.is_end():
                break

# Train

In [13]:
# Q-learning run on a 5x5 board (walls included) holding two pellets.
pacman_game = Pacman(grid_size=5, num_food_pellets=2)
agentQ = q_func(
    pacman_game,
    num_episodes=100000,
    alpha=0.1,
    epsilon=0.1,
    gamma=0.8,
)
agentQ.train_agent()

Episode: 1000 Steps this episode: 1 Average steps: 5.73
Episode: 2000 Steps this episode: 1 Average steps: 9.99
Episode: 3000 Steps this episode: 5 Average steps: 13.16
Episode: 4000 Steps this episode: 31 Average steps: 15.31
Episode: 5000 Steps this episode: 26 Average steps: 17.1
Episode: 6000 Steps this episode: 17 Average steps: 18.64
Episode: 7000 Steps this episode: 57 Average steps: 20.15
Episode: 8000 Steps this episode: 20 Average steps: 21.49
Episode: 9000 Steps this episode: 41 Average steps: 22.58
Episode: 10000 Steps this episode: 9 Average steps: 23.37
-----------------------------------------------------------------------
Episode: 11000 Steps this episode: 23 Average steps: 24.13
Episode: 12000 Steps this episode: 24 Average steps: 24.82
Episode: 13000 Steps this episode: 34 Average steps: 25.46
Episode: 14000 Steps this episode: 31 Average steps: 25.98
Episode: 15000 Steps this episode: 27 Average steps: 26.49
Episode: 16000 Steps this episode: 23 Average steps: 27.03


In [17]:
# Evaluate the agent trained in the cell above. Building a new q_func here
# would rebind agentQ to an agent with an empty Q-table, discarding the
# training and making every move a uniformly random choice.
agentQ.test_agent()

Current score: 0
*****
*P o*
*   *
*G o*
*****
Current score: 0
*****
*P o*
*   *
*G o*
*****


In [6]:
# SARSA run on its own fresh 5x5 board (walls included) holding two pellets.
pacman_game = Pacman(grid_size=5, num_food_pellets=2)
sarsa_agent = sarsa(
    pacman_game,
    num_episodes=100000,
    alpha=0.1,
    epsilon=0.1,
    gamma=0.8,
)
sarsa_agent.train_agent()

Episode: 1000 Steps this episode: 3 Average steps: 5.23
Episode: 2000 Steps this episode: 7 Average steps: 7.47
Episode: 3000 Steps this episode: 66 Average steps: 10.15
Episode: 4000 Steps this episode: 85 Average steps: 12.88
Episode: 5000 Steps this episode: 37 Average steps: 15.12
Episode: 6000 Steps this episode: 10 Average steps: 17.25
Episode: 7000 Steps this episode: 5 Average steps: 18.77
Episode: 8000 Steps this episode: 55 Average steps: 20.33
Episode: 9000 Steps this episode: 32 Average steps: 21.68
Episode: 10000 Steps this episode: 76 Average steps: 22.84
-----------------------------------------------------------------------
Episode: 11000 Steps this episode: 13 Average steps: 23.76
Episode: 12000 Steps this episode: 5 Average steps: 24.67
Episode: 13000 Steps this episode: 71 Average steps: 25.44
Episode: 14000 Steps this episode: 29 Average steps: 26.31
Episode: 15000 Steps this episode: 11 Average steps: 26.83
Episode: 16000 Steps this episode: 31 Average steps: 27.33

# Test

In [7]:
# Play one episode with agentQ, printing the board after every step.
agentQ.test_agent()

Current score: 0
*****
*  P*
*   *
*o G*
*****
Current score: 0
*****
*  P*
*   *
*o G*
*****
Current score: 0
*****
* P *
*  G*
*o o*
*****
Current score: 0
*****
*P  *
*   *
*o G*
*****
Current score: 0
*****
*G  *
*P  *
*o o*
*****
Current score: 10
*****
* G *
*   *
*P o*
*****
Current score: 10
*****
*  G*
*   *
* Po*
*****
Current score: 20
*****
*   *
* oG*
* oP*
*****
Current score: 30
*****
*   *
* oG*
* P *
*****
Current score: 30
*****
*   *
* oG*
* P *
*****
Current score: 30
*****
*   *
* G *
* P *
*****
Current score: 30
*****
*   *
* o *
*PG *
*****
Current score: 30
*****
*   *
*Po *
* G *
*****
Current score: 30
*****
*   *
*Po *
*G  *
*****
Current score: 40
*****
*   *
*oP *
*G  *
*****
Current score: 40
*****
*   *
*o P*
*oG *
*****
Current score: 40
*****
*  P*
*o  *
*G  *
*****


In [8]:
# Play one episode with sarsa_agent, printing the board after every step.
sarsa_agent.test_agent()

Current score: 0
*****
*G o*
*Po *
*   *
*****
Current score: 0
*****
*G o*
*Po *
*   *
*****
Current score: 0
*****
* Go*
* o *
*P  *
*****
Current score: 0
*****
*  o*
* G *
* P *
*****
Current score: 0
*****
*  o*
*Go *
*  P*
*****
Current score: 0
*****
*G o*
* oP*
*   *
*****
Current score: 0
*****
* Go*
* oP*
*   *
*****
Current score: 0
*****
*  G*
* oP*
*   *
*****
Current score: 10
*****
* Go*
* P *
*   *
*****
Current score: 10
*****
*  o*
* GP*
*   *
*****
Current score: 20
*****
* GP*
*  o*
* o *
*****
Current score: 30
*****
*G  *
*  P*
* o *
*****
Current score: 30
*****
*G  *
* P *
* o *
*****
Current score: 30
*****
*G  *
* P *
* o *
*****
Current score: 30
*****
*   *
*GP *
* o *
*****
Current score: 40
*****
*   *
*oG *
* P *
*****
Current score: 40
*****
*   *
*oG *
*P  *
*****
Current score: 40
*****
* G *
*oo *
*P  *
*****
Current score: 50
*****
*G  *
*Po *
*   *
*****
Current score: 60
*****
* G *
* P *
* o *
*****
Current score: 70
*****
* o *
* G *
* P *
*****
