In [2]:
import SnakeEnvironment as se
import gym
from gym import spaces
import numpy as np
import pygame
import time
import random

based on the entire grid

In [137]:
import numpy as np
import gym
import random

class QLearningAgent:
    def __init__(self, env, grid_size=10, epsilon=1.0, alpha=0.1, gamma=0.99, epsilon_decay=0.995, num_episodes=1000):
        self.env = env
        self.grid_size = grid_size
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon_decay = epsilon_decay
        self.num_episodes = num_episodes
        
        self.action_space_size = self.env.action_space.n  # 4 actions (UP, DOWN, LEFT, RIGHT)
        self.q_table = np.zeros((grid_size * grid_size * grid_size * grid_size, self.action_space_size))  # Q-table
        
    def get_state_index(self, state):
        """
        Converts the state (a 3D grid) into a unique integer index.
        This method extracts the snake's head and the food's position from the grid.
        """
        # Find the snake's head (first segment of the snake)
        head_position = np.argwhere(state[:, :, 0] == 1)[0]  # Get the first occurrence of 1 (snake's head)
        head_x, head_y = head_position[0], head_position[1]
        
        # Find the food position
        food_position = np.argwhere(state[:, :, 0] == -1)[0]  # Get the first occurrence of -1 (food)
        food_x, food_y = food_position[0], food_position[1]
        
        # Combine these positions into a unique integer index
        return (head_x * self.grid_size + head_y) * (self.grid_size ** 2) + (food_x * self.grid_size + food_y)
    
    def choose_action(self, state):
        """
        Selects an action using epsilon-greedy policy.
        """
        state_index = self.get_state_index(state)
        
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.action_space_size)  # Explore: Random action
        else:
            return np.argmax(self.q_table[state_index])  # Exploit: Best known action
    
    def update_q_table(self, state, action, reward, next_state):
        """
        Updates the Q-table using the Q-Learning formula.
        """
        state_index = self.get_state_index(state)
        next_state_index = self.get_state_index(next_state)
        
        max_next_q = np.max(self.q_table[next_state_index])  # Max Q-value for the next state
        self.q_table[state_index, action] += self.alpha * (reward + self.gamma * max_next_q - self.q_table[state_index, action])
    
    def train(self):
        """
        Trains the agent using Q-learning over multiple episodes.
        """
        for episode in range(self.num_episodes):
            state = self.env.reset()
            done = False
            total_reward = 0
            
            while not done:
                action = self.choose_action(state)
                next_state, reward, done, _ = self.env.step(action)
                
                # Update Q-table with the reward and the next state
                self.update_q_table(state, action, reward, next_state)
                
                state = next_state
                total_reward += reward
            
            # Decay epsilon to reduce exploration over time
            self.epsilon = max(self.epsilon * self.epsilon_decay, 0.33)
            
            if episode % 10000 == 0:
                print(f"Episode {episode}, Total Reward: {total_reward}")
    
    def evaluate(self, num_episodes=100):
        """
        Evaluates the trained agent by running a number of test episodes.
        """
        total_scores = []
        for _ in range(num_episodes):
            state = self.env.reset()
            done = False
            total_reward = 0
            counter = 0
            while not done :
                counter += 1
                if counter >= 1000: break
                action = np.argmax(self.q_table[self.get_state_index(state)])  # Always exploit
                next_state, reward, done, _ = self.env.step(action)
                state = next_state
                total_reward += reward
                
            
            total_scores.append(total_reward)
        
        average_score = np.mean(total_scores)
        print(f"Average Score over {num_episodes} test episodes: {average_score}")

# Example usage
if __name__ == "__main__":
    import gym  #
    
    # Initialize the environment and the QLearningAgent
    env = se.SnakeEnv(grid_size=5) # used 5 to reduce calculations
    agent = QLearningAgent(env, grid_size=5, epsilon=1.0, alpha=0.1, gamma=0.99, epsilon_decay=0.99995, num_episodes=100000)

    # Train the agent
    agent.train()
    
    # Evaluate the agent after training
    agent.evaluate(num_episodes=100)


Episode 0, Total Reward: -18
Episode 10000, Total Reward: 0
Episode 20000, Total Reward: -5
Episode 30000, Total Reward: -7
Episode 40000, Total Reward: 10
Episode 50000, Total Reward: -4
Episode 60000, Total Reward: 8
Episode 70000, Total Reward: -5
Episode 80000, Total Reward: 9
Episode 90000, Total Reward: -7
Average Score over 100 test episodes: -62.38


based on snake head location and food location

In [20]:
import numpy as np
import gym
import random

class QLearningAgent:
    def __init__(self, env, grid_size=10, epsilon=1.0, alpha=0.1, gamma=0.99, epsilon_decay=0.995, num_episodes=1000):
        self.env = env
        self.grid_size = grid_size
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon_decay = epsilon_decay
        self.num_episodes = num_episodes
        
        self.action_space_size = self.env.action_space.n  # 4 actions (UP, DOWN, LEFT, RIGHT)
        self.q_table = np.zeros((grid_size * grid_size * grid_size * grid_size, self.action_space_size))  # Q-table
        
    def get_state(self, state):
        """
        Converts the state (snake grid) into a simple tuple (head_x, head_y, food_x, food_y).
        Extracts the snake's head position and the food position from the state.
        """
        # Get the snake's head position (first occurrence of '1' in state)
        head_x, head_y = self.env.snake[0]
        
        food_x, food_y = self.env.food
        
        return (head_x, head_y, food_x, food_y)
    
    def get_state_index(self, state):
        """
        Converts the state (head_x, head_y, food_x, food_y) into a unique integer index.
        """
        head_x, head_y, food_x, food_y = state
        
        # Combine these positions into a unique integer index
        return (head_x * self.grid_size + head_y) * (self.grid_size ** 2) + (food_x * self.grid_size + food_y)
    
    def choose_action(self, state):
        """
        Selects an action using epsilon-greedy policy.
        """
        state = self.get_state(state)  # Convert state to (head_x, head_y, food_x, food_y)
        state_index = self.get_state_index(state)
        
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.action_space_size)  # Explore: Random action
        else:
            return np.argmax(self.q_table[state_index])  # Exploit: Best known action
    
    def update_q_table(self, state, action, reward, next_state):
        """
        Updates the Q-table using the Q-Learning formula.
        """
        state = self.get_state(state)  # Convert state to (head_x, head_y, food_x, food_y)
        next_state = self.get_state(next_state)  # Convert next_state to (head_x, head_y, food_x, food_y)
        
        state_index = self.get_state_index(state)
        next_state_index = self.get_state_index(next_state)
        
        max_next_q = np.max(self.q_table[next_state_index])  # Max Q-value for the next state
        self.q_table[state_index, action] += self.alpha * (reward + self.gamma * max_next_q - self.q_table[state_index, action])
    
    def train(self):
        """
        Trains the agent using Q-learning over multiple episodes.
        """
        for episode in range(self.num_episodes):
            state = self.env.reset()
            done = False
            total_reward = 0
            
            while not done:
                action = self.choose_action(state)
                next_state, reward, done, _ = self.env.step(action)
                
                # Update Q-table with the reward and the next state
                self.update_q_table(state, action, reward, next_state)
                
                state = next_state
                total_reward += reward
            
            # Decay epsilon to reduce exploration over time
            self.epsilon = max(self.epsilon * self.epsilon_decay, 0.2)
            
            if episode % 1000 == 0:
                print(f"Episode {episode}, Total Reward: {total_reward}")
    
    def evaluate(self, num_episodes=100):
        """
        Evaluates the trained agent by running a number of test episodes.
        """
        total_scores = []
        for _ in range(num_episodes):
            state = self.env.reset()
            state = self.get_state(state)
            done = False
            total_reward = 0
            counter = 0
            while not done :
                counter += 1
                if counter >= 1000: break
                action = np.argmax(self.q_table[self.get_state_index(state)])  # Always exploit
                next_state, reward, done, _ = self.env.step(action)
                state = self.get_state(next_state)
                total_reward += reward
            
            total_scores.append(total_reward)
        
        average_score = np.mean(total_scores)
        print(f"Average Score over {num_episodes} test episodes: {average_score}")

# Example usage
if __name__ == "__main__":
    import gym
    # Initialize the environment and the QLearningAgent
    env = se.SnakeEnv(grid_size=10)
    agent = QLearningAgent(env, grid_size=10, epsilon=1.0, alpha=0.1, gamma=0.99, epsilon_decay=0.995, num_episodes=10000)

    # Train the agent
    agent.train()
    
    # Evaluate the agent after training
    agent.evaluate(num_episodes=100)


Episode 0, Total Reward: -50
Episode 1000, Total Reward: -57
Episode 2000, Total Reward: -449
Episode 3000, Total Reward: -512
Episode 4000, Total Reward: -43
Episode 5000, Total Reward: -51
Episode 6000, Total Reward: -48
Episode 7000, Total Reward: -16
Episode 8000, Total Reward: -44
Episode 9000, Total Reward: -129
Average Score over 100 test episodes: -999.0


In [10]:
env.snake[0]

(5, 5)

focus on the difference between snake head and food locations (we do this to reduce q-table size for faster calculations)

In [25]:
import numpy as np
import gym
import random

class QLearningAgent:
    def __init__(self, env, grid_size=10, epsilon=1.0, alpha=0.1, gamma=0.99, epsilon_decay=0.995, num_episodes=1000):
        self.env = env
        self.grid_size = grid_size
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon_decay = epsilon_decay
        self.num_episodes = num_episodes
        
        self.action_space_size = self.env.action_space.n  # 4 actions (UP, DOWN, LEFT, RIGHT)
        # Q-table has fewer states because we're using relative positions
        self.q_table = np.zeros((grid_size * grid_size * 4, self.action_space_size))  # 4 directions for food (up, down, left, right)
        
    def get_relative_food_position(self, head_x, head_y, food_x, food_y):
        """
        Returns the relative direction of the food with respect to the snake's head:
        - 'up', 'down', 'left', or 'right'
        """
        if food_x < head_x:
            return "left"
        elif food_x > head_x:
            return "right"
        elif food_y < head_y:
            return "up"
        elif food_y > head_y:
            return "down"
        return "none"  # Should never reach here if food and head are not the same position
    
    def get_state(self, state):
        """
        Convert the state (snake grid) into a simple tuple (head_x, head_y, relative_food_direction).
        Extracts the snake's head position and the food position from the state.
        """
        # Get the snake's head position (first occurrence of '1' in state)
        head_x, head_y = self.env.snake[0]
        
        # Get the food position
        food_x, food_y = self.env.food
        
        # Get relative food position
        food_direction = self.get_relative_food_position(head_x, head_y, food_x, food_y)
        
        return (head_x, head_y, food_direction)
    
    def get_state_index(self, state):
        """
        Converts the state (head_x, head_y, food_direction) into a unique integer index.
        """
        head_x, head_y, food_direction = state
        
        # Map food direction to integer
        food_direction_map = {"left": 0, "right": 1, "up": 2, "down": 3}
        food_direction_index = food_direction_map[food_direction]
        
        # Combine these positions into a unique integer index
        return (head_x * self.grid_size + head_y) * 4 + food_direction_index
    
    def choose_action(self, state):
        """
        Selects an action using epsilon-greedy policy.
        """
        state = self.get_state(state)  # Convert state to (head_x, head_y, food_direction)
        state_index = self.get_state_index(state)
        
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.action_space_size)  # Explore: Random action
        else:
            return np.argmax(self.q_table[state_index])  # Exploit: Best known action
    
    def update_q_table(self, state, action, reward, next_state):
        """
        Updates the Q-table using the Q-Learning formula.
        """
        state = self.get_state(state)  # Convert state to (head_x, head_y, food_direction)
        next_state = self.get_state(next_state)  # Convert next_state to (head_x, head_y, food_direction)
        
        state_index = self.get_state_index(state)
        next_state_index = self.get_state_index(next_state)
        
        max_next_q = np.max(self.q_table[next_state_index])  # Max Q-value for the next state
        self.q_table[state_index, action] += self.alpha * (reward + self.gamma * max_next_q - self.q_table[state_index, action])
    
    def train(self):
        """
        Trains the agent using Q-learning over multiple episodes.
        """
        for episode in range(self.num_episodes):
            state = self.env.reset()
            done = False
            total_reward = 0
            
            while not done:
                action = self.choose_action(state)
                next_state, reward, done, _ = self.env.step(action)
                
                # Update Q-table with the reward and the next state
                self.update_q_table(state, action, reward, next_state)
                
                state = next_state
                total_reward += reward
            
            # Decay epsilon to reduce exploration over time
            self.epsilon = max(self.epsilon * self.epsilon_decay, 0.2)
            
            if episode % 10000 == 0:
                print(f"Episode {episode}, Total Reward: {total_reward}")
    
    def evaluate(self, num_episodes=100):
        """
        Evaluates the trained agent by running a number of test episodes.
        """
        total_scores = []
        for _ in range(num_episodes):
            state = self.env.reset()
            state = self.get_state(state)
            done = False
            total_reward = 0
            counter = 0
            while not done:
                counter += 1
                if counter >= 1000: break
                action = np.argmax(self.q_table[self.get_state_index(state)])  # Always exploit
                next_state, reward, done, _ = self.env.step(action)
                state = self.get_state(next_state)
                total_reward += reward
            
            total_scores.append(total_reward)
        
        average_score = np.mean(total_scores)
        print(f"Average Score over {num_episodes} test episodes: {average_score}")

# Example usage
if __name__ == "__main__":
    import gym
    # Initialize the environment and the QLearningAgent
    env = se.SnakeEnv(grid_size=10)
    agent = QLearningAgent(env, grid_size=10, epsilon=1.0, alpha=0.1, gamma=0.99, epsilon_decay=0.995, num_episodes=100000)

    # Train the agent
    agent.train()
    
    # Evaluate the agent after training
    agent.evaluate(num_episodes=100)


Episode 0, Total Reward: -27
Episode 10000, Total Reward: -185
Episode 20000, Total Reward: -22
Episode 30000, Total Reward: -1072
Episode 40000, Total Reward: -32
Episode 50000, Total Reward: -33
Episode 60000, Total Reward: -33
Episode 70000, Total Reward: -44
Episode 80000, Total Reward: -91
Episode 90000, Total Reward: -114
Average Score over 100 test episodes: -899.63


In [27]:
agent.q_table.shape

(400, 4)

obviously the first one is the best model because it has more information about the space, but the last one is the fastest one to train