In [18]:
# Importing required libraries
import numpy as np
import matplotlib.pyplot as plt

# Define world size
# Extent of the world along the first and second grid axis. Agent.take_action
# reads this module-level name to keep moves inside the world, and later cells
# re-assign it to the same value.
grid_size = (100, 100)

# Define Agent
class Agent:
    """Agent that walks one cell at a time on a bounded 2-D grid.

    Positions are ``(x, y)`` tuples: ``x`` indexes the first grid axis and
    ``y`` the second. 'up'/'down' change ``x``; 'left'/'right' change ``y``.
    """

    def __init__(self, position, bounds=None):
        """Create an agent.

        Args:
            position: Starting ``(x, y)`` cell.
            bounds: Optional ``(size_x, size_y)`` limits for this agent. When
                left as ``None`` the module-level ``grid_size`` is looked up
                on every move, which is the original behaviour.
        """
        self.position = position
        self.actions = ['up', 'down', 'left', 'right']
        self.bounds = bounds

    def choose_action(self):
        """Return one entry of ``self.actions`` picked at random.

        The value comes from ``np.random.choice``, so it is a NumPy string
        scalar; it compares and hashes like the plain ``str`` it was drawn from.
        """
        # Exploratory behaviour: Returns a random action
        return np.random.choice(self.actions)

    def take_action(self, action):
        """Move one cell in the direction ``action`` if that stays on the grid.

        A move that would cross the grid edge, or an action name that is not
        one of the four directions, leaves ``self.position`` unchanged.
        """
        # Per-agent bounds win; otherwise fall back to the shared world size.
        limits = grid_size if self.bounds is None else self.bounds

        # Change the agent's position based on the action taken
        x, y = self.position
        if action == 'up' and x > 0:
            self.position = (x - 1, y)
        elif action == 'down' and x < limits[0] - 1:
            self.position = (x + 1, y)
        elif action == 'left' and y > 0:
            self.position = (x, y - 1)
        elif action == 'right' and y < limits[1] - 1:
            self.position = (x, y + 1)

# Initialize the agent
# One random index is drawn per grid axis, in axis order, to form the start cell.
start_position = tuple(np.random.randint(0, axis_len) for axis_len in grid_size)
agent = Agent(start_position)
# Bare expression kept from the original; it is not the cell's last statement.
agent.position

# Define reward function
def calculate_reward(agent_position, nutrient_position, previous_distance, found_nutrient_reward=1):
    """Score a move by how it changed the Manhattan distance to the nutrient.

    Args:
        agent_position: ``(x, y)`` cell the agent occupies after the move.
        nutrient_position: ``(x, y)`` cell of the nutrient source.
        previous_distance: Manhattan distance to the source before the move.
        found_nutrient_reward: Value returned when the agent is on the source.

    Returns:
        ``found_nutrient_reward`` when the agent is on the source cell;
        ``100 / d`` when the move strictly reduced the distance; otherwise
        ``-(200 / d) - 5``, where ``d`` is the distance after the move.
    """
    # Calculate the Manhattan distance from the agent to the nutrient source
    current_distance = (
        abs(agent_position[0] - nutrient_position[0])
        + abs(agent_position[1] - nutrient_position[1])
    )

    # Check if the agent has found the nutrient source
    if current_distance == 0:
        # NOTE(review): the default of 1 is smaller than the 100 paid for
        # stepping onto a cell next to the source -- confirm this is intended.
        return found_nutrient_reward

    # Reward function --> PLAY WITH IT
    # Strict comparison: an unchanged distance (e.g. a move blocked by the grid
    # edge) must fall into the penalty branch, not the "moved closer" branch.
    if current_distance < previous_distance:
        return 100 / current_distance  # Moved closer

    return -(200 / current_distance) - 5  # Moved away or stayed the same

In [None]:
# Environment setup
class GridWorld:
    """Grid environment with one nutrient source and a concentration field.

    ``grid`` stores one concentration value per cell and is addressed as
    ``grid[x, y]``, the same ``(x, y)`` order used for agent positions.
    """

    # Gradient shapes understood by ``_set_nutrient``.
    GRADIENT_TYPES = ('radial', 'rectangular', 'diamond')

    def __init__(self, size, nutrient_position, nutrient_gradient):
        """Build the world and lay down the default ('diamond') gradient.

        Args:
            size: Grid shape as ``(size_x, size_y)``.
            nutrient_position: ``(x, y)`` cell of the nutrient source.
            nutrient_gradient: How far (in cells) the gradient extends from
                the source; also the normaliser of the radial/diamond fall-off.
        """
        self.size = size  # Grid size
        self.nutrient_position = nutrient_position
        self.nutrient_gradient = nutrient_gradient

        # Initialize the agent within the GridWorld
        start_position = (np.random.randint(0, self.size[0]), np.random.randint(0, self.size[1]))
        self.agent = Agent(start_position)  # 'Agent' class needs to be defined beforehand

        self.grid = np.zeros(size)  # Initialize a grid of zeros
        self._set_nutrient()

    def _set_nutrient(self, gradient_type='diamond'):  # Choose 'radial', 'rectangular', or 'diamond'
        """Rebuild ``self.grid`` with the source mark and the chosen gradient.

        The field is rebuilt from zeros on every call, so calling this again
        with another ``gradient_type`` replaces the previous field instead of
        stacking a second gradient on top of it.

        Raises:
            ValueError: If ``gradient_type`` is not one of ``GRADIENT_TYPES``.
        """
        if gradient_type not in self.GRADIENT_TYPES:
            raise ValueError(
                f"Unknown gradient_type {gradient_type!r}; expected one of {self.GRADIENT_TYPES}"
            )

        # Start from a clean field so repeated calls do not accumulate.
        self.grid = np.zeros(self.size)

        # Set the nutrient source on the grid
        x_center, y_center = self.nutrient_position
        self.grid[x_center, y_center] = 1  # Mark the nutrient source
        reach = self.nutrient_gradient

        if gradient_type == 'rectangular':
            # Nested squares: each square of half-width i adds 0.1, so cells
            # closer to the source collect more increments. Slice ends past the
            # grid are clipped by NumPy; negative starts would wrap around, so
            # those are clamped to 0 explicitly.
            for i in range(1, reach):
                self.grid[max(0, x_center - i):x_center + i + 1,
                          max(0, y_center - i):y_center + i + 1] += 0.1
        else:
            # 'radial' uses Euclidean distance, 'diamond' uses Manhattan
            # distance; both fall off linearly to zero at distance `reach`.
            offsets = range(-reach, reach)
            for dx in offsets:
                for dy in offsets:
                    if gradient_type == 'radial':
                        dist = np.sqrt(dx**2 + dy**2)
                    else:
                        dist = abs(dx) + abs(dy)  # Manhattan distance for a diamond shape
                    x, y = x_center + dx, y_center + dy
                    if dist < reach and 0 <= x < self.size[0] and 0 <= y < self.size[1]:
                        self.grid[x, y] += (reach - dist) / reach

    def show_grid(self, agent_position=None):
        """Plot the concentration field, optionally with the agent as a green dot."""
        plt.figure(figsize=(8, 8))
        plt.imshow(self.grid, cmap='hot', interpolation='nearest')

        # If an agent's position is provided, overlay a green dot at the agent's position
        if agent_position is not None:
            plt.scatter(agent_position[1], agent_position[0], color='green', s=25)  # x and y are swapped for scatter

        plt.colorbar()
        plt.show()

    def reset(self):
        """Sample and return a random ``(x, y)`` start cell.

        This only draws the position; it does not move any agent. The caller
        is responsible for assigning the returned cell to the agent it uses.
        """
        # You can choose to reset the agent to a random position
        # or to a fixed position like (0,0) every time
        agent_position = (np.random.randint(0, self.size[0]), np.random.randint(0, self.size[1]))

        # If there are other variables that should be reset at the start of each episode,
        # they should be reset here as well.

        # Return the initial state of the agent
        return agent_position

    def step(self, action):
        """Apply ``action`` and return ``(new_position, reward, done)``.

        NOTE: this moves the module-level ``agent`` object, not ``self.agent``.
        The training and demo cells depend on that, so it is kept as is.
        """
        # Assume that the agent has a method `take_action` which updates its position
        previous_position = agent.position  # Store the previous position
        agent.take_action(action)
        new_position = agent.position
        print(f"Prev. Pos.: {previous_position}, New Pos.: {new_position}")

        # Calculate the new distance to the nutrient source for the reward function
        new_distance = abs(new_position[0] - self.nutrient_position[0]) + abs(new_position[1] - self.nutrient_position[1])
        previous_distance = abs(previous_position[0] - self.nutrient_position[0]) + abs(previous_position[1] - self.nutrient_position[1])

        # Call your reward function here
        reward = calculate_reward(new_position, self.nutrient_position, previous_distance)

        # Check if the agent has found the nutrient source
        done = new_distance == 0

        return new_position, reward, done

# Parameters
grid_size = (100, 100)
nutrient_source_position = (grid_size[0] // 2, grid_size[1] // 2)  # Center of the grid, i.e. (50, 50)
nutrient_gradient = 100

# Initialize the environment
environment = GridWorld(
    size=grid_size,
    nutrient_position=nutrient_source_position,
    nutrient_gradient=nutrient_gradient,
)
# The constructor has already run _set_nutrient with its default type;
# run it once more asking for the radial shape.
environment._set_nutrient('radial')

# Show the initial grid world
environment.show_grid()

In [None]:
# Simulate and visualize the agent's movement for 100 steps without prior training

import time
from IPython.display import clear_output

for _ in range(100):
    # Untrained behaviour: draw a random direction and try to move that way.
    action = agent.choose_action()
    agent.take_action(action)

    # wait=True keeps the old frame on screen until the new one is ready,
    # so the output reads as an animation instead of flickering.
    clear_output(wait=True)
    environment.show_grid(agent_position=agent.position)


In [None]:
### TRAINING 

# Parameters
grid_size = (100, 100)
nutrient_source_position = (grid_size[0] // 2, grid_size[1] // 2)  # Center of the grid, i.e. (50, 50)
nutrient_gradient = 100

# Initialize the environment
environment = GridWorld(
    size=grid_size,
    nutrient_position=nutrient_source_position,
    nutrient_gradient=nutrient_gradient,
)

# Initialize the agent
# Random start position for the agent: one random index per grid axis.
start_position = tuple(np.random.randint(0, axis_len) for axis_len in grid_size)
agent = Agent(start_position)

# Define the number of episodes and the maximum number of steps per episode --> PLAY WITH THEM
num_episodes = 100
max_steps_per_episode = 10_000_000

# Learning parameters --> PLAY WITH THEM
alpha = 0.4  # Learning rate
gamma = 0.99  # Discount factor
epsilon = 1.0  # Starting value for epsilon-greedy strategy
min_epsilon = 0.005  # Floor that epsilon is never decayed below
epsilon_decay = 0.5  # Multiplier applied to epsilon after every episode

# Keep track of the learning progress
all_epochs = []  # Stores the number of moves it takes to reach the goal for each episode
all_penalties = []  # Stores the number of penalties the agent incurs in each episode
all_states = []  # Stores the spawning position of the agent in each episode

# Define the mappings outside of the loop, so they are only defined once.
# The index is the position of the action along the last axis of the Q-table.
index_to_action = dict(enumerate(['up', 'down', 'left', 'right']))
action_to_index = {name: index for index, name in index_to_action.items()}

# Initialize Q-table: one value per (x, y, action)
q_table = np.zeros(grid_size + (len(agent.actions),))

for episode in range(num_episodes):
    state = (np.random.randint(0, grid_size[0]), np.random.randint(0, grid_size[1]))  # Random start position for the agent
    # environment.step() moves the module-level `agent`, so put it on the
    # sampled cell. Without this the first transition of an episode starts from
    # wherever the agent was left, yet is credited to `state` in the Q-table.
    agent.position = state
    epochs, penalties, reward = 0, 0, 0
    done = False
    all_states.append(state)

    for step in range(max_steps_per_episode):
        if np.random.uniform(0, 1) < epsilon:
            # Explore
            action = agent.choose_action()  # This returns a random string e.g. 'up', 'down', etc.
        else:
            # Exploit learned values using the current state
            action_index = np.argmax(q_table[state])
            action = index_to_action[action_index]  # Convert index back to the action string if necessary

        # Take the action and get the new state and reward from the environment
        new_state, reward, done = environment.step(action)

        # Convert the action string to an index using the mapping
        action_index = action_to_index[action]  # Now action_index is an integer

        # Now use the integer index for the Q-table
        old_value = q_table[state + (action_index,)]
        # The episode ends on the source cell, so a terminal transition has no
        # future return to bootstrap from.
        next_max = 0.0 if done else np.max(q_table[new_state])

        # Update the Q-table using the integer action index
        q_table[state + (action_index,)] = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)

        if reward < 0:  # Assuming that negative rewards are penalties
            penalties += 1

        state = new_state
        epochs += 1

        if done:
            break

    # Decaying epsilon value to reduce the number of exploratory moves as it learns
    epsilon = max(min_epsilon, epsilon * epsilon_decay)

    all_epochs.append(epochs)
    all_penalties.append(penalties)


In [None]:
# Simulate and visualize the agent's movement for 100 steps after training; results will be saved as GIF

import matplotlib.pyplot as plt
from IPython.display import display, clear_output
import imageio

# Setup the environment and agent to the initial state for the demonstration
start_position = (np.random.randint(0, grid_size[0]), np.random.randint(0, grid_size[1]))
agent = Agent(start_position)
state = agent.position
# Exploration is switched off for the demonstration. The loop below does not
# read epsilon (it always takes the arg-max action); the value is reset here so
# that any later epsilon-greedy code would not explore either.
epsilon = 0

rewards = []  # List to keep track of the rewards

# Prepare the plot
# The figure is created exactly once; interactive mode is off because each
# frame is rendered explicitly and captured for the GIF.
plt.ioff()  # Turn off interactive mode for GIF creation
fig, ax = plt.subplots(1, 2, figsize=(12, 5))  # Two subplots: one for the grid and one for the reward

frames = []  # List to keep track of the frames for the GIF

# Simulate and visualize the agent's movement for 100 steps
for t in range(100):
    # Choose the best action from the Q-table for the current state
    action_index = np.argmax(q_table[state])
    action = index_to_action[action_index]

    # Perform the chosen action
    new_state, reward, done = environment.step(action)

    # Update rewards list
    rewards.append(reward)

    # Update the state and the agent's position
    state = new_state
    agent.position = state

    # Plot the grid
    ax[0].clear()
    ax[0].imshow(environment.grid, cmap='hot', interpolation='nearest')
    ax[0].scatter(agent.position[1], agent.position[0], color='green', s=25)  # Plot the agent
    ax[0].set_title('Agent Movement')

    # Plot the rewards
    ax[1].clear()
    ax[1].plot(rewards, label='Reward')
    ax[1].set_title('Reward per Step')
    ax[1].set_xlabel('Step')
    ax[1].set_ylabel('Reward')
    ax[1].legend()

    # Save the current frame
    # buffer_rgba() replaces the deprecated tostring_rgb(). It exposes the
    # canvas' own pixel buffer, so drop the alpha channel and copy the data
    # before the next draw overwrites it.
    fig.canvas.draw()
    image = np.asarray(fig.canvas.buffer_rgba())[..., :3].copy()
    frames.append(image)

    # Clear current output and display the freshly drawn figure, so the
    # notebook shows the step that was just taken rather than the previous one.
    clear_output(wait=True)
    display(fig)  # Display the figure

    if done:
        print("Episode finished after {} timesteps".format(t + 1))
        break

# All frames are captured; release the figure.
plt.close(fig)

# Save frames as a GIF
# NOTE(review): newer imageio releases may expect `duration` instead of `fps`
# for GIF output -- confirm against the installed version.
imageio.mimsave('agent_movement.gif', frames, fps=5)
