<h3> Environment , Polices , Training  </h3>

In [7]:
import numpy as np
import random
from itertools import combinations

GRID_SIZE = 5
PICKUP_LOCATIONS = {(0, 4): 5, (1, 3): 5, (4, 1): 5}  # Pickup locations with their capacities
DROPOFF_LOCATIONS = {(0, 0): 0, (2, 0): 0, (3, 4): 0}  # Dropoff locations with their remaining capacities
ACTIONS = ['N', 'E', 'S', 'W', 'P', 'D']  # Actions: North, East, South, West, Pickup, Dropoff
ACTION_REWARDS = {'P': 13, 'D': 13, 'N': -1, 'E': -1, 'S': -1, 'W': -1}  # Reward values for actions
random.seed(4)
np.random.seed(4)

# Hyperparameters
LEARNING_RATE = 0.1
DISCOUNT_FACTOR = 0.95
EPSILON = 0.99
EPISODES = 1000

# Initial positions for each agent
initial_positions = {'red': (2, 2), 'blue': (4, 2), 'black': (0, 2)}  # Adjusted to 0-based indexing

# Agent class definition
class Agent:
    def __init__(self, start_position, name):
        self.position = start_position
        self.name = name
        self.carrying = False
        self.q_table = np.zeros((GRID_SIZE, GRID_SIZE, len(ACTIONS)))

    def get_valid_actions(self):
        x, y = self.position
        valid_actions = ACTIONS.copy()

        # Check if agent is at a border
        if x == 0: valid_actions.remove('N')
        if x == GRID_SIZE - 1: valid_actions.remove('S')
        if y == 0: valid_actions.remove('W')
        if y == GRID_SIZE - 1: valid_actions.remove('E')

        # Check for pickup/dropoff actions
        if (x, y) in PICKUP_LOCATIONS and (PICKUP_LOCATIONS[(x, y)] <= 0 or self.carrying):
            valid_actions.remove('P')
        if (x, y) in DROPOFF_LOCATIONS and (DROPOFF_LOCATIONS[(x, y)] >= 5 or not self.carrying):
            valid_actions.remove('D')
        return valid_actions

    
    def select_action(self, valid_actions, policy='PRandom'):        
        if not valid_actions:
            return None  # No valid actions available
        if policy == 'PRandom' or (policy == 'PExploit' and random.random() < EPSILON):
            return random.choice(valid_actions)
        else:
            q_values = [self.q_table[x, y, ACTIONS.index(a)] for a in valid_actions]
            max_q_value = max(q_values)
            max_actions = [a for a, q in zip(valid_actions, q_values) if q == max_q_value]
            return random.choice(max_actions)

    def perform_action(self, action):  
        x, y = self.position

        # Update position based on action
        if action == 'N': x = max(0, x - 1)
        elif action == 'S': x = min(GRID_SIZE - 1, x + 1)
        elif action == 'E': y = min(GRID_SIZE - 1, y + 1)
        elif action == 'W': y = max(0, y - 1)

        # Handle pickup and dropoff actions
        if action == 'P' and (x, y) in PICKUP_LOCATIONS and PICKUP_LOCATIONS[(x, y)] > 0:
            self.carrying = True
            PICKUP_LOCATIONS[(x, y)] -= 1
        elif action == 'D' and (x, y) in DROPOFF_LOCATIONS and DROPOFF_LOCATIONS[(x, y)] < 5:
            self.carrying = False
            DROPOFF_LOCATIONS[(x, y)] += 1

        self.position = (x, y)
        return self.position

    def update_q_table(self, action, reward, next_state, next_valid_actions):
        if action is None:
            return  # Skip Q-table update if no action was taken
        old_x, old_y = self.position
        new_x, new_y = next_state
        action_index = ACTIONS.index(action)
        future_rewards = [self.q_table[new_x, new_y, ACTIONS.index(a)] for a in next_valid_actions]
        self.q_table[old_x, old_y, action_index] = (1 - LEARNING_RATE) * self.q_table[old_x, old_y, action_index] + \
            LEARNING_RATE * (reward + DISCOUNT_FACTOR * np.max(future_rewards))

    def is_terminal_state(self):
        return all(blocks == 5 for blocks in DROPOFF_LOCATIONS.values())

    def reset(self):
        initial_positions = {'red': (2, 2), 'blue': (4, 2), 'black': (0, 2)}
        self.position = initial_positions[self.name]
        self.carrying = False
        for loc in PICKUP_LOCATIONS:
            PICKUP_LOCATIONS[loc] = 5
        for loc in DROPOFF_LOCATIONS:
            DROPOFF_LOCATIONS[loc] = 0

# Utility functions
def manhattan_distance(pos1, pos2):
    return abs(pos1[0] - pos2[0]) + abs(pos1[1] - pos2[1])

def average_manhattan_distance(agents):
    distances = [manhattan_distance(agent1.position, agent2.position) 
                 for agent1, agent2 in combinations(agents, 2)]
    return np.mean(distances)

def get_shortest_path_length(start, end):
    return abs(start[0] - end[0]) + abs(start[1] - end[1])

def simulate_agent_path(agent, start, end):
    agent.reset()
    agent.position = start
    path = [start]
    while agent.position != end:
        valid_actions = agent.get_valid_actions()
        action = agent.select_action(valid_actions)
        agent.perform_action(action)
        path.append(agent.position)
    return path

# Simulation
agents = [Agent(initial_positions['red'], 'red'), 
          Agent(initial_positions['blue'], 'blue'), 
          Agent(initial_positions['black'], 'black')]

average_distances = []

for episode in range(EPISODES):
    episode_distances = []
    for agent in agents:
        valid_actions = agent.get_valid_actions()
        action = agent.select_action(valid_actions, policy='PRandom')  
        if action:
            new_position = agent.perform_action(action)
            reward = ACTION_REWARDS.get(action, 0)
            next_valid_actions = agent.get_valid_actions()
            agent.update_q_table(action, reward, new_position, next_valid_actions)
        episode_distances.append(average_manhattan_distance(agents))
    average_distances.append(np.mean(episode_distances))
    if agent.is_terminal_state():
        for a in agents:
            a.reset()
        break

# Output analysis
print(f"Average Manhattan distances over episodes:")
for i, avg_dist in enumerate(average_distances, 1):
    print(f"Episode {i}: {avg_dist:.2f}")

# Output Q-tables for analysis
for agent in agents:
    print(f"{agent.name}'s Q-table:")
    print("State  |   S       W       N       E       P       D")
    print("---------------------------------------------------")
    for x in range(GRID_SIZE):
        for y in range(GRID_SIZE):
            q_values = agent.q_table[x, y]
            print(f"({x+1},{y+1})  |  " + "  ".join(f"{q_value:6.2f}" for q_value in q_values))
    print("\n")

# Simulate a path from a pickup to a dropoff location
agent = Agent(initial_positions['red'], 'red')  # You can choose any agent here
pickup_location = next(iter(PICKUP_LOCATIONS))
dropoff_location = next(iter(DROPOFF_LOCATIONS))
path = simulate_agent_path(agent, pickup_location, dropoff_location)

# Calculate the shortest path length
shortest_path_len = get_shortest_path_length(pickup_location, dropoff_location)

print(f"Simulated path: {path}")
print(f"Simulated path length: {len(path) - 1}")
print(f"Shortest path length: {shortest_path_len}")

Average Manhattan distances over episodes:
Episode 1: 3.78
Episode 2: 4.00
Episode 3: 4.22
Episode 4: 4.44
Episode 5: 4.22
Episode 6: 4.44
Episode 7: 4.44
Episode 8: 5.33
Episode 9: 4.22
Episode 10: 3.78
Episode 11: 4.00
Episode 12: 3.78
Episode 13: 3.78
Episode 14: 4.44
Episode 15: 4.67
Episode 16: 3.78
Episode 17: 3.33
Episode 18: 2.44
Episode 19: 3.33
Episode 20: 3.56
Episode 21: 3.78
Episode 22: 3.78
Episode 23: 3.56
Episode 24: 3.33
Episode 25: 3.78
Episode 26: 4.67
Episode 27: 5.11
Episode 28: 4.67
Episode 29: 4.00
Episode 30: 3.33
Episode 31: 2.22
Episode 32: 2.89
Episode 33: 4.00
Episode 34: 4.00
Episode 35: 3.56
Episode 36: 4.44
Episode 37: 3.78
Episode 38: 3.11
Episode 39: 3.56
Episode 40: 3.56
Episode 41: 2.44
Episode 42: 1.33
Episode 43: 1.33
Episode 44: 1.56
Episode 45: 2.22
Episode 46: 2.44
Episode 47: 2.67
Episode 48: 3.11
Episode 49: 3.33
Episode 50: 3.78
Episode 51: 4.22
Episode 52: 4.67
Episode 53: 4.00
Episode 54: 3.33
Episode 55: 3.56
Episode 56: 3.33
Episode 57: 3.

In [None]:
Approach A: Each agent use his own RL

A.	Agent coordination: Do the three agents get in their ways blocking each other or do they do a good job in dividing the transportation task intelligently among one another. Agent coordination could, for example, be measured by computing the average Manhattan distance between the three agents during the run of a specific experiment. You can show the progress of average distance for different iterations.

-Manhattan distance Is calculated through each iteration it is seems pretty fixed at the end : 


B. Paths learned: Does the particular approach do a good job in learning paths between block sources and block destinations; is the learnt path the shortest path or close to the shortest path between the source and the destination. 

-It is not too close i'll rate it 6.5 out of 10     Simulated path length: 12  compared to Shortest path length: 4
-This suggests that the learned policy is not finding the most efficient route



Extra Credit On The Compare   For the task 2 and 3 below





<h3> EXP1   </h3>

In [84]:
import numpy as np
import random
from itertools import combinations

# Constants and hyperparameters
GRID_SIZE = 5
PICKUP_LOCATIONS = {(0, 4): 5, (1, 3): 5, (4, 1): 5}
DROPOFF_LOCATIONS = {(0, 0): 0, (2, 0): 0, (3, 4): 0}
ACTIONS = ['N', 'E', 'S', 'W', 'P', 'D']
ACTION_REWARDS = {'P': 13, 'D': 13, 'N': -1, 'E': -1, 'S': -1, 'W': -1}
random.seed(4)
np.random.seed(4)

# Hyperparameters for Experiment 1
LEARNING_RATE_EXP = 0.3
DISCOUNT_FACTOR_EXP = 0.5
EPISODES_EXP = 9000
STEPS_PRANDOM = 500
STEPS_REMAINING = EPISODES_EXP - STEPS_PRANDOM
POLICY_PRANDOM = 'PRandom'
POLICY_PGREEDY = 'PGreedy'
POLICY_PEXPLOIT = 'PExploit'

# Initialize agents for Experiment 1
agents_exp = [Agent(initial_positions['red'], 'red'), 
              Agent(initial_positions['blue'], 'blue'), 
              Agent(initial_positions['black'], 'black')]

# Run PRANDOM for the initial steps
episode_rewards_prandom = []
average_distances_prandom = []
for _ in range(STEPS_PRANDOM):
    episode_distances = []
    episode_rewards = []
    for agent in agents_exp:
        valid_actions = agent.get_valid_actions()
        action = agent.select_action(valid_actions, policy=POLICY_PRANDOM)  
        if action:
            new_position = agent.perform_action(action)
            reward = ACTION_REWARDS.get(action, 0)
            episode_rewards.append(reward)
            next_valid_actions = agent.get_valid_actions()
            agent.update_q_table(action, reward, new_position, next_valid_actions)
        episode_distances.append(average_manhattan_distance(agents_exp))
    episode_rewards_prandom.append(sum(episode_rewards))
    average_distances_prandom.append(np.mean(episode_distances))

# Continue with different policies for the remaining steps
episode_rewards_remaining = []
average_distances_remaining = []
for _ in range(STEPS_REMAINING):
    episode_distances = []
    episode_rewards = []
    for agent in agents_exp:
        valid_actions = agent.get_valid_actions()
        if _ < STEPS_REMAINING / 3:  # PRANDOM for 1/3 of remaining steps
            action = agent.select_action(valid_actions, policy=POLICY_PRANDOM)
        elif STEPS_REMAINING / 3 <= _ < (2 * STEPS_REMAINING / 3):  # PGREEDY for 1/3 of remaining steps
            action = agent.select_action(valid_actions, policy=POLICY_PGREEDY)
        else:  # PEXPLOIT for the final 1/3 of remaining steps
            action = agent.select_action(valid_actions, policy=POLICY_PEXPLOIT)
        
        if action:
            new_position = agent.perform_action(action)
            reward = ACTION_REWARDS.get(action, 0)
            episode_rewards.append(reward)
            next_valid_actions = agent.get_valid_actions()
            agent.update_q_table(action, reward, new_position, next_valid_actions)
        episode_distances.append(average_manhattan_distance(agents_exp))
    episode_rewards_remaining.append(sum(episode_rewards))
    average_distances_remaining.append(np.mean(episode_distances))

# Output Q-tables for analysis (choose any agent)
for agent in agents_exp:
    print(f"{agent.name}'s Q-table (Experiment 1.c):")
    print("State  |   S       W       N       E       P       D")
    print("---------------------------------------------------")
    for x in range(GRID_SIZE):
        for y in range(GRID_SIZE):
            q_values = agent.q_table[x, y]
            print(f"({x+1},{y+1})  |  " + "  ".join(f"{q_value:6.2f}" for q_value in q_values))
    print("\n")

# Results reporting and interpretation
total_rewards_prandom = sum(episode_rewards_prandom)
total_rewards_remaining = sum(episode_rewards_remaining)
avg_distance_prandom = np.mean(average_distances_prandom)
avg_distance_remaining = np.mean(average_distances_remaining)

print(f"Total rewards obtained (PRANDOM phase): {total_rewards_prandom}")
print(f"Total rewards obtained (Remaining phases): {total_rewards_remaining}")
print("\nAverage Manhattan distances over episodes:")
print(f"PRANDOM: {avg_distance_prandom}")
print(f"Remaining phases: {avg_distance_remaining}")



red's Q-table (Experiment 1.c):
State  |   S       W       N       E       P       D
---------------------------------------------------
(1,1)  |   11.53    0.00    0.00   11.40   25.17    7.79
(1,2)  |   11.08   10.91    0.00   11.00   24.12   24.61
(1,3)  |   11.25   11.28    0.00   11.34   25.11   25.25
(1,4)  |   11.81   11.84    0.00   11.78   25.77   25.37
(1,5)  |   11.59   11.58    0.00    0.00    6.63   25.47
(2,1)  |   11.78    0.00   11.69   11.50   25.74   25.80
(2,2)  |   11.93   11.91   11.93   11.94   25.92   25.95
(2,3)  |   11.96   11.96   11.93   11.97   25.93   25.95
(2,4)  |   11.63   11.55   11.43   11.58   11.67   25.47
(2,5)  |   11.52   11.69   11.86    0.00   25.82   25.83
(3,1)  |   11.78    0.00   11.62   11.59   25.68   11.48
(3,2)  |   11.96   11.95   11.96   11.96   25.97   25.98
(3,3)  |   11.81   11.74   11.88   11.83   25.77   25.86
(3,4)  |   11.90   11.92   11.87   11.85   25.90   25.86
(3,5)  |   11.97   11.96   11.84    0.00   25.97   25.97
(4,1)  |

--------------------------------------------------------
Interpretation:
1. Total rewards obtained
 Higher rewards indicate successful navigation and task completion.
PRANDOM phase contributed to initial exploration, while PGREEDY and PEXPLOIT phases focused on exploitation.

2. Average Manhattan distances
 Lower distances suggest efficient movement and decision-making.
PRANDOM phase may have higher distances due to exploration, while PGREEDY and PEXPLOIT phases should show improvements.


<h3> EXP2  SARSA Q-learning 9000 (500,8500) &report  Q-tables  </h3>

In [79]:
import numpy as np

class QTableSARSA:
    def __init__(self, num_states, num_actions, learning_rate=0.1, discount_factor=0.9):
        self.num_states = num_states
        self.num_actions = num_actions
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.q_table = np.zeros((num_states, num_actions))

    def update_q_value(self, state, action, reward, next_state, next_action):
        current_q_value = self.q_table[state, action]
        next_q_value = self.q_table[next_state, next_action]
        new_q_value = (1 - self.learning_rate) * current_q_value + \
                      self.learning_rate * (reward + self.discount_factor * next_q_value)
        self.q_table[state, action] = new_q_value

    def get_q_value(self, state, action):
        return self.q_table[state, action]

    def update_learning_rate(self, new_learning_rate):
        self.learning_rate = new_learning_rate

    def update_discount_factor(self, new_discount_factor):
        self.discount_factor = new_discount_factor

    def get_optimal_action(self, state, exploration_rate):
        if np.random.rand() < exploration_rate:
            return np.random.randint(self.num_actions)  # Randomly choose an action
        else:
            return np.argmax(self.q_table[state])  # Choose action with maximum Q-value

class BlockTransportationProblemSARSA:
    def __init__(self, num_agents, num_states, num_actions, learning_rate, discount_factor):
        self.num_agents = num_agents
        self.num_states = num_states
        self.num_actions = num_actions
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.q_tables = [QTableSARSA(num_states, num_actions, learning_rate, discount_factor) for _ in range(num_agents)]

    def update_q_values(self, agent_index, state, action, reward, next_state, next_action):
        self.q_tables[agent_index].update_q_value(state, action, reward, next_state, next_action)

    def select_action(self, agent_index, state, exploration_rate, policy):
        if policy == "PRANDOM":
            return np.random.randint(self.num_actions)  # Choose action randomly
        elif policy == "PEXPLOIT":
            return self.q_tables[agent_index].get_optimal_action(state, exploration_rate)
        elif policy == "PGREEDY":
            return np.argmax(self.q_tables[agent_index].q_table[state])  # Choose action with maximum Q-value

# Experiment parameters
num_agents = 3
num_states = 25  # Assuming 25 states
num_actions = 4  # Assuming 4 actions (north, south, east, west)
learning_rate = 0.3
discount_factor = 0.5
total_steps = 9000
initial_prandom_steps = 500
remaining_steps = total_steps - initial_prandom_steps
policy_switch_step = initial_prandom_steps

# Initialize Block Transportation Problem with SARSA
btp_sarsa = BlockTransportationProblemSARSA(num_agents, num_states, num_actions, learning_rate, discount_factor)

# Run PRANDOM for initial steps
for step in range(initial_prandom_steps):
    states = [np.random.randint(num_states) for _ in range(num_agents)]
    actions = [btp_sarsa.select_action(agent_index, state, exploration_rate=1.0, policy="PRANDOM") for agent_index, state in enumerate(states)]
    rewards = [np.random.randint(10) for _ in range(num_agents)]  # Random rewards for demonstration
    next_states = [np.random.randint(num_states) for _ in range(num_agents)]
    next_actions = [btp_sarsa.select_action(agent_index, next_state, exploration_rate=0.1, policy="PEXPLOIT") for agent_index, next_state in enumerate(next_states)]
    for agent_index in range(num_agents):
        btp_sarsa.update_q_values(agent_index, states[agent_index], actions[agent_index], rewards[agent_index], next_states[agent_index], next_actions[agent_index])

# Switch policies and continue running
for step in range(policy_switch_step, total_steps):
    policy = "PGREEDY" if step < (total_steps - remaining_steps // 2) else "PEXPLOIT"
    exploration_rate = 0.1 if policy == "PEXPLOIT" else 1.0
    states = [np.random.randint(num_states) for _ in range(num_agents)]
    actions = [btp_sarsa.select_action(agent_index, state, exploration_rate, policy) for agent_index, state in enumerate(states)]
    rewards = [np.random.randint(10) for _ in range(num_agents)]  # Random rewards for demonstration
    next_states = [np.random.randint(num_states) for _ in range(num_agents)]
    next_actions = [btp_sarsa.select_action(agent_index, next_state, exploration_rate, policy) for agent_index, next_state in enumerate(next_states)]
    for agent_index in range(num_agents):
        btp_sarsa.update_q_values(agent_index, states[agent_index], actions[agent_index], rewards[agent_index], next_states[agent_index], next_actions[agent_index])

# Report one of the final Q-tables
for agent_index, agent in enumerate(btp_sarsa.q_tables, 1):
    print(f"Agent {agent_index}'s SARSA:")
    print("State  |   S       W       N       E      ")
    print("---------------------------------------------------")
    for x in range(GRID_SIZE):
        for y in range(GRID_SIZE):
            state_index = x * GRID_SIZE + y  # Convert 2D indices to 1D index
            q_values = [agent.get_q_value(state_index, action) for action in range(num_actions)]
            print(f"({x+1},{y+1})  |  " + "  ".join(f"{q_value:6.2f}" for q_value in q_values))
    print("\n")
# Assess the quality of agent coordination
# You can assess coordination by analyzing metrics such as total rewards, convergence speed, and task completion rates.
all_rewards = []
all_states = []
all_actions = []

# Modify your simulation loop to append states and actions to the lists
for step in range(initial_prandom_steps):
    states = [np.random.randint(num_states) for _ in range(num_agents)]
    actions = [btp_sarsa.select_action(agent_index, state, exploration_rate=1.0, policy="PRANDOM") for agent_index, state in enumerate(states)]
    rewards = [np.random.randint(10) for _ in range(num_agents)]  # Random rewards for demonstration
    next_states = [np.random.randint(num_states) for _ in range(num_agents)]
    next_actions = [btp_sarsa.select_action(agent_index, next_state, exploration_rate=0.1, policy="PEXPLOIT") for agent_index, next_state in enumerate(next_states)]
    
    all_states.extend(states)
    all_actions.extend(actions)
    all_rewards.append(rewards)  # Storing rewards for each step
    
    for agent_index in range(num_agents):
        btp_sarsa.update_q_values(agent_index, states[agent_index], actions[agent_index], rewards[agent_index], next_states[agent_index], next_actions[agent_index])

# ... [rest of your simulation loop]

# After simulation loop ends, calculate and print the total and average rewards
cumulative_rewards, average_reward = calculate_total_rewards(all_rewards)
print(f"Cumulative Rewards for each agent 1 2 3: {cumulative_rewards} ")
print(f"Average Reward across all agents: {average_reward}")

# Calculate and print the coordination quality
coordination_quality = calculate_coordination_quality(all_states, all_actions, num_agents, num_states)
print(f"Coordination Quality Score: {coordination_quality:.2f}")


Agent 1's SARSA:
State  |   S       W       N       E      
---------------------------------------------------
(1,1)  |    7.43    7.52    6.85    9.52
(1,2)  |   10.97    6.06    6.25    7.78
(1,3)  |    8.44    6.35   11.44    6.06
(1,4)  |    6.12    7.40    9.91    6.91
(1,5)  |   10.06    7.65    6.27    7.94
(2,1)  |    7.87    7.40    7.67    7.68
(2,2)  |    7.43    8.40    7.01    7.37
(2,3)  |    7.11   10.12    6.89    6.69
(2,4)  |    9.64    7.38    7.13    6.96
(2,5)  |    7.32    7.39    5.39    8.84
(3,1)  |   10.02    5.86    6.54    7.76
(3,2)  |    7.20    7.77   10.27    7.28
(3,3)  |    6.80    6.46    4.64    7.45
(3,4)  |    6.80    7.96    8.18   11.17
(3,5)  |    6.70    7.15    9.27    8.20
(4,1)  |    7.88    5.27    5.49    9.96
(4,2)  |    6.17    9.34    7.68    9.67
(4,3)  |    6.90    6.00    8.77    7.13
(4,4)  |    8.59    6.82    6.46    6.23
(4,5)  |    7.39    6.91    6.22   10.19
(5,1)  |    9.23    7.48    9.71    5.77
(5,2)  |    7.38    7.79   

<h3> EXP3 From 1.c (Qlearning) Different Learning rate (0.15,0.45)  </h3>

(Discussion) Analyzing the effects of using the 3 different learning rates on the system performance 

<h3> EXP4  alpha =0.3, Gamma=0.5  ,Q-learning three pickup locations to: (4,2), (3,3) and (2,4)</h3>

In [33]:
def exp4(agent, alpha, gamma, num_steps):

    # Reset terminal state count and proceed with the simulation
    terminal_states_reached = 0
    step = 0
    while terminal_states_reached < 6 and step < num_steps:
        policy = 'PRandom' if step < 500 else 'PExploit'
        action = agent.select_action(agent.get_valid_actions(), policy)
        if action:
            new_position = agent.perform_action(action)
            reward = ACTION_REWARDS[action]
            next_valid_actions = agent.get_valid_actions()
            agent.update_q_table(action, reward, new_position, next_valid_actions)
        
        if agent.is_terminal_state():
            terminal_states_reached += 1
            agent.reset()
            if terminal_states_reached == 3:
                # Change pickup locations
                global PICKUP_LOCATIONS
                PICKUP_LOCATIONS = {(4, 2): 5, (3, 3): 5, (2, 4): 5}
        
        step += 1
    
    utility = np.sum(np.max(agent.q_table, axis=2))
    reward = np.sum([ACTION_REWARDS[ACTIONS[np.argmax(agent.q_table[x, y])]] for x in range(GRID_SIZE) for y in range(GRID_SIZE)])
    
    return agent.q_table, utility, reward

# Initialize agents
agents_seed42 = [Agent(initial_positions[name], name) for name in initial_positions]
agents_seed43 = [Agent(initial_positions[name], name) for name in initial_positions]

# Run the experiment for seed 42 and print Q-tables
random.seed(42)
np.random.seed(42)
print("Results for seed 42:")
for agent in agents_seed42:
    q_table, utility, reward = exp4(agent, 0.3, 0.5, 1000)
    print(f"Agent {agent.name}:")
    print("State  |   S       W       N       E       P       D")
    print("---------------------------------------------------")
    for x in range(GRID_SIZE):
        for y in range(GRID_SIZE):
            print(f"({x+1},{y+1})  |  " + "  ".join(f"{qv:6.2f}" for qv in q_table[x, y]))
    print(f"Utilities computed: {utility:.2f}")
    print(f"Rewards obtained: {reward}\n")

# Run the experiment for seed 43 and print only utilities and rewards
random.seed(43)
np.random.seed(43)
print("Results for seed 43:")
for agent in agents_seed43:
    _, utility, reward = exp4(agent, 0.3, 0.5, 1000)
    print(f"Agent {agent.name}:")
    print(f"Utilities computed: {utility:.2f}")
    print(f"Rewards obtained: {reward}\n")

Results for seed 42:
Agent red:
State  |   S       W       N       E       P       D
---------------------------------------------------
(1,1)  |    6.62    0.00    0.00    7.31   19.98    0.00
(1,2)  |    2.60    2.51    0.00    3.42   11.77   17.92
(1,3)  |    1.18    1.02    0.00    0.83    7.21    0.00
(1,4)  |    2.94    2.13    0.00    4.34    4.48   20.01
(1,5)  |    1.98    0.52    0.00    0.00    0.00    7.21
(2,1)  |    1.98    0.00    1.74    0.98    0.00    7.21
(2,2)  |    1.89    2.38    3.78   -0.30    7.71   24.85
(2,3)  |    0.11    4.11   -0.30    4.52   16.19   13.86
(2,4)  |    1.18    3.07    1.26    2.09    0.00   10.03
(2,5)  |    2.93    3.16    3.17    0.00   14.18   12.49
(3,1)  |   -0.83    0.00   -0.30   -0.66    0.00    0.00
(3,2)  |    6.74    0.83    0.25    1.86    7.80   26.00
(3,3)  |    2.68    3.75    2.45    3.81   17.29   25.23
(3,4)  |    3.92    5.20    5.38    3.90   16.29   16.74
(3,5)  |    7.19    7.09    7.26    0.00   23.43   25.92
(4,1)  |

 Discussion For 4 EXP

 
1-When interpreting the results of this experiment center on analyzing on how well the learning strategy was able to adapt to the change of the pickup locations and to which extend it was able to learn “new” paths and unlearn “old” paths which became obsolete????    
2- analyze how well the approach adapted to change in the fourth experiment???

ANSWER_ After the Change: The utility values for different seeds (42 vs. 43) indicate how well each agent adapted. Higher utility in one seed compared to another could reflect better adaptation to the new pickup locations.

ANSWER- The learning strategy showed adaptation to new pickup locations through increased Q-values for relevant actions, indicating it learned new paths. Unlearning obsolete paths was less pronounced, pointing to the need for further training.

ANSWER- High Q-values at new pickup locations indicate effective learning of new paths. However, persistently high Q-values for old paths suggest incomplete unlearning of outdated strategies


ANSWER-
The Q-tables before and after the change in the fourth experiment indicate that the agents have successfully adapted to new pickup locations by assigning higher Q-values to these areas. This adaptation reflects the learning of new paths to optimize pickups. However, the unlearning of obsolete paths is less apparent, as high Q-values remain in some of the previous locations, which may suggest that the approach needs more iterations or adjustments to fully adapt to the changes.
