<h3> Base Code , Polices , Training  </h3>

In [9]:
import numpy as np
import random
from itertools import combinations

GRID_SIZE = 5
PICKUP_LOCATIONS = {(0, 4): 5, (1, 3): 5, (4, 1): 5}  # Pickup locations with their capacities
DROPOFF_LOCATIONS = {(0, 0): 0, (2, 0): 0, (3, 4): 0}  # Dropoff locations with their remaining capacities
ACTIONS = ['N', 'E', 'S', 'W', 'P', 'D']  # Actions: North, East, South, West, Pickup, Dropoff
ACTION_REWARDS = {'P': 13, 'D': 13, 'N': -1, 'E': -1, 'S': -1, 'W': -1}  # Reward values for actions
random.seed(4)
np.random.seed(4)

# Hyperparameters
LEARNING_RATE = 0.1
DISCOUNT_FACTOR = 0.95
EPSILON = 0.99
EPISODES = 1000

# Initial positions for each agent
initial_positions = {'red': (2, 2), 'blue': (4, 2), 'black': (0, 2)}  # Adjusted to 0-based indexing

# Agent class definition
class Agent:
    def __init__(self, start_position, name):
        self.position = start_position
        self.name = name
        self.carrying = False
        self.q_table = np.zeros((GRID_SIZE, GRID_SIZE, len(ACTIONS)))

    def get_valid_actions(self):
        x, y = self.position
        valid_actions = ACTIONS.copy()

        # Check if agent is at a border
        if x == 0: valid_actions.remove('N')
        if x == GRID_SIZE - 1: valid_actions.remove('S')
        if y == 0: valid_actions.remove('W')
        if y == GRID_SIZE - 1: valid_actions.remove('E')

        # Check for pickup/dropoff actions
        if (x, y) in PICKUP_LOCATIONS and (PICKUP_LOCATIONS[(x, y)] <= 0 or self.carrying):
            valid_actions.remove('P')
        if (x, y) in DROPOFF_LOCATIONS and (DROPOFF_LOCATIONS[(x, y)] >= 5 or not self.carrying):
            valid_actions.remove('D')
        return valid_actions

    
    def select_action(self, valid_actions, policy='PRandom'):        
        if not valid_actions:
            return None  # No valid actions available
        if policy == 'PRandom' or (policy == 'PExploit' and random.random() < EPSILON):
            return random.choice(valid_actions)
        else:
            q_values = [self.q_table[x, y, ACTIONS.index(a)] for a in valid_actions]
            max_q_value = max(q_values)
            max_actions = [a for a, q in zip(valid_actions, q_values) if q == max_q_value]
            return random.choice(max_actions)

    def perform_action(self, action):  
        x, y = self.position

        # Update position based on action
        if action == 'N': x = max(0, x - 1)
        elif action == 'S': x = min(GRID_SIZE - 1, x + 1)
        elif action == 'E': y = min(GRID_SIZE - 1, y + 1)
        elif action == 'W': y = max(0, y - 1)

        # Handle pickup and dropoff actions
        if action == 'P' and (x, y) in PICKUP_LOCATIONS and PICKUP_LOCATIONS[(x, y)] > 0:
            self.carrying = True
            PICKUP_LOCATIONS[(x, y)] -= 1
        elif action == 'D' and (x, y) in DROPOFF_LOCATIONS and DROPOFF_LOCATIONS[(x, y)] < 5:
            self.carrying = False
            DROPOFF_LOCATIONS[(x, y)] += 1

        self.position = (x, y)
        return self.position

    def update_q_table(self, action, reward, next_state, next_valid_actions):
        if action is None:
            return  # Skip Q-table update if no action was taken
        old_x, old_y = self.position
        new_x, new_y = next_state
        action_index = ACTIONS.index(action)
        future_rewards = [self.q_table[new_x, new_y, ACTIONS.index(a)] for a in next_valid_actions]
        self.q_table[old_x, old_y, action_index] = (1 - LEARNING_RATE) * self.q_table[old_x, old_y, action_index] + \
            LEARNING_RATE * (reward + DISCOUNT_FACTOR * np.max(future_rewards))

    def is_terminal_state(self):
        return all(blocks == 5 for blocks in DROPOFF_LOCATIONS.values())

    def reset(self):
        initial_positions = {'red': (2, 2), 'blue': (4, 2), 'black': (0, 2)}
        self.position = initial_positions[self.name]
        self.carrying = False
        for loc in PICKUP_LOCATIONS:
            PICKUP_LOCATIONS[loc] = 5
        for loc in DROPOFF_LOCATIONS:
            DROPOFF_LOCATIONS[loc] = 0

# Utility functions
def manhattan_distance(pos1, pos2):
    return abs(pos1[0] - pos2[0]) + abs(pos1[1] - pos2[1])

def average_manhattan_distance(agents):
    distances = [manhattan_distance(agent1.position, agent2.position) 
                 for agent1, agent2 in combinations(agents, 2)]
    return np.mean(distances)

def get_shortest_path_length(start, end):
    return abs(start[0] - end[0]) + abs(start[1] - end[1])

def simulate_agent_path(agent, start, end):
    agent.reset()
    agent.position = start
    path = [start]
    while agent.position != end:
        valid_actions = agent.get_valid_actions()
        action = agent.select_action(valid_actions)
        agent.perform_action(action)
        path.append(agent.position)
    return path

# Simulation
agents = [Agent(initial_positions['red'], 'red'), 
          Agent(initial_positions['blue'], 'blue'), 
          Agent(initial_positions['black'], 'black')]

average_distances = []

for episode in range(EPISODES):
    episode_distances = []
    for agent in agents:
        valid_actions = agent.get_valid_actions()
        action = agent.select_action(valid_actions, policy='PRandom')  
        if action:
            new_position = agent.perform_action(action)
            reward = ACTION_REWARDS.get(action, 0)
            next_valid_actions = agent.get_valid_actions()
            agent.update_q_table(action, reward, new_position, next_valid_actions)
        episode_distances.append(average_manhattan_distance(agents))
    average_distances.append(np.mean(episode_distances))
    if agent.is_terminal_state():
        for a in agents:
            a.reset()
        break

# Output analysis
print(f"Average Manhattan distances over episodes:")
for i, avg_dist in enumerate(average_distances, 1):
    print(f"Episode {i}: {avg_dist:.2f}")

# Output Q-tables for analysis
for agent in agents:
    print(f"{agent.name}'s Q-table:")
    print("State  |   S       E       N       W       P       D")
    print("---------------------------------------------------")
    for x in range(GRID_SIZE):
        for y in range(GRID_SIZE):
            q_values = agent.q_table[x, y]
            print(f"({x+1},{y+1})  |  " + "  ".join(f"{q_value:6.2f}" for q_value in q_values))
    print("\n")

# Simulate a path from a pickup to a dropoff location
agent = Agent(initial_positions['red'], 'red')  # You can choose any agent here
pickup_location = next(iter(PICKUP_LOCATIONS))
dropoff_location = next(iter(DROPOFF_LOCATIONS))
path = simulate_agent_path(agent, pickup_location, dropoff_location)

# Calculate the shortest path length
shortest_path_len = get_shortest_path_length(pickup_location, dropoff_location)

print(f"Simulated path: {path}")
print(f"Simulated path length: {len(path) - 1}")
print(f"Shortest path length: {shortest_path_len}")

Average Manhattan distances over episodes:
Episode 1: 3.78
Episode 2: 4.00
Episode 3: 4.22
Episode 4: 4.44
Episode 5: 4.22
Episode 6: 4.44
Episode 7: 4.44
Episode 8: 5.33
Episode 9: 4.22
Episode 10: 3.78
Episode 11: 4.00
Episode 12: 3.78
Episode 13: 3.78
Episode 14: 4.44
Episode 15: 4.67
Episode 16: 3.78
Episode 17: 3.33
Episode 18: 2.44
Episode 19: 3.33
Episode 20: 3.56
Episode 21: 3.78
Episode 22: 3.78
Episode 23: 3.56
Episode 24: 3.33
Episode 25: 3.78
Episode 26: 4.67
Episode 27: 5.11
Episode 28: 4.67
Episode 29: 4.00
Episode 30: 3.33
Episode 31: 2.22
Episode 32: 2.89
Episode 33: 4.00
Episode 34: 4.00
Episode 35: 3.56
Episode 36: 4.44
Episode 37: 3.78
Episode 38: 3.11
Episode 39: 3.56
Episode 40: 3.56
Episode 41: 2.44
Episode 42: 1.33
Episode 43: 1.33
Episode 44: 1.56
Episode 45: 2.22
Episode 46: 2.44
Episode 47: 2.67
Episode 48: 3.11
Episode 49: 3.33
Episode 50: 3.78
Episode 51: 4.22
Episode 52: 4.67
Episode 53: 4.00
Episode 54: 3.33
Episode 55: 3.56
Episode 56: 3.33
Episode 57: 3.

In [None]:
Approach A: Each agent use his own RL

A.	Agent coordination: Do the three agents get in their ways blocking each other or do they do a good job in dividing the transportation task intelligently among one another. Agent coordination could, for example, be measured by computing the average Manhattan distance between the three agents during the run of a specific experiment. You can show the progress of average distance for different iterations.

-Manhattan distance Is calculated through each iteration it is seems pretty fixed at the end : 


B. Paths learned: Does the particular approach do a good job in learning paths between block sources and block destinations; is the learnt path the shortest path or close to the shortest path between the source and the destination. 

-It is not too close i'll rate it 6.5 out of 10     Simulated path length: 12  compared to Shortest path length: 4
-This suggests that the learned policy is not finding the most efficient route



Extra Credit On The Compare   For the task 2 and 3 below





<h3> EXP1   </h3>

<h3> EXP2  SARSA Q-learning 9000 (500,8500) &report  Q-tables  </h3>

<h3> EXP3 From 1.c (Qlearning) Different Learning rate (0.15,0.45)  </h3>

(Discussion) Analyzing the effects of using the 3 different learning rates on the system performance 

<h3> EXP4 From 1c alpha =0.3, Gamma=0.5  ,Q-learning three pickup locations to: (4,2), (3,3) and (2,4)</h3>

In [13]:
import numpy as np
import random

# Definitions of other necessary components, including the Agent class, are assumed to be provided

def exp4(agent, alpha, gamma, num_steps):

    # Reset terminal state count and proceed with the simulation
    terminal_states_reached = 0
    step = 0
    while terminal_states_reached < 6 and step < num_steps:
        policy = 'PRandom' if step < 500 else 'PExploit'
        action = agent.select_action(agent.get_valid_actions(), policy)
        if action:
            new_position = agent.perform_action(action)
            reward = ACTION_REWARDS[action]
            next_valid_actions = agent.get_valid_actions()
            agent.update_q_table(action, reward, new_position, next_valid_actions)
        
        if agent.is_terminal_state():
            terminal_states_reached += 1
            agent.reset()
            if terminal_states_reached == 3:
                # Change pickup locations
                global PICKUP_LOCATIONS
                PICKUP_LOCATIONS = {(4, 2): 5, (3, 3): 5, (2, 4): 5}
        
        step += 1
    
    utility = np.sum(np.max(agent.q_table, axis=2))
    reward = np.sum([ACTION_REWARDS[ACTIONS[np.argmax(agent.q_table[x, y])]] for x in range(GRID_SIZE) for y in range(GRID_SIZE)])
    
    return agent.q_table, utility, reward

# Initialize agents
agents_seed42 = [Agent(initial_positions[name], name) for name in initial_positions]
agents_seed43 = [Agent(initial_positions[name], name) for name in initial_positions]

# Run the experiment for seed 42 and print Q-tables
random.seed(42)
np.random.seed(42)
print("Results for seed 42:")
for agent in agents_seed42:
    q_table, utility, reward = exp4(agent, 0.3, 0.5, 1000)
    print(f"Agent {agent.name}:")
    print("State  |   S       E       N       W       P       D")
    print("---------------------------------------------------")
    for x in range(GRID_SIZE):
        for y in range(GRID_SIZE):
            print(f"({x+1},{y+1})  |  " + "  ".join(f"{qv:6.2f}" for qv in q_table[x, y]))
    print(f"Utilities computed: {utility:.2f}")
    print(f"Rewards obtained: {reward}\n")

# Run the experiment for seed 43 and print only utilities and rewards
random.seed(43)
np.random.seed(43)
print("Results for seed 43:")
for agent in agents_seed43:
    _, utility, reward = exp4(agent, 0.3, 0.5, 1000)
    print(f"Agent {agent.name}:")
    print(f"Utilities computed: {utility:.2f}")
    print(f"Rewards obtained: {reward}\n")

Results for seed 42:
Agent red:
State  |   S       E       N       W       P       D
---------------------------------------------------
(1,1)  |    8.01    0.00    0.00    7.21   20.04    0.00
(1,2)  |    6.35    6.86    0.00    3.74   22.15   20.16
(1,3)  |    0.44    0.75    0.00    0.38    2.59    2.94
(1,4)  |    3.15    4.96    0.00    6.92   11.92   15.19
(1,5)  |    1.04    1.33    0.00    0.00    0.00    5.16
(2,1)  |    3.78    0.00    2.28    2.56   11.00   10.66
(2,2)  |    3.35    1.77    2.80    2.00    7.46   11.60
(2,3)  |    2.51    2.72    2.46    3.64   10.53   12.24
(2,4)  |    1.90    2.01    1.92    1.81    0.00    8.96
(2,5)  |    3.29    2.99    3.05    0.00    7.58   10.30
(3,1)  |    1.27    0.00    1.21    2.36    7.70    0.00
(3,2)  |    1.77    1.13    1.06    1.54    8.56    7.98
(3,3)  |    2.00    3.01    2.30    5.37   16.00   14.85
(3,4)  |    4.06    4.31    5.49    3.27   14.22   14.24
(3,5)  |    3.70    8.39    6.21    0.00   15.07   16.20
(4,1)  |

 Discussion))) When interpreting the results of this experiment center on analyzing on how well the learning strategy was able to adapt to the change of the pickup locations and to which extend it was able to learn “new” paths and unlearn “old” paths which became obsolete

note:For all EXPERMINENTS, if a terminal state is reached, restart the experiment by resetting the PD world to the initial state, but do not reset the Q-table. Run each experiment twice, 

Report  and interpret the results; e.g., utilities computed, rewards obtained in various stages of each experiment. 

Assess which experiment obtained the best results . Next, analyze the various q-tables you created and try identify attractive paths  in the obtained q-tables, if there are any. Moreover, briefly assess if your system gets better after it solved a few PD-world problems—reached the terminal state at least once. Briefly analyze to which extend the results of the two different runs agree and disagree in the 4 experiments. Analyze agent coordination for experiments 1.c and 4. Finally, analyze how well the approach adapted to change in the fourth experiment
