<h3> Base Code , Polices , Training  </h3>

In [24]:
import numpy as np
import random
from itertools import combinations

random.seed(4)
np.random.seed(4)
GRID_SIZE = 5
PICKUP_LOCATIONS = {(0, 4): 5, (1, 3): 5, (4, 1): 5}  # Adjusted to 0-based indexing
DROPOFF_LOCATIONS = {(0, 0): 0, (2, 0): 0, (3, 4): 0}  # Adjusted to 0-based indexing
ACTIONS = ['N', 'E', 'S', 'W', 'P', 'D']
ACTION_REWARDS = {'P': 13, 'D': 13, 'N': -1, 'E': -1, 'S': -1, 'W': -1}

# Hyperparameters
LEARNING_RATE = 0.3
DISCOUNT_FACTOR = 0.5
EPSILON = 0.1
EPISODES = 1000

class Agent:
    def __init__(self, start_position, name):
        self.position = start_position
        self.name = name
        self.carrying = False
        self.q_table = np.zeros((GRID_SIZE, GRID_SIZE, len(ACTIONS)))
        
    def get_valid_actions(self):
        x, y = self.position
        valid_actions = ACTIONS.copy()

        # Check if agent is at a border
        if x == 0: valid_actions.remove('N')
        if x == GRID_SIZE - 1: valid_actions.remove('S')
        if y == 0: valid_actions.remove('W')
        if y == GRID_SIZE - 1: valid_actions.remove('E')

        # Check for pickup/dropoff actions
        if (x, y) in PICKUP_LOCATIONS and PICKUP_LOCATIONS[(x, y)] <= 0 or self.carrying:
            valid_actions.remove('P')
        if (x, y) in DROPOFF_LOCATIONS and (DROPOFF_LOCATIONS[(x, y)] >= 5 or not self.carrying):
            valid_actions.remove('D')
        return valid_actions
    
    def select_action(self, valid_actions, policy='PRandom'):
        if not valid_actions:
            return None  # No valid actions available
        if policy == 'PRandom' or (policy == 'PExploit' and random.random() < EPSILON):
            return random.choice(valid_actions)
        else:
            q_values = [self.q_table[x, y, ACTIONS.index(a)] for a in valid_actions]
            max_q_value = max(q_values)
            max_actions = [a for a, q in zip(valid_actions, q_values) if q == max_q_value]
            return random.choice(max_actions)

    def perform_action(self, action):
        x, y = self.position

        # Update position based on action
        if action == 'N': x = max(0, x - 1)
        elif action == 'S': x = min(GRID_SIZE - 1, x + 1)
        elif action == 'E': y = min(GRID_SIZE - 1, y + 1)
        elif action == 'W': y = max(0, y - 1)

        # Handle pickup and dropoff actions
        if action == 'P' and (x, y) in PICKUP_LOCATIONS and PICKUP_LOCATIONS[(x, y)] > 0:
            self.carrying = True
            PICKUP_LOCATIONS[(x, y)] -= 1
        elif action == 'D' and (x, y) in DROPOFF_LOCATIONS and DROPOFF_LOCATIONS[(x, y)] < 5:
            self.carrying = False
            DROPOFF_LOCATIONS[(x, y)] += 1

        self.position = (x, y)
        return self.position

    def update_q_table(self, action, reward, next_state, next_valid_actions):
        if action is None:
            return  # Skip Q-table update if no action was taken
        old_x, old_y = self.position
        new_x, new_y = next_state
        action_index = ACTIONS.index(action)
        future_rewards = [self.q_table[new_x, new_y, ACTIONS.index(a)] for a in next_valid_actions]
        self.q_table[old_x, old_y, action_index] = (1 - LEARNING_RATE) * self.q_table[old_x, old_y, action_index] + \
            LEARNING_RATE * (reward + DISCOUNT_FACTOR * np.max(future_rewards))

    def is_terminal_state(self):
        return all(blocks == 5 for blocks in DROPOFF_LOCATIONS.values())

    def reset(self):
        initial_positions = {'red': (2, 2), 'blue': (4, 2), 'black': (0, 2)}
        self.position = initial_positions[self.name]
        self.carrying = False
        for loc in PICKUP_LOCATIONS:
            PICKUP_LOCATIONS[loc] = 5
        for loc in DROPOFF_LOCATIONS:
            DROPOFF_LOCATIONS[loc] = 0

# Function to compute Manhattan distance between two points
def manhattan_distance(pos1, pos2):
    return abs(pos1[0] - pos2[0]) + abs(pos1[1] - pos2[1])

# Function to compute the average Manhattan distance between all pairs of agents
def average_manhattan_distance(agents):
    distances = [manhattan_distance(agent1.position, agent2.position) 
                 for agent1, agent2 in combinations(agents, 2)]
    return np.mean(distances)

# Initial positions for each agent
initial_positions = {'red': (2, 2), 'blue': (4, 2), 'black': (0, 2)}  # Adjusted to 0-based indexing

# Creating agents
agents = [Agent(initial_positions['red'], 'red'), 
          Agent(initial_positions['blue'], 'blue'), 
          Agent(initial_positions['black'], 'black')]

# List to keep track of the average distances at each step
average_distances = []

for episode in range(EPISODES):
    episode_distances = []
    for agent in agents:
        valid_actions = agent.get_valid_actions()
        action = agent.select_action(valid_actions, policy='PRandom')  # You can change the policy here
        if action:
            new_position = agent.perform_action(action)
            reward = ACTION_REWARDS.get(action, 0)
            next_valid_actions = agent.get_valid_actions()
            agent.update_q_table(action, reward, new_position, next_valid_actions)
        # After each agent has moved, calculate the average distance
        episode_distances.append(average_manhattan_distance(agents))
        
    average_distances.append(np.mean(episode_distances))
    
    if agent.is_terminal_state():
        for a in agents:
            a.reset()
        break

# Output the average Manhattan distance for analysis
print(f"Average Manhattan distances over episodes:")
for i, avg_dist in enumerate(average_distances, 1):
    print(f"Episode {i}: {avg_dist:.2f}")

# Output the Q-tables for analysis
for agent in agents:
    print(f"{agent.name}'s Q-table:")
    print("State  |   N       E       S       W       P       D")
    print("---------------------------------------------------")
    for x in range(GRID_SIZE):
        for y in range(GRID_SIZE):
            q_values = agent.q_table[x, y]
            print(f"({x+1},{y+1})  |  " + "  ".join(f"{q_value:6.2f}" for q_value in q_values))
    print("\n")



Average Manhattan distances over episodes:
Episode 1: 3.78
Episode 2: 4.00
Episode 3: 4.22
Episode 4: 4.44
Episode 5: 4.22
Episode 6: 4.44
Episode 7: 4.44
Episode 8: 5.33
Episode 9: 4.44
Episode 10: 4.89
Episode 11: 4.67
Episode 12: 4.89
Episode 13: 4.22
Episode 14: 3.11
Episode 15: 3.33
Episode 16: 2.22
Episode 17: 1.56
Episode 18: 2.67
Episode 19: 3.11
Episode 20: 3.11
Episode 21: 3.11
Episode 22: 3.11
Episode 23: 1.56
Episode 24: 2.00
Episode 25: 2.89
Episode 26: 4.00
Episode 27: 3.33
Episode 28: 2.89
Episode 29: 2.89
Episode 30: 3.78
Episode 31: 3.56
Episode 32: 4.00
Episode 33: 4.67
Episode 34: 4.00
Episode 35: 3.33
Episode 36: 2.44
Episode 37: 1.56
Episode 38: 2.67
Episode 39: 3.11
Episode 40: 2.89
Episode 41: 2.89
Episode 42: 2.67
Episode 43: 2.44
Episode 44: 2.89
Episode 45: 3.78
Episode 46: 4.22
Episode 47: 5.11
Episode 48: 4.44
Episode 49: 3.33
Episode 50: 3.33
Episode 51: 3.78
Episode 52: 4.00
Episode 53: 3.33
Episode 54: 3.78
Episode 55: 3.33
Episode 56: 4.00
Episode 57: 4.

a.	Agent coordination: Do the three agents get in their ways blocking each other or do they do a good job in dividing the transportation task intelligently among one another. Agent coordination could, for example, be measured by computing the average Manhattan distance between the three agents during the run of a specific experiment. You can show the progress of average distance for different iterations. b.	Paths learned: Does the particular approach do a good job in learning paths between block sources and block destinations; is the learnt path the shortest path or close to the shortest path between the source and the destination.  c.	Cost Efficient Learning: Does the particular approach do a good job learning efficient paths or avoiding risky areas. How the risky areas are traversed by the agents as the training goes on and is there any avoidance procedure followed by the trained agents. 

Extra Credit On The Compare   For the task 2 and 3 below





<h3> EXP1    Alpha=0.3 Y=0.5   Steps=9000   , PRandom=500 </h3>    

<h3> EXP1 A.PRandom=8500</h3>  

In [29]:
import numpy as np
import random

# Environment setup constants
GRID_SIZE = 5
PICKUP_LOCATIONS = {(0, 4): 5, (1, 3): 5, (4, 1): 5}
DROPOFF_LOCATIONS = {(0, 0): 0, (2, 0): 0, (3, 4): 0}
ACTIONS = ['N', 'E', 'S', 'W', 'P', 'D']
ACTION_REWARDS = {'P': 13, 'D': 13, 'N': -1, 'E': -1, 'S': -1, 'W': -1}

# Hyperparameters
LEARNING_RATE = 0.3
DISCOUNT_FACTOR = 0.5
STEPS = 9000
INITIAL_RANDOM_STEPS = 500

# Agent class definition
class Agent:
    def __init__(self, start_position, name):
        self.position = start_position
        self.name = name
        self.carrying = False
        self.q_table = np.zeros((GRID_SIZE, GRID_SIZE, len(ACTIONS)))

    def get_valid_actions(self):
        x, y = self.position
        valid_actions = ACTIONS.copy()
        if x == 0: valid_actions.remove('N')
        if x == GRID_SIZE - 1: valid_actions.remove('S')
        if y == 0: valid_actions.remove('W')
        if y == GRID_SIZE - 1: valid_actions.remove('E')
        if self.carrying and (x, y) in DROPOFF_LOCATIONS:
            valid_actions.append('D')
        elif not self.carrying and (x, y) in PICKUP_LOCATIONS and PICKUP_LOCATIONS[(x, y)] > 0:
            valid_actions.append('P')
        return valid_actions

    def select_action(self, valid_actions, policy='PRandom'):
        if not valid_actions:
            return None
        return random.choice(valid_actions)

    def perform_action(self, action):
        x, y = self.position
        if action in ['N', 'E', 'S', 'W']:
            if action == 'N': x -= 1
            elif action == 'S': x += 1
            elif action == 'E': y += 1
            elif action == 'W': y -= 1
        if action == 'P' and (x, y) in PICKUP_LOCATIONS:
            self.carrying = True
            PICKUP_LOCATIONS[(x, y)] -= 1
        elif action == 'D' and (x, y) in DROPOFF_LOCATIONS:
            self.carrying = False
            DROPOFF_LOCATIONS[(x, y)] += 1
        self.position = (x, y)
        return self.position

    def update_q_table(self, action, reward, next_position):
        old_state = self.position
        old_q = self.q_table[old_state[0], old_state[1], ACTIONS.index(action)]
        next_max_q = np.max(self.q_table[next_position[0], next_position[1]])
        self.q_table[old_state[0], old_state[1], ACTIONS.index(action)] = \
            (1 - LEARNING_RATE) * old_q + LEARNING_RATE * (reward + DISCOUNT_FACTOR * next_max_q)

    def is_terminal_state(self):
        return all(blocks == 5 for blocks in DROPOFF_LOCATIONS.values())

    def reset(self):
        self.position = initial_positions[self.name]
        self.carrying = False
        for loc in PICKUP_LOCATIONS:
            PICKUP_LOCATIONS[loc] = 5
        for loc in DROPOFF_LOCATIONS:
            DROPOFF_LOCATIONS[loc] = 0

# Initialize agents
initial_positions = {'red': (2, 2), 'blue': (4, 2), 'black': (0, 2)}
agents = [Agent(initial_positions['red'], 'red'), 
          Agent(initial_positions['blue'], 'blue'), 
          Agent(initial_positions['black'], 'black')]

def run_experiment(policy, steps, agents):
    for step in range(steps):
        for agent in agents:
            valid_actions = agent.get_valid_actions()
            action = agent.select_action(valid_actions, policy)
            if action:
                new_position = agent.perform_action(action)
                reward = ACTION_REWARDS.get(action, 0)
                agent.update_q_table(action, reward, new_position)
            if agent.is_terminal_state():
                for a in agents:
                    a.reset()

# Run initial PRANDOM policy for 500 steps
run_experiment('PRandom', INITIAL_RANDOM_STEPS, agents)

# Continue with PRANDOM for the remaining 8500 steps
run_experiment('PRandom', STEPS - INITIAL_RANDOM_STEPS, agents)

# Output the final Q-tables for analysis
for agent in agents:
    print(f"{agent.name}'s Q-table:")
    print("State  |   N       E       S       W       P       D")
    print("---------------------------------------------------")
    for x in range(GRID_SIZE):
        for y in range(GRID_SIZE):
            q_values = agent.q_table[x, y]
            print(f"({x+1},{y+1})  |  " + "  ".join(f"{q_value:6.2f}" for q_value in q_values))
    print("\n")


red's Q-table:
State  |   N       E       S       W       P       D
---------------------------------------------------
(1,1)  |   12.00    0.00    0.00   12.00   26.00   26.00
(1,2)  |   12.00   12.00    0.00   12.00   26.00   26.00
(1,3)  |   12.00   12.00    0.00   12.00   26.00   26.00
(1,4)  |   12.00   12.00    0.00   12.00   26.00   26.00
(1,5)  |   12.00   12.00    0.00    0.00   26.00   26.00
(2,1)  |   12.00    0.00   12.00   12.00   26.00   26.00
(2,2)  |   12.00   12.00   12.00   12.00   26.00   26.00
(2,3)  |   12.00   12.00   12.00   12.00   26.00   26.00
(2,4)  |   12.00   12.00   12.00   12.00   26.00   26.00
(2,5)  |   12.00   12.00   12.00    0.00   26.00   26.00
(3,1)  |   12.00    0.00   12.00   12.00   26.00   26.00
(3,2)  |   12.00   12.00   12.00   12.00   26.00   26.00
(3,3)  |   12.00   12.00   12.00   12.00   26.00   26.00
(3,4)  |   12.00   12.00   12.00   12.00   26.00   26.00
(3,5)  |   12.00   12.00   12.00    0.00   26.00   26.00
(4,1)  |   12.00    0.00 

<h3> EXP1 B.GREEDY=8500</h3> 

In [28]:
import numpy as np
import random

# Environment setup constants
GRID_SIZE = 5
PICKUP_LOCATIONS = {(0, 4): 5, (1, 3): 5, (4, 1): 5}
DROPOFF_LOCATIONS = {(0, 0): 0, (2, 0): 0, (3, 4): 0}
ACTIONS = ['N', 'E', 'S', 'W', 'P', 'D']
ACTION_REWARDS = {'P': 13, 'D': 13, 'N': -1, 'E': -1, 'S': -1, 'W': -1}

# Hyperparameters
LEARNING_RATE = 0.3
DISCOUNT_FACTOR = 0.5
STEPS = 9000
INITIAL_RANDOM_STEPS = 500
EPSILON = 0.1  # Epsilon for greedy selection

# Agent class definition
class Agent:
    def __init__(self, start_position, name):
        self.position = start_position
        self.name = name
        self.carrying = False
        self.q_table = np.zeros((GRID_SIZE, GRID_SIZE, len(ACTIONS)))

    def get_valid_actions(self):
        x, y = self.position
        valid_actions = ACTIONS.copy()
        if x == 0: valid_actions.remove('N')
        if x == GRID_SIZE - 1: valid_actions.remove('S')
        if y == 0: valid_actions.remove('W')
        if y == GRID_SIZE - 1: valid_actions.remove('E')
        if self.carrying and (x, y) in DROPOFF_LOCATIONS:
            valid_actions.append('D')
        elif not self.carrying and (x, y) in PICKUP_LOCATIONS and PICKUP_LOCATIONS[(x, y)] > 0:
            valid_actions.append('P')
        return valid_actions

    def select_action(self, valid_actions, policy='PRandom'):
        if not valid_actions:
            return None
        if policy == 'PGreedy' and random.random() > EPSILON:
            q_values = [self.q_table[self.position[0], self.position[1], ACTIONS.index(a)] for a in valid_actions]
            max_value = max(q_values)
            return valid_actions[q_values.index(max_value)]
        return random.choice(valid_actions)

    def perform_action(self, action):
        x, y = self.position
        if action in ['N', 'E', 'S', 'W']:
            if action == 'N': x -= 1
            elif action == 'S': x += 1
            elif action == 'E': y += 1
            elif action == 'W': y -= 1
        if action == 'P' and (x, y) in PICKUP_LOCATIONS:
            self.carrying = True
            PICKUP_LOCATIONS[(x, y)] -= 1
        elif action == 'D' and (x, y) in DROPOFF_LOCATIONS:
            self.carrying = False
            DROPOFF_LOCATIONS[(x, y)] += 1
        self.position = (x, y)
        return self.position

    def update_q_table(self, action, reward, next_position):
        old_state = self.position
        old_q = self.q_table[old_state[0], old_state[1], ACTIONS.index(action)]
        next_max_q = np.max(self.q_table[next_position[0], next_position[1]])
        self.q_table[old_state[0], old_state[1], ACTIONS.index(action)] = \
            (1 - LEARNING_RATE) * old_q + LEARNING_RATE * (reward + DISCOUNT_FACTOR * next_max_q)

    def is_terminal_state(self):
        return all(blocks == 5 for blocks in DROPOFF_LOCATIONS.values())

    def reset(self):
        self.position = initial_positions[self.name]
        self.carrying = False
        for loc in PICKUP_LOCATIONS:
            PICKUP_LOCATIONS[loc] = 5
        for loc in DROPOFF_LOCATIONS:
            DROPOFF_LOCATIONS[loc] = 0

# Initialize agents
initial_positions = {'red': (2, 2), 'blue': (4, 2), 'black': (0, 2)}
agents = [Agent(initial_positions['red'], 'red'), 
          Agent(initial_positions['blue'], 'blue'), 
          Agent(initial_positions['black'], 'black')]

def run_experiment(policy, steps, agents):
    for step in range(steps):
        for agent in agents:
            valid_actions = agent.get_valid_actions()
            action = agent.select_action(valid_actions, policy)
            if action:
                new_position = agent.perform_action(action)
                reward = ACTION_REWARDS.get(action, 0)
                agent.update_q_table(action, reward, new_position)
            if agent.is_terminal_state():
                for a in agents:
                    a.reset()

# Run initial PRANDOM policy for 500 steps
run_experiment('PRandom', INITIAL_RANDOM_STEPS, agents)

# Switch to PGreedy for the remaining 8500 steps
run_experiment('PGreedy', STEPS - INITIAL_RANDOM_STEPS, agents)

# Output the final Q-tables for analysis
for agent in agents:
    print(f"{agent.name}'s Q-table:")
    print("State  |   N       E       S       W       P       D")
    print("---------------------------------------------------")
    for x in range(GRID_SIZE):
        for y in range(GRID_SIZE):
            q_values = agent.q_table[x, y]
            print(f"({x+1},{y+1})  |  " + "  ".join(f"{q_value:6.2f}" for q_value in q_values))
    print("\n")


red's Q-table:
State  |   N       E       S       W       P       D
---------------------------------------------------
(1,1)  |   10.52    0.00    0.00   10.55   24.13   26.00
(1,2)  |   10.12   11.15    0.00   10.80   24.84   26.00
(1,3)  |   10.94   11.27    0.00   11.39   25.84   26.00
(1,4)  |   11.08   11.57    0.00   11.21   26.00   25.80
(1,5)  |   11.72   11.23    0.00    0.00   25.87   26.00
(2,1)  |   11.37    0.00   10.02    6.59   23.26   26.00
(2,2)  |    9.06    1.61   10.17    9.40   21.83   26.00
(2,3)  |   11.11    8.53   10.91   11.41   26.00   20.22
(2,4)  |   10.16   10.71   11.11   10.10   26.00   24.87
(2,5)  |   11.55   11.20   11.78    0.00   26.00   24.95
(3,1)  |   11.54    0.00   11.64   11.44   26.00   25.79
(3,2)  |    8.00   11.82    8.69    7.78   24.57   26.00
(3,3)  |    6.96    9.38   11.53   10.81   22.30   26.00
(3,4)  |   11.64   10.69    7.30   11.36   26.00   21.34
(3,5)  |   11.54   11.34   11.85    0.00   26.00   24.17
(4,1)  |    9.52    0.00 

<h3> EXP1 C.Exploit =8500</h3> 

In [19]:
import numpy as np
import random

# Environment setup constants
GRID_SIZE = 5
PICKUP_LOCATIONS = {(0, 4): 5, (1, 3): 5, (4, 1): 5}
DROPOFF_LOCATIONS = {(0, 0): 0, (2, 0): 0, (3, 4): 0}
ACTIONS = ['N', 'E', 'S', 'W', 'P', 'D']
ACTION_REWARDS = {'P': 13, 'D': 13, 'N': -1, 'E': -1, 'S': -1, 'W': -1}

# Hyperparameters
LEARNING_RATE = 0.3
DISCOUNT_FACTOR = 0.5
STEPS = 9000
INITIAL_RANDOM_STEPS = 500
EPSILON = 0.01  # Lower epsilon for more exploitation in PExploit

# Agent class definition
class Agent:
    def __init__(self, start_position, name):
        self.position = start_position
        self.name = name
        self.carrying = False
        self.q_table = np.zeros((GRID_SIZE, GRID_SIZE, len(ACTIONS)))

    def get_valid_actions(self):
        x, y = self.position
        valid_actions = ACTIONS.copy()
        if x == 0: valid_actions.remove('N')
        if x == GRID_SIZE - 1: valid_actions.remove('S')
        if y == 0: valid_actions.remove('W')
        if y == GRID_SIZE - 1: valid_actions.remove('E')
        if self.carrying and (x, y) in DROPOFF_LOCATIONS:
            valid_actions.append('D')
        elif not self.carrying and (x, y) in PICKUP_LOCATIONS and PICKUP_LOCATIONS[(x, y)] > 0:
            valid_actions.append('P')
        return valid_actions

    def select_action(self, valid_actions, policy='PRandom'):
        if not valid_actions:
            return None
        if policy == 'PExploit' and random.random() > EPSILON:
            q_values = [self.q_table[self.position[0], self.position[1], ACTIONS.index(a)] for a in valid_actions]
            max_value = max(q_values)
            return valid_actions[q_values.index(max_value)]
        return random.choice(valid_actions)

    def perform_action(self, action):
        x, y = self.position
        if action in ['N', 'E', 'S', 'W']:
            if action == 'N': x -= 1
            elif action == 'S': x += 1
            elif action == 'E': y += 1
            elif action == 'W': y -= 1
        if action == 'P' and (x, y) in PICKUP_LOCATIONS:
            self.carrying = True
            PICKUP_LOCATIONS[(x, y)] -= 1
        elif action == 'D' and (x, y) in DROPOFF_LOCATIONS:
            self.carrying = False
            DROPOFF_LOCATIONS[(x, y)] += 1
        self.position = (x, y)
        return self.position

    def update_q_table(self, action, reward, next_position):
        old_state = self.position
        old_q = self.q_table[old_state[0], old_state[1], ACTIONS.index(action)]
        next_max_q = np.max(self.q_table[next_position[0], next_position[1]])
        self.q_table[old_state[0], old_state[1], ACTIONS.index(action)] = \
            (1 - LEARNING_RATE) * old_q + LEARNING_RATE * (reward + DISCOUNT_FACTOR * next_max_q)

    def is_terminal_state(self):
        return all(blocks == 5 for blocks in DROPOFF_LOCATIONS.values())

    def reset(self):
        self.position = initial_positions[self.name]
        self.carrying = False
        for loc in PICKUP_LOCATIONS:
            PICKUP_LOCATIONS[loc] = 5
        for loc in DROPOFF_LOCATIONS:
            DROPOFF_LOCATIONS[loc] = 0

# Initialize agents
initial_positions = {'red': (2, 2), 'blue': (4, 2), 'black': (0, 2)}
agents = [Agent(initial_positions['red'], 'red'), 
          Agent(initial_positions['blue'], 'blue'), 
          Agent(initial_positions['black'], 'black')]

def run_experiment(policy, steps, agents):
    for step in range(steps):
        for agent in agents:
            valid_actions = agent.get_valid_actions()
            action = agent.select_action(valid_actions, policy)
            if action:
                new_position = agent.perform_action(action)
                reward = ACTION_REWARDS.get(action, 0)
                agent.update_q_table(action, reward, new_position)
            if agent.is_terminal_state():
                for a in agents:
                    a.reset()

# Run initial PRANDOM policy for 500 steps
run_experiment('PRandom', INITIAL_RANDOM_STEPS, agents)

# Switch to PExploit for the remaining 8500 steps
run_experiment('PExploit', STEPS - INITIAL_RANDOM_STEPS, agents)

# Output the final Q-tables for analysis
for agent in agents:
    print(f"{agent.name}'s Q-table:")
    print("State  |   N       E       S       W       P       D")
    print("---------------------------------------------------")
    for x in range(GRID_SIZE):
        for y in range(GRID_SIZE):
            q_values = agent.q_table[x, y]
            print(f"({x+1},{y+1})  |  " + "  ".join(f"{q_value:6.2f}" for q_value in q_values))
    print("\n")


red's Q-table:
State  |   N       E       S       W       P       D
---------------------------------------------------
(1,1)  |   -0.51    0.00    0.00    1.87   12.91   26.00
(1,2)  |    1.01    4.00    0.00    7.60   11.29   26.00
(1,3)  |    8.69    8.76    0.00    8.56   16.26   26.00
(1,4)  |    2.46    6.54    0.00    5.87   26.00   17.42
(1,5)  |    7.56    3.69    0.00    0.00   26.00   19.11
(2,1)  |   -0.66    0.00   -0.51   -0.76    0.00    0.00
(2,2)  |    5.87    7.73    8.96    2.94   26.00   21.43
(2,3)  |    7.72    7.86    2.89    6.17   11.10   26.00
(2,4)  |    0.57    7.56    1.44    0.00   26.00    4.98
(2,5)  |    0.97    2.89    6.31    0.00   26.00   18.27
(3,1)  |    0.43    0.00    0.59    1.63    4.48   26.00
(3,2)  |    5.23    1.61    7.17    8.49   26.00    8.89
(3,3)  |    4.05    8.27    7.83    6.83   26.00   16.05
(3,4)  |    4.30    5.50    1.20    3.19   13.26   26.00
(3,5)  |    3.14    0.49    3.53    0.00   24.94   16.76
(4,1)  |    1.40    0.00 

In [217]:
import numpy as np

class QTable:
    def __init__(self, num_states, num_actions, learning_rate=0.1, discount_factor=0.9):
        self.num_states = num_states
        self.num_actions = num_actions
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.q_table = np.zeros((num_states, num_actions))

    def update_q_value(self, state, action, reward, next_state):
        current_q_value = self.q_table[state, action]
        max_next_q_value = np.max(self.q_table[next_state])
        new_q_value = (1 - self.learning_rate) * current_q_value + \
                      self.learning_rate * (reward + self.discount_factor * max_next_q_value)
        self.q_table[state, action] = new_q_value

    def get_q_value(self, state, action):
        return self.q_table[state, action]

    def update_learning_rate(self, new_learning_rate):
        self.learning_rate = new_learning_rate

    def update_discount_factor(self, new_discount_factor):
        self.discount_factor = new_discount_factor

    def get_optimal_action(self, state, exploration_rate):
        if np.random.rand() < exploration_rate:
            return np.random.randint(self.num_actions)  # Randomly choose an action
        else:
            return np.argmax(self.q_table[state])  # Choose action with maximum Q-value

class BlockTransportationProblem:
    def __init__(self, num_agents, num_states, num_actions, learning_rate, discount_factor):
        self.num_agents = num_agents
        self.num_states = num_states
        self.num_actions = num_actions
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.q_tables = [QTable(num_states, num_actions, learning_rate, discount_factor) for _ in range(num_agents)]

    def update_q_values(self, agent_index, state, action, reward, next_state):
        self.q_tables[agent_index].update_q_value(state, action, reward, next_state)

    def select_action(self, agent_index, state, exploration_rate, policy):
        if policy == "PRANDOM":
            return np.random.randint(self.num_actions)  # Choose action randomly
        elif policy == "PEXPLOIT":
            return self.q_tables[agent_index].get_optimal_action(state, exploration_rate)
        elif policy == "PGREEDY":
            return np.argmax(self.q_tables[agent_index].q_table[state])  # Choose action with maximum Q-value

# Experiment parameters
num_agents = 3
num_states = 25  # Assuming 25 states
num_actions = 4  # Assuming 4 actions (north, south, east, west)
learning_rate = 0.3
discount_factor = 0.5
total_steps = 9000
initial_prandom_steps = 500
remaining_steps = total_steps - initial_prandom_steps
policy_switch_step = initial_prandom_steps

# Initialize Block Transportation Problem
btp = BlockTransportationProblem(num_agents, num_states, num_actions,learning_rate,discount_factor)

# Run PRANDOM for initial steps
for step in range(initial_prandom_steps):
    states = [np.random.randint(num_states) for _ in range(num_agents)]
    actions = [btp.select_action(agent_index, state, exploration_rate=1.0, policy="PRANDOM") for agent_index, state in enumerate(states)]
    rewards = [np.random.randint(10) for _ in range(num_agents)]  # Random rewards for demonstration
    next_states = [np.random.randint(num_states) for _ in range(num_agents)]
    for agent_index in range(num_agents):
        btp.update_q_values(agent_index, states[agent_index], actions[agent_index], rewards[agent_index], next_states[agent_index])

# Switch policies and continue running
for step in range(policy_switch_step, total_steps):
    policy = "PGREEDY" if step < (total_steps - remaining_steps // 2) else "PEXPLOIT"
    exploration_rate = 0.1 if policy == "PEXPLOIT" else 1.0
    states = [np.random.randint(num_states) for _ in range(num_agents)]
    actions = [btp.select_action(agent_index, state, exploration_rate, policy) for agent_index, state in enumerate(states)]
    rewards = [np.random.randint(10) for _ in range(num_agents)]  # Random rewards for demonstration
    next_states = [np.random.randint(num_states) for _ in range(num_agents)]
    for agent_index in range(num_agents):
        btp.update_q_values(agent_index, states[agent_index], actions[agent_index], rewards[agent_index], next_states[agent_index])

final_states = [np.random.randint(num_states) for _ in range(num_agents)]

policy = "PEXPLOIT"
final_q_table = btp.q_tables[0].q_table  # Assuming agents have the same Q-table

In [219]:
print(final_q_table)

[[ 7.00503694  7.78732465  8.59852831  8.60352702]
 [ 6.05355659  6.9877843   6.61093801  8.45854186]
 [11.69697107  6.85421518  5.51693656  7.12918729]
 [ 8.67679623  6.30499648  7.90454484  7.76605359]
 [ 6.5740658   6.14716011  6.70127726  7.89620298]
 [ 7.20960279  7.07304751  6.12619347  7.89538157]
 [ 5.71143053  7.4342129   8.68682991 10.22936457]
 [ 8.11074145  8.49385569  6.93596339  7.83008148]
 [ 7.22462064  7.54871298  7.67656901 10.45340967]
 [ 7.2511047   9.97652313  6.76792038  6.25029643]
 [ 5.11537363  9.62588791  6.95030787  6.67769375]
 [ 7.95021029 11.15103703  7.57058169  7.42208313]
 [ 6.3392159   8.66268703  6.88790682 10.38479038]
 [ 5.55388604  8.64765435  6.41312333  6.93900456]
 [ 9.64091392  6.97437254  6.79949482  6.67494951]
 [ 7.31530805  7.43805912  7.21960983  9.33020618]
 [ 7.75194522  5.0191504   8.15040355 11.2883345 ]
 [ 6.49864004  6.59574904  8.89215252  7.03100242]
 [ 9.47166001  6.83349677  7.67398663  7.73384051]
 [ 6.6392951   5.53516481  6.41

In [189]:
import numpy as np

class QTableSARSA:
    def __init__(self, num_states, num_actions, learning_rate=0.1, discount_factor=0.9):
        self.num_states = num_states
        self.num_actions = num_actions
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.q_table = np.zeros((num_states, num_actions))

    def update_q_value(self, state, action, reward, next_state, next_action):
        current_q_value = self.q_table[state, action]
        next_q_value = self.q_table[next_state, next_action]
        new_q_value = (1 - self.learning_rate) * current_q_value + \
                      self.learning_rate * (reward + self.discount_factor * next_q_value)
        self.q_table[state, action] = new_q_value

    def get_q_value(self, state, action):
        return self.q_table[state, action]

    def update_learning_rate(self, new_learning_rate):
        self.learning_rate = new_learning_rate

    def update_discount_factor(self, new_discount_factor):
        self.discount_factor = new_discount_factor

    def get_optimal_action(self, state, exploration_rate):
        if np.random.rand() < exploration_rate:
            return np.random.randint(self.num_actions)  # Randomly choose an action
        else:
            return np.argmax(self.q_table[state])  # Choose action with maximum Q-value

class BlockTransportationProblemSARSA:
    def __init__(self, num_agents, num_states, num_actions, learning_rate, discount_factor):
        self.num_agents = num_agents
        self.num_states = num_states
        self.num_actions = num_actions
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.q_tables = [QTableSARSA(num_states, num_actions, learning_rate, discount_factor) for _ in range(num_agents)]

    def update_q_values(self, agent_index, state, action, reward, next_state, next_action):
        self.q_tables[agent_index].update_q_value(state, action, reward, next_state, next_action)

    def select_action(self, agent_index, state, exploration_rate, policy):
        if policy == "PRANDOM":
            return np.random.randint(self.num_actions)  # Choose action randomly
        elif policy == "PEXPLOIT":
            return self.q_tables[agent_index].get_optimal_action(state, exploration_rate)
        elif policy == "PGREEDY":
            return np.argmax(self.q_tables[agent_index].q_table[state])  # Choose action with maximum Q-value

# Experiment parameters
num_agents = 3
num_states = 25  # Assuming 25 states
num_actions = 4  # Assuming 4 actions (north, south, east, west)
learning_rate = 0.3
discount_factor = 0.5
total_steps = 9000
initial_prandom_steps = 500
remaining_steps = total_steps - initial_prandom_steps
policy_switch_step = initial_prandom_steps

# Initialize Block Transportation Problem with SARSA
btp_sarsa = BlockTransportationProblemSARSA(num_agents, num_states, num_actions, learning_rate, discount_factor)

# Run PRANDOM for initial steps
for step in range(initial_prandom_steps):
    states = [np.random.randint(num_states) for _ in range(num_agents)]
    actions = [btp_sarsa.select_action(agent_index, state, exploration_rate=1.0, policy="PRANDOM") for agent_index, state in enumerate(states)]
    rewards = [np.random.randint(10) for _ in range(num_agents)]  # Random rewards for demonstration
    next_states = [np.random.randint(num_states) for _ in range(num_agents)]
    next_actions = [btp_sarsa.select_action(agent_index, next_state, exploration_rate=0.1, policy="PEXPLOIT") for agent_index, next_state in enumerate(next_states)]
    for agent_index in range(num_agents):
        btp_sarsa.update_q_values(agent_index, states[agent_index], actions[agent_index], rewards[agent_index], next_states[agent_index], next_actions[agent_index])

# Switch policies and continue running
for step in range(policy_switch_step, total_steps):
    policy = "PGREEDY" if step < (total_steps - remaining_steps // 2) else "PEXPLOIT"
    exploration_rate = 0.1 if policy == "PEXPLOIT" else 1.0
    states = [np.random.randint(num_states) for _ in range(num_agents)]
    actions = [btp_sarsa.select_action(agent_index, state, exploration_rate, policy) for agent_index, state in enumerate(states)]
    rewards = [np.random.randint(10) for _ in range(num_agents)]  # Random rewards for demonstration
    next_states = [np.random.randint(num_states) for _ in range(num_agents)]
    next_actions = [btp_sarsa.select_action(agent_index, next_state, exploration_rate, policy) for agent_index, next_state in enumerate(next_states)]
    for agent_index in range(num_agents):
        btp_sarsa.update_q_values(agent_index, states[agent_index], actions[agent_index], rewards[agent_index], next_states[agent_index], next_actions[agent_index])

# Report one of the final Q-tables
final_q_table_sarsa = btp_sarsa.q_tables[0].q_table  # Assuming agents have the same Q-table
print("Final Q-Table for Experiment 2 (SARSA):")
print(final_q_table_sarsa)

# Assess the quality of agent coordination
# You can assess coordination by analyzing metrics such as total rewards, convergence speed, and task completion rates.


Final Q-Table for Experiment 2 (SARSA):
[[ 7.69489145  7.30731207  9.15917982  7.20759023]
 [ 7.70551939  7.83009582 10.18136422  7.69689014]
 [ 6.43137058  5.36012178  6.84581668  9.67628003]
 [ 6.92972656  7.88601297  6.96209785  9.64069805]
 [ 5.91817267  7.74922127  6.53013358  7.04838564]
 [ 7.17631581  7.85516956  6.86019335  8.90321939]
 [11.49593275  7.18641286  6.91201564  9.00027687]
 [ 7.21555706  7.23357596  8.13285405  6.9126108 ]
 [ 7.56162342  9.21199453  7.48434565  6.56246388]
 [ 7.51717167  6.8564807  10.24051227  6.74161605]
 [ 8.06489509  5.82176862  7.33390791  6.44948564]
 [ 6.04554829  9.9207904   5.757327    5.67044394]
 [ 8.55599842  7.77909734  6.57297255  6.0612052 ]
 [ 7.14260543  6.63142825  6.90321385  9.29520028]
 [ 8.00602835  7.52976584  6.29494984  6.65833033]
 [ 6.42009631  9.19182138  6.48752771  7.14018259]
 [ 6.31456673 10.28249488  6.02792077  7.12299293]
 [ 6.07957032  5.58712984  8.43679549  9.65963799]
 [ 6.40197019  7.41625154  7.87675334 10.2

<h3> EXP2  SARSA Q-learning 9000 (500,8500) &report  Q-tables  </h3>

<h3> EXP3 From 1.c (Qlearning) Different Learning rate (0.15,0.45)  </h3>

(Discussion) Analyzing the effects of using the 3 different learning rates on the system performance 

<h3> EXP4 From 1c alpha =0.3, Gamma=0.5  ,Q-learning three pickup locations to: (4,2), (3,3) and (2,4)</h3>

 Discussion))) When interpreting the results of this experiment center on analyzing on how well the learning strategy was able to adapt to the change of the pickup locations and to which extend it was able to learn “new” paths and unlearn “old” paths which became obsolete

note:For all EXPERMINENTS, if a terminal state is reached, restart the experiment by resetting the PD world to the initial state, but do not reset the Q-table. Run each experiment twice, 

Report  and interpret the results; e.g., utilities computed, rewards obtained in various stages of each experiment. 

Assess which experiment obtained the best results . Next, analyze the various q-tables you created and try identify attractive paths  in the obtained q-tables, if there are any. Moreover, briefly assess if your system gets better after it solved a few PD-world problems—reached the terminal state at least once. Briefly analyze to which extend the results of the two different runs agree and disagree in the 4 experiments. Analyze agent coordination for experiments 1.c and 4. Finally, analyze how well the approach adapted to change in the fourth experiment
