In [10]:
import numpy as np
import random

# Define the grid world environment
class GridWorld:
    def __init__(self):
        self.grid = [['A1', 'A2', 'A3', 'A4'],
                     ['B1', 'B2', 'B3', 'B4'],
                     ['C1', 'C2', 'C3', 'C4']]
        self.terminal_states = {'A4': 10, 'B4': -10}
        self.blocked_states = {'B2' }
        self.actions = ['up', 'down', 'left', 'right']
        self.state = 'C4'

    def reset(self, start_state=None):
        if start_state is not None:
            self.state = start_state
        else:
            self.state = 'C4'
        return self.state

    def step(self, action):
        row, col = self.state
        if action == 'up' and row > 'A':
            next_state = chr(ord(row) - 1) + col
        elif action == 'down' and row < 'C':
            next_state = chr(ord(row) + 1) + col
        elif action == 'left' and col > '1':
            next_state = row + chr(ord(col) - 1)
        elif action == 'right' and col < '4':
            next_state = row + chr(ord(col) + 1)
        else:
            next_state = self.state

        if next_state in self.blocked_states:
            next_state = self.state

        self.state = next_state
        reward = self.terminal_states.get(next_state, 0)
        done = next_state in self.terminal_states
        return next_state, reward, done

# Q-learning parameters
alpha = 0.1
gamma = 0.9
epsilon = 0.1
episodes = 1000

# Initialize Q-table
Q = {}
for row in ['A', 'B', 'C']:
    for col in ['1', '2', '3', '4']:
        state = row + col
        Q[state] = {a: 0 for a in ['up', 'down', 'left', 'right']}

env = GridWorld()

for episode in range(episodes):
    state = env.reset()
    done = False
    while not done:
        if random.uniform(0, 1) < epsilon:
            action = random.choice(env.actions)
        else:
            action = max(Q[state], key=Q[state].get)

        next_state, reward, done = env.step(action)

        old_value = Q[state][action]
        next_max = max(Q[next_state].values())

        Q[state][action] = old_value + alpha * (reward + gamma * next_max - old_value)

        state = next_state

# Extract the policy
policy = {}
for state in Q:
    policy[state] = max(Q[state], key=Q[state].get)

print("Learned policy:")
for row in ['A', 'B', 'C']:
    for col in ['1', '2', '3', '4']:
        state = row + col
        if state in policy:
            print(f"{state}: {policy[state]}")


Learned policy:
A1: right
A2: right
A3: right
A4: up
B1: up
B2: up
B3: up
B4: up
C1: up
C2: right
C3: up
C4: left


In [16]:
# Define the specific path
path1 = ['A1','A2', 'A3', 'B3', 'B4']
path2 = ['C2', 'C1', 'B1', 'A1', 'A2', 'A3', 'A4']
path3 = ['C4', 'C3', 'B3','A3', 'A4']

all_path = [path1, path2,path3]

# Function to calculate the total value for the specified path
def calculate_path_value(path, Q):
    total_value = 0
    for i in range(len(path) - 1):
        state = path[i]
        next_state = path[i + 1]

        # Find the action that leads to the next state
        action = None
        for a in Q[state]:
            env.state = state
            resulting_state, _, _ = env.step(a)
            if resulting_state == next_state:
                action = a
                break

        if action is None:
            raise ValueError(f"No valid action from {state} to {next_state} in the path")
            # action = 2

        # Add the Q-value of the action to the total value
        total_value += Q[state][action]

    return total_value

# Calculate the total value for the specified path
for i,path in enumerate(all_path):
    path_value = calculate_path_value(path, Q)
    print(f"Total value for path ({path}) is {path_value:0.3f}")


Total value for path (['A1', 'A2', 'A3', 'B3', 'B4']) is 8.163
Total value for path (['C2', 'C1', 'B1', 'A1', 'A2', 'A3', 'A4']) is 20.085
Total value for path (['C4', 'C3', 'B3', 'A3', 'A4']) is 34.390
