In [15]:
import numpy as np
import pandas as pd

# ---------------------------------------------------------------------------
# Grid world environment
# ---------------------------------------------------------------------------
# Cells are addressed as (row, col) on a rows x cols grid.
rows, cols = 3, 4

# An action is an integer index into `actions`; `action_dict` maps the action
# name to the (row delta, col delta) it applies. "up" moves toward row 0.
actions = ["up", "down", "left", "right"]
action_dict = {"up": (-1, 0), "down": (1, 0), "left": (0, -1), "right": (0, 1)}

# Reward collected when the agent ends a move on a cell.
# NaN is used as the marker for a cell that cannot be entered.
reward_grid = np.zeros((rows, cols))
reward_grid[1, 1] = np.nan  # Wall (gray)
reward_grid[1, 2] = -100    # Fire (red)
reward_grid[0, 3] = 100     # Flag (green)

# ---------------------------------------------------------------------------
# Reproducibility
# ---------------------------------------------------------------------------
# Action selection draws from NumPy's global generator (np.random.rand /
# np.random.choice), so seed it here to make a fresh "Run All" repeatable.
random_seed = 42
np.random.seed(random_seed)

# ---------------------------------------------------------------------------
# Hyperparameters
# ---------------------------------------------------------------------------
alpha = 0.1   # Learning rate
gamma = 0.9   # Discount factor
epsilon = 1.0 # Initial exploration rate
epsilon_decay = 0.995  # Multiplicative decay applied to epsilon each episode
min_epsilon = 0.01     # Lower bound epsilon is never decayed below
num_episodes = 1000    # Number of training episodes

# ---------------------------------------------------------------------------
# Q-table
# ---------------------------------------------------------------------------
# Q_table[row, col, action_index] holds the current action-value estimate;
# every entry starts at zero.
Q_table = np.zeros((rows, cols, len(actions)))

# Function to choose an action using epsilon-greedy policy
def choose_action(state, epsilon):
    """Select an action index for ``state`` using an epsilon-greedy rule.

    Args:
        state: ``(row, col)`` tuple used to index the module-level ``Q_table``.
        epsilon: Exploration probability; a uniform draw in [0, 1) below this
            value triggers a random action.

    Returns:
        An integer index into ``actions``: a uniformly random one when
        exploring, otherwise the index of the largest Q-value for ``state``.
    """
    exploring = np.random.rand() < epsilon
    if not exploring:
        # Greedy choice; np.argmax resolves ties to the lowest index.
        return np.argmax(Q_table[state])
    return np.random.choice(len(actions))

# Function to get the next state and reward
def step(state, action):
    """Apply ``action`` in ``state`` and report where the agent ends up.

    Args:
        state: Current ``(row, col)`` position.
        action: Integer index into the module-level ``actions`` list.

    Returns:
        ``(next_state, reward)``. ``next_state`` is the target cell when it
        lies inside the grid and is not NaN in ``reward_grid``; otherwise it
        is ``state`` unchanged. ``reward`` is the ``reward_grid`` entry of
        ``next_state``, or 0 if that entry is NaN.
    """
    row, col = state
    d_row, d_col = action_dict[actions[action]]
    target = (row + d_row, col + d_col)

    in_bounds = 0 <= target[0] < rows and 0 <= target[1] < cols
    # Short-circuit keeps the NaN lookup from running on out-of-grid targets.
    blocked = (not in_bounds) or np.isnan(reward_grid[target])
    next_state = state if blocked else target

    cell_reward = reward_grid[next_state]
    if np.isnan(cell_reward):
        return next_state, 0
    return next_state, cell_reward

# Training loop: tabular Q-learning, one pass of the inner loop per move.
#
# Safety cap on moves per episode. An episode normally ends when the agent
# collects a reward of exactly +100 or -100; without a cap, changing those
# reward values or making such a cell unreachable would spin here forever.
max_steps_per_episode = 10000

for episode in range(num_episodes):
    state = (2, 0)  # Start position
    done = False
    steps_taken = 0
    epsilon = max(min_epsilon, epsilon * epsilon_decay)  # Decay epsilon, floored at min_epsilon

    while not done and steps_taken < max_steps_per_episode:
        action = choose_action(state, epsilon)
        next_state, reward = step(state, action)
        steps_taken += 1

        # Q-learning update rule: blend the old estimate with the one-step
        # target (reward plus discounted best value of the next state).
        Q_table[state][action] = (1 - alpha) * Q_table[state][action] + \
                                 alpha * (reward + gamma * np.max(Q_table[next_state]))

        state = next_state
        # Entering a +100 / -100 cell ends the episode. No update is ever made
        # from such a cell, so its Q-values stay at their initial zeros.
        if reward == 100 or reward == -100:
            done = True

# Reduce the action axis so each cell keeps only its best (max) Q-value
q_table_max_values = Q_table.max(axis=2)

# Build the labelled table in one go: "Row i" down the side, "Col i" across
row_labels = [f"Row {i}" for i in range(rows)]
col_labels = [f"Col {i}" for i in range(cols)]
q_table_df = pd.DataFrame(q_table_max_values, index=row_labels, columns=col_labels)

# Show the table in the cell output
print("Final Q-table (Max Q-values per state):")
print(q_table_df)

# Persist the same table (row labels included) for viewing outside the notebook
q_table_df.to_csv("q_table.csv", index=True)


Final Q-table (Max Q-values per state):
       Col 0      Col 1       Col 2     Col 3
Row 0  81.00  90.000000  100.000000   0.00000
Row 1  72.90   0.000000    0.000000  40.95100
Row 2  65.61  58.885522   31.588957   9.10287
