# In [None]:
import numpy as np

# Game Environment Setup
# Each cell holds the reward received for ENTERING that cell:
#   0 = free cell, -1 = trap (ends the episode), 1 = goal (ends the episode).
# The agent always starts in the top-left cell (row 0, column 0).
game_environment = [
    [0, 0, 0, 0],
    [0, -1, 0, -1],
    [0, 0, 0, 0],
    [-1, 0, 0, 1]
]

# Q-Table Initialization
num_rows = len(game_environment)
num_cols = len(game_environment[0])
# One state per grid cell, numbered row-major: state = row * num_cols + col.
num_states = num_rows * num_cols
num_actions = 4  # 4 possible actions: up, down, left, right
# (row, col) displacement for each action index, in the order listed above.
action_moves = [(-1, 0), (1, 0), (0, -1), (0, 1)]
q_table = np.zeros((num_states, num_actions))

# Hyperparameters
learning_rate = 0.1
discount_factor = 0.9
epsilon = 0.1
num_episodes = 1000  # training episodes
num_eval_episodes = 100  # evaluation episodes (kept separate from training)
# Safety cap so neither training nor evaluation can loop forever when the
# current policy cycles between non-terminal cells.
max_steps_per_episode = 100


def take_step(state, action):
    """Apply ``action`` in ``state`` and return ``(next_state, reward, done)``.

    Moving off the grid leaves the agent in its current cell. The reward is
    the value of the cell the agent ends up in, and the episode is done when
    that reward is 1 (goal) or -1 (trap).
    """
    row, col = divmod(state, num_cols)
    d_row, d_col = action_moves[action]
    # Clamp to the grid so walls simply block the move.
    new_row = min(max(row + d_row, 0), num_rows - 1)
    new_col = min(max(col + d_col, 0), num_cols - 1)
    reward = game_environment[new_row][new_col]
    done = reward == 1 or reward == -1
    return new_row * num_cols + new_col, reward, done


def choose_action(q_values, explore_prob):
    """Pick an action epsilon-greedily from one row of Q-values.

    With probability ``explore_prob`` a uniformly random action is returned.
    Otherwise one of the highest-valued actions is returned, with ties broken
    at random: ``np.argmax`` would always pick the first maximum, which keeps
    an untrained agent pushing against the same wall.
    """
    q_values = np.asarray(q_values)
    if np.random.uniform(0, 1) < explore_prob:
        return int(np.random.randint(len(q_values)))  # Explore
    best_actions = np.flatnonzero(q_values == np.max(q_values))
    return int(best_actions[np.random.randint(len(best_actions))])  # Exploit


def run_greedy_episode(q_values, max_steps):
    """Follow the greedy policy of ``q_values`` from the start cell.

    Returns the sum of rewards collected. The episode stops at a terminal
    cell or after ``max_steps`` moves, whichever comes first.
    """
    state = 0  # Start from the initial state
    score = 0
    for _ in range(max_steps):
        action = int(np.argmax(q_values[state]))
        state, reward, done = take_step(state, action)
        score += reward
        if done:
            break
    return score


# Training Loop
for episode in range(num_episodes):
    state = 0  # Start from the initial state

    for _ in range(max_steps_per_episode):
        # Exploration vs. Exploitation
        action = choose_action(q_table[state], epsilon)
        next_state, reward, done = take_step(state, action)

        # Q-Learning Update Equation
        # A terminal cell has no future, so nothing is bootstrapped from it.
        future_value = 0.0 if done else np.max(q_table[next_state])
        td_target = reward + discount_factor * future_value
        q_table[state][action] += learning_rate * (td_target - q_table[state][action])

        state = next_state

        if done:
            break

# Evaluation
# The greedy policy and the environment are both deterministic, so every
# evaluation episode yields the same score; the loop is kept so the reported
# value remains an average over ``num_eval_episodes`` runs.
total_score = 0

for episode in range(num_eval_episodes):
    total_score += run_greedy_episode(q_table, max_steps_per_episode)

average_score = total_score / num_eval_episodes
print("Average Score:", average_score)
