<a href="https://colab.research.google.com/github/Akanksha-cell-max/Advanced-Artificial-Intelligence/blob/main/Practical_No_5_Q_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import random

# --- Grid world ---------------------------------------------------------
# Square grid; a state is an (i, j) index pair into the Q-table.
grid_size = 5
goal_state = (4, 4)

# Action names. An action's position in this list is its index along the
# last axis of the Q-table.
actions = [
    'up',
    'down',
    'left',
    'right',
]

# --- Learning settings --------------------------------------------------
episodes = 500    # number of training episodes
epsilon = 0.2     # exploration rate
gamma = 0.9       # discount factor
alpha = 0.1       # learning rate

# Q-table: one value per (i, j, action), all starting at zero.
q_table = np.zeros(shape=(grid_size, grid_size, len(actions)))

# Helper: apply a single move on the grid
def get_next_state(state, action):
    """Return the cell reached by taking ``action`` from ``state``.

    ``state`` is an ``(i, j)`` pair. 'up'/'down' change ``i`` by -1/+1 and
    'left'/'right' change ``j`` by -1/+1. A move that would push an index
    below 0 or above ``grid_size - 1``, or an action that is not one of
    the four names, leaves the position unchanged.
    """
    row, col = state
    if action == 'up':
        return (row - 1, col) if row > 0 else (row, col)
    if action == 'down':
        return (row + 1, col) if row < grid_size - 1 else (row, col)
    if action == 'left':
        return (row, col - 1) if col > 0 else (row, col)
    if action == 'right':
        return (row, col + 1) if col < grid_size - 1 else (row, col)
    return (row, col)

# Helper: reward for arriving in a cell
def get_reward(state):
    """Return 100 when ``state`` equals ``goal_state``, otherwise -1."""
    return 100 if state == goal_state else -1

# Training: tabular Q-learning with epsilon-greedy action selection
for episode in range(episodes):
    state = (0, 0)  # every episode starts at (0, 0)
    done = False

    while not done:
        row, col = state

        # With probability epsilon pick a random action index (explore);
        # otherwise pick the index with the highest Q-value (exploit).
        explore = random.uniform(0, 1) < epsilon
        if explore:
            action_idx = random.randint(0, len(actions) - 1)
        else:
            action_idx = q_table[row, col].argmax()

        next_state = get_next_state(state, actions[action_idx])
        reward = get_reward(next_state)
        next_row, next_col = next_state

        # Nudge Q(state, action) toward
        # reward + gamma * max over actions of Q(next_state, .)
        td_target = reward + gamma * q_table[next_row, next_col].max()
        td_error = td_target - q_table[row, col, action_idx]
        q_table[row, col, action_idx] += alpha * td_error

        state = next_state
        # The episode ends as soon as the goal cell is reached.
        done = state == goal_state

print("Training completed!\n", "Final Q-Table:\n", q_table, sep="\n")

# Test learned policy
def test_policy():
    """Follow the greedy policy from (0, 0) and return the visited cells.

    At each step the action with the highest Q-value for the current cell
    is taken. The walk stops on reaching ``goal_state`` or after 50 moves,
    whichever comes first, so the returned list starts with (0, 0) and
    holds at most 51 cells.
    """
    position = (0, 0)
    visited = [position]
    for _ in range(50):
        if position == goal_state:
            break
        best_idx = np.argmax(q_table[position[0], position[1]])
        position = get_next_state(position, actions[best_idx])
        visited.append(position)
    return visited

path = test_policy()

print("\nLearned path from Start to Goal:")
# One visited cell per line; path always contains at least the start cell.
print(*path, sep="\n")


Training completed!

Final Q-Table:

[[[ 34.78478561  42.612659    35.43487018  31.805103  ]
  [ -1.46727312  46.73468907   8.46630234   5.4479647 ]
  [ -0.94370472  52.0360332   -1.02902615   8.64533862]
  [  2.42722796  59.9104745    0.23693086  -0.63757766]
  [ -0.57655314   2.17008318  -0.52176003  -0.58441256]]

 [[ 34.71903699  23.07066325  35.96046766  48.45851   ]
  [ 25.85185918  37.09204022  34.57526347  54.9539    ]
  [ 37.81589775  59.49984557  39.12059637  62.171     ]
  [ 44.81870219  70.19        49.58701568  27.6697026 ]
  [ -0.51821273  59.72304238   5.82145361   1.26199766]]

 [[  3.37613809  -1.0454856    0.17617728  42.06348334]
  [  4.27393532   0.79389807  -0.64457877  60.60021567]
  [  6.95412577  23.07069417  15.62620444  70.18498873]
  [ 59.6666639   79.1         57.08585497  67.16499715]
  [  5.73753319  87.80301141  32.61055787  24.87012429]]

 [[ -0.65133966  -0.61304608  -0.58519851   0.21822906]
  [ -0.4019131   -0.4033485   -0.52774278  18.10356211]
  [ -