# Q-Learning Implementation in Python

#### Q-Learning
Q-Learning is a model-free reinforcement learning algorithm used to find the optimal action-selection policy for a given problem. It learns by interacting with an environment, updating a Q-table (a matrix of state-action values), and maximising the expected cumulative reward. Q-learning is effective in problems where the environment can be represented by discrete states and actions.

In [4]:
# import necessary libraries
import numpy as np
import random

# Environment: a 4x4 grid world whose 16 cells are numbered 0..15.
num_states = 16
# Action codes as interpreted by get_next_state:
# 0 = up, 1 = right, 2 = down, 3 = left.
num_actions = 4
# One row per state and one column per action; every estimate starts at zero.
q_table = np.zeros(shape=(num_states, num_actions))

# Hyperparameters
alpha = 0.1          # learning rate: step size of each Q-value update
gamma = 0.9          # discount factor applied to the next state's best value
epsilon = 0.1        # exploration rate: probability of taking a random action
num_episodes = 1000  # number of training episodes

# Reward structure: entering state 15 (the goal) pays 1, every other state 0.
rewards = np.zeros(num_states)
rewards[15] = 1

# Function to determine the next state based on the action
def get_next_state(state, action, grid_size=4):
    """Return the state reached by taking ``action`` from ``state``.

    Cells of a square ``grid_size`` x ``grid_size`` grid are numbered row by
    row, so a vertical move changes the index by ``grid_size`` and a
    horizontal move changes it by 1.

    Args:
        state: Index of the current cell.
        action: Move code: 0 = up, 1 = right, 2 = down, 3 = left.
        grid_size: Side length of the grid. Defaults to 4, which reproduces
            the original fixed 4x4 behaviour.

    Returns:
        The index of the destination cell, or ``state`` unchanged when the
        move would leave the grid or ``action`` is not one of the four codes.
    """
    row, col = divmod(state, grid_size)
    last = grid_size - 1  # index of the final row / final column

    if action == 0 and row > 0:  # up
        return state - grid_size
    if action == 1 and col < last:  # right
        return state + 1
    if action == 2 and row < last:  # down
        return state + grid_size
    if action == 3 and col > 0:  # left
        return state - 1
    # Blocked by a wall (or unknown action): stay in the same state.
    return state

# Q-Learning training: run num_episodes episodes, updating q_table in place.
for episode in range(num_episodes):
    # Each episode begins in a uniformly random state (possibly the goal
    # itself, in which case the episode ends immediately).
    state = random.randint(0, num_states - 1)

    while state != 15:  # state 15 is the goal and ends the episode
        # Epsilon-greedy selection: with probability epsilon pick a random
        # action, otherwise take the action with the highest current estimate.
        explore = random.uniform(0, 1) < epsilon
        action = (
            random.randint(0, num_actions - 1)
            if explore
            else q_table[state].argmax()
        )

        # Take the step and observe the reward attached to the new state.
        next_state = get_next_state(state, action)
        reward = rewards[next_state]

        # Q-Learning update rule: move the old estimate toward
        # reward + gamma * (best estimate available from the next state).
        old_value = q_table[state, action]
        next_max = q_table[next_state].max()
        new_value = old_value + alpha * (reward + gamma * next_max - old_value)
        q_table[state, action] = new_value

        state = next_state

# Show the learned state-action values (one row per state).
print("Learned Q-table:\n",q_table)

# Derive the greedy policy (best action index per state) and lay it out
# as a 4x4 array so that it mirrors the grid.
policy = q_table.argmax(axis=1).reshape(4, 4)
print("Learned policy:\n",policy)

Learned Q-table:
 [[0.21934498 0.59049    0.08275192 0.05310723]
 [0.46143098 0.6561     0.44141008 0.21548598]
 [0.59995453 0.729      0.53141762 0.52582009]
 [0.68937286 0.61846349 0.81       0.62176308]
 [0.53143705 0.13719023 0.04959375 0.12444136]
 [0.59049    0.33903361 0.09846198 0.29278765]
 [0.6561     0.3317031  0.22439345 0.        ]
 [0.65674792 0.7675824  0.9        0.53017256]
 [0.47478692 0.16422346 0.03287078 0.00638288]
 [0.53144098 0.18275593 0.04858476 0.14429141]
 [0.59048986 0.46952517 0.04760709 0.09087629]
 [0.73023335 0.76490632 1.         0.44825812]
 [0.04128297 0.42807872 0.         0.04041451]
 [0.47828594 0.15508944 0.05428075 0.08015191]
 [0.53081192 0.19       0.         0.08177904]
 [0.         0.         0.         0.        ]]
Learned policy:
 [[1 1 1 2]
 [0 0 0 2]
 [0 0 0 2]
 [1 0 0 0]]
