Implementation of Q-learning in Python using NumPy


Q-learning is a popular reinforcement learning algorithm used to find the optimal action-selection policy for a given finite Markov decision process (MDP).

Import the packages 

In [3]:
import numpy as np


Initialize parameters

In [4]:
# Hyper-parameters used by the temporal-difference update.
gamma = 0.75  # discount factor applied to the best Q-value of the next state
alpha = 0.9   # learning rate: fraction of the TD error added to the Q-value

Define the states

In [5]:
# Location names 'L1'..'L9' mapped to the zero-based state indices 0..8.
location_to_state = {f'L{number}': number - 1 for number in range(1, 10)}

Define the actions

In [6]:
# One action per state index (0..8).
actions = list(range(9))

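Define the rewards. Entry `rewards[i, j]` is 1 when moving directly from state `i` to state `j` is allowed, and 0 otherwise.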

In [8]:
# rewards[i, j] == 1 marks an allowed direct move from state i to state j;
# 0 means no direct move. Each connection is listed in both directions, so
# the matrix is symmetric (L2 <-> L5 included).
rewards = np.array([
    [0, 1, 0, 0, 0, 0, 0, 0, 0],  # L1 -> L2
    [1, 0, 1, 0, 1, 0, 0, 0, 0],  # L2 -> L1, L3, L5
    [0, 1, 0, 0, 0, 1, 0, 0, 0],  # L3 -> L2, L6
    [0, 0, 0, 0, 0, 0, 1, 0, 0],  # L4 -> L7
    [0, 1, 0, 0, 0, 0, 0, 1, 0],  # L5 -> L2, L8
    [0, 0, 1, 0, 0, 0, 0, 0, 0],  # L6 -> L3
    [0, 0, 0, 1, 0, 0, 0, 1, 0],  # L7 -> L4, L8
    [0, 0, 0, 0, 1, 0, 1, 0, 1],  # L8 -> L5, L7, L9
    [0, 0, 0, 0, 0, 0, 0, 1, 0],  # L9 -> L8
])

Map state indices back to location names

In [9]:
# Inverse lookup of location_to_state: state index -> location name.
state_to_location = {index: name for name, index in location_to_state.items()}

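Define a function `get_optimal_route` that learns Q-values for the requested end location and then follows the highest Q-value from the start location until the end location is reached.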

In [14]:
def get_optimal_route(start_location, end_location, n_iterations=1000, seed=None):
    """Learn Q-values for ``end_location`` and return the greedy route to it.

    A copy of the global ``rewards`` matrix gets a large reward on the goal
    state's self-transition. Q-values are then learned from randomly sampled
    (state, action) pairs with the temporal-difference update, using the
    global ``gamma`` (discount factor) and ``alpha`` (learning rate). Finally
    the route is built by repeatedly moving to the action with the highest
    Q-value.

    Parameters
    ----------
    start_location, end_location : str
        Keys of the global ``location_to_state`` mapping.
    n_iterations : int, optional
        Number of random training updates. Defaults to 1000.
    seed : int or None, optional
        Seed for a private random generator, for reproducible training.
        ``None`` (default) draws from NumPy's global random state.

    Returns
    -------
    list of str
        The visited locations, from ``start_location`` to ``end_location``
        inclusive. ``[start_location]`` when both are the same.

    Raises
    ------
    KeyError
        If either location is not a key of ``location_to_state``.
    ValueError
        If following the learned Q-values revisits a location before the
        goal is reached (goal unreachable, or too little training).
    """
    # Resolve both names before training so an unknown location fails fast.
    starting_state = location_to_state[start_location]
    ending_state = location_to_state[end_location]

    rewards_new = np.copy(rewards)
    n_states = rewards_new.shape[0]
    rewards_new[ending_state, ending_state] = 999  # High reward for reaching the end location

    # The np.random module itself exposes randint/choice backed by the global
    # state, so callers relying on np.random.seed() keep working unchanged.
    rng = np.random if seed is None else np.random.RandomState(seed)

    Q = np.zeros((n_states, n_states))  # Initialize Q-values
    for _ in range(n_iterations):
        current_state = rng.randint(0, n_states)
        playable_actions = np.flatnonzero(rewards_new[current_state] > 0)
        if playable_actions.size == 0:
            continue  # Skip if no valid actions

        next_state = rng.choice(playable_actions)
        TD = (rewards_new[current_state, next_state]
              + gamma * np.max(Q[next_state])
              - Q[current_state, next_state])
        Q[current_state, next_state] += alpha * TD

    # Construct the route by greedy descent over the learned Q-values.
    route = [start_location]
    visited = {starting_state}
    state = starting_state
    while state != ending_state:
        state = int(np.argmax(Q[state]))
        if state in visited:
            # The greedy policy is deterministic, so a repeated state means
            # the walk would cycle forever without reaching the goal.
            raise ValueError(
                f"No route found from {start_location!r} to {end_location!r}"
            )
        visited.add(state)
        route.append(state_to_location[state])

    return route


In [15]:
# Example: route from L9 to L1.
print(get_optimal_route(start_location='L9', end_location='L1'))

['L9', 'L8', 'L5', 'L2', 'L1']
