<a href="https://colab.research.google.com/github/Fidelisaboke/ml-crash-course/blob/main/reinforcement_learning/q_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Reinforcement Learning using Q-Learning


## Required Libraries

In [None]:
import numpy as np

## The Environment
- A 2D array representing the states; each cell holds the reward received on entering that state

In [None]:
# Reward grid: one entry per cell, looked up as rewards[row, col]
rewards = np.array(
    [
        (-10, 1, 0),    # Row 0
        (0, -10, 10),   # Row 1
    ]
)

## Defining Q-table
- The Q-table stores one Q-value for every state-action pair

In [None]:
# One Q-table row per grid cell (2 x 3 = 6 states), one column per action (4)
Q_table = np.zeros(shape=(6, 4))

# Number the grid cells 0..5 in row-major order: (row, col) -> state index
state_map = {cell: index for index, cell in enumerate(np.ndindex(2, 3))}

## Q-learning Algorithm Definition

In [None]:
# Moves expressed as (row delta, column delta), keyed by action index:
#   0 = Up, 1 = Down, 2 = Right, 3 = Left
actions = dict(enumerate([(-1, 0), (1, 0), (0, 1), (0, -1)]))

# Hyperparameters
alpha, gamma, epsilon = (
    0.1,  # alpha: learning rate
    0.9,  # gamma: discount factor
    0.1,  # epsilon: exploration factor
)


### Useful Helper Functions

In [None]:
def is_valid_position(position, grid_shape=None):
  """Check if a position is within the grid.

  Args:
    position: (row, col) pair to test.
    grid_shape: Optional (n_rows, n_cols) of the grid. When omitted, the
      shape of the module-level `rewards` array is used, so existing
      single-argument calls keep working.

  Returns:
    True if both coordinates lie inside the grid, otherwise False.
  """
  # Fall back to the notebook's reward grid only when no shape is supplied,
  # which lets the helper be reused (and tested) with any grid size.
  if grid_shape is None:
    grid_shape = rewards.shape
  n_rows, n_cols = grid_shape[0], grid_shape[1]
  return 0 <= position[0] < n_rows and 0 <= position[1] < n_cols

def get_next_state(current_position, action):
  """Apply an action to a position and return the resulting position.

  The action's (row, col) offset is looked up in `actions` and added to
  `current_position`. If the result falls outside the grid, the agent does
  not move and `current_position` is returned unchanged.
  """
  d_row, d_col = actions[action]
  candidate = (current_position[0] + d_row, current_position[1] + d_col)
  return candidate if is_valid_position(candidate) else current_position

## Q-Learning Training (Iterations)

In [None]:
EPISODES = 20000
MAX_STEPS = 4       # Step cap per episode
GOAL_REWARD = 10    # Reward value that marks the goal cell and ends an episode
SEED = 42           # Fixed seed so the training run is reproducible

# Dedicated random generator: the same seed always yields the same Q-table
rng = np.random.default_rng(SEED)

# Candidate actions for exploration, built once instead of on every step
action_ids = list(actions.keys())

# Start from a blank table so re-running this cell retrains from scratch
# instead of continuing from the previously learned values
Q_table.fill(0.0)

for episode in range(EPISODES):
  # Every episode starts in the top-left cell, (0, 0)
  position = (0, 0)

  # Map the start position to a state index to be used in the Q-table
  state = state_map[position]

  for step_count in range(MAX_STEPS):
    # Epsilon-greedy: explore with probability epsilon, otherwise exploit
    if rng.random() < epsilon:
      action = int(rng.choice(action_ids))
    else:
      # Choose the best-known action (highest Q-value for current state)
      action = int(np.argmax(Q_table[state]))

    # Obtain the new position, state and reward.
    # An off-grid move leaves the agent in place, so it collects the reward
    # of its current cell again.
    new_position = get_next_state(position, action)
    new_state = state_map[new_position]
    reward = rewards[new_position]

    # Q-learning update: move Q(s, a) towards r + gamma * max_a' Q(s', a')
    td_target = reward + gamma * np.max(Q_table[new_state])
    Q_table[state, action] += alpha * (td_target - Q_table[state, action])

    # Advance to the new position and state
    position, state = new_position, new_state

    # End the episode early once the goal has been reached; otherwise the
    # loop stops on its own after MAX_STEPS steps
    if reward == GOAL_REWARD:
      break



## Final Q-values

In [None]:
# Show the learned table: one row per state, one column per action
print("Final Q-values:", Q_table, sep="\n")

Final Q-values:
[[-1.          0.         10.         -1.        ]
 [10.         -1.          9.         -1.        ]
 [ 8.35386201 10.          8.61830234  9.01512626]
 [-1.14479951  0.         -1.21772458  0.        ]
 [10.         -0.98044297  8.33228183  0.        ]
 [ 0.          0.          0.          0.        ]]
