## Team 6 Lunar Lander Study

### Set Up Environment

In [5]:
# EXTRACT Imports

import copy
import math
from typing import Callable, List, Optional, Tuple, Union, cast

import gymnasium as gym
import numpy as np
import pandas as pd

### Visualisation tools

In [6]:
%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt

import seaborn as sns

In [7]:
from IPython.display import Video

In [8]:
# Select seaborn's "talk" plotting-context preset for the notebook's figures.
sns.set_context("talk")

## Set up the Lunar Lander problem with Gymnasium

To keep the focus on the algorithms, we use the standard environments provided
by the Gymnasium framework.
As a reminder, the Lunar Lander environment is described [here](https://gymnasium.farama.org/environments/box2d/lunar_lander/).

The action indices are outlined below:

| Action Index | Action                        |
|--------------|-------------------------------|
| 0            | do nothing                    |
| 1            | fire left orientation engine  |
| 2            | fire main engine              |
| 3            | fire right orientation engine |


In [9]:
# Human-readable names for the four discrete actions, keyed by action index.
action_labels = {
    0: "Nothing",
    1: "Left Engine",
    2: "Main Engine",
    3: "Right Engine",
}

In [None]:
# Number of episodes between two Q-table displays when `display=True`.
DISPLAY_EVERY_N_EPISODES = 50


def q_learning(
    environment: gym.Env,
    alpha: float = 0.1,
    alpha_factor: Optional[float] = 0.9995,
    gamma: float = 0.99,
    epsilon: float = 0.5,
    num_episodes: int = 10000,
    display: bool = False,
) -> Tuple[np.ndarray, List[np.ndarray], List[float]]:
    """
    Perform tabular Q-learning on an environment with discrete spaces.

    Actions are chosen with an epsilon-greedy policy and the Q-table is
    updated after every step with the off-policy rule::

        Q(S, A) <- Q(S, A) + alpha * [R + gamma * max_a' Q(S', a') - Q(S, A)]

    Parameters
    ----------
    environment : gym.Env
        The environment to learn in. Its observation and action spaces are
        read through their ``n`` attribute, so both must be discrete.
    alpha : float, optional
        The learning rate, between 0 and 1. By default 0.1
    alpha_factor : float or None, optional
        The factor alpha is multiplied by at the start of each episode,
        by default 0.9995. Pass None to keep alpha constant.
    gamma : float, optional
        The discount factor, between 0 and 1. By default 0.99
    epsilon : float, optional
        The probability of choosing a random action, by default 0.5
    num_episodes : int, optional
        The number of episodes to run, by default 10000
    display : bool, optional
        Whether to display the Q-table (every DISPLAY_EVERY_N_EPISODES episodes), by default False

    Returns
    -------
    q_array : np.ndarray
        The learned Q-table, of shape (number of states, number of actions).
        Each row corresponds to a state, and each column corresponds to an action.
        In the frozen lake environment, there are 16 states and 4 actions thus the Q-table has a shape of (16, 4).
        For instance, q_array[0, 3] is the Q-value (estimation of the expected reward) for performing action 3 ("move up") in state 0 (the top left square).
    q_array_history : list of np.ndarray
        A copy of the Q-table taken at the start of each episode (one entry per episode).
    alpha_history : list of float
        The value of alpha recorded at the start of each episode, before that
        episode's decay is applied (one entry per episode).

    Notes
    -----
    This function relies on ``tqdm`` and ``display_qtable`` being available in
    the enclosing namespace; they are not defined in this cell.
    """
    # Per-episode snapshots of the Q-table and of the learning rate
    q_array_history: List[np.ndarray] = []
    alpha_history: List[float] = []

    # `cast` only informs static type checkers; it performs no runtime check
    observation_space = cast(gym.spaces.Discrete, environment.observation_space)
    action_space = cast(gym.spaces.Discrete, environment.action_space)

    num_states = int(observation_space.n)
    num_actions = int(action_space.n)

    # Initialize the Q-table to zeros
    q_array = np.zeros((num_states, num_actions))

    # Episodes are numbered 1..num_episodes inclusive so that exactly
    # `num_episodes` episodes are run.
    for episode_index in tqdm(range(1, num_episodes + 1)):
        # Display the Q-table every DISPLAY_EVERY_N_EPISODES episodes if display is True
        if display and episode_index % DISPLAY_EVERY_N_EPISODES == 0:
            display_qtable(q_array, title="Q table")

        # Save the current Q-table and learning rate
        q_array_history.append(q_array.copy())
        alpha_history.append(alpha)

        # Decrease the learning rate if alpha_factor is not None
        if alpha_factor is not None:
            alpha = alpha * alpha_factor

        # Start a new episode (Gymnasium's reset returns (observation, info))
        state, _ = environment.reset()

        done = False
        while not done:
            # Epsilon-greedy action selection
            if np.random.rand() < epsilon:
                action = np.random.randint(num_actions)  # random action
            else:
                action = int(np.argmax(q_array[state]))  # greedy action

            # Execute the action in the environment
            next_state, reward, terminated, truncated, _ = environment.step(action)
            done = terminated or truncated

            # Only a true terminal state has zero future value. When the
            # episode is merely truncated (e.g. time limit reached), the next
            # state is still a regular state, so we keep bootstrapping from it.
            if terminated:
                td_target = reward
            else:
                td_target = reward + gamma * np.max(q_array[next_state])

            q_array[state, action] += alpha * (td_target - q_array[state, action])

            # Transition to the next state
            state = next_state

    # Return the learned Q-table and the recorded histories
    return q_array, q_array_history, alpha_history

In [None]:
# Create the environment and set the maximum number of steps per episode
environment = gym.make("FrozenLake-v1", max_episode_steps=1000)

try:
    # Apply Q-learning to calculate the Q-table for the FrozenLake environment
    q_array_ex3, q_array_history_ex3, alpha_history_ex3 = q_learning(environment, display=False)
finally:
    # Always release the environment, even if training fails or is interrupted
    environment.close()

  0%|          | 0/9999 [00:00<?, ?it/s]