# DQN / REINFORCE / PPO Part of the Project

## Preparation

In [None]:
import collections
import gymnasium as gym
import itertools
import numpy as np

# from numpy.typing import NDArray
import pandas as pd
from pathlib import Path
import random
import optuna
import torch
from torch.nn import functional as F
from typing import Callable, cast, List, Tuple, Union
from collections import deque

In [None]:
import pickle

In [None]:
import matplotlib.pyplot as plt

import seaborn as sns
from tqdm.notebook import tqdm

In [None]:
from IPython.display import Video
from ipywidgets import interact

In [None]:
import warnings

warnings.filterwarnings("ignore", category=UserWarning)

In [None]:
sns.set_context("talk")

In [None]:
# Output locations, relative to the notebook's working directory.
# NOTE: FIGS_DIR and PLOTS_DIR currently resolve to the same folder ("figs/project").
FIGS_DIR = Path("figs/") / "project"       # Where to save figures (.gif or .mp4 files)
PLOTS_DIR = Path("figs/") / "project"      # Where to save plots (.png or .svg files)
MODELS_DIR = Path("models/") / "project"   # Where to save models (.pth files)

In [None]:
# Create the output directories up front. `exist_ok=True` makes each call
# idempotent and avoids the check-then-create race of testing `.exists()` first.
for output_dir in (FIGS_DIR, PLOTS_DIR, MODELS_DIR):
    output_dir.mkdir(parents=True, exist_ok=True)

In [None]:
def video_selector(file_path: Path):
    """
    Build an embedded, looping video player for a single recorded episode.

    Used as the callback of `ipywidgets.interact`, which calls it with the one
    entry currently selected from the list of candidate files.

    Parameters
    ----------
    file_path : Path
        The video file to display.

    Returns
    -------
    IPython.display.Video
        The video widget with the file embedded in the notebook output.
    """
    return Video(file_path, embed=True, html_attributes="controls autoplay loop")

In [None]:
# Run PyTorch computations on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Default number of independent training runs performed per agent.
DEFAULT_NUMBER_OF_TRAININGS = 3

## Discrete Environment

In [None]:
# Discrete-action LunarLander without wind; `rgb_array` rendering allows video recording.
env = gym.make("LunarLander-v3", continuous=False, gravity=-10.0,
               enable_wind=False, wind_power=15.0, turbulence_power=1.5, render_mode = "rgb_array")

In [None]:
# Read the problem dimensions through the public space API (`Space.shape`
# rather than the private `_shape` attribute), as plain Python ints so they
# can be used directly as layer sizes, then release the environment.
LL_observation_dim = int(env.observation_space.shape[0])
LL_action_number = int(env.action_space.n)
env.close()

### Random Policy Baseline

In [None]:
def random_baseline(num_episodes):
    """
    Play episodes with uniformly random actions and collect their returns.

    The function acts on the notebook-level `env` global, which must be an
    open environment when it is called.

    Parameters
    ----------
    num_episodes : int
        The number of episodes to play.

    Returns
    -------
    list
        The cumulated reward of each episode, in play order.
    """
    returns_per_episode = []
    for _ in range(num_episodes):
        env.reset()
        total = 0.0
        finished = False
        while not finished:
            random_action = env.action_space.sample()
            _, step_reward, terminated, truncated, _ = env.step(random_action)
            total += step_reward
            finished = terminated or truncated
        returns_per_episode.append(total)
    return returns_per_episode

In [None]:
# Evaluate the random policy. The environment is closed even if the rollout
# fails, and the x-axis index is derived from the results instead of repeating
# the episode count as a second literal.
env = gym.make("LunarLander-v3", continuous=False, gravity=-10.0,
               enable_wind=False, wind_power=15.0, turbulence_power=1.5, render_mode = "rgb_array")
try:
    baseline_reward = random_baseline(num_episodes = 200)
finally:
    env.close()
baseline_index = np.arange(len(baseline_reward))

In [None]:
# Mean and best episode return obtained by the random policy.
print(np.mean(baseline_reward),np.max(baseline_reward))

In [None]:
# Plot the return of every random-policy episode and save the figure.
g = sns.relplot(
    x= baseline_index,
    y= baseline_reward,
    kind="line",
    height=7,
    aspect=2,
    alpha=0.5,
)
plt.savefig(PLOTS_DIR / "project_random_baseline.png")

In [None]:
# Record one episode played by the random policy.
VIDEO_PREFIX_INITIALIZATION = "project_random_policy"

# Delete a recording left over from a previous run of this cell, if any.
(FIGS_DIR / f"{VIDEO_PREFIX_INITIALIZATION}-episode-0.mp4").unlink(missing_ok=True)
env = gym.make("LunarLander-v3", continuous=False, gravity=-10.0,
               enable_wind=False, wind_power=15.0, turbulence_power=1.5, render_mode = "rgb_array")
# The wrapper writes the video files into FIGS_DIR using the given name prefix.
env = gym.wrappers.RecordVideo(env, video_folder=str(FIGS_DIR), name_prefix=VIDEO_PREFIX_INITIALIZATION)
observation, info = env.reset()
end = False
while not end:
    action = env.action_space.sample()
    observation, reward, terminated, truncated, info = env.step(action)
    end = terminated or truncated
env.close()

In [None]:
# Display the recorded random-policy episode inline.
Video(
    FIGS_DIR / f"{VIDEO_PREFIX_INITIALIZATION}-episode-0.mp4",
    embed=True,
    html_attributes="controls autoplay loop",
)

### DQN

In [None]:
class LinearNet(torch.nn.Module):
    """
    Fully connected network mapping an observation vector to one value per action.

    With `n_layer == 1` the model is a single linear map and `hidden_dim` is
    not used. Otherwise it stacks `n_layer` linear layers of width
    `hidden_dim`, with a ReLU after every layer except the last.

    Parameters
    ----------
    dim_observation : int
        Size of the input observation vector.
    n_action : int
        Number of outputs (one per action).
    hidden_dim : int
        Width of the hidden layers.
    n_layer : int
        Total number of linear layers; must be strictly positive.
    """

    def __init__(self, dim_observation: int, n_action: int, hidden_dim: int, n_layer: int):
        super().__init__()
        self.dim_observation = dim_observation
        self.n_action = n_action
        self.hidden_dim = hidden_dim
        self.n_layer = n_layer
        assert n_layer > 0

        if n_layer == 1:
            self.layer = torch.nn.Linear(dim_observation, n_action)
            return

        # Input width of each ReLU-activated layer, followed by its output width.
        widths = [dim_observation] + [hidden_dim] * (n_layer - 1)
        modules = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            modules += [torch.nn.Linear(fan_in, fan_out), torch.nn.ReLU()]
        # Output layer: no activation.
        modules.append(torch.nn.Linear(hidden_dim, n_action))
        self.layer = torch.nn.Sequential(*modules)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the network output for the input tensor `x`."""
        return self.layer(x)



In [None]:
class EpsilonGreedy:
    """
    An Epsilon-Greedy policy.

    Attributes
    ----------
    epsilon : float
        The current probability of choosing a random action.
    epsilon_min : float
        The minimum probability of choosing a random action.
    epsilon_decay : float
        The multiplicative decay applied by each call to `decay_epsilon`.
    env : gym.Env
        The environment in which the agent is acting.
    q_network : torch.nn.Module
        The Q-Network used to estimate action values.

    Methods
    -------
    __call__(state: np.ndarray) -> int
        Select an action for the given state using the epsilon-greedy policy.
    decay_epsilon()
        Decay the epsilon value (one decay step per call).
    """

    def __init__(
        self,
        epsilon_start: float,
        epsilon_min: float,
        epsilon_decay: float,
        env: gym.Env,
        q_network: torch.nn.Module,
    ):
        """
        Initialize a new instance of EpsilonGreedy.

        Parameters
        ----------
        epsilon_start : float
            The initial probability of choosing a random action.
        epsilon_min : float
            The minimum probability of choosing a random action.
        epsilon_decay : float
            The multiplicative decay applied by each call to `decay_epsilon`.
        env : gym.Env
            The environment in which the agent is acting.
        q_network : torch.nn.Module
            The Q-Network used to estimate action values.
        """
        self.epsilon = epsilon_start
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.env = env
        self.q_network = q_network

    def __call__(self, state: np.ndarray) -> int:
        """
        Select an action for the given state using the epsilon-greedy policy.

        If a randomly chosen number is less than epsilon, a random action is chosen.
        Otherwise, the action with the highest estimated action value is chosen.

        Parameters
        ----------
        state : np.ndarray
            The current state of the environment.

        Returns
        -------
        int
            The chosen action, always as a plain Python integer.
        """
        if random.random() < self.epsilon:
            # Normalize to a Python int so both branches return the same type.
            return int(np.random.choice(self.env.action_space.n))

        # Build the input on the device the network actually lives on instead
        # of relying on a module-level `device` global.
        first_param = next(self.q_network.parameters(), None)
        network_device = first_param.device if first_param is not None else torch.device("cpu")

        with torch.no_grad():
            state_tensor = torch.tensor(state, dtype=torch.float32, device=network_device).unsqueeze(0)
            q_values = self.q_network(state_tensor)

        return int(torch.argmax(q_values).item())

    def decay_epsilon(self):
        """
        Apply one decay step to epsilon.

        The new epsilon value is the maximum of `epsilon_min` and the product of the current
        epsilon value and `epsilon_decay`. The caller decides how often to decay
        (for example once per episode).
        """
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

In [None]:
class MinimumExponentialLR(torch.optim.lr_scheduler.ExponentialLR):
    """Exponential learning-rate decay that never goes below a minimum value."""

    def __init__(
        self,
        optimizer: torch.optim.Optimizer,
        lr_decay: float,
        last_epoch: int = -1,
        min_lr: float = 1e-6,
    ):
        """
        Initialize a new instance of MinimumExponentialLR.

        Parameters
        ----------
        optimizer : torch.optim.Optimizer
            The optimizer whose learning rate should be scheduled.
        lr_decay : float
            The multiplicative factor of learning rate decay.
        last_epoch : int, optional
            The index of the last epoch. Default is -1.
        min_lr : float, optional
            The minimum learning rate. Default is 1e-6.
        """
        # Must be set before the parent constructor runs, because the parent
        # performs an initial step that already calls `get_lr`.
        self.min_lr = min_lr
        # Forward the caller's `last_epoch` (it was previously hard-coded to -1,
        # which silently ignored the argument).
        super().__init__(optimizer, lr_decay, last_epoch=last_epoch)

    def get_lr(self) -> List[float]:
        """
        Compute the learning rate of each parameter group.

        Returns
        -------
        List[float]
            For each group, `base_lr * gamma ** last_epoch` clamped from below
            at `min_lr`.
        """
        return [
            max(base_lr * self.gamma**self.last_epoch, self.min_lr)
            for base_lr in self.base_lrs
        ]

In [None]:
class ReplayBuffer:
    """
    A fixed-capacity store of transitions with uniform random sampling.

    Attributes
    ----------
    buffer : collections.deque
        A bounded double-ended queue holding the transitions; once it is full,
        appending a transition discards the oldest one.

    Methods
    -------
    add(state, action, reward, next_state, done)
        Append a transition to the buffer.
    sample(batch_size)
        Draw `batch_size` distinct transitions, returned column by column.
    __len__()
        Return the number of stored transitions.
    """

    def __init__(self, capacity: int):
        """
        Create an empty buffer.

        Parameters
        ----------
        capacity : int
            The maximum number of transitions kept in the buffer.
        """
        self.buffer: collections.deque = collections.deque(maxlen=capacity)

    def add(
        self,
        state: np.ndarray,
        action: np.int64,
        reward: float,
        next_state: np.ndarray,
        done: bool,
    ):
        """
        Append one transition to the buffer.

        Parameters
        ----------
        state : np.ndarray
            The state the action was taken in.
        action : np.int64
            The action that was taken.
        reward : float
            The reward that was received.
        next_state : np.ndarray
            The state that was reached.
        done : bool
            The end-of-episode flag recorded for this transition.
        """
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(
        self, batch_size: int
    ) -> Tuple[np.ndarray, Tuple[int], Tuple[float], np.ndarray, Tuple[bool]]:
        """
        Draw a batch of transitions without replacement.

        Parameters
        ----------
        batch_size : int
            The number of transitions to draw.

        Returns
        -------
        Tuple[np.ndarray, Tuple[int], Tuple[float], np.ndarray, Tuple[bool]]
            The states and next states stacked into arrays, and the actions,
            rewards and done flags as tuples; every item has `batch_size` entries.
        """
        drawn = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into one tuple per field.
        state_col, action_col, reward_col, next_state_col, done_col = zip(*drawn)
        stacked_states = np.array(state_col)
        stacked_next_states = np.array(next_state_col)
        return stacked_states, action_col, reward_col, stacked_next_states, done_col

    def __len__(self):
        """
        Return the number of transitions currently stored.

        Returns
        -------
        int
            The current size of the buffer.
        """
        return len(self.buffer)

In [None]:
def test_q_network_agent(
    env: gym.Env, q_network: torch.nn.Module, num_episode: int = 1
) -> List[float]:
    """
    Run a greedy agent in the given environment using the provided Q-network.

    At every step the action with the highest estimated Q-value is played.
    No learning happens here, so the forward passes run without gradient
    tracking.

    Parameters
    ----------
    env : gym.Env
        The environment in which to test the agent.
    q_network : torch.nn.Module
        The Q-network to use for decision making.
    num_episode : int, optional
        The number of episodes to run, by default 1.

    Returns
    -------
    List[float]
        A list of rewards per episode.
    """
    # Build inputs on the device the network lives on instead of relying on a
    # module-level `device` global.
    first_param = next(q_network.parameters(), None)
    network_device = first_param.device if first_param is not None else torch.device("cpu")

    episode_reward_list = []

    for _ in range(num_episode):
        state, info = env.reset()
        done = False
        episode_reward = 0.0

        while not done:
            with torch.no_grad():
                # Convert the state to a PyTorch tensor and add a batch dimension (unsqueeze)
                state_tensor = torch.tensor(state, dtype=torch.float32, device=network_device).unsqueeze(0)
                q_values = q_network(state_tensor)

            action = torch.argmax(q_values).item()

            state, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated

            episode_reward += float(reward)

        episode_reward_list.append(episode_reward)

    return episode_reward_list

In [None]:
# Freshly initialized (untrained) Q-network: 3 linear layers with hidden width 128.
q_network = LinearNet(LL_observation_dim, LL_action_number, hidden_dim=128, n_layer = 3).to(device)

In [None]:
# Record a few episodes played greedily by the untrained Q-network.
env = gym.make("LunarLander-v3", continuous=False, gravity=-10.0,
               enable_wind=False, wind_power=15.0, turbulence_power=1.5, render_mode = "rgb_array")
VIDEO_PREFIX_DQN_NAIVE_UNTRAINED = "_naive_untained"

NUM_EPISODES = 3

# One video file is expected per recorded episode.
file_path_list = [
    FIGS_DIR / f"{VIDEO_PREFIX_DQN_NAIVE_UNTRAINED}-episode-{episode_index}.mp4"
    for episode_index in range(NUM_EPISODES)
]

# Delete recordings left over from a previous run of this cell.
for file_path in file_path_list:
    file_path.unlink(missing_ok=True)

# `episode_trigger=lambda x: True` records every episode.
env = gym.wrappers.RecordVideo(
    env,
    video_folder=str(FIGS_DIR),
    name_prefix=VIDEO_PREFIX_DQN_NAIVE_UNTRAINED,
    episode_trigger=lambda x: True,
)
env = gym.wrappers.RecordEpisodeStatistics(env, buffer_length=NUM_EPISODES)

test_q_network_agent(env, q_network, num_episode=NUM_EPISODES)

# print(f'Episode time taken: {env.time_queue}')
# print(f'Episode total rewards: {env.return_queue}')
# print(f'Episode lengths: {env.length_queue}')

env.close()

print("\nSelect the episode to play here 👇\n")

# Widget that lets the user pick which recorded episode to display.
interact(video_selector, file_path=file_path_list)

In [None]:
def train_dqn2_agent(
    env: gym.Env,
    q_network: torch.nn.Module,
    target_q_network: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
    loss_fn: Callable,
    epsilon_greedy: EpsilonGreedy,
    device: torch.device,
    lr_scheduler: torch.optim.lr_scheduler.LRScheduler,
    num_episodes: int,
    gamma: float,
    batch_size: int,
    replay_buffer: ReplayBuffer,
    target_q_network_sync_period: int,
) -> List[float]:
    """
    Train the Q-network on the given environment.

    Parameters
    ----------
    env : gym.Env
        The environment to train on.
    q_network : torch.nn.Module
        The Q-network to train.
    target_q_network : torch.nn.Module
        The target Q-network to use for estimating the target Q-values.
    optimizer : torch.optim.Optimizer
        The optimizer to use for training.
    loss_fn : callable
        The loss function to use for training.
    epsilon_greedy : EpsilonGreedy
        The epsilon-greedy policy to use for action selection. Its epsilon is
        decayed once at the end of every episode.
    device : torch.device
        The device to use for PyTorch computations.
    lr_scheduler : torch.optim.lr_scheduler.LRScheduler
        The learning rate scheduler. It is stepped once per gradient update
        (not once per episode).
    num_episodes : int
        The number of episodes to train for.
    gamma : float
        The discount factor for future rewards.
    batch_size : int
        The size of the batch to use for training. Updates start once the
        buffer holds more than `batch_size` transitions.
    replay_buffer : ReplayBuffer
        The replay buffer the transitions are stored in and sampled from.
    target_q_network_sync_period : int
        The number of episodes after which the target Q-network should be updated with the weights of the Q-network.

    Returns
    -------
    List[float]
        A list of cumulated rewards per episode (`num_episodes` entries).
    """
    episode_reward_list = []

    # Episodes are numbered from 1 so the sync test below first fires after
    # `target_q_network_sync_period` full episodes; the upper bound is inclusive
    # so that exactly `num_episodes` episodes are played.
    for episode_index in tqdm(range(1, num_episodes + 1)):
        state, info = env.reset()
        episode_reward = 0.0

        while True:
            # Get action, next_state and reward

            action = epsilon_greedy(state)

            next_state, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated

            # Only a real terminal state may cancel the bootstrap term of the
            # target. A truncated episode (time limit) stops the rollout, but the
            # value of `next_state` is still meaningful, so `terminated` is
            # stored instead of `done`.
            replay_buffer.add(state, action, float(reward), next_state, terminated)

            episode_reward += float(reward)

            # Update the q_network weights with a batch of experiences from the buffer

            if len(replay_buffer) > batch_size:
                batch_states, batch_actions, batch_rewards, batch_next_states, batch_dones = replay_buffer.sample(batch_size)

                # Convert to PyTorch tensors
                batch_states_tensor = torch.tensor(batch_states, dtype=torch.float32, device=device)
                batch_actions_tensor = torch.tensor(batch_actions, dtype=torch.long, device=device)
                batch_rewards_tensor = torch.tensor(batch_rewards, dtype=torch.float32, device=device)
                batch_next_states_tensor = torch.tensor(batch_next_states, dtype=torch.float32, device=device)
                batch_dones_tensor = torch.tensor(batch_dones, dtype=torch.float32, device=device)

                # Compute the target Q values for the batch (no gradient flows
                # through the target network)
                with torch.no_grad():
                    next_state_q_values = target_q_network(batch_next_states_tensor)
                    next_state_max_q_values, _ = next_state_q_values.max(1)

                    targets = batch_rewards_tensor + (gamma * next_state_max_q_values * (1 - batch_dones_tensor))

                # Q-value of the action that was actually taken in each sampled state
                current_q_values = q_network(batch_states_tensor).gather(1, batch_actions_tensor.unsqueeze(1)).squeeze(1)

                # Compute loss
                loss = loss_fn(current_q_values, targets)

                # Optimize the model
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                lr_scheduler.step()

            if done:
                break

            state = next_state

        # Update the target q-network weights once every
        # `target_q_network_sync_period` episodes. Doing this inside the step
        # loop would re-copy the weights at every step of a sync episode, which
        # removes the benefit of a slowly-moving target for that whole episode.
        if episode_index % target_q_network_sync_period == 0:
            target_q_network.load_state_dict(q_network.state_dict())

        episode_reward_list.append(episode_reward)
        epsilon_greedy.decay_epsilon()

    return episode_reward_list

In [None]:
# Train the DQN agent NUMBER_OF_TRAININGS times from scratch and collect the
# per-episode rewards of every run in a single long-format DataFrame.
env = gym.make("LunarLander-v3", continuous=False, gravity=-10.0,
               enable_wind=False, wind_power=15.0, turbulence_power=1.5, render_mode = "rgb_array")
NUMBER_OF_TRAININGS = DEFAULT_NUMBER_OF_TRAININGS  # Change the default (global) value here if you want a specific number of trainings for this exercise
# Three parallel columns: episode index, episode reward, training-run index.
dqn2_trains_result_list: List[List[Union[int, float]]] = [[], [], []]

for train_index in range(NUMBER_OF_TRAININGS):
    # Instantiate required objects (everything is re-created for each run)

    q_network = LinearNet(LL_observation_dim, LL_action_number, hidden_dim=128, n_layer = 3).to(device)

    target_q_network = LinearNet(LL_observation_dim, LL_action_number, hidden_dim=128, n_layer = 3).to(device) # The target Q-network is used to compute the target Q-values for the loss function

    # Initialize the target Q-network with the same weights as the Q-network (c.f. the "Practical tips" section of the exercise)
    target_q_network.load_state_dict(q_network.state_dict())

    optimizer = torch.optim.AdamW(q_network.parameters(), lr=0.004, amsgrad=True)
    # lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.999)
    lr_scheduler = MinimumExponentialLR(optimizer, lr_decay=0.97, min_lr=0.0001)
    loss_fn = torch.nn.MSELoss()

    epsilon_greedy = EpsilonGreedy(
        epsilon_start=0.85,
        epsilon_min=0.010,
        epsilon_decay=0.9675,
        env=env,
        q_network=q_network,
    )

    replay_buffer = ReplayBuffer(3000)

    # Train the q-network

    episode_reward_list = train_dqn2_agent(
        env,
        q_network,
        target_q_network,
        optimizer,
        loss_fn,
        epsilon_greedy,
        device,
        lr_scheduler,
        num_episodes=200,
        gamma=0.9,
        batch_size=128,
        replay_buffer=replay_buffer,
        target_q_network_sync_period=12,
    )
    # Mean training reward of this run
    print(np.mean(episode_reward_list))
    dqn2_trains_result_list[0].extend(range(len(episode_reward_list)))
    dqn2_trains_result_list[1].extend(episode_reward_list)
    dqn2_trains_result_list[2].extend([train_index for _ in episode_reward_list])

# One row per (episode, training run). Despite its name, the
# "mean_final_episode_reward" column holds the raw reward of each episode.
dqn2_trains_result_df = pd.DataFrame(
    np.array(dqn2_trains_result_list).T,
    columns=["num_episodes", "mean_final_episode_reward", "training_index"],
)
dqn2_trains_result_df["agent"] = "DQN v2"

# Save the action-value estimation function
# (this runs after the loop, so only the network of the last training run is saved)

torch.save(q_network, MODELS_DIR / "project_dqn2_q_network.pth")

env.close()

In [None]:
# `DataFrame.to_csv` does not create missing parent folders, so make sure
# "./data" exists before writing the training results.
Path("./data").mkdir(parents=True, exist_ok=True)
dqn2_trains_result_df.to_csv("./data/project_dqn2_train_result.csv")

In [None]:
# Number of transitions held by the replay buffer of the last training run.
print(len(replay_buffer))

In [None]:
# Plot the reward curve of each training run as its own line
# (`estimator=None` with `units` disables aggregation across runs).
g = sns.relplot(
    x="num_episodes",
    y="mean_final_episode_reward",
    kind="line",
    hue="agent",
    estimator=None,
    units="training_index",
    data=dqn2_trains_result_df,
    height=7,
    aspect=2,
    alpha=0.5,
)
plt.savefig(PLOTS_DIR / "project_dqn2_trains_result.png")

In [None]:
# Record a few episodes played greedily by the trained Q-network
# (`q_network` is the network left by the last training run).
VIDEO_PREFIX_EX4_DQN2_TRAINED = "project_dqn2_tained"

NUM_EPISODES = 3

# One video file is expected per recorded episode.
file_path_list = [
    FIGS_DIR / f"{VIDEO_PREFIX_EX4_DQN2_TRAINED}-episode-{episode_index}.mp4"
    for episode_index in range(NUM_EPISODES)
]

# Delete recordings left over from a previous run of this cell.
for file_path in file_path_list:
    file_path.unlink(missing_ok=True)

env = gym.make("LunarLander-v3", continuous=False, gravity=-10.0,
               enable_wind=False, wind_power=15.0, turbulence_power=1.5, render_mode = "rgb_array")
# `episode_trigger=lambda x: True` records every episode.
env = gym.wrappers.RecordVideo(
    env,
    video_folder=str(FIGS_DIR),
    name_prefix=VIDEO_PREFIX_EX4_DQN2_TRAINED,
    episode_trigger=lambda x: True,
)
env = gym.wrappers.RecordEpisodeStatistics(env, buffer_length=NUM_EPISODES)

test_reward_list = test_q_network_agent(env, q_network, num_episode=NUM_EPISODES)

# print(f'Episode time taken: {env.time_queue}')
# print(f'Episode total rewards: {env.return_queue}')
# print(f'Episode lengths: {env.length_queue}')

env.close()

print("\nSelect the episode to play here 👇\n")

# Widget that lets the user pick which recorded episode to display.
interact(video_selector, file_path=file_path_list)

In [None]:
# Mean training reward over every episode of every training run.
mean_score_dqn2 = dqn2_trains_result_df["mean_final_episode_reward"].mean()
mean_score_dqn2

In [None]:
# Average the reward across training runs for each episode index, then keep
# the highest of those per-episode averages.
score_dqn2 = dqn2_trains_result_df[["num_episodes", "mean_final_episode_reward"]].groupby("num_episodes").mean().max()
score_dqn2

In [None]:
# Evaluate the trained Q-network greedily over 200 episodes and print the
# mean and best episode return.
env = gym.make("LunarLander-v3", continuous=False, gravity=-10.0,
               enable_wind=False, wind_power=15.0, turbulence_power=1.5, render_mode = "rgb_array")
dqn_reward_list = test_q_network_agent(env, q_network, num_episode=200)
dqn_reward_index = np.arange(200)
env.close()
print(np.mean(dqn_reward_list), np.max(dqn_reward_list))

In [None]:
# Plot the return of every evaluation episode of the trained DQN agent.
g = sns.relplot(
    x= dqn_reward_index,
    y= dqn_reward_list,
    kind="line",
    height=7,
    aspect=2,
    alpha=0.5,
)
# Save under a DQN-specific name: the previous target,
# "project_random_baseline.png", overwrote the random-baseline figure.
plt.savefig(PLOTS_DIR / "project_dqn2_test_result.png")

### Hyperparameter finetuning code for DQN

In [None]:
# Objective function for Optuna
def objective_dqn(trial):
    """
    Optuna objective: train one DQN agent with sampled hyperparameters.

    The returned score is meant to be minimized. It rewards a high mean
    training reward and penalizes a high spread and a low worst episode:
    `-mean + 0.4 * std - 0.2 * min` of the per-episode rewards.

    Parameters
    ----------
    trial : optuna.trial.Trial
        The trial used to sample the hyperparameters.

    Returns
    -------
    float
        The objective value of the trained agent (lower is better).
    """
    # Hyperparameters to be tuned
    hidden_size = trial.suggest_int('hidden_size', 32, 256, log = True)
    layer_num = trial.suggest_int('layer_num', 2,4)
    # `suggest_float(..., log=True)` replaces the deprecated `suggest_loguniform`.
    lr_start = trial.suggest_float('lr_start', 1e-3, 1e-2, log=True)
    lr_min = trial.suggest_float('lr_min', 1e-5, 1e-4, log=True)
    lr_decay = trial.suggest_float('lr_decay', 0.95, 0.99)
    epsilon_start = trial.suggest_float('epsilon_start', 0.8, 0.99)
    epsilon_min = trial.suggest_float('epsilon_min', 0.005, 0.015)
    epsilon_decay = trial.suggest_float('epsilon_decay', 0.95, 0.99)
    len_buffer = trial.suggest_int('len_buffer', 2000, 10000, step = 500)
    target_update_period = trial.suggest_int('target_update_period', 6, 20, step = 2)

    env = gym.make("LunarLander-v3", continuous=False, gravity=-10.0,
               enable_wind=False, wind_power=15.0, turbulence_power=1.5, render_mode = "rgb_array")
    # One environment is created per trial; close it even if training fails so
    # that a long study does not accumulate open environments.
    try:
        q_network = LinearNet(LL_observation_dim, LL_action_number, hidden_dim= hidden_size, n_layer = layer_num).to(device)

        # The target Q-network is used to compute the target Q-values for the loss function
        target_q_network = LinearNet(LL_observation_dim, LL_action_number, hidden_dim=hidden_size, n_layer = layer_num).to(device)

        # Initialize the target Q-network with the same weights as the Q-network
        target_q_network.load_state_dict(q_network.state_dict())

        optimizer = torch.optim.AdamW(q_network.parameters(), lr=lr_start, amsgrad=True)
        lr_scheduler = MinimumExponentialLR(optimizer, lr_decay=lr_decay, min_lr=lr_min)
        loss_fn = torch.nn.MSELoss()

        epsilon_greedy = EpsilonGreedy(
            epsilon_start=epsilon_start,
            epsilon_min=epsilon_min,
            epsilon_decay=epsilon_decay,
            env=env,
            q_network=q_network,
        )

        replay_buffer = ReplayBuffer(len_buffer)

        # Train the q-network

        episode_reward_list = train_dqn2_agent(
            env,
            q_network,
            target_q_network,
            optimizer,
            loss_fn,
            epsilon_greedy,
            device,
            lr_scheduler,
            num_episodes=200,
            gamma=0.9,
            batch_size=128,
            replay_buffer=replay_buffer,
            target_q_network_sync_period=target_update_period,
        )
    finally:
        env.close()

    mean_reward = np.mean(episode_reward_list)
    std_reward = np.std(episode_reward_list)
    min_reward = np.min(episode_reward_list)

    # Weights of the spread penalty and of the worst-episode bonus.
    lambda_std = 0.4
    lambda_min = 0.2

    objective_value = -mean_reward + lambda_std * std_reward - lambda_min * min_reward

    return objective_value





In [None]:
# Optimize hyperparameters
# Optimize hyperparameters: run 50 trials minimizing `objective_dqn`,
# then keep the best parameters, objective value and trial.
study = optuna.create_study(direction='minimize')
study.optimize(objective_dqn, n_trials=50)
best_hyperparams = study.best_params
best_objective_value = study.best_value
best_trial = study.best_trial

### Policy gradient

In [None]:
class PolicyNetwork(torch.nn.Module):
    """
    Fully connected policy network producing a probability for each action.

    With `n_layer == 1` the model is a single linear map and `hidden_dim` is
    not used. Otherwise it stacks `n_layer` linear layers of width
    `hidden_dim`, with a ReLU after every layer except the last. A softmax over
    the last dimension turns the outputs into action probabilities.

    Parameters
    ----------
    dim_observation : int
        Size of the input observation vector.
    n_action : int
        Number of discrete actions.
    hidden_dim : int, optional
        Width of the hidden layers, by default 128.
    n_layer : int, optional
        Total number of linear layers (strictly positive), by default 3.
        The defaults let `PolicyNetwork(dim_observation, n_action)` be
        constructed directly.
    """

    def __init__(self, dim_observation: int, n_action: int, hidden_dim: int = 128, n_layer: int = 3):
        super().__init__()
        self.dim_observation = dim_observation
        self.n_action = n_action
        self.hidden_dim = hidden_dim
        self.n_layer = n_layer
        assert n_layer > 0
        if n_layer == 1:
            self.layer = torch.nn.Linear(dim_observation, n_action)
        else:
            layers = [torch.nn.Linear(dim_observation, hidden_dim), torch.nn.ReLU()]
            for _ in range(n_layer - 2):
                layers.append(torch.nn.Linear(hidden_dim, hidden_dim))
                layers.append(torch.nn.ReLU())
            layers.append(torch.nn.Linear(hidden_dim, n_action))
            self.layer = torch.nn.Sequential(*layers)

    def forward(self, state_tensor: torch.Tensor) -> torch.Tensor:
        """
        Compute the action probabilities for the given state(s).

        Parameters
        ----------
        state_tensor : torch.Tensor
            A state or a batch of states; the last dimension is the observation.

        Returns
        -------
        torch.Tensor
            The action probabilities, normalized over the last dimension.
        """
        action_tensor = self.layer(state_tensor)
        # Normalize explicitly over the action dimension. Without `dim`, the
        # axis is picked implicitly from the number of input dimensions, which
        # is deprecated and selects the wrong axis for some input ranks.
        return F.softmax(action_tensor, dim=-1)

In [None]:
def sample_discrete_action(
    policy_nn: PolicyNetwork, state: np.ndarray
) -> Tuple[int, torch.Tensor]:
    """
    Draw one discrete action from the policy for the given state.

    The policy network output is used as the probabilities of a categorical
    distribution, from which the action is sampled.

    Parameters
    ----------
    policy_nn : PolicyNetwork
        The policy network providing the action probabilities.
    state : np.ndarray
        The state for which an action is needed.

    Returns
    -------
    Tuple[int, torch.Tensor]
        The sampled action and its log probability under the policy.

    """
    # Float32 tensor on the notebook-level `device`, with a leading batch dimension.
    observation_batch = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)

    # Categorical distribution over the actions; see
    # https://pytorch.org/docs/stable/distributions.html#categorical
    action_distribution = torch.distributions.categorical.Categorical(
        probs=policy_nn(observation_batch)
    )

    drawn_action = action_distribution.sample()

    # The action is returned as a Python int, its log probability as a tensor.
    return drawn_action.item(), action_distribution.log_prob(drawn_action)

In [None]:
# Smoke test: sample one action from a freshly initialized policy network.
env = gym.make("LunarLander-v3", continuous=False, gravity=-10.0,
               enable_wind=False, wind_power=15.0, turbulence_power=1.5, render_mode = "rgb_array")

# `PolicyNetwork` also needs the hidden width and the number of layers; the
# two-argument call raised a TypeError. The values mirror the DQN network.
policy_nn = PolicyNetwork(LL_observation_dim, LL_action_number, hidden_dim=128, n_layer=3).to(device)

state, info = env.reset()
theta = list(policy_nn.parameters())
action, action_log_probability = sample_discrete_action(policy_nn, state)

print("state:", state)
print("theta:", theta)
print("sampled action:", action)
print("log probability of the sampled action:", action_log_probability)

env.close()

In [None]:
def sample_one_episode(
    env: gym.Env, policy_nn: PolicyNetwork, max_episode_duration: int
) -> Tuple[List[np.ndarray], List[int], List[float], List[torch.Tensor]]:
    """
    Play a single episode in `env`, sampling every action from `policy_nn`.

    Parameters
    ----------
    env : gym.Env
        The environment to play in.
    policy_nn : PolicyNetwork
        The policy neural network.
    max_episode_duration : int
        The maximum number of steps played before the rollout is stopped.

    Returns
    -------
    Tuple[List[np.ndarray], List[int], List[float], List[torch.Tensor]]
        The visited states (initial state included, so one more entry than the
        other lists), then the actions, rewards and action log probabilities of
        each time step.
    """
    current_state, _ = env.reset()

    # The initial state is recorded before any action is taken.
    visited_states = [current_state]
    taken_actions = []
    collected_rewards = []
    log_probabilities = []

    for _ in range(max_episode_duration):
        # Sample from the policy in the current state, then advance the environment.
        chosen_action, chosen_log_prob = sample_discrete_action(policy_nn, current_state)
        current_state, step_reward, terminated, truncated, _ = env.step(chosen_action)

        visited_states.append(current_state)
        taken_actions.append(chosen_action)
        log_probabilities.append(chosen_log_prob)
        collected_rewards.append(float(step_reward))

        # Stop on a terminal state or when the environment truncates the episode.
        if terminated or truncated:
            break

    return visited_states, taken_actions, collected_rewards, log_probabilities

In [None]:
# Record a few episodes played by untrained (randomly initialized) policies.
VIDEO_PREFIX_REINFORCE_UNTRAINED = "project_reinforce_untained"

NUM_EPISODES = 3

# One video file is expected per recorded episode.
file_path_list = [
    FIGS_DIR / f"{VIDEO_PREFIX_REINFORCE_UNTRAINED}-episode-{episode_index}.mp4"
    for episode_index in range(NUM_EPISODES)
]

# Delete recordings left over from a previous run of this cell.
for file_path in file_path_list:
    file_path.unlink(missing_ok=True)

env = gym.make("LunarLander-v3", continuous=False, gravity=-10.0,
               enable_wind=False, wind_power=15.0, turbulence_power=1.5, render_mode = "rgb_array")
# `episode_trigger=lambda x: True` records every episode.
env = gym.wrappers.RecordVideo(
    env,
    video_folder=str(FIGS_DIR),
    name_prefix=VIDEO_PREFIX_REINFORCE_UNTRAINED,
    episode_trigger=lambda x: True,
)
env = gym.wrappers.RecordEpisodeStatistics(env, buffer_length=NUM_EPISODES)

for episode_index in range(NUM_EPISODES):
    # A new policy is initialized for every episode. The observation size comes
    # from the public `shape` attribute, and the network is given its hidden
    # width and layer count (the two-argument call raised a TypeError).
    policy_nn = PolicyNetwork(
        env.observation_space.shape[0], env.action_space.n, hidden_dim=128, n_layer=3
    ).to(device)
    episode_states, episode_actions, episode_rewards, episode_log_prob_actions = sample_one_episode(env,policy_nn,max_episode_duration = 500)

print(f"Episode time taken: {env.time_queue}")
print(f"Episode total rewards: {env.return_queue}")
print(f"Episode lengths: {env.length_queue}")

env.close()

print("\nSelect the episode to play here 👇\n")

# Widget that lets the user pick which recorded episode to display.
interact(video_selector, file_path=file_path_list)

In [None]:
def avg_return_on_multiple_episodes(
    env: gym.Env,
    policy_nn: PolicyNetwork,
    num_test_episode: int,
    max_episode_duration: int,
) -> float:
    """
    Play multiple episodes of the environment and calculate the average return.

    The return of an episode is the undiscounted sum of its rewards. Episodes
    are sampled with `sample_one_episode` under `torch.no_grad()` because no
    gradient is needed for evaluation.

    Parameters
    ----------
    env : gym.Env
        The environment to play in.
    policy_nn : PolicyNetwork
        The policy neural network.
    num_test_episode : int
        The number of episodes to play. Must be strictly positive.
    max_episode_duration : int
        The maximum duration of an episode.

    Returns
    -------
    float
        The average return.

    Raises
    ------
    ValueError
        If `num_test_episode` is not strictly positive.
    """
    if num_test_episode <= 0:
        raise ValueError(f"num_test_episode must be positive, got {num_test_episode}")

    total_return = 0.0
    with torch.no_grad():
        for _ in range(num_test_episode):
            # Only the rewards are needed here; states, actions and log-probabilities are ignored.
            _, _, episode_rewards, _ = sample_one_episode(env, policy_nn, max_episode_duration)
            total_return += sum(episode_rewards)

    return total_return / num_test_episode

In [None]:
# Baseline: average return of a randomly initialised policy over 20 episodes.
env = gym.make("LunarLander-v3", continuous=False, gravity=-10.0,
               enable_wind=False, wind_power=15.0, turbulence_power=1.5, render_mode="rgb_array")

try:
    policy_nn = PolicyNetwork(env.observation_space.shape[0], env.action_space.n).to(device)
    average_return = avg_return_on_multiple_episodes(
        env=env,
        policy_nn=policy_nn,
        num_test_episode=20,
        max_episode_duration=500,
    )

    print(average_return)
finally:
    # Close the environment even if the evaluation fails.
    env.close()

In [None]:
def train_reinforce_discrete(
    env: gym.Env,
    num_train_episodes: int,
    num_test_per_episode: int,
    max_episode_duration: int,
    learning_rate: float,
    gamma: float = 1.0,
) -> Tuple[PolicyNetwork, List[float]]:
    """
    Train a policy using the REINFORCE algorithm.

    One gradient step is taken per sampled episode: the log-probability of
    every action is weighted by the reward-to-go from its time step. After
    each update the policy is evaluated with `avg_return_on_multiple_episodes`.

    Parameters
    ----------
    env : gym.Env
        The environment to train in.
    num_train_episodes : int
        The number of training episodes.
    num_test_per_episode : int
        The number of tests to perform per episode.
    max_episode_duration : int
        The maximum length of an episode.
    learning_rate : float
        The initial step size.
    gamma : float, optional
        Discount factor applied to future rewards. The default of 1.0 keeps
        the returns undiscounted.

    Returns
    -------
    Tuple[PolicyNetwork, List[float]]
        The final trained policy and the average returns for each episode.
    """
    episode_avg_return_list = []

    policy_nn = PolicyNetwork(env.observation_space.shape[0], env.action_space.n, 128, 3).to(device)
    optimizer = torch.optim.Adam(policy_nn.parameters(), lr=learning_rate)

    for _ in tqdm(range(num_train_episodes)):
        _, _, episode_rewards, episode_log_prob_actions = sample_one_episode(
            env, policy_nn, max_episode_duration
        )

        # An empty rollout carries no learning signal; skip the update.
        if episode_rewards:
            # Reward-to-go, accumulated backwards in O(T) and then put back in time order.
            reward_to_go = []
            running_return = 0.0
            for reward in reversed(episode_rewards):
                running_return = reward + gamma * running_return
                reward_to_go.append(running_return)
            reward_to_go.reverse()
            episode_returns = torch.tensor(reward_to_go, dtype=torch.float32, device=device)

            # Flatten so that per-step log-probabilities of shape () or (1,)
            # both line up with the (T,) vector of returns.
            log_probs = torch.stack(list(episode_log_prob_actions)).reshape(-1)
            loss = -(log_probs * episode_returns).sum()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Test the current policy
        test_avg_return = avg_return_on_multiple_episodes(
            env=env,
            policy_nn=policy_nn,
            num_test_episode=num_test_per_episode,
            max_episode_duration=max_episode_duration,
        )

        # Monitoring
        episode_avg_return_list.append(test_avg_return)

    return policy_nn, episode_avg_return_list

In [None]:
# Train REINFORCE several times and gather the per-episode test returns of every run.
env = gym.make(
    "LunarLander-v3",
    continuous=False,
    gravity=-10.0,
    enable_wind=False,
    wind_power=15.0,
    turbulence_power=1.5,
    render_mode="rgb_array",
)

# Change the default (global) value here if you want a specific number of trainings for this exercise
NUMBER_OF_TRAININGS = DEFAULT_NUMBER_OF_TRAININGS
# Three parallel columns: episode index, average test return, training-run index.
reinforce_trains_result_list: List[List[Union[int, float]]] = [[], [], []]

for train_index in range(NUMBER_OF_TRAININGS):
    reinforce_policy_nn, episode_reward_list = train_reinforce_discrete(
        env=env,
        num_train_episodes=200,
        num_test_per_episode=5,
        max_episode_duration=500,
        learning_rate=0.001,
    )

    reinforce_trains_result_list[0].extend(range(len(episode_reward_list)))
    reinforce_trains_result_list[1].extend(episode_reward_list)
    reinforce_trains_result_list[2].extend([train_index] * len(episode_reward_list))

# One row per (run, episode); the three columns become the DataFrame columns.
reinforce_trains_result_df = pd.DataFrame(
    np.transpose(np.array(reinforce_trains_result_list)),
    columns=["num_episodes", "mean_final_episode_reward", "training_index"],
).assign(agent="REINFORCE")

# Save the policy network of the last training run
torch.save(reinforce_policy_nn, MODELS_DIR / "project_reinforce_policy_network.pth")

env.close()

In [None]:
# `DataFrame.to_csv` does not create missing parent directories, so make sure
# ./data exists before writing the REINFORCE training results.
Path("./data").mkdir(parents=True, exist_ok=True)
reinforce_trains_result_df.to_csv("./data/project_reinforce_trains_result.csv")

In [None]:
# One semi-transparent line per training run (no aggregation across runs).
g = sns.relplot(
    data=reinforce_trains_result_df,
    kind="line",
    x="num_episodes",
    y="mean_final_episode_reward",
    hue="agent",
    units="training_index",
    estimator=None,
    alpha=0.5,
    height=7,
    aspect=2,
)
plt.savefig(PLOTS_DIR / "project_reinforce_trains_result.png")

In [None]:
# Mean of the per-episode average test returns, taken over every episode of every training run.
mean_score_reinforce = reinforce_trains_result_df["mean_final_episode_reward"].mean()

# Bare expression so the notebook displays the value.
mean_score_reinforce

In [None]:
# Final evaluation of the policy from the last training run: average return over 200 episodes.
env = gym.make(
    "LunarLander-v3",
    continuous=False,
    gravity=-10.0,
    enable_wind=False,
    wind_power=15.0,
    turbulence_power=1.5,
    render_mode="rgb_array",
)
mean_test_reward = avg_return_on_multiple_episodes(
    env=env,
    policy_nn=reinforce_policy_nn,
    num_test_episode=200,
    max_episode_duration=500,
)
env.close()
mean_test_reward

In [None]:
# Load DQN training results saved on disk so they can be compared with REINFORCE below.
# NOTE(review): presumably written by an earlier DQN cell with the same columns as
# `reinforce_trains_result_df` — confirm the file exists and matches before plotting.
dqn2_trains_result_df = pd.read_csv("./data/project_dqn2_train_result.csv")

In [None]:
# Stack the DQN and REINFORCE results and plot one aggregated curve per agent.
all_trains_result_df = pd.concat(objs=[dqn2_trains_result_df, reinforce_trains_result_df])
g = sns.relplot(
    data=all_trains_result_df,
    kind="line",
    x="num_episodes",
    y="mean_final_episode_reward",
    hue="agent",
    height=7,
    aspect=2,
)
plt.savefig(PLOTS_DIR / "project_reinforce_trains_result_agg.png")

In [None]:
# Record a few episodes played greedily by the trained REINFORCE policy.
VIDEO_PREFIX_PROJECT_REINFORCE_TRAINED = "project_reinforce_tained"

NUM_EPISODES = 3

# Paths where the video recorder is expected to write one file per episode.
file_path_list = [
    FIGS_DIR / f"{VIDEO_PREFIX_PROJECT_REINFORCE_TRAINED}-episode-{episode_index}.mp4"
    for episode_index in range(NUM_EPISODES)
]

# Remove recordings left over from a previous run of this cell.
for file_path in file_path_list:
    file_path.unlink(missing_ok=True)

env = gym.make("LunarLander-v3", continuous=False, gravity=-10.0,
               enable_wind=False, wind_power=15.0, turbulence_power=1.5, render_mode="rgb_array")
env = gym.wrappers.RecordVideo(
    env,
    video_folder=str(FIGS_DIR),
    name_prefix=VIDEO_PREFIX_PROJECT_REINFORCE_TRAINED,
    episode_trigger=lambda x: True,  # the trigger accepts every episode index
)
env = gym.wrappers.RecordEpisodeStatistics(env, buffer_length=NUM_EPISODES)

max_episode_duration = 500
try:
    # Inference only: no gradient is needed while replaying the trained policy.
    with torch.no_grad():
        for episode_index in range(NUM_EPISODES):
            state_t, info = env.reset()
            episode_states = []
            episode_actions = []
            episode_rewards = []
            episode_states.append(state_t)

            for t in range(max_episode_duration):
                state_tensor_t = torch.tensor(state_t, dtype=torch.float32, device=device).unsqueeze(0)
                actions_probability_distribution_params = reinforce_policy_nn(state_tensor_t)
                # Greedy action: index of the largest network output (no sampling).
                action_t = torch.argmax(actions_probability_distribution_params).item()
                state_t, reward_t, terminated, truncated, info = env.step(action_t)
                done = terminated or truncated
                episode_states.append(state_t)
                episode_actions.append(action_t)
                episode_rewards.append(float(reward_t))
                if done:
                    break

    print(f"Episode time taken: {env.time_queue}")
    print(f"Episode total rewards: {env.return_queue}")
    print(f"Episode lengths: {env.length_queue}")
finally:
    # Close the environment even if a rollout fails.
    env.close()

print("\nSelect the episode to play here 👇\n")

interact(video_selector, file_path=file_path_list)

### Hyperparameter Fine-Tuning Code for REINFORCE

In [None]:
def new_train_reinforce_discrete(
    env: gym.Env,
    hidden_dim: int,
    n_layers: int,
    num_train_episodes: int,
    num_test_per_episode: int,
    max_episode_duration: int,
    lr_start: float = 0.01,
    lr_decay: float = 0.97,
    min_lr: float = 0.0001,
    gamma: float = 1.0,
) -> Tuple[PolicyNetwork, List[float]]:
    """
    Train a REINFORCE policy with a configurable network and a decaying learning rate.

    Same procedure as `train_reinforce_discrete` (one reward-to-go weighted
    gradient step per sampled episode, followed by an evaluation), with the
    network size exposed and a `MinimumExponentialLR` scheduler stepped after
    every update.

    Parameters
    ----------
    env : gym.Env
        The environment to train in.
    hidden_dim : int
        Hidden size passed to `PolicyNetwork`.
    n_layers : int
        Number of layers passed to `PolicyNetwork`.
    num_train_episodes : int
        The number of training episodes.
    num_test_per_episode : int
        The number of test episodes played after each training episode.
    max_episode_duration : int
        The maximum length of an episode.
    lr_start : float, optional
        Initial learning rate of the Adam optimizer.
    lr_decay : float, optional
        Decay factor passed to `MinimumExponentialLR`.
    min_lr : float, optional
        Minimum learning rate passed to `MinimumExponentialLR`.
    gamma : float, optional
        Discount factor applied to future rewards. The default of 1.0 keeps
        the returns undiscounted.

    Returns
    -------
    Tuple[PolicyNetwork, List[float]]
        The final trained policy and the average test return after each training episode.
    """
    episode_avg_return_list = []

    policy_nn = PolicyNetwork(env.observation_space.shape[0], env.action_space.n, hidden_dim, n_layers).to(device)
    optimizer = torch.optim.Adam(policy_nn.parameters(), lr=lr_start)
    lr_scheduler = MinimumExponentialLR(optimizer, lr_decay=lr_decay, min_lr=min_lr)

    for _ in tqdm(range(num_train_episodes)):
        _, _, episode_rewards, episode_log_prob_actions = sample_one_episode(
            env, policy_nn, max_episode_duration
        )

        # An empty rollout carries no learning signal; skip the update.
        if episode_rewards:
            # Reward-to-go, accumulated backwards in O(T) and then put back in time order.
            reward_to_go = []
            running_return = 0.0
            for reward in reversed(episode_rewards):
                running_return = reward + gamma * running_return
                reward_to_go.append(running_return)
            reward_to_go.reverse()
            episode_returns = torch.tensor(reward_to_go, dtype=torch.float32, device=device)

            # Flatten so that per-step log-probabilities of shape () or (1,)
            # both line up with the (T,) vector of returns.
            log_probs = torch.stack(list(episode_log_prob_actions)).reshape(-1)
            loss = -(log_probs * episode_returns).sum()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # The scheduler is stepped only after an optimizer step.
            lr_scheduler.step()

        # Test the current policy
        test_avg_return = avg_return_on_multiple_episodes(
            env=env,
            policy_nn=policy_nn,
            num_test_episode=num_test_per_episode,
            max_episode_duration=max_episode_duration,
        )

        # Monitoring
        episode_avg_return_list.append(test_avg_return)

    return policy_nn, episode_avg_return_list

In [None]:
def objective_pg(trial):
    """
    Optuna objective for tuning the REINFORCE hyperparameters.

    Samples a network size and a learning-rate schedule, trains one agent with
    `new_train_reinforce_discrete`, and scores the run from its per-episode
    average test returns.

    Parameters
    ----------
    trial : optuna.trial.Trial
        The trial used to sample the hyperparameters.

    Returns
    -------
    float
        The cost to minimize: ``-mean + 0.4 * std - 0.2 * min`` of the
        per-episode average test returns.
    """
    # Hyperparameters to be tuned
    hidden_size = trial.suggest_int('hidden_size', 32, 256, log=True)
    layer_num = trial.suggest_int('layer_num', 2, 4)
    # `suggest_float(..., log=True)` replaces the deprecated `suggest_loguniform`.
    lr_start = trial.suggest_float('lr_start', 1e-3, 1e-2, log=True)
    lr_min = trial.suggest_float('lr_min', 1e-5, 1e-4, log=True)
    lr_decay = trial.suggest_float('lr_decay', 0.95, 0.99)

    env = gym.make("LunarLander-v3", continuous=False, gravity=-10.0,
                   enable_wind=False, wind_power=15.0, turbulence_power=1.5, render_mode="rgb_array")

    try:
        _, episode_reward_list = new_train_reinforce_discrete(
            env=env,
            hidden_dim=hidden_size,
            n_layers=layer_num,
            num_train_episodes=200,
            num_test_per_episode=5,
            max_episode_duration=500,
            lr_start=lr_start,
            lr_decay=lr_decay,
            min_lr=lr_min,
        )
    finally:
        # One environment is created per trial; release it even if training fails.
        env.close()

    mean_reward = np.mean(episode_reward_list)
    std_reward = np.std(episode_reward_list)
    min_reward = np.min(episode_reward_list)

    # Weights of the variability penalty and of the worst-episode term.
    lambda_std = 0.4
    lambda_min = 0.2

    objective_value = -mean_reward + lambda_std * std_reward - lambda_min * min_reward

    return objective_value

In [None]:
# Optimize hyperparameters
# `objective_pg` returns a cost (it negates the mean reward), hence direction='minimize'.
# Each of the 100 trials runs a full REINFORCE training inside `objective_pg`.
study = optuna.create_study(direction='minimize')
study.optimize(objective_pg, n_trials=100)
# Keep the best hyperparameters, their objective value and the corresponding trial.
best_hyperparams = study.best_params
best_objective_value = study.best_value
best_trial = study.best_trial

## Continuous Environment

In [None]:
# Box2D build and runtime dependencies for the LunarLander environment.
!pip install swig
!pip install box2d-py==2.3.8 --no-build-isolation
# The extras specifier is quoted so the shell cannot treat the brackets as a glob pattern.
!pip install "gymnasium[box2d]"

In [None]:
# Continuous-action variant of LunarLander (continuous=True), used by the PPO section below.
# NOTE(review): this environment is not closed in this cell; later cells rebind `env` — confirm it is not leaked.
env = gym.make("LunarLander-v3", continuous=True, gravity=-10.0,
               enable_wind=False, wind_power=15.0, turbulence_power=1.5, render_mode = "rgb_array")

### Proximal Policy Optimization

In [None]:
class Actor(torch.nn.Module):
    """
    Gaussian policy network for continuous actions.

    A two-hidden-layer MLP feeds two linear heads: the mean (squashed to
    [-1, 1] with tanh) and the log standard deviation of an independent
    Normal distribution per action dimension.
    """

    # Bounds applied to the predicted log standard deviation before `exp`,
    # so the standard deviation can neither overflow to inf nor underflow to 0.
    LOG_STD_MIN = -20.0
    LOG_STD_MAX = 2.0

    def __init__(self, state_dim, action_dim):
        super().__init__()
        self.fc = torch.nn.Sequential(torch.nn.Linear(state_dim, 128), torch.nn.ReLU(), torch.nn.Linear(128,128), torch.nn.ReLU())
        self.mu = torch.nn.Linear(128, action_dim)
        self.log_std = torch.nn.Linear(128, action_dim)

    def forward(self, state):
        """Return the mean and standard deviation of the action distribution for `state`."""
        x = self.fc(state)
        mu = torch.tanh(self.mu(x))
        log_std = self.log_std(x).clamp(self.LOG_STD_MIN, self.LOG_STD_MAX)
        std = torch.exp(log_std)
        return mu, std

    def _distribution(self, state):
        """Build the Normal distribution shared by `get_action` and `get_log_prob`."""
        mu, std = self.forward(state)
        return torch.distributions.Normal(mu, std)

    def get_action(self, state):
        """
        Sample an action for `state`.

        Returns the sampled action clamped to [-1, 1] and the log-probability
        of that clamped action, summed over the action dimensions.
        """
        dist = self._distribution(state)
        action = dist.rsample().clamp(-1, 1)
        log_prob = dist.log_prob(action).sum(dim=-1)
        return action, log_prob

    def get_log_prob(self, state, action):
        """Return the log-probability of `action` under the current policy, summed over action dimensions."""
        return self._distribution(state).log_prob(action).sum(dim=-1)



class Critic(torch.nn.Module):
    """State-value network: a two-hidden-layer MLP mapping a state to a single output."""

    def __init__(self, state_dim):
        super().__init__()
        hidden_units = 128
        # Layers are created in the same order as they are applied.
        layers = [
            torch.nn.Linear(state_dim, hidden_units),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_units, hidden_units),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_units, 1),
        ]
        self.fc = torch.nn.Sequential(*layers)

    def forward(self, state):
        """Return the network output for `state`; the last dimension has size 1."""
        value = self.fc(state)
        return value

In [None]:
def evaluate_policy(actor, env, num_test_episodes=5, max_episode_duration = 500):
    """
    Play several episodes with `actor` and return the average episode reward.

    Actions are obtained from `actor.get_action`; the rollouts run under
    `torch.no_grad()` because no gradient is needed for evaluation.

    Parameters
    ----------
    actor
        Policy exposing `get_action(state)`, which returns `(action, log_prob)`.
    env
        Environment following the Gymnasium `reset` / `step` API.
    num_test_episodes : int, optional
        Number of episodes to play. Must be strictly positive.
    max_episode_duration : int, optional
        Maximum number of steps per episode.

    Returns
    -------
    float
        The sum of rewards per episode, averaged over the episodes.

    Raises
    ------
    ValueError
        If `num_test_episodes` is not strictly positive.
    """
    if num_test_episodes <= 0:
        raise ValueError(f"num_test_episodes must be positive, got {num_test_episodes}")

    total_reward = 0.0
    with torch.no_grad():
        for _episode in range(num_test_episodes):
            state, _info = env.reset()
            episode_reward = 0.0

            for _step in range(max_episode_duration):
                state_tensor = torch.tensor(state, dtype=torch.float32)
                action, _log_prob = actor.get_action(state_tensor)
                state, reward, terminated, truncated, _info = env.step(action.detach().numpy())
                episode_reward += reward
                if terminated or truncated:
                    break

            total_reward += episode_reward

    return total_reward / num_test_episodes

In [None]:
class ReplayBuffer:
    """
    Fixed-capacity FIFO store of (state, action, log_prob, reward, value) tuples.

    NOTE(review): if another `ReplayBuffer` is defined earlier in the notebook,
    this definition rebinds the name — confirm that later cells instantiating
    `ReplayBuffer` get the class they expect.
    """

    def __init__(self, capacity):
        # A bounded deque drops the oldest entry once `capacity` is reached.
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, log_prob, reward, value):
        """Append one transition to the buffer."""
        transition = (state, action, log_prob, reward, value)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """
        Draw `batch_size` distinct transitions uniformly at random.

        Returns a 5-tuple of tensors: stacked states, actions and
        log-probabilities, then float32 tensors of rewards and values.
        """
        chosen = np.random.choice(len(self.buffer), batch_size, replace=False)
        transitions = [self.buffer[position] for position in chosen]
        states, actions, log_probs, rewards, values = zip(*transitions)
        stacked = tuple(torch.stack(column) for column in (states, actions, log_probs))
        scalars = tuple(torch.tensor(column, dtype=torch.float32) for column in (rewards, values))
        return stacked + scalars

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

In [None]:
def compute_advantage(rewards, values, gamma=0.99, lam=0.95):
    """
    Compute GAE(lambda) advantages and discounted returns for one trajectory.

    The value after the last step is taken to be 0, so a trajectory that was
    cut off by a step limit is treated like a terminated one.

    Parameters
    ----------
    rewards : sequence of float
        Reward received at each step.
    values : torch.Tensor or sequence of float
        Critic estimate for the state at each step; same length as `rewards`.
    gamma : float, optional
        Discount factor.
    lam : float, optional
        GAE lambda. With ``lam=1`` the advantage equals ``returns - values``.

    Returns
    -------
    Tuple[torch.Tensor, torch.Tensor]
        ``(advantage, returns)``, both float32 tensors of shape ``(T,)``.

    Raises
    ------
    ValueError
        If `rewards` and `values` do not have the same length.
    """
    value_list = torch.as_tensor(values, dtype=torch.float32).flatten().tolist()
    reward_list = [float(reward) for reward in rewards]
    if len(value_list) != len(reward_list):
        raise ValueError(
            f"rewards and values must have the same length, got {len(reward_list)} and {len(value_list)}"
        )

    advantages = []
    returns = []
    gae = 0.0
    discounted_return = 0.0
    next_value = 0.0  # bootstrap value after the final step
    # Walk the trajectory backwards; results are appended and reversed at the end (O(T)).
    for reward, value in zip(reversed(reward_list), reversed(value_list)):
        delta = reward + gamma * next_value - value
        gae = delta + gamma * lam * gae
        discounted_return = reward + gamma * discounted_return
        advantages.append(gae)
        returns.append(discounted_return)
        next_value = value
    advantages.reverse()
    returns.reverse()

    advantage = torch.tensor(advantages, dtype=torch.float32)
    returns = torch.tensor(returns, dtype=torch.float32)
    return advantage, returns

def sample_trajectory(env, actor, critic, max_steps=1000):
    """
    Roll out one episode of at most `max_steps` steps.

    At every step the critic is queried for the state value and the actor for
    an action and its log-probability; the action is handed to `env.step` as a
    NumPy array. The rollout stops early when the environment reports
    termination or truncation.

    Returns
    -------
    tuple
        ``(states, actions, log_probs, rewards, values)`` where the first four
        are lists with one entry per step (states as float32 tensors) and
        `values` is a tensor of the critic outputs.
    """
    states, actions, log_probs, rewards, values = [], [], [], [], []

    reset_result = env.reset()
    observation = reset_result[0]

    for _ in range(max_steps):
        state_tensor = torch.tensor(observation, dtype=torch.float32)
        state_value = critic(state_tensor).item()
        action, log_prob = actor.get_action(state_tensor)

        step_result = env.step(action.detach().numpy())
        observation, reward, terminated, truncated, info = step_result

        states.append(state_tensor)
        actions.append(action)
        log_probs.append(log_prob)
        rewards.append(reward)
        values.append(state_value)

        if terminated or truncated:
            break

    return states, actions, log_probs, rewards, torch.tensor(values)

# def train_PPO(env: gym.Env, num_train_episodes: int, num_test_per_episode: int, max_episode_duration: int, learning_rate: float):
#     state_dim = env.observation_space.shape[0]
#     action_dim = env.action_space.shape[0]

#     actor = Actor(state_dim, action_dim)
#     critic = Critic(state_dim)

#     actor_optimizer = torch.optim.Adam(actor.parameters(), lr=learning_rate) #lr=0.0003
#     critic_optimizer = torch.optim.Adam(critic.parameters(), lr=learning_rate)

#     epsilon = 0.2  # PPO clip parameter
#     gamma = 0.9
#     lam = 0.95  # GAE-Lambda
#     # epochs = 10

#     episode_reward_list = []

#     for episode in tqdm(range(num_train_episodes)):

#         states, actions, log_probs, rewards, values = sample_trajectory(env, actor, critic, max_episode_duration)
#         advantage, returns = compute_advantage(rewards, values, gamma, lam)

#         states = torch.stack(states)
#         actions = torch.stack(actions)
#         log_probs = torch.stack(log_probs)

#         new_actions, new_log_probs = actor.get_action(states)
#         ratio = torch.exp(new_log_probs - log_probs)


#         surr1 = ratio * advantage
#         surr2 = torch.clamp(ratio, 1 - epsilon, 1 + epsilon) * advantage
#         loss_actor = -torch.min(surr1, surr2).mean()

#         loss_critic = (returns - critic(states)).pow(2).mean()

#         actor_optimizer.zero_grad()
#         loss_actor.backward()
#         actor_optimizer.step()

#         critic_optimizer.zero_grad()
#         loss_critic.backward()
#         critic_optimizer.step()

#         test_reward = evaluate_policy(actor,env,num_test_per_episode)
#         episode_reward_list.append(test_reward)

#     return actor, critic, episode_reward_list

In [None]:
# def train_PPO(env: gym.Env, num_train_episodes: int, num_test_per_episode: int,
#               max_episode_duration: int, learning_rate: float):

#     state_dim = env.observation_space.shape[0]
#     action_dim = env.action_space.shape[0]

#     actor = Actor(state_dim, action_dim)
#     critic = Critic(state_dim)

#     actor_optimizer = torch.optim.Adam(actor.parameters(), lr=learning_rate)
#     critic_optimizer = torch.optim.Adam(critic.parameters(), lr=learning_rate)

#     epsilon = 0.2
#     gamma = 0.9
#     lam = 0.95

#     episode_reward_list = []

#     for episode in tqdm(range(num_train_episodes)):
#         states, actions, log_probs_old, rewards, values = sample_trajectory(env, actor, critic, max_episode_duration)
#         advantage, returns = compute_advantage(rewards, values, gamma, lam)

#         states = torch.stack(states)
#         actions = torch.stack(actions)
#         # advantage = (advantage - advantage.mean()) / (advantage.std() + 1e-8)
#         log_probs_old = torch.stack(log_probs_old).detach()


#         for epoch in range(10):

#             new_log_probs = actor.get_log_prob(states,actions)
#             print(new_log_probs)
#             ratio = torch.exp(new_log_probs - log_probs_old)
#             surr1 = ratio * advantage
#             surr2 = torch.clamp(ratio, 1 - epsilon, 1 + epsilon) * advantage
#             loss_actor = -torch.min(surr1, surr2).mean()
#             loss_critic = (returns - critic(states)).pow(2).mean()

#             actor_optimizer.zero_grad()
#             critic_optimizer.zero_grad()
#             loss_actor.backward(retain_graph=True)
#             loss_critic.backward(retain_graph=True)
#             actor_optimizer.step()
#             critic_optimizer.step()


#         test_reward = evaluate_policy(actor, env, num_test_per_episode)
#         episode_reward_list.append(test_reward)

#     return actor, critic, episode_reward_list



In [None]:
def train_PPO(env: gym.Env, num_train_episodes: int, num_test_per_episode: int,
              max_episode_duration: int, learning_rate: float,
              epsilon: float = 0.2, gamma: float = 0.9, lam: float = 0.95,
              num_epochs: int = 3):
    """
    Train an actor-critic pair with the clipped-surrogate PPO objective.

    Each training episode samples one trajectory with the current policy, then
    runs `num_epochs` update passes over it. After the updates the actor is
    evaluated with `evaluate_policy`.

    Parameters
    ----------
    env : gym.Env
        Continuous-action environment to train in.
    num_train_episodes : int
        Number of trajectories to sample and train on.
    num_test_per_episode : int
        Number of evaluation episodes played after each training episode.
    max_episode_duration : int
        Maximum number of steps per trajectory and per evaluation episode.
    learning_rate : float
        Learning rate of both Adam optimizers.
    epsilon : float, optional
        PPO clipping range for the probability ratio.
    gamma : float, optional
        Discount factor passed to `compute_advantage`.
    lam : float, optional
        GAE lambda passed to `compute_advantage`.
    num_epochs : int, optional
        Number of update passes per sampled trajectory.

    Returns
    -------
    tuple
        ``(actor, critic, episode_reward_list)`` where the list holds the
        average evaluation reward after each training episode.
    """
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]

    actor = Actor(state_dim, action_dim)
    critic = Critic(state_dim)

    actor_optimizer = torch.optim.Adam(actor.parameters(), lr=learning_rate)
    critic_optimizer = torch.optim.Adam(critic.parameters(), lr=learning_rate)

    episode_reward_list = []

    for _ in tqdm(range(num_train_episodes)):
        # Collect a trajectory first; everything gathered here is a fixed target.
        with torch.no_grad():
            states_raw, actions_raw, log_probs_raw, rewards, values_raw = sample_trajectory(
                env, actor, critic, max_episode_duration
            )
            advantage, returns = compute_advantage(rewards, values_raw, gamma, lam)

            states = torch.stack(states_raw)
            actions = torch.stack(actions_raw)
            log_probs_old = torch.stack(log_probs_raw)
            # `compute_advantage` already returns tensors; `as_tensor` avoids a copy.
            advantage = torch.as_tensor(advantage, dtype=torch.float32)
            returns = torch.as_tensor(returns, dtype=torch.float32)

        # Multi-epoch PPO update
        for _ in range(num_epochs):
            # Re-evaluate the stored actions with the same distribution that
            # `Actor.get_action` samples from, so the ratio is 1 before any update.
            log_probs = actor.get_log_prob(states, actions)

            ratio = torch.exp(log_probs - log_probs_old)
            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio, 1 - epsilon, 1 + epsilon) * advantage
            actor_loss = -torch.min(surr1, surr2).mean()

            # Drop the trailing size-1 dimension so (T,) returns are not
            # broadcast against (T, 1) values into a (T, T) matrix.
            values = critic(states).squeeze(-1)
            critic_loss = (returns - values).pow(2).mean()

            actor_optimizer.zero_grad()
            actor_loss.backward()
            torch.nn.utils.clip_grad_norm_(actor.parameters(), 1.0)  # Clip gradients
            actor_optimizer.step()

            critic_optimizer.zero_grad()
            critic_loss.backward()
            torch.nn.utils.clip_grad_norm_(critic.parameters(), 1.0)  # Clip gradients
            critic_optimizer.step()

        test_reward = evaluate_policy(actor, env, num_test_per_episode, max_episode_duration)
        episode_reward_list.append(test_reward)

    return actor, critic, episode_reward_list

In [None]:
# Record a few episodes played by randomly initialised (untrained) PPO actors.
VIDEO_PREFIX_PROJECT_PPO_UNTRAINED = "project_PPO_untained"

NUM_EPISODES = 3

# Paths where the video recorder is expected to write one file per episode.
file_path_list = [
    FIGS_DIR / f"{VIDEO_PREFIX_PROJECT_PPO_UNTRAINED}-episode-{episode_index}.mp4"
    for episode_index in range(NUM_EPISODES)
]

# Remove recordings left over from a previous run of this cell.
for file_path in file_path_list:
    file_path.unlink(missing_ok=True)

env = gym.make("LunarLander-v3", continuous=True, gravity=-10.0,
               enable_wind=False, wind_power=15.0, turbulence_power=1.5, render_mode="rgb_array")

env = gym.wrappers.RecordVideo(
    env,
    video_folder=str(FIGS_DIR),
    name_prefix=VIDEO_PREFIX_PROJECT_PPO_UNTRAINED,
    episode_trigger=lambda x: True,  # the trigger accepts every episode index
)
env = gym.wrappers.RecordEpisodeStatistics(env, buffer_length=NUM_EPISODES)

try:
    # Nothing is trained in this cell, so gradient tracking is switched off.
    with torch.no_grad():
        for episode_index in range(NUM_EPISODES):
            # Fresh, untrained networks are built for every episode.
            critic_nn = Critic(env.observation_space.shape[0])
            actor_nn = Actor(env.observation_space.shape[0], env.action_space.shape[0])
            episode_states, episode_actions, episode_log_prob_actions, episode_rewards, episode_values = sample_trajectory(
                env, actor_nn, critic_nn, max_steps=1000
            )

    print(f"Episode time taken: {env.time_queue}")
    print(f"Episode total rewards: {env.return_queue}")
    print(f"Episode lengths: {env.length_queue}")
finally:
    # Close the environment even if a rollout fails.
    env.close()

print("\nSelect the episode to play here 👇\n")

interact(video_selector, file_path=file_path_list)

In [None]:
# Train PPO several times and gather the per-episode evaluation rewards of every run.
env = gym.make(
    "LunarLander-v3",
    continuous=True,
    gravity=-10.0,
    enable_wind=False,
    wind_power=15.0,
    turbulence_power=1.5,
    render_mode="rgb_array",
)

NUMBER_OF_TRAININGS = DEFAULT_NUMBER_OF_TRAININGS
# Three parallel columns: episode index, average test reward, training-run index.
PPO_trains_result_list: List[List[Union[int, float]]] = [[], [], []]

for train_index in range(NUMBER_OF_TRAININGS):
    actor_nn, critic_nn, episode_reward_list = train_PPO(
        env=env,
        num_train_episodes=200,
        num_test_per_episode=5,
        max_episode_duration=500,
        learning_rate=0.0001,
    )

    PPO_trains_result_list[0].extend(range(len(episode_reward_list)))
    PPO_trains_result_list[1].extend(episode_reward_list)
    PPO_trains_result_list[2].extend([train_index] * len(episode_reward_list))

# One row per (run, episode); the three columns become the DataFrame columns.
PPO_trains_result_df = pd.DataFrame(
    np.transpose(np.array(PPO_trains_result_list)),
    columns=["num_episodes", "mean_final_episode_reward", "training_index"],
).assign(agent="PPO")

# Save the actor and critic networks of the last training run
torch.save(actor_nn, MODELS_DIR / "project_PPO_actor_network.pth")
torch.save(critic_nn, MODELS_DIR / "project_PPO_critic_network.pth")
env.close()

In [None]:
# Record a few episodes played by the trained PPO actor.
VIDEO_PREFIX_PROJECT_PPO_TRAINED = "project_PPO_trained"

NUM_EPISODES = 3

# Paths where the video recorder is expected to write one file per episode.
file_path_list = [
    FIGS_DIR / f"{VIDEO_PREFIX_PROJECT_PPO_TRAINED}-episode-{episode_index}.mp4"
    for episode_index in range(NUM_EPISODES)
]

# Remove recordings left over from a previous run of this cell.
for file_path in file_path_list:
    file_path.unlink(missing_ok=True)

env = gym.make("LunarLander-v3", continuous=True, gravity=-10.0,
               enable_wind=False, wind_power=15.0, turbulence_power=1.5, render_mode="rgb_array")

env = gym.wrappers.RecordVideo(
    env,
    video_folder=str(FIGS_DIR),
    name_prefix=VIDEO_PREFIX_PROJECT_PPO_TRAINED,
    episode_trigger=lambda x: True,  # the trigger accepts every episode index
)
env = gym.wrappers.RecordEpisodeStatistics(env, buffer_length=NUM_EPISODES)

try:
    # Inference only: no gradient is needed while replaying the trained actor.
    with torch.no_grad():
        for episode_index in range(NUM_EPISODES):
            episode_states, episode_actions, episode_log_prob_actions, episode_rewards, episode_values = sample_trajectory(
                env, actor_nn, critic_nn, max_steps=1000
            )

    print(f"Episode time taken: {env.time_queue}")
    print(f"Episode total rewards: {env.return_queue}")
    print(f"Episode lengths: {env.length_queue}")
finally:
    # Close the environment even if a rollout fails.
    env.close()

print("\nSelect the episode to play here 👇\n")

interact(video_selector, file_path=file_path_list)

In [None]:
# `DataFrame.to_csv` does not create missing parent directories, so make sure
# ./data exists before writing the PPO training results.
Path("./data").mkdir(parents=True, exist_ok=True)
PPO_trains_result_df.to_csv("./data/project_PPO_trains_result.csv")

In [None]:
# One semi-transparent line per PPO training run (no aggregation across runs).
g = sns.relplot(
    data=PPO_trains_result_df,
    kind="line",
    x="num_episodes",
    y="mean_final_episode_reward",
    hue="agent",
    units="training_index",
    estimator=None,
    alpha=0.5,
    height=7,
    aspect=2,
)
plt.savefig(PLOTS_DIR / "project_PPO_trains_result.png")

In [None]:
# Mean of the per-episode average test rewards, taken over every episode of every PPO training run.
mean_score_PPO = PPO_trains_result_df["mean_final_episode_reward"].mean()

# Bare expression so the notebook displays the value.
mean_score_PPO

In [None]:
# Final evaluation of the actor from the last PPO training run: average reward over 200 episodes.
env = gym.make(
    "LunarLander-v3",
    continuous=True,
    gravity=-10.0,
    enable_wind=False,
    wind_power=15.0,
    turbulence_power=1.5,
    render_mode="rgb_array",
)
mean_test_reward = evaluate_policy(
    actor=actor_nn,
    env=env,
    num_test_episodes=200,
    max_episode_duration=500,
)
env.close()
mean_test_reward

## Optimized Hyperparameters' Performance

### DQN

In [None]:
# Load the DQN hyperparameters stored as a pickle.
# Unpickling can execute arbitrary code, so only load files produced by this project.
with open("data/project_dqn2_hyperparams.pkl", mode="rb") as params_file:
    dqn_params = pickle.load(params_file)

In [None]:
# Bare expression so the notebook displays the loaded DQN hyperparameters.
dqn_params

In [None]:
# Retrain the DQN agent several times with the hyperparameters loaded in `dqn_params`
# and gather the per-episode rewards of every run.
env = gym.make("LunarLander-v3", continuous=False, gravity=-10.0,
               enable_wind=False, wind_power=15.0, turbulence_power=1.5, render_mode = "rgb_array")
NUMBER_OF_TRAININGS = DEFAULT_NUMBER_OF_TRAININGS  # Change the default (global) value here if you want a specific number of trainings for this exercise
# Three parallel columns: episode index, episode reward, training-run index.
dqn2_trains_result_list: List[List[Union[int, float]]] = [[], [], []]

for train_index in range(NUMBER_OF_TRAININGS):
    # Instantiate required objects

    q_network = LinearNet(LL_observation_dim, LL_action_number, hidden_dim=dqn_params["hidden_size"], n_layer = dqn_params["layer_num"]).to(device)

    target_q_network = LinearNet(LL_observation_dim, LL_action_number, hidden_dim=dqn_params["hidden_size"], n_layer = dqn_params["layer_num"]).to(device) # The target Q-network is used to compute the target Q-values for the loss function

    # Initialize the target Q-network with the same weights as the Q-network (c.f. the "Practical tips" section of the exercise)
    target_q_network.load_state_dict(q_network.state_dict())

    optimizer = torch.optim.AdamW(q_network.parameters(), lr=dqn_params["lr_start"], amsgrad=True)
    # lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.999)
    lr_scheduler = MinimumExponentialLR(optimizer, lr_decay=dqn_params["lr_decay"], min_lr=dqn_params["lr_min"])
    loss_fn = torch.nn.MSELoss()

    epsilon_greedy = EpsilonGreedy(
        epsilon_start=dqn_params["epsilon_start"],
        epsilon_min=dqn_params["epsilon_min"],
        epsilon_decay=dqn_params["epsilon_decay"],
        env=env,
        q_network=q_network,
    )

    # NOTE(review): a `ReplayBuffer` class is also defined in the PPO section of this notebook;
    # if cells are run top to bottom, that definition is the one bound here — confirm it offers
    # the interface `train_dqn2_agent` expects.
    replay_buffer = ReplayBuffer(dqn_params["len_buffer"])

    # Train the q-network
    # num_episodes, gamma and batch_size are fixed here; the other settings come from `dqn_params`.

    episode_reward_list = train_dqn2_agent(
        env,
        q_network,
        target_q_network,
        optimizer,
        loss_fn,
        epsilon_greedy,
        device,
        lr_scheduler,
        num_episodes=200,
        gamma=0.9,
        batch_size=128,
        replay_buffer=replay_buffer,
        target_q_network_sync_period=dqn_params["target_update_period"],
    )
    # Mean reward of this training run.
    print(np.mean(episode_reward_list))
    dqn2_trains_result_list[0].extend(range(len(episode_reward_list)))
    dqn2_trains_result_list[1].extend(episode_reward_list)
    dqn2_trains_result_list[2].extend([train_index for _ in episode_reward_list])

# One row per (run, episode); the three lists become the DataFrame columns.
dqn2_trains_result_df = pd.DataFrame(
    np.array(dqn2_trains_result_list).T,
    columns=["num_episodes", "mean_final_episode_reward", "training_index"],
)
dqn2_trains_result_df["agent"] = "DQN v2"

# Save the action-value estimation function
# (only the Q-network of the last training run is still bound at this point)

torch.save(q_network, MODELS_DIR / "project_new_dqn2_q_network.pth")

env.close()

In [None]:
# `DataFrame.to_csv` does not create missing parent directories, so make sure
# ./data exists before writing the tuned DQN training results.
Path("./data").mkdir(parents=True, exist_ok=True)
dqn2_trains_result_df.to_csv("./data/project_new_dqn2_train_result.csv")

In [None]:
# One semi-transparent line per DQN training run (no aggregation across runs).
g = sns.relplot(
    data=dqn2_trains_result_df,
    kind="line",
    x="num_episodes",
    y="mean_final_episode_reward",
    hue="agent",
    units="training_index",
    estimator=None,
    alpha=0.5,
    height=7,
    aspect=2,
)
plt.savefig(PLOTS_DIR / "project_new_dqn2_trains_result.png")

In [None]:
# Mean episode reward over every episode of every DQN training run.
mean_score_dqn2 = dqn2_trains_result_df["mean_final_episode_reward"].mean()
# Bare expression so the notebook displays the value.
mean_score_dqn2

In [None]:
# Average the reward across training runs for each episode index, then take the best
# episode index; the result is a one-element Series keyed by the reward column.
score_dqn2 = dqn2_trains_result_df[["num_episodes", "mean_final_episode_reward"]].groupby("num_episodes").mean().max()
# Bare expression so the notebook displays the value.
score_dqn2

In [None]:
# Test the last trained Q-network on 200 episodes and report the mean and best reward.
env = gym.make(
    "LunarLander-v3",
    continuous=False,
    gravity=-10.0,
    enable_wind=False,
    wind_power=15.0,
    turbulence_power=1.5,
    render_mode="rgb_array",
)
dqn_reward_list = test_q_network_agent(env, q_network, num_episode=200)
dqn_reward_index = np.arange(200)  # one index per test episode
env.close()
print(np.mean(dqn_reward_list), np.max(dqn_reward_list))

### REINFORCE

In [None]:
# Load the REINFORCE hyperparameters stored as a pickle.
# Unpickling can execute arbitrary code, so only load files produced by this project.
with open("data/project_reinforce_hyperparams.pkl", mode="rb") as params_file:
    reinforce_params = pickle.load(params_file)

In [None]:
# Bare expression so the notebook displays the loaded REINFORCE hyperparameters.
reinforce_params

In [None]:
# Retrain REINFORCE several times with the hyperparameters loaded in `reinforce_params`
# and gather the per-episode test returns of every run.
env = gym.make(
    "LunarLander-v3",
    continuous=False,
    gravity=-10.0,
    enable_wind=False,
    wind_power=15.0,
    turbulence_power=1.5,
    render_mode="rgb_array",
)

# Change the default (global) value here if you want a specific number of trainings for this exercise
NUMBER_OF_TRAININGS = DEFAULT_NUMBER_OF_TRAININGS
# Three parallel columns: episode index, average test return, training-run index.
reinforce_trains_result_list: List[List[Union[int, float]]] = [[], [], []]

for train_index in range(NUMBER_OF_TRAININGS):
    reinforce_policy_nn, episode_reward_list = new_train_reinforce_discrete(
        env=env,
        hidden_dim=reinforce_params["hidden_size"],
        n_layers=reinforce_params["layer_num"],
        num_train_episodes=200,
        num_test_per_episode=5,
        max_episode_duration=500,
        lr_start=reinforce_params["lr_start"],
        lr_decay=reinforce_params["lr_decay"],
        min_lr=reinforce_params["lr_min"],
    )

    reinforce_trains_result_list[0].extend(range(len(episode_reward_list)))
    reinforce_trains_result_list[1].extend(episode_reward_list)
    reinforce_trains_result_list[2].extend([train_index] * len(episode_reward_list))

# One row per (run, episode); the three columns become the DataFrame columns.
reinforce_trains_result_df = pd.DataFrame(
    np.transpose(np.array(reinforce_trains_result_list)),
    columns=["num_episodes", "mean_final_episode_reward", "training_index"],
).assign(agent="REINFORCE")

# Save the policy network of the last training run
torch.save(reinforce_policy_nn, MODELS_DIR / "project_new_reinforce_policy_network.pth")

env.close()

In [None]:
# `DataFrame.to_csv` does not create missing parent directories, so make sure
# ./data exists before writing the tuned REINFORCE training results.
Path("./data").mkdir(parents=True, exist_ok=True)
reinforce_trains_result_df.to_csv("./data/project_new_reinforce_trains_result.csv")

In [None]:
# One semi-transparent line per tuned REINFORCE training run (no aggregation across runs).
g = sns.relplot(
    data=reinforce_trains_result_df,
    kind="line",
    x="num_episodes",
    y="mean_final_episode_reward",
    hue="agent",
    units="training_index",
    estimator=None,
    alpha=0.5,
    height=7,
    aspect=2,
)
plt.savefig(PLOTS_DIR / "project_new_reinforce_trains_result.png")

In [None]:
# Mean of the per-episode average test returns, taken over every episode of every tuned training run.
mean_score_reinforce = reinforce_trains_result_df["mean_final_episode_reward"].mean()

# Bare expression so the notebook displays the value.
mean_score_reinforce

In [None]:
# Final evaluation of the tuned policy from the last training run: average return over 200 episodes.
env = gym.make(
    "LunarLander-v3",
    continuous=False,
    gravity=-10.0,
    enable_wind=False,
    wind_power=15.0,
    turbulence_power=1.5,
    render_mode="rgb_array",
)
mean_test_reward = avg_return_on_multiple_episodes(
    env=env,
    policy_nn=reinforce_policy_nn,
    num_test_episode=200,
    max_episode_duration=500,
)
env.close()
mean_test_reward

In [None]:
# Stack the tuned DQN and REINFORCE results and plot one aggregated curve per agent.
all_trains_result_df = pd.concat(objs=[dqn2_trains_result_df, reinforce_trains_result_df])
g = sns.relplot(
    data=all_trains_result_df,
    kind="line",
    x="num_episodes",
    y="mean_final_episode_reward",
    hue="agent",
    height=7,
    aspect=2,
)
plt.savefig(PLOTS_DIR / "project_new_trains_result_agg.png")