# Lunar Lander with Monte Carlo Actor-Critic

In this notebook we train an agent to play the Lunar Lander game using a Monte Carlo Actor-Critic method built from two neural networks:
- an actor network (`ActorNet`) with a softmax output over the 4 discrete actions
- a critic network (`CriticNet`) with a single scalar output for each 8-dimensional state

In [None]:
# General libraries
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from pathlib import Path

# Deep learning
import torch
from torch import nn

# Video display
from IPython.display import Video
from moviepy import *

# Custom modules
from src.utils import *


# Reproducibility: push one shared seed into both the NumPy and PyTorch RNGs
SEED = 31
for seed_rng in (np.random.seed, torch.manual_seed):
    seed_rng(SEED)
print(f"Using seed {SEED}")

# Compute device: use CUDA when PyTorch reports it as available, otherwise CPU
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(f"Using device {device}")

## The Environment

The Lunar Lander environment is a 2D environment where the agent has to land a spaceship on a landing pad.
The agent has 4 actions available:
- Do nothing
- Fire left orientation engine
- Fire main engine
- Fire right orientation engine

The agent receives an additional reward of +100 for landing safely and -100 for crashing. It is also penalized for fuel use: each frame the main engine fires costs 0.3 points, and each frame a side engine fires costs 0.03 points.

The state space is an 8-dimensional vector with the following components:
- x position
- y position
- x velocity
- y velocity
- angle
- angular velocity
- left leg contact
- right leg contact

The environment is considered solved when the agent reaches an average reward of 200 over 100 episodes.

![Lunar Lander](https://www.gymlibrary.dev/_images/lunar_lander.gif)

In [2]:
import gymnasium as gym

env_name = 'LunarLander-v3'
env = gym.make(env_name)

# Seed the environment's own random generators as well: seeding NumPy and
# PyTorch does not touch Gymnasium's RNG, so without this the initial states
# differ from run to run. Later reset() calls made without a seed keep using
# the generator initialised here.
env.reset(seed=SEED)
env.action_space.seed(SEED)

In [3]:
from src.networks import ActorNet, CriticNet

# Number of training episodes
n_episodes = 2_000

# Learning-rate endpoints for each network. Only the initial values are passed
# to the optimizers below; the final values are not used anywhere in this cell.
initial_lr_actor, final_lr_actor = 1e-3 * 1.2, 1e-3 * 1.2
initial_lr_critic, final_lr_critic = 5e-3 * 1.2, 5e-4 * 1.2

# Scheduler factors forwarded as lr_scheduler_params['gamma'].
# An earlier, now-disabled variant derived them from the initial/final learning
# rates and n_episodes; both are currently pinned to 1.
# NOTE(review): a gamma of 1 presumably keeps the learning rate constant --
# confirm how src.networks consumes lr_scheduler_params.
gamma_actor = gamma_critic = 1

# Policy network: 8 state inputs -> 4 outputs, Softmax as output activation
actor_config = {
    "input_dim": 8,
    "output_dim": 4,
    "hidden_dim": [64, 64],
    "batchnorm": False,
    "activation": nn.ReLU,
    "dropout": 0.2,
    "device": device,
    "lr_scheduler_params": {"gamma": gamma_actor},
    "optimizer_params": {
        "lr": initial_lr_actor,
        "weight_decay": 1e-8,
        "betas": (0.9, 0.99),
    },
    "output_activation": nn.Softmax,
}
actor_net_main = ActorNet(**actor_config)

# Value network: 8 state inputs -> 1 output
critic_config = {
    "input_dim": 8,
    "hidden_dim": [64, 64],
    "output_dim": 1,
    "batchnorm": False,
    "activation": nn.ReLU,
    "dropout": 0.1,
    "device": device,
    "lr_scheduler_params": {"gamma": gamma_critic},
    "optimizer_params": {
        "lr": initial_lr_critic,
        "weight_decay": 1e-8,
        "betas": (0.9, 0.99),
    },
}
critic_net_main = CriticNet(**critic_config)

In [4]:
from src.DeepAgents import MonteCarloActorCritic as MCAC

# Wire the environment and the two networks into the agent.
# NOTE(review): the meaning of `inertia` is defined in src.DeepAgents -- confirm
# what a value of 0.0 implies.
agent_settings = dict(
    env=env,
    policy_net=actor_net_main,
    value_net=critic_net_main,
    discount_factor=0.99,
    inertia=0.0,
)
agent = MCAC(**agent_settings)

In [None]:
# Train the agent and wrap each of the three returned histories in a pandas
# Series so that rolling statistics can be computed when plotting.
# NOTE(review): the second positional argument (2_000) is presumably a
# per-episode step cap -- confirm against MonteCarloActorCritic.learn.
rewards, losses_actor, losses_critic = map(
    pd.Series, agent.learn(n_episodes, 2_000)
)

Plot the rewards and the actor and critic losses recorded during training (20-episode rolling mean):

In [None]:
# Number of episodes averaged for every plotted point
ROLLING_WINDOW = 20

# One entry per subplot: (series, panel title / legend label, y-axis label)
panels = [
    (rewards, "Reward", "Reward"),
    (losses_actor, "Actor Loss", "Loss"),
    (losses_critic, "Critic Loss", "Loss"),
]

fig, axes = plt.subplots(1, len(panels), figsize=(15, 5))

for ax, (series, title, ylabel) in zip(axes, panels):
    # Smooth the per-episode values so the trend is readable
    ax.plot(series.rolling(ROLLING_WINDOW).mean(), label=title)
    # Each panel gets its own title so the two loss curves can be told apart
    ax.set_title(title)
    ax.set_xlabel("Episode")
    ax.set_ylabel(ylabel)
    ax.legend()

fig.tight_layout()
plt.show()

Generate video of the agent playing the game:

In [None]:
# Output folder for the recordings (created on demand, no error if it exists)
video_dir = Path('./gym_videos')
video_dir.mkdir(parents=True, exist_ok=True)

# Rebind `env` to a fresh rgb_array-rendering environment wrapped in
# RecordVideo, which writes its files into video_dir using env_name as prefix.
env = gym.wrappers.RecordVideo(
    gym.make(env_name, render_mode='rgb_array'),
    video_folder=video_dir,
    name_prefix=env_name,
)

In [None]:
# Run the trained agent in the recording environment.
# NOTE: this rebinds `rewards`, which until now held the training-reward Series;
# re-run the training cell before re-running the training plots.
# NOTE(review): the third argument is presumably the number of episodes to
# play -- confirm against src.utils.play.
n_playback_episodes = 1
frames, rewards = play(agent, env, n_playback_episodes)

In [None]:
# Encode at the frame rate the environment declares instead of a hard-coded
# value, so playback speed matches the simulation; 60 is kept as the fallback
# when the environment does not advertise "render_fps".
# NOTE(review): assumes play() returns one rendered frame per environment step.
fps = env.metadata.get("render_fps", 60)

try:
    for i, episode_frames in enumerate(frames):
        # One mp4 per played episode, stored next to the RecordVideo output
        filename = video_dir / f"{env_name}_{i}.mp4"
        clip = ImageSequenceClip(episode_frames, fps=fps)
        clip.write_videofile(str(filename), codec='libx264')

        print(f"Episode {i} reward: {rewards[i]:.2f}")
finally:
    # Always close the environment, even if encoding fails part-way through
    env.close()

In [None]:
# Collect every mp4 in the video folder whose file name mentions this environment
videos = [
    clip_path
    for clip_path in video_dir.glob('*.mp4')
    if env_name in clip_path.name
]

# Print each path, then embed the corresponding video player in the notebook
for clip_path in videos:
    print(clip_path)
    display(Video(clip_path))