In [1]:
import gymnasium as gym
import torch
from src.data.services.dimensions import get_dims
import json

In [2]:
# Gymnasium environment id; every gym.make(...) call in this notebook uses it.
env_name = "LunarLander-v2"

In [3]:
# Environment created without a render mode. In the visible cells it is only
# passed to get_dims(); the evaluation cells build their own environments with
# render_mode="rgb_array".
env = gym.make(env_name)

In [4]:
from src.nn.services.dqn import DQN

In [5]:
# Sizes used to construct the DQN below.
# NOTE(review): get_dims is a project helper; presumably it returns the
# observation and action space sizes of `env` — confirm.
state_dim, action_dim = get_dims(env)

In [6]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [39]:
# Network instance that is evaluated, pre-trained and re-evaluated in the cells
# below; it is moved to `device` right after construction.
dqn_golden = DQN(state_dim, action_dim).to(device)

In [40]:
from src.training.services.train import full_training as train

In [41]:
from src.data.services.memory import ReplayMemory

In [42]:
from torch.optim import Adam

In [43]:
# Training hyperparameters. Only batch_size is used in the visible cells (it is
# reused as the pre-training batch size); the rest are presumably consumed by
# the training cells further down — confirm.

# Episode / schedule settings.
num_episodes = 10_000
target_update = 50

# Optimisation / exploration settings.
batch_size = 64
gamma = 0.99
epsilon = 0.25

In [44]:
import cv2

In [45]:
def create_video(frames, fps=10, output_name="output"):
    """Encode a sequence of image frames into ``{output_name}.mp4``.

    Args:
        frames: Non-empty sequence of image arrays. The first frame's
            ``shape[:2]`` is read as ``(height, width)`` and fixes the video
            size. Frames are written as-is; OpenCV interprets colour frames as
            BGR, so callers holding RGB data should convert beforehand.
        fps: Frame rate passed to the writer.
        output_name: Output path without the ``.mp4`` extension. A missing
            parent directory is created.

    Raises:
        ValueError: If ``frames`` is empty (previously an opaque IndexError).
    """
    import os
    import warnings

    if len(frames) == 0:
        raise ValueError("create_video() needs at least one frame")

    output_path = f"{output_name}.mp4"
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        # VideoWriter does not create directories and fails silently when the
        # target cannot be opened, so make sure the directory exists first.
        os.makedirs(parent_dir, exist_ok=True)

    height, width = frames[0].shape[:2]
    out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height))
    if not out.isOpened():
        # Surface the failure instead of silently producing no file.
        out.release()
        warnings.warn(f"could not open video writer for {output_path}", RuntimeWarning)
        return
    try:
        for frame in frames:
            out.write(frame)
    finally:
        # Always finalise the file, even if a write fails part-way through.
        out.release()

In [46]:
from src.policy.services.epsilon_greedy import epsilon_greedy

In [47]:
def test_dqn(dqn, env, device, output_name="output", output_dir="./tmpvideo", num_episodes=10):
    """Run greedy evaluation episodes and save the first one as a video.

    Actions are chosen with ``epsilon_greedy(state, dqn, 0)``, i.e. with an
    epsilon of zero. Each rendered frame of the first episode is collected and
    written to ``{output_dir}/{output_name}.mp4`` through ``create_video``.

    Args:
        dqn: Network handed to ``epsilon_greedy`` for action selection.
        env: Gymnasium environment to evaluate on. Frames come from
            ``env.render()``, so this assumes the environment was created with
            ``render_mode="rgb_array"`` (as the calls in this notebook do).
        device: Torch device the state tensor is moved to before the policy call.
        output_name: Video file name without extension.
        output_dir: Directory the video is written to.
        num_episodes: Number of evaluation episodes (default 10, the value that
            used to be hard-coded).

    Returns:
        List with the cumulative reward of each episode.
    """
    frames = []
    rewards = []
    for episode in range(num_episodes):
        cum_reward = 0
        state, _ = env.reset()
        done = False
        while not done:
            if episode == 0:
                # Gymnasium's rgb_array frames are RGB while OpenCV's writer
                # expects BGR; convert so the colours in the video are correct.
                frames.append(cv2.cvtColor(env.render(), cv2.COLOR_RGB2BGR))
            state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)
            # Evaluation only: no gradients are needed for action selection.
            with torch.no_grad():
                action = epsilon_greedy(state_tensor, dqn, 0)
            next_state, reward, terminated, truncated, _ = env.step(action)
            cum_reward += reward
            done = terminated or truncated
            state = next_state
        rewards.append(cum_reward)
    if frames:
        # Nothing was recorded when num_episodes is 0; skip the video then.
        create_video(frames, output_name=f"{output_dir}/{output_name}")
    return rewards

In [48]:
# Baseline: evaluate dqn_golden before the pre_train call further down.
# The environment is held in a variable so it can be closed afterwards instead
# of leaking the renderer created by the inline gym.make(...).
bf_env = gym.make(env_name, render_mode="rgb_array")
try:
    # NOTE(review): "lunader" in the output name looks like a typo for "lunar";
    # kept unchanged because it names the video file on disk.
    bf_rewards = test_dqn(dqn_golden, bf_env, device, output_name="before_training_lunader_lander")
finally:
    bf_env.close()

In [49]:
# Per-episode cumulative rewards returned by the baseline evaluation.
print(bf_rewards)

[-452.174421901032, -662.972353362637, -396.7225915034367, -391.2459664314594, -752.7123294505722, -840.8575139628855, -503.8150841488567, -557.8205799335751, -484.8628417114906, -826.2194609862426]


In [57]:
# Settings for the pre_train(...) call below.
# NOTE(review): the .npy file is presumably the pre-training dataset — confirm.
pretrain_path = "./src/pretrain/data/LunarLander-v2_10_000.npy"
pretrain_epochs, pretrain_batch_size = 200, batch_size
pretrain_optm_lr = 1e-3
# The optimizer is bound to dqn_golden's parameters, so it updates that network.
pretrain_optimizer = Adam(params=dqn_golden.parameters(), lr=pretrain_optm_lr)

In [58]:
from tqdm.auto import tqdm

In [59]:
from src.pretrain.services.train import pre_train
# Pre-train dqn_golden with the optimizer, data path, epoch count and batch
# size configured above. The captured output shows one progress-bar step per
# epoch (200/200).
# NOTE(review): pre_train is a project helper; presumably it updates dqn_golden
# in place, since its return value is not used here — confirm.
pre_train(
    dqn_golden,
    pretrain_optimizer,
    pretrain_path,
    pretrain_epochs,
    pretrain_batch_size,
    device=device,
)

100%|██████████| 200/200 [01:40<00:00,  1.99it/s]


In [62]:
# Evaluate dqn_golden again after pre-training. The environment is held in a
# variable so it can be closed afterwards instead of leaking the renderer
# created by the inline gym.make(...).
pretrain_env = gym.make(env_name, render_mode="rgb_array")
try:
    pretrain_rewards = test_dqn(dqn_golden, pretrain_env, device, output_name="pretrain_lunar_lander")
finally:
    pretrain_env.close()

In [63]:
# Per-episode cumulative rewards returned by the post-pre-training evaluation.
print(pretrain_rewards)

[23.009141065573175, -47.96259643787111, -101.8860423584971, -352.94496865138535, -243.2148964423845, 260.482302178953, 39.836824371347205, -63.68445716189174, 207.64631325916193, -14.695851816910121]
