Authors: Hofmarcher

Date: 20-03-2023

---

This file is part of the "Deep Reinforcement Learning" lecture material. The following copyright statement applies to all code within this file.

Copyright statement:
This material, no matter whether in printed or electronic form, may be used for personal and non-commercial educational use only. Any reproduction of this manuscript, no matter whether as a whole or in parts, no matter whether in printed or in electronic form, requires explicit prior acceptance of the authors.

## Enable GPU Acceleration

---
Before you start exploring this notebook make sure that GPU support is enabled.
To enable the GPU backend for your notebook, go to **Edit** → **Notebook Settings** and set **Hardware accelerator** to **GPU**.

---


# Imports

Install Gymnasium and dependencies to render the environments

In [None]:
!apt-get update
!apt-get install -y swig python3-numpy python3-dev cmake zlib1g-dev libjpeg-dev xvfb ffmpeg xorg-dev python3-opengl libboost-all-dev libsdl2-dev
!pip install gymnasium==0.29.0 gymnasium[box2d] pyvirtualdisplay imageio-ffmpeg moviepy==1.0.3
!pip install onnx onnx2pytorch==0.4.1

In [None]:
%matplotlib inline

# Auxiliary Python imports
import os
import math
import io
import base64
import random
import shutil
from time import time, strftime
from glob import glob
from tqdm import tqdm
import numpy as np

# Pytorch
import torch
import torch.nn as nn
from torch.distributions.categorical import Categorical
import onnx
from onnx2pytorch import ConvertModel

# Environment import and set logger level to display error only
import gymnasium as gym
from gymnasium.spaces import Box
from gymnasium import logger as gymlogger
from gymnasium.wrappers import RecordVideo
gymlogger.set_level(gym.logger.ERROR)

# Plotting and notebook imports
import matplotlib.pyplot as plt
from matplotlib import animation
from IPython.display import HTML, clear_output
from IPython import display

# Select device for training

By default we train on GPU if one is available, otherwise we fall back to the CPU.
If you want to always use the CPU change accordingly.

In [None]:
# Pick the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Report the chosen device explicitly instead of relying on a bare expression.
print("Device: " + str(device))

# Setup Google Drive mount to store your results

In [None]:
# Set to False to skip mounting Google Drive.
use_google_drive = True
if use_google_drive:
    # Imported lazily so the module is only required when the mount is requested.
    from google.colab import drive
    # Mounts the drive at /content/drive.
    drive.mount('/content/drive')

# Download Dataset and Expert model

In [None]:
# Download the expert model (expert.onnx) and the training and validation datasets
# NOTE(review): --no-check-certificate disables TLS certificate verification for these downloads.
!wget --no-check-certificate 'https://cloud.ml.jku.at/s/citYJKPgmAGrHGy/download' -O expert.onnx
!wget --no-check-certificate 'https://cloud.ml.jku.at/s/yJ2ZsfqTos3Jn9y/download' -O train.zip
!wget --no-check-certificate 'https://cloud.ml.jku.at/s/3DxHLiqxTddepp8/download' -O val.zip

# Unzip datasets (-q: quiet, -o: overwrite existing files without prompting)
!unzip -q -o train.zip
!unzip -q -o val.zip

# Auxiliary Methods

The following cell contains classes and functions to provide some functionality for logging, plotting and exporting your model in the format required by the submission server.
You are free to use your own logging framework if you wish (such as tensorboard or Weights & Biases).
The logger is a very simple implementation of a CSV-file based logger.
Additionally it creates a folder for each run with subfolders for model files, logs and videos.

In [None]:
class Logger():
    """Minimal CSV-file logger with one timestamped run directory per instance.

    Every metric is kept in memory as a list of ``(index, unix_time, value)``
    rows and appended to ``<run>/logs/<name>.log`` by :meth:`dump`.
    """

    def __init__(self, logdir, params=None):
        """Create the run directory (and its ``logs`` subfolder) below ``logdir``.

        Args:
            logdir: Parent directory; the run folder is named after the current
                local time (second resolution).
            params: Optional path of an existing file that is copied to
                ``param_file``. Ignored when ``None`` or when it does not exist.
        """
        self.basepath = os.path.join(logdir, strftime("%Y-%m-%dT%H-%M-%S"))
        os.makedirs(self.basepath, exist_ok=True)
        os.makedirs(self.log_dir, exist_ok=True)
        if params is not None and os.path.exists(params):
            shutil.copyfile(params, self.param_file)
        self.log_dict = {}
        # Index of the last row already written to disk, per metric name.
        self.dump_idx = {}

    @property
    def param_file(self):
        """Path of the parameter file inside the run directory."""
        return os.path.join(self.basepath, "params.pkl")

    @property
    def onnx_file(self):
        """Path of the ONNX model file inside the run directory."""
        return os.path.join(self.basepath, "model.onnx")

    @property
    def video_dir(self):
        """Path of the video folder inside the run directory (not created here)."""
        return os.path.join(self.basepath, "videos")

    @property
    def log_dir(self):
        """Path of the log folder inside the run directory."""
        return os.path.join(self.basepath, "logs")

    def log(self, name, value):
        """Append ``value`` to the metric ``name`` together with its index and timestamp."""
        if name not in self.log_dict:
            self.log_dict[name] = []
            self.dump_idx[name] = -1
        self.log_dict[name].append((len(self.log_dict[name]), time(), value))

    def get_values(self, name):
        """Return the logged values of ``name`` in order, or ``None`` if it was never logged."""
        if name in self.log_dict:
            return [x[2] for x in self.log_dict[name]]
        return None

    def dump(self):
        """Append all rows logged since the previous dump to the per-metric log files."""
        for name, rows in self.log_dict.items():
            # Only the rows after the last dumped index are new; slicing avoids
            # rescanning the full history on every call.
            pending = rows[self.dump_idx[name] + 1:]
            if not pending:
                continue
            with open(os.path.join(self.log_dir, name + ".log"), "a") as f:
                f.writelines(",".join(str(x) for x in row) + "\n" for row in pending)
            self.dump_idx[name] = len(rows) - 1


def plot_metrics(logger):
    """Plot the logged training and validation curves in a single 1x3 figure.

    The metrics "training_loss", "training_entropy", "validation_loss" and
    "validation_accuracy" are read via ``logger.get_values``. The left panel
    overlays training and validation loss on two axes sharing one slot, the
    middle panel shows the entropy and the right panel the accuracy.
    """
    train_loss = logger.get_values("training_loss")
    train_entropy = logger.get_values("training_entropy")
    val_loss = logger.get_values("validation_loss")
    val_acc = logger.get_values("validation_accuracy")

    loss_limits = (0, 4)

    fig = plt.figure(figsize=(15, 5))
    loss_ax = fig.add_subplot(1, 3, 1, label="train")
    # Second, frameless axes in the same slot so validation gets its own x axis.
    val_loss_ax = fig.add_subplot(1, 3, 1, label="val", frame_on=False)
    entropy_ax = fig.add_subplot(1, 3, 2, label="entropy")
    acc_ax = fig.add_subplot(1, 3, 3, label="acc")

    # Training loss (bottom x axis)
    loss_ax.plot(train_loss, color="C0")
    loss_ax.set_ylabel("Loss")
    loss_ax.set_xlabel("Update (Training)", color="C0")
    loss_ax.xaxis.grid(False)
    loss_ax.set_ylim(loss_limits)

    # Validation loss (top x axis, y axis hidden because it matches the training one)
    val_loss_ax.plot(val_loss, color="C1")
    val_loss_ax.xaxis.tick_top()
    val_loss_ax.yaxis.tick_right()
    val_loss_ax.set_xlabel('Epoch (Validation)', color="C1")
    val_loss_ax.xaxis.set_label_position('top')
    val_loss_ax.xaxis.grid(False)
    val_loss_ax.get_yaxis().set_visible(False)
    val_loss_ax.set_ylim(loss_limits)

    # Entropy and accuracy panels share the same styling.
    simple_panels = (
        (entropy_ax, train_entropy, 'Update (Training)', "Entropy", "C3"),
        (acc_ax, val_acc, "Epoch (Validation)", "Accuracy", "C2"),
    )
    for axis, series, x_label, y_label, colour in simple_panels:
        axis.plot(series, color=colour)
        axis.set_xlabel(x_label, color="black")
        axis.set_ylabel(y_label, color=colour)
        for which in ('x', 'y'):
            axis.tick_params(axis=which, colors="black")
        axis.xaxis.grid(False)
    acc_ax.set_ylim((0, 1))

    fig.tight_layout(pad=2.0)
    plt.show()

"""
Utility functions to enable video recording of gym environment and displaying it
"""
def show_video(video_dir):
    """Embed the first ``*.mp4`` file of ``video_dir`` in the notebook output.

    The file is read, base64-encoded and displayed as an inline HTML
    ``<video>`` element. If the directory contains no mp4 file a short notice
    is printed instead.

    Args:
        video_dir: Directory that is searched (non-recursively) for mp4 files.
    """
    # Sorted so the chosen file does not depend on the directory listing order.
    mp4list = sorted(glob(f'{video_dir}/*.mp4'))
    if len(mp4list) > 0:
        mp4 = mp4list[0]
        # Read-only access is enough; the context manager closes the handle.
        with open(mp4, 'rb') as f:
            video = f.read()
        encoded = base64.b64encode(video)
        display.display(HTML(data='''<video alt="test" autoplay
                    loop controls style="height: 400px;">
                    <source src="data:video/mp4;base64,{0}" type="video/mp4" />
                 </video>'''.format(encoded.decode('ascii'))))
    else:
        print("Could not find video")

def save_as_onnx(torch_model, sample_input, model_path):
    """Export ``torch_model`` as an ONNX file at ``model_path``.

    Args:
        torch_model: The model handed to ``torch.onnx.export``.
        sample_input: Example model input (or a tuple for multiple inputs)
            that the exporter needs.
        model_path: Destination of the exported model (file or file-like object).
    """
    export_options = {
        "f": model_path,              # where to save the model
        "export_params": True,        # store the trained weights inside the model file
        "opset_version": 17,          # ONNX opset version used for the export
        "do_constant_folding": True,  # run constant folding as an optimization
    }
    torch.onnx.export(torch_model, sample_input, **export_options)

# Dataset

Use this dataset class to load the provided demonstrations. Furthermore, this dataset has functionality to add new samples to the dataset which you will need for implementing the DAgger algorithm.

In [None]:
class DemonstrationDataset(torch.utils.data.Dataset):
    """Dataset over ``*.npz`` files that each hold one ``state`` and one ``action``.

    New samples can be added with :meth:`append`, which writes further
    ``.npz`` files into the same directory.
    """

    def __init__(self, data_dir):
        """Index all ``*.npz`` files directly inside ``data_dir`` (sorted by name)."""
        self.data_dir = data_dir
        self.files = sorted(glob(f"{data_dir}/*.npz"))

    def __len__(self):
        """Return the number of indexed sample files."""
        return len(self.files)

    def __getitem__(self, idx):
        """Load sample ``idx``.

        Returns:
            Tuple ``(state, action)``: the stored state as float32 with a new
            leading axis and divided by 255, and the action as a Python scalar.
        """
        # The context manager closes the underlying file handle right away;
        # otherwise every loaded sample keeps a descriptor open until collected.
        with np.load(self.files[idx]) as data:
            state = data["state"][np.newaxis, ...].astype(np.float32)
            action = data["action"].item()
        return state / 255.0, action

    def append(self, states, actions):
        """Store new samples as compressed ``.npz`` files and add them to the index.

        Args:
            states: Sequence of states, one per sample.
            actions: Sequence of actions (numpy or Python integers), same length
                as ``states``.

        Raises:
            ValueError: If ``states`` and ``actions`` differ in length.
        """
        if len(states) != len(actions):
            raise ValueError("states and actions must have the same length")
        offset = len(self) + 1
        for i, (state, action) in enumerate(zip(states, actions)):
            filename = f"{self.data_dir}/{offset+i:06}.npz"
            # np.asarray lets plain Python ints through as well as numpy scalars.
            np.savez_compressed(filename, state=state, action=np.asarray(action).astype(np.int32))
            self.files.append(filename)

# Inspect data

It is always a good idea to take a look at the data when you start working with a new dataset. Feel free to investigate the dataset further on your own.

In [None]:
# Action Statistics
dataset = DemonstrationDataset("train")
# Print the size explicitly; the bare expression followed by ';' showed nothing.
print("Number of samples: {}".format(len(dataset)))

In [None]:
# Action mapping from gymnasium.farama.org
action_mapping = {
    0: "do nothing",
    1: "steer left",
    2: "steer right",
    3: "gas",
    4: "brake"
}

# Visualize a random frame
idx = np.random.randint(len(dataset))
state, action = dataset[idx]
# store a single frame as we need it later for exporting an ONNX model (it needs a sample of the input for the export)
sample_state = torch.Tensor(state).unsqueeze(0).to(device)
# Display the sample; the label has to be printed because it is not the last expression of the cell
print(f"Action: {action_mapping[action]}")
plt.axis("off")
plt.imshow(state[0]);

In [None]:
# Drop the reference to the inspection dataset so it can be garbage-collected
del dataset

# Define Policy Network

You need to design a neural network architecture that is capable of mapping a state to an action.
The input is a single image with the following properties:
- Resolution of 84x84 pixels
- Grayscale (meaning a single channel as opposed to three channels of an RGB image)
- The values of each pixel should be between 0 and 1

The output of the network should have one unit per possible action; since our environment has 5 actions, this results in 5 output units.
Your network must implement the forward function in order to be compatible with the evaluation script.

## Policy Network using CNN

In [None]:
class PolicyNetwork(nn.Module):
    """Convolutional policy that maps a single-channel 84x84 image to action logits.

    Three convolutions reduce an 84x84 input to a 64x7x7 feature map, which is
    flattened and passed through three linear layers.
    """

    def __init__(self, n_units_out):
        """Build the layers.

        Args:
            n_units_out: Number of output units (one per action).
        """
        super(PolicyNetwork, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1)
        self.fc1 = nn.Linear(in_features=7*7*64, out_features=512)
        self.fc2 = nn.Linear(in_features=512, out_features=64)
        # Honour the constructor argument instead of a hard-coded 5.
        self.fc3 = nn.Linear(in_features=64, out_features=n_units_out)
        self.relu = nn.ReLU6()

    def forward(self, x):
        """Return unnormalised action logits of shape ``(batch, n_units_out)``."""
        x = self.relu(self.conv1(x))
        x = self.relu(self.conv2(x))
        x = self.relu(self.conv3(x))
        x = x.view(-1, 7*7*64)
        x = self.relu(self.fc1(x))
        # NOTE(review): there is no activation between fc2 and fc3, so the two
        # layers compose to a single linear map. Left unchanged so previously
        # saved models keep producing the same outputs.
        x = self.fc3(self.fc2(x))
        return x

## Policy Network using MLP


In [None]:
class PolicyNetworkMLP(nn.Module):
    """Fully connected policy operating on a flattened 84x84 image."""

    def __init__(self, n_units_out):
        """Create the three linear layers; the last one has ``n_units_out`` units."""
        super().__init__()
        n_pixels = 84*84  # one input feature per pixel of the flattened image
        self.fc1 = nn.Linear(n_pixels, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, n_units_out)  # one unit per action
        self.relu = nn.ReLU()

    def forward(self, x):
        """Flatten the input to ``(-1, 84*84)`` and return the action logits."""
        hidden = x.view(-1, self.fc1.in_features)
        for layer in (self.fc1, self.fc2):
            hidden = self.relu(layer(hidden))
        return self.fc3(hidden)

# Train behavioral cloning policy

Now that you have a Dataset and a network you need to train your network.
With behavioral cloning we want to imitate the behavior of the agent that produced the demonstration dataset as close as possible.
This is basically supervised learning, where you want to minimize the loss of your network on the training and validation sets.

Some tips as to what you need to implement:
- choose the appropriate loss function (think about which kind of problem you are solving)
- choose an optimizer and its hyper-parameters
- optional: use a learning-rate scheduler
- don't forget to evaluate your network on the validation set
- store your model and training progress often so you don't lose progress if your program crashes

In case you use the provided Logger:
- `logger.log("training_loss", <loss-value>)` to log a particular value
- `logger.dump()` to write the current logs to a log file (e.g. after every episode)
- `logger.log_dir`, `logger.param_file`, `logger.onnx_file`, `logger.video_dir` point to files or directories you can use to save files
- you might want to specify your google drive folder as a logdir in order to automatically sync your results
- if you log the metrics specified in the `plot_metrics` function you can use it to visualize your training progress (or take it as a template to plot your own metrics)

## Training with CNN policy network

In [None]:
# Mini-batch size used by both data loaders
batch_size = 256

# Datasets and loaders; only the training split is shuffled
_loader_opts = dict(batch_size=batch_size, num_workers=2, drop_last=False, pin_memory=True)
train_set = DemonstrationDataset("train")
train_loader = torch.utils.data.DataLoader(train_set, shuffle=True, **_loader_opts)
val_set = DemonstrationDataset("val")
val_loader = torch.utils.data.DataLoader(val_set, shuffle=False, **_loader_opts)

# Specify the google drive mount here if you want to store logs and weights there (and set it up earlier)
# You can also choose to use a different logging framework such as tensorboard (not recommended on Colab) or Weights & Biases (highly recommended)
logger = Logger("logdir")
#print("Saving state to {}".format(logger.basepath))

# Network (swap in the commented line to train the MLP variant instead)
model = PolicyNetwork(n_units_out=5)
# model = PolicyNetworkMLP(n_units_out=5)
model = model.to(device)
num_trainable_params = sum(
    param.numel() for param in model.parameters() if param.requires_grad
)
print("Trainable Parameters: {}".format(num_trainable_params))

######################
### YOUR CODE HERE ###
######################
# Implement your training and evaluation loop
# feel free to define your own functions for training and evaluation

# Hyper-parameters, loss and optimiser
epochs = 20
cce_loss = nn.CrossEntropyLoss(reduction="sum")
optimiser = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)

# Bookkeeping for best-model selection and early stopping
early_stopping_counter = 10
cur_stopping_counter = 0
best_val_loss = np.inf
best_val_acc = 0

@torch.no_grad()
def evaluate(network: nn.Module, data: torch.utils.data.DataLoader, loss: nn.Module):
    """Evaluate ``network`` on ``data`` without tracking gradients.

    Args:
        network: Model to evaluate; it is switched to eval mode and moved to
            CUDA when available, otherwise to the CPU.
        data: Iterable yielding ``(inputs, labels)`` batches.
        loss: Callable computing the loss from ``(outputs, labels)``.

    Returns:
        Tuple of 0-d tensors ``(mean_batch_loss, accuracy)``. The first is the
        mean of the per-batch loss values, the second is the fraction of
        correctly classified samples over the whole data set. Both are NaN
        when ``data`` yields no batches.
    """
    network.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    network.to(device)
    errors = []
    n_correct, n_seen = 0, 0
    for inputs, labels in data:
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = network(inputs)
        errors.append(loss(outputs, labels).item())
        # Count hits and samples so a smaller last batch is weighted correctly
        # (averaging per-batch accuracies would over-weight it).
        n_correct += (torch.argmax(outputs, 1) == labels).sum().item()
        n_seen += labels.size(0)
    accuracy = n_correct / n_seen if n_seen else float("nan")
    return torch.mean(torch.tensor(errors)), torch.tensor(accuracy)


@torch.enable_grad()
def update(network: nn.Module, data: torch.utils.data.DataLoader, loss: nn.Module, opt: torch.optim.Optimizer):
    """Run one optimisation pass over ``data``.

    The network is put into training mode and moved to CUDA when available
    (CPU otherwise). For every batch the gradients are reset, the loss is
    back-propagated and ``opt`` takes one step.

    Returns:
        0-d tensor holding the mean of the per-batch loss values.
    """
    network.train()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    network = network.to(device)

    batch_losses = []
    for inputs, labels in data:
        opt.zero_grad()
        labels = labels.to(device)
        inputs = inputs.to(device)
        batch_loss = loss(network(inputs), labels)
        batch_losses.append(batch_loss.item())
        batch_loss.backward()
        opt.step()

    return torch.tensor(batch_losses).mean()

def training_plot(train_errs,valid_errs):
    """Plot per-epoch training and validation losses in one figure.

    Args:
        train_errs: Sequence of training losses, one per epoch.
        valid_errs: Sequence of validation losses, one per epoch.
    """
    plt.title("Training Curves with CNN Policy Network")
    # Fixed the missing space in the axis label ("Number ofEpoch").
    plt.xlabel("Number of Epoch")
    plt.ylabel("Cross Entropy Loss")
    # Epochs are shown 1-based on the x axis.
    plt.plot(range(1, len(train_errs) + 1), train_errs, label="train")
    plt.plot(range(1, len(valid_errs) + 1), valid_errs, label="valid")
    plt.legend()
    plt.show()

print("")

# One checkpoint path for the existence check, for saving and for loading.
# Previously the check used "best_model.pt" while the load branch read a
# differently named file.
best_model_path = "best_model.pt"

if not os.path.exists(best_model_path):
    print("Training:\n\n")

    train_errors,validation_errors=[],[]

    for epoch in range(epochs):
        print(f"epoch: {epoch}")
        # update/evaluate return the mean of per-batch losses; cce_loss sums
        # over the batch, so dividing by batch_size gives an approximate
        # per-sample loss.
        train_loss=(update(model,train_loader,cce_loss,optimiser)/batch_size).item()
        validation_params=(evaluate(model,val_loader,cce_loss))
        val_loss, val_acc = (validation_params[0]/batch_size).item(), validation_params[1].item()

        print(train_loss, val_loss, val_acc)
        if val_loss < best_val_loss:
            # Keep the checkpoint with the lowest validation loss seen so far.
            torch.save(model, best_model_path)
            cur_stopping_counter=0
            best_val_loss = val_loss
            best_val_acc = val_acc
            print(f"Saved Model with Accuarcy: {best_val_acc}")
        else:
            cur_stopping_counter+=1


        train_errors.append(train_loss)
        validation_errors.append(val_loss)

        # Early stopping is currently disabled; the counter is still maintained.
        #if cur_stopping_counter==early_stopping_counter:
            #print(f"Early Stopped after {epoch} Epochs")
            #print("Best model has a validation loss of: {}")
            #break


    print("End Training:\n\n")

    training_plot(train_errors,validation_errors)
    # Reload the best checkpoint BEFORE exporting, so the ONNX file holds the
    # best model rather than the weights of the last epoch.
    model=torch.load(best_model_path)
    # If you use the provided logger you can use this directly
    save_as_onnx(model, sample_state, logger.onnx_file)
else:
    # A checkpoint already exists: skip training, evaluate and export it.
    model = torch.load(best_model_path)
    validation_params=(evaluate(model,val_loader,cce_loss))
    val_acc =  validation_params[1].item()
    print(f"Accuarcy of Model: {val_acc}")
    save_as_onnx(model, sample_state, logger.onnx_file)
    print("Loaded Model")

## Training with MLP model

In [None]:
# Device used for the MLP experiment (GPU when available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Data loaders; only the training split is shuffled
batch_size = 256
_mlp_loader_opts = dict(batch_size=batch_size, num_workers=2, drop_last=False, pin_memory=True)
train_set = DemonstrationDataset("train")
train_loader = torch.utils.data.DataLoader(train_set, shuffle=True, **_mlp_loader_opts)
val_set = DemonstrationDataset("val")
val_loader = torch.utils.data.DataLoader(val_set, shuffle=False, **_mlp_loader_opts)

# Fresh run directory for this experiment
logger = Logger("logdir")

def train_and_evaluate(model, epochs=20):
    """Train ``model`` with Adam and report per-epoch losses.

    Uses the module-level ``device``, ``train_loader`` and ``val_loader``.
    Losses are summed over all samples and divided by the number of samples
    actually seen, so a smaller last batch does not bias the averages.

    Args:
        model: Network to train; it is moved to ``device``.
        epochs: Number of passes over the training data.

    Returns:
        Tuple ``(train_errors, validation_errors)``: lists with one mean
        per-sample cross-entropy loss (float) per epoch.
    """
    model = model.to(device)
    optimiser = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
    # reduction="sum" so batch losses can be accumulated and normalised once.
    cce_loss = nn.CrossEntropyLoss(reduction="sum")

    @torch.no_grad()
    def evaluate(network):
        """Return (mean per-sample loss, accuracy) on the validation loader."""
        network.eval()
        loss_sum, n_correct, n_seen = 0.0, 0, 0
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = network(inputs)
            loss_sum += cce_loss(outputs, labels).item()
            n_correct += (torch.argmax(outputs, 1) == labels).sum().item()
            n_seen += labels.size(0)
        n_seen = max(n_seen, 1)  # avoid division by zero on an empty loader
        return loss_sum / n_seen, n_correct / n_seen

    @torch.enable_grad()
    def update(network):
        """Run one training epoch and return the mean per-sample loss."""
        network.train()
        loss_sum, n_seen = 0.0, 0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimiser.zero_grad()
            outputs = network(inputs)
            loss = cce_loss(outputs, labels)
            loss_sum += loss.item()
            n_seen += labels.size(0)
            loss.backward()
            optimiser.step()
        return loss_sum / max(n_seen, 1)

    train_errors, validation_errors = [], []
    for epoch in range(epochs):
        train_loss = update(model)
        val_loss, val_acc = evaluate(model)

        train_errors.append(train_loss)
        validation_errors.append(val_loss)
        print(f"Epoch {epoch}: Train Loss = {train_loss}, Validation Loss = {val_loss}, Validation Accuracy = {val_acc}")

    return train_errors, validation_errors

# Instantiate the MLP policy
mlp_model = PolicyNetworkMLP(n_units_out=5)

# Train the MLP and plot both loss curves in one figure
print("Training Curves with MLP Policy Network")
mlp_train_errors, mlp_validation_errors = train_and_evaluate(mlp_model)
plt.figure(figsize=(7, 5))
for _curve, _tag in ((mlp_train_errors, 'Train'), (mlp_validation_errors, 'Valid')):
    plt.plot(_curve, label=_tag)
plt.title('MLP Network Training and Validation Losses')
plt.xlabel('Number of Epoch')
plt.ylabel('Cross Entropy Loss')
plt.legend()
plt.show()

# Evaluate the agent in the real environment

### Environment and Agent

We provide some wrappers you need in order to get the same states from the environment as in the demonstration dataset.
Additionally the `RecordState` wrapper should be very helpful in collecting new samples for the DAgger algorithm.

In [None]:
class CropObservation(gym.ObservationWrapper):
    """Observation wrapper that keeps only the leading rows and columns of each frame."""

    def __init__(self, env, shape):
        """Wrap ``env`` and declare the cropped observation space.

        Args:
            env: Environment to wrap.
            shape: Tuple ``(rows, cols)`` to keep; any further observation
                dimensions of ``env`` are preserved.
        """
        super().__init__(env)
        self.shape = shape
        trailing_dims = env.observation_space.shape[2:]
        self.observation_space = Box(
            low=0,
            high=255,
            shape=self.shape + trailing_dims,
            dtype=np.uint8,
        )

    def observation(self, observation):
        """Return the observation cropped to the first ``shape[0]`` rows and ``shape[1]`` columns."""
        n_rows, n_cols = self.shape[0], self.shape[1]
        return observation[:n_rows, :n_cols]


class RecordState(gym.Wrapper):
    """Wrapper that collects every observation returned by ``reset`` and ``step``.

    The collected observations are handed out (and the buffer emptied) by
    :meth:`render`.
    """

    def __init__(self, env: gym.Env, reset_clean: bool = True):
        """Wrap ``env``.

        Args:
            env: Environment to wrap; its ``render_mode`` must not be ``None``.
            reset_clean: If true, ``reset`` discards the observations collected so far.
        """
        super().__init__(env)

        assert env.render_mode is not None
        self.frame_list = []
        self.reset_clean = reset_clean

    def step(self, action, **kwargs):
        """Step the wrapped environment and record the returned observation."""
        transition = self.env.step(action, **kwargs)
        observation = transition[0]
        self.frame_list.append(observation)
        return transition

    def reset(self, *args, **kwargs):
        """Reset the wrapped environment and record the initial observation."""
        result = self.env.reset(*args, **kwargs)

        if self.reset_clean:
            self.frame_list = []
        initial_observation = result[0]
        self.frame_list.append(initial_observation)

        return result

    def render(self):
        """Return the observations recorded since the last call and start a new buffer."""
        frames, self.frame_list = self.frame_list, []
        return frames


class Agent():
    """Thin wrapper that samples actions from a policy model."""

    def __init__(self, model, device):
        """Store the policy ``model`` and the ``device`` its inputs are moved to."""
        self.model = model
        self.device = device

    def select_action(self, state):
        """Sample an action for a single state.

        Args:
            state: Array-like observation with pixel values in [0, 255]; a
                batch axis is added in front before it is passed to the model.

        Returns:
            The sampled action index as a numpy scalar.
        """
        with torch.no_grad():
            # np.asarray also accepts array-like containers; the tensor goes to
            # the agent's own device rather than a module-level global.
            state = torch.as_tensor(np.asarray(state), dtype=torch.float32)
            state = state.unsqueeze(0).to(self.device) / 255.0 # rescale
            logits = self.model(state)
            # Some models return a tuple; the logits are its first element.
            if isinstance(logits, tuple):
                logits = logits[0]
            probs = Categorical(logits=logits)
            return probs.sample().cpu().numpy()[0]


def make_env(seed, capture_video=True):
    """Build the discrete CarRacing-v2 environment with the preprocessing wrappers.

    The wrapper chain records episode statistics, optionally records videos
    into ``logger.video_dir``, crops to 84x96, resizes to 84x84, converts to
    grayscale, records states and stacks 4 frames. The environment is reset
    with ``seed`` and its action and observation spaces are seeded as well.

    Args:
        seed: Seed passed to ``reset`` and to both spaces.
        capture_video: Whether to add the video recording wrapper.

    Returns:
        The fully wrapped environment.
    """
    wrappers = [gym.wrappers.RecordEpisodeStatistics]
    if capture_video:
        wrappers.append(lambda e: gym.wrappers.RecordVideo(e, logger.video_dir))
    wrappers.extend([
        lambda e: CropObservation(e, (84, 96)),
        lambda e: gym.wrappers.ResizeObservation(e, (84, 84)),
        gym.wrappers.GrayScaleObservation,
        lambda e: RecordState(e, reset_clean=True),
        lambda e: gym.wrappers.FrameStack(e, 4),
    ])

    env = gym.make("CarRacing-v2", render_mode="rgb_array", continuous=False)
    # Wrappers are applied innermost-first, in list order.
    for wrap in wrappers:
        env = wrap(env)

    env.reset(seed=seed)
    for space in (env.action_space, env.observation_space):
        space.seed(seed)
    return env


def run_episode(agent, show_progress=True, capture_video=True, seed=None):
    """Run one episode with ``agent`` and return the accumulated reward.

    Args:
        agent: Object with ``select_action(state)``; it receives the most
            recent frame of the stacked state with a leading axis added.
        show_progress: Show a tqdm progress bar with the running score.
        capture_video: Record the episode and display the video afterwards.
        seed: Seed forwarded to ``make_env``.

    Returns:
        Sum of the rewards collected until the episode terminated or was truncated.
    """
    env = make_env(seed=seed, capture_video=capture_video)
    progress = tqdm(desc="Score: 0") if show_progress else None
    score = 0
    # try/finally guarantees the environment and the progress bar are closed
    # even if the agent or the environment raises mid-episode.
    try:
        state, _ = env.reset()
        done = False
        while not done:
            action = agent.select_action(state[-1][np.newaxis, ...])
            state, reward, terminated, truncated, _ = env.step(action)
            score += reward
            done = terminated or truncated
            if progress is not None:
                progress.update()
                progress.set_description("Score: {:.2f}".format(score))
    finally:
        env.close()
        if progress is not None:
            progress.close()

    if capture_video:
        show_video(logger.video_dir)

    return score

## Evaluate behavioral cloning agent

Let's see how the agent is doing in the real environment

In [None]:
# Run a single recorded episode with the behavioural-cloning policy
train_policy = Agent(model, device)
score = run_episode(train_policy, show_progress=True, capture_video=True)
# Print the score instead of leaving it as a bare string expression.
print(f"Score: {score:.2f}")

Since we often have high variance when evaluating RL agents we should evaluate the agent multiple times to get a better feeling for its performance.

In [None]:
# Evaluate over several episodes to average out the variance of single runs
train_policy = Agent(model, device)
n_eval_episodes = 10
scores = []
for i in tqdm(range(n_eval_episodes), desc="Episode"):
    scores.append(run_episode(train_policy, show_progress=False, capture_video=False))
    # A bare string inside the loop is a no-op, so the score must be printed.
    print("Score: %d" % scores[-1])
print("Mean Score: %.2f (Std: %.2f)" %(np.mean(scores), np.std(scores)))

# DAGGER

Now we can implement DAgger. You have already downloaded a relatively well-trained model that you can use as the expert for this purpose.

Load expert model that is provided as ONNX file.

## Load the expert

In [None]:
# Load the expert from its ONNX file, convert it to a PyTorch module and move it to the device
expert_model = ConvertModel(onnx.load("expert.onnx")).to(device)
# Freeze expert weights
expert_model.requires_grad_(False)

expert_policy = Agent(expert_model, device)

In [None]:
# Uncomment to run DAgger with a fresh MLP policy instead of the current model
# model = PolicyNetworkMLP(n_units_out=5)
# Make sure the current model lives on the selected device
model = model.to(device)
# Agent wrapper used to sample actions from the policy being trained
train_policy = Agent(model, device)

Next, you have to implement the DAgger algorithm (see slides for details). This function implements the core idea of DAgger:


1. Choose the policy with probability beta
2. Sample T-step trajectories using this policy
3. Label the gathered states with the expert

The aggregation and training part are already implemented.

CODE CHATGPT Test1

In [None]:
# inner loop of DAgger
# def dagger(env, train_policy, expert_policy, dataset, beta=1.):

    ######################
    ### YOUR CODE HERE ###
    ######################

    # Implement DAgger algorithm here
    # 1) Choose a policy (sample according to beta)
    # 2) Sample T-step trajectory with the chosen policy
    #    (T can be an entire episode or a single state, think about what makes more sense here and implement it accordingly)
    # 3) Label the state (or states) with your expert if they come from your training policy

    #### Note ####
    # To get an action for the current state from your training policy or expert policy:
    # action = policy.select_action(state)
    #
    # Your training policy requires a single grayscale state while
    # the expert policy requires four stacked grayscale states
    # You can prepare your state for the policy like so:
    # Train policy:
    #      np.expand_dims(state[-1], 0)
    # Expert policy:
    #      state


    # Due to the RecordState wrapper you can get the states from the environment by calling
    # env.render()
    # Doing so will clear the list and the next time you call .render() will return the new states since the last call.
    # Note: be careful with the last state

    # Finally, add collected states and the actions the expert would execute in them to the dataset
    # dataset.append(states, actions)
    # Choose a policy (sample according to beta)
    # policy = train_policy if np.random.rand() < beta else expert_policy
    # Choose a policy

    # """
    # Implements the DAgger algorithm to iteratively collect training data using a combination of the train and expert policies.

    # :param env: The environment object with methods reset() and step(action).
    # :param train_policy: The policy being trained.
    # :param expert_policy: The expert policy used to provide the correct actions.
    # :param dataset: The dataset to which new state-action pairs are added.
    # :param beta: The probability of choosing the expert policy at each decision point.
    # """
    # state = env.reset()
    # done = False
    # while not done:
    #     # Decide whether to use the expert or the trained policy
    #     use_expert = np.random.rand() < beta
    #     if use_expert:
    #         # Expert policy uses the full state
    #         action = expert_policy.select_action(state)
    #     else:
    #         # Training policy might require a differently formatted state
    #         # Assuming it uses the last state if the environment states are a time series
    #         formatted_state = np.expand_dims(state[-1], 0) if state.ndim > 1 else state
    #         action = train_policy.select_action(formatted_state)

    #     # Take the action in the environment
    #     next_state, reward, done, _ = env.step(action)

    #     # Get the expert's action for the current state (for training purposes)
    #     expert_action = expert_policy.select_action(state)

    #     # Append the current state and the expert's action to the dataset
    #     dataset.append((state, expert_action))

    #     # Update the current state
    #     state = next_state

    # return dataset

def dagger(env, train_policy, expert_policy, dataset, beta=1.):
    """Collect one episode of DAgger samples and add them to ``dataset``.

    At every step the expert acts with probability ``beta``, otherwise the
    training policy acts. Every visited state is labelled with the expert's
    action regardless of which policy acted.

    Args:
        env: Environment following the gymnasium API, i.e. ``reset()`` returns
            ``(observation, info)`` and ``step()`` returns
            ``(observation, reward, terminated, truncated, info)``.
            Observations are indexed with ``[-1]`` to get the most recent frame.
        train_policy: Policy being trained; receives the most recent frame
            with a leading axis added.
        expert_policy: Expert policy; receives the full observation.
        dataset: Object with ``append(states, actions)``.
        beta: Probability of letting the expert act at each step.

    Returns:
        The ``dataset`` that was passed in, after the new samples were added.
    """
    state, _ = env.reset()
    frames, labels = [], []
    done = False
    while not done:
        # The expert's choice is the training label, whoever ends up acting.
        expert_action = expert_policy.select_action(state)
        if np.random.rand() < beta:
            action = expert_action
        else:
            action = train_policy.select_action(np.expand_dims(state[-1], 0))

        # Store only the most recent frame, which is what the train policy sees.
        frames.append(np.asarray(state[-1]))
        labels.append(expert_action)

        state, _, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

    # The final observation is not stored because no action is taken in it.
    dataset.append(np.stack(frames), np.asarray(labels))
    return dataset


Put everything together now.
1. Create new samples using the DAgger algorithm
2. Continue training your agent
3. Export your fully trained agent as an ONNX file

In [None]:
import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import DataLoader

# Initialize environment and model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env = make_env(seed=42, capture_video=False)
# `Model` is not defined anywhere in this notebook; use the CNN policy class.
model = PolicyNetwork(n_units_out=5).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss()
logger = Logger("/content/DRL/logdir_dagger")

# Data loading
train_set = DemonstrationDataset("train")
train_loader = DataLoader(train_set, batch_size=32, shuffle=True, num_workers=2, pin_memory=True)
val_set = DemonstrationDataset("val")
val_loader = DataLoader(val_set, batch_size=32, shuffle=False, num_workers=2, pin_memory=True)

# Training setup
epochs = 50  # Set the number of training epochs
beta = 0.5  # Set the DAgger parameter beta
# NOTE(review): `env` and `beta` are set up here, but this cell never collects
# new DAgger samples; it only trains on the existing "train" data.

# Training and validation loop
for epoch in range(epochs):
    model.train()
    train_loss = 0.0
    for obs, action in train_loader:
        # nn.Module has no `.device` attribute, so use the selected device.
        obs = obs.float().to(device)
        action = action.long().to(device)
        optimizer.zero_grad()

        logits = model(obs)
        loss = criterion(logits, action)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        logger.log("training_loss", loss.item())

    # Mean of the per-batch losses of this epoch
    average_train_loss = train_loss / len(train_loader)
    logger.log("average_training_loss", average_train_loss)

    # Validation phase
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for obs, action in val_loader:
            obs = obs.float().to(device)
            action = action.long().to(device)
            logits = model(obs)
            loss = criterion(logits, action)
            val_loss += loss.item()

    average_val_loss = val_loss / len(val_loader)
    logger.log("average_validation_loss", average_val_loss)
    # Flush the new log rows to disk after every epoch
    logger.dump()

print("Training and validation complete. Logs are saved.")

## GH Training code

In [None]:
import torch.optim as optim

# Supervised training cell: trains the existing `model` on the "train"
# demonstrations, validates on "val" after every epoch and exports to ONNX.

# Specify the google drive mount here if you want to store logs and weights there (and set it up earlier)
logger = Logger("/content/DRL/logdir_dagger")
print("Saving state to {}".format(logger.basepath))

# start environment
env = make_env(seed=42, capture_video=False)
# NOTE(review): `dataset` is not used anywhere below, and "expert.onnx" looks like
# a model file rather than a dataset split -- confirm whether this line is needed.
dataset = DemonstrationDataset("expert.onnx")
batch_size = 32

# Datasets
train_set = DemonstrationDataset("train")
train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, num_workers=2, shuffle=True, drop_last=False, pin_memory=True)
val_set = DemonstrationDataset("val")
val_loader = torch.utils.data.DataLoader(val_set, batch_size=batch_size, num_workers=2, shuffle=False, drop_last=False, pin_memory=True)
# Training
######################
beta = 0.5  # Set the DAgger parameter beta (not used by this cell)
learning_rate = 0.0001
epochs = 2  # Number of training epochs (the value that was effectively in use)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()


for epoch in range(epochs):
    # Train for one epoch. The validation phase below switches the model to eval
    # mode, so training mode has to be re-enabled at the start of every epoch.
    model.train()
    train_loss = 0.0
    for obs, action in train_loader:
        obs = obs.float().to(device)
        optimizer.zero_grad()

        # Forward pass
        logits = model(obs)
        # Compute loss and backpropagation; targets keep the dtype the dataset yields
        loss = criterion(logits, action.to(device))
        loss.backward()
        optimizer.step()

        # Log training loss
        train_loss += loss.item()
        logger.log("training_loss", loss.item())

    # Compute and report average training loss for epoch
    train_loss /= len(train_loader)
    print("training_loss", train_loss)

    # Evaluate on validation set
    val_loss = 0.0
    model.eval()
    with torch.no_grad():
        for obs, action in val_loader:
            obs = obs.float().to(device)

            # Forward pass
            logits = model(obs)

            # Compute loss
            loss = criterion(logits, action.to(device))

            # Log validation loss
            val_loss += loss.item()
            logger.log("validation_loss", loss.item())

    # Compute and report average validation loss for epoch
    val_loss /= len(val_loader)
    print("validation_loss", val_loss)

    # Log epoch statistics
    logger.log("epoch", epoch+1)
    logger.log("train_loss", train_loss)
    logger.log("val_loss", val_loss)
    logger.dump()

# `sample_state` is expected to exist from an earlier cell.
save_as_onnx(model, sample_state, logger.onnx_file)
env.close()

## DAgger training code: ChatGPT ????

In [None]:
import torch.optim as optim
import torch.nn as nn

# Supervised training cell: trains the existing `model` on demonstration data,
# validates after every epoch and exports the result to ONNX.

# Initialization and logger setup
logger = Logger("/content/DRL/logdir_dagger")
print("Saving state to {}".format(logger.basepath))

# Start environment and dataset setup
env = make_env(seed=42, capture_video=False)
# NOTE(review): the sibling training cells load the splits as "train" / "val";
# confirm that "train_data" / "val_data" are valid names for DemonstrationDataset.
train_set = DemonstrationDataset("train_data")
val_set = DemonstrationDataset("val_data")
batch_size = 32
train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, num_workers=2, shuffle=True, pin_memory=True)
val_loader = torch.utils.data.DataLoader(val_set, batch_size=batch_size, num_workers=2, shuffle=False, pin_memory=True)

# Training parameters
beta = 0.5  # Set the DAgger parameter beta (not used by this cell)
learning_rate = 0.0001
epochs = 10
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# Training loop
for epoch in range(epochs):
    # Training phase
    model.train()
    train_loss = 0.0
    for obs, action in train_loader:
        obs = obs.float().to(device)
        action = action.long().to(device)
        optimizer.zero_grad()
        logits = model(obs)
        loss = criterion(logits, action)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        logger.log("training_loss", loss.item())
    train_loss /= len(train_loader)
    print("training_loss", train_loss)

    # Validation phase
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for obs, action in val_loader:
            obs = obs.float().to(device)
            action = action.long().to(device)
            logits = model(obs)
            loss = criterion(logits, action)
            val_loss += loss.item()
            logger.log("validation_loss", loss.item())
    val_loss /= len(val_loader)
    print("validation_loss", val_loss)

    logger.log("epoch", epoch+1)
    logger.log("train_loss", train_loss)
    logger.log("val_loss", val_loss)

# Finalize and save
# Gymnasium's reset() returns an (observation, info) pair; only the observation
# is a valid model input. np.asarray also materializes frame-stack wrappers.
sample_state, _ = env.reset()
sample_input = torch.tensor(np.asarray(sample_state)).unsqueeze(0).float().to(device)
save_as_onnx(model, sample_input, logger.onnx_file)
env.close()
logger.dump()

In [None]:
import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import DataLoader

# DAgger cell (two phases): first roll out a beta-mixture of expert and learner
# to aggregate expert-labelled states, then train the learner on the result.

# Assuming these are defined elsewhere in your codebase
# from your_module import make_env, DemonstrationDataset, Model, Logger

# Environment and Model setup (`model` is expected to exist from an earlier cell)
env = make_env(seed=42, capture_video=False)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Initialization and logger setup
logger = Logger("/content/DRL/logdir_dagger")
print("Saving state to {}".format(logger.basepath))

batch_size = 32
# Datasets
train_set = DemonstrationDataset("train")
train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, num_workers=2, shuffle=True, drop_last=False, pin_memory=True)
val_set = DemonstrationDataset("val")
val_loader = torch.utils.data.DataLoader(val_set, batch_size=batch_size, num_workers=2, shuffle=False, drop_last=False, pin_memory=True)

# Hyperparameters -- all defined before their first use so the cell does not
# depend on values left over from previously executed cells.
learning_rate = 0.0001
epochs = 10
num_episodes = 20  # Rollout episodes per data-collection round
decay_rate = 0.95  # Multiplicative decay of beta per round (must be < 1)
beta = 1.0  # Start with full reliance on expert

# Data aggregation phase
model.eval()
for epoch in range(epochs):
    new_data = []
    for episode in range(num_episodes):
        # Gymnasium's reset() returns (observation, info).
        state, _ = env.reset()
        done = False
        while not done:
            # DAgger labels every visited state with the expert's action, no
            # matter which policy chose the action that is actually executed.
            # NOTE(review): assumes expert_action(state) returns a single discrete
            # action index -- confirm against its definition.
            expert_label = int(expert_action(state))
            if np.random.rand() > beta:
                # Learner acts: the model outputs logits (it is trained with
                # CrossEntropyLoss below), so take the arg-max class as action.
                with torch.no_grad():
                    obs = torch.as_tensor(np.asarray(state), dtype=torch.float32, device=device).unsqueeze(0)
                    action = int(model(obs).argmax(dim=-1).item())
            else:
                action = expert_label
            # Gymnasium's step() returns five values; the episode is over when
            # it either terminates or is truncated.
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            new_data.append((state, expert_label))
            state = next_state
    # Update the dataset with new examples
    # NOTE(review): assumes DemonstrationDataset.update accepts a list of
    # (observation, action) pairs -- confirm against the class definition.
    train_set.update(new_data)
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    # Decrease beta
    beta *= decay_rate

# Training parameters
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# Training loop
for epoch in range(epochs):
    # Training phase
    model.train()
    train_loss = 0.0
    for obs, action in train_loader:
        obs = obs.float().to(device)
        action = action.long().to(device)
        optimizer.zero_grad()
        logits = model(obs)
        loss = criterion(logits, action)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        logger.log("training_loss", loss.item())
    average_train_loss = train_loss / len(train_loader)
    print("Average Training Loss:", average_train_loss)

    # Validation phase
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for obs, action in val_loader:
            obs = obs.float().to(device)
            action = action.long().to(device)
            logits = model(obs)
            loss = criterion(logits, action)
            val_loss += loss.item()
    average_val_loss = val_loss / len(val_loader)
    print("Average Validation Loss:", average_val_loss)

    logger.log("epoch", epoch + 1)
    logger.log("average_train_loss", average_train_loss)
    logger.log("average_val_loss", average_val_loss)

# Finalize and save
# reset() returns (observation, info); only the observation is a model input.
sample_state, _ = env.reset()
sample_input = torch.tensor(np.asarray(sample_state)).unsqueeze(0).float().to(device)
save_as_onnx(model, sample_input, logger.onnx_file)
env.close()
logger.dump()

In [None]:
import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import DataLoader

# DAgger cell (interleaved): each iteration rolls out one episode with a
# beta-mixture of expert and learner, aggregates the expert-labelled states into
# the training set, and then trains and validates the learner for one pass.

# Assuming these are defined elsewhere in your codebase
# from your_module import make_env, DemonstrationDataset, Model, Logger

# Environment and Model setup (`model` is expected to exist from an earlier cell)
env = make_env(seed=42, capture_video=False)
# The device has to be resolved before the model can be moved onto it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Logger setup
logger = Logger("/content/DRL/logdir_dagger")
print("Saving state to {}".format(logger.basepath))

# Dataset setup
batch_size = 32
train_set = DemonstrationDataset("train")
train_loader = DataLoader(train_set, batch_size=batch_size, num_workers=2, shuffle=True, drop_last=False, pin_memory=True)
val_set = DemonstrationDataset("val")
val_loader = DataLoader(val_set, batch_size=batch_size, num_workers=2, shuffle=False, drop_last=False, pin_memory=True)

# Training parameters
learning_rate = 0.0001
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# DAgger specific parameters
beta = 1.0  # Start with full reliance on expert
decay_rate = 0.95  # Decay rate for beta
num_episodes = 20  # Number of DAgger iterations (one rollout episode each)

for episode in range(num_episodes):
    # ---- Rollout phase: collect expert-labelled states for this iteration ----
    new_data = []
    model.eval()
    # Gymnasium's reset() returns (observation, info); step() later returns the
    # bare observation, so the tuple is unpacked here and nowhere else.
    state, _ = env.reset()
    done = False
    while not done:
        # np.asarray also materializes frame-stack wrappers such as LazyFrames.
        frames_array = np.asarray(state)
        frames_tensor = torch.as_tensor(frames_array, dtype=torch.float32, device=device)

        # Fail loudly on an unexpected shape: skipping the step would leave
        # `state` unchanged and spin in this loop forever.
        if frames_tensor.ndim != 3:
            raise ValueError(f"Unexpected observation shape: {tuple(frames_tensor.shape)}")
        frames_tensor = frames_tensor.unsqueeze(0)  # Add the batch dimension

        formatted_state = frames_tensor

        # DAgger labels every visited state with the expert's action, no matter
        # which policy chose the action that is actually executed.
        # NOTE(review): assumes expert_policy.select_action returns the discrete
        # action index (tensor or scalar) -- confirm against its definition.
        expert_act = expert_policy.select_action(formatted_state)
        use_expert = np.random.rand() < beta
        if use_expert:
            action = expert_act
        else:
            # The model outputs logits (it is trained with CrossEntropyLoss
            # below), so the executed action is the arg-max class.
            with torch.no_grad():
                action = model(formatted_state).argmax(dim=-1)

        action_to_env = torch.as_tensor(action).squeeze().cpu().numpy()
        # Gymnasium's step() returns five values; the episode is over when it
        # either terminates or is truncated.
        next_state, reward, terminated, truncated, _ = env.step(action_to_env)
        done = terminated or truncated

        new_data.append((frames_array, torch.as_tensor(expert_act).squeeze().cpu().numpy()))
        state = next_state

    # Update the dataset with new examples
    # NOTE(review): assumes DemonstrationDataset.update accepts a list of
    # (observation, action) pairs -- confirm against the class definition.
    train_set.update(new_data)
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)

    # Decrease beta
    beta *= decay_rate

    # ---- Training phase ----
    model.train()
    train_loss = 0.0
    for obs, action in train_loader:
        obs = obs.float().to(device)
        action = action.long().to(device)
        optimizer.zero_grad()
        logits = model(obs)
        loss = criterion(logits, action)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        logger.log("training_loss", loss.item())
    average_train_loss = train_loss / len(train_loader)
    print(f"Average Training Loss: {average_train_loss}")

    # ---- Validation phase ----
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for obs, action in val_loader:
            obs = obs.float().to(device)
            action = action.long().to(device)
            logits = model(obs)
            loss = criterion(logits, action)
            val_loss += loss.item()
    average_val_loss = val_loss / len(val_loader)
    print(f"Average Validation Loss: {average_val_loss}")

    # This cell has no `epoch` loop variable; the DAgger iteration index is
    # logged under the same key instead.
    logger.log("epoch", episode + 1)
    logger.log("average_train_loss", average_train_loss)
    logger.log("average_val_loss", average_val_loss)

# Finalize and save
# reset() returns (observation, info); only the observation is a model input.
sample_state, _ = env.reset()
model.eval()
with torch.no_grad():
    sample_input = torch.tensor(np.asarray(sample_state)).unsqueeze(0).float().to(device)
    save_as_onnx(model, sample_input, logger.onnx_file)
env.close()
logger.dump()


In [None]:
# Debug cell: show the shapes of the most recent observation array and of the
# tensor built from it (both are left behind by the DAgger rollout cell).
for shape_label, shaped_obj in (
    ("frames_array shape:", frames_array),
    ("frames_tensor shape:", frames_tensor),
):
    print(shape_label, shaped_obj.shape)


In [None]:
# Debug cell: feed the most recent `formatted_state` to the learner and then to
# the expert, printing its shape right before each call.
model_msg = "Before model input, formatted_state shape:"
expert_msg = "Before expert_policy input, formatted_state shape:"

print(model_msg, formatted_state.shape)
action = model(formatted_state)

print(expert_msg, formatted_state.shape)
action = expert_policy.select_action(formatted_state)


In [None]:
# Evaluation cell: run `train_policy` for a fixed number of episodes and report
# every episode score plus the mean and standard deviation over all of them.
n_eval_episodes = 10
scores = []
for _episode in tqdm(range(n_eval_episodes), desc="Episode"):
    scores.append(run_episode(train_policy, show_progress=False, capture_video=False))
    # A bare string expression inside a loop is silently discarded, so the score
    # has to be printed explicitly.
    print("Score: %d" % scores[-1])
print("Mean Score: %.2f (Std: %.2f)" % (np.mean(scores), np.std(scores)))