In [1]:
# Kaggle does not automatically have this
!pip install torch_geometric -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h

In [2]:
import os
import sys
from tqdm import tqdm
from datetime import datetime
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from sklearn.model_selection import TimeSeriesSplit

import torch
import torch.nn as nn
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch_geometric.data import Data, Batch

In [3]:
# Make sure the output folder for submission CSVs is present
submission_dir = './submission'
os.makedirs(submission_dir, exist_ok=True)

# Uncomment the following block ONLY if you wish to inspect file paths in a Kaggle-like directory structure.
# On your local system, you likely have the files in your local folder so this is not needed.
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))


# Data loading
# On Kaggle the files are read from the competition input mount; locally they are assumed to be in:
# ./cse-251-b-2025/train.npz
# ./cse-251-b-2025/test_input.npz

running_on_kaggle = True

# Resolve both archive paths first, then load them in one place
_train_path, _test_path = (
    ("/kaggle/input/cse-251-b-2025/train.npz", "/kaggle/input/cse-251-b-2025/test_input.npz")
    if running_on_kaggle else
    ("./cse-251-b-2025/train.npz", "./cse-251-b-2025/test_input.npz")
)
train_file = np.load(_train_path)
test_file = np.load(_test_path)

# Each archive stores its array under the 'data' key
train_data, test_data = train_file['data'], test_file['data']

print("train_data's shape:", train_data.shape)  # Expected shape: (10000, 50, 110, 6)
print("test_data's shape:", test_data.shape)    # Expected shape: (2100, 50, 50, 6)

train_data's shape: (10000, 50, 110, 6)
test_data's shape: (2100, 50, 50, 6)


# Visualization

In [4]:
# Toggle: set to True to render the static trajectory plot and the animated GIF
# at the end of this cell (both are skipped while this is False).
run_visualizations: bool = False

# From data loading notebook
def plot_one_training_scene(idx: int = 0):
    """Plot the (x, y) trajectory of every agent in one training scene as a static figure.

    Args:
        idx: Index of the scene inside the module-level ``train_data`` array.
    """
    data_matrix = train_data[idx]  # one scene, indexed as [agent, timestep, feature]

    plt.figure(figsize=(8, 8))
    # Iterate over the agent axis (axis 0). The scene index must not be used to pick
    # the axis: shape[idx] only happened to be correct for idx == 0.
    for agent in range(data_matrix.shape[0]):
        xs = data_matrix[agent, :, 0]
        ys = data_matrix[agent, :, 1]
        # Remove zeros (padding) with one joint mask so xs and ys always keep the same
        # length (same rule as make_gif uses).
        valid = (xs != 0) & (ys != 0)
        plt.plot(xs[valid], ys[valid], alpha=0.7)
    plt.title("Trajectories from one training scene")
    plt.xlabel("x-coordinate")
    plt.ylabel("y-coordinate")
    plt.show()

# Create an animated gif for one training scene (exact code provided on kaggle)
def make_gif(data_matrix, name='example'):
    """Animate one scene and save it as ``trajectory_visualization_{name}.gif``.

    Args:
        data_matrix: One scene indexed as [agent, timestep, feature]; features 0 and 1
            are used as x and y. Agent 0 is drawn as the ego vehicle. Coordinates equal
            to zero are treated as padding and skipped.
        name: Suffix used in the output file name.
    """
    if sys.version_info.minor <= 7:
        cmap = plt.cm.get_cmap("viridis", 50)
    else:
        cmap = plt.get_cmap("viridis", 50)

    # The axis limits depend only on the whole scene, so compute them once instead of
    # re-scanning the full array on every frame.
    xs_all = data_matrix[:, :, 0]
    ys_all = data_matrix[:, :, 1]
    xs_valid = xs_all[xs_all != 0]
    ys_valid = ys_all[ys_all != 0]
    x_limits = (xs_valid.min() - 10, xs_valid.max() + 10)
    y_limits = (ys_valid.min() - 10, ys_valid.max() + 10)

    fig, ax = plt.subplots(figsize=(10, 10))

    # Function to update plot for each frame
    def update(frame):
        ax.clear()
        # Draw every non-ego agent that is present at the current timestep
        for i in range(1, data_matrix.shape[0]):
            x = data_matrix[i, frame, 0]
            y = data_matrix[i, frame, 1]
            if x != 0 and y != 0:
                xs = data_matrix[i, :frame+1, 0]  # Include current frame
                ys = data_matrix[i, :frame+1, 1]  # Include current frame
                # Only keep points where both x and y are non-zero
                mask = (xs != 0) & (ys != 0)
                xs = xs[mask]
                ys = ys[mask]
                # Only plot if we have points to plot
                if len(xs) > 0 and len(ys) > 0:
                    color = cmap(i)
                    ax.plot(xs, ys, alpha=0.9, color=color)
                    ax.scatter(x, y, s=80, color=color)
        # Ego trail: include the current frame so the line reaches the ego marker,
        # matching how the other agents' trails are drawn.
        ax.plot(data_matrix[0, :frame+1, 0], data_matrix[0, :frame+1, 1],
                color='tab:orange', label='Ego Vehicle')
        ax.scatter(data_matrix[0, frame, 0], data_matrix[0, frame, 1],
                   s=80, color='tab:orange')
        # Set title with timestep
        ax.set_title(f'Timestep {frame}')
        # Set consistent axis limits
        ax.set_xlim(*x_limits)
        ax.set_ylim(*y_limits)
        ax.legend()
        # Convert explicitly to lists so the concatenation does not depend on the
        # container type matplotlib uses for these attributes.
        return list(ax.collections) + list(ax.lines)

    # Create animation (every third timestep)
    anim = animation.FuncAnimation(fig, update, frames=list(range(0, data_matrix.shape[1], 3)),
                                   interval=100, blit=True)
    # Save as GIF
    anim.save(f'trajectory_visualization_{name}.gif', writer='pillow')
    plt.close(fig)

# Render the static plot and the GIF for training scene 0 only when the flag is enabled
if run_visualizations:
    plot_one_training_scene(0)
    make_gif(train_data[0], 'index0')

# Constant velocity from test set
Untouched from original data loading notebook.

In [5]:
# Run constant velocity model (Kaggle score of ~50)?
run_constant_velocity_model: bool = False

if run_constant_velocity_model:
    # Per-step displacement of (x, y) for every agent in every scene;
    # the ego vehicle (agent index 0) is selected from this in the next step
    velocity_diff = test_data[..., 1:, :2] - test_data[..., :-1, :2]
    print("Velocity difference shape:", velocity_diff.shape)

    # Compute average velocity for the ego vehicle (index 0) in each scene
    constant_vel = np.mean(velocity_diff[:, 0, :, :], axis=1)
    print("Constant velocity shape:", constant_vel.shape)

    # Generate predictions for 60 future time steps based on constant velocity
    pred_y_const = np.zeros((test_data.shape[0], 60, 2))
    starting_point = test_data[:, 0, -1, :2]  # Last observed position of ego vehicle

    # Step t (0-based) is (t + 1) average displacements past the last observed position
    for t in range(60):
        pred_y_const[:, t, :] = starting_point + (t + 1) * constant_vel

    # Reshape predictions to submission format: (N, 60, 2) -> (N * 60, 2),
    # i.e. (2100, 60, 2) -> (126000, 2) for the test set
    pred_output_const = pred_y_const.reshape(-1, 2)
    output_df_const = pd.DataFrame(pred_output_const, columns=['x', 'y'])
    output_df_const.index.name = 'index'
    # Save output in the submission folder
    constant_vel_path = os.path.join(submission_dir, 'constant_vel_submission.csv')
    output_df_const.to_csv(constant_vel_path)
    print(f"Constant velocity submission saved locally as '{constant_vel_path}'.")

# Our Work

In [6]:
# MLP model with residual blocks: ineffective for TimeSeries data
class BasicMLP(nn.Module):
    """Fully connected baseline: flattened scene -> MLP backbone -> residual block -> head.

    Args:
        input_features: Number of values per flattened scene (e.g. 50 * 50 * 6).
        output_features: Size of the final projection; the output is reshaped to
            (batch, output_features // 2, 2), i.e. (batch, 60, 2) for 120.
    """
    def __init__(self, input_features, output_features):
        super().__init__()
        # Remembered so forward() can flatten to the width the first layer expects
        self.input_features = input_features

        # Backbone: linear -> BatchNorm -> PReLU -> Dropout
        self.net = nn.Sequential(
            nn.Linear(input_features, 1024),
            nn.BatchNorm1d(1024),
            nn.PReLU(),
            nn.Dropout(0.2),

            nn.Linear(1024, 512),
            nn.BatchNorm1d(512),
            nn.PReLU(),
            nn.Dropout(0.2),

            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.PReLU(),
            nn.Dropout(0.2),
        ) # Note: residual width must match the last width of the net

        # Residual block added to avoid vanishing gradient issue.
        # All widths are known (256), so regular Linear layers are used: lazy layers keep
        # uninitialized parameters until the first forward pass, which is too late for an
        # optimizer that is constructed right after the model.
        self.residual = nn.Sequential(
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
        )

        # Final projection (60*2 for the competition output)
        self.head = nn.Linear(256, output_features)

    def forward(self, x):
        """Map a batch of scenes (tensor or object exposing ``.x``) to (batch, steps, 2)."""
        # In case you passed in a DataBatch
        if not isinstance(x, torch.Tensor):
            x = x.x

        # Flatten each scene to one row, whatever layout the scenes were stacked in
        x = x.reshape(-1, self.input_features)
        x = self.net(x)
        x = x + self.residual(x)  # residual skip
        x = self.head(x)
        return x.view(x.size(0), -1, 2)

In [7]:
# Base LSTM given to us in the milestone notebook
class BaseLSTM(nn.Module):
    """LSTM over the ego agent's 50-step history followed by a linear read-out.

    The input is interpreted as scenes of 50 agents x 50 timesteps x 6 features; only
    agent 0 is fed to the LSTM. The output is reshaped to (batch, 60, 2).
    """
    def __init__(self, input_dim:int =6, hidden_dim:int =128, output_dim:int =60 * 2, dropout:float = 0):
        super().__init__()
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim,
                            batch_first=True, dropout=dropout)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        # Accept either a raw tensor or a DataBatch-like object carrying the tensor in `.x`
        scenes = x if isinstance(x, torch.Tensor) else x.x

        # (batch_size, num_agents, seq_len, input_dim) -> keep the ego agent (index 0) only
        ego_history = scenes.reshape(-1, 50, 50, 6)[:, 0]

        hidden_states, _ = self.lstm(ego_history)
        # hidden_states is (batch_size, seq_len, hidden_dim); read out the last time step
        last_step = hidden_states[:, -1]
        return self.fc(last_step).view(-1, 60, 2)

In [8]:
# Multi agent scene context model
class SceneContextModel(nn.Module):
    """Multi-agent model: ego encoding concatenated with a mean-pooled scene context.

    Every agent's flattened history (50 timesteps x 6 features) is encoded by a shared
    MLP and averaged over agents; the ego agent (index 0) gets its own encoder. The
    decoder maps the concatenation to 60 future (x, y) points.

    Args:
        hidden_dim: Width of both encoders and of the decoder's hidden layer.
    """
    def __init__(self, hidden_dim=128):
        super().__init__()
        self.agent_encoder = nn.Sequential(
            nn.Linear(50 * 6, hidden_dim),
            nn.ReLU()
        )
        self.ego_encoder = nn.Sequential(
            nn.Linear(50 * 6, hidden_dim),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(2 * hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 60 * 2)
        )

    def forward(self, x_flat):
        """Return predictions of shape (B, 60, 2).

        ``x_flat`` may be a tensor holding B scenes in any layout that reshapes to
        (B, 50, 50, 6) -- e.g. (B, 15000) or agents stacked as (B * 50, 50, 6) -- or a
        DataBatch-like object exposing such a tensor as ``.x``.
        """
        # In case you passed in a DataBatch
        if not isinstance(x_flat, torch.Tensor):
            x_flat = x_flat.x

        # Reshape first and only then read the batch size: size(0) of the incoming tensor
        # is B * 50 when agents are stacked along dim 0. reshape (not view) also accepts
        # non-contiguous inputs.
        x = x_flat.reshape(-1, 50, 50, 6) #(B, agents, timesteps, features)
        B = x.size(0)

        x_agents = x.reshape(B, 50, -1)  #(B, 50, 300)
        agent_feats = self.agent_encoder(x_agents) #(B, 50, hidden_dim)
        scene_context = agent_feats.mean(dim=1) #(B, hidden_dim)

        ego_input = x[:, 0, :, :].reshape(B, -1) #(B, 300)
        ego_feat = self.ego_encoder(ego_input) #(B, hidden_dim)

        combined = torch.cat([ego_feat, scene_context], dim=1) #(B, 2 * hidden_dim)

        out = self.decoder(combined) #(B, 120)
        return out.view(-1, 60, 2)

In [9]:
# Extended from the base LSTM model
class LSTMWithMLP(nn.Module):
    """BaseLSTM variant whose last hidden state goes through a small MLP before the read-out.

    Pipeline: ego history -> LSTM -> (Linear, BatchNorm, LeakyReLU) x 2 -> Linear,
    reshaped to (batch, 60, 2).
    """
    def __init__(self, input_dim:int =6, hidden_dim:int =128, output_dim:int =60 * 2, dropout:float = 0):
        super().__init__()
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim,
                            batch_first=True, dropout=dropout)
        mlp_layers = [
            nn.Linear(hidden_dim, 128), nn.BatchNorm1d(128), nn.LeakyReLU(),
            nn.Linear(128, 64), nn.BatchNorm1d(64), nn.LeakyReLU(),
        ]
        self.net = nn.Sequential(*mlp_layers)
        self.fc = nn.Linear(64, output_dim)

    def forward(self, x):
        # Accept either a raw tensor or a DataBatch-like object carrying the tensor in `.x`
        scenes = x if isinstance(x, torch.Tensor) else x.x

        # (batch_size, num_agents, seq_len, input_dim) -> keep the ego agent (index 0) only
        ego_history = scenes.reshape(-1, 50, 50, 6)[:, 0]

        hidden_states, _ = self.lstm(ego_history)
        # hidden_states is (batch_size, seq_len, hidden_dim); only the last time step is decoded
        features = self.net(hidden_states[:, -1])
        return self.fc(features).view(-1, 60, 2)

In [10]:
class LSTMButTwo(nn.Module):
    """Two LSTMs applied back to back on the ego history, then a linear read-out.

    The first LSTM's full output sequence is the input sequence of the second one; the
    second LSTM's last time step is projected and reshaped to (batch, 60, 2).
    """
    def __init__(self, input_dim:int =6, hidden_dim:int =128, output_dim:int =60 * 2, dropout:float = 0):
        super().__init__()
        # Hidden width of the second LSTM (fixed, independent of hidden_dim)
        self.second_out_dim = 128

        self.lstm_1 = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim,
                              batch_first=True, dropout=dropout)
        self.lstm_2 = nn.LSTM(input_size=hidden_dim, hidden_size=self.second_out_dim,
                              batch_first=True, dropout=dropout)
        self.fc = nn.Linear(self.second_out_dim, output_dim)

    def forward(self, x):
        # Accept either a raw tensor or a DataBatch-like object carrying the tensor in `.x`
        scenes = x if isinstance(x, torch.Tensor) else x.x

        # (batch_size, num_agents, seq_len, input_dim) -> keep the ego agent (index 0) only
        ego_history = scenes.reshape(-1, 50, 50, 6)[:, 0]

        first_seq, _ = self.lstm_1(ego_history)
        second_seq, _ = self.lstm_2(first_seq)

        # second_seq is (batch_size, seq_len, second_out_dim); decode the last time step
        return self.fc(second_seq[:, -1]).view(-1, 60, 2)

# Preparing data

`TrajectoryDataset*` are taken from the milestone notebook.

In [11]:
class TrajectoryDatasetTrain(Dataset):
    """Training scenes split into a 50-step history (all agents) and a 60-step ego future.

    Each item is a ``Data`` object with the normalized history ``x``, the normalized ego
    future ``y``, and the ``origin`` / ``scale`` needed to undo the normalization.
    """
    def __init__(self, data, scale=10.0, augment=True):
        """
        data: Shape (N, 50, 110, 6) Training data
        scale: Scale for normalization (suggested to use 10.0 for Argoverse 2 data)
        augment: Whether to apply data augmentation (only for training)
        """
        self.data, self.scale, self.augment = data, scale, augment

    def __len__(self):
        return len(self.data)

    def _augment(self, past, target):
        """Randomly rotate and/or mirror a scene; each is applied with probability 0.5.

        ``past`` is modified in place; the (possibly new) ``target`` array is returned
        together with it.
        """
        if np.random.rand() < 0.5:
            theta = np.random.uniform(-np.pi, np.pi)
            cos_t, sin_t = np.cos(theta), np.sin(theta)
            R = np.array([[cos_t, -sin_t],
                          [sin_t,  cos_t]], dtype=np.float32)
            # Same rotation for features 0:2, features 2:4 and the future positions
            past[..., :2] = past[..., :2] @ R
            past[..., 2:4] = past[..., 2:4] @ R
            target = np.dot(target, R)
        if np.random.rand() < 0.5:
            # Mirror along x: negate feature 0, feature 2 and the future x coordinate
            past[..., 0] *= -1
            past[..., 2] *= -1
            target[:, 0] *= -1
        return past, target

    def __getitem__(self, idx):
        scene = self.data[idx]
        # First 50 timestamps of all agents are the input, the ego's remaining 60 are the target
        past = scene[:, :50, :].copy()                   # (agents=50, time_seq=50, 6)
        target = scene[0, 50:, :2].astype(np.float32)    # (60, 2)

        # Data augmentation (only for training)
        if self.augment:
            past, target = self._augment(past, target)

        # Use the ego's last historical position as the origin of the scene
        origin = past[0, 49, :2].copy()  # (2,)
        past[..., :2] = past[..., :2] - origin
        target = target - origin

        # Normalize the historical trajectory and future trajectory
        past[..., :4] = past[..., :4] / self.scale
        target = target / self.scale

        return Data(
            x=torch.tensor(past, dtype=torch.float32),
            y=torch.from_numpy(target).type(torch.float32),
            origin=torch.tensor(origin, dtype=torch.float32).unsqueeze(0),
            scale=torch.tensor(self.scale, dtype=torch.float32),
        )
    

class TrajectoryDatasetTest(Dataset):
    """Test scenes (history only), normalized the same way as the training history.

    Each item is a ``Data`` object with the normalized history ``x`` plus the ``origin``
    and ``scale`` needed to map predictions back to the original coordinates.
    """
    def __init__(self, data, scale=10.0):
        """
        data: Shape (N, 50, 50, 6) Testing data (historical trajectories only)
        scale: Scale for normalization (suggested to use 10.0 for Argoverse 2 data)
        """
        self.data, self.scale = data, scale

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Work on a copy so the underlying array is never modified; scene is (50, 50, 6)
        past = self.data[idx].copy()

        # The ego's last observed position becomes the origin of the scene
        ego_last_xy = past[0, 49, :2].copy()
        past[..., :2] = past[..., :2] - ego_last_xy
        past[..., :4] = past[..., :4] / self.scale

        return Data(
            x=torch.tensor(past, dtype=torch.float32),
            origin=torch.tensor(ego_last_xy, dtype=torch.float32).unsqueeze(0),
            scale=torch.tensor(self.scale, dtype=torch.float32),
        )

# Training loop

Change which model is used by editing which `return ...` line is uncommented in `get_model()`.

Change which optimizer is used at the `optimizer = optim...` line.

Do **NOT** change the `criterion`, as MSE is stated in the Data tab of the competition.

In [12]:
# Taken from milestone notebook
# Set device for training speedup: take the first available backend, in priority order
# (Apple Silicon GPU, then CUDA GPU, then CPU as the fallback that is always available)
for _backend, _is_available, _message in (
    ('mps', torch.backends.mps.is_available, "Using Apple Silicon GPU"),
    ('cuda', torch.cuda.is_available, "Using CUDA GPU"),
    ('cpu', lambda: True, "Using CPU"),
):
    if _is_available():
        device = torch.device(_backend)
        print(_message)
        break

Using CUDA GPU


In [13]:
# Functions to save and load the model (should correspond to what was trained!)
def save_model(model, path="our_model.pth"):
    """Write ``model``'s state_dict (its parameters and buffers, not the module) to ``path``."""
    weights = model.state_dict()
    torch.save(weights, path)
    print(f"Model saved to {path}")


def load_model(model_instance, path="our_model.pth"):
    """Load weights saved by ``save_model`` into ``model_instance`` and switch it to eval mode.

    Args:
        model_instance: A constructed model with the same architecture as the one saved.
        path: File written by ``save_model``.

    Returns:
        The same ``model_instance`` object, with the loaded weights, in eval mode.
    """
    # map_location="cpu" lets a checkpoint written on a GPU machine be read on any host;
    # load_state_dict then copies the values into the model's own parameters.
    # weights_only=True restricts unpickling to tensors and plain containers.
    state_dict = torch.load(path, map_location="cpu", weights_only=True)
    model_instance.load_state_dict(state_dict)
    model_instance.eval()
    return model_instance


# Example usage (pass load_model an instance of the same architecture that was saved):
# save_model(trained_model)
# model = load_model(get_model())

In [14]:
def get_timestamp() -> str:
    """Return the current local time as e.g. ``2025-05-01_03-45PM`` (12-hour clock)."""
    return f"{datetime.now():%Y-%m-%d_%I-%M%p}"

In [15]:
# Set up hyperparameters

# Calculate number of input features after flattening and number of output features
# Note: LSTM models take features in different dimensions
input_features:int = 50 * 50 * 6   # 50 agents, 50 time steps, 6 dimensions each (15000 input features)
output_features:int = 60 * 2       # 60 future time steps, 2 dimensions (x, y) (120 output features)

# Hyperparameters
batch_size: int = 32
num_folds: int = 4
early_stopping_patience: int = 25       # epochs without improvement before a fold stops
early_stopping_threshold: float = 1e-5  # minimum drop in validation loss that counts as improvement
epochs: int = 150
starting_lr: float = 5e-3
scale: float = 10.0                     # divisor used to normalize positions and velocities
weight_decay: float = 1e-2

lstm_hidden_dim: int = 128


# Seed every random number generator the notebook imports so runs are repeatable
SEED: int = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [16]:
# IMPORTANT! To change which model is used: comment/uncomment below
# Easily swap models by changing what is returned (called in training and test to avoid conflicts)
def get_model():
    """Build a fresh instance of the currently selected architecture on ``device``.

    Training and prediction both call this, so the two always use the same architecture.
    To switch models, replace the constructor below with one of the alternatives.
    """
    # Alternatives:
    #   BasicMLP(input_features, output_features)
    #   BaseLSTM(input_dim=6, hidden_dim=lstm_hidden_dim, output_dim=output_features)
    #   SceneContextModel(hidden_dim=864)
    #   LSTMWithMLP(input_dim=6, hidden_dim=lstm_hidden_dim, output_dim=output_features)
    selected = LSTMButTwo(input_dim=6, hidden_dim=lstm_hidden_dim, output_dim=output_features)
    return selected.to(device)

In [17]:
def train_model(full_training_data: np.ndarray,
                batch_size:int = 64, epochs:int = 10, num_folds:int = 5,
                early_stopping_patience:int = 5, early_stopping_threshold:float = 1e-3):
    """Train a fresh model on every cross-validation fold and checkpoint the best one.

    For each fold produced by ``TimeSeriesSplit`` a new model (from ``get_model()``), an
    AdamW optimizer and three chained LR schedulers are created. Whenever a fold reaches
    a validation loss lower than anything seen so far in any fold, the model's state_dict
    is written to "best_model.pt".

    Args:
        full_training_data: Scenes indexed as [scene, agent, timestep, feature]; the first
            50 timesteps are the input and the ego's remaining timesteps the target.
        batch_size: Batch size of both data loaders.
        epochs: Maximum number of epochs per fold.
        num_folds: Number of cross-validation splits.
        early_stopping_patience: Stop a fold after this many epochs without improvement.
        early_stopping_threshold: Minimum decrease of the validation loss that counts as
            an improvement.

    Returns:
        None. Reads the module-level ``starting_lr``, ``weight_decay``, ``scale``,
        ``input_features``, ``output_features`` and ``device``.
    """

    def _unpack_batch(batch):
        """Return (inputs, targets, graph_batch) on ``device``; graph_batch is None for tensor batches."""
        if isinstance(batch, (tuple, list)):
            # TensorDataset batch: plain tensors that still live on the CPU
            inputs, targets = batch
            return inputs.to(device), targets.to(device).view(-1, 60, 2), None
        # DataBatch type
        batch = batch.to(device)
        return batch.x, batch.y.view(batch.num_graphs, 60, 2), batch

    # Time series data needs to keep its data in relative order, so no shuffling can occur
    #   like in regular KFold cross validation
    splitter = TimeSeriesSplit(n_splits=num_folds, test_size=int(0.15 * len(full_training_data)))

    # Perform cross-validation, the best model will be saved as "best_model.pt" to be loaded in later
    overall_best_val_loss = float("inf")
    overall_best_seen_at = (0, 0) #(epoch, fold)

    # DO NOT CHANGE CRITERION
    criterion = nn.MSELoss()
    # Extra metrics reported on un-normalized coordinates during validation
    mae_metric = nn.L1Loss()
    mse_metric = nn.MSELoss()

    # Resources used:
    # Project milestone notebook
    # https://github.com/christianversloot/machine-learning-articles/blob/main/how-to-use-k-fold-cross-validation-with-pytorch.md
    # https://www.geeksforgeeks.org/time-series-cross-validation/
    for fold_i, (train_idx, val_idx) in enumerate(splitter.split(full_training_data)):
        print(f"\nFOLD {fold_i + 1}/{num_folds} ==================================")

        # Create the model and optimizer (reset per fold, to find the best model)
        model = get_model()
        optimizer = optim.AdamW(model.parameters(), lr=starting_lr, weight_decay=weight_decay)
        # All three schedulers are stepped once per epoch on the same optimizer
        schedulers: list[lr_scheduler.LRScheduler] =[
            lr_scheduler.ExponentialLR(optimizer, gamma=0.995),
            lr_scheduler.MultiStepLR(
                optimizer,
                milestones= list(range(25, epochs, 25)),
                gamma=0.80,
            ),
            lr_scheduler.CosineAnnealingLR(
                optimizer,
                T_max= int(epochs * 0.9),
                eta_min=1e-6
            ),
        ]

        # Prepare data from this fold
        train_fold: np.ndarray = full_training_data[train_idx]
        val_fold: np.ndarray = full_training_data[val_idx]
        collate_func = None     # Optional for DataLoader, taken from milestone notebook
        if not isinstance(model, SceneContextModel):
            # TrajectoryDataset expects numpy arrays and yields Data objects, which are
            # merged into one DataBatch per batch
            collate_func = lambda x: Batch.from_data_list(x)
            train_dataset = TrajectoryDatasetTrain(train_fold, scale=scale, augment=True)
            val_dataset = TrajectoryDatasetTrain(val_fold, scale=scale, augment=False)
        else:
            # SceneContextModel is fed raw (un-normalized) flattened scenes
            train_x: np.ndarray = train_fold[..., :50, :]
            train_y: np.ndarray = train_fold[:, 0, 50:, :2]
            X_train_tensor = torch.FloatTensor(train_x).reshape((-1, input_features))
            y_train_tensor = torch.FloatTensor(train_y).reshape((-1, output_features))
            train_dataset = TensorDataset(X_train_tensor, y_train_tensor)

            val_x: np.ndarray = val_fold[..., :50, :]
            val_y: np.ndarray = val_fold[:, 0, 50:, :2]
            X_val_tensor = torch.FloatTensor(val_x).reshape((-1, input_features))
            y_val_tensor = torch.FloatTensor(val_y).reshape((-1, output_features))
            val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

        train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_func)
        val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_func)

        best_val_loss: float = float("inf")
        no_improvement: int = 0

        # Training and validation loops are taken from the milestone notebook,
        #   with modifications to allow for different data loading shapes
        for epoch in tqdm(range(epochs), desc="Epoch", unit="epoch"):
            # Training loop
            model.train()
            train_loss = 0
            for batch in train_dataloader:
                batch_x, batch_y, _ = _unpack_batch(batch)

                optimizer.zero_grad()
                outputs = model(batch_x)
                loss = criterion(outputs, batch_y)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
                optimizer.step()
                train_loss += loss.item()

            # Validation loop
            model.eval()
            val_loss = 0
            val_mae = 0
            val_mse = 0
            with torch.no_grad():
                for batch in val_dataloader:
                    batch_x, batch_y, graph_batch = _unpack_batch(batch)

                    pred = model(batch_x)
                    val_loss += criterion(pred, batch_y).item()

                    # show MAE and MSE with unnormalized data
                    if graph_batch is None:
                        # Tensor batches were never normalized
                        y = batch_y
                    else:
                        pred = pred * graph_batch.scale.view(-1, 1, 1) + graph_batch.origin.unsqueeze(1)
                        y = batch_y * graph_batch.scale.view(-1, 1, 1) + graph_batch.origin.unsqueeze(1)
                    val_mae += mae_metric(pred, y).item()
                    val_mse += mse_metric(pred, y).item()

            # Average the per-batch sums
            train_loss /= len(train_dataloader)
            val_loss /= len(val_dataloader)
            val_mae /= len(val_dataloader)
            val_mse /= len(val_dataloader)

            if (epoch + 1) % 5 == 0:
                tqdm.write(f"Epoch {(epoch + 1):03d} | Learning rate {optimizer.param_groups[0]['lr']:.6f} | train normalized MSE {train_loss:8.4f} | val normalized MSE {val_loss:8.4f}, | val MAE {val_mae:8.4f} | val MSE {val_mse:8.4f}")

            if val_loss < best_val_loss - early_stopping_threshold:
                best_val_loss = val_loss
                no_improvement = 0

                # Better than the overall seen so far?
                if best_val_loss < overall_best_val_loss:
                    overall_best_val_loss = best_val_loss
                    overall_best_seen_at = (epoch + 1, fold_i + 1)
                    torch.save(model.state_dict(), "best_model.pt")
            else:
                no_improvement += 1
                if no_improvement >= early_stopping_patience:
                    print(f"==== EARLY STOP at epoch {(epoch + 1):03d}")
                    break

            for sched in schedulers:
                sched.step()

        # Clean up after the fold finishes to prevent slower folds later.
        # Drop the references first so the cached memory can actually be released.
        # https://discuss.pytorch.org/t/how-to-delete-a-tensor-in-gpu-to-free-up-memory/48879
        del model, optimizer, schedulers
        del train_dataloader, train_dataset, val_dataloader, val_dataset
        torch.cuda.empty_cache()

    print(f"BEST VALIDATION LOSS (NORMALIZED MSE) SEEN: {overall_best_val_loss}, AT (epoch, fold) = {overall_best_seen_at}")

In [18]:
# Load in the model saved during testing to use on X_test
# Mostly taken from milestone notebook
def predict(X_test: np.ndarray, best_model_path: str = "best_model.pt"):
    """Run the saved best model on the test scenes and build the submission frame.

    A fresh model is created with ``get_model()`` and the ``state_dict`` stored
    at ``best_model_path`` is loaded into it, so the current model configuration
    must match the one that produced the checkpoint.

    Args:
        X_test: Test scenes. Wrapped in ``TrajectoryDatasetTest`` for the default
            path, or flattened to rows of ``input_features`` values when the
            model is a ``SceneContextModel``.
        best_model_path: Path of the checkpoint written during training.

    Returns:
        A DataFrame with columns ``x`` and ``y`` and an index named ``index``.
        Predictions are flattened to two columns, i.e. 60 consecutive rows per
        scene when the model output is (N, 60, 2).
    """
    # map_location keeps loading robust when the checkpoint was written on a GPU
    # but inference runs on another device (e.g. a CPU-only kernel).
    state_dict = torch.load(best_model_path, map_location=device)
    model = get_model()
    model.load_state_dict(state_dict)
    model.eval()

    pred_batches = []
    with torch.no_grad():
        if not isinstance(model, SceneContextModel):
            # Graph-style samples: the DataLoader collates them into a Batch
            test_dataset = TrajectoryDatasetTest(X_test, scale=scale)
            test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False,
                                     collate_fn=Batch.from_data_list)

            for batch in test_loader:
                batch = batch.to(device)
                pred_norm = model(batch.x)

                # Undo the per-sample normalization (scale, then shift by origin)
                pred = pred_norm * batch.scale.view(-1, 1, 1) + batch.origin.unsqueeze(1)
                pred_batches.append(pred.cpu().numpy())
        else:
            # SceneContextModel consumes one flattened feature row per scene
            X_test_tensor = torch.FloatTensor(X_test).reshape((-1, input_features)).to(device)
            pred = model(X_test_tensor).cpu().reshape((-1, 60, 2))
            pred_batches.append(pred.numpy())

    # Flatten to the submission format: (2100, 60, 2) -> (126000, 2)
    predictions = np.concatenate(pred_batches, axis=0)  # (N, 60, 2)
    output_df = pd.DataFrame(predictions.reshape(-1, 2), columns=['x', 'y'])  # (N*60, 2)
    output_df.index.name = 'index'
    return output_df

In [19]:
# Launch training with the module-level hyperparameters (adjust batch_size,
# epochs, etc. where they are defined). The best-scoring weights are written to
# "best_model.pt", which the prediction step loads afterwards.
train_model(
    train_data,
    batch_size=batch_size,
    epochs=epochs,
    num_folds=num_folds,
    early_stopping_patience=early_stopping_patience,
    early_stopping_threshold=early_stopping_threshold,
)




Epoch:   3%|▎         | 5/150 [00:17<08:02,  3.33s/epoch]

Epoch 005 | Learning rate 0.004890 | train normalized MSE   0.1762 | val normalized MSE   0.1649, | val MAE   2.2116 | val MSE  16.4906


Epoch:   7%|▋         | 10/150 [00:33<07:31,  3.22s/epoch]

Epoch 010 | Learning rate 0.004727 | train normalized MSE   0.1328 | val normalized MSE   0.1310, | val MAE   1.9133 | val MSE  13.1036


Epoch:  10%|█         | 15/150 [00:49<07:18,  3.25s/epoch]

Epoch 015 | Learning rate 0.004539 | train normalized MSE   0.1162 | val normalized MSE   0.1107, | val MAE   1.8135 | val MSE  11.0732


Epoch:  13%|█▎        | 20/150 [01:05<07:00,  3.23s/epoch]

Epoch 020 | Learning rate 0.004327 | train normalized MSE   0.1094 | val normalized MSE   0.1011, | val MAE   1.6482 | val MSE  10.1064


Epoch:  17%|█▋        | 25/150 [01:21<06:48,  3.27s/epoch]

Epoch 025 | Learning rate 0.004097 | train normalized MSE   0.1081 | val normalized MSE   0.1050, | val MAE   1.6495 | val MSE  10.5006


Epoch:  20%|██        | 30/150 [01:38<06:27,  3.23s/epoch]

Epoch 030 | Learning rate 0.003080 | train normalized MSE   0.0951 | val normalized MSE   0.1040, | val MAE   1.6447 | val MSE  10.3960


Epoch:  23%|██▎       | 35/150 [01:54<06:08,  3.21s/epoch]

Epoch 035 | Learning rate 0.002872 | train normalized MSE   0.0934 | val normalized MSE   0.0976, | val MAE   1.6269 | val MSE   9.7568


Epoch:  27%|██▋       | 40/150 [02:10<05:56,  3.24s/epoch]

Epoch 040 | Learning rate 0.002658 | train normalized MSE   0.0935 | val normalized MSE   0.0959, | val MAE   1.5233 | val MSE   9.5941


Epoch:  30%|███       | 45/150 [02:26<05:40,  3.24s/epoch]

Epoch 045 | Learning rate 0.002439 | train normalized MSE   0.0981 | val normalized MSE   0.0934, | val MAE   1.5544 | val MSE   9.3428


Epoch:  33%|███▎      | 50/150 [02:42<05:21,  3.21s/epoch]

Epoch 050 | Learning rate 0.002218 | train normalized MSE   0.0882 | val normalized MSE   0.0875, | val MAE   1.4313 | val MSE   8.7494


Epoch:  37%|███▋      | 55/150 [02:58<05:05,  3.22s/epoch]

Epoch 055 | Learning rate 0.001598 | train normalized MSE   0.0828 | val normalized MSE   0.0864, | val MAE   1.4152 | val MSE   8.6400


Epoch:  40%|████      | 60/150 [03:15<04:54,  3.28s/epoch]

Epoch 060 | Learning rate 0.001425 | train normalized MSE   0.0798 | val normalized MSE   0.0889, | val MAE   1.5010 | val MSE   8.8915


Epoch:  43%|████▎     | 65/150 [03:31<04:38,  3.27s/epoch]

Epoch 065 | Learning rate 0.001256 | train normalized MSE   0.0787 | val normalized MSE   0.0844, | val MAE   1.4040 | val MSE   8.4366


Epoch:  47%|████▋     | 70/150 [03:47<04:20,  3.26s/epoch]

Epoch 070 | Learning rate 0.001093 | train normalized MSE   0.0773 | val normalized MSE   0.0840, | val MAE   1.4046 | val MSE   8.4016


Epoch:  50%|█████     | 75/150 [04:03<04:03,  3.24s/epoch]

Epoch 075 | Learning rate 0.000938 | train normalized MSE   0.0758 | val normalized MSE   0.0825, | val MAE   1.3441 | val MSE   8.2539


Epoch:  53%|█████▎    | 80/150 [04:20<03:48,  3.26s/epoch]

Epoch 080 | Learning rate 0.000634 | train normalized MSE   0.0735 | val normalized MSE   0.0814, | val MAE   1.3631 | val MSE   8.1416


Epoch:  57%|█████▋    | 85/150 [04:36<03:31,  3.25s/epoch]

Epoch 085 | Learning rate 0.000526 | train normalized MSE   0.0724 | val normalized MSE   0.0814, | val MAE   1.3564 | val MSE   8.1378


Epoch:  60%|██████    | 90/150 [04:53<03:19,  3.32s/epoch]

Epoch 090 | Learning rate 0.000427 | train normalized MSE   0.0711 | val normalized MSE   0.0819, | val MAE   1.3627 | val MSE   8.1878


Epoch:  63%|██████▎   | 95/150 [05:09<03:02,  3.31s/epoch]

Epoch 095 | Learning rate 0.000338 | train normalized MSE   0.0708 | val normalized MSE   0.0806, | val MAE   1.3157 | val MSE   8.0620


Epoch:  67%|██████▋   | 100/150 [05:26<02:47,  3.34s/epoch]

Epoch 100 | Learning rate 0.000259 | train normalized MSE   0.0690 | val normalized MSE   0.0805, | val MAE   1.3013 | val MSE   8.0530


Epoch:  70%|███████   | 105/150 [05:43<02:29,  3.32s/epoch]

Epoch 105 | Learning rate 0.000152 | train normalized MSE   0.0681 | val normalized MSE   0.0806, | val MAE   1.2998 | val MSE   8.0630


Epoch:  73%|███████▎  | 110/150 [05:59<02:13,  3.33s/epoch]

Epoch 110 | Learning rate 0.000106 | train normalized MSE   0.0664 | val normalized MSE   0.0797, | val MAE   1.2998 | val MSE   7.9680


Epoch:  77%|███████▋  | 115/150 [06:16<01:56,  3.32s/epoch]

Epoch 115 | Learning rate 0.000069 | train normalized MSE   0.0667 | val normalized MSE   0.0795, | val MAE   1.2899 | val MSE   7.9485


Epoch:  80%|████████  | 120/150 [06:32<01:38,  3.29s/epoch]

Epoch 120 | Learning rate 0.000040 | train normalized MSE   0.0668 | val normalized MSE   0.0798, | val MAE   1.2879 | val MSE   7.9757


Epoch:  83%|████████▎ | 125/150 [06:49<01:22,  3.28s/epoch]

Epoch 125 | Learning rate 0.000019 | train normalized MSE   0.0664 | val normalized MSE   0.0796, | val MAE   1.2900 | val MSE   7.9617


Epoch:  84%|████████▍ | 126/150 [06:55<01:19,  3.30s/epoch]

==== EARLY STOP at epoch 127




Epoch:   3%|▎         | 5/150 [00:21<10:20,  4.28s/epoch]

Epoch 005 | Learning rate 0.004890 | train normalized MSE   0.1721 | val normalized MSE   0.1617, | val MAE   2.1870 | val MSE  16.1707


Epoch:   7%|▋         | 10/150 [00:42<09:58,  4.27s/epoch]

Epoch 010 | Learning rate 0.004727 | train normalized MSE   0.1250 | val normalized MSE   0.1327, | val MAE   1.9030 | val MSE  13.2657


Epoch:  10%|█         | 15/150 [01:03<09:28,  4.21s/epoch]

Epoch 015 | Learning rate 0.004539 | train normalized MSE   0.1131 | val normalized MSE   0.1354, | val MAE   1.8576 | val MSE  13.5387


Epoch:  13%|█▎        | 20/150 [01:25<09:11,  4.24s/epoch]

Epoch 020 | Learning rate 0.004327 | train normalized MSE   0.1079 | val normalized MSE   0.1310, | val MAE   1.9627 | val MSE  13.0963


Epoch:  17%|█▋        | 25/150 [01:46<08:55,  4.28s/epoch]

Epoch 025 | Learning rate 0.004097 | train normalized MSE   0.1317 | val normalized MSE   0.1387, | val MAE   1.9314 | val MSE  13.8661


Epoch:  20%|██        | 30/150 [02:07<08:30,  4.25s/epoch]

Epoch 030 | Learning rate 0.003080 | train normalized MSE   0.1060 | val normalized MSE   0.1222, | val MAE   1.7565 | val MSE  12.2187


Epoch:  23%|██▎       | 35/150 [02:29<08:12,  4.28s/epoch]

Epoch 035 | Learning rate 0.002872 | train normalized MSE   0.0978 | val normalized MSE   0.1154, | val MAE   1.6757 | val MSE  11.5351


Epoch:  27%|██▋       | 40/150 [02:50<07:54,  4.31s/epoch]

Epoch 040 | Learning rate 0.002658 | train normalized MSE   0.0934 | val normalized MSE   0.1171, | val MAE   1.6927 | val MSE  11.7089


Epoch:  30%|███       | 45/150 [03:12<07:28,  4.27s/epoch]

Epoch 045 | Learning rate 0.002439 | train normalized MSE   0.0978 | val normalized MSE   0.1125, | val MAE   1.7313 | val MSE  11.2478


Epoch:  33%|███▎      | 50/150 [03:33<07:07,  4.28s/epoch]

Epoch 050 | Learning rate 0.002218 | train normalized MSE   0.0900 | val normalized MSE   0.1100, | val MAE   1.5847 | val MSE  10.9993


Epoch:  37%|███▋      | 55/150 [03:54<06:44,  4.26s/epoch]

Epoch 055 | Learning rate 0.001598 | train normalized MSE   0.0850 | val normalized MSE   0.1054, | val MAE   1.6017 | val MSE  10.5395


Epoch:  40%|████      | 60/150 [04:16<06:23,  4.26s/epoch]

Epoch 060 | Learning rate 0.001425 | train normalized MSE   0.0829 | val normalized MSE   0.1030, | val MAE   1.6064 | val MSE  10.2966


Epoch:  43%|████▎     | 65/150 [04:37<06:00,  4.25s/epoch]

Epoch 065 | Learning rate 0.001256 | train normalized MSE   0.0799 | val normalized MSE   0.1024, | val MAE   1.5573 | val MSE  10.2398


Epoch:  47%|████▋     | 70/150 [04:58<05:38,  4.23s/epoch]

Epoch 070 | Learning rate 0.001093 | train normalized MSE   0.0779 | val normalized MSE   0.0951, | val MAE   1.4493 | val MSE   9.5064


Epoch:  50%|█████     | 75/150 [05:19<05:19,  4.26s/epoch]

Epoch 075 | Learning rate 0.000938 | train normalized MSE   0.0775 | val normalized MSE   0.0970, | val MAE   1.4541 | val MSE   9.6995


Epoch:  53%|█████▎    | 80/150 [05:40<04:54,  4.20s/epoch]

Epoch 080 | Learning rate 0.000634 | train normalized MSE   0.0739 | val normalized MSE   0.0941, | val MAE   1.4165 | val MSE   9.4136


Epoch:  57%|█████▋    | 85/150 [06:01<04:32,  4.19s/epoch]

Epoch 085 | Learning rate 0.000526 | train normalized MSE   0.0726 | val normalized MSE   0.0951, | val MAE   1.4362 | val MSE   9.5057


Epoch:  60%|██████    | 90/150 [06:22<04:13,  4.22s/epoch]

Epoch 090 | Learning rate 0.000427 | train normalized MSE   0.0723 | val normalized MSE   0.0943, | val MAE   1.4326 | val MSE   9.4309


Epoch:  63%|██████▎   | 95/150 [06:43<03:50,  4.20s/epoch]

Epoch 095 | Learning rate 0.000338 | train normalized MSE   0.0705 | val normalized MSE   0.0954, | val MAE   1.4108 | val MSE   9.5413


Epoch:  67%|██████▋   | 100/150 [07:04<03:30,  4.21s/epoch]

Epoch 100 | Learning rate 0.000259 | train normalized MSE   0.0695 | val normalized MSE   0.0925, | val MAE   1.4048 | val MSE   9.2484


Epoch:  70%|███████   | 105/150 [07:26<03:12,  4.29s/epoch]

Epoch 105 | Learning rate 0.000152 | train normalized MSE   0.0693 | val normalized MSE   0.0928, | val MAE   1.4001 | val MSE   9.2828


Epoch:  73%|███████▎  | 110/150 [07:47<02:51,  4.28s/epoch]

Epoch 110 | Learning rate 0.000106 | train normalized MSE   0.0659 | val normalized MSE   0.0923, | val MAE   1.3818 | val MSE   9.2321


Epoch:  77%|███████▋  | 115/150 [08:09<02:29,  4.26s/epoch]

Epoch 115 | Learning rate 0.000069 | train normalized MSE   0.0674 | val normalized MSE   0.0927, | val MAE   1.3919 | val MSE   9.2730


Epoch:  80%|████████  | 120/150 [08:30<02:09,  4.31s/epoch]

Epoch 120 | Learning rate 0.000040 | train normalized MSE   0.0676 | val normalized MSE   0.0925, | val MAE   1.3804 | val MSE   9.2525


Epoch:  83%|████████▎ | 125/150 [08:51<01:46,  4.24s/epoch]

Epoch 125 | Learning rate 0.000019 | train normalized MSE   0.0672 | val normalized MSE   0.0925, | val MAE   1.3781 | val MSE   9.2546


Epoch:  87%|████████▋ | 130/150 [09:12<01:23,  4.17s/epoch]

Epoch 130 | Learning rate 0.000005 | train normalized MSE   0.0662 | val normalized MSE   0.0926, | val MAE   1.3783 | val MSE   9.2566


Epoch:  87%|████████▋ | 131/150 [09:20<01:21,  4.28s/epoch]

==== EARLY STOP at epoch 132




Epoch:   3%|▎         | 5/150 [00:26<12:37,  5.22s/epoch]

Epoch 005 | Learning rate 0.004890 | train normalized MSE   0.1653 | val normalized MSE   0.1537, | val MAE   2.2068 | val MSE  15.3742


Epoch:   7%|▋         | 10/150 [00:52<12:06,  5.19s/epoch]

Epoch 010 | Learning rate 0.004727 | train normalized MSE   0.1282 | val normalized MSE   0.1201, | val MAE   1.7898 | val MSE  12.0131


Epoch:  10%|█         | 15/150 [01:18<11:42,  5.20s/epoch]

Epoch 015 | Learning rate 0.004539 | train normalized MSE   0.1065 | val normalized MSE   0.1232, | val MAE   1.9477 | val MSE  12.3250


Epoch:  13%|█▎        | 20/150 [01:44<11:15,  5.20s/epoch]

Epoch 020 | Learning rate 0.004327 | train normalized MSE   0.1031 | val normalized MSE   0.1015, | val MAE   1.5842 | val MSE  10.1475


Epoch:  17%|█▋        | 25/150 [02:09<10:46,  5.17s/epoch]

Epoch 025 | Learning rate 0.004097 | train normalized MSE   0.0992 | val normalized MSE   0.1019, | val MAE   1.6254 | val MSE  10.1861


Epoch:  20%|██        | 30/150 [02:35<10:22,  5.18s/epoch]

Epoch 030 | Learning rate 0.003080 | train normalized MSE   0.0942 | val normalized MSE   0.0970, | val MAE   1.6294 | val MSE   9.7030


Epoch:  23%|██▎       | 35/150 [03:01<09:58,  5.21s/epoch]

Epoch 035 | Learning rate 0.002872 | train normalized MSE   0.0916 | val normalized MSE   0.0932, | val MAE   1.4365 | val MSE   9.3227


Epoch:  27%|██▋       | 40/150 [03:28<09:35,  5.24s/epoch]

Epoch 040 | Learning rate 0.002658 | train normalized MSE   0.0896 | val normalized MSE   0.0924, | val MAE   1.4481 | val MSE   9.2434


Epoch:  30%|███       | 45/150 [03:53<09:02,  5.16s/epoch]

Epoch 045 | Learning rate 0.002439 | train normalized MSE   0.0876 | val normalized MSE   0.0916, | val MAE   1.5504 | val MSE   9.1563


Epoch:  33%|███▎      | 50/150 [04:19<08:38,  5.18s/epoch]

Epoch 050 | Learning rate 0.002218 | train normalized MSE   0.0869 | val normalized MSE   0.0890, | val MAE   1.4689 | val MSE   8.9020


Epoch:  37%|███▋      | 55/150 [04:45<08:05,  5.11s/epoch]

Epoch 055 | Learning rate 0.001598 | train normalized MSE   0.0826 | val normalized MSE   0.0870, | val MAE   1.4037 | val MSE   8.7037


Epoch:  40%|████      | 60/150 [05:11<07:46,  5.18s/epoch]

Epoch 060 | Learning rate 0.001425 | train normalized MSE   0.0799 | val normalized MSE   0.0893, | val MAE   1.4727 | val MSE   8.9253


Epoch:  43%|████▎     | 65/150 [05:37<07:21,  5.20s/epoch]

Epoch 065 | Learning rate 0.001256 | train normalized MSE   0.0799 | val normalized MSE   0.0870, | val MAE   1.4516 | val MSE   8.6966


Epoch:  47%|████▋     | 70/150 [06:03<06:57,  5.22s/epoch]

Epoch 070 | Learning rate 0.001093 | train normalized MSE   0.0792 | val normalized MSE   0.0845, | val MAE   1.3774 | val MSE   8.4486


Epoch:  50%|█████     | 75/150 [06:29<06:28,  5.18s/epoch]

Epoch 075 | Learning rate 0.000938 | train normalized MSE   0.0756 | val normalized MSE   0.0841, | val MAE   1.3821 | val MSE   8.4051


Epoch:  53%|█████▎    | 80/150 [06:54<05:59,  5.14s/epoch]

Epoch 080 | Learning rate 0.000634 | train normalized MSE   0.0741 | val normalized MSE   0.0819, | val MAE   1.3346 | val MSE   8.1856


Epoch:  57%|█████▋    | 85/150 [07:20<05:35,  5.17s/epoch]

Epoch 085 | Learning rate 0.000526 | train normalized MSE   0.0730 | val normalized MSE   0.0812, | val MAE   1.3284 | val MSE   8.1184


Epoch:  60%|██████    | 90/150 [07:46<05:11,  5.19s/epoch]

Epoch 090 | Learning rate 0.000427 | train normalized MSE   0.0721 | val normalized MSE   0.0830, | val MAE   1.3493 | val MSE   8.2989


Epoch:  63%|██████▎   | 95/150 [08:12<04:46,  5.21s/epoch]

Epoch 095 | Learning rate 0.000338 | train normalized MSE   0.0707 | val normalized MSE   0.0840, | val MAE   1.3509 | val MSE   8.4024


Epoch:  67%|██████▋   | 100/150 [08:38<04:19,  5.19s/epoch]

Epoch 100 | Learning rate 0.000259 | train normalized MSE   0.0705 | val normalized MSE   0.0830, | val MAE   1.3449 | val MSE   8.3018


Epoch:  70%|███████   | 105/150 [09:04<03:54,  5.21s/epoch]

Epoch 105 | Learning rate 0.000152 | train normalized MSE   0.0698 | val normalized MSE   0.0823, | val MAE   1.3326 | val MSE   8.2272


Epoch:  73%|███████▎  | 109/150 [09:30<03:34,  5.23s/epoch]


Epoch 110 | Learning rate 0.000106 | train normalized MSE   0.0695 | val normalized MSE   0.0819, | val MAE   1.3276 | val MSE   8.1908
==== EARLY STOP at epoch 110



Epoch:   3%|▎         | 5/150 [00:30<14:47,  6.12s/epoch]

Epoch 005 | Learning rate 0.004890 | train normalized MSE   0.1568 | val normalized MSE   0.1253, | val MAE   1.8414 | val MSE  12.5292


Epoch:   7%|▋         | 10/150 [01:01<14:17,  6.12s/epoch]

Epoch 010 | Learning rate 0.004727 | train normalized MSE   0.1304 | val normalized MSE   0.1178, | val MAE   1.8108 | val MSE  11.7753


Epoch:  10%|█         | 15/150 [01:32<13:56,  6.19s/epoch]

Epoch 015 | Learning rate 0.004539 | train normalized MSE   0.1069 | val normalized MSE   0.1034, | val MAE   1.6841 | val MSE  10.3351


Epoch:  13%|█▎        | 20/150 [02:03<13:21,  6.16s/epoch]

Epoch 020 | Learning rate 0.004327 | train normalized MSE   0.1007 | val normalized MSE   0.0940, | val MAE   1.5678 | val MSE   9.4024


Epoch:  17%|█▋        | 25/150 [02:33<12:44,  6.11s/epoch]

Epoch 025 | Learning rate 0.004097 | train normalized MSE   0.0983 | val normalized MSE   0.1020, | val MAE   1.6437 | val MSE  10.2041


Epoch:  20%|██        | 30/150 [03:04<12:13,  6.11s/epoch]

Epoch 030 | Learning rate 0.003080 | train normalized MSE   0.0917 | val normalized MSE   0.0906, | val MAE   1.5491 | val MSE   9.0630


Epoch:  23%|██▎       | 35/150 [03:35<11:45,  6.14s/epoch]

Epoch 035 | Learning rate 0.002872 | train normalized MSE   0.0903 | val normalized MSE   0.0939, | val MAE   1.6174 | val MSE   9.3933


Epoch:  27%|██▋       | 40/150 [04:05<11:13,  6.12s/epoch]

Epoch 040 | Learning rate 0.002658 | train normalized MSE   0.0873 | val normalized MSE   0.0867, | val MAE   1.4234 | val MSE   8.6672


Epoch:  30%|███       | 45/150 [04:36<10:50,  6.19s/epoch]

Epoch 045 | Learning rate 0.002439 | train normalized MSE   0.0832 | val normalized MSE   0.0880, | val MAE   1.5046 | val MSE   8.7999


Epoch:  33%|███▎      | 50/150 [05:07<10:14,  6.15s/epoch]

Epoch 050 | Learning rate 0.002218 | train normalized MSE   0.0831 | val normalized MSE   0.0873, | val MAE   1.4658 | val MSE   8.7334


Epoch:  37%|███▋      | 55/150 [05:38<09:45,  6.17s/epoch]

Epoch 055 | Learning rate 0.001598 | train normalized MSE   0.0791 | val normalized MSE   0.0819, | val MAE   1.3776 | val MSE   8.1887


Epoch:  40%|████      | 60/150 [06:09<09:14,  6.16s/epoch]

Epoch 060 | Learning rate 0.001425 | train normalized MSE   0.0789 | val normalized MSE   0.0815, | val MAE   1.3570 | val MSE   8.1545


Epoch:  43%|████▎     | 65/150 [06:40<08:50,  6.24s/epoch]

Epoch 065 | Learning rate 0.001256 | train normalized MSE   0.0780 | val normalized MSE   0.0842, | val MAE   1.4278 | val MSE   8.4168


Epoch:  47%|████▋     | 70/150 [07:11<08:20,  6.25s/epoch]

Epoch 070 | Learning rate 0.001093 | train normalized MSE   0.0761 | val normalized MSE   0.0817, | val MAE   1.3358 | val MSE   8.1690


Epoch:  50%|█████     | 75/150 [07:42<07:46,  6.22s/epoch]

Epoch 075 | Learning rate 0.000938 | train normalized MSE   0.0744 | val normalized MSE   0.0807, | val MAE   1.3238 | val MSE   8.0699


Epoch:  53%|█████▎    | 80/150 [08:13<07:14,  6.21s/epoch]

Epoch 080 | Learning rate 0.000634 | train normalized MSE   0.0731 | val normalized MSE   0.0783, | val MAE   1.3117 | val MSE   7.8253


Epoch:  57%|█████▋    | 85/150 [08:45<06:45,  6.23s/epoch]

Epoch 085 | Learning rate 0.000526 | train normalized MSE   0.0719 | val normalized MSE   0.0778, | val MAE   1.3068 | val MSE   7.7824


Epoch:  60%|██████    | 90/150 [09:16<06:11,  6.20s/epoch]

Epoch 090 | Learning rate 0.000427 | train normalized MSE   0.0705 | val normalized MSE   0.0778, | val MAE   1.3213 | val MSE   7.7802


Epoch:  63%|██████▎   | 95/150 [09:47<05:45,  6.28s/epoch]

Epoch 095 | Learning rate 0.000338 | train normalized MSE   0.0704 | val normalized MSE   0.0780, | val MAE   1.2929 | val MSE   7.8049


Epoch:  67%|██████▋   | 100/150 [10:18<05:12,  6.26s/epoch]

Epoch 100 | Learning rate 0.000259 | train normalized MSE   0.0700 | val normalized MSE   0.0770, | val MAE   1.2869 | val MSE   7.7045


Epoch:  70%|███████   | 105/150 [10:50<04:41,  6.25s/epoch]

Epoch 105 | Learning rate 0.000152 | train normalized MSE   0.0681 | val normalized MSE   0.0777, | val MAE   1.2794 | val MSE   7.7721


Epoch:  73%|███████▎  | 110/150 [11:21<04:08,  6.20s/epoch]

Epoch 110 | Learning rate 0.000106 | train normalized MSE   0.0672 | val normalized MSE   0.0785, | val MAE   1.2787 | val MSE   7.8484


Epoch:  75%|███████▍  | 112/150 [11:39<03:57,  6.25s/epoch]

==== EARLY STOP at epoch 113
BEST VALIDATION LOSS (NORMALIZED MSE) SEEN: 0.07671434622495732, AT (epoch, fold) = (88, 4)





In [20]:
# Run the saved best model over the test scenes
model_predictions_df = predict(X_test=test_data)
assert len(model_predictions_df) == 126000, (
    f"Incorrect number of rows in output, expected 126000, got {len(model_predictions_df)}"
)

# Write the predictions into the submission folder under a timestamped filename
submission_path = os.path.join(submission_dir, f"submission-{get_timestamp()}.csv")
model_predictions_df.to_csv(path_or_buf=submission_path)
print(f"Submission saved locally as: '{submission_path}'.")

Submission saved locally as: './submission/submission-2025-05-22_10-35PM.csv'.


# Visualize predictions

These functions are taken from the milestone notebook, with minor additions

In [21]:
def plot_trajectory(ax, pred, gt, title=None):
    """Draw the predicted and ground-truth future paths of one sample on ``ax``.

    Only the first entry along the leading axis of ``pred`` and ``gt`` is drawn
    (up to its first 60 points), while the axis limits are taken over the whole
    of both arrays. The last axis holds (x, y).

    Args:
        ax: Axes-like object to draw on; it is cleared first.
        pred: Predicted positions, indexed as ``[sample, step, coord]``.
        gt: Ground-truth positions, indexed the same way.
        title: Optional panel title; skipped when falsy.
    """
    ax.cla()

    # Predicted curve first, then ground truth, each with its own legend entry
    curves = (
        (pred, 'palevioletred', 'Predicted Future Trajectory'),
        (gt, 'navy', 'Ground Truth Future Trajectory'),
    )
    for trajectory, colour, legend_label in curves:
        first_sample = trajectory[0, :60]
        ax.plot(first_sample[:, 0], first_sample[:, 1], color=colour, label=legend_label)

    def _bounds(coord):
        # Smallest and largest value of one coordinate across both arrays
        lowest = min(pred[..., coord].min(), gt[..., coord].min())
        highest = max(pred[..., coord].max(), gt[..., coord].max())
        return lowest, highest

    ax.set_xlim(*_bounds(0))
    ax.set_ylim(*_bounds(1))
    ax.set_xlabel('X-axis')
    ax.set_ylabel('Y-axis')

    if title:
        ax.set_title(title)

    ax.legend()
    ax.grid(True, linestyle='--', alpha=0.7)

In [22]:
def visualize_predictions(model, val_dataset, graph_save_path: "str | None" = None):
    """Plot predicted vs. ground-truth futures for four random dataset samples.

    The weights stored in "best_model.pt" are loaded into ``model`` (in place).
    Four indices are drawn at random from ``val_dataset`` and each sample is
    drawn on its own panel of a 2x2 figure, which is saved and then shown.

    Args:
        model: Network whose architecture matches the saved checkpoint.
        val_dataset: Indexable dataset with at least four items. An item is
            either an ``(x, y)`` tuple/list of tensors, or an object exposing
            ``x``, ``y``, ``scale`` and ``origin``; in the latter case the
            prediction and target are de-normalized before plotting.
        graph_save_path: Destination of the saved figure. When omitted,
            ``"<timestamp>_graph.png"`` is used with the timestamp taken at
            call time.
    """
    if graph_save_path is None:
        # Resolve the default here instead of in the signature: a default
        # expression is evaluated once, when the function is defined, which
        # would freeze the timestamp and make repeated calls overwrite one file.
        graph_save_path = f"{get_timestamp()}_graph.png"

    # map_location lets a GPU-written checkpoint load on whatever device is in use
    model.load_state_dict(torch.load("best_model.pt", map_location=device))
    model.eval()

    # randomly select 4 samples from the dataset
    random_indices = random.sample(range(len(val_dataset)), 4)
    fig, axes = plt.subplots(2, 2, figsize=(10, 10))
    axes = axes.flatten()  # Flatten so each panel can be addressed by a single index

    # Inference only, so skip autograd bookkeeping
    with torch.no_grad():
        for i, idx in enumerate(random_indices):
            sample = val_dataset[idx]
            if isinstance(sample, (tuple, list)):
                # Plain (x, y) tensors: flatten the input, shape the target as (-1, 60, 2)
                batch_x, batch_y = sample
                X_val_tensor = torch.FloatTensor(batch_x).reshape((-1, input_features)).to(device)
                y_val_tensor = torch.FloatTensor(batch_y.view(-1, 60, 2))

                pred = model(X_val_tensor).cpu().reshape((-1, 60, 2))
                # stack() adds a leading dimension that is squeezed away again
                gt = torch.stack(torch.split(y_val_tensor, 60, dim=0), dim=0).squeeze(dim=0)
            else:
                # Graph-style sample carrying its own normalization constants
                sample = sample.to(device)
                scale_factor = sample.scale.view(-1, 1, 1)
                origin = sample.origin.unsqueeze(1)

                pred = model(sample.x) * scale_factor + origin
                gt = torch.stack(torch.split(sample.y, 60, dim=0), dim=0) * scale_factor + origin

            pred = pred.detach().cpu().numpy()
            gt = gt.detach().cpu().numpy()

            # Plot the trajectory using the i-th axis
            plot_trajectory(axes[i], pred, gt, title=f"Sample {idx}")

    fig.savefig(graph_save_path)
    plt.show()

In [None]:
model = get_model()
if isinstance(model, SceneContextModel):
    # SceneContextModel takes flat tensors: the first 50 timesteps are the input
    # and the remaining (x, y) positions of agent 0 are the target.
    train_x: np.ndarray = train_data[..., :50, :]
    train_y: np.ndarray = train_data[:, 0, 50:, :2]
    X_train_tensor = torch.FloatTensor(train_x).reshape(-1, input_features)
    y_train_tensor = torch.FloatTensor(train_y).reshape(-1, output_features)
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
else:
    # Every other model reads the time series through the trajectory dataset,
    # which is built straight from the numpy array.
    collate_func = lambda data_list: Batch.from_data_list(data_list)
    train_dataset = TrajectoryDatasetTrain(train_data, scale=scale, augment=True)

visualize_predictions(model=model, val_dataset=train_dataset)