In [1]:
import os
import sys
from tqdm import tqdm
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from sklearn.model_selection import TimeSeriesSplit

import torch
import torch.nn as nn
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch_geometric.data import Data, Batch

In [2]:
# Create submission folder if it doesn't exist
submission_dir = './submission'
os.makedirs(submission_dir, exist_ok=True)

# Uncomment the following block ONLY if you wish to inspect file paths in a Kaggle-like directory structure.
# On your local system, you likely have the files in your local folder so this is not needed.
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))


# Data Loading for Local Environment
# Files are assumed to be in:
# ./cse-251-b-2025/train.npz
# ./cse-251-b-2025/test_input.npz

# np.load on a .npz archive returns an NpzFile that keeps the underlying file open.
# Using it as a context manager closes the handle as soon as the array has been read
# into memory; `train_data` / `test_data` remain usable afterwards.
with np.load("./cse-251-b-2025/train.npz") as train_file:
    train_data = train_file['data']
print("train_data's shape:", train_data.shape)  # Expected shape: (10000, 50, 110, 6)

with np.load("./cse-251-b-2025/test_input.npz") as test_file:
    test_data = test_file['data']
print("test_data's shape:", test_data.shape)    # Expected shape: (2100, 50, 50, 6)

train_data's shape: (10000, 50, 110, 6)
test_data's shape: (2100, 50, 50, 6)


# Visualization: 

In [3]:
# Run visualizations?
run_visualizations: bool = False

# From data loading notebook
def plot_one_training_scene(idx: int = 0):
    """Draw a static plot of every agent's (x, y) trajectory in one training scene.

    Args:
        idx: Index of the scene inside the module-level `train_data` array.
    """
    data_matrix = train_data[idx]  # indexed below as [agent, timestep, feature]

    plt.figure(figsize=(8, 8))
    # Iterate over the agent axis (axis 0). Using `idx` as the axis number here would
    # only be correct for idx == 0 and raises or over-runs for any other scene index.
    for agent in range(data_matrix.shape[0]):
        xs = data_matrix[agent, :, 0]
        ys = data_matrix[agent, :, 1]
        # Remove zeros (padding) with ONE shared mask so xs and ys always keep the same
        # length; this is the same "both coordinates non-zero" rule make_gif uses.
        valid = (xs != 0) & (ys != 0)
        plt.plot(xs[valid], ys[valid], alpha=0.7)
    plt.title("Trajectories from one training scene")
    plt.xlabel("x-coordinate")
    plt.ylabel("y-coordinate")
    plt.show()

# Create an animated gif for one training scene (exact code provided on kaggle)
def make_gif(data_matrix, name='example'):
    """Render one scene as an animated GIF saved to `trajectory_visualization_{name}.gif`.

    Args:
        data_matrix: Array indexed as [agent, timestep, feature]; features 0 and 1 are
            used as the x and y coordinates. Agent 0 is drawn as the 'Ego Vehicle'.
        name: Suffix inserted into the output file name.

    The file is written to the current working directory with the 'pillow' writer.
    """
    # The colormap accessor is chosen from the Python minor version alone.
    # NOTE(review): the available accessor presumably depends on the matplotlib version
    # rather than the Python version — confirm before relying on this check.
    cmap = None
    if sys.version_info.minor <= 7:
        cmap = plt.cm.get_cmap("viridis", 50)
    else:
        cmap = plt.get_cmap("viridis", 50)

    fig, ax = plt.subplots(figsize=(10, 10))
    # Function to update plot for each frame
    def update(frame):
        # Every frame is redrawn from scratch on the shared axes
        ax.clear()
        # Get data for current timestep (agents 1..N-1; the ego agent 0 is drawn separately below)
        for i in range(1, data_matrix.shape[0]):
            x = data_matrix[i, frame, 0]
            y = data_matrix[i, frame, 1]
            # Skip agents whose current position is zero in either coordinate (treated as padding)
            if x != 0 and y != 0:
                xs = data_matrix[i, :frame+1, 0]  # Include current frame
                ys = data_matrix[i, :frame+1, 1]  # Include current frame
                # trim all zeros
                mask = (xs != 0) & (ys != 0)  # Only keep points where both x and y are non-zero
                xs = xs[mask]
                ys = ys[mask]
                # Only plot if we have points to plot
                if len(xs) > 0 and len(ys) > 0:
                    # Colour is looked up by agent index in the 50-entry colormap
                    color = cmap(i)
                    ax.plot(xs, ys, alpha=0.9, color=color)
                    ax.scatter(x, y, s=80, color=color)
        # NOTE(review): the ego trail is sliced with `:frame` (current frame excluded) while the
        # other agents use `:frame+1`; the ego marker below is still drawn at `frame`.
        ax.plot(data_matrix[0, :frame, 0], data_matrix[0, :frame, 1],
                color='tab:orange', label='Ego Vehicle')
        ax.scatter(data_matrix[0, frame, 0], data_matrix[0, frame, 1],
                   s=80, color='tab:orange')
        # Set title with timestep
        ax.set_title(f'Timestep {frame}')
        # Set consistent axis limits: min/max over all non-zero coordinates, padded by 10
        ax.set_xlim(data_matrix[:,:,0][data_matrix[:,:,0] != 0].min() - 10, 
                    data_matrix[:,:,0][data_matrix[:,:,0] != 0].max() + 10)
        ax.set_ylim(data_matrix[:,:,1][data_matrix[:,:,1] != 0].min() - 10, 
                    data_matrix[:,:,1][data_matrix[:,:,1] != 0].max() + 10)
        ax.legend()
        # Hand the drawn artists back to FuncAnimation (used because blit=True)
        return ax.collections + ax.lines

    # Create animation: one frame for every third timestep along axis 1
    anim = animation.FuncAnimation(fig, update, frames=list(range(0, data_matrix.shape[1], 3)),
                                   interval=100, blit=True)
    # Save as GIF
    anim.save(f'trajectory_visualization_{name}.gif', writer='pillow')
    # Close the current figure once the file has been written
    plt.close()

# Visualizations are opt-in: nothing below runs unless `run_visualizations` is True.
if run_visualizations:
    # Static trajectory plot of training scene 0
    plot_one_training_scene(0)
    # Animated version of the same scene; make_gif saves it as 'trajectory_visualization_index0.gif'
    make_gif(train_data[0], 'index0')

# Constant velocity from test set
Untouched from original data loading notebook.

In [4]:
# Toggle for the constant-velocity baseline (Kaggle score of ~50)
run_constant_velocity_model: bool = False

if run_constant_velocity_model:
    # Frame-to-frame (x, y) displacement of every agent in every scene
    velocity_diff = np.diff(test_data[..., :2], axis=-2)
    print("Velocity difference shape:", velocity_diff.shape)

    # Mean per-step displacement of the ego vehicle (agent index 0), one row per scene
    constant_vel = velocity_diff[:, 0].mean(axis=1)
    print("Constant velocity shape:", constant_vel.shape)

    # Extrapolate 60 future steps from the ego vehicle's last observed position
    starting_point = test_data[:, 0, -1, :2]
    pred_y_const = np.zeros((test_data.shape[0], 60, 2))
    for step in range(1, 61):
        pred_y_const[:, step - 1, :] = starting_point + step * constant_vel

    # Flatten to the submission layout: (2100, 60, 2) -> (126000, 2)
    pred_output_const = pred_y_const.reshape(-1, 2)
    output_df_const = pd.DataFrame(pred_output_const, columns=['x', 'y'])
    output_df_const.index.name = 'index'

    # Write the CSV into the submission folder
    constant_vel_path = os.path.join(submission_dir, 'constant_vel_submission.csv')
    output_df_const.to_csv(constant_vel_path)
    print(f"Constant velocity submission saved locally as '{constant_vel_path}'.")

# Our Work

In [5]:
# "CNN" baseline with one residual block: ineffective for TimeSeries data
# (despite the name it is a stack of fully connected layers; it contains no convolutions)
class BasicCNN(nn.Module):
    """Fully connected baseline that maps a flattened scene to a flat prediction vector.

    Args:
        input_features: Number of values per sample after flattening.
        output_features: Size of the output vector per sample.

    The attribute names (`net`, `residual`, `head`) and layer order are part of the
    checkpoint format (state_dict keys), so they are kept stable.
    """
    def __init__(self, input_features, output_features):
        super().__init__()

        # Lazy layers infer the input size instead of having to explicitly pass it in
        # Backbone: linear -> BatchNorm -> PReLU -> Dropout
        self.net = nn.Sequential(
            nn.Linear(input_features, 2048),
            nn.LazyBatchNorm1d(),
            nn.PReLU(),
            nn.Dropout(0.2),

            nn.Linear(2048, 1024),
            nn.LazyBatchNorm1d(),
            nn.PReLU(),
            nn.Dropout(0.2),

            nn.Linear(1024, 512),
            nn.LazyBatchNorm1d(),
            nn.PReLU(),
            nn.Dropout(0.2),

            nn.Linear(512, 256),
            nn.LazyBatchNorm1d(),
            nn.PReLU(),
            nn.Dropout(0.2),
        ) # Note: residual width must match the last width of the net

        # Residual block added to avoid vanishing gradient issue
        self.residual = nn.Sequential(
            nn.LazyLinear(256),
            nn.ReLU(),
            nn.LazyLinear(256),
        )

        # Infer last input shape, then do final projection (60*2)
        self.head = nn.LazyLinear(output_features)

    def forward(self, x):
        """Flatten everything after the batch axis and return (batch, output_features)."""
        # (batch, 50, 50, 6) or flattened already.
        # reshape (not view) so sliced / transposed, i.e. non-contiguous, inputs work too;
        # for contiguous inputs the result is identical to view.
        x = x.reshape(x.size(0), -1)
        h = self.net(x) #(batch, 256)
        h = h + self.residual(h)  # residual skip
        return self.head(h) #(batch, output_features)

In [6]:
# Base LSTM given to us in the milestone notebook
class BaseLSTM(nn.Module):
    """Single-layer LSTM over the ego agent's history followed by a linear head.

    Args:
        input_dim: Number of features per timestep.
        hidden_dim: LSTM hidden size.
        output_dim: Size of the flat prediction; must be even because the output is
            reshaped to (batch, output_dim // 2, 2) x/y pairs.
        dropout: Forwarded to nn.LSTM. The LSTM here has one layer, and PyTorch only
            applies this dropout between stacked layers, so a non-zero value has no effect.

    Raises:
        ValueError: If `output_dim` is odd.
    """
    def __init__(self, input_dim:int =6, hidden_dim:int =128, output_dim:int =60 * 2, dropout:float = 0):
        super(BaseLSTM, self).__init__()
        if output_dim % 2 != 0:
            raise ValueError(f"output_dim must be even to form (x, y) pairs, got {output_dim}")
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return predictions of shape (batch, output_dim // 2, 2).

        `x` may be a tensor or any object exposing the tensor as `.x` (e.g. a DataBatch).
        It is interpreted as (batch, 50 agents, 50 timesteps, input_dim).
        """
        # In case you passed in a DataBatch
        if not isinstance(x, torch.Tensor):
            x = x.x

        # Use the configured feature count instead of a hard-coded 6 so that a model built
        # with a different input_dim reshapes its input correctly.
        x = x.reshape(-1, 50, 50, self.lstm.input_size)  # (batch_size, num_agents, seq_len, input_dim)
        x = x[:, 0, :, :] # Only Consider ego agent index 0

        lstm_out, _ = self.lstm(x)
        # lstm_out is of shape (batch_size, seq_len, hidden_dim) and we want the last time step output
        out = self.fc(lstm_out[:, -1, :])
        # Pin the batch axis explicitly: a hard-coded view(-1, 60, 2) would silently fold
        # several samples together whenever output_dim != 120.
        return out.view(out.size(0), -1, 2)

In [7]:
# Multi agent scene context model
class SceneContextModel(nn.Module):
    """Combine an ego-agent encoding with a mean-pooled encoding of all agents.

    Args:
        hidden_dim: Width of both encoders and of the decoder's hidden layer.

    Each agent's 50 timesteps x 6 features are flattened to 300 values, encoded, and
    averaged over the 50 agents to form the scene context. The ego agent (index 0) is
    encoded separately, concatenated with the context, and decoded to 60 * 2 values.
    """
    def __init__(self, hidden_dim=128):
        super().__init__()
        self.agent_encoder = nn.Sequential(
            nn.Linear(50 * 6, hidden_dim),
            nn.ReLU()
        )
        self.ego_encoder = nn.Sequential(
            nn.Linear(50 * 6, hidden_dim),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(2 * hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 60 * 2)
        )

    def forward(self, x_flat):
        """Return a (B, 120) prediction for a flat, 4-D, or agent-stacked input.

        `x_flat` may be a tensor or any object exposing the tensor as `.x`
        (e.g. a DataBatch).
        """
        # In case you passed in a DataBatch
        if not isinstance(x_flat, torch.Tensor):
            x_flat = x_flat.x

        # Derive the batch size AFTER reshaping to the scene layout. A DataBatch stacks
        # agents along axis 0, i.e. (B * 50, 50, 6), so reading size(0) first would give
        # B * 50 and make the reshape fail.
        x = x_flat.reshape(-1, 50, 50, 6) #(B, agents, timesteps, features)
        B = x.size(0)
        x_agents = x.reshape(B, 50, -1)  #(B, 50, 300)
        agent_feats = self.agent_encoder(x_agents) #(B, 50, hidden_dim)
        scene_context = agent_feats.mean(dim=1) #(B, hidden_dim)

        ego_input = x[:, 0, :, :].reshape(B, -1) #(B, 300)
        ego_feat = self.ego_encoder(ego_input) #(B, hidden_dim)

        combined = torch.cat([ego_feat, scene_context], dim=1)
        return self.decoder(combined) #(B, 120)

# Preparing data

`TrajectoryDataset*` are taken from the milestone notebook.

In [8]:
class TrajectoryDatasetTrain(Dataset):
    """Training scenes split into a 50-step history (all agents) and a 60-step ego future.

    Every item is re-centred on the ego agent's last observed position and divided by
    `scale`; optional augmentation applies a random rotation and/or an x-axis flip.
    """

    def __init__(self, data, scale=10.0, augment=True):
        """
        data: Shape (N, 50, 110, 6) Training data
        scale: Scale for normalization (suggested to use 10.0 for Argoverse 2 data)
        augment: Whether to apply data augmentation (only for training)
        """
        self.data = data
        self.scale = scale
        self.augment = augment

    def __len__(self):
        return len(self.data)

    @staticmethod
    def _augment(past, target):
        """Randomly rotate and/or mirror `past` (in place) and `target`; returns both."""
        # Each transform is applied independently with probability 0.5
        if np.random.rand() < 0.5:
            angle = np.random.uniform(-np.pi, np.pi)
            cos_a, sin_a = np.cos(angle), np.sin(angle)
            rotation = np.array([[cos_a, -sin_a],
                                 [sin_a, cos_a]], dtype=np.float32)
            # Same rotation for positions (features 0:2), features 2:4, and the future path
            past[..., :2] = past[..., :2] @ rotation
            past[..., 2:4] = past[..., 2:4] @ rotation
            # Done through numpy: `tensor @ ndarray` raised a DeprecationWarning
            target = torch.from_numpy(np.dot(target.numpy(), rotation))
        if np.random.rand() < 0.5:
            # Mirror across the y-axis: negate feature 0 and feature 2, and the future x
            past[..., 0] *= -1
            past[..., 2] *= -1
            target[:, 0] *= -1
        return past, target

    def __getitem__(self, idx):
        scene = self.data[idx]
        # First 50 timestamps of every agent are the input, the remaining ones of the
        # ego agent (index 0) are the prediction target
        past = scene[:, :50, :].copy()    # (agents=50, time_seq=50, 6)
        target = torch.tensor(scene[0, 50:, :2].copy(), dtype=torch.float32)  # (60, 2)

        if self.augment:
            past, target = self._augment(past, target)

        # The ego agent's last historical position becomes the coordinate origin
        origin = past[0, 49, :2].copy()  # (2,)
        past[..., :2] -= origin
        # Subtraction done through numpy for the same DeprecationWarning reason as above
        target = torch.from_numpy(target.numpy() - origin)

        # Scale the first four features of the history and the target
        past[..., :4] = past[..., :4] / self.scale
        target = target / self.scale

        return Data(
            x=torch.tensor(past, dtype=torch.float32),
            y=target.type(torch.float32),
            origin=torch.tensor(origin, dtype=torch.float32).unsqueeze(0),
            scale=torch.tensor(self.scale, dtype=torch.float32),
        )
    

class TrajectoryDatasetTest(Dataset):
    """History-only scenes, re-centred on the ego agent's last position and scaled."""

    def __init__(self, data, scale=10.0):
        """
        data: History-only scenes; the test array loaded in this notebook is (N, 50, 50, 6)
        scale: Scale for normalization (suggested to use 10.0 for Argoverse 2 data)
        """
        self.data, self.scale = data, scale

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Work on a copy so the backing array is never modified
        past = self.data[idx].copy()  # (50, 50, 6)

        # Ego agent (index 0) position at timestep 49 becomes the origin
        origin = past[0, 49, :2].copy()
        past[..., :2] -= origin
        past[..., :4] = past[..., :4] / self.scale

        # origin and scale travel with the item so predictions can be mapped back later
        return Data(
            x=torch.tensor(past, dtype=torch.float32),
            origin=torch.tensor(origin, dtype=torch.float32).unsqueeze(0),
            scale=torch.tensor(self.scale, dtype=torch.float32),
        )

# Training loop

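Change which model is used at the `model = ...` line inside `train_model`, and make the same change at the `model = ...` line inside `predict` so the saved checkpoint matches the architecture it is loaded into.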

Change which optimizer is used at the `optimizer = optim...` line.

Do **NOT** change the `criterion`, as MSE is stated in the Data tab of the competition.

In [9]:
# Taken from milestone notebook
# Set device for training speedup
def _pick_device():
    """Return the preferred torch.device (MPS, then CUDA, then CPU) and print the choice."""
    if torch.backends.mps.is_available():
        print("Using Apple Silicon GPU")
        return torch.device('mps')
    if torch.cuda.is_available():
        print("Using CUDA GPU")
        return torch.device('cuda')
    print("Using CPU")
    return torch.device('cpu')


device = _pick_device()

Using CPU


In [10]:
# Functions to save and load the model (should correspond to what was trained!)
def save_model(model, path="our_model.pth"):
    """Write `model`'s state_dict (weights only, not the architecture) to `path`."""
    weights = model.state_dict()
    torch.save(weights, path)
    print(f"Model saved to {path}")


def load_model(model_instance, path="our_model.pth"):
    """Load a saved state_dict into `model_instance` and switch it to eval mode.

    Args:
        model_instance: An already-constructed model whose architecture matches the
            one that produced the checkpoint.
        path: File written by `save_model` (a state_dict saved with torch.save).

    Returns:
        The same `model_instance`, with weights loaded and `eval()` applied.
    """
    # Deserialize straight onto the device the model lives on, so a checkpoint saved on a
    # GPU can still be loaded on a CPU-only machine (falls back to CPU for a model
    # without parameters).
    first_param = next(model_instance.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')
    # weights_only=True restricts unpickling to tensors / plain containers, which is all a
    # state_dict contains, and avoids executing arbitrary pickled code.
    state_dict = torch.load(path, map_location=target_device, weights_only=True)

    loaded_model = model_instance
    loaded_model.load_state_dict(state_dict)
    loaded_model.eval()
    return loaded_model


# Example usage (load_model needs an instance of the SAME architecture that was saved):
# save_model(trained_model)
# model = load_model(BaseLSTM())

In [20]:
# Reproducibility: seed the torch and numpy global RNGs
SEED: int = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Flattened sizes used by the non-LSTM models
# Note: LSTM models take features in different dimensions
input_features: int = 50 * 50 * 6   # agents x time steps x dimensions = 15000 inputs
output_features: int = 60 * 2       # future time steps x (x, y) = 120 outputs

# Training schedule
epochs: int = 200
batch_size: int = 64
num_folds: int = 4
starting_lr: float = 1e-2

# Early stopping: stop after this many epochs without improving by at least the threshold
early_stopping_patience: int = 20
early_stopping_threshold: float = 1e-4

# Data normalization and model width
scale: float = 10.0
lstm_hidden_dim = 128

def _unpack_batch(batch):
    """Move one DataLoader batch onto `device` and split it into inputs and targets.

    Returns:
        (inputs, targets, graph_batch). `graph_batch` is the device-resident DataBatch
        (it carries `scale` / `origin` for de-normalization), or None when the batch is
        a plain (x, y) pair from a TensorDataset.
    """
    if isinstance(batch, (tuple, list)):
        batch_x, batch_y = batch
        # TensorDataset batches are created on the CPU; without this move the forward
        # pass fails with a device mismatch whenever the model lives on a GPU.
        return batch_x.to(device), batch_y.to(device), None
    # DataBatch type
    batch = batch.to(device)
    return batch.x, batch.y.view(batch.num_graphs, 60, 2), batch


def train_model(full_training_data: np.ndarray, 
                batch_size:int = 64, epochs:int = 10, num_folds:int = 5,
                early_stopping_patience:int = 5, early_stopping_threshold:float = 1e-3):
    """Train a fresh model on every TimeSeriesSplit fold and checkpoint the best one.

    Args:
        full_training_data: Scenes indexed along axis 0; the fold indices select scenes.
        batch_size: Batch size for both the training and validation DataLoader.
        epochs: Maximum number of epochs per fold.
        num_folds: Number of TimeSeriesSplit folds.
        early_stopping_patience: Epochs without sufficient improvement before a fold stops.
        early_stopping_threshold: Minimum decrease of the validation loss that counts
            as an improvement.

    Returns:
        The lowest mean validation loss seen over all folds (inf if nothing improved).

    Side effects:
        Writes the state_dict of the best model so far to "best_model.pt" and prints
        progress. Reads the module-level settings `starting_lr`, `scale`,
        `lstm_hidden_dim`, `input_features`, `output_features` and `device`; they are
        only read, so no `global` declaration is needed.
    """
    # Time series data needs to keep its data in relative order, so no shuffling can occur
    #   like in regular KFold cross validation
    splitter = TimeSeriesSplit(n_splits=num_folds)

    # Perform cross-validation, the best model will be saved as "best_model.pt" to be loaded in later
    overall_best_val_loss = float("inf")

    # DO NOT CHANGE CRITERION
    # Loss/metric modules are stateless, so they are built once instead of per fold/batch
    criterion = nn.MSELoss()
    mae_metric = nn.L1Loss()
    mse_metric = nn.MSELoss()

    # Resources used:
    # Project milestone notebook
    # https://github.com/christianversloot/machine-learning-articles/blob/main/how-to-use-k-fold-cross-validation-with-pytorch.md
    # https://www.geeksforgeeks.org/time-series-cross-validation/
    for fold_i, (train_idx, val_idx) in enumerate(splitter.split(full_training_data)):
        print(f"\nFOLD {fold_i + 1}/{num_folds} ==================================")

        # Create the model and optimizer (reset per fold, to find the best model)
        # If you change the model here, ensure its the same in the test loop!
        model = BaseLSTM(input_dim=6, hidden_dim=lstm_hidden_dim, output_dim=output_features).to(device)
        # model = SceneContextModel(hidden_dim=864).to(device)

        optimizer = optim.AdamW(model.parameters(), lr=starting_lr, weight_decay=1e-2)
        # Both schedulers are stepped once per epoch on the same optimizer
        schedulers: list[lr_scheduler.LRScheduler] =[
            lr_scheduler.ExponentialLR(optimizer, gamma=0.98),
            lr_scheduler.CosineAnnealingLR(
                optimizer, 
                # At least 1 so very small `epochs` values cannot produce T_max == 0
                T_max=max(1, int(epochs * 0.9)),
                eta_min=1e-5
            ),
        ]

        # Prepare data from this fold
        train_fold: np.ndarray = full_training_data[train_idx]
        val_fold: np.ndarray = full_training_data[val_idx]
        collate_func = None     # Optional for DataLoader, taken from milestone notebook
        if isinstance(model, BaseLSTM):
            # LSTM can handle the timeseries data directly
            # TrajectoryDataset expects numpy arrays
            collate_func = Batch.from_data_list
            train_dataset = TrajectoryDatasetTrain(train_fold, scale=scale, augment=True)
            val_dataset = TrajectoryDatasetTrain(val_fold, scale=scale, augment=False)
        else:
            # Other models train on flattened, un-normalized tensors
            train_x: np.ndarray = train_fold[..., :50, :]
            train_y: np.ndarray = train_fold[:, 0, 50:, :2]
            X_train_tensor = torch.FloatTensor(train_x).reshape((-1, input_features))
            y_train_tensor = torch.FloatTensor(train_y).reshape((-1, output_features))
            train_dataset = TensorDataset(X_train_tensor, y_train_tensor)

            val_x: np.ndarray = val_fold[..., :50, :]
            val_y: np.ndarray = val_fold[:, 0, 50:, :2]
            X_val_tensor = torch.FloatTensor(val_x).reshape((-1, input_features))
            y_val_tensor = torch.FloatTensor(val_y).reshape((-1, output_features))
            val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

        train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_func)
        val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_func)

        best_val_loss: float = float("inf")
        no_improvement: int = 0

        # Training and validation loops are taken from the milestone notebook,
        #   with modifications to allow for different data loading shapes
        for epoch in tqdm(range(epochs), desc="Epoch", unit="epoch"):
            # Training loop
            model.train()
            train_loss = 0
            for batch in train_dataloader:
                batch_x, batch_y, _ = _unpack_batch(batch)

                optimizer.zero_grad()
                outputs = model(batch_x)
                loss = criterion(outputs, batch_y)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
                optimizer.step()
                train_loss += loss.item()

            # Validation loop
            model.eval()
            val_loss = 0
            val_mae = 0
            val_mse = 0
            with torch.no_grad():
                for batch in val_dataloader:
                    batch_x, batch_y, graph_batch = _unpack_batch(batch)

                    pred = model(batch_x)
                    val_loss += criterion(pred, batch_y).item()

                    # show MAE and MSE with unnormalized data
                    if graph_batch is None:
                        # Tensor batches were never normalized
                        y = batch_y
                    else: # DataBatch type: undo the scaling and re-centering
                        pred = pred * graph_batch.scale.view(-1, 1, 1) + graph_batch.origin.unsqueeze(1)
                        y = batch_y * graph_batch.scale.view(-1, 1, 1) + graph_batch.origin.unsqueeze(1)
                    val_mae += mae_metric(pred, y).item()
                    val_mse += mse_metric(pred, y).item()

            # Per-epoch values are means of the per-batch means
            train_loss /= len(train_dataloader)
            val_loss /= len(val_dataloader)
            val_mae /= len(val_dataloader)
            val_mse /= len(val_dataloader)

            if (epoch + 1) % 5 == 0:
                tqdm.write(f"Epoch {(epoch + 1):03d} | Learning rate {optimizer.param_groups[0]['lr']:.6f} | train normalized MSE {train_loss:8.4f} | val normalized MSE {val_loss:8.4f}, | val MAE {val_mae:8.4f} | val MSE {val_mse:8.4f}")

            if val_loss < best_val_loss - early_stopping_threshold:
                best_val_loss = val_loss
                no_improvement = 0

                # Better than the overall seen so far?
                if best_val_loss < overall_best_val_loss:
                    overall_best_val_loss = best_val_loss
                    torch.save(model.state_dict(), "best_model.pt")
            else:
                no_improvement += 1
                if no_improvement >= early_stopping_patience:
                    print(f"==== EARLY STOP at epoch {(epoch + 1):03d}")
                    break

            for sched in schedulers:
                sched.step()

    return overall_best_val_loss


# Load in the model saved during training to use on X_test
# Mostly taken from milestone notebook
def predict(X_test, best_model_path: str = "best_model.pt", model=None):
    """Run the checkpointed model on `X_test` and return a submission DataFrame.

    Args:
        X_test: History-only scenes; flattened to `input_features` values per scene for
            non-LSTM models, wrapped in TrajectoryDatasetTest for BaseLSTM.
        best_model_path: state_dict file written during training.
        model: Optional un-loaded model instance whose architecture matches the
            checkpoint. Defaults to the BaseLSTM configuration used in training.

    Returns:
        DataFrame with columns 'x' and 'y' and an index named 'index', holding 60 rows
        per scene.
    """
    # Ensure this aligns with the trained model!
    if model is None:
        model = BaseLSTM(input_dim=6, hidden_dim=lstm_hidden_dim, output_dim=output_features)
    model = model.to(device)

    # map_location lets a checkpoint saved on one device (e.g. CUDA) load on another
    # (e.g. CPU); weights_only=True limits unpickling to tensors / plain containers.
    best_model = torch.load(best_model_path, map_location=device, weights_only=True)
    model.load_state_dict(best_model)
    model.eval()

    pred_list = []
    with torch.no_grad():
        if isinstance(model, BaseLSTM): # Using DataBatch type from a DataLoader
            test_dataset = TrajectoryDatasetTest(X_test, scale=scale)
            test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False,
                                     collate_fn=Batch.from_data_list)

            for batch in test_loader:
                batch = batch.to(device)
                pred_norm = model(batch.x)

                # Undo the normalization: rescale and shift back by each scene's origin -> (N, 60, 2)
                pred = pred_norm * batch.scale.view(-1,1,1) + batch.origin.unsqueeze(1)
                pred_list.append(pred.cpu().numpy())
        else:
            # Non-LSTM models consume flattened, un-normalized scenes in a single batch
            X_test_tensor = torch.FloatTensor(X_test).reshape((-1, input_features)).to(device)
            pred = model(X_test_tensor).cpu().reshape((-1, 60, 2))
            pred_list.append(pred.numpy())

    # Reshape predictions to match submission format: (2100, 60, 2) -> (126000, 2)
    pred_list = np.concatenate(pred_list, axis=0)  # (N,60,2)
    pred_output = pred_list.reshape(-1, 2)  # (N*60, 2)
    output_df = pd.DataFrame(pred_output, columns=['x', 'y'])
    output_df.index.name = 'index'
    return output_df

# Train the model (tweak batch_size and epochs as needed at top of this block)
# The best weights are written to "best_model.pt" and picked up again by predict()
train_model(
    train_data,
    batch_size=batch_size,
    epochs=epochs,
    num_folds=num_folds,
    early_stopping_patience=early_stopping_patience,
    early_stopping_threshold=early_stopping_threshold,
)

# Make predictions on the test set: 2100 scenes x 60 future steps = 126000 rows
model_predictions_df = predict(test_data)
assert len(model_predictions_df) == 126000, f"Incorrect number of rows in output, expected 126000, got {len(model_predictions_df)}"

# Save output in the submission folder, timestamped!
timestamp = f"{datetime.now():%Y-%m-%d_%I-%M%p}"
submission_path = os.path.join(submission_dir, f"submission-{timestamp}.csv")

model_predictions_df.to_csv(submission_path)
print(f"Submission saved locally as: '{submission_path}'.")




Epoch:   2%|▎         | 5/200 [00:11<07:12,  2.22s/epoch]

Epoch 005 | Learning rate 0.009212 | train normalized MSE   0.1994 | val normalized MSE   0.1968, | val MAE   2.5233 | val MSE  19.6847


Epoch:   5%|▌         | 10/200 [00:22<07:07,  2.25s/epoch]

Epoch 010 | Learning rate 0.008286 | train normalized MSE   0.1787 | val normalized MSE   0.1959, | val MAE   2.5990 | val MSE  19.5908


Epoch:   8%|▊         | 15/200 [00:36<08:35,  2.79s/epoch]

Epoch 015 | Learning rate 0.007425 | train normalized MSE   0.1532 | val normalized MSE   0.1507, | val MAE   2.0977 | val MSE  15.0677


Epoch:  10%|█         | 20/200 [00:51<08:59,  3.00s/epoch]

Epoch 020 | Learning rate 0.006627 | train normalized MSE   0.1365 | val normalized MSE   0.1258, | val MAE   1.8434 | val MSE  12.5803


Epoch:  12%|█▎        | 25/200 [01:06<08:36,  2.95s/epoch]

Epoch 025 | Learning rate 0.005892 | train normalized MSE   0.1255 | val normalized MSE   0.1314, | val MAE   1.8664 | val MSE  13.1405


Epoch:  15%|█▌        | 30/200 [01:19<07:45,  2.74s/epoch]

Epoch 030 | Learning rate 0.005218 | train normalized MSE   0.1196 | val normalized MSE   0.1153, | val MAE   1.7319 | val MSE  11.5259


Epoch:  18%|█▊        | 35/200 [01:33<07:38,  2.78s/epoch]

Epoch 035 | Learning rate 0.004602 | train normalized MSE   0.1121 | val normalized MSE   0.1124, | val MAE   1.6705 | val MSE  11.2447


Epoch:  20%|██        | 40/200 [01:47<07:19,  2.75s/epoch]

Epoch 040 | Learning rate 0.004042 | train normalized MSE   0.1117 | val normalized MSE   0.1151, | val MAE   1.6735 | val MSE  11.5107


Epoch:  22%|██▎       | 45/200 [02:01<07:26,  2.88s/epoch]

Epoch 045 | Learning rate 0.003535 | train normalized MSE   0.1014 | val normalized MSE   0.1077, | val MAE   1.6668 | val MSE  10.7653


Epoch:  25%|██▌       | 50/200 [02:15<06:41,  2.67s/epoch]

Epoch 050 | Learning rate 0.003078 | train normalized MSE   0.0972 | val normalized MSE   0.0990, | val MAE   1.5061 | val MSE   9.8960


Epoch:  28%|██▊       | 55/200 [02:28<06:26,  2.66s/epoch]

Epoch 055 | Learning rate 0.002668 | train normalized MSE   0.0951 | val normalized MSE   0.1014, | val MAE   1.5444 | val MSE  10.1355


Epoch:  30%|███       | 60/200 [02:41<06:05,  2.61s/epoch]

Epoch 060 | Learning rate 0.002302 | train normalized MSE   0.0932 | val normalized MSE   0.1016, | val MAE   1.5286 | val MSE  10.1582


Epoch:  32%|███▎      | 65/200 [02:54<05:51,  2.60s/epoch]

Epoch 065 | Learning rate 0.001976 | train normalized MSE   0.0921 | val normalized MSE   0.0958, | val MAE   1.4562 | val MSE   9.5786


Epoch:  35%|███▌      | 70/200 [03:08<05:50,  2.70s/epoch]

Epoch 070 | Learning rate 0.001687 | train normalized MSE   0.0896 | val normalized MSE   0.0968, | val MAE   1.4962 | val MSE   9.6754


Epoch:  38%|███▊      | 75/200 [03:21<05:25,  2.60s/epoch]

Epoch 075 | Learning rate 0.001433 | train normalized MSE   0.0938 | val normalized MSE   0.0999, | val MAE   1.5473 | val MSE   9.9870


Epoch:  40%|████      | 80/200 [03:36<05:47,  2.90s/epoch]

Epoch 080 | Learning rate 0.001210 | train normalized MSE   0.0881 | val normalized MSE   0.0947, | val MAE   1.4270 | val MSE   9.4729


Epoch:  42%|████▎     | 85/200 [03:50<05:24,  2.82s/epoch]

Epoch 085 | Learning rate 0.001015 | train normalized MSE   0.0871 | val normalized MSE   0.0957, | val MAE   1.4355 | val MSE   9.5691


Epoch:  45%|████▌     | 90/200 [04:03<04:58,  2.72s/epoch]

Epoch 090 | Learning rate 0.000846 | train normalized MSE   0.0867 | val normalized MSE   0.0932, | val MAE   1.4456 | val MSE   9.3195


Epoch:  48%|████▊     | 95/200 [04:17<04:43,  2.70s/epoch]

Epoch 095 | Learning rate 0.000700 | train normalized MSE   0.0842 | val normalized MSE   0.0935, | val MAE   1.4403 | val MSE   9.3506


Epoch:  50%|█████     | 100/200 [04:31<04:34,  2.74s/epoch]

Epoch 100 | Learning rate 0.000575 | train normalized MSE   0.0814 | val normalized MSE   0.0941, | val MAE   1.4502 | val MSE   9.4124


Epoch:  52%|█████▎    | 105/200 [04:44<04:21,  2.75s/epoch]

Epoch 105 | Learning rate 0.000468 | train normalized MSE   0.0807 | val normalized MSE   0.0914, | val MAE   1.4162 | val MSE   9.1353


Epoch:  55%|█████▌    | 110/200 [04:58<04:02,  2.69s/epoch]

Epoch 110 | Learning rate 0.000377 | train normalized MSE   0.0793 | val normalized MSE   0.0920, | val MAE   1.3971 | val MSE   9.2035


Epoch:  57%|█████▊    | 115/200 [05:11<03:41,  2.61s/epoch]

Epoch 115 | Learning rate 0.000301 | train normalized MSE   0.0770 | val normalized MSE   0.0912, | val MAE   1.3934 | val MSE   9.1181


Epoch:  60%|██████    | 120/200 [05:24<03:26,  2.58s/epoch]

Epoch 120 | Learning rate 0.000238 | train normalized MSE   0.0785 | val normalized MSE   0.0908, | val MAE   1.3913 | val MSE   9.0845


Epoch:  62%|██████▎   | 125/200 [05:37<03:18,  2.65s/epoch]

Epoch 125 | Learning rate 0.000185 | train normalized MSE   0.0778 | val normalized MSE   0.0916, | val MAE   1.4048 | val MSE   9.1596


Epoch:  62%|██████▎   | 125/200 [05:40<03:24,  2.72s/epoch]

==== EARLY STOP at epoch 126




Epoch:   2%|▎         | 5/200 [00:24<15:37,  4.81s/epoch]

Epoch 005 | Learning rate 0.009212 | train normalized MSE   0.1733 | val normalized MSE   0.1664, | val MAE   2.1893 | val MSE  16.6385


Epoch:   5%|▌         | 10/200 [00:47<14:23,  4.54s/epoch]

Epoch 010 | Learning rate 0.008286 | train normalized MSE   0.1359 | val normalized MSE   0.1432, | val MAE   1.9968 | val MSE  14.3215


Epoch:   8%|▊         | 15/200 [01:09<13:49,  4.48s/epoch]

Epoch 015 | Learning rate 0.007425 | train normalized MSE   0.1180 | val normalized MSE   0.1210, | val MAE   1.6747 | val MSE  12.0971


Epoch:  10%|█         | 20/200 [01:32<13:17,  4.43s/epoch]

Epoch 020 | Learning rate 0.006627 | train normalized MSE   0.1114 | val normalized MSE   0.1116, | val MAE   1.6363 | val MSE  11.1647


Epoch:  12%|█▎        | 25/200 [01:54<13:04,  4.48s/epoch]

Epoch 025 | Learning rate 0.005892 | train normalized MSE   0.1050 | val normalized MSE   0.1113, | val MAE   1.6734 | val MSE  11.1275


Epoch:  15%|█▌        | 30/200 [02:17<12:48,  4.52s/epoch]

Epoch 030 | Learning rate 0.005218 | train normalized MSE   0.1021 | val normalized MSE   0.1063, | val MAE   1.5571 | val MSE  10.6291


Epoch:  18%|█▊        | 35/200 [02:42<13:29,  4.91s/epoch]

Epoch 035 | Learning rate 0.004602 | train normalized MSE   0.0971 | val normalized MSE   0.1119, | val MAE   1.7463 | val MSE  11.1862


Epoch:  20%|██        | 40/200 [03:08<13:55,  5.22s/epoch]

Epoch 040 | Learning rate 0.004042 | train normalized MSE   0.0992 | val normalized MSE   0.1059, | val MAE   1.5736 | val MSE  10.5924


Epoch:  22%|██▎       | 45/200 [03:34<13:32,  5.24s/epoch]

Epoch 045 | Learning rate 0.003535 | train normalized MSE   0.0911 | val normalized MSE   0.1012, | val MAE   1.5317 | val MSE  10.1196


Epoch:  25%|██▌       | 50/200 [03:57<11:39,  4.66s/epoch]

Epoch 050 | Learning rate 0.003078 | train normalized MSE   0.0892 | val normalized MSE   0.1071, | val MAE   1.6161 | val MSE  10.7060


Epoch:  28%|██▊       | 55/200 [04:19<10:30,  4.35s/epoch]

Epoch 055 | Learning rate 0.002668 | train normalized MSE   0.0876 | val normalized MSE   0.0983, | val MAE   1.5416 | val MSE   9.8277


Epoch:  30%|███       | 60/200 [04:41<10:49,  4.64s/epoch]

Epoch 060 | Learning rate 0.002302 | train normalized MSE   0.0867 | val normalized MSE   0.0984, | val MAE   1.5197 | val MSE   9.8356


Epoch:  32%|███▎      | 65/200 [05:05<10:37,  4.72s/epoch]

Epoch 065 | Learning rate 0.001976 | train normalized MSE   0.0854 | val normalized MSE   0.0974, | val MAE   1.5103 | val MSE   9.7410


Epoch:  35%|███▌      | 70/200 [05:29<10:04,  4.65s/epoch]

Epoch 070 | Learning rate 0.001687 | train normalized MSE   0.0831 | val normalized MSE   0.0966, | val MAE   1.4802 | val MSE   9.6630


Epoch:  38%|███▊      | 75/200 [05:52<09:48,  4.70s/epoch]

Epoch 075 | Learning rate 0.001433 | train normalized MSE   0.0819 | val normalized MSE   0.0967, | val MAE   1.4718 | val MSE   9.6691


Epoch:  40%|████      | 80/200 [06:16<09:31,  4.76s/epoch]

Epoch 080 | Learning rate 0.001210 | train normalized MSE   0.0831 | val normalized MSE   0.0938, | val MAE   1.4415 | val MSE   9.3752


Epoch:  42%|████▎     | 85/200 [06:39<08:55,  4.65s/epoch]

Epoch 085 | Learning rate 0.001015 | train normalized MSE   0.0785 | val normalized MSE   0.0923, | val MAE   1.4121 | val MSE   9.2330


Epoch:  45%|████▌     | 90/200 [07:03<08:33,  4.67s/epoch]

Epoch 090 | Learning rate 0.000846 | train normalized MSE   0.0788 | val normalized MSE   0.0954, | val MAE   1.4401 | val MSE   9.5436


Epoch:  48%|████▊     | 95/200 [07:26<08:10,  4.67s/epoch]

Epoch 095 | Learning rate 0.000700 | train normalized MSE   0.0775 | val normalized MSE   0.0931, | val MAE   1.4130 | val MSE   9.3101


Epoch:  50%|█████     | 100/200 [07:48<07:26,  4.46s/epoch]

Epoch 100 | Learning rate 0.000575 | train normalized MSE   0.0771 | val normalized MSE   0.0925, | val MAE   1.4342 | val MSE   9.2525


Epoch:  52%|█████▎    | 105/200 [08:10<06:56,  4.39s/epoch]

Epoch 105 | Learning rate 0.000468 | train normalized MSE   0.0756 | val normalized MSE   0.0918, | val MAE   1.3861 | val MSE   9.1773


Epoch:  55%|█████▌    | 110/200 [08:32<06:46,  4.52s/epoch]

Epoch 110 | Learning rate 0.000377 | train normalized MSE   0.0751 | val normalized MSE   0.0923, | val MAE   1.4008 | val MSE   9.2302


Epoch:  57%|█████▊    | 115/200 [08:55<06:20,  4.47s/epoch]

Epoch 115 | Learning rate 0.000301 | train normalized MSE   0.0729 | val normalized MSE   0.0914, | val MAE   1.3882 | val MSE   9.1436


Epoch:  60%|██████    | 120/200 [09:17<05:55,  4.44s/epoch]

Epoch 120 | Learning rate 0.000238 | train normalized MSE   0.0741 | val normalized MSE   0.0921, | val MAE   1.4006 | val MSE   9.2071


Epoch:  62%|██████▎   | 125/200 [09:40<05:32,  4.44s/epoch]

Epoch 125 | Learning rate 0.000185 | train normalized MSE   0.0727 | val normalized MSE   0.0911, | val MAE   1.3852 | val MSE   9.1145


Epoch:  65%|██████▌   | 130/200 [10:01<05:03,  4.33s/epoch]

Epoch 130 | Learning rate 0.000142 | train normalized MSE   0.0730 | val normalized MSE   0.0918, | val MAE   1.3800 | val MSE   9.1813


Epoch:  66%|██████▌   | 131/200 [10:10<05:21,  4.66s/epoch]

==== EARLY STOP at epoch 132




Epoch:   2%|▎         | 5/200 [00:32<21:06,  6.50s/epoch]

Epoch 005 | Learning rate 0.009212 | train normalized MSE   0.1655 | val normalized MSE   0.1515, | val MAE   1.9834 | val MSE  15.1542


Epoch:   5%|▌         | 10/200 [01:05<20:32,  6.48s/epoch]

Epoch 010 | Learning rate 0.008286 | train normalized MSE   0.1327 | val normalized MSE   0.1525, | val MAE   2.0923 | val MSE  15.2541


Epoch:   8%|▊         | 15/200 [01:36<19:28,  6.32s/epoch]

Epoch 015 | Learning rate 0.007425 | train normalized MSE   0.1109 | val normalized MSE   0.1132, | val MAE   1.7599 | val MSE  11.3239


Epoch:  10%|█         | 20/200 [02:07<18:28,  6.16s/epoch]

Epoch 020 | Learning rate 0.006627 | train normalized MSE   0.1152 | val normalized MSE   0.1087, | val MAE   1.6222 | val MSE  10.8716


Epoch:  12%|█▎        | 25/200 [02:37<17:14,  5.91s/epoch]

Epoch 025 | Learning rate 0.005892 | train normalized MSE   0.0985 | val normalized MSE   0.1040, | val MAE   1.6316 | val MSE  10.4038


Epoch:  15%|█▌        | 30/200 [03:08<18:00,  6.36s/epoch]

Epoch 030 | Learning rate 0.005218 | train normalized MSE   0.0974 | val normalized MSE   0.1029, | val MAE   1.5670 | val MSE  10.2899


Epoch:  18%|█▊        | 35/200 [03:36<15:20,  5.58s/epoch]

Epoch 035 | Learning rate 0.004602 | train normalized MSE   0.0932 | val normalized MSE   0.0954, | val MAE   1.4789 | val MSE   9.5362


Epoch:  20%|██        | 40/200 [04:02<14:08,  5.30s/epoch]

Epoch 040 | Learning rate 0.004042 | train normalized MSE   0.0911 | val normalized MSE   0.0914, | val MAE   1.4380 | val MSE   9.1385


Epoch:  22%|██▎       | 45/200 [04:30<14:13,  5.50s/epoch]

Epoch 045 | Learning rate 0.003535 | train normalized MSE   0.0898 | val normalized MSE   0.0905, | val MAE   1.4436 | val MSE   9.0529


Epoch:  25%|██▌       | 50/200 [04:58<14:05,  5.64s/epoch]

Epoch 050 | Learning rate 0.003078 | train normalized MSE   0.0874 | val normalized MSE   0.0902, | val MAE   1.4225 | val MSE   9.0157


Epoch:  28%|██▊       | 55/200 [05:30<14:45,  6.11s/epoch]

Epoch 055 | Learning rate 0.002668 | train normalized MSE   0.0857 | val normalized MSE   0.0899, | val MAE   1.4300 | val MSE   8.9917


Epoch:  30%|███       | 60/200 [05:59<13:26,  5.76s/epoch]

Epoch 060 | Learning rate 0.002302 | train normalized MSE   0.0844 | val normalized MSE   0.0874, | val MAE   1.4122 | val MSE   8.7401


Epoch:  32%|███▎      | 65/200 [06:29<13:42,  6.09s/epoch]

Epoch 065 | Learning rate 0.001976 | train normalized MSE   0.0834 | val normalized MSE   0.0871, | val MAE   1.3899 | val MSE   8.7059


Epoch:  35%|███▌      | 70/200 [06:59<13:28,  6.22s/epoch]

Epoch 070 | Learning rate 0.001687 | train normalized MSE   0.0820 | val normalized MSE   0.0895, | val MAE   1.3847 | val MSE   8.9470


Epoch:  38%|███▊      | 75/200 [07:32<13:19,  6.40s/epoch]

Epoch 075 | Learning rate 0.001433 | train normalized MSE   0.0804 | val normalized MSE   0.0868, | val MAE   1.3992 | val MSE   8.6764


Epoch:  40%|████      | 80/200 [08:03<12:28,  6.24s/epoch]

Epoch 080 | Learning rate 0.001210 | train normalized MSE   0.0801 | val normalized MSE   0.0847, | val MAE   1.3642 | val MSE   8.4742


Epoch:  42%|████▎     | 85/200 [08:33<11:36,  6.05s/epoch]

Epoch 085 | Learning rate 0.001015 | train normalized MSE   0.0785 | val normalized MSE   0.0855, | val MAE   1.3847 | val MSE   8.5486


Epoch:  45%|████▌     | 90/200 [09:03<11:02,  6.02s/epoch]

Epoch 090 | Learning rate 0.000846 | train normalized MSE   0.0776 | val normalized MSE   0.0839, | val MAE   1.3521 | val MSE   8.3930


Epoch:  48%|████▊     | 95/200 [09:33<10:24,  5.95s/epoch]

Epoch 095 | Learning rate 0.000700 | train normalized MSE   0.0764 | val normalized MSE   0.0841, | val MAE   1.3403 | val MSE   8.4062


Epoch:  50%|█████     | 100/200 [10:04<10:29,  6.30s/epoch]

Epoch 100 | Learning rate 0.000575 | train normalized MSE   0.0764 | val normalized MSE   0.0853, | val MAE   1.3550 | val MSE   8.5288


Epoch:  52%|█████▎    | 105/200 [10:34<09:05,  5.74s/epoch]

Epoch 105 | Learning rate 0.000468 | train normalized MSE   0.0770 | val normalized MSE   0.0844, | val MAE   1.3390 | val MSE   8.4439


Epoch:  55%|█████▌    | 110/200 [11:00<08:06,  5.40s/epoch]

Epoch 110 | Learning rate 0.000377 | train normalized MSE   0.0750 | val normalized MSE   0.0840, | val MAE   1.3277 | val MSE   8.3998


Epoch:  57%|█████▊    | 115/200 [11:27<07:49,  5.52s/epoch]

Epoch 115 | Learning rate 0.000301 | train normalized MSE   0.0737 | val normalized MSE   0.0842, | val MAE   1.3460 | val MSE   8.4212


Epoch:  59%|█████▉    | 118/200 [11:51<08:14,  6.03s/epoch]

==== EARLY STOP at epoch 119




Epoch:   2%|▎         | 5/200 [00:35<23:10,  7.13s/epoch]

Epoch 005 | Learning rate 0.009212 | train normalized MSE   0.1502 | val normalized MSE   0.1412, | val MAE   2.0292 | val MSE  14.1179


Epoch:   5%|▌         | 10/200 [01:09<22:04,  6.97s/epoch]

Epoch 010 | Learning rate 0.008286 | train normalized MSE   0.1346 | val normalized MSE   0.1734, | val MAE   2.2311 | val MSE  17.3427


Epoch:   8%|▊         | 15/200 [01:44<21:09,  6.86s/epoch]

Epoch 015 | Learning rate 0.007425 | train normalized MSE   0.1091 | val normalized MSE   0.1090, | val MAE   1.6654 | val MSE  10.9015


Epoch:  10%|█         | 20/200 [02:17<20:13,  6.74s/epoch]

Epoch 020 | Learning rate 0.006627 | train normalized MSE   0.1009 | val normalized MSE   0.1043, | val MAE   1.5834 | val MSE  10.4346


Epoch:  12%|█▎        | 25/200 [02:52<20:42,  7.10s/epoch]

Epoch 025 | Learning rate 0.005892 | train normalized MSE   0.0969 | val normalized MSE   0.1037, | val MAE   1.6797 | val MSE  10.3749


Epoch:  15%|█▌        | 30/200 [03:25<18:39,  6.59s/epoch]

Epoch 030 | Learning rate 0.005218 | train normalized MSE   0.0939 | val normalized MSE   0.0944, | val MAE   1.4601 | val MSE   9.4414


Epoch:  18%|█▊        | 35/200 [04:01<20:11,  7.34s/epoch]

Epoch 035 | Learning rate 0.004602 | train normalized MSE   0.0911 | val normalized MSE   0.0926, | val MAE   1.4399 | val MSE   9.2643


Epoch:  20%|██        | 40/200 [04:42<21:33,  8.09s/epoch]

Epoch 040 | Learning rate 0.004042 | train normalized MSE   0.0889 | val normalized MSE   0.0935, | val MAE   1.4736 | val MSE   9.3471


Epoch:  22%|██▎       | 45/200 [05:20<19:22,  7.50s/epoch]

Epoch 045 | Learning rate 0.003535 | train normalized MSE   0.1099 | val normalized MSE   0.1040, | val MAE   1.6041 | val MSE  10.3984


Epoch:  25%|██▌       | 50/200 [05:54<16:46,  6.71s/epoch]

Epoch 050 | Learning rate 0.003078 | train normalized MSE   0.0981 | val normalized MSE   0.0990, | val MAE   1.4877 | val MSE   9.8956


Epoch:  28%|██▊       | 55/200 [06:28<16:49,  6.96s/epoch]

Epoch 055 | Learning rate 0.002668 | train normalized MSE   0.0948 | val normalized MSE   0.0972, | val MAE   1.4939 | val MSE   9.7195


Epoch:  28%|██▊       | 55/200 [06:36<17:24,  7.20s/epoch]

==== EARLY STOP at epoch 056





Submission saved locally as: './submission\submission-2025-05-09_01-17PM.csv'.
