In [1]:
import os
import sys
from tqdm import tqdm
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from sklearn.model_selection import TimeSeriesSplit

import torch
import torch.nn as nn
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch_geometric.data import Data, Batch

In [2]:
# Create submission folder if it doesn't exist (all CSVs written by this notebook go here)
submission_dir = './submission'
os.makedirs(submission_dir, exist_ok=True)

# Uncomment the following block ONLY if you wish to inspect file paths in a Kaggle-like directory structure.
# On your local system, you likely have the files in your local folder so this is not needed.
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))


# Data Loading for Local Environment
# Files are assumed to be in:
# ./cse-251-b-2025/train.npz
# ./cse-251-b-2025/test_input.npz
# Both archives store a single array under the key 'data', indexed as
# (scene, agent, timestep, feature). Agent index 0 is treated as the ego vehicle by later cells.

# Training scenes hold 110 timesteps: later cells use the first 50 as history and the last 60 as targets
train_file = np.load("./cse-251-b-2025/train.npz")
train_data = train_file['data']
print("train_data's shape:", train_data.shape)  # Expected shape: (10000, 50, 110, 6)

# Test scenes hold only the 50 history timesteps
test_file = np.load("./cse-251-b-2025/test_input.npz")
test_data = test_file['data']
print("test_data's shape:", test_data.shape)    # Expected shape: (2100, 50, 50, 6)

train_data's shape: (10000, 50, 110, 6)
test_data's shape: (2100, 50, 50, 6)


# Visualization

In [3]:
# Run visualizations? Set to True to draw the static trajectory plot and write the
# animated GIF for training scene 0 (both are triggered at the end of this cell).
run_visualizations: bool = False

# From data loading notebook
def plot_one_training_scene(idx: int = 0):
    """Plot the x/y trajectory of every agent in one training scene (static plot).

    Args:
        idx: Index of the scene inside the notebook-level ``train_data`` array.
    """
    data_matrix = train_data[idx]  # one scene, indexed as [agent, timestep, feature]

    plt.figure(figsize=(8, 8))
    # Iterate over the agent axis (axis 0). The scene index must not be used to
    # index the shape tuple, otherwise any idx other than 0 plots the wrong number
    # of agents or raises an IndexError.
    for agent in range(data_matrix.shape[0]):
        xs = data_matrix[agent, :, 0]
        ys = data_matrix[agent, :, 1]
        # Remove zeros (padding) with one joint mask so xs and ys always keep the
        # same length; this is the same rule make_gif uses.
        valid = (xs != 0) & (ys != 0)
        plt.plot(xs[valid], ys[valid], alpha=0.7)
    plt.title("Trajectories from one training scene")
    plt.xlabel("x-coordinate")
    plt.ylabel("y-coordinate")
    plt.show()

# Create an animated gif for one training scene (exact code provided on kaggle)
def make_gif(data_matrix, name='example'):
    """Render one scene as an animated GIF named ``trajectory_visualization_{name}.gif``.

    Args:
        data_matrix: One scene, indexed as [agent, timestep, feature], with x at
            feature 0 and y at feature 1. Agent 0 is drawn as the ego vehicle.
        name: Suffix used in the output file name.
    """
    # Older interpreters were paired with matplotlib versions exposing get_cmap under plt.cm
    if sys.version_info.minor <= 7:
        cmap = plt.cm.get_cmap("viridis", 50)
    else:
        cmap = plt.get_cmap("viridis", 50)

    # The axis limits depend only on the scene, not on the frame, so compute them once.
    # Zeros are treated as padding and ignored.
    all_x = data_matrix[:, :, 0]
    all_y = data_matrix[:, :, 1]
    x_limits = (all_x[all_x != 0].min() - 10, all_x[all_x != 0].max() + 10)
    y_limits = (all_y[all_y != 0].min() - 10, all_y[all_y != 0].max() + 10)

    fig, ax = plt.subplots(figsize=(10, 10))

    # Function to update plot for each frame
    def update(frame):
        ax.clear()
        # Non-ego agents: draw the trail up to and including the current timestep
        for i in range(1, data_matrix.shape[0]):
            x = data_matrix[i, frame, 0]
            y = data_matrix[i, frame, 1]
            if x != 0 and y != 0:
                xs = data_matrix[i, :frame + 1, 0]  # Include current frame
                ys = data_matrix[i, :frame + 1, 1]  # Include current frame
                # Only keep points where both x and y are non-zero
                mask = (xs != 0) & (ys != 0)
                xs = xs[mask]
                ys = ys[mask]
                # Only plot if we have points to plot
                if len(xs) > 0 and len(ys) > 0:
                    color = cmap(i)
                    ax.plot(xs, ys, alpha=0.9, color=color)
                    ax.scatter(x, y, s=80, color=color)
        # Ego vehicle: the trail includes the current frame, like the other agents,
        # so the line always reaches the marker.
        ax.plot(data_matrix[0, :frame + 1, 0], data_matrix[0, :frame + 1, 1],
                color='tab:orange', label='Ego Vehicle')
        ax.scatter(data_matrix[0, frame, 0], data_matrix[0, frame, 1],
                   s=80, color='tab:orange')
        # Set title with timestep
        ax.set_title(f'Timestep {frame}')
        # Set consistent axis limits
        ax.set_xlim(*x_limits)
        ax.set_ylim(*y_limits)
        ax.legend()
        # Plain lists concatenate the same way on every matplotlib version
        return list(ax.collections) + list(ax.lines)

    # Create animation (every third timestep)
    anim = animation.FuncAnimation(fig, update, frames=list(range(0, data_matrix.shape[1], 3)),
                                   interval=100, blit=True)
    # Save as GIF
    anim.save(f'trajectory_visualization_{name}.gif', writer='pillow')
    # Close this figure explicitly rather than whichever figure happens to be current
    plt.close(fig)

# Both visualizations use training scene 0; the GIF is written to
# 'trajectory_visualization_index0.gif' in the working directory.
if run_visualizations:
    plot_one_training_scene(0)
    make_gif(train_data[0], 'index0')

# Constant velocity from test set
This baseline is unchanged from the original data-loading notebook.

In [4]:
# Run constant velocity model (Kaggle score of ~50)?
run_constant_velocity_model: bool = False

if run_constant_velocity_model:
    # Per-step position differences (x, y) for every agent in every scene;
    # only the ego vehicle (agent index 0) is used below
    velocity_diff = test_data[..., 1:, :2] - test_data[..., :-1, :2]
    print("Velocity difference shape:", velocity_diff.shape)

    # Compute average velocity for the ego vehicle (index 0) in each scene
    constant_vel = np.mean(velocity_diff[:, 0, :, :], axis=1)
    print("Constant velocity shape:", constant_vel.shape)

    # Generate predictions for 60 future time steps based on constant velocity
    pred_y_const = np.zeros((test_data.shape[0], 60, 2))
    starting_point = test_data[:, 0, -1, :2]  # Last observed position of ego vehicle

    # Step t (0-based) lies (t + 1) average displacements past the last observed position
    for t in range(60):
        pred_y_const[:, t, :] = starting_point + (t + 1) * constant_vel

    # Reshape predictions to submission format: (2100, 60, 2) -> (126000, 2)
    pred_output_const = pred_y_const.reshape(-1, 2)
    output_df_const = pd.DataFrame(pred_output_const, columns=['x', 'y'])
    output_df_const.index.name = 'index'
    # Save output in the submission folder
    constant_vel_path = os.path.join(submission_dir, 'constant_vel_submission.csv')
    output_df_const.to_csv(constant_vel_path)
    print(f"Constant velocity submission saved locally as '{constant_vel_path}'.")

# Our Work

In [5]:
# CNN model with residual blocks: ineffective for TimeSeries data
class BasicCNN(nn.Module):
    """Fully connected regressor with a residual refinement stage.

    Despite the name, the network has no convolutional layers: the input is
    flattened per sample and passed through a stack of linear stages.

    Args:
        input_features: Number of values per sample after flattening.
        output_features: Width of the final projection.
    """

    def __init__(self, input_features, output_features):
        super().__init__()

        # Backbone: four (Linear -> BatchNorm -> PReLU -> Dropout) stages that narrow
        # the flattened input down to 256 features. The batch-norm layers are lazy,
        # so their width is inferred on the first forward pass.
        stage_widths = (input_features, 2048, 1024, 512, 256)
        stages = []
        for fan_in, fan_out in zip(stage_widths[:-1], stage_widths[1:]):
            stages.extend([
                nn.Linear(fan_in, fan_out),
                nn.LazyBatchNorm1d(),
                nn.PReLU(),
                nn.Dropout(0.2),
            ])
        self.net = nn.Sequential(*stages)

        # Residual branch; its output width must equal the backbone's last width (256)
        # because the two are summed in forward()
        self.residual = nn.Sequential(
            nn.LazyLinear(256),
            nn.ReLU(),
            nn.LazyLinear(256),
        )

        # Final projection; the input width is inferred lazily
        self.head = nn.LazyLinear(output_features)

    def forward(self, x):
        """Map a batch of samples to a (batch, output_features) tensor."""
        flattened = x.view(x.size(0), -1)           # (batch, input_features)
        trunk = self.net(flattened)                 # (batch, 256)
        refined = trunk + self.residual(trunk)      # residual skip
        return self.head(refined)                   # (batch, output_features)

In [6]:
# Base LSTM given to us in the milestone notebook
class BaseLSTM(nn.Module):
    """Single-layer LSTM that predicts the ego agent's future (x, y) positions.

    Only agent index 0 of every scene is fed to the LSTM; the other agents are ignored.

    Args:
        input_dim: Number of features per timestep.
        hidden_dim: LSTM hidden size.
        output_dim: Flat output width; must be even because the output is reshaped
            to (batch, output_dim // 2, 2).
        dropout: Dropout probability applied to the last hidden state before the
            linear head. A single-layer ``nn.LSTM`` ignores its own ``dropout``
            argument and only emits a warning, so it is applied here instead.
        num_agents: Number of agents per scene in the input.
        seq_len: Number of history timesteps per agent.

    Raises:
        ValueError: If ``output_dim`` is odd.
    """

    def __init__(self, input_dim:int =6, hidden_dim:int =128, output_dim:int =60 * 2, dropout:float = 0,
                 num_agents:int = 50, seq_len:int = 50):
        super(BaseLSTM, self).__init__()
        if output_dim % 2 != 0:
            raise ValueError(f"output_dim must be even (x, y pairs), got {output_dim}")
        # Remember the layout so forward() does not rely on hard-coded sizes
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.num_agents = num_agents
        self.seq_len = seq_len

        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(dropout)  # identity when dropout == 0 or in eval mode
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return predictions of shape (batch, output_dim // 2, 2).

        ``x`` may be a tensor holding ``batch * num_agents * seq_len * input_dim``
        values in any compatible shape, or an object exposing such a tensor as ``.x``.
        """
        # In case you passed in a DataBatch
        if not isinstance(x, torch.Tensor):
            x = x.x

        # (batch_size, num_agents, seq_len, input_dim)
        x = x.reshape(-1, self.num_agents, self.seq_len, self.input_dim)
        x = x[:, 0, :, :]  # Only consider ego agent index 0

        lstm_out, _ = self.lstm(x)
        # lstm_out is (batch_size, seq_len, hidden_dim); keep only the last timestep
        last_hidden = self.dropout(lstm_out[:, -1, :])
        out = self.fc(last_hidden)
        return out.view(-1, self.output_dim // 2, 2)

In [7]:
# Multi agent scene context model
class SceneContextModel(nn.Module):
    """Predict the ego trajectory from an ego encoding plus a mean-pooled scene encoding.

    Every agent's flattened history (50 timesteps x 6 features) is encoded by a shared
    MLP and averaged over the 50 agents to form the scene context. The ego agent
    (index 0) is additionally encoded by its own MLP. The two vectors are concatenated
    and decoded into 60 * 2 outputs.

    Args:
        hidden_dim: Width of both encoders and of the decoder's hidden layer.
    """

    def __init__(self, hidden_dim=128):
        super().__init__()
        self.agent_encoder = nn.Sequential(
            nn.Linear(50 * 6, hidden_dim),
            nn.ReLU()
        )
        self.ego_encoder = nn.Sequential(
            nn.Linear(50 * 6, hidden_dim),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(2 * hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 60 * 2)
        )

    def forward(self, x_flat):
        """Return a (B, 120) tensor of flattened (x, y) predictions.

        ``x_flat`` may be a (B, 15000) tensor, a (B, 50, 50, 6) tensor, the
        node-concatenated (B * 50, 50, 6) tensor produced by batching PyG ``Data``
        objects, or an object exposing any of these as ``.x``.
        """
        # In case you passed in a DataBatch
        if not isinstance(x_flat, torch.Tensor):
            x_flat = x_flat.x

        # Recover the scene layout first and derive B from it. Taking
        # B = x_flat.size(0) directly is wrong for a node-concatenated batch,
        # whose first dimension is B * 50.
        x = x_flat.reshape(-1, 50, 50, 6)  # (B, agents, timesteps, features)
        B = x.size(0)

        x_agents = x.reshape(B, 50, -1)  # (B, 50, 300)
        agent_feats = self.agent_encoder(x_agents)  # (B, 50, hidden_dim)
        # NOTE(review): the mean runs over all 50 agent slots, including any that are
        # only padding; confirm whether padded agents should be masked out.
        scene_context = agent_feats.mean(dim=1)  # (B, hidden_dim)

        ego_input = x[:, 0, :, :].reshape(B, -1)  # (B, 300)
        ego_feat = self.ego_encoder(ego_input)  # (B, hidden_dim)

        combined = torch.cat([ego_feat, scene_context], dim=1)
        return self.decoder(combined)  # (B, 120)

# Preparing data

`TrajectoryDataset*` are taken from the milestone notebook.

In [8]:
class TrajectoryDatasetTrain(Dataset):
    """Training dataset producing one PyG ``Data`` object per scene.

    Each item holds the 50-step history of every agent (``x``), the ego agent's
    60-step future positions (``y``), the ego position at the last history step
    (``origin``) and the normalization ``scale``. Positions are expressed relative
    to ``origin``; feature columns 0-3 of the history and the future are divided
    by ``scale``.
    """

    def __init__(self, data, scale=10.0, augment=True):
        """
        data: Shape (N, 50, 110, 6) Training data
        scale: Scale for normalization (suggested to use 10.0 for Argoverse 2 data)
        augment: Whether to apply data augmentation (only for training)
        """
        self.data = data
        self.scale = scale
        self.augment = augment

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        scene = self.data[idx]
        # First 50 timesteps of every agent are the input, last 60 of the ego are the target
        past = scene[:, :50, :].copy()                                         # (agents, 50, 6)
        target = torch.tensor(scene[0, 50:, :2].copy(), dtype=torch.float32)   # (60, 2)

        # Data augmentation (only for training): each transform is applied with probability 0.5
        if self.augment:
            do_rotate = np.random.rand() < 0.5
            if do_rotate:
                angle = np.random.uniform(-np.pi, np.pi)
                cos_a, sin_a = np.cos(angle), np.sin(angle)
                rot = np.array([[cos_a, -sin_a],
                                [sin_a,  cos_a]], dtype=np.float32)
                # Apply the same rotation to columns 0:2 and 2:4 of the history and to the future
                past[..., :2] = past[..., :2] @ rot
                past[..., 2:4] = past[..., 2:4] @ rot
                # Rotated in NumPy (torch tensor @ ndarray raises a DeprecationWarning)
                target = torch.from_numpy(np.dot(target.numpy(), rot))
            do_flip = np.random.rand() < 0.5
            if do_flip:
                # Mirror across the y-axis: negate column 0 and column 2
                past[..., 0] *= -1
                past[..., 2] *= -1
                target[:, 0] *= -1

        # Use the ego position at the last history timestep as the origin
        origin = past[0, 49, :2].copy()  # (2,)
        past[..., :2] = past[..., :2] - origin
        # Subtracted in NumPy for the same DeprecationWarning reason as above
        target = torch.from_numpy(target.numpy() - origin)

        # Normalize the history (columns 0-3) and the future
        past[..., :4] = past[..., :4] / self.scale
        target = target / self.scale

        return Data(
            x=torch.tensor(past, dtype=torch.float32),
            y=target.type(torch.float32),
            origin=torch.tensor(origin, dtype=torch.float32).unsqueeze(0),
            scale=torch.tensor(self.scale, dtype=torch.float32),
        )
    

class TrajectoryDatasetTest(Dataset):
    """Test-time dataset producing one PyG ``Data`` object per scene.

    Items carry the normalized history (``x``) together with the ``origin`` and
    ``scale`` needed to map predictions back to the original coordinates.
    No augmentation is applied and there is no target.
    """

    def __init__(self, data, scale=10.0):
        """
        data: Shape (N, 50, 50, 6) Testing data (history only)
        scale: Scale for normalization (suggested to use 10.0 for Argoverse 2 data)
        """
        self.data = data
        self.scale = scale

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Testing data only contains the historical trajectory; copy so the source array is untouched
        past = self.data[idx].copy()  # (50, 50, 6)

        # Same normalization as training: positions relative to the ego's last observed
        # position, then columns 0-3 divided by the scale
        origin = past[0, 49, :2].copy()
        past[..., :2] = past[..., :2] - origin
        past[..., :4] = past[..., :4] / self.scale

        return Data(
            x=torch.tensor(past, dtype=torch.float32),
            origin=torch.tensor(origin, dtype=torch.float32).unsqueeze(0),
            scale=torch.tensor(self.scale, dtype=torch.float32),
        )

# Training loop

Change which model is used at the `model = ...` line in `train_model`, and make the identical change at the `model = ...` line in `predict` so the saved weights can be loaded.

Change which optimizer is used at the `optimizer = optim...` line.

Do **NOT** change the `criterion`, as MSE is stated in the Data tab of the competition.

In [9]:
# Taken from milestone notebook
# Set device for training speedup
# Pick the first available backend: Apple MPS, then CUDA, otherwise fall back to the CPU
if torch.backends.mps.is_available():
    _device_name, _device_banner = 'mps', "Using Apple Silicon GPU"
elif torch.cuda.is_available():
    _device_name, _device_banner = 'cuda', "Using CUDA GPU"
else:
    _device_name, _device_banner = 'cpu', "Using CPU"

device = torch.device(_device_name)
print(_device_banner)

Using CPU


In [10]:
# Functions to save and load the model (should correspond to what was trained!)
def save_model(model, path="our_model.pth"):
    """Write the model's ``state_dict`` (weights only, not the architecture) to ``path``."""
    weights = model.state_dict()
    torch.save(weights, path)
    print(f"Model saved to {path}")


def load_model(model_instance, path="our_model.pth"):
    """Load weights saved by ``save_model`` into ``model_instance`` and switch it to eval mode.

    Args:
        model_instance: An already-constructed model with the same architecture
            as the one that was saved.
        path: File written by ``save_model``.

    Returns:
        The same ``model_instance`` object, with the loaded weights, in eval mode.
    """
    # Deserialize onto the CPU so a checkpoint written on a CUDA/MPS machine can
    # still be opened on a machine without that backend; load_state_dict then
    # copies the values into the model's own parameters on whatever device they use.
    state_dict = torch.load(path, map_location="cpu")
    model_instance.load_state_dict(state_dict)
    model_instance.eval()
    return model_instance


# Example usage:
# save_model(trained_model)
# model = load_model(BaseLSTM())  # pass a fresh instance of the same architecture that was saved

In [12]:
# Calculate number of input features after flattening and number of output features
# Note: LSTM models take features in different dimensions
input_features:int = 50 * 50 * 6   # 50 agents, 50 time steps, 6 dimensions each (15000 input features)
output_features:int = 60 * 2       # 60 future time steps, 2 dimensions (x, y) (120 output features)

# Hyperparameters
# These module-level values are passed to train_model() at the bottom of this cell;
# starting_lr, scale, lstm_hidden_dim and output_features are also read directly
# as globals inside train_model() and predict().
batch_size:int = 64
num_folds:int = 3                       # number of TimeSeriesSplit folds
early_stopping_patience:int = 30        # epochs without sufficient improvement before a fold stops
early_stopping_threshold:float = 1e-4   # minimum drop in validation loss that counts as improvement
epochs:int = 250                        # maximum epochs per fold
starting_lr:float = 1e-2                # initial AdamW learning rate
scale:float = 10.0                      # normalization divisor used by the TrajectoryDataset* classes

lstm_hidden_dim: int = 128

# Seed torch and NumPy's global RNG (the training dataset's augmentation draws from np.random)
SEED: int = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

def _unpack_batch(batch):
    """Move one DataLoader batch to ``device`` and split it into inputs and targets.

    Returns:
        (batch_x, batch_y, graph_batch): ``graph_batch`` is the device-resident PyG
        batch when the loader yields ``Batch`` objects (it carries the ``scale`` and
        ``origin`` needed to undo normalization), and ``None`` for plain (x, y)
        tensor batches.
    """
    if isinstance(batch, (tuple, list)):
        # TensorDataset path: tensors must be moved explicitly, otherwise a model
        # living on a GPU receives CPU inputs and raises a device-mismatch error.
        batch_x, batch_y = batch
        return batch_x.to(device), batch_y.to(device), None
    # DataBatch type
    batch = batch.to(device)
    return batch.x, batch.y.view(batch.num_graphs, 60, 2), batch


def train_model(full_training_data: np.ndarray, 
                batch_size:int = 64, epochs:int = 10, num_folds:int = 5,
                early_stopping_patience:int = 5, early_stopping_threshold:float = 1e-3):
    """Train a fresh model on every TimeSeriesSplit fold and keep the best weights.

    After each epoch, the weights with the lowest validation loss seen so far across
    all folds are written to "best_model.pt", which ``predict`` loads afterwards.
    The learning rate, normalization scale and model sizes are read from the
    notebook-level ``starting_lr``, ``scale``, ``lstm_hidden_dim``,
    ``input_features`` and ``output_features``.

    Args:
        full_training_data: Array of scenes, (N, 50, 110, 6).
        batch_size: Mini-batch size for both loaders.
        epochs: Maximum number of epochs per fold.
        num_folds: Number of TimeSeriesSplit folds.
        early_stopping_patience: A fold stops after this many consecutive epochs
            without sufficient improvement.
        early_stopping_threshold: Minimum drop in validation loss that counts as
            an improvement.

    Returns:
        The lowest validation loss (normalized MSE for the LSTM path) over all folds.
    """
    # Splits are produced in index order without shuffling (unlike regular KFold):
    # each fold trains on a prefix of the scenes and validates on the block after it
    splitter = TimeSeriesSplit(n_splits=num_folds)

    # Perform cross-validation, the best model will be saved as "best_model.pt" to be loaded in later
    overall_best_val_loss = float("inf")
    mae_metric = nn.L1Loss()

    # Resources used:
    # Project milestone notebook
    # https://github.com/christianversloot/machine-learning-articles/blob/main/how-to-use-k-fold-cross-validation-with-pytorch.md
    # https://www.geeksforgeeks.org/time-series-cross-validation/
    for fold_i, (train_idx, val_idx) in enumerate(splitter.split(full_training_data)):
        print(f"\nFOLD {fold_i + 1}/{num_folds} ==================================")

        # Create the model, loss criterion, and optimizer (reset per fold, to find the best model)
        # If you change the model here, ensure it's the same in predict()!
        # DO NOT CHANGE CRITERION
        criterion = nn.MSELoss()

        model = BaseLSTM(input_dim=6, hidden_dim=lstm_hidden_dim, output_dim=output_features).to(device)
        # model = SceneContextModel(hidden_dim=864).to(device)

        optimizer = optim.AdamW(model.parameters(), lr=starting_lr, weight_decay=1e-2)
        # Both schedulers act on the same optimizer, so their effects multiply:
        # a 1% decay every epoch plus a halving every 50 epochs
        schedulers: list[lr_scheduler.LRScheduler] = [
            lr_scheduler.ExponentialLR(optimizer, gamma=0.99),
            # lr_scheduler.CosineAnnealingLR(
            #     optimizer, 
            #     T_max= int(epochs * 0.9),
            #     eta_min=1e-5
            # ),
            lr_scheduler.MultiStepLR(
                optimizer,
                milestones=list(range(50, epochs, 50)),
                gamma=0.5,
            )
        ]

        # Prepare data from this fold
        train_fold: np.ndarray = full_training_data[train_idx]
        val_fold: np.ndarray = full_training_data[val_idx]
        collate_func = None     # Optional for DataLoader, taken from milestone notebook
        if isinstance(model, BaseLSTM):
            # LSTM can handle the timeseries data directly
            # TrajectoryDataset expects numpy arrays
            collate_func = Batch.from_data_list
            train_dataset = TrajectoryDatasetTrain(train_fold, scale=scale, augment=True)
            val_dataset = TrajectoryDatasetTrain(val_fold, scale=scale, augment=False)
        else:
            # Flat-tensor path: raw (un-normalized) history in, raw ego future out
            train_x: np.ndarray = train_fold[..., :50, :]
            train_y: np.ndarray = train_fold[:, 0, 50:, :2]
            X_train_tensor = torch.FloatTensor(train_x).reshape((-1, input_features))
            y_train_tensor = torch.FloatTensor(train_y).reshape((-1, output_features))
            train_dataset = TensorDataset(X_train_tensor, y_train_tensor)

            val_x: np.ndarray = val_fold[..., :50, :]
            val_y: np.ndarray = val_fold[:, 0, 50:, :2]
            X_val_tensor = torch.FloatTensor(val_x).reshape((-1, input_features))
            y_val_tensor = torch.FloatTensor(val_y).reshape((-1, output_features))
            val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

        train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_func)
        val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_func)

        best_val_loss: float = float("inf")
        no_improvement: int = 0

        # Training and validation loops are taken from the milestone notebook,
        #   with modifications to allow for different data loading shapes
        for epoch in tqdm(range(epochs), desc="Epoch", unit="epoch"):
            # Training loop
            model.train()
            train_loss = 0
            for batch in train_dataloader:
                batch_x, batch_y, _ = _unpack_batch(batch)

                optimizer.zero_grad()
                outputs = model(batch_x)
                loss = criterion(outputs, batch_y)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
                optimizer.step()
                train_loss += loss.item()

            # Validation loop
            model.eval()
            val_loss = 0
            val_mae = 0
            val_mse = 0
            with torch.no_grad():
                for batch in val_dataloader:
                    batch_x, batch_y, graph_batch = _unpack_batch(batch)

                    pred = model(batch_x)
                    val_loss += criterion(pred, batch_y).item()

                    # show MAE and MSE with unnormalized data
                    if graph_batch is None:
                        # Tensor batches are never normalized, so compare directly
                        y = batch_y
                    else:
                        # Undo the dataset's normalization: multiply by scale, add the origin back
                        pred = pred * graph_batch.scale.view(-1, 1, 1) + graph_batch.origin.unsqueeze(1)
                        y = batch_y * graph_batch.scale.view(-1, 1, 1) + graph_batch.origin.unsqueeze(1)
                    val_mae += mae_metric(pred, y).item()
                    val_mse += criterion(pred, y).item()

            # Averages over batches (each batch counts equally)
            train_loss /= len(train_dataloader)
            val_loss /= len(val_dataloader)
            val_mae /= len(val_dataloader)
            val_mse /= len(val_dataloader)

            if (epoch + 1) % 5 == 0:
                tqdm.write(f"Epoch {(epoch + 1):03d} | Learning rate {optimizer.param_groups[0]['lr']:.6f} | train normalized MSE {train_loss:8.4f} | val normalized MSE {val_loss:8.4f}, | val MAE {val_mae:8.4f} | val MSE {val_mse:8.4f}")

            if val_loss < best_val_loss - early_stopping_threshold:
                best_val_loss = val_loss
                no_improvement = 0

                # Better than the overall seen so far?
                if best_val_loss < overall_best_val_loss:
                    overall_best_val_loss = best_val_loss
                    torch.save(model.state_dict(), "best_model.pt")
            else:
                no_improvement += 1
                if no_improvement >= early_stopping_patience:
                    print(f"==== EARLY STOP at epoch {(epoch + 1):03d}")
                    break

            for sched in schedulers:
                sched.step()

    return overall_best_val_loss


# Load in the model saved during training to use on X_test
# Mostly taken from milestone notebook
def predict(X_test, best_model_path: str = "best_model.pt"):
    """Run the saved best model on the test scenes and build the submission frame.

    The model size, normalization scale and batch size are read from the
    notebook-level ``lstm_hidden_dim``, ``output_features``, ``scale`` and
    ``batch_size``.

    Args:
        X_test: Test scenes, (N, 50, 50, 6).
        best_model_path: ``state_dict`` file written by ``train_model``.

    Returns:
        DataFrame with columns 'x' and 'y' and N * 60 rows (60 predicted steps per
        scene, scenes in input order), with the index named 'index'.
    """
    # Ensure this aligns with the trained model!
    # map_location lets weights saved on another backend load onto the current device
    best_model = torch.load(best_model_path, map_location=device)
    model = BaseLSTM(input_dim=6, hidden_dim=lstm_hidden_dim, output_dim=output_features).to(device)
    model.load_state_dict(best_model)
    model.eval()

    pred_list = []
    with torch.no_grad():
        if isinstance(model, BaseLSTM): # Using DataBatch type from a DataLoader
            test_dataset = TrajectoryDatasetTest(X_test, scale=scale)
            test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False,
                                     collate_fn=Batch.from_data_list)

            for batch in test_loader:
                batch = batch.to(device)
                pred_norm = model(batch.x)

                # Undo the dataset's normalization: multiply by scale, add the origin back -> (B, 60, 2)
                pred = pred_norm * batch.scale.view(-1,1,1) + batch.origin.unsqueeze(1)
                pred_list.append(pred.cpu().numpy())
        else:
            # Flat-tensor models were trained on raw coordinates, so nothing needs undoing
            X_test_tensor = torch.FloatTensor(X_test).reshape((-1, input_features)).to(device)
            pred = model(X_test_tensor).cpu().reshape((-1, 60, 2))
            pred_list.append(pred.numpy())

    # Reshape predictions to match submission format: (2100, 60, 2) -> (126000, 2)
    pred_list = np.concatenate(pred_list, axis=0)  # (N,60,2)
    pred_output = pred_list.reshape(-1, 2)  # (N*60, 2)
    output_df = pd.DataFrame(pred_output, columns=['x', 'y'])
    output_df.index.name = 'index'
    return output_df

# Train the model (tweak batch_size and epochs as needed at top of this block)
# Saved as "best_model.pt" to be loaded in during testing
train_model(train_data, batch_size=batch_size, epochs=epochs, num_folds=num_folds,
            early_stopping_patience=early_stopping_patience,
            early_stopping_threshold=early_stopping_threshold)

# Make predictions on the test set
model_predictions_df = predict(test_data)
# 2100 test scenes x 60 predicted timesteps = 126000 rows
assert len(model_predictions_df) == 126000, f"Incorrect number of rows in output, expected 126000, got {len(model_predictions_df)}"

# Save output in the submission folder, timestamped!
# The timestamp uses a 12-hour clock with an AM/PM suffix
timestamp = datetime.now().strftime("%Y-%m-%d_%I-%M%p")

submission_path = os.path.join(submission_dir, f"submission-{timestamp}.csv")
model_predictions_df.to_csv(submission_path)
print(f"Submission saved locally as: '{submission_path}'.")




Epoch:   2%|▏         | 5/250 [00:17<16:40,  4.08s/epoch]

Epoch 005 | Learning rate 0.009606 | train normalized MSE   0.2024 | val normalized MSE   0.2144, | val MAE   2.5839 | val MSE  21.4417


Epoch:   4%|▍         | 10/250 [00:50<23:38,  5.91s/epoch]

Epoch 010 | Learning rate 0.009135 | train normalized MSE   0.1788 | val normalized MSE   0.1717, | val MAE   2.2782 | val MSE  17.1671


Epoch:   6%|▌         | 15/250 [01:04<12:48,  3.27s/epoch]

Epoch 015 | Learning rate 0.008687 | train normalized MSE   0.1418 | val normalized MSE   0.1403, | val MAE   1.9215 | val MSE  14.0266


Epoch:   8%|▊         | 20/250 [01:25<18:28,  4.82s/epoch]

Epoch 020 | Learning rate 0.008262 | train normalized MSE   0.1406 | val normalized MSE   0.1600, | val MAE   2.1855 | val MSE  15.9974


Epoch:  10%|█         | 25/250 [01:49<19:18,  5.15s/epoch]

Epoch 025 | Learning rate 0.007857 | train normalized MSE   0.1242 | val normalized MSE   0.1229, | val MAE   1.7981 | val MSE  12.2856


Epoch:  12%|█▏        | 30/250 [02:26<25:50,  7.05s/epoch]

Epoch 030 | Learning rate 0.007472 | train normalized MSE   0.1169 | val normalized MSE   0.1173, | val MAE   1.7919 | val MSE  11.7299


Epoch:  14%|█▍        | 35/250 [03:05<27:42,  7.73s/epoch]

Epoch 035 | Learning rate 0.007106 | train normalized MSE   0.1028 | val normalized MSE   0.1144, | val MAE   1.7209 | val MSE  11.4376


Epoch:  16%|█▌        | 40/250 [03:36<19:17,  5.51s/epoch]

Epoch 040 | Learning rate 0.006757 | train normalized MSE   0.1208 | val normalized MSE   0.1197, | val MAE   1.8151 | val MSE  11.9681


Epoch:  18%|█▊        | 45/250 [03:57<17:53,  5.24s/epoch]

Epoch 045 | Learning rate 0.006426 | train normalized MSE   0.1048 | val normalized MSE   0.1032, | val MAE   1.6185 | val MSE  10.3219


Epoch:  20%|██        | 50/250 [04:35<24:12,  7.26s/epoch]

Epoch 050 | Learning rate 0.006111 | train normalized MSE   0.1159 | val normalized MSE   0.1141, | val MAE   1.7585 | val MSE  11.4093


Epoch:  22%|██▏       | 55/250 [04:58<14:00,  4.31s/epoch]

Epoch 055 | Learning rate 0.002906 | train normalized MSE   0.0972 | val normalized MSE   0.1017, | val MAE   1.6060 | val MSE  10.1719


Epoch:  24%|██▍       | 60/250 [05:30<20:51,  6.59s/epoch]

Epoch 060 | Learning rate 0.002763 | train normalized MSE   0.0965 | val normalized MSE   0.0988, | val MAE   1.5573 | val MSE   9.8751


Epoch:  26%|██▌       | 65/250 [05:53<12:47,  4.15s/epoch]

Epoch 065 | Learning rate 0.002628 | train normalized MSE   0.0919 | val normalized MSE   0.0946, | val MAE   1.4800 | val MSE   9.4585


Epoch:  28%|██▊       | 70/250 [06:06<08:33,  2.85s/epoch]

Epoch 070 | Learning rate 0.002499 | train normalized MSE   0.0939 | val normalized MSE   0.0944, | val MAE   1.4618 | val MSE   9.4413


Epoch:  30%|███       | 75/250 [06:21<08:55,  3.06s/epoch]

Epoch 075 | Learning rate 0.002377 | train normalized MSE   0.0878 | val normalized MSE   0.0925, | val MAE   1.4421 | val MSE   9.2537


Epoch:  32%|███▏      | 80/250 [06:36<08:28,  2.99s/epoch]

Epoch 080 | Learning rate 0.002260 | train normalized MSE   0.0920 | val normalized MSE   0.0928, | val MAE   1.4937 | val MSE   9.2774


Epoch:  34%|███▍      | 85/250 [06:52<08:20,  3.03s/epoch]

Epoch 085 | Learning rate 0.002149 | train normalized MSE   0.0908 | val normalized MSE   0.0959, | val MAE   1.4752 | val MSE   9.5859


Epoch:  36%|███▌      | 90/250 [07:07<08:08,  3.05s/epoch]

Epoch 090 | Learning rate 0.002044 | train normalized MSE   0.0891 | val normalized MSE   0.0986, | val MAE   1.5730 | val MSE   9.8635


Epoch:  38%|███▊      | 95/250 [07:22<07:45,  3.00s/epoch]

Epoch 095 | Learning rate 0.001944 | train normalized MSE   0.0846 | val normalized MSE   0.0927, | val MAE   1.4876 | val MSE   9.2662


Epoch:  40%|████      | 100/250 [07:37<07:24,  2.96s/epoch]

Epoch 100 | Learning rate 0.001849 | train normalized MSE   0.0846 | val normalized MSE   0.0942, | val MAE   1.5083 | val MSE   9.4158


Epoch:  42%|████▏     | 105/250 [07:52<07:13,  2.99s/epoch]

Epoch 105 | Learning rate 0.000879 | train normalized MSE   0.0789 | val normalized MSE   0.0890, | val MAE   1.3973 | val MSE   8.8951


Epoch:  44%|████▍     | 110/250 [08:07<07:07,  3.06s/epoch]

Epoch 110 | Learning rate 0.000836 | train normalized MSE   0.0808 | val normalized MSE   0.0906, | val MAE   1.4107 | val MSE   9.0570


Epoch:  46%|████▌     | 115/250 [08:22<06:40,  2.97s/epoch]

Epoch 115 | Learning rate 0.000795 | train normalized MSE   0.0805 | val normalized MSE   0.0886, | val MAE   1.4126 | val MSE   8.8648


Epoch:  48%|████▊     | 120/250 [08:37<06:26,  2.98s/epoch]

Epoch 120 | Learning rate 0.000756 | train normalized MSE   0.0793 | val normalized MSE   0.0880, | val MAE   1.3746 | val MSE   8.7990


Epoch:  50%|█████     | 125/250 [08:52<06:17,  3.02s/epoch]

Epoch 125 | Learning rate 0.000719 | train normalized MSE   0.0822 | val normalized MSE   0.0890, | val MAE   1.3971 | val MSE   8.8950


Epoch:  52%|█████▏    | 130/250 [09:07<06:04,  3.04s/epoch]

Epoch 130 | Learning rate 0.000684 | train normalized MSE   0.0798 | val normalized MSE   0.0881, | val MAE   1.3930 | val MSE   8.8117


Epoch:  54%|█████▍    | 135/250 [09:23<05:48,  3.03s/epoch]

Epoch 135 | Learning rate 0.000650 | train normalized MSE   0.0808 | val normalized MSE   0.0891, | val MAE   1.4108 | val MSE   8.9126


Epoch:  56%|█████▌    | 140/250 [09:38<05:32,  3.02s/epoch]

Epoch 140 | Learning rate 0.000618 | train normalized MSE   0.0796 | val normalized MSE   0.0887, | val MAE   1.3925 | val MSE   8.8680


Epoch:  58%|█████▊    | 145/250 [09:53<05:27,  3.12s/epoch]

Epoch 145 | Learning rate 0.000588 | train normalized MSE   0.0815 | val normalized MSE   0.0887, | val MAE   1.3968 | val MSE   8.8727


Epoch:  60%|██████    | 150/250 [10:09<05:13,  3.13s/epoch]

Epoch 150 | Learning rate 0.000559 | train normalized MSE   0.0768 | val normalized MSE   0.0892, | val MAE   1.3959 | val MSE   8.9169


Epoch:  62%|██████▏   | 155/250 [10:26<05:13,  3.30s/epoch]

Epoch 155 | Learning rate 0.000266 | train normalized MSE   0.0767 | val normalized MSE   0.0882, | val MAE   1.3915 | val MSE   8.8222


Epoch:  64%|██████▍   | 160/250 [10:42<04:39,  3.10s/epoch]

Epoch 160 | Learning rate 0.000253 | train normalized MSE   0.0761 | val normalized MSE   0.0890, | val MAE   1.3916 | val MSE   8.8971


Epoch:  65%|██████▌   | 163/250 [10:54<05:49,  4.01s/epoch]

==== EARLY STOP at epoch 164




Epoch:   2%|▏         | 5/250 [00:26<21:04,  5.16s/epoch]

Epoch 005 | Learning rate 0.009606 | train normalized MSE   0.1801 | val normalized MSE   0.1747, | val MAE   2.2366 | val MSE  17.4702


Epoch:   4%|▍         | 10/250 [00:51<20:30,  5.13s/epoch]

Epoch 010 | Learning rate 0.009135 | train normalized MSE   0.1329 | val normalized MSE   0.1497, | val MAE   2.0552 | val MSE  14.9708


Epoch:   6%|▌         | 15/250 [01:17<19:56,  5.09s/epoch]

Epoch 015 | Learning rate 0.008687 | train normalized MSE   0.1194 | val normalized MSE   0.1349, | val MAE   1.8074 | val MSE  13.4921


Epoch:   8%|▊         | 20/250 [01:42<19:40,  5.13s/epoch]

Epoch 020 | Learning rate 0.008262 | train normalized MSE   0.1073 | val normalized MSE   0.1129, | val MAE   1.6708 | val MSE  11.2870


Epoch:  10%|█         | 25/250 [02:08<19:20,  5.16s/epoch]

Epoch 025 | Learning rate 0.007857 | train normalized MSE   0.0997 | val normalized MSE   0.1110, | val MAE   1.6233 | val MSE  11.0972


Epoch:  12%|█▏        | 30/250 [02:34<18:39,  5.09s/epoch]

Epoch 030 | Learning rate 0.007472 | train normalized MSE   0.0965 | val normalized MSE   0.1067, | val MAE   1.5464 | val MSE  10.6655


Epoch:  14%|█▍        | 35/250 [02:59<18:13,  5.08s/epoch]

Epoch 035 | Learning rate 0.007106 | train normalized MSE   0.0971 | val normalized MSE   0.1118, | val MAE   1.7388 | val MSE  11.1818


Epoch:  16%|█▌        | 40/250 [03:25<17:58,  5.14s/epoch]

Epoch 040 | Learning rate 0.006757 | train normalized MSE   0.0923 | val normalized MSE   0.1054, | val MAE   1.5901 | val MSE  10.5374


Epoch:  18%|█▊        | 45/250 [03:51<18:07,  5.31s/epoch]

Epoch 045 | Learning rate 0.006426 | train normalized MSE   0.1081 | val normalized MSE   0.1233, | val MAE   1.7506 | val MSE  12.3316


Epoch:  20%|██        | 50/250 [04:18<17:49,  5.35s/epoch]

Epoch 050 | Learning rate 0.006111 | train normalized MSE   0.0980 | val normalized MSE   0.1114, | val MAE   1.6794 | val MSE  11.1355


Epoch:  22%|██▏       | 55/250 [04:44<16:58,  5.22s/epoch]

Epoch 055 | Learning rate 0.002906 | train normalized MSE   0.0903 | val normalized MSE   0.1029, | val MAE   1.5412 | val MSE  10.2871


Epoch:  24%|██▍       | 60/250 [05:10<16:12,  5.12s/epoch]

Epoch 060 | Learning rate 0.002763 | train normalized MSE   0.0872 | val normalized MSE   0.1034, | val MAE   1.5592 | val MSE  10.3431


Epoch:  26%|██▌       | 65/250 [05:36<15:49,  5.13s/epoch]

Epoch 065 | Learning rate 0.002628 | train normalized MSE   0.0865 | val normalized MSE   0.1001, | val MAE   1.4951 | val MSE  10.0106


Epoch:  28%|██▊       | 70/250 [06:01<15:22,  5.13s/epoch]

Epoch 070 | Learning rate 0.002499 | train normalized MSE   0.0870 | val normalized MSE   0.1001, | val MAE   1.4908 | val MSE  10.0066


Epoch:  30%|███       | 75/250 [06:27<14:50,  5.09s/epoch]

Epoch 075 | Learning rate 0.002377 | train normalized MSE   0.0850 | val normalized MSE   0.0979, | val MAE   1.4813 | val MSE   9.7865


Epoch:  32%|███▏      | 80/250 [06:53<14:34,  5.15s/epoch]

Epoch 080 | Learning rate 0.002260 | train normalized MSE   0.0837 | val normalized MSE   0.0959, | val MAE   1.4542 | val MSE   9.5902


Epoch:  34%|███▍      | 85/250 [07:19<14:20,  5.22s/epoch]

Epoch 085 | Learning rate 0.002149 | train normalized MSE   0.0826 | val normalized MSE   0.0977, | val MAE   1.4696 | val MSE   9.7721


Epoch:  36%|███▌      | 90/250 [07:44<13:41,  5.14s/epoch]

Epoch 090 | Learning rate 0.002044 | train normalized MSE   0.0817 | val normalized MSE   0.0963, | val MAE   1.4457 | val MSE   9.6310


Epoch:  38%|███▊      | 95/250 [08:10<13:17,  5.15s/epoch]

Epoch 095 | Learning rate 0.001944 | train normalized MSE   0.0809 | val normalized MSE   0.0958, | val MAE   1.4703 | val MSE   9.5832


Epoch:  40%|████      | 100/250 [08:35<12:46,  5.11s/epoch]

Epoch 100 | Learning rate 0.001849 | train normalized MSE   0.0802 | val normalized MSE   0.0970, | val MAE   1.5057 | val MSE   9.6956


Epoch:  42%|████▏     | 105/250 [09:01<12:34,  5.20s/epoch]

Epoch 105 | Learning rate 0.000879 | train normalized MSE   0.0784 | val normalized MSE   0.0935, | val MAE   1.4230 | val MSE   9.3470


Epoch:  44%|████▍     | 110/250 [09:27<11:54,  5.10s/epoch]

Epoch 110 | Learning rate 0.000836 | train normalized MSE   0.0780 | val normalized MSE   0.0942, | val MAE   1.4144 | val MSE   9.4208


Epoch:  46%|████▌     | 115/250 [09:52<11:30,  5.12s/epoch]

Epoch 115 | Learning rate 0.000795 | train normalized MSE   0.0759 | val normalized MSE   0.0934, | val MAE   1.4318 | val MSE   9.3424


Epoch:  48%|████▊     | 120/250 [10:18<11:02,  5.10s/epoch]

Epoch 120 | Learning rate 0.000756 | train normalized MSE   0.0765 | val normalized MSE   0.0936, | val MAE   1.4127 | val MSE   9.3623


Epoch:  50%|█████     | 125/250 [10:43<10:39,  5.11s/epoch]

Epoch 125 | Learning rate 0.000719 | train normalized MSE   0.0753 | val normalized MSE   0.0932, | val MAE   1.4325 | val MSE   9.3198


Epoch:  52%|█████▏    | 130/250 [11:09<10:15,  5.13s/epoch]

Epoch 130 | Learning rate 0.000684 | train normalized MSE   0.0752 | val normalized MSE   0.0924, | val MAE   1.4104 | val MSE   9.2399


Epoch:  54%|█████▍    | 135/250 [11:35<09:49,  5.13s/epoch]

Epoch 135 | Learning rate 0.000650 | train normalized MSE   0.0743 | val normalized MSE   0.0941, | val MAE   1.4360 | val MSE   9.4054


Epoch:  56%|█████▌    | 140/250 [12:00<09:26,  5.15s/epoch]

Epoch 140 | Learning rate 0.000618 | train normalized MSE   0.0744 | val normalized MSE   0.0938, | val MAE   1.4125 | val MSE   9.3788


Epoch:  58%|█████▊    | 145/250 [12:26<08:52,  5.07s/epoch]

Epoch 145 | Learning rate 0.000588 | train normalized MSE   0.0740 | val normalized MSE   0.0929, | val MAE   1.4157 | val MSE   9.2886


Epoch:  58%|█████▊    | 146/250 [12:36<08:58,  5.18s/epoch]

==== EARLY STOP at epoch 147




Epoch:   2%|▏         | 5/250 [00:35<29:06,  7.13s/epoch]

Epoch 005 | Learning rate 0.009606 | train normalized MSE   0.1617 | val normalized MSE   0.1691, | val MAE   2.3029 | val MSE  16.9114


Epoch:   4%|▍         | 10/250 [01:11<28:41,  7.17s/epoch]

Epoch 010 | Learning rate 0.009135 | train normalized MSE   0.1198 | val normalized MSE   0.1091, | val MAE   1.6639 | val MSE  10.9077


Epoch:   6%|▌         | 15/250 [01:48<28:26,  7.26s/epoch]

Epoch 015 | Learning rate 0.008687 | train normalized MSE   0.1076 | val normalized MSE   0.1042, | val MAE   1.6460 | val MSE  10.4240


Epoch:   8%|▊         | 20/250 [02:24<27:36,  7.20s/epoch]

Epoch 020 | Learning rate 0.008262 | train normalized MSE   0.2026 | val normalized MSE   0.1586, | val MAE   2.2121 | val MSE  15.8623


Epoch:  10%|█         | 25/250 [03:00<27:05,  7.22s/epoch]

Epoch 025 | Learning rate 0.007857 | train normalized MSE   0.1184 | val normalized MSE   0.1241, | val MAE   1.9091 | val MSE  12.4134


Epoch:  12%|█▏        | 30/250 [03:36<26:20,  7.19s/epoch]

Epoch 030 | Learning rate 0.007472 | train normalized MSE   0.1062 | val normalized MSE   0.1055, | val MAE   1.6358 | val MSE  10.5489


Epoch:  14%|█▍        | 35/250 [04:12<25:43,  7.18s/epoch]

Epoch 035 | Learning rate 0.007106 | train normalized MSE   0.1008 | val normalized MSE   0.1061, | val MAE   1.8336 | val MSE  10.6110


Epoch:  16%|█▌        | 40/250 [04:47<24:54,  7.12s/epoch]

Epoch 040 | Learning rate 0.006757 | train normalized MSE   0.0978 | val normalized MSE   0.0958, | val MAE   1.5522 | val MSE   9.5789


Epoch:  18%|█▊        | 45/250 [05:24<24:58,  7.31s/epoch]

Epoch 045 | Learning rate 0.006426 | train normalized MSE   0.0951 | val normalized MSE   0.0977, | val MAE   1.6091 | val MSE   9.7710


Epoch:  20%|██        | 50/250 [06:00<23:45,  7.13s/epoch]

Epoch 050 | Learning rate 0.006111 | train normalized MSE   0.0944 | val normalized MSE   0.0961, | val MAE   1.5273 | val MSE   9.6075


Epoch:  22%|██▏       | 55/250 [06:36<23:27,  7.22s/epoch]

Epoch 055 | Learning rate 0.002906 | train normalized MSE   0.0889 | val normalized MSE   0.0934, | val MAE   1.4870 | val MSE   9.3384


Epoch:  24%|██▍       | 60/250 [07:12<22:56,  7.24s/epoch]

Epoch 060 | Learning rate 0.002763 | train normalized MSE   0.0890 | val normalized MSE   0.0903, | val MAE   1.4685 | val MSE   9.0305


Epoch:  26%|██▌       | 65/250 [07:48<22:09,  7.19s/epoch]

Epoch 065 | Learning rate 0.002628 | train normalized MSE   0.0899 | val normalized MSE   0.0897, | val MAE   1.4437 | val MSE   8.9728


Epoch:  28%|██▊       | 70/250 [08:24<21:18,  7.10s/epoch]

Epoch 070 | Learning rate 0.002499 | train normalized MSE   0.0880 | val normalized MSE   0.0904, | val MAE   1.4736 | val MSE   9.0355


Epoch:  30%|███       | 75/250 [09:00<20:58,  7.19s/epoch]

Epoch 075 | Learning rate 0.002377 | train normalized MSE   0.0866 | val normalized MSE   0.0888, | val MAE   1.4586 | val MSE   8.8796


Epoch:  32%|███▏      | 80/250 [09:36<20:20,  7.18s/epoch]

Epoch 080 | Learning rate 0.002260 | train normalized MSE   0.0859 | val normalized MSE   0.0900, | val MAE   1.5145 | val MSE   9.0023


Epoch:  34%|███▍      | 85/250 [10:12<19:50,  7.22s/epoch]

Epoch 085 | Learning rate 0.002149 | train normalized MSE   0.0847 | val normalized MSE   0.0889, | val MAE   1.4688 | val MSE   8.8912


Epoch:  36%|███▌      | 90/250 [10:48<19:13,  7.21s/epoch]

Epoch 090 | Learning rate 0.002044 | train normalized MSE   0.0843 | val normalized MSE   0.0862, | val MAE   1.3938 | val MSE   8.6197


Epoch:  38%|███▊      | 95/250 [11:24<18:41,  7.24s/epoch]

Epoch 095 | Learning rate 0.001944 | train normalized MSE   0.0848 | val normalized MSE   0.0880, | val MAE   1.4264 | val MSE   8.7966


Epoch:  40%|████      | 100/250 [12:00<18:00,  7.20s/epoch]

Epoch 100 | Learning rate 0.001849 | train normalized MSE   0.0852 | val normalized MSE   0.0861, | val MAE   1.4145 | val MSE   8.6071


Epoch:  42%|████▏     | 105/250 [12:36<17:17,  7.16s/epoch]

Epoch 105 | Learning rate 0.000879 | train normalized MSE   0.0810 | val normalized MSE   0.0850, | val MAE   1.3671 | val MSE   8.5043


Epoch:  44%|████▍     | 110/250 [13:12<16:45,  7.18s/epoch]

Epoch 110 | Learning rate 0.000836 | train normalized MSE   0.0808 | val normalized MSE   0.0846, | val MAE   1.4126 | val MSE   8.4635


Epoch:  46%|████▌     | 115/250 [13:48<16:09,  7.18s/epoch]

Epoch 115 | Learning rate 0.000795 | train normalized MSE   0.0816 | val normalized MSE   0.0857, | val MAE   1.3829 | val MSE   8.5705


Epoch:  48%|████▊     | 120/250 [14:23<15:27,  7.14s/epoch]

Epoch 120 | Learning rate 0.000756 | train normalized MSE   0.0807 | val normalized MSE   0.0841, | val MAE   1.3916 | val MSE   8.4059


Epoch:  50%|█████     | 125/250 [14:59<14:52,  7.14s/epoch]

Epoch 125 | Learning rate 0.000719 | train normalized MSE   0.0792 | val normalized MSE   0.0840, | val MAE   1.3723 | val MSE   8.3996


Epoch:  52%|█████▏    | 130/250 [15:35<14:29,  7.24s/epoch]

Epoch 130 | Learning rate 0.000684 | train normalized MSE   0.0847 | val normalized MSE   0.0839, | val MAE   1.3547 | val MSE   8.3877


Epoch:  54%|█████▍    | 135/250 [16:11<13:45,  7.18s/epoch]

Epoch 135 | Learning rate 0.000650 | train normalized MSE   0.0779 | val normalized MSE   0.0843, | val MAE   1.3560 | val MSE   8.4254


Epoch:  56%|█████▌    | 140/250 [16:47<13:04,  7.13s/epoch]

Epoch 140 | Learning rate 0.000618 | train normalized MSE   0.0784 | val normalized MSE   0.0839, | val MAE   1.3763 | val MSE   8.3927


Epoch:  58%|█████▊    | 145/250 [17:24<12:43,  7.27s/epoch]

Epoch 145 | Learning rate 0.000588 | train normalized MSE   0.0780 | val normalized MSE   0.0841, | val MAE   1.3760 | val MSE   8.4080


Epoch:  60%|██████    | 150/250 [18:00<11:55,  7.16s/epoch]

Epoch 150 | Learning rate 0.000559 | train normalized MSE   0.0777 | val normalized MSE   0.0830, | val MAE   1.3616 | val MSE   8.2961


Epoch:  62%|██████▏   | 155/250 [18:36<11:26,  7.23s/epoch]

Epoch 155 | Learning rate 0.000266 | train normalized MSE   0.0763 | val normalized MSE   0.0826, | val MAE   1.3466 | val MSE   8.2638


Epoch:  64%|██████▍   | 160/250 [19:12<10:51,  7.24s/epoch]

Epoch 160 | Learning rate 0.000253 | train normalized MSE   0.0771 | val normalized MSE   0.0825, | val MAE   1.3400 | val MSE   8.2488


Epoch:  66%|██████▌   | 165/250 [19:48<10:10,  7.18s/epoch]

Epoch 165 | Learning rate 0.000240 | train normalized MSE   0.0769 | val normalized MSE   0.0826, | val MAE   1.3381 | val MSE   8.2611


Epoch:  68%|██████▊   | 170/250 [20:24<09:34,  7.19s/epoch]

Epoch 170 | Learning rate 0.000229 | train normalized MSE   0.0771 | val normalized MSE   0.0835, | val MAE   1.3362 | val MSE   8.3476


Epoch:  70%|███████   | 175/250 [21:00<08:56,  7.16s/epoch]

Epoch 175 | Learning rate 0.000217 | train normalized MSE   0.0765 | val normalized MSE   0.0830, | val MAE   1.3411 | val MSE   8.3046


Epoch:  72%|███████▏  | 180/250 [21:35<08:17,  7.10s/epoch]

Epoch 180 | Learning rate 0.000207 | train normalized MSE   0.0780 | val normalized MSE   0.0828, | val MAE   1.3410 | val MSE   8.2762


Epoch:  74%|███████▍  | 185/250 [22:11<07:43,  7.14s/epoch]

Epoch 185 | Learning rate 0.000197 | train normalized MSE   0.0763 | val normalized MSE   0.0832, | val MAE   1.3353 | val MSE   8.3202


Epoch:  76%|███████▌  | 190/250 [22:47<07:10,  7.17s/epoch]

Epoch 190 | Learning rate 0.000187 | train normalized MSE   0.0771 | val normalized MSE   0.0831, | val MAE   1.3435 | val MSE   8.3105


Epoch:  77%|███████▋  | 193/250 [23:16<06:52,  7.23s/epoch]

==== EARLY STOP at epoch 194





Submission saved locally as: './submission\submission-2025-05-09_04-08PM.csv'.
