### 0. Setting

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, root_mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import ParameterGrid

from torch.utils.data import DataLoader, TensorDataset
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tqdm import tqdm

import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn
import torch

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import random
import pywt
import copy
import json

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [None]:
def set_seed(seed=42):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

In [None]:
df = pd.read_csv('/content/drive/MyDrive/WaveletFrequencyDecomposed_CNN_Transformer/data/train_data.csv')
df.set_index('timestamp', inplace=True)

### 1. Data Preprocessing

##### 1.1 Wavelet Frequency Decompose Correlation

In [None]:
def wavelet_decomposed_corr(df, input_window_width=30, label_window_width=10, wavelet='db4', level=3):
    X, Y = [], []
    data = df.values

    for t in range(input_window_width, len(df) - 1):
        window_data = data[t-input_window_width : t]

        low_band, mid_band, high_band = [], [], []
        for i in range(window_data.shape[1]):
            comod_per_widow = window_data[:, i]
            coeffs = pywt.wavedec(comod_per_widow, wavelet, level=level)
            cA3, cD3, _, cD1 = coeffs
            low_band.append(cA3)
            mid_band.append(cD3)
            high_band.append(cD1)

        corr_low = np.corrcoef(low_band)
        corr_mid = np.corrcoef(mid_band)
        corr_high = np.corrcoef(high_band)

        corr_tensor = torch.tensor(np.stack([corr_low, corr_mid, corr_high]), dtype=torch.float32)
        X.append(corr_tensor)

        label_window = data[t : t+label_window_width]
        corr_next = np.corrcoef(label_window.T)
        Y.append(torch.tensor(corr_next, dtype=torch.float32))


    return torch.stack(X), torch.stack(Y)

In [None]:
X_tensor, Y_tensor = wavelet_decomposed_corr(df)

total_size = len(X_tensor)
train_size = int(total_size * 0.8)
val_size   = int(total_size * 0.1)

test_size  = total_size - train_size - val_size

X_train = X_tensor[:train_size]
Y_train = Y_tensor[:train_size]

X_val = X_tensor[train_size:train_size + val_size]
Y_val = Y_tensor[train_size:train_size + val_size]

X_test = X_tensor[train_size + val_size:]
Y_test = Y_tensor[train_size + val_size:]

train_ds = TensorDataset(X_train, Y_train)
val_ds   = TensorDataset(X_val, Y_val)
test_ds  = TensorDataset(X_test, Y_test)



### 2. Modeling

##### 2.1 Model Structure Setting

In [None]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]

        return x

In [None]:
class CorrPredictorTransformer(nn.Module):
    def __init__(
            self,
            num_channels=3,
            nhead=4,
            num_layers=2,
            dim_feedforward=256,
            activation='relu',
            d_model=64,
        ):
        super().__init__()

        self.positional_encoding = PositionalEncoding(d_model)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            activation=activation,
            batch_first=True
        )

        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        self.fc = nn.Sequential(
            nn.Linear(d_model, 64),
            nn.ReLU(),
            nn.Linear(64, 64)
        )

    def forward(self, x):
        """
        x: (B, 3, 8, 8)
        output: (B, 8, 8)
        """
        B, C, H, W = x.shape

        x, _ = x.max(dim=1)
        x = x.view(B, -1)
        x = x.unsqueeze(0)

        x = self.positional_encoding(x)
        x = self.transformer(x)
        x = x.squeeze(0)

        x = self.fc(x)
        x = x.view(-1, 8, 8)
        return x


##### 2.2 Training

In [None]:
def train_model(model, train_loader, val_loader, optimizer_name='Adam', lr=5e-4, epochs=70, device='cuda'):
    model.to(device)

    # Optimizer 선택
    if optimizer_name == 'Adam':
        opt = torch.optim.Adam(model.parameters(), lr=lr)
    elif optimizer_name == 'RMSprop':
        opt = torch.optim.RMSprop(model.parameters(), lr=lr)
    elif optimizer_name == 'AdamW':
        opt = torch.optim.AdamW(model.parameters(), lr=lr)
    else:
        raise ValueError(f"Unsupported optimizer: {optimizer_name}")

    # Loss & LR Scheduler
    criterion = nn.MSELoss()
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        opt, mode='min', factor=0.5, patience=5, verbose=True
    )

    best_val_loss = float('inf')

    for epoch in range(epochs):
        model.train()
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            opt.zero_grad()

            pred = model(xb)
            loss = criterion(pred, yb)
            loss.backward()
            opt.step()

        # Validation
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(device), yb.to(device)
                val_loss += criterion(model(xb), yb).item()
        val_loss /= len(val_loader)

        # 스케줄러 적용
        scheduler.step(val_loss)

    return val_loss

In [None]:
model_save_path = '/content/drive/MyDrive/WaveletFrequencyDecomposed_CNN_Transformer/best_model'

In [None]:
param_grid = {
    'nhead': [2, 4],
    'num_layers': [2, 4],
    'dim_feedforward': [256, 512],
    'activation': ['relu', 'gelu'],
    'lr': [0.001, 5e-4],
    'optimizer': ['Adam', 'RMSprop', 'AdamW'],
    'batch_size': [64, 128, 256, 512],
}

best_loss = float('inf')
best_config = None

for config in tqdm(ParameterGrid(param_grid)):
    train_loader = DataLoader(train_ds, batch_size=config['batch_size'], shuffle=False)
    val_loader   = DataLoader(val_ds,   batch_size=config['batch_size'], shuffle=False)

    model = CorrPredictorTransformer(
        d_model=64,  # 고정
        nhead=config['nhead'],
        num_layers=config['num_layers'],
        dim_feedforward=config['dim_feedforward'],
        activation=config['activation'],
    )

    loss = train_model(
        model, train_loader, val_loader,
        optimizer_name=config['optimizer'],
        lr=config['lr'],
        device=device
    )

    print(f"Config: {config}, Loss: {loss:.4f}")

    if loss < best_loss:
        best_loss = loss
        best_config = config
        torch.save(model.state_dict(), f"{model_save_path}/best_model_window10per30_WT_weights.pth")
        with open(f'{model_save_path}/best_model_window10per30_WT_config.json', 'w') as f:
            json.dump(best_config, f, indent=4)

print(f"\n✅ Best Config: {best_config}")
print(f"✅ Best Loss: {best_loss:.4f}")




Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0662


  1%|          | 2/384 [00:31<1:35:25, 14.99s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0657


  1%|          | 3/384 [00:43<1:27:28, 13.78s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0696


  1%|          | 4/384 [01:03<1:43:26, 16.33s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0649


  1%|▏         | 5/384 [01:23<1:51:37, 17.67s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0653


  2%|▏         | 6/384 [01:45<1:59:04, 18.90s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0758


  2%|▏         | 7/384 [01:57<1:45:57, 16.86s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0713


  2%|▏         | 8/384 [02:10<1:37:01, 15.48s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0708


  2%|▏         | 9/384 [02:22<1:31:02, 14.57s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0745


  3%|▎         | 10/384 [02:43<1:42:00, 16.37s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0656


  3%|▎         | 11/384 [03:03<1:48:21, 17.43s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0651


  3%|▎         | 12/384 [03:23<1:53:29, 18.30s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0667


  3%|▎         | 13/384 [03:35<1:41:58, 16.49s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0654


  4%|▎         | 14/384 [03:47<1:33:10, 15.11s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0673


  4%|▍         | 15/384 [04:00<1:27:50, 14.28s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0668


  4%|▍         | 16/384 [04:20<1:38:44, 16.10s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0652


  4%|▍         | 17/384 [04:40<1:45:24, 17.23s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0716


  5%|▍         | 18/384 [05:00<1:50:56, 18.19s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0709


  5%|▍         | 19/384 [05:13<1:40:07, 16.46s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0662


  5%|▌         | 20/384 [05:25<1:32:09, 15.19s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0654


  5%|▌         | 21/384 [05:37<1:26:41, 14.33s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0664


  6%|▌         | 22/384 [05:58<1:37:30, 16.16s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0653


  6%|▌         | 23/384 [06:17<1:43:57, 17.28s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0647


  6%|▋         | 24/384 [06:38<1:49:22, 18.23s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0658


  7%|▋         | 25/384 [06:50<1:38:32, 16.47s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0719


  7%|▋         | 26/384 [07:03<1:30:44, 15.21s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0651


  7%|▋         | 27/384 [07:15<1:25:23, 14.35s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0771


  7%|▋         | 28/384 [07:35<1:35:52, 16.16s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0757


  8%|▊         | 29/384 [07:55<1:42:06, 17.26s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0655


  8%|▊         | 30/384 [08:16<1:47:40, 18.25s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0688


  8%|▊         | 31/384 [08:28<1:37:04, 16.50s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0697


  8%|▊         | 32/384 [08:40<1:29:18, 15.22s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0660


  9%|▊         | 33/384 [08:53<1:24:10, 14.39s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0686


  9%|▉         | 34/384 [09:13<1:34:16, 16.16s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0660


  9%|▉         | 35/384 [09:33<1:40:08, 17.22s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0653


  9%|▉         | 36/384 [09:53<1:45:29, 18.19s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0692


 10%|▉         | 37/384 [10:06<1:35:11, 16.46s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0709


 10%|▉         | 38/384 [10:18<1:27:29, 15.17s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0663


 10%|█         | 39/384 [10:30<1:22:23, 14.33s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0719


 10%|█         | 40/384 [10:51<1:32:33, 16.14s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0685


 11%|█         | 41/384 [11:11<1:39:31, 17.41s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0653


 11%|█         | 42/384 [11:31<1:44:38, 18.36s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0678


 11%|█         | 43/384 [11:44<1:34:17, 16.59s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0732


 11%|█▏        | 44/384 [11:56<1:26:49, 15.32s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0657


 12%|█▏        | 45/384 [12:09<1:22:14, 14.56s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0688


 12%|█▏        | 46/384 [12:30<1:32:25, 16.41s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0672


 12%|█▏        | 47/384 [12:50<1:38:16, 17.50s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0651


 12%|█▎        | 48/384 [13:11<1:43:28, 18.48s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0644


 13%|█▎        | 49/384 [13:17<1:23:26, 14.95s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0665


 13%|█▎        | 50/384 [13:24<1:09:12, 12.43s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0655


 13%|█▎        | 51/384 [13:30<59:17, 10.68s/it]  

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0669


 14%|█▎        | 52/384 [13:41<59:08, 10.69s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0655


 14%|█▍        | 53/384 [13:52<58:48, 10.66s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0652


 14%|█▍        | 54/384 [14:03<59:18, 10.78s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0656


 14%|█▍        | 55/384 [14:10<52:21,  9.55s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0661


 15%|█▍        | 56/384 [14:16<47:06,  8.62s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0653


 15%|█▍        | 57/384 [14:23<43:45,  8.03s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0659


 15%|█▌        | 58/384 [14:33<48:02,  8.84s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0652


 15%|█▌        | 59/384 [14:44<50:24,  9.30s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0649


 16%|█▌        | 60/384 [14:55<52:49,  9.78s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0653


 16%|█▌        | 61/384 [15:01<47:45,  8.87s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0739


 16%|█▌        | 62/384 [15:08<43:59,  8.20s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0666


 16%|█▋        | 63/384 [15:15<41:25,  7.74s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0702


 17%|█▋        | 64/384 [15:25<45:59,  8.62s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0714


 17%|█▋        | 65/384 [15:36<48:48,  9.18s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0653


 17%|█▋        | 66/384 [15:47<51:12,  9.66s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0692


 17%|█▋        | 67/384 [15:53<46:14,  8.75s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0698


 18%|█▊        | 68/384 [16:00<42:35,  8.09s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0660


 18%|█▊        | 69/384 [16:07<40:16,  7.67s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0760


 18%|█▊        | 70/384 [16:17<44:53,  8.58s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0670


 18%|█▊        | 71/384 [16:28<47:43,  9.15s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0652


 19%|█▉        | 72/384 [16:39<50:10,  9.65s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0685


 19%|█▉        | 73/384 [16:45<45:27,  8.77s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0723


 19%|█▉        | 74/384 [16:52<41:47,  8.09s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0740


 20%|█▉        | 75/384 [16:59<39:44,  7.72s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0682


 20%|█▉        | 76/384 [17:09<44:27,  8.66s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0651


 20%|██        | 77/384 [17:20<47:40,  9.32s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0652


 20%|██        | 78/384 [17:31<49:41,  9.74s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0653


 21%|██        | 79/384 [17:38<44:58,  8.85s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0701


 21%|██        | 80/384 [17:44<41:22,  8.17s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0653


 21%|██        | 81/384 [17:51<38:59,  7.72s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0687


 21%|██▏       | 82/384 [18:02<43:44,  8.69s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0651


 22%|██▏       | 83/384 [18:12<46:19,  9.24s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0650


 22%|██▏       | 84/384 [18:23<48:38,  9.73s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0655


 22%|██▏       | 85/384 [18:30<44:05,  8.85s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0718


 22%|██▏       | 86/384 [18:37<40:46,  8.21s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0658


 23%|██▎       | 87/384 [18:44<38:26,  7.77s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0702


 23%|██▎       | 88/384 [18:55<43:27,  8.81s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0654


 23%|██▎       | 89/384 [19:06<46:14,  9.41s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0671


 23%|██▎       | 90/384 [19:17<48:24,  9.88s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0655


 24%|██▎       | 91/384 [19:23<43:45,  8.96s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0716


 24%|██▍       | 92/384 [19:30<40:18,  8.28s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0658


 24%|██▍       | 93/384 [19:37<37:59,  7.83s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0727


 24%|██▍       | 94/384 [19:48<42:04,  8.71s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0745


 25%|██▍       | 95/384 [19:58<44:21,  9.21s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0652


 25%|██▌       | 96/384 [20:09<46:37,  9.71s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0653


 25%|██▌       | 97/384 [20:13<38:11,  7.98s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0686


 26%|██▌       | 98/384 [20:17<32:21,  6.79s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0663


 26%|██▌       | 99/384 [20:21<28:07,  5.92s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0676


 26%|██▌       | 100/384 [20:27<28:21,  5.99s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0656


 26%|██▋       | 101/384 [20:33<28:11,  5.98s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0658


 27%|██▋       | 102/384 [20:39<28:11,  6.00s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0661


 27%|██▋       | 103/384 [20:43<25:22,  5.42s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0663


 27%|██▋       | 104/384 [20:47<23:07,  4.96s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0663


 27%|██▋       | 105/384 [20:51<21:33,  4.64s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0668


 28%|██▊       | 106/384 [20:57<23:31,  5.08s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0654


 28%|██▊       | 107/384 [21:03<24:30,  5.31s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0661


 28%|██▊       | 108/384 [21:09<25:40,  5.58s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0656


 28%|██▊       | 109/384 [21:13<23:16,  5.08s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0677


 29%|██▊       | 110/384 [21:17<21:27,  4.70s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0664


 29%|██▉       | 111/384 [21:21<20:38,  4.54s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0674


 29%|██▉       | 112/384 [21:27<22:30,  4.97s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0659


 29%|██▉       | 113/384 [21:33<23:56,  5.30s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0657


 30%|██▉       | 114/384 [21:39<24:55,  5.54s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0669


 30%|██▉       | 115/384 [21:43<22:46,  5.08s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0684


 30%|███       | 116/384 [21:47<21:21,  4.78s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0663


 30%|███       | 117/384 [21:51<20:13,  4.54s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0673


 31%|███       | 118/384 [21:57<22:01,  4.97s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0708


 31%|███       | 119/384 [22:03<23:22,  5.29s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0658


 31%|███▏      | 120/384 [22:09<24:22,  5.54s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0659


 32%|███▏      | 121/384 [22:13<22:15,  5.08s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0722


 32%|███▏      | 122/384 [22:17<20:31,  4.70s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0661


 32%|███▏      | 123/384 [22:21<19:30,  4.48s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0725


 32%|███▏      | 124/384 [22:27<21:38,  4.99s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0663


 33%|███▎      | 125/384 [22:33<22:45,  5.27s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0658


 33%|███▎      | 126/384 [22:39<23:28,  5.46s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0733


 33%|███▎      | 127/384 [22:43<21:39,  5.06s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0769


 33%|███▎      | 128/384 [22:47<19:59,  4.68s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0660


 34%|███▎      | 129/384 [22:51<19:04,  4.49s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0668


 34%|███▍      | 130/384 [22:57<20:53,  4.94s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0666


 34%|███▍      | 131/384 [23:03<21:56,  5.20s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0659


 34%|███▍      | 132/384 [23:09<23:06,  5.50s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0657


 35%|███▍      | 133/384 [23:13<20:59,  5.02s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0683


 35%|███▍      | 134/384 [23:17<19:27,  4.67s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0665


 35%|███▌      | 135/384 [23:21<18:44,  4.52s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0685


 35%|███▌      | 136/384 [23:27<20:25,  4.94s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0729


 36%|███▌      | 137/384 [23:33<21:47,  5.29s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0658


 36%|███▌      | 138/384 [23:39<22:30,  5.49s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0660


 36%|███▌      | 139/384 [23:43<20:31,  5.03s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0669


 36%|███▋      | 140/384 [23:47<19:08,  4.70s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0668


 37%|███▋      | 141/384 [23:51<18:05,  4.47s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0679


 37%|███▋      | 142/384 [23:57<19:51,  4.92s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0692


 37%|███▋      | 143/384 [24:03<21:06,  5.26s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0656


 38%|███▊      | 144/384 [24:09<22:00,  5.50s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0691


 38%|███▊      | 145/384 [24:11<18:27,  4.64s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0734


 38%|███▊      | 146/384 [24:14<15:45,  3.97s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0671


 38%|███▊      | 147/384 [24:16<14:07,  3.58s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0700


 39%|███▊      | 148/384 [24:20<14:15,  3.63s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0681


 39%|███▉      | 149/384 [24:24<13:58,  3.57s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0664


 39%|███▉      | 150/384 [24:27<14:01,  3.60s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0690


 39%|███▉      | 151/384 [24:30<12:48,  3.30s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0695


 40%|███▉      | 152/384 [24:32<11:47,  3.05s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0669


 40%|███▉      | 153/384 [24:35<11:14,  2.92s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0700


 40%|████      | 154/384 [24:39<12:05,  3.15s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0697


 40%|████      | 155/384 [24:42<12:24,  3.25s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0660


 41%|████      | 156/384 [24:46<12:50,  3.38s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0660


 41%|████      | 157/384 [24:48<11:53,  3.14s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0676


 41%|████      | 158/384 [24:51<11:04,  2.94s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0685


 41%|████▏     | 159/384 [24:54<10:39,  2.84s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0677


 42%|████▏     | 160/384 [24:57<11:37,  3.11s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0690


 42%|████▏     | 161/384 [25:01<12:10,  3.28s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0664


 42%|████▏     | 162/384 [25:04<12:25,  3.36s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0684


 42%|████▏     | 163/384 [25:07<11:33,  3.14s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0707


 43%|████▎     | 164/384 [25:10<10:45,  2.93s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0678


 43%|████▎     | 165/384 [25:12<10:20,  2.83s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0687


 43%|████▎     | 166/384 [25:16<11:12,  3.08s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0671


 43%|████▎     | 167/384 [25:19<11:45,  3.25s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0665


 44%|████▍     | 168/384 [25:23<11:59,  3.33s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0667


 44%|████▍     | 169/384 [25:26<11:11,  3.12s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0737


 44%|████▍     | 170/384 [25:28<10:30,  2.95s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0670


 45%|████▍     | 171/384 [25:31<09:55,  2.80s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0726


 45%|████▍     | 172/384 [25:34<10:50,  3.07s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0698


 45%|████▌     | 173/384 [25:38<11:21,  3.23s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0666


 45%|████▌     | 174/384 [25:41<11:36,  3.32s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0691


 46%|████▌     | 175/384 [25:44<10:53,  3.13s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0737


 46%|████▌     | 176/384 [25:47<10:07,  2.92s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0665


 46%|████▌     | 177/384 [25:49<09:48,  2.84s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0707


 46%|████▋     | 178/384 [25:53<10:35,  3.08s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0675


 47%|████▋     | 179/384 [25:56<11:06,  3.25s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0668


 47%|████▋     | 180/384 [26:00<11:31,  3.39s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0708


 47%|████▋     | 181/384 [26:03<10:41,  3.16s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0697


 47%|████▋     | 182/384 [26:05<09:52,  2.93s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0687


 48%|████▊     | 183/384 [26:08<09:32,  2.85s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0708


 48%|████▊     | 184/384 [26:12<10:21,  3.11s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0718


 48%|████▊     | 185/384 [26:15<10:35,  3.20s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0663


 48%|████▊     | 186/384 [26:19<11:00,  3.33s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0680


 49%|████▊     | 187/384 [26:21<10:14,  3.12s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0693


 49%|████▉     | 188/384 [26:24<09:34,  2.93s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0691


 49%|████▉     | 189/384 [26:26<09:15,  2.85s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0702


 49%|████▉     | 190/384 [26:30<10:02,  3.11s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0669


 50%|████▉     | 191/384 [26:34<10:21,  3.22s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0664


 50%|█████     | 192/384 [26:37<10:45,  3.36s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0668


 50%|█████     | 193/384 [26:50<19:22,  6.09s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0688


 51%|█████     | 194/384 [27:02<24:53,  7.86s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0666


 51%|█████     | 195/384 [27:14<29:03,  9.22s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0753


 51%|█████     | 196/384 [27:35<39:32, 12.62s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0675


 51%|█████▏    | 197/384 [27:55<46:15, 14.84s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0690


 52%|█████▏    | 198/384 [28:15<51:11, 16.51s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0678


 52%|█████▏    | 199/384 [28:28<47:09, 15.30s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0665


 52%|█████▏    | 200/384 [28:39<43:42, 14.25s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0701


 52%|█████▏    | 201/384 [28:52<41:45, 13.69s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0700


 53%|█████▎    | 202/384 [29:12<47:48, 15.76s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0684


 53%|█████▎    | 203/384 [29:32<51:22, 17.03s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0652


 53%|█████▎    | 204/384 [29:53<54:08, 18.05s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0651


 53%|█████▎    | 205/384 [30:05<48:48, 16.36s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0726


 54%|█████▎    | 206/384 [30:17<44:46, 15.09s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0660


 54%|█████▍    | 207/384 [30:30<42:09, 14.29s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0744


 54%|█████▍    | 208/384 [30:50<47:22, 16.15s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0634


 54%|█████▍    | 209/384 [31:10<50:21, 17.27s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0654


 55%|█████▍    | 210/384 [31:31<52:47, 18.20s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0647


 55%|█████▍    | 211/384 [31:43<47:25, 16.45s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0659


 55%|█████▌    | 212/384 [31:55<43:36, 15.21s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0669


 55%|█████▌    | 213/384 [32:08<40:56, 14.37s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0685


 56%|█████▌    | 214/384 [32:28<45:46, 16.16s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0641


 56%|█████▌    | 215/384 [32:48<48:46, 17.32s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0664


 56%|█████▋    | 216/384 [33:09<51:17, 18.32s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0648


 57%|█████▋    | 217/384 [33:21<46:00, 16.53s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0700


 57%|█████▋    | 218/384 [33:33<42:07, 15.22s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0686


 57%|█████▋    | 219/384 [33:46<39:34, 14.39s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0652


 57%|█████▋    | 220/384 [34:06<44:26, 16.26s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0662


 58%|█████▊    | 221/384 [34:26<47:19, 17.42s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0654


 58%|█████▊    | 222/384 [34:47<49:30, 18.34s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0653


 58%|█████▊    | 223/384 [34:59<44:21, 16.53s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0668


 58%|█████▊    | 224/384 [35:11<40:28, 15.18s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0696


 59%|█████▊    | 225/384 [35:24<38:02, 14.35s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0715


 59%|█████▉    | 226/384 [35:44<42:38, 16.19s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0640


 59%|█████▉    | 227/384 [36:04<45:22, 17.34s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0656


 59%|█████▉    | 228/384 [36:24<47:28, 18.26s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0654


 60%|█████▉    | 229/384 [36:37<42:41, 16.53s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0711


 60%|█████▉    | 230/384 [36:49<38:53, 15.16s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0696


 60%|██████    | 231/384 [37:01<36:37, 14.36s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0735


 60%|██████    | 232/384 [37:22<41:03, 16.20s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0657


 61%|██████    | 233/384 [37:42<43:32, 17.30s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0647


 61%|██████    | 234/384 [38:02<45:42, 18.28s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0657


 61%|██████    | 235/384 [38:15<41:02, 16.52s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0671


 61%|██████▏   | 236/384 [38:27<37:35, 15.24s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0665


 62%|██████▏   | 237/384 [38:39<35:12, 14.37s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0687


 62%|██████▏   | 238/384 [39:00<39:21, 16.18s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0652


 62%|██████▏   | 239/384 [39:20<41:47, 17.30s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0664


 62%|██████▎   | 240/384 [39:40<43:50, 18.27s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0670


 63%|██████▎   | 241/384 [39:47<35:19, 14.82s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0657


 63%|██████▎   | 242/384 [39:54<29:14, 12.36s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0684


 63%|██████▎   | 243/384 [40:00<25:04, 10.67s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0761


 64%|██████▎   | 244/384 [40:11<25:04, 10.75s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0652


 64%|██████▍   | 245/384 [40:22<24:45, 10.69s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0654


 64%|██████▍   | 246/384 [40:33<24:42, 10.75s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0652


 64%|██████▍   | 247/384 [40:39<21:47,  9.55s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0663


 65%|██████▍   | 248/384 [40:46<19:39,  8.67s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0656


 65%|██████▍   | 249/384 [40:53<18:13,  8.10s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0678


 65%|██████▌   | 250/384 [41:04<19:53,  8.91s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0652


 65%|██████▌   | 251/384 [41:14<20:50,  9.41s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0652


 66%|██████▌   | 252/384 [41:25<21:40,  9.85s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0654


 66%|██████▌   | 253/384 [41:32<19:26,  8.91s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0663


 66%|██████▌   | 254/384 [41:38<17:46,  8.21s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0656


 66%|██████▋   | 255/384 [41:45<16:41,  7.76s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0659


 67%|██████▋   | 256/384 [41:56<18:30,  8.67s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0656


 67%|██████▋   | 257/384 [42:07<19:37,  9.28s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0652


 67%|██████▋   | 258/384 [42:17<20:29,  9.75s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0671


 67%|██████▋   | 259/384 [42:24<18:30,  8.89s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0719


 68%|██████▊   | 260/384 [42:31<16:57,  8.21s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0659


 68%|██████▊   | 261/384 [42:38<15:57,  7.78s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0705


 68%|██████▊   | 262/384 [42:49<17:39,  8.69s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0652


 68%|██████▊   | 263/384 [42:59<18:41,  9.27s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0652


 69%|██████▉   | 264/384 [43:10<19:37,  9.81s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0644


 69%|██████▉   | 265/384 [43:17<17:39,  8.90s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0670


 69%|██████▉   | 266/384 [43:24<16:10,  8.23s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0655


 70%|██████▉   | 267/384 [43:30<15:09,  7.78s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0655


 70%|██████▉   | 268/384 [43:41<16:45,  8.67s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0759


 70%|███████   | 269/384 [43:52<17:42,  9.24s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0651


 70%|███████   | 270/384 [44:02<18:24,  9.69s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0654


 71%|███████   | 271/384 [44:09<16:33,  8.80s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0684


 71%|███████   | 272/384 [44:16<15:09,  8.12s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0655


 71%|███████   | 273/384 [44:22<14:14,  7.70s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0677


 71%|███████▏  | 274/384 [44:33<15:45,  8.59s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0639


 72%|███████▏  | 275/384 [44:44<16:39,  9.17s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0652


 72%|███████▏  | 276/384 [44:54<17:23,  9.66s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0656


 72%|███████▏  | 277/384 [45:01<15:40,  8.79s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0710


 72%|███████▏  | 278/384 [45:08<14:19,  8.11s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0698


 73%|███████▎  | 279/384 [45:14<13:27,  7.69s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0685


 73%|███████▎  | 280/384 [45:25<14:59,  8.65s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0677


 73%|███████▎  | 281/384 [45:36<15:44,  9.17s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0653


 73%|███████▎  | 282/384 [45:46<16:16,  9.57s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0654


 74%|███████▎  | 283/384 [45:53<14:35,  8.67s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0684


 74%|███████▍  | 284/384 [45:59<13:18,  7.98s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0657


 74%|███████▍  | 285/384 [46:06<12:29,  7.57s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0680


 74%|███████▍  | 286/384 [46:16<13:50,  8.47s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0653


 75%|███████▍  | 287/384 [46:27<14:38,  9.06s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0652


 75%|███████▌  | 288/384 [46:37<15:15,  9.54s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0671


 75%|███████▌  | 289/384 [46:41<12:27,  7.87s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0663


 76%|███████▌  | 290/384 [46:45<10:23,  6.63s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0661


 76%|███████▌  | 291/384 [46:49<09:04,  5.86s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0719


 76%|███████▌  | 292/384 [46:55<08:57,  5.84s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0761


 76%|███████▋  | 293/384 [47:01<08:48,  5.80s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0658


 77%|███████▋  | 294/384 [47:07<08:47,  5.86s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0712


 77%|███████▋  | 295/384 [47:11<07:47,  5.26s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0662


 77%|███████▋  | 296/384 [47:14<07:02,  4.80s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0659


 77%|███████▋  | 297/384 [47:18<06:37,  4.57s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0737


 78%|███████▊  | 298/384 [47:24<07:05,  4.95s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0654


 78%|███████▊  | 299/384 [47:30<07:24,  5.22s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0662


 78%|███████▊  | 300/384 [47:36<07:36,  5.44s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0658


 78%|███████▊  | 301/384 [47:40<06:51,  4.96s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0669


 79%|███████▊  | 302/384 [47:44<06:20,  4.64s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0663


 79%|███████▉  | 303/384 [47:48<05:59,  4.43s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0669


 79%|███████▉  | 304/384 [47:54<06:29,  4.87s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0664


 79%|███████▉  | 305/384 [48:00<06:51,  5.21s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0658


 80%|███████▉  | 306/384 [48:05<07:03,  5.43s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0686


 80%|███████▉  | 307/384 [48:09<06:25,  5.01s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0672


 80%|████████  | 308/384 [48:13<05:54,  4.67s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0661


 80%|████████  | 309/384 [48:17<05:33,  4.45s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0667


 81%|████████  | 310/384 [48:23<06:07,  4.96s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0661


 81%|████████  | 311/384 [48:29<06:20,  5.21s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0657


 81%|████████▏ | 312/384 [48:35<06:30,  5.42s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0664


 82%|████████▏ | 313/384 [48:39<05:53,  4.98s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0776


 82%|████████▏ | 314/384 [48:43<05:22,  4.61s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0661


 82%|████████▏ | 315/384 [48:47<05:05,  4.43s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0714


 82%|████████▏ | 316/384 [48:53<05:31,  4.87s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0737


 83%|████████▎ | 317/384 [48:59<05:44,  5.14s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0662


 83%|████████▎ | 318/384 [49:05<05:58,  5.43s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0738


 83%|████████▎ | 319/384 [49:09<05:22,  4.97s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0677


 83%|████████▎ | 320/384 [49:12<04:58,  4.66s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0659


 84%|████████▎ | 321/384 [49:16<04:38,  4.41s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0690


 84%|████████▍ | 322/384 [49:22<05:01,  4.86s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0656


 84%|████████▍ | 323/384 [49:28<05:17,  5.21s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0659


 84%|████████▍ | 324/384 [49:34<05:27,  5.46s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0665


 85%|████████▍ | 325/384 [49:38<04:53,  4.98s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0689


 85%|████████▍ | 326/384 [49:42<04:30,  4.67s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0662


 85%|████████▌ | 327/384 [49:46<04:13,  4.44s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0708


 85%|████████▌ | 328/384 [49:52<04:34,  4.90s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0700


 86%|████████▌ | 329/384 [49:58<04:44,  5.18s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0657


 86%|████████▌ | 330/384 [50:04<04:50,  5.39s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0700


 86%|████████▌ | 331/384 [50:08<04:23,  4.98s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0675


 86%|████████▋ | 332/384 [50:11<04:00,  4.62s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0664


 87%|████████▋ | 333/384 [50:15<03:43,  4.38s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0680


 87%|████████▋ | 334/384 [50:21<04:03,  4.87s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0668


 87%|████████▋ | 335/384 [50:27<04:10,  5.12s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0657


 88%|████████▊ | 336/384 [50:33<04:19,  5.41s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0691


 88%|████████▊ | 337/384 [50:36<03:32,  4.52s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0702


 88%|████████▊ | 338/384 [50:38<03:00,  3.93s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0673


 88%|████████▊ | 339/384 [50:41<02:36,  3.48s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0714


 89%|████████▊ | 340/384 [50:44<02:35,  3.53s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0744


 89%|████████▉ | 341/384 [50:48<02:32,  3.54s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0662


 89%|████████▉ | 342/384 [50:51<02:27,  3.52s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0666


 89%|████████▉ | 343/384 [50:54<02:12,  3.24s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0731


 90%|████████▉ | 344/384 [50:56<02:01,  3.05s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0666


 90%|████████▉ | 345/384 [50:59<01:51,  2.86s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0686


 90%|█████████ | 346/384 [51:02<01:57,  3.08s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0669


 90%|█████████ | 347/384 [51:06<01:59,  3.23s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0663


 91%|█████████ | 348/384 [51:09<01:59,  3.31s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0717


 91%|█████████ | 349/384 [51:12<01:48,  3.09s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0685


 91%|█████████ | 350/384 [51:14<01:37,  2.88s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0687


 91%|█████████▏| 351/384 [51:17<01:32,  2.80s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0687


 92%|█████████▏| 352/384 [51:21<01:38,  3.06s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0670


 92%|█████████▏| 353/384 [51:24<01:38,  3.17s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0664


 92%|█████████▏| 354/384 [51:28<01:39,  3.30s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0666


 92%|█████████▏| 355/384 [51:30<01:29,  3.08s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0682


 93%|█████████▎| 356/384 [51:33<01:20,  2.89s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0677


 93%|█████████▎| 357/384 [51:35<01:15,  2.80s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0684


 93%|█████████▎| 358/384 [51:39<01:19,  3.05s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0676


 93%|█████████▎| 359/384 [51:42<01:19,  3.16s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0663


 94%|█████████▍| 360/384 [51:46<01:19,  3.31s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0680


 94%|█████████▍| 361/384 [51:49<01:11,  3.12s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0716


 94%|█████████▍| 362/384 [51:51<01:04,  2.91s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0672


 95%|█████████▍| 363/384 [51:54<00:59,  2.82s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0729


 95%|█████████▍| 364/384 [51:57<01:01,  3.08s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0726


 95%|█████████▌| 365/384 [52:01<01:00,  3.18s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0668


 95%|█████████▌| 366/384 [52:05<00:59,  3.33s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0712


 96%|█████████▌| 367/384 [52:07<00:52,  3.11s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0724


 96%|█████████▌| 368/384 [52:10<00:46,  2.90s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0673


 96%|█████████▌| 369/384 [52:12<00:42,  2.80s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0720


 96%|█████████▋| 370/384 [52:16<00:42,  3.01s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0667


 97%|█████████▋| 371/384 [52:19<00:41,  3.18s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0667


 97%|█████████▋| 372/384 [52:23<00:39,  3.31s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0741


 97%|█████████▋| 373/384 [52:25<00:34,  3.09s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0698


 97%|█████████▋| 374/384 [52:28<00:28,  2.89s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0685


 98%|█████████▊| 375/384 [52:30<00:25,  2.80s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0696


 98%|█████████▊| 376/384 [52:34<00:24,  3.02s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0733


 98%|█████████▊| 377/384 [52:38<00:22,  3.19s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0664


 98%|█████████▊| 378/384 [52:41<00:19,  3.32s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0689


 99%|█████████▊| 379/384 [52:44<00:15,  3.11s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0687


 99%|█████████▉| 380/384 [52:46<00:11,  2.90s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0677


 99%|█████████▉| 381/384 [52:49<00:08,  2.80s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0696


 99%|█████████▉| 382/384 [52:52<00:06,  3.00s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0685


100%|█████████▉| 383/384 [52:56<00:03,  3.17s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0662


100%|██████████| 384/384 [52:59<00:00,  8.28s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0693

✅ Best Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}
✅ Best Loss: 0.0634





##### 2.3 Test

In [None]:
def evaluate_model(model, test_loader, device='cuda'):
    model.eval()
    model.to(device)

    preds, targets = [], []
    with torch.no_grad():
        for xb, yb in test_loader:
            xb, yb = xb.to(device), yb.to(device)
            pred = model(xb)
            preds.append(pred)
            targets.append(yb)

    preds_tensor = torch.cat(preds, dim=0)      # (B*T, 8, 8)
    targets_tensor = torch.cat(targets, dim=0)  # (B*T, 8, 8)

    return preds_tensor, targets_tensor

with open(f'{model_save_path}/best_model_window10per30_WT_config.json', 'r') as f:
    best_config = json.load(f)

best_model = CorrPredictorTransformer(
    d_model=64,
    nhead=best_config['nhead'],
    num_layers=best_config['num_layers'],
    dim_feedforward=best_config['dim_feedforward'],
    activation=best_config['activation']
)

best_model.load_state_dict(torch.load(f"{model_save_path}/best_model_window10per30_WT_weights.pth"))

test_loader = DataLoader(test_ds, batch_size=best_config['batch_size'], shuffle=False)

preds_tensor, targets_tensor = evaluate_model(best_model, test_loader, device=device)

torch.save({
    'preds': preds_tensor,
    'targets': targets_tensor
}, f"{model_save_path}/best_model_window10per30_WT_result.pt")


In [None]:
# Performance metrics

preds_flat = preds_tensor.view(preds_tensor.size(0), -1).cpu().numpy()
targets_flat = targets_tensor.view(targets_tensor.size(0), -1).cpu().numpy()

mse = mean_squared_error(targets_flat, preds_flat)
mae = mean_absolute_error(targets_flat, preds_flat)
rmse = np.sqrt(mse)

# frobenius_loss
cos_sim = cosine_similarity(targets_flat, preds_flat)
mean_cos_sim = np.diag(cos_sim).mean()

# frobenius_loss
diff = preds_tensor - targets_tensor
frobenius_per_sample = torch.norm(diff, p='fro', dim=(1, 2))
mean_frobenius = frobenius_per_sample.mean().item()

print(f"\n📊 Evaluation Results:")
print(f"MSE               : {mse:.5f}")
print(f"MAE               : {mae:.5f}")
print(f"RMSE              : {rmse:.5f}")
print(f"Cosine Similarity : {mean_cos_sim:.5f}")
print(f"Frobenius Norm    : {mean_frobenius:.5f}")


📊 Evaluation Results:
MSE               : 0.07675
MAE               : 0.18852
RMSE              : 0.27704
Cosine Similarity : 0.93675
Frobenius Norm    : 1.96901
