### 0. Setting

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, root_mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import ParameterGrid

from torch.utils.data import DataLoader, TensorDataset
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tqdm import tqdm

import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn
import torch

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import random
import pywt
import copy
import json

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [4]:
def set_seed(seed=42):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

In [5]:
df = pd.read_csv('/content/drive/MyDrive/WaveletFrequencyDecomposed_CNN_Transformer/data/train_data.csv')
df.set_index('timestamp', inplace=True)

### 1. Data Preprocessing

##### 1.1 Wavelet Frequency Decompose Correlation

In [6]:
def wavelet_decomposed_corr(df, input_window_width=30, label_window_width=10, wavelet='db4', level=3):
    X, Y = [], []
    data = df.values

    for t in range(input_window_width, len(df)-label_window_width+1):
        window_data = data[t-input_window_width : t]

        low_band, mid_band, high_band = [], [], []
        for i in range(window_data.shape[1]):
            comod_per_widow = window_data[:, i]
            coeffs = pywt.wavedec(comod_per_widow, wavelet, level=level)
            cA3, cD3, _, cD1 = coeffs
            low_band.append(cA3)
            mid_band.append(cD3)
            high_band.append(cD1)

        corr_low = np.corrcoef(low_band)
        corr_mid = np.corrcoef(mid_band)
        corr_high = np.corrcoef(high_band)

        corr_tensor = torch.tensor(np.stack([corr_low, corr_mid, corr_high]), dtype=torch.float32)
        X.append(corr_tensor)

        label_window = data[t : t+label_window_width]
        corr_next = np.corrcoef(label_window.T)
        Y.append(torch.tensor(corr_next, dtype=torch.float32))


    return torch.stack(X), torch.stack(Y)

In [7]:
X_tensor, Y_tensor = wavelet_decomposed_corr(df)

total_size = len(X_tensor)
train_size = int(total_size * 0.8)
val_size   = int(total_size * 0.1)

test_size  = total_size - train_size - val_size

X_train = X_tensor[:train_size]
Y_train = Y_tensor[:train_size]

X_val = X_tensor[train_size:train_size + val_size]
Y_val = Y_tensor[train_size:train_size + val_size]

X_test = X_tensor[train_size + val_size:]
Y_test = Y_tensor[train_size + val_size:]

train_ds = TensorDataset(X_train, Y_train)
val_ds   = TensorDataset(X_val, Y_val)
test_ds  = TensorDataset(X_test, Y_test)



### 2. Modeling

##### 2.1 Model Structure Setting

In [8]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]

        return x

In [9]:
class CorrPredictorTransformer(nn.Module):
    def __init__(
            self,
            num_channels=3,
            nhead=4,
            num_layers=2,
            dim_feedforward=256,
            activation='relu',
            d_model=64,
        ):
        super().__init__()

        self.positional_encoding = PositionalEncoding(d_model)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            activation=activation,
            batch_first=True
        )

        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        self.fc = nn.Sequential(
            nn.Linear(d_model, 64),
            nn.ReLU(),
            nn.Linear(64, 64)
        )

    def forward(self, x):
        """
        x: (B, 3, 8, 8)
        output: (B, 8, 8)
        """
        B, C, H, W = x.shape

        x, _ = x.max(dim=1)
        x = x.view(B, -1)
        x = x.unsqueeze(0)

        x = self.positional_encoding(x)
        x = self.transformer(x)
        x = x.squeeze(0)

        x = self.fc(x)
        x = x.view(-1, 8, 8)
        return x


##### 2.2 Training

In [10]:
def train_model(model, train_loader, val_loader, optimizer_name='Adam', lr=5e-4, epochs=70, device='cuda'):
    model.to(device)

    # Optimizer 선택
    if optimizer_name == 'Adam':
        opt = torch.optim.Adam(model.parameters(), lr=lr)
    elif optimizer_name == 'RMSprop':
        opt = torch.optim.RMSprop(model.parameters(), lr=lr)
    elif optimizer_name == 'AdamW':
        opt = torch.optim.AdamW(model.parameters(), lr=lr)
    else:
        raise ValueError(f"Unsupported optimizer: {optimizer_name}")

    # Loss & LR Scheduler
    criterion = nn.MSELoss()
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        opt, mode='min', factor=0.5, patience=5,
    )

    best_val_loss = float('inf')

    for epoch in range(epochs):
        model.train()
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            opt.zero_grad()

            pred = model(xb)
            loss = criterion(pred, yb)
            loss.backward()
            opt.step()

        # Validation
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(device), yb.to(device)
                val_loss += criterion(model(xb), yb).item()
        val_loss /= len(val_loader)

        # 스케줄러 적용
        scheduler.step(val_loss)

    return val_loss

In [11]:
model_save_path = '/content/drive/MyDrive/WaveletFrequencyDecomposed_CNN_Transformer/best_model'

In [12]:
param_grid = {
    'nhead': [2, 4],
    'num_layers': [2, 4],
    'dim_feedforward': [256, 512],
    'activation': ['relu', 'gelu'],
    'lr': [0.001, 5e-4],
    'optimizer': ['Adam', 'RMSprop', 'AdamW'],
    'batch_size': [64, 128, 256, 512],
}

best_loss = float('inf')
best_config = None

for config in tqdm(ParameterGrid(param_grid)):
    train_loader = DataLoader(train_ds, batch_size=config['batch_size'], shuffle=False)
    val_loader   = DataLoader(val_ds,   batch_size=config['batch_size'], shuffle=False)

    model = CorrPredictorTransformer(
        d_model=64,  # 고정
        nhead=config['nhead'],
        num_layers=config['num_layers'],
        dim_feedforward=config['dim_feedforward'],
        activation=config['activation'],
    )

    loss = train_model(
        model, train_loader, val_loader,
        optimizer_name=config['optimizer'],
        lr=config['lr'],
        device=device
    )

    print(f"Config: {config}, Loss: {loss:.4f}")

    if loss < best_loss:
        best_loss = loss
        best_config = config
        torch.save(model.state_dict(), f"{model_save_path}/best_model_window10per30_WT2_weights.pth")
        with open(f'{model_save_path}/best_model_window10per30_WT2_config.json', 'w') as f:
            json.dump(best_config, f, indent=4)

print(f"\n✅ Best Config: {best_config}")
print(f"✅ Best Loss: {best_loss:.4f}")


  0%|          | 0/384 [00:00<?, ?it/s]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0678


  1%|          | 2/384 [00:34<1:46:51, 16.79s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0676


  1%|          | 3/384 [00:48<1:37:38, 15.38s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0626


  1%|          | 4/384 [01:11<1:55:29, 18.24s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0686


  1%|▏         | 5/384 [01:32<2:03:24, 19.54s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0669


  2%|▏         | 6/384 [01:55<2:09:01, 20.48s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0813


  2%|▏         | 7/384 [02:08<1:53:56, 18.13s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0688


  2%|▏         | 8/384 [02:21<1:43:27, 16.51s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0702


  2%|▏         | 9/384 [02:34<1:36:52, 15.50s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0670


  3%|▎         | 10/384 [02:56<1:47:50, 17.30s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0671


  3%|▎         | 11/384 [03:17<1:55:59, 18.66s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0669


  3%|▎         | 12/384 [03:40<2:02:33, 19.77s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0656


  3%|▎         | 13/384 [03:53<1:50:25, 17.86s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0656


  4%|▎         | 14/384 [04:06<1:41:35, 16.47s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0668


  4%|▍         | 15/384 [04:20<1:35:47, 15.58s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0699


  4%|▍         | 16/384 [04:42<1:47:19, 17.50s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0669


  4%|▍         | 17/384 [05:09<2:04:53, 20.42s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0692


  5%|▍         | 18/384 [05:32<2:08:49, 21.12s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0675


  5%|▍         | 19/384 [05:45<1:54:27, 18.81s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0678


  5%|▌         | 20/384 [05:58<1:43:48, 17.11s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0705


  5%|▌         | 21/384 [06:12<1:36:52, 16.01s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0680


  6%|▌         | 22/384 [06:35<1:49:06, 18.09s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0670


  6%|▌         | 23/384 [06:56<1:54:33, 19.04s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0676


  6%|▋         | 24/384 [07:18<1:59:13, 19.87s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0674


  7%|▋         | 25/384 [07:31<1:46:59, 17.88s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0752


  7%|▋         | 26/384 [07:44<1:37:57, 16.42s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0729


  7%|▋         | 27/384 [07:57<1:32:07, 15.48s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0683


  7%|▋         | 28/384 [08:20<1:43:53, 17.51s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0713


  8%|▊         | 29/384 [08:41<1:50:31, 18.68s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0668


  8%|▊         | 30/384 [09:04<1:57:25, 19.90s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0806


  8%|▊         | 31/384 [09:17<1:45:13, 17.89s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0669


  8%|▊         | 32/384 [09:30<1:36:14, 16.40s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0674


  9%|▊         | 33/384 [09:43<1:30:33, 15.48s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0699


  9%|▉         | 34/384 [10:05<1:42:00, 17.49s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0686


  9%|▉         | 35/384 [10:27<1:49:06, 18.76s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0669


  9%|▉         | 36/384 [10:49<1:53:25, 19.56s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0683


 10%|▉         | 37/384 [11:02<1:42:08, 17.66s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0707


 10%|▉         | 38/384 [11:15<1:33:31, 16.22s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0737


 10%|█         | 39/384 [11:28<1:28:24, 15.37s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0690


 10%|█         | 40/384 [11:50<1:40:04, 17.46s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0746


 11%|█         | 41/384 [12:12<1:47:12, 18.75s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0672


 11%|█         | 42/384 [12:34<1:51:20, 19.53s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0669


 11%|█         | 43/384 [12:47<1:40:15, 17.64s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0692


 11%|█▏        | 44/384 [13:00<1:32:20, 16.29s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0676


 12%|█▏        | 45/384 [13:15<1:29:17, 15.81s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0663


 12%|█▏        | 46/384 [13:36<1:39:07, 17.60s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0670


 12%|█▏        | 47/384 [13:58<1:45:56, 18.86s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0666


 12%|█▎        | 48/384 [14:20<1:50:46, 19.78s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0678


 13%|█▎        | 49/384 [14:27<1:29:09, 15.97s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0684


 13%|█▎        | 50/384 [14:35<1:14:38, 13.41s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0672


 13%|█▎        | 51/384 [14:41<1:03:12, 11.39s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0683


 14%|█▎        | 52/384 [14:53<1:03:47, 11.53s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0670


 14%|█▍        | 53/384 [15:05<1:03:43, 11.55s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0667


 14%|█▍        | 54/384 [15:17<1:03:50, 11.61s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0776


 14%|█▍        | 55/384 [15:23<55:45, 10.17s/it]  

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0680


 15%|█▍        | 56/384 [15:31<51:20,  9.39s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0657


 15%|█▍        | 57/384 [15:38<47:41,  8.75s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0672


 15%|█▌        | 58/384 [15:50<52:10,  9.60s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0669


 15%|█▌        | 59/384 [16:03<57:28, 10.61s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0665


 16%|█▌        | 60/384 [16:15<59:40, 11.05s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0667


 16%|█▌        | 61/384 [16:23<54:12, 10.07s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0677


 16%|█▌        | 62/384 [16:29<48:48,  9.09s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0675


 16%|█▋        | 63/384 [16:37<46:27,  8.68s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0657


 17%|█▋        | 64/384 [16:50<53:08,  9.96s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0667


 17%|█▋        | 65/384 [17:02<55:31, 10.44s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0668


 17%|█▋        | 66/384 [17:15<59:24, 11.21s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0673


 17%|█▋        | 67/384 [17:21<52:07,  9.87s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0670


 18%|█▊        | 68/384 [17:29<48:19,  9.18s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0674


 18%|█▊        | 69/384 [17:36<44:41,  8.51s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0734


 18%|█▊        | 70/384 [17:48<50:34,  9.66s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0719


 18%|█▊        | 71/384 [18:01<55:25, 10.63s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0667


 19%|█▉        | 72/384 [18:14<58:24, 11.23s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0745


 19%|█▉        | 73/384 [18:22<53:13, 10.27s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0722


 19%|█▉        | 74/384 [18:29<48:25,  9.37s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0831


 20%|█▉        | 75/384 [18:37<46:33,  9.04s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0669


 20%|█▉        | 76/384 [18:50<52:46, 10.28s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0667


 20%|██        | 77/384 [19:03<55:59, 10.94s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0668


 20%|██        | 78/384 [19:15<57:43, 11.32s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0669


 21%|██        | 79/384 [19:22<50:31,  9.94s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0712


 21%|██        | 80/384 [19:29<46:32,  9.19s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0697


 21%|██        | 81/384 [19:36<43:17,  8.57s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0687


 21%|██▏       | 82/384 [19:48<47:20,  9.41s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0665


 22%|██▏       | 83/384 [19:59<50:19, 10.03s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0665


 22%|██▏       | 84/384 [20:11<52:39, 10.53s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0671


 22%|██▏       | 85/384 [20:19<48:04,  9.65s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0747


 22%|██▏       | 86/384 [20:25<43:19,  8.72s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0673


 23%|██▎       | 87/384 [20:33<41:27,  8.37s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0708


 23%|██▎       | 88/384 [20:44<46:07,  9.35s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0670


 23%|██▎       | 89/384 [20:55<48:33,  9.88s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0665


 23%|██▎       | 90/384 [21:07<50:21, 10.28s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0658


 24%|██▎       | 91/384 [21:15<47:50,  9.80s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0733


 24%|██▍       | 92/384 [21:22<43:39,  8.97s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0672


 24%|██▍       | 93/384 [21:30<40:51,  8.42s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0647


 24%|██▍       | 94/384 [21:41<45:29,  9.41s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0643


 25%|██▍       | 95/384 [21:53<48:30, 10.07s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0668


 25%|██▌       | 96/384 [22:05<50:43, 10.57s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0656


 25%|██▌       | 97/384 [22:09<41:10,  8.61s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0788


 26%|██▌       | 98/384 [22:12<34:10,  7.17s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0679


 26%|██▌       | 99/384 [22:17<30:37,  6.45s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0664


 26%|██▌       | 100/384 [22:23<29:59,  6.34s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0670


 26%|██▋       | 101/384 [22:30<30:30,  6.47s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0673


 27%|██▋       | 102/384 [22:36<29:38,  6.31s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0675


 27%|██▋       | 103/384 [22:40<26:46,  5.72s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0688


 27%|██▋       | 104/384 [22:45<24:45,  5.31s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0678


 27%|██▋       | 105/384 [22:49<22:52,  4.92s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0759


 28%|██▊       | 106/384 [22:56<25:31,  5.51s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0699


 28%|██▊       | 107/384 [23:01<25:51,  5.60s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0676


 28%|██▊       | 108/384 [23:08<27:44,  6.03s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0673


 28%|██▊       | 109/384 [23:12<24:41,  5.39s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0708


 29%|██▊       | 110/384 [23:16<22:25,  4.91s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0679


 29%|██▉       | 111/384 [23:21<22:10,  4.88s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0692


 29%|██▉       | 112/384 [23:27<23:47,  5.25s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0675


 29%|██▉       | 113/384 [23:34<25:29,  5.64s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0673


 30%|██▉       | 114/384 [23:40<26:22,  5.86s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0692


 30%|██▉       | 115/384 [23:44<23:38,  5.27s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0691


 30%|███       | 116/384 [23:49<23:00,  5.15s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0680


 30%|███       | 117/384 [23:53<21:19,  4.79s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0695


 31%|███       | 118/384 [23:59<23:29,  5.30s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0672


 31%|███       | 119/384 [24:06<24:58,  5.65s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0674


 31%|███▏      | 120/384 [24:12<26:00,  5.91s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0677


 32%|███▏      | 121/384 [24:17<23:56,  5.46s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0752


 32%|███▏      | 122/384 [24:21<21:55,  5.02s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0674


 32%|███▏      | 123/384 [24:25<20:47,  4.78s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0752


 32%|███▏      | 124/384 [24:32<23:23,  5.40s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0678


 33%|███▎      | 125/384 [24:38<24:09,  5.60s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0675


 33%|███▎      | 126/384 [24:44<25:27,  5.92s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0674


 33%|███▎      | 127/384 [24:48<22:58,  5.36s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0764


 33%|███▎      | 128/384 [24:53<22:06,  5.18s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0676


 34%|███▎      | 129/384 [24:57<20:21,  4.79s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0685


 34%|███▍      | 130/384 [25:03<22:02,  5.21s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0686


 34%|███▍      | 131/384 [25:11<24:56,  5.92s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0676


 34%|███▍      | 132/384 [25:18<25:56,  6.18s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0674


 35%|███▍      | 133/384 [25:22<23:37,  5.65s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0691


 35%|███▍      | 134/384 [25:26<21:15,  5.10s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0681


 35%|███▌      | 135/384 [25:30<20:10,  4.86s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0697


 35%|███▌      | 136/384 [25:37<22:16,  5.39s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0707


 36%|███▌      | 137/384 [25:43<22:55,  5.57s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0674


 36%|███▌      | 138/384 [25:50<24:34,  5.99s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0705


 36%|███▌      | 139/384 [25:54<21:59,  5.38s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0686


 36%|███▋      | 140/384 [25:58<21:07,  5.20s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0683


 37%|███▋      | 141/384 [26:03<19:40,  4.86s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0690


 37%|███▋      | 142/384 [26:09<20:57,  5.20s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0698


 37%|███▋      | 143/384 [26:15<23:00,  5.73s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0672


 38%|███▊      | 144/384 [26:21<23:15,  5.82s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0707


 38%|███▊      | 145/384 [26:25<20:28,  5.14s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0734


 38%|███▊      | 146/384 [26:27<17:09,  4.33s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0687


 38%|███▊      | 147/384 [26:30<15:03,  3.81s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0697


 39%|███▊      | 148/384 [26:34<14:40,  3.73s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0681


 39%|███▉      | 149/384 [26:38<15:32,  3.97s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0680


 39%|███▉      | 150/384 [26:42<15:10,  3.89s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0699


 39%|███▉      | 151/384 [26:44<13:25,  3.46s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0709


 40%|███▉      | 152/384 [26:47<12:18,  3.18s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0683


 40%|███▉      | 153/384 [26:50<12:08,  3.15s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0715


 40%|████      | 154/384 [26:54<13:06,  3.42s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0716


 40%|████      | 155/384 [26:58<13:14,  3.47s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0676


 41%|████      | 156/384 [27:01<13:32,  3.56s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0685


 41%|████      | 157/384 [27:05<13:07,  3.47s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0694


 41%|████      | 158/384 [27:07<12:00,  3.19s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0698


 41%|████▏     | 159/384 [27:10<11:07,  2.97s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0693


 42%|████▏     | 160/384 [27:13<11:51,  3.18s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0700


 42%|████▏     | 161/384 [27:18<13:18,  3.58s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0677


 42%|████▏     | 162/384 [27:21<13:19,  3.60s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0696


 42%|████▏     | 163/384 [27:24<11:58,  3.25s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0722


 43%|████▎     | 164/384 [27:26<11:09,  3.04s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0693


 43%|████▎     | 165/384 [27:30<11:20,  3.11s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0703


 43%|████▎     | 166/384 [27:33<11:59,  3.30s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0688


 43%|████▎     | 167/384 [27:37<12:13,  3.38s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0682


 44%|████▍     | 168/384 [27:41<12:42,  3.53s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0682


 44%|████▍     | 169/384 [27:44<12:14,  3.41s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0756


 44%|████▍     | 170/384 [27:46<11:04,  3.10s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0687


 45%|████▍     | 171/384 [27:49<10:28,  2.95s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0744


 45%|████▍     | 172/384 [27:53<11:10,  3.16s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0682


 45%|████▌     | 173/384 [27:57<12:37,  3.59s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0682


 45%|████▌     | 174/384 [28:01<12:26,  3.56s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0687


 46%|████▌     | 175/384 [28:03<11:23,  3.27s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0754


 46%|████▌     | 176/384 [28:06<10:26,  3.01s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0683


 46%|████▌     | 177/384 [28:09<10:53,  3.16s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0727


 46%|████▋     | 178/384 [28:13<11:27,  3.34s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0679


 47%|████▋     | 179/384 [28:17<11:40,  3.42s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0683


 47%|████▋     | 180/384 [28:20<12:06,  3.56s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0727


 47%|████▋     | 181/384 [28:24<11:39,  3.44s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0707


 47%|████▋     | 182/384 [28:26<10:32,  3.13s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0704


 48%|████▊     | 183/384 [28:29<10:00,  2.99s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0717


 48%|████▊     | 184/384 [28:32<10:38,  3.19s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0720


 48%|████▊     | 185/384 [28:37<11:45,  3.55s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0679


 48%|████▊     | 186/384 [28:40<11:48,  3.58s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0708


 49%|████▊     | 187/384 [28:43<10:47,  3.29s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0709


 49%|████▉     | 188/384 [28:45<09:51,  3.02s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0705


 49%|████▉     | 189/384 [28:49<10:17,  3.17s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0710


 49%|████▉     | 190/384 [28:53<10:42,  3.31s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0686


 50%|████▉     | 191/384 [28:56<10:43,  3.34s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0680


 50%|█████     | 192/384 [29:00<11:22,  3.56s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0685


 50%|█████     | 193/384 [29:15<21:51,  6.87s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0708


 51%|█████     | 194/384 [29:28<27:35,  8.71s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0728


 51%|█████     | 195/384 [29:41<31:47, 10.09s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0719


 51%|█████     | 196/384 [30:02<42:15, 13.49s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0729


 51%|█████▏    | 197/384 [30:24<49:46, 15.97s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0664


 52%|█████▏    | 198/384 [30:46<55:15, 17.82s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0665


 52%|█████▏    | 199/384 [30:59<50:37, 16.42s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0666


 52%|█████▏    | 200/384 [31:12<47:08, 15.37s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0716


 52%|█████▏    | 201/384 [31:26<45:00, 14.75s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0678


 53%|█████▎    | 202/384 [31:47<50:40, 16.71s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0665


 53%|█████▎    | 203/384 [32:09<55:07, 18.27s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0670


 53%|█████▎    | 204/384 [32:31<58:44, 19.58s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0671


 53%|█████▎    | 205/384 [32:45<52:47, 17.70s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0684


 54%|█████▎    | 206/384 [32:58<48:47, 16.45s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0675


 54%|█████▍    | 207/384 [33:12<45:54, 15.56s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0683


 54%|█████▍    | 208/384 [33:34<51:04, 17.41s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0691


 54%|█████▍    | 209/384 [33:55<54:15, 18.60s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0669


 55%|█████▍    | 210/384 [34:17<57:01, 19.66s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0635


 55%|█████▍    | 211/384 [34:30<51:08, 17.74s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0685


 55%|█████▌    | 212/384 [34:43<46:44, 16.31s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0717


 55%|█████▌    | 213/384 [34:56<43:50, 15.38s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0671


 56%|█████▌    | 214/384 [35:18<49:02, 17.31s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0664


 56%|█████▌    | 215/384 [35:40<52:05, 18.49s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0672


 56%|█████▋    | 216/384 [36:02<54:59, 19.64s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0658


 57%|█████▋    | 217/384 [36:15<49:19, 17.72s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0735


 57%|█████▋    | 218/384 [36:28<45:02, 16.28s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0757


 57%|█████▋    | 219/384 [36:41<42:16, 15.37s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0695


 57%|█████▋    | 220/384 [37:04<48:25, 17.72s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0670


 58%|█████▊    | 221/384 [37:26<50:54, 18.74s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0669


 58%|█████▊    | 222/384 [37:48<53:32, 19.83s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0637


 58%|█████▊    | 223/384 [38:01<47:57, 17.87s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0632


 58%|█████▊    | 224/384 [38:14<43:44, 16.41s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0724


 59%|█████▊    | 225/384 [38:28<41:01, 15.48s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0694


 59%|█████▉    | 226/384 [38:50<46:14, 17.56s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0648


 59%|█████▉    | 227/384 [39:11<48:34, 18.56s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0671


 59%|█████▉    | 228/384 [39:33<51:07, 19.66s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0671


 60%|█████▉    | 229/384 [39:46<45:49, 17.74s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0777


 60%|█████▉    | 230/384 [39:59<41:52, 16.32s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0698


 60%|██████    | 231/384 [40:13<39:20, 15.43s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0755


 60%|██████    | 232/384 [40:35<44:21, 17.51s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0672


 61%|██████    | 233/384 [40:57<47:23, 18.83s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0668


 61%|██████    | 234/384 [41:19<49:30, 19.80s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0668


 61%|██████    | 235/384 [41:33<44:26, 17.90s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0699


 61%|██████▏   | 236/384 [41:46<40:33, 16.44s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0678


 62%|██████▏   | 237/384 [41:59<38:02, 15.53s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0640


 62%|██████▏   | 238/384 [42:21<42:43, 17.56s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0661


 62%|██████▏   | 239/384 [42:43<45:15, 18.73s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0683


 62%|██████▎   | 240/384 [43:05<47:12, 19.67s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0676


 63%|██████▎   | 241/384 [43:12<38:09, 16.01s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0673


 63%|██████▎   | 242/384 [43:19<31:10, 13.17s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0722


 63%|██████▎   | 243/384 [43:26<27:03, 11.51s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0743


 64%|██████▎   | 244/384 [43:38<26:55, 11.54s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0667


 64%|██████▍   | 245/384 [43:49<26:41, 11.52s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0669


 64%|██████▍   | 246/384 [44:01<26:15, 11.41s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0669


 64%|██████▍   | 247/384 [44:08<23:13, 10.17s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0677


 65%|██████▍   | 248/384 [44:15<21:04,  9.30s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0670


 65%|██████▍   | 249/384 [44:22<19:17,  8.57s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0667


 65%|██████▌   | 250/384 [44:35<22:03,  9.87s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0668


 65%|██████▌   | 251/384 [44:46<22:52, 10.32s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0668


 66%|██████▌   | 252/384 [44:58<23:42, 10.78s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0670


 66%|██████▌   | 253/384 [45:05<20:49,  9.54s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0678


 66%|██████▌   | 254/384 [45:12<19:13,  8.87s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0671


 66%|██████▋   | 255/384 [45:19<17:39,  8.21s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0716


 67%|██████▋   | 256/384 [45:30<19:42,  9.24s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0669


 67%|██████▋   | 257/384 [45:42<20:57,  9.90s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0667


 67%|██████▋   | 258/384 [45:54<21:57, 10.46s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0679


 67%|██████▋   | 259/384 [46:01<19:42,  9.46s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0721


 68%|██████▊   | 260/384 [46:08<18:02,  8.73s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0673


 68%|██████▊   | 261/384 [46:15<17:11,  8.39s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0735


 68%|██████▊   | 262/384 [46:26<18:44,  9.22s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0670


 68%|██████▊   | 263/384 [46:38<19:44,  9.79s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0667


 69%|██████▉   | 264/384 [46:49<20:46, 10.39s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0659


 69%|██████▉   | 265/384 [46:57<18:58,  9.57s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0681


 69%|██████▉   | 266/384 [47:04<17:01,  8.66s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0670


 70%|██████▉   | 267/384 [47:11<16:15,  8.34s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0671


 70%|██████▉   | 268/384 [47:23<18:01,  9.33s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0696


 70%|███████   | 269/384 [47:34<19:06,  9.97s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0667


 70%|███████   | 270/384 [47:46<19:47, 10.42s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0670


 71%|███████   | 271/384 [47:53<17:43,  9.41s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0680


 71%|███████   | 272/384 [48:00<16:28,  8.83s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0671


 71%|███████   | 273/384 [48:07<15:09,  8.20s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0684


 71%|███████▏  | 274/384 [48:20<17:27,  9.52s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0640


 72%|███████▏  | 275/384 [48:31<18:22, 10.12s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0667


 72%|███████▏  | 276/384 [48:43<19:06, 10.61s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0664


 72%|███████▏  | 277/384 [48:50<17:00,  9.54s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0669


 72%|███████▏  | 278/384 [48:57<15:36,  8.83s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0699


 73%|███████▎  | 279/384 [49:04<14:41,  8.39s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0662


 73%|███████▎  | 280/384 [49:16<16:01,  9.24s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0761


 73%|███████▎  | 281/384 [49:27<16:56,  9.87s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0670


 73%|███████▎  | 282/384 [49:39<17:42, 10.41s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0654


 74%|███████▎  | 283/384 [49:46<16:05,  9.56s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0698


 74%|███████▍  | 284/384 [49:53<14:29,  8.69s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0672


 74%|███████▍  | 285/384 [50:01<13:51,  8.40s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0651


 74%|███████▍  | 286/384 [50:12<15:19,  9.38s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0668


 75%|███████▍  | 287/384 [50:24<16:08,  9.98s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0667


 75%|███████▌  | 288/384 [50:35<16:33, 10.35s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0681


 75%|███████▌  | 289/384 [50:39<13:36,  8.59s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0681


 76%|███████▌  | 290/384 [50:43<11:11,  7.15s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0677


 76%|███████▌  | 291/384 [50:47<09:40,  6.24s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0755


 76%|███████▌  | 292/384 [50:54<09:50,  6.42s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0740


 76%|███████▋  | 293/384 [51:00<09:34,  6.31s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0674


 77%|███████▋  | 294/384 [51:07<09:37,  6.42s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0672


 77%|███████▋  | 295/384 [51:11<08:27,  5.70s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0677


 77%|███████▋  | 296/384 [51:16<07:55,  5.40s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0675


 77%|███████▋  | 297/384 [51:20<07:13,  4.98s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0725


 78%|███████▊  | 298/384 [51:25<07:33,  5.27s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0725


 78%|███████▊  | 299/384 [51:32<08:06,  5.73s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0678


 78%|███████▊  | 300/384 [51:38<08:09,  5.83s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0686


 78%|███████▊  | 301/384 [51:43<07:36,  5.50s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0721


 79%|███████▊  | 302/384 [51:47<06:48,  4.99s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0680


 79%|███████▉  | 303/384 [51:51<06:21,  4.70s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0688


 79%|███████▉  | 304/384 [51:58<07:09,  5.37s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0677


 79%|███████▉  | 305/384 [52:04<07:34,  5.76s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0674


 80%|███████▉  | 306/384 [52:12<08:02,  6.19s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0699


 80%|███████▉  | 307/384 [52:16<07:04,  5.51s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0685


 80%|████████  | 308/384 [52:20<06:38,  5.24s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0678


 80%|████████  | 309/384 [52:24<06:08,  4.91s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0689


 81%|████████  | 310/384 [52:30<06:27,  5.23s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0700


 81%|████████  | 311/384 [52:37<06:58,  5.74s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0673


 81%|████████▏ | 312/384 [52:43<06:59,  5.82s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0683


 82%|████████▏ | 313/384 [52:48<06:30,  5.51s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0825


 82%|████████▏ | 314/384 [52:52<05:54,  5.06s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0677


 82%|████████▏ | 315/384 [52:56<05:25,  4.72s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0726


 82%|████████▏ | 316/384 [53:03<06:07,  5.41s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0714


 83%|████████▎ | 317/384 [53:09<06:11,  5.54s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0677


 83%|████████▎ | 318/384 [53:16<06:32,  5.95s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0727


 83%|████████▎ | 319/384 [53:20<05:48,  5.37s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0712


 83%|████████▎ | 320/384 [53:24<05:17,  4.95s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0675


 84%|████████▎ | 321/384 [53:28<05:05,  4.86s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0727


 84%|████████▍ | 322/384 [53:34<05:23,  5.22s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0667


 84%|████████▍ | 323/384 [53:41<05:45,  5.66s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0675


 84%|████████▍ | 324/384 [53:47<05:48,  5.82s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0704


 85%|████████▍ | 325/384 [53:52<05:24,  5.51s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0720


 85%|████████▍ | 326/384 [53:56<04:51,  5.03s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0679


 85%|████████▌ | 327/384 [54:00<04:30,  4.75s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0727


 85%|████████▌ | 328/384 [54:07<05:01,  5.38s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0676


 86%|████████▌ | 329/384 [54:13<05:03,  5.52s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0673


 86%|████████▌ | 330/384 [54:20<05:23,  5.99s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0709


 86%|████████▌ | 331/384 [54:24<04:43,  5.36s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0697


 86%|████████▋ | 332/384 [54:28<04:14,  4.90s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0681


 87%|████████▋ | 333/384 [54:33<04:10,  4.91s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0692


 87%|████████▋ | 334/384 [54:39<04:21,  5.22s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0672


 87%|████████▋ | 335/384 [54:45<04:40,  5.73s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0673


 88%|████████▊ | 336/384 [54:51<04:39,  5.83s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0716


 88%|████████▊ | 337/384 [54:54<03:48,  4.86s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0715


 88%|████████▊ | 338/384 [54:57<03:19,  4.34s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0688


 88%|████████▊ | 339/384 [55:00<02:54,  3.88s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0729


 89%|████████▊ | 340/384 [55:03<02:45,  3.75s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0758


 89%|████████▉ | 341/384 [55:07<02:39,  3.70s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0679


 89%|████████▉ | 342/384 [55:12<02:46,  3.97s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0680


 89%|████████▉ | 343/384 [55:14<02:23,  3.51s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0739


 90%|████████▉ | 344/384 [55:17<02:08,  3.21s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0684


 90%|████████▉ | 345/384 [55:19<01:55,  2.97s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0698


 90%|█████████ | 346/384 [55:24<02:10,  3.43s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0684


 90%|█████████ | 347/384 [55:27<02:11,  3.54s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0680


 91%|█████████ | 348/384 [55:31<02:09,  3.60s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0722


 91%|█████████ | 349/384 [55:34<01:53,  3.26s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0700


 91%|█████████ | 350/384 [55:37<01:52,  3.30s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0701


 91%|█████████▏| 351/384 [55:39<01:41,  3.06s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0705


 92%|█████████▏| 352/384 [55:43<01:43,  3.23s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0684


 92%|█████████▏| 353/384 [55:47<01:43,  3.35s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0679


 92%|█████████▏| 354/384 [55:52<01:58,  3.97s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0683


 92%|█████████▏| 355/384 [55:55<01:45,  3.64s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0697


 93%|█████████▎| 356/384 [55:58<01:33,  3.35s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0694


 93%|█████████▎| 357/384 [56:00<01:23,  3.09s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0699


 93%|█████████▎| 358/384 [56:05<01:31,  3.52s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0689


 93%|█████████▎| 359/384 [56:08<01:29,  3.60s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0679


 94%|█████████▍| 360/384 [56:12<01:25,  3.57s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 256, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0694


 94%|█████████▍| 361/384 [56:15<01:15,  3.29s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0731


 94%|█████████▍| 362/384 [56:18<01:13,  3.34s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0688


 95%|█████████▍| 363/384 [56:21<01:04,  3.09s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0745


 95%|█████████▍| 364/384 [56:24<01:05,  3.28s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0753


 95%|█████████▌| 365/384 [56:28<01:04,  3.38s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0683


 95%|█████████▌| 366/384 [56:32<01:07,  3.72s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0728


 96%|█████████▌| 367/384 [56:35<00:57,  3.38s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0735


 96%|█████████▌| 368/384 [56:38<00:50,  3.13s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0691


 96%|█████████▌| 369/384 [56:40<00:43,  2.93s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0713


 96%|█████████▋| 370/384 [56:45<00:48,  3.43s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0682


 97%|█████████▋| 371/384 [56:48<00:45,  3.48s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0683


 97%|█████████▋| 372/384 [56:52<00:41,  3.49s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0752


 97%|█████████▋| 373/384 [56:54<00:35,  3.22s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0714


 97%|█████████▋| 374/384 [56:58<00:32,  3.27s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0702


 98%|█████████▊| 375/384 [57:00<00:27,  3.06s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0712


 98%|█████████▊| 376/384 [57:04<00:26,  3.25s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0749


 98%|█████████▊| 377/384 [57:07<00:23,  3.30s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0680


 98%|█████████▊| 378/384 [57:12<00:22,  3.70s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0706


 99%|█████████▊| 379/384 [57:15<00:16,  3.36s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0701


 99%|█████████▉| 380/384 [57:17<00:12,  3.07s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0694


 99%|█████████▉| 381/384 [57:20<00:08,  2.92s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0707


 99%|█████████▉| 382/384 [57:24<00:06,  3.43s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0698


100%|█████████▉| 383/384 [57:28<00:03,  3.42s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0679


100%|██████████| 384/384 [57:31<00:00,  8.99s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'dim_feedforward': 512, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0707

✅ Best Config: {'activation': 'relu', 'batch_size': 64, 'dim_feedforward': 256, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}
✅ Best Loss: 0.0626





##### 2.3 Test

In [13]:
def evaluate_model(model, test_loader, device='cuda'):
    model.eval()
    model.to(device)

    preds, targets = [], []
    with torch.no_grad():
        for xb, yb in test_loader:
            xb, yb = xb.to(device), yb.to(device)
            pred = model(xb)
            preds.append(pred)
            targets.append(yb)

    preds_tensor = torch.cat(preds, dim=0)      # (B*T, 8, 8)
    targets_tensor = torch.cat(targets, dim=0)  # (B*T, 8, 8)

    return preds_tensor, targets_tensor

with open(f'{model_save_path}/best_model_window10per30_WT2_config.json', 'r') as f:
    best_config = json.load(f)

best_model = CorrPredictorTransformer(
    d_model=64,
    nhead=best_config['nhead'],
    num_layers=best_config['num_layers'],
    dim_feedforward=best_config['dim_feedforward'],
    activation=best_config['activation']
)

best_model.load_state_dict(torch.load(f"{model_save_path}/best_model_window10per30_WT2_weights.pth"))

test_loader = DataLoader(test_ds, batch_size=best_config['batch_size'], shuffle=False)

preds_tensor, targets_tensor = evaluate_model(best_model, test_loader, device=device)

torch.save({
    'preds': preds_tensor,
    'targets': targets_tensor
}, f"{model_save_path}/best_model_window10per30_WT2_result.pt")


In [14]:
# Performance metrics

preds_flat = preds_tensor.view(preds_tensor.size(0), -1).cpu().numpy()
targets_flat = targets_tensor.view(targets_tensor.size(0), -1).cpu().numpy()

mse = mean_squared_error(targets_flat, preds_flat)
mae = mean_absolute_error(targets_flat, preds_flat)
rmse = np.sqrt(mse)

# frobenius_loss
cos_sim = cosine_similarity(targets_flat, preds_flat)
mean_cos_sim = np.diag(cos_sim).mean()

# frobenius_loss
diff = preds_tensor - targets_tensor
frobenius_per_sample = torch.norm(diff, p='fro', dim=(1, 2))
mean_frobenius = frobenius_per_sample.mean().item()

print(f"\n📊 Evaluation Results:")
print(f"MSE               : {mse:.5f}")
print(f"MAE               : {mae:.5f}")
print(f"RMSE              : {rmse:.5f}")
print(f"Cosine Similarity : {mean_cos_sim:.5f}")
print(f"Frobenius Norm    : {mean_frobenius:.5f}")


📊 Evaluation Results:
MSE               : 0.06555
MAE               : 0.17767
RMSE              : 0.25602
Cosine Similarity : 0.94545
Frobenius Norm    : 1.85746
