### 0. Setting

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, root_mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import ParameterGrid

from torch.utils.data import DataLoader, TensorDataset
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tqdm import tqdm

import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn
import torch

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

import random
import pywt
import copy
import json

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [None]:
def set_seed(seed=42):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

In [None]:
df = pd.read_csv('/content/drive/MyDrive/WaveletFrequencyDecomposed_CNN_Transformer/data/train_data.csv')
df.set_index('timestamp', inplace=True)

In [None]:
len(df)

2559

### 1. Data Preprocessing

##### 1.1 Wavelet Frequency Decompose Correlation

In [None]:
def wavelet_decomposed_corr(df, input_window_width=30, label_window_width=10, wavelet='db4', level=3):
    X, Y = [], []
    data = df.values

    for t in range(input_window_width, len(df)-label_window_width+1):
        window_data = data[t-input_window_width : t] # ex. 0:29, 1:30
        # print(len(window_data))

        low_band, mid_band, high_band = [], [], []
        for i in range(window_data.shape[1]):
            comod_per_window = window_data[:, i]
            coeffs = pywt.wavedec(comod_per_window, wavelet, level=level)
            cA3, cD3, _, cD1 = coeffs
            low_band.append(cA3)
            mid_band.append(cD3)
            high_band.append(cD1)

        corr_low = np.corrcoef(low_band)
        corr_mid = np.corrcoef(mid_band)
        corr_high = np.corrcoef(high_band)

        corr_tensor = torch.tensor(np.stack([corr_low, corr_mid, corr_high]), dtype=torch.float32) #3x8x8
        X.append(corr_tensor)

        label_window = data[t : t+label_window_width] # ex. 30:39, 31:40
        # print(len(label_window))
        corr_next = np.corrcoef(label_window.T) #8x8
        Y.append(torch.tensor(corr_next, dtype=torch.float32))


    return torch.stack(X), torch.stack(Y)

In [None]:
X_tensor, Y_tensor = wavelet_decomposed_corr(df)



In [None]:
total_size = len(X_tensor)
train_size = int(total_size * 0.8)
val_size   = int(total_size * 0.1)

test_size  = total_size - train_size - val_size

X_train = X_tensor[:train_size]
Y_train = Y_tensor[:train_size]

X_val = X_tensor[train_size:train_size + val_size]
Y_val = Y_tensor[train_size:train_size + val_size]

X_test = X_tensor[train_size + val_size:]
Y_test = Y_tensor[train_size + val_size:]

train_ds = TensorDataset(X_train, Y_train)
val_ds   = TensorDataset(X_val, Y_val)
test_ds  = TensorDataset(X_test, Y_test)

### 2. Modeling

##### 2.1 Model Structure Setting

In [None]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model) # (T, d_model)
        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1) # (T, 1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))  # (d_model//2)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # Shape: (1, max_len, d_model) # (1, T, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x shape: (B, T, d_model)
        x = x + self.pe[:, :x.size(1)]

        return x

In [None]:
class CorrPredictorCNNTransformer(nn.Module):
    def __init__(
            self,
            num_channels=3,
            conv_channels=32,
            kernel_size=3,
            d_model=128,
            nhead=4,
            num_layers=2,
            dim_feedforward=256,
            activation='relu',
            ):

        super().__init__()

        self.cnn = nn.Sequential(
            nn.Conv2d(num_channels, 32, kernel_size, padding=kernel_size // 2),
            nn.ReLU(),
            nn.BatchNorm2d(32),

            nn.Conv2d(32, d_model, kernel_size, padding=kernel_size // 2),
            nn.ReLU(),
            nn.BatchNorm2d(d_model)
        )

        # CNN Output-> Transformer Input
        self.flatten = nn.Flatten(start_dim=2)
        self.pool = nn.AdaptiveAvgPool1d(1)
        self.positional_encoding = PositionalEncoding(d_model)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            activation=activation,
            batch_first=True,
        )

        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        self.fc = nn.Sequential(
            nn.Linear(d_model, 64),
            nn.ReLU(),
            nn.Linear(64, 64)
            )

    def forward(self, x):
        """
        CB: CNN Batch
        TB: Transformer Batch
        T: Sequence Length
        C: Channel
        H: Height
        W: Width
        d_model: Dimension of model
        """

        B, C, H, W = x.shape
        x = self.cnn(x)
        x = self.flatten(x)
        x = self.pool(x).squeeze(-1)

        x = x.unsqueeze(0)
        x = self.positional_encoding(x)
        x = self.transformer(x)
        x = x.squeeze(0)

        output = self.fc(x)
        output = output.view(-1, 8, 8)

        return output

##### 2.2 Training

In [None]:
def train_model(model, train_loader, val_loader, optimizer_name='Adam', lr=5e-4, epochs=70, device='cuda'):
    model.to(device)

    # Optimizer 선택
    if optimizer_name == 'Adam':
        opt = torch.optim.Adam(model.parameters(), lr=lr)
    elif optimizer_name == 'AdamW':
        opt = torch.optim.AdamW(model.parameters(), lr=lr)
    elif optimizer_name == 'RMSprop':
        opt = torch.optim.RMSprop(model.parameters(), lr=lr)
    else:
        raise ValueError(f"Unsupported optimizer: {optimizer_name}")

    # Loss & LR Scheduler
    criterion = nn.MSELoss()
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        opt, mode='min', factor=0.5, patience=5, verbose=True
    )

    best_val_loss = float('inf')

    for epoch in range(epochs):
        model.train()
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            opt.zero_grad()

            pred = model(xb)
            loss = criterion(pred, yb)
            loss.backward()
            opt.step()

        # Validation
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(device), yb.to(device)
                val_loss += criterion(model(xb), yb).item()
        val_loss /= len(val_loader)

        # 스케줄러 적용
        scheduler.step(val_loss)

    return val_loss

In [None]:
model_save_path = '/content/drive/MyDrive/WaveletFrequencyDecomposed_CNN_Transformer/best_model'

In [None]:
# Grid Search
param_grid = {
    'kernel_size': [3, 5],
    'd_model': [32, 64, 128],
    'nhead': [2, 4],
    'num_layers': [2, 4],
    'dim_feedforward': [256, 512],
    'activation': ['relu', 'gelu'],
    'lr': [0.001, 5e-4], #1e-4,
    'optimizer': ['Adam', 'RMSprop', 'AdamW'],
    'batch_size': [64, 128, 256, 512]
}

best_loss = float('inf')
best_config = None
best_model = None

for config in tqdm(ParameterGrid(param_grid)):
    train_loader = DataLoader(train_ds,  batch_size=config['batch_size'], shuffle=False)
    val_loader   = DataLoader(val_ds,  batch_size=config['batch_size'], shuffle=False)

    model = CorrPredictorCNNTransformer(
        kernel_size=config['kernel_size'],
        d_model=config['d_model'],
        nhead=config['nhead'],
        num_layers=config['num_layers'],
        dim_feedforward=config['dim_feedforward'],
        activation=config['activation'],
    )
    loss = train_model(model, train_loader, val_loader,
                       optimizer_name=config['optimizer'],
                       lr=config['lr'], device=device)

    print(f"Config: {config}, Loss: {loss:.4f}")
    if loss < best_loss:
        best_loss = loss
        best_config = config

        torch.save(model.state_dict(), f"{model_save_path}/best_model_window10per30_WCT_weights.pth")
        with open(f'{model_save_path}/best_model_window10per30_WCT_config.json', 'w') as f:
            json.dump(best_config, f, indent=4)

# 최종 결과
print(f"\n✅ Best Config: {best_config}")
print(f"✅ Best Loss: {best_loss:.4f}")



Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0708


  0%|          | 2/2304 [00:38<11:53:36, 18.60s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0857


  0%|          | 3/2304 [00:53<10:57:13, 17.14s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0852


  0%|          | 4/2304 [01:17<12:36:00, 19.72s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0843


  0%|          | 5/2304 [01:40<13:20:57, 20.90s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0831


  0%|          | 6/2304 [02:04<13:56:03, 21.83s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0726


  0%|          | 7/2304 [02:19<12:37:06, 19.78s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0727


  0%|          | 8/2304 [02:34<11:38:21, 18.25s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0823


  0%|          | 9/2304 [02:50<11:07:17, 17.45s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0901


  0%|          | 10/2304 [03:14<12:22:30, 19.42s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0714


  0%|          | 11/2304 [03:37<13:06:23, 20.58s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0759


  1%|          | 12/2304 [04:01<13:44:20, 21.58s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0758


  1%|          | 13/2304 [04:16<12:34:20, 19.76s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0746


  1%|          | 14/2304 [04:32<11:41:43, 18.39s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0798


  1%|          | 15/2304 [04:47<11:08:19, 17.52s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0793


  1%|          | 16/2304 [05:11<12:19:25, 19.39s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0795


  1%|          | 17/2304 [05:34<13:02:32, 20.53s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0747


  1%|          | 18/2304 [05:58<13:41:37, 21.56s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0877


  1%|          | 19/2304 [06:14<12:32:36, 19.76s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0754


  1%|          | 20/2304 [06:29<11:38:34, 18.35s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0678


  1%|          | 21/2304 [06:44<11:06:46, 17.52s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0795


  1%|          | 22/2304 [07:08<12:20:52, 19.48s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0806


  1%|          | 23/2304 [07:31<13:00:22, 20.53s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0966


  1%|          | 24/2304 [07:55<13:36:09, 21.48s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0794


  1%|          | 25/2304 [08:11<12:30:00, 19.75s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0696


  1%|          | 26/2304 [08:26<11:40:54, 18.46s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0927


  1%|          | 27/2304 [08:42<11:09:13, 17.63s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0865


  1%|          | 28/2304 [09:06<12:22:28, 19.57s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0863


  1%|▏         | 29/2304 [09:31<13:19:15, 21.08s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0775


  1%|▏         | 30/2304 [09:57<14:17:47, 22.63s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0783


  1%|▏         | 31/2304 [10:14<13:12:41, 20.92s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0698


  1%|▏         | 32/2304 [10:29<12:06:36, 19.19s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0895


  1%|▏         | 33/2304 [10:45<11:27:39, 18.17s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0924


  1%|▏         | 34/2304 [11:09<12:41:05, 20.12s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0770


  2%|▏         | 35/2304 [11:34<13:27:38, 21.36s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1216


  2%|▏         | 36/2304 [11:58<14:05:48, 22.38s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0897


  2%|▏         | 37/2304 [12:14<12:51:30, 20.42s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0720


  2%|▏         | 38/2304 [12:30<11:55:35, 18.95s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0801


  2%|▏         | 39/2304 [12:46<11:26:10, 18.18s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0775


  2%|▏         | 40/2304 [13:12<12:52:34, 20.47s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0817


  2%|▏         | 41/2304 [13:36<13:38:04, 21.69s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0899


  2%|▏         | 42/2304 [14:02<14:23:51, 22.91s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0875


  2%|▏         | 43/2304 [14:18<13:08:05, 20.91s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0812


  2%|▏         | 44/2304 [14:34<12:09:40, 19.37s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0877


  2%|▏         | 45/2304 [14:50<11:30:51, 18.35s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0865


  2%|▏         | 46/2304 [15:17<13:00:58, 20.75s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0801


  2%|▏         | 47/2304 [15:43<14:04:58, 22.46s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0843


  2%|▏         | 48/2304 [16:10<14:52:38, 23.74s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0802


  2%|▏         | 49/2304 [16:27<13:37:04, 21.74s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0738


  2%|▏         | 50/2304 [16:44<12:44:41, 20.36s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0841


  2%|▏         | 51/2304 [17:01<12:07:51, 19.38s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0914


  2%|▏         | 52/2304 [17:27<13:25:33, 21.46s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0746


  2%|▏         | 53/2304 [17:54<14:19:07, 22.90s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0744


  2%|▏         | 54/2304 [18:20<15:02:24, 24.06s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0843


  2%|▏         | 55/2304 [18:38<13:47:50, 22.09s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0702


  2%|▏         | 56/2304 [18:55<12:49:03, 20.53s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0844


  2%|▏         | 57/2304 [19:12<12:09:49, 19.49s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0748


  3%|▎         | 58/2304 [19:37<13:18:31, 21.33s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0765


  3%|▎         | 59/2304 [20:03<14:04:59, 22.58s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0804


  3%|▎         | 60/2304 [20:29<14:48:02, 23.74s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0826


  3%|▎         | 61/2304 [20:46<13:28:36, 21.63s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0688


  3%|▎         | 62/2304 [21:02<12:26:20, 19.97s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0761


  3%|▎         | 63/2304 [21:19<11:46:29, 18.92s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0831


  3%|▎         | 64/2304 [21:45<13:05:51, 21.05s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0854


  3%|▎         | 65/2304 [22:08<13:35:31, 21.85s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0778


  3%|▎         | 66/2304 [22:33<14:01:50, 22.57s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0780


  3%|▎         | 67/2304 [22:49<12:46:37, 20.56s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0836


  3%|▎         | 68/2304 [23:04<11:46:28, 18.96s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0707


  3%|▎         | 69/2304 [23:19<11:10:16, 17.99s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0780


  3%|▎         | 70/2304 [23:44<12:27:25, 20.07s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0690


  3%|▎         | 71/2304 [24:09<13:12:29, 21.29s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0758


  3%|▎         | 72/2304 [24:33<13:52:12, 22.37s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0708


  3%|▎         | 73/2304 [24:50<12:43:41, 20.54s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0700


  3%|▎         | 74/2304 [25:06<11:51:51, 19.15s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0801


  3%|▎         | 75/2304 [25:22<11:18:42, 18.27s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1059


  3%|▎         | 76/2304 [25:47<12:37:20, 20.40s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0865


  3%|▎         | 77/2304 [26:11<13:17:09, 21.48s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0828


  3%|▎         | 78/2304 [26:36<13:56:48, 22.56s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0814


  3%|▎         | 79/2304 [26:53<12:49:05, 20.74s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0789


  3%|▎         | 80/2304 [27:09<11:58:10, 19.38s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0875


  4%|▎         | 81/2304 [27:26<11:28:34, 18.58s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0846


  4%|▎         | 82/2304 [27:51<12:45:59, 20.68s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0829


  4%|▎         | 83/2304 [28:16<13:33:08, 21.97s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0943


  4%|▎         | 84/2304 [28:42<14:13:23, 23.06s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0783


  4%|▎         | 85/2304 [28:58<13:00:56, 21.12s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0819


  4%|▎         | 86/2304 [29:15<12:10:17, 19.76s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0832


  4%|▍         | 87/2304 [29:32<11:34:29, 18.80s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0812


  4%|▍         | 88/2304 [29:57<12:47:10, 20.77s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0938


  4%|▍         | 89/2304 [30:22<13:35:36, 22.09s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0850


  4%|▍         | 90/2304 [30:48<14:18:07, 23.26s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0787


  4%|▍         | 91/2304 [31:05<13:07:10, 21.34s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0750


  4%|▍         | 92/2304 [31:21<12:11:50, 19.85s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0822


  4%|▍         | 93/2304 [31:38<11:37:19, 18.92s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0822


  4%|▍         | 94/2304 [32:04<12:56:04, 21.07s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0848


  4%|▍         | 95/2304 [32:30<13:46:30, 22.45s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0951


  4%|▍         | 96/2304 [32:56<14:25:45, 23.53s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0803


  4%|▍         | 97/2304 [33:12<13:01:18, 21.24s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0863


  4%|▍         | 98/2304 [33:27<11:56:31, 19.49s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0881


  4%|▍         | 99/2304 [33:42<11:09:37, 18.22s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0762


  4%|▍         | 100/2304 [34:07<12:13:39, 19.97s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0865


  4%|▍         | 101/2304 [34:30<12:57:30, 21.18s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0899


  4%|▍         | 102/2304 [34:55<13:39:13, 22.32s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0808


  4%|▍         | 103/2304 [35:11<12:27:22, 20.37s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0774


  5%|▍         | 104/2304 [35:27<11:36:34, 19.00s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0803


  5%|▍         | 105/2304 [35:43<11:03:04, 18.09s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0796


  5%|▍         | 106/2304 [36:08<12:15:06, 20.07s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0839


  5%|▍         | 107/2304 [36:32<12:58:50, 21.27s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0799


  5%|▍         | 108/2304 [36:57<13:37:22, 22.33s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0902


  5%|▍         | 109/2304 [37:13<12:26:22, 20.40s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0756


  5%|▍         | 110/2304 [37:28<11:33:08, 18.96s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0776


  5%|▍         | 111/2304 [37:44<10:55:09, 17.92s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0908


  5%|▍         | 112/2304 [38:08<12:06:28, 19.89s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0632


  5%|▍         | 113/2304 [38:33<12:58:34, 21.32s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0832


  5%|▍         | 114/2304 [38:58<13:43:01, 22.55s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0839


  5%|▍         | 115/2304 [39:14<12:33:04, 20.64s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0665


  5%|▌         | 116/2304 [39:30<11:38:42, 19.16s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0803


  5%|▌         | 117/2304 [39:46<11:04:59, 18.24s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0793


  5%|▌         | 118/2304 [40:12<12:27:51, 20.53s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1005


  5%|▌         | 119/2304 [40:38<13:22:19, 22.03s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0836


  5%|▌         | 120/2304 [41:03<14:00:09, 23.08s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0839


  5%|▌         | 121/2304 [41:20<12:48:10, 21.11s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0779


  5%|▌         | 122/2304 [41:36<11:52:34, 19.59s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0840


  5%|▌         | 123/2304 [41:53<11:25:33, 18.86s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0698


  5%|▌         | 124/2304 [42:19<12:48:39, 21.16s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0908


  5%|▌         | 125/2304 [42:45<13:39:00, 22.55s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0797


  5%|▌         | 126/2304 [43:12<14:23:47, 23.80s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0891


  6%|▌         | 127/2304 [43:29<13:06:46, 21.68s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0845


  6%|▌         | 128/2304 [43:45<12:08:44, 20.09s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0865


  6%|▌         | 129/2304 [44:01<11:27:09, 18.96s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0890


  6%|▌         | 130/2304 [44:27<12:35:52, 20.86s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0859


  6%|▌         | 131/2304 [44:50<13:06:57, 21.73s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0805


  6%|▌         | 132/2304 [45:15<13:34:42, 22.51s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0791


  6%|▌         | 133/2304 [45:30<12:21:17, 20.49s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0835


  6%|▌         | 134/2304 [45:46<11:26:28, 18.98s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0831


  6%|▌         | 135/2304 [46:02<10:53:48, 18.09s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0870


  6%|▌         | 136/2304 [46:26<12:03:30, 20.02s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0876


  6%|▌         | 137/2304 [46:50<12:45:32, 21.20s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0926


  6%|▌         | 138/2304 [47:15<13:24:57, 22.30s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0867


  6%|▌         | 139/2304 [47:31<12:12:12, 20.29s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0761


  6%|▌         | 140/2304 [47:47<11:22:09, 18.91s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0824


  6%|▌         | 141/2304 [48:03<10:50:57, 18.06s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0958


  6%|▌         | 142/2304 [48:27<12:02:22, 20.05s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0692


  6%|▌         | 143/2304 [48:51<12:46:05, 21.27s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0869


  6%|▋         | 144/2304 [49:16<13:19:20, 22.20s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0861


  6%|▋         | 145/2304 [49:31<12:06:39, 20.19s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0961


  6%|▋         | 146/2304 [49:47<11:18:32, 18.87s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0866


  6%|▋         | 147/2304 [50:03<10:44:19, 17.92s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0781


  6%|▋         | 148/2304 [50:27<11:56:01, 19.93s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0734


  6%|▋         | 149/2304 [50:51<12:38:54, 21.13s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0745


  7%|▋         | 150/2304 [51:16<13:13:23, 22.10s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0764


  7%|▋         | 151/2304 [51:32<12:05:51, 20.23s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0946


  7%|▋         | 152/2304 [51:47<11:15:15, 18.83s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0783


  7%|▋         | 153/2304 [52:03<10:43:20, 17.95s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0747


  7%|▋         | 154/2304 [52:27<11:53:09, 19.90s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0819


  7%|▋         | 155/2304 [52:51<12:28:58, 20.91s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0854


  7%|▋         | 156/2304 [53:15<13:00:38, 21.81s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0773


  7%|▋         | 157/2304 [53:30<11:54:23, 19.96s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0687


  7%|▋         | 158/2304 [53:45<11:02:37, 18.53s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0777


  7%|▋         | 159/2304 [54:01<10:28:31, 17.58s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0937


  7%|▋         | 160/2304 [54:25<11:41:23, 19.63s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0758


  7%|▋         | 161/2304 [54:49<12:20:57, 20.75s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0791


  7%|▋         | 162/2304 [55:13<12:58:59, 21.82s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0889


  7%|▋         | 163/2304 [55:29<11:55:48, 20.06s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0726


  7%|▋         | 164/2304 [55:45<11:08:08, 18.73s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0753


  7%|▋         | 165/2304 [56:00<10:34:55, 17.81s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0811


  7%|▋         | 166/2304 [56:24<11:43:29, 19.74s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0758


  7%|▋         | 167/2304 [56:47<12:16:14, 20.67s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0783


  7%|▋         | 168/2304 [57:11<12:44:03, 21.46s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0690


  7%|▋         | 169/2304 [57:26<11:38:38, 19.63s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0810


  7%|▋         | 170/2304 [57:41<10:48:44, 18.24s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0817


  7%|▋         | 171/2304 [57:56<10:16:34, 17.34s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0857


  7%|▋         | 172/2304 [58:19<11:18:44, 19.10s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0843


  8%|▊         | 173/2304 [58:42<11:59:00, 20.24s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0811


  8%|▊         | 174/2304 [59:06<12:34:01, 21.24s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0671


  8%|▊         | 175/2304 [59:21<11:31:31, 19.49s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0805


  8%|▊         | 176/2304 [59:36<10:42:08, 18.11s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0878


  8%|▊         | 177/2304 [59:52<10:14:11, 17.33s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0725


  8%|▊         | 178/2304 [1:00:15<11:21:16, 19.23s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0755


  8%|▊         | 179/2304 [1:00:38<12:01:51, 20.38s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0863


  8%|▊         | 180/2304 [1:01:02<12:36:33, 21.37s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0805


  8%|▊         | 181/2304 [1:01:17<11:30:48, 19.52s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0887


  8%|▊         | 182/2304 [1:01:32<10:42:50, 18.18s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0739


  8%|▊         | 183/2304 [1:01:48<10:11:33, 17.30s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0947


  8%|▊         | 184/2304 [1:02:11<11:15:18, 19.11s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0735


  8%|▊         | 185/2304 [1:02:34<11:52:37, 20.18s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0845


  8%|▊         | 186/2304 [1:02:57<12:22:46, 21.04s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0776


  8%|▊         | 187/2304 [1:03:12<11:20:25, 19.28s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0925


  8%|▊         | 188/2304 [1:03:26<10:29:24, 17.85s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0834


  8%|▊         | 189/2304 [1:03:42<10:01:30, 17.06s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0817


  8%|▊         | 190/2304 [1:04:05<11:04:48, 18.87s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0751


  8%|▊         | 191/2304 [1:04:28<11:46:55, 20.07s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0773


  8%|▊         | 192/2304 [1:04:51<12:23:34, 21.12s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0888


  8%|▊         | 193/2304 [1:05:06<11:22:17, 19.39s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0845


  8%|▊         | 194/2304 [1:05:21<10:31:15, 17.95s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0762


  8%|▊         | 195/2304 [1:05:36<10:00:43, 17.09s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0867


  9%|▊         | 196/2304 [1:05:59<11:05:47, 18.95s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0786


  9%|▊         | 197/2304 [1:06:22<11:44:37, 20.07s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0808


  9%|▊         | 198/2304 [1:06:46<12:22:16, 21.15s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0870


  9%|▊         | 199/2304 [1:07:01<11:18:11, 19.33s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0679


  9%|▊         | 200/2304 [1:07:16<10:33:49, 18.07s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0782


  9%|▊         | 201/2304 [1:07:31<10:03:26, 17.22s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0757


  9%|▉         | 202/2304 [1:07:54<11:04:47, 18.98s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0797


  9%|▉         | 203/2304 [1:08:17<11:48:01, 20.22s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0697


  9%|▉         | 204/2304 [1:08:41<12:27:01, 21.34s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0750


  9%|▉         | 205/2304 [1:08:57<11:22:08, 19.50s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1431


  9%|▉         | 206/2304 [1:09:11<10:33:43, 18.12s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0757


  9%|▉         | 207/2304 [1:09:26<10:00:19, 17.18s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0852


  9%|▉         | 208/2304 [1:09:50<11:05:54, 19.06s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0703


  9%|▉         | 209/2304 [1:10:12<11:42:07, 20.11s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0803


  9%|▉         | 210/2304 [1:10:35<12:11:40, 20.97s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0896


  9%|▉         | 211/2304 [1:10:50<11:08:39, 19.17s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0711


  9%|▉         | 212/2304 [1:11:05<10:21:41, 17.83s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0817


  9%|▉         | 213/2304 [1:11:20<9:53:34, 17.03s/it] 

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0816


  9%|▉         | 214/2304 [1:11:43<10:55:28, 18.82s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0725


  9%|▉         | 215/2304 [1:12:06<11:31:51, 19.87s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0872


  9%|▉         | 216/2304 [1:12:29<12:05:01, 20.83s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0718


  9%|▉         | 217/2304 [1:12:44<11:04:17, 19.10s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0847


  9%|▉         | 218/2304 [1:12:58<10:18:35, 17.79s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0724


 10%|▉         | 219/2304 [1:13:14<9:50:24, 16.99s/it] 

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0928


 10%|▉         | 220/2304 [1:13:37<10:58:05, 18.95s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0762


 10%|▉         | 221/2304 [1:14:00<11:38:56, 20.13s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0711


 10%|▉         | 222/2304 [1:14:24<12:13:53, 21.15s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0676


 10%|▉         | 223/2304 [1:14:39<11:13:44, 19.43s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0900


 10%|▉         | 224/2304 [1:14:54<10:25:30, 18.04s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0775


 10%|▉         | 225/2304 [1:15:09<9:54:41, 17.16s/it] 

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0669


 10%|▉         | 226/2304 [1:15:32<10:58:26, 19.01s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0800


 10%|▉         | 227/2304 [1:15:55<11:35:24, 20.09s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0768


 10%|▉         | 228/2304 [1:16:18<12:08:30, 21.05s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0845


 10%|▉         | 229/2304 [1:16:33<11:04:56, 19.23s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0867


 10%|▉         | 230/2304 [1:16:48<10:17:42, 17.87s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0701


 10%|█         | 231/2304 [1:17:03<9:49:12, 17.05s/it] 

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1024


 10%|█         | 232/2304 [1:17:26<10:55:06, 18.97s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0808


 10%|█         | 233/2304 [1:17:49<11:34:00, 20.11s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0880


 10%|█         | 234/2304 [1:18:12<12:06:42, 21.06s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0774


 10%|█         | 235/2304 [1:18:28<11:06:52, 19.34s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0870


 10%|█         | 236/2304 [1:18:43<10:19:33, 17.98s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0882


 10%|█         | 237/2304 [1:18:57<9:47:51, 17.06s/it] 

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0734


 10%|█         | 238/2304 [1:19:21<10:50:10, 18.88s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0750


 10%|█         | 239/2304 [1:19:43<11:28:59, 20.02s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0898


 10%|█         | 240/2304 [1:20:07<12:04:53, 21.07s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0739


 10%|█         | 241/2304 [1:20:22<11:03:28, 19.30s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0759


 11%|█         | 242/2304 [1:20:37<10:15:44, 17.92s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0783


 11%|█         | 243/2304 [1:20:52<9:46:13, 17.07s/it] 

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0835


 11%|█         | 244/2304 [1:21:15<10:47:43, 18.87s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0674


 11%|█         | 245/2304 [1:21:37<11:25:24, 19.97s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0664


 11%|█         | 246/2304 [1:22:01<12:00:42, 21.01s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0677


 11%|█         | 247/2304 [1:22:16<11:00:31, 19.27s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0694


 11%|█         | 248/2304 [1:22:30<10:11:12, 17.84s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0798


 11%|█         | 249/2304 [1:22:45<9:41:22, 16.97s/it] 

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0720


 11%|█         | 250/2304 [1:23:09<10:45:46, 18.86s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0679


 11%|█         | 251/2304 [1:23:32<11:26:47, 20.07s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0700


 11%|█         | 252/2304 [1:23:55<12:00:37, 21.07s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0793


 11%|█         | 253/2304 [1:24:10<11:00:08, 19.31s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0885


 11%|█         | 254/2304 [1:24:25<10:12:13, 17.92s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0825


 11%|█         | 255/2304 [1:24:40<9:43:04, 17.07s/it] 

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0749


 11%|█         | 256/2304 [1:25:03<10:45:43, 18.92s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0915


 11%|█         | 257/2304 [1:25:26<11:22:10, 20.00s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0808


 11%|█         | 258/2304 [1:25:49<11:53:29, 20.92s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0805


 11%|█         | 259/2304 [1:26:04<10:54:00, 19.19s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0736


 11%|█▏        | 260/2304 [1:26:19<10:06:37, 17.81s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0764


 11%|█▏        | 261/2304 [1:26:34<9:37:42, 16.97s/it] 

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0847


 11%|█▏        | 262/2304 [1:26:57<10:40:38, 18.82s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0714


 11%|█▏        | 263/2304 [1:27:20<11:24:36, 20.13s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0851


 11%|█▏        | 264/2304 [1:27:43<11:54:13, 21.01s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0837


 12%|█▏        | 265/2304 [1:27:58<10:55:53, 19.30s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0841


 12%|█▏        | 266/2304 [1:28:13<10:10:42, 17.98s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0940


 12%|█▏        | 267/2304 [1:28:28<9:40:47, 17.11s/it] 

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0948


 12%|█▏        | 268/2304 [1:28:51<10:41:05, 18.89s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0676


 12%|█▏        | 269/2304 [1:29:14<11:20:30, 20.06s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0787


 12%|█▏        | 270/2304 [1:29:37<11:52:12, 21.01s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0677


 12%|█▏        | 271/2304 [1:29:53<10:53:38, 19.29s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0730


 12%|█▏        | 272/2304 [1:30:08<10:09:20, 17.99s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0834


 12%|█▏        | 273/2304 [1:30:23<9:40:52, 17.16s/it] 

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0757


 12%|█▏        | 274/2304 [1:30:46<10:40:11, 18.92s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0743


 12%|█▏        | 275/2304 [1:31:09<11:18:40, 20.07s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0687


 12%|█▏        | 276/2304 [1:31:32<11:51:10, 21.04s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0770


 12%|█▏        | 277/2304 [1:31:47<10:51:29, 19.28s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0715


 12%|█▏        | 278/2304 [1:32:02<10:03:19, 17.87s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0728


 12%|█▏        | 279/2304 [1:32:17<9:35:46, 17.06s/it] 

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0851


 12%|█▏        | 280/2304 [1:32:40<10:40:39, 18.99s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0897


 12%|█▏        | 281/2304 [1:33:03<11:16:08, 20.05s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0756


 12%|█▏        | 282/2304 [1:33:26<11:47:58, 21.01s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0750


 12%|█▏        | 283/2304 [1:33:41<10:49:00, 19.27s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0826


 12%|█▏        | 284/2304 [1:33:56<10:03:13, 17.92s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0787


 12%|█▏        | 285/2304 [1:34:11<9:36:11, 17.12s/it] 

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0850


 12%|█▏        | 286/2304 [1:34:34<10:36:29, 18.92s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0904


 12%|█▏        | 287/2304 [1:34:57<11:15:16, 20.09s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0781


 12%|█▎        | 288/2304 [1:35:20<11:44:59, 20.98s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0700


 13%|█▎        | 289/2304 [1:35:28<9:33:01, 17.06s/it] 

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0802


 13%|█▎        | 290/2304 [1:35:36<7:58:43, 14.26s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0744


 13%|█▎        | 291/2304 [1:35:44<6:55:59, 12.40s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0851


 13%|█▎        | 292/2304 [1:35:56<6:51:31, 12.27s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0732


 13%|█▎        | 293/2304 [1:36:08<6:47:19, 12.15s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0729


 13%|█▎        | 294/2304 [1:36:20<6:47:59, 12.18s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0697


 13%|█▎        | 295/2304 [1:36:28<6:05:54, 10.93s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0752


 13%|█▎        | 296/2304 [1:36:36<5:35:58, 10.04s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0850


 13%|█▎        | 297/2304 [1:36:44<5:16:42,  9.47s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0686


 13%|█▎        | 298/2304 [1:36:56<5:43:19, 10.27s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0827


 13%|█▎        | 299/2304 [1:37:08<6:00:01, 10.77s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0780


 13%|█▎        | 300/2304 [1:37:20<6:12:59, 11.17s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0815


 13%|█▎        | 301/2304 [1:37:28<5:40:25, 10.20s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0682


 13%|█▎        | 302/2304 [1:37:36<5:18:15,  9.54s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0657


 13%|█▎        | 303/2304 [1:37:44<5:04:56,  9.14s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0715


 13%|█▎        | 304/2304 [1:37:57<5:35:40, 10.07s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1365


 13%|█▎        | 305/2304 [1:38:09<5:54:42, 10.65s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0674


 13%|█▎        | 306/2304 [1:38:21<6:09:57, 11.11s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0977


 13%|█▎        | 307/2304 [1:38:29<5:40:04, 10.22s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0691


 13%|█▎        | 308/2304 [1:38:37<5:17:26,  9.54s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0697


 13%|█▎        | 309/2304 [1:38:45<5:02:15,  9.09s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0724


 13%|█▎        | 310/2304 [1:38:57<5:31:43,  9.98s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0767


 13%|█▎        | 311/2304 [1:39:09<5:49:15, 10.51s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0671


 14%|█▎        | 312/2304 [1:39:21<6:04:45, 10.99s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0729


 14%|█▎        | 313/2304 [1:39:29<5:36:16, 10.13s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0794


 14%|█▎        | 314/2304 [1:39:37<5:13:09,  9.44s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0854


 14%|█▎        | 315/2304 [1:39:45<4:59:33,  9.04s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0781


 14%|█▎        | 316/2304 [1:39:57<5:30:14,  9.97s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1255


 14%|█▍        | 317/2304 [1:40:09<5:49:12, 10.54s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0963


 14%|█▍        | 318/2304 [1:40:21<6:05:41, 11.05s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0762


 14%|█▍        | 319/2304 [1:40:29<5:37:11, 10.19s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0737


 14%|█▍        | 320/2304 [1:40:37<5:14:49,  9.52s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0719


 14%|█▍        | 321/2304 [1:40:46<5:01:13,  9.11s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0779


 14%|█▍        | 322/2304 [1:40:58<5:30:19, 10.00s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0855


 14%|█▍        | 323/2304 [1:41:10<5:50:30, 10.62s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0717


 14%|█▍        | 324/2304 [1:41:22<6:07:41, 11.14s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0708


 14%|█▍        | 325/2304 [1:41:30<5:37:31, 10.23s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0735


 14%|█▍        | 326/2304 [1:41:38<5:15:33,  9.57s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0797


 14%|█▍        | 327/2304 [1:41:46<5:00:21,  9.12s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0819


 14%|█▍        | 328/2304 [1:41:58<5:28:42,  9.98s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0853


 14%|█▍        | 329/2304 [1:42:10<5:48:08, 10.58s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0702


 14%|█▍        | 330/2304 [1:42:22<6:03:56, 11.06s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0835


 14%|█▍        | 331/2304 [1:42:31<5:35:31, 10.20s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0888


 14%|█▍        | 332/2304 [1:42:39<5:14:08,  9.56s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0733


 14%|█▍        | 333/2304 [1:42:47<4:58:52,  9.10s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0749


 14%|█▍        | 334/2304 [1:42:59<5:30:17, 10.06s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0724


 15%|█▍        | 335/2304 [1:43:11<5:48:16, 10.61s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0766


 15%|█▍        | 336/2304 [1:43:23<6:04:56, 11.13s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0819


 15%|█▍        | 337/2304 [1:43:31<5:34:27, 10.20s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0777


 15%|█▍        | 338/2304 [1:43:39<5:12:19,  9.53s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0817


 15%|█▍        | 339/2304 [1:43:47<4:58:30,  9.11s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0823


 15%|█▍        | 340/2304 [1:44:00<5:28:56, 10.05s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1134


 15%|█▍        | 341/2304 [1:44:12<5:47:07, 10.61s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0722


 15%|█▍        | 342/2304 [1:44:24<6:02:44, 11.09s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0765


 15%|█▍        | 343/2304 [1:44:32<5:32:18, 10.17s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0750


 15%|█▍        | 344/2304 [1:44:40<5:09:53,  9.49s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0725


 15%|█▍        | 345/2304 [1:44:48<4:56:30,  9.08s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0696


 15%|█▌        | 346/2304 [1:45:00<5:25:17,  9.97s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0830


 15%|█▌        | 347/2304 [1:45:12<5:43:42, 10.54s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0735


 15%|█▌        | 348/2304 [1:45:24<5:59:55, 11.04s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0751


 15%|█▌        | 349/2304 [1:45:32<5:29:39, 10.12s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0828


 15%|█▌        | 350/2304 [1:45:40<5:08:57,  9.49s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0770


 15%|█▌        | 351/2304 [1:45:48<4:55:52,  9.09s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0785


 15%|█▌        | 352/2304 [1:46:00<5:25:25, 10.00s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0762


 15%|█▌        | 353/2304 [1:46:12<5:42:27, 10.53s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0700


 15%|█▌        | 354/2304 [1:46:24<5:59:25, 11.06s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0766


 15%|█▌        | 355/2304 [1:46:32<5:30:39, 10.18s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0814


 15%|█▌        | 356/2304 [1:46:40<5:08:00,  9.49s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0719


 15%|█▌        | 357/2304 [1:46:48<4:52:57,  9.03s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0725


 16%|█▌        | 358/2304 [1:47:00<5:23:28,  9.97s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0782


 16%|█▌        | 359/2304 [1:47:12<5:43:16, 10.59s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0734


 16%|█▌        | 360/2304 [1:47:25<5:58:23, 11.06s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0804


 16%|█▌        | 361/2304 [1:47:33<5:28:51, 10.16s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0837


 16%|█▌        | 362/2304 [1:47:41<5:07:09,  9.49s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0707


 16%|█▌        | 363/2304 [1:47:49<4:54:30,  9.10s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0784


 16%|█▌        | 364/2304 [1:48:01<5:26:49, 10.11s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0928


 16%|█▌        | 365/2304 [1:48:13<5:46:07, 10.71s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0751


 16%|█▌        | 366/2304 [1:48:25<6:00:16, 11.15s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0854


 16%|█▌        | 367/2304 [1:48:34<5:31:34, 10.27s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0872


 16%|█▌        | 368/2304 [1:48:42<5:08:24,  9.56s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0675


 16%|█▌        | 369/2304 [1:48:50<4:54:20,  9.13s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0769


 16%|█▌        | 370/2304 [1:49:02<5:24:45, 10.08s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0776


 16%|█▌        | 371/2304 [1:49:14<5:43:13, 10.65s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0748


 16%|█▌        | 372/2304 [1:49:26<5:58:51, 11.14s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0757


 16%|█▌        | 373/2304 [1:49:34<5:28:29, 10.21s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0794


 16%|█▌        | 374/2304 [1:49:42<5:05:08,  9.49s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0855


 16%|█▋        | 375/2304 [1:49:50<4:52:43,  9.10s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0827


 16%|█▋        | 376/2304 [1:50:03<5:22:35, 10.04s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0769


 16%|█▋        | 377/2304 [1:50:15<5:41:51, 10.64s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0780


 16%|█▋        | 378/2304 [1:50:27<5:55:22, 11.07s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1097


 16%|█▋        | 379/2304 [1:50:35<5:26:13, 10.17s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0720


 16%|█▋        | 380/2304 [1:50:43<5:04:38,  9.50s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0718


 17%|█▋        | 381/2304 [1:50:51<4:51:23,  9.09s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0810


 17%|█▋        | 382/2304 [1:51:03<5:20:37, 10.01s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0782


 17%|█▋        | 383/2304 [1:51:15<5:39:57, 10.62s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0750


 17%|█▋        | 384/2304 [1:51:27<5:54:09, 11.07s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0701


 17%|█▋        | 385/2304 [1:51:35<5:25:57, 10.19s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0775


 17%|█▋        | 386/2304 [1:51:43<5:03:52,  9.51s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0826


 17%|█▋        | 387/2304 [1:51:51<4:49:49,  9.07s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0858


 17%|█▋        | 388/2304 [1:52:03<5:18:00,  9.96s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0816


 17%|█▋        | 389/2304 [1:52:15<5:36:53, 10.56s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0732


 17%|█▋        | 390/2304 [1:52:27<5:52:02, 11.04s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0758


 17%|█▋        | 391/2304 [1:52:35<5:23:00, 10.13s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0700


 17%|█▋        | 392/2304 [1:52:43<5:00:30,  9.43s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0817


 17%|█▋        | 393/2304 [1:52:51<4:48:04,  9.04s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0724


 17%|█▋        | 394/2304 [1:53:03<5:17:43,  9.98s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0714


 17%|█▋        | 395/2304 [1:53:15<5:34:52, 10.53s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0743


 17%|█▋        | 396/2304 [1:53:27<5:50:16, 11.01s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0788


 17%|█▋        | 397/2304 [1:53:36<5:23:44, 10.19s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0781


 17%|█▋        | 398/2304 [1:53:44<5:03:03,  9.54s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0834


 17%|█▋        | 399/2304 [1:53:52<4:48:19,  9.08s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0781


 17%|█▋        | 400/2304 [1:54:04<5:15:42,  9.95s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0707


 17%|█▋        | 401/2304 [1:54:16<5:33:22, 10.51s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0791


 17%|█▋        | 402/2304 [1:54:28<5:48:08, 10.98s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0743


 17%|█▋        | 403/2304 [1:54:36<5:19:35, 10.09s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0891


 18%|█▊        | 404/2304 [1:54:43<4:57:21,  9.39s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0748


 18%|█▊        | 405/2304 [1:54:51<4:43:53,  8.97s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0745


 18%|█▊        | 406/2304 [1:55:04<5:13:58,  9.93s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0749


 18%|█▊        | 407/2304 [1:55:15<5:31:11, 10.48s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0789


 18%|█▊        | 408/2304 [1:55:27<5:47:04, 10.98s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0814


 18%|█▊        | 409/2304 [1:55:36<5:20:52, 10.16s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0855


 18%|█▊        | 410/2304 [1:55:44<5:02:12,  9.57s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0683


 18%|█▊        | 411/2304 [1:55:52<4:49:08,  9.16s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0827


 18%|█▊        | 412/2304 [1:56:04<5:17:52, 10.08s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0775


 18%|█▊        | 413/2304 [1:56:16<5:37:06, 10.70s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0834


 18%|█▊        | 414/2304 [1:56:29<5:53:20, 11.22s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0916


 18%|█▊        | 415/2304 [1:56:37<5:25:26, 10.34s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0760


 18%|█▊        | 416/2304 [1:56:45<5:04:36,  9.68s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0798


 18%|█▊        | 417/2304 [1:56:53<4:49:35,  9.21s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0837


 18%|█▊        | 418/2304 [1:57:06<5:17:44, 10.11s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0706


 18%|█▊        | 419/2304 [1:57:18<5:36:29, 10.71s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0821


 18%|█▊        | 420/2304 [1:57:30<5:49:57, 11.15s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0838


 18%|█▊        | 421/2304 [1:57:38<5:19:59, 10.20s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0755


 18%|█▊        | 422/2304 [1:57:46<4:58:47,  9.53s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0891


 18%|█▊        | 423/2304 [1:57:54<4:45:36,  9.11s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0886


 18%|█▊        | 424/2304 [1:58:06<5:15:06, 10.06s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0825


 18%|█▊        | 425/2304 [1:58:18<5:33:56, 10.66s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0776


 18%|█▊        | 426/2304 [1:58:31<5:50:38, 11.20s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0792


 19%|█▊        | 427/2304 [1:58:39<5:21:49, 10.29s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0769


 19%|█▊        | 428/2304 [1:58:47<4:59:46,  9.59s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0738


 19%|█▊        | 429/2304 [1:58:55<4:44:58,  9.12s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0808


 19%|█▊        | 430/2304 [1:59:07<5:13:51, 10.05s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0764


 19%|█▊        | 431/2304 [1:59:19<5:30:31, 10.59s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0822


 19%|█▉        | 432/2304 [1:59:31<5:46:07, 11.09s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0777


 19%|█▉        | 433/2304 [1:59:39<5:18:08, 10.20s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0800


 19%|█▉        | 434/2304 [1:59:47<4:56:45,  9.52s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0763


 19%|█▉        | 435/2304 [1:59:55<4:41:55,  9.05s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0788


 19%|█▉        | 436/2304 [2:00:07<5:11:20, 10.00s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0771


 19%|█▉        | 437/2304 [2:00:19<5:27:17, 10.52s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0666


 19%|█▉        | 438/2304 [2:00:31<5:43:27, 11.04s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0831


 19%|█▉        | 439/2304 [2:00:40<5:16:02, 10.17s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0793


 19%|█▉        | 440/2304 [2:00:48<4:55:02,  9.50s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0778


 19%|█▉        | 441/2304 [2:00:56<4:40:46,  9.04s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0882


 19%|█▉        | 442/2304 [2:01:08<5:09:07,  9.96s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0850


 19%|█▉        | 443/2304 [2:01:19<5:25:53, 10.51s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0789


 19%|█▉        | 444/2304 [2:01:32<5:41:26, 11.01s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0923


 19%|█▉        | 445/2304 [2:01:40<5:13:16, 10.11s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0784


 19%|█▉        | 446/2304 [2:01:47<4:52:18,  9.44s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0759


 19%|█▉        | 447/2304 [2:01:55<4:38:36,  9.00s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0712


 19%|█▉        | 448/2304 [2:02:08<5:08:09,  9.96s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0914


 19%|█▉        | 449/2304 [2:02:20<5:26:57, 10.58s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0805


 20%|█▉        | 450/2304 [2:02:32<5:41:53, 11.06s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0861


 20%|█▉        | 451/2304 [2:02:40<5:14:16, 10.18s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0741


 20%|█▉        | 452/2304 [2:02:48<4:52:23,  9.47s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0772


 20%|█▉        | 453/2304 [2:02:56<4:38:24,  9.02s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0708


 20%|█▉        | 454/2304 [2:03:08<5:06:32,  9.94s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0794


 20%|█▉        | 455/2304 [2:03:20<5:25:09, 10.55s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0732


 20%|█▉        | 456/2304 [2:03:32<5:41:55, 11.10s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0701


 20%|█▉        | 457/2304 [2:03:40<5:15:03, 10.23s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0735


 20%|█▉        | 458/2304 [2:03:48<4:53:09,  9.53s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0821


 20%|█▉        | 459/2304 [2:03:56<4:39:47,  9.10s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0839


 20%|█▉        | 460/2304 [2:04:09<5:08:53, 10.05s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0766


 20%|██        | 461/2304 [2:04:21<5:27:07, 10.65s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0756


 20%|██        | 462/2304 [2:04:33<5:44:04, 11.21s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0979


 20%|██        | 463/2304 [2:04:42<5:16:56, 10.33s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0936


 20%|██        | 464/2304 [2:04:49<4:54:50,  9.61s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0832


 20%|██        | 465/2304 [2:04:58<4:42:02,  9.20s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0920


 20%|██        | 466/2304 [2:05:10<5:10:57, 10.15s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0920


 20%|██        | 467/2304 [2:05:22<5:28:09, 10.72s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0653


 20%|██        | 468/2304 [2:05:34<5:42:51, 11.20s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0791


 20%|██        | 469/2304 [2:05:43<5:15:41, 10.32s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0944


 20%|██        | 470/2304 [2:05:51<4:53:28,  9.60s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0830


 20%|██        | 471/2304 [2:05:59<4:39:30,  9.15s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0800


 20%|██        | 472/2304 [2:06:11<5:08:06, 10.09s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0872


 21%|██        | 473/2304 [2:06:23<5:26:19, 10.69s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0776


 21%|██        | 474/2304 [2:06:35<5:41:27, 11.20s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0799


 21%|██        | 475/2304 [2:06:44<5:12:14, 10.24s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0839


 21%|██        | 476/2304 [2:06:51<4:50:07,  9.52s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0751


 21%|██        | 477/2304 [2:07:00<4:37:55,  9.13s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0920


 21%|██        | 478/2304 [2:07:12<5:07:51, 10.12s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0821


 21%|██        | 479/2304 [2:07:24<5:25:13, 10.69s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0801


 21%|██        | 480/2304 [2:07:36<5:39:10, 11.16s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0692


 21%|██        | 481/2304 [2:07:44<5:11:26, 10.25s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0848


 21%|██        | 482/2304 [2:07:52<4:50:19,  9.56s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0755


 21%|██        | 483/2304 [2:08:01<4:37:56,  9.16s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0789


 21%|██        | 484/2304 [2:08:13<5:04:19, 10.03s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0744


 21%|██        | 485/2304 [2:08:25<5:21:41, 10.61s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0721


 21%|██        | 486/2304 [2:08:37<5:36:03, 11.09s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0830


 21%|██        | 487/2304 [2:08:45<5:08:39, 10.19s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0772


 21%|██        | 488/2304 [2:08:53<4:46:26,  9.46s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0822


 21%|██        | 489/2304 [2:09:01<4:33:05,  9.03s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0714


 21%|██▏       | 490/2304 [2:09:13<5:00:22,  9.94s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0785


 21%|██▏       | 491/2304 [2:09:25<5:17:01, 10.49s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0716


 21%|██▏       | 492/2304 [2:09:37<5:32:59, 11.03s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0819


 21%|██▏       | 493/2304 [2:09:45<5:05:26, 10.12s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0713


 21%|██▏       | 494/2304 [2:09:53<4:44:07,  9.42s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0803


 21%|██▏       | 495/2304 [2:10:01<4:32:03,  9.02s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0814


 22%|██▏       | 496/2304 [2:10:13<4:59:47,  9.95s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1008


 22%|██▏       | 497/2304 [2:10:25<5:16:28, 10.51s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0869


 22%|██▏       | 498/2304 [2:10:37<5:31:14, 11.00s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0761


 22%|██▏       | 499/2304 [2:10:45<5:05:31, 10.16s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0856


 22%|██▏       | 500/2304 [2:10:53<4:45:29,  9.50s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0823


 22%|██▏       | 501/2304 [2:11:01<4:31:42,  9.04s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0983


 22%|██▏       | 502/2304 [2:11:13<5:00:41, 10.01s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0699


 22%|██▏       | 503/2304 [2:11:25<5:15:30, 10.51s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0691


 22%|██▏       | 504/2304 [2:11:37<5:31:24, 11.05s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0924


 22%|██▏       | 505/2304 [2:11:45<5:04:29, 10.16s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0746


 22%|██▏       | 506/2304 [2:11:53<4:43:25,  9.46s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0790


 22%|██▏       | 507/2304 [2:12:01<4:29:38,  9.00s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0727


 22%|██▏       | 508/2304 [2:12:13<4:55:43,  9.88s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0746


 22%|██▏       | 509/2304 [2:12:25<5:12:09, 10.43s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0826


 22%|██▏       | 510/2304 [2:12:37<5:29:00, 11.00s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0744


 22%|██▏       | 511/2304 [2:12:45<5:01:29, 10.09s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0785


 22%|██▏       | 512/2304 [2:12:53<4:41:59,  9.44s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0887


 22%|██▏       | 513/2304 [2:13:01<4:30:11,  9.05s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0798


 22%|██▏       | 514/2304 [2:13:13<4:59:19, 10.03s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0849


 22%|██▏       | 515/2304 [2:13:25<5:16:10, 10.60s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0684


 22%|██▏       | 516/2304 [2:13:38<5:30:48, 11.10s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0673


 22%|██▏       | 517/2304 [2:13:46<5:03:21, 10.19s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0745


 22%|██▏       | 518/2304 [2:13:53<4:43:04,  9.51s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0952


 23%|██▎       | 519/2304 [2:14:02<4:30:59,  9.11s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0798


 23%|██▎       | 520/2304 [2:14:14<5:00:36, 10.11s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0794


 23%|██▎       | 521/2304 [2:14:26<5:15:52, 10.63s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0789


 23%|██▎       | 522/2304 [2:14:38<5:31:21, 11.16s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0780


 23%|██▎       | 523/2304 [2:14:46<5:04:08, 10.25s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0838


 23%|██▎       | 524/2304 [2:14:54<4:42:53,  9.54s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0772


 23%|██▎       | 525/2304 [2:15:02<4:29:30,  9.09s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0795


 23%|██▎       | 526/2304 [2:15:15<4:56:16, 10.00s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1113


 23%|██▎       | 527/2304 [2:15:27<5:13:47, 10.60s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0789


 23%|██▎       | 528/2304 [2:15:39<5:28:24, 11.09s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0692


 23%|██▎       | 529/2304 [2:15:47<5:01:43, 10.20s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0883


 23%|██▎       | 530/2304 [2:15:55<4:41:27,  9.52s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0836


 23%|██▎       | 531/2304 [2:16:03<4:28:37,  9.09s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0883


 23%|██▎       | 532/2304 [2:16:15<4:56:51, 10.05s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0672


 23%|██▎       | 533/2304 [2:16:27<5:14:04, 10.64s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0675


 23%|██▎       | 534/2304 [2:16:39<5:27:35, 11.10s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0856


 23%|██▎       | 535/2304 [2:16:47<5:00:18, 10.19s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0767


 23%|██▎       | 536/2304 [2:16:55<4:40:21,  9.51s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0804


 23%|██▎       | 537/2304 [2:17:04<4:28:15,  9.11s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0807


 23%|██▎       | 538/2304 [2:17:16<4:55:12, 10.03s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0766


 23%|██▎       | 539/2304 [2:17:27<5:10:07, 10.54s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0674


 23%|██▎       | 540/2304 [2:17:40<5:24:15, 11.03s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0855


 23%|██▎       | 541/2304 [2:17:48<4:57:58, 10.14s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0804


 24%|██▎       | 542/2304 [2:17:56<4:38:26,  9.48s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0813


 24%|██▎       | 543/2304 [2:18:04<4:26:17,  9.07s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0887


 24%|██▎       | 544/2304 [2:18:16<4:53:55, 10.02s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0791


 24%|██▎       | 545/2304 [2:18:28<5:11:19, 10.62s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0797


 24%|██▎       | 546/2304 [2:18:40<5:25:33, 11.11s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0807


 24%|██▎       | 547/2304 [2:18:48<4:59:15, 10.22s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0697


 24%|██▍       | 548/2304 [2:18:56<4:39:26,  9.55s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0850


 24%|██▍       | 549/2304 [2:19:04<4:26:32,  9.11s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0814


 24%|██▍       | 550/2304 [2:19:17<4:54:03, 10.06s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0828


 24%|██▍       | 551/2304 [2:19:29<5:09:53, 10.61s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0799


 24%|██▍       | 552/2304 [2:19:41<5:23:17, 11.07s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0839


 24%|██▍       | 553/2304 [2:19:49<4:56:09, 10.15s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0920


 24%|██▍       | 554/2304 [2:19:57<4:35:45,  9.45s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0855


 24%|██▍       | 555/2304 [2:20:05<4:24:02,  9.06s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0718


 24%|██▍       | 556/2304 [2:20:17<4:49:43,  9.94s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0675


 24%|██▍       | 557/2304 [2:20:29<5:06:47, 10.54s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0672


 24%|██▍       | 558/2304 [2:20:41<5:22:10, 11.07s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0855


 24%|██▍       | 559/2304 [2:20:49<4:55:04, 10.15s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0754


 24%|██▍       | 560/2304 [2:20:57<4:35:41,  9.48s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0842


 24%|██▍       | 561/2304 [2:21:05<4:23:34,  9.07s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0767


 24%|██▍       | 562/2304 [2:21:17<4:50:27, 10.00s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0755


 24%|██▍       | 563/2304 [2:21:29<5:06:08, 10.55s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0674


 24%|██▍       | 564/2304 [2:21:41<5:21:13, 11.08s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0673


 25%|██▍       | 565/2304 [2:21:50<4:56:21, 10.23s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0748


 25%|██▍       | 566/2304 [2:21:58<4:36:29,  9.55s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0794


 25%|██▍       | 567/2304 [2:22:06<4:23:16,  9.09s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0775


 25%|██▍       | 568/2304 [2:22:18<4:49:25, 10.00s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0991


 25%|██▍       | 569/2304 [2:22:30<5:06:30, 10.60s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0817


 25%|██▍       | 570/2304 [2:22:42<5:21:24, 11.12s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0756


 25%|██▍       | 571/2304 [2:22:50<4:54:44, 10.20s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0720


 25%|██▍       | 572/2304 [2:22:58<4:35:10,  9.53s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0809


 25%|██▍       | 573/2304 [2:23:06<4:22:56,  9.11s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0848


 25%|██▍       | 574/2304 [2:23:19<4:50:18, 10.07s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0754


 25%|██▍       | 575/2304 [2:23:31<5:07:20, 10.67s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0837


 25%|██▌       | 576/2304 [2:23:43<5:20:00, 11.11s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1458


 25%|██▌       | 577/2304 [2:23:47<4:23:24,  9.15s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0737


 25%|██▌       | 578/2304 [2:23:52<3:42:18,  7.73s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0708


 25%|██▌       | 579/2304 [2:23:56<3:15:19,  6.79s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0923


 25%|██▌       | 580/2304 [2:24:03<3:15:08,  6.79s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0775


 25%|██▌       | 581/2304 [2:24:10<3:14:15,  6.76s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0796


 25%|██▌       | 582/2304 [2:24:17<3:14:28,  6.78s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0811


 25%|██▌       | 583/2304 [2:24:21<2:55:41,  6.13s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0730


 25%|██▌       | 584/2304 [2:24:26<2:42:26,  5.67s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0682


 25%|██▌       | 585/2304 [2:24:30<2:32:01,  5.31s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0691


 25%|██▌       | 586/2304 [2:24:37<2:44:04,  5.73s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1018


 25%|██▌       | 587/2304 [2:24:44<2:51:11,  5.98s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0680


 26%|██▌       | 588/2304 [2:24:50<2:56:18,  6.16s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0738


 26%|██▌       | 589/2304 [2:24:55<2:43:09,  5.71s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0721


 26%|██▌       | 590/2304 [2:24:59<2:32:24,  5.34s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0747


 26%|██▌       | 591/2304 [2:25:04<2:26:08,  5.12s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0739


 26%|██▌       | 592/2304 [2:25:11<2:41:25,  5.66s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0790


 26%|██▌       | 593/2304 [2:25:17<2:48:48,  5.92s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0691


 26%|██▌       | 594/2304 [2:25:24<2:56:44,  6.20s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0793


 26%|██▌       | 595/2304 [2:25:29<2:42:11,  5.69s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0730


 26%|██▌       | 596/2304 [2:25:33<2:31:18,  5.32s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0754


 26%|██▌       | 597/2304 [2:25:38<2:26:33,  5.15s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0733


 26%|██▌       | 598/2304 [2:25:45<2:39:53,  5.62s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0725


 26%|██▌       | 599/2304 [2:25:52<2:50:37,  6.00s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0713


 26%|██▌       | 600/2304 [2:25:59<3:00:56,  6.37s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0679


 26%|██▌       | 601/2304 [2:26:04<2:47:57,  5.92s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0845


 26%|██▌       | 602/2304 [2:26:08<2:37:50,  5.56s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0672


 26%|██▌       | 603/2304 [2:26:13<2:30:56,  5.32s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0871


 26%|██▌       | 604/2304 [2:26:20<2:43:38,  5.78s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0688


 26%|██▋       | 605/2304 [2:26:27<2:52:53,  6.11s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0723


 26%|██▋       | 606/2304 [2:26:34<3:00:04,  6.36s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0951


 26%|██▋       | 607/2304 [2:26:38<2:45:55,  5.87s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0800


 26%|██▋       | 608/2304 [2:26:43<2:37:17,  5.56s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0801


 26%|██▋       | 609/2304 [2:26:48<2:30:20,  5.32s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0686


 26%|██▋       | 610/2304 [2:26:55<2:44:43,  5.83s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0771


 27%|██▋       | 611/2304 [2:27:02<2:53:07,  6.14s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0674


 27%|██▋       | 612/2304 [2:27:09<2:59:04,  6.35s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0989


 27%|██▋       | 613/2304 [2:27:14<2:45:26,  5.87s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0708


 27%|██▋       | 614/2304 [2:27:18<2:34:49,  5.50s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0685


 27%|██▋       | 615/2304 [2:27:23<2:28:15,  5.27s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0791


 27%|██▋       | 616/2304 [2:27:30<2:42:28,  5.78s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0738


 27%|██▋       | 617/2304 [2:27:37<2:51:16,  6.09s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0747


 27%|██▋       | 618/2304 [2:27:44<2:59:36,  6.39s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0841


 27%|██▋       | 619/2304 [2:27:49<2:45:31,  5.89s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0712


 27%|██▋       | 620/2304 [2:27:53<2:34:45,  5.51s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0709


 27%|██▋       | 621/2304 [2:27:58<2:28:45,  5.30s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0801


 27%|██▋       | 622/2304 [2:28:05<2:40:56,  5.74s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0730


 27%|██▋       | 623/2304 [2:28:11<2:48:39,  6.02s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0687


 27%|██▋       | 624/2304 [2:28:18<2:56:07,  6.29s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0738


 27%|██▋       | 625/2304 [2:28:23<2:42:09,  5.79s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0735


 27%|██▋       | 626/2304 [2:28:28<2:32:56,  5.47s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0726


 27%|██▋       | 627/2304 [2:28:32<2:26:19,  5.24s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0688


 27%|██▋       | 628/2304 [2:28:39<2:38:31,  5.68s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0777


 27%|██▋       | 629/2304 [2:28:46<2:47:05,  5.99s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0667


 27%|██▋       | 630/2304 [2:28:53<2:53:23,  6.21s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0719


 27%|██▋       | 631/2304 [2:28:57<2:40:46,  5.77s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0782


 27%|██▋       | 632/2304 [2:29:02<2:30:11,  5.39s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0661


 27%|██▋       | 633/2304 [2:29:06<2:23:12,  5.14s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0693


 28%|██▊       | 634/2304 [2:29:13<2:38:47,  5.70s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0820


 28%|██▊       | 635/2304 [2:29:20<2:46:48,  6.00s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0669


 28%|██▊       | 636/2304 [2:29:27<2:52:38,  6.21s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0855


 28%|██▊       | 637/2304 [2:29:32<2:41:25,  5.81s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0740


 28%|██▊       | 638/2304 [2:29:36<2:31:44,  5.46s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0695


 28%|██▊       | 639/2304 [2:29:41<2:26:29,  5.28s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0703


 28%|██▊       | 640/2304 [2:29:48<2:39:37,  5.76s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0812


 28%|██▊       | 641/2304 [2:29:55<2:47:18,  6.04s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0675


 28%|██▊       | 642/2304 [2:30:02<2:55:23,  6.33s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1081


 28%|██▊       | 643/2304 [2:30:06<2:41:54,  5.85s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0737


 28%|██▊       | 644/2304 [2:30:11<2:31:28,  5.48s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0690


 28%|██▊       | 645/2304 [2:30:16<2:25:25,  5.26s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0768


 28%|██▊       | 646/2304 [2:30:23<2:37:44,  5.71s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0720


 28%|██▊       | 647/2304 [2:30:29<2:45:41,  6.00s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0678


 28%|██▊       | 648/2304 [2:30:36<2:52:28,  6.25s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0807


 28%|██▊       | 649/2304 [2:30:41<2:39:46,  5.79s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0763


 28%|██▊       | 650/2304 [2:30:46<2:31:18,  5.49s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0831


 28%|██▊       | 651/2304 [2:30:50<2:24:24,  5.24s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0759


 28%|██▊       | 652/2304 [2:30:57<2:36:04,  5.67s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0782


 28%|██▊       | 653/2304 [2:31:04<2:44:29,  5.98s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0667


 28%|██▊       | 654/2304 [2:31:10<2:50:43,  6.21s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0864


 28%|██▊       | 655/2304 [2:31:15<2:38:18,  5.76s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0716


 28%|██▊       | 656/2304 [2:31:19<2:27:26,  5.37s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0730


 29%|██▊       | 657/2304 [2:31:24<2:21:57,  5.17s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0716


 29%|██▊       | 658/2304 [2:31:31<2:36:24,  5.70s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0884


 29%|██▊       | 659/2304 [2:31:38<2:44:52,  6.01s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0679


 29%|██▊       | 660/2304 [2:31:45<2:52:04,  6.28s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1331


 29%|██▊       | 661/2304 [2:31:50<2:40:23,  5.86s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0675


 29%|██▊       | 662/2304 [2:31:54<2:30:40,  5.51s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0757


 29%|██▉       | 663/2304 [2:31:59<2:25:17,  5.31s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0794


 29%|██▉       | 664/2304 [2:32:06<2:37:26,  5.76s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0893


 29%|██▉       | 665/2304 [2:32:13<2:45:29,  6.06s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0703


 29%|██▉       | 666/2304 [2:32:20<2:53:36,  6.36s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0744


 29%|██▉       | 667/2304 [2:32:25<2:39:54,  5.86s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0719


 29%|██▉       | 668/2304 [2:32:29<2:28:39,  5.45s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0730


 29%|██▉       | 669/2304 [2:32:34<2:24:20,  5.30s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0734


 29%|██▉       | 670/2304 [2:32:41<2:36:47,  5.76s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0787


 29%|██▉       | 671/2304 [2:32:48<2:45:12,  6.07s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0695


 29%|██▉       | 672/2304 [2:32:54<2:51:09,  6.29s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0758


 29%|██▉       | 673/2304 [2:32:59<2:37:41,  5.80s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0826


 29%|██▉       | 674/2304 [2:33:04<2:28:53,  5.48s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0833


 29%|██▉       | 675/2304 [2:33:08<2:22:05,  5.23s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1123


 29%|██▉       | 676/2304 [2:33:15<2:35:10,  5.72s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0646


 29%|██▉       | 677/2304 [2:33:22<2:42:03,  5.98s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0671


 29%|██▉       | 678/2304 [2:33:28<2:46:45,  6.15s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0759


 29%|██▉       | 679/2304 [2:33:33<2:35:28,  5.74s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0699


 30%|██▉       | 680/2304 [2:33:38<2:24:35,  5.34s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0954


 30%|██▉       | 681/2304 [2:33:42<2:18:30,  5.12s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0818


 30%|██▉       | 682/2304 [2:33:49<2:32:05,  5.63s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0748


 30%|██▉       | 683/2304 [2:33:55<2:38:41,  5.87s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0722


 30%|██▉       | 684/2304 [2:34:02<2:46:01,  6.15s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0741


 30%|██▉       | 685/2304 [2:34:07<2:34:11,  5.71s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0722


 30%|██▉       | 686/2304 [2:34:12<2:24:46,  5.37s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0746


 30%|██▉       | 687/2304 [2:34:16<2:19:57,  5.19s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0813


 30%|██▉       | 688/2304 [2:34:23<2:33:16,  5.69s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1576


 30%|██▉       | 689/2304 [2:34:30<2:40:43,  5.97s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0693


 30%|██▉       | 690/2304 [2:34:37<2:49:37,  6.31s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0750


 30%|██▉       | 691/2304 [2:34:42<2:36:36,  5.83s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0680


 30%|███       | 692/2304 [2:34:46<2:27:34,  5.49s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0742


 30%|███       | 693/2304 [2:34:51<2:20:54,  5.25s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0811


 30%|███       | 694/2304 [2:34:58<2:32:48,  5.69s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0682


 30%|███       | 695/2304 [2:35:04<2:40:46,  6.00s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0683


 30%|███       | 696/2304 [2:35:11<2:46:26,  6.21s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0866


 30%|███       | 697/2304 [2:35:16<2:34:06,  5.75s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0711


 30%|███       | 698/2304 [2:35:21<2:26:04,  5.46s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0813


 30%|███       | 699/2304 [2:35:25<2:19:43,  5.22s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0945


 30%|███       | 700/2304 [2:35:32<2:33:47,  5.75s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0827


 30%|███       | 701/2304 [2:35:39<2:40:39,  6.01s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0688


 30%|███       | 702/2304 [2:35:46<2:47:29,  6.27s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0989


 31%|███       | 703/2304 [2:35:50<2:35:01,  5.81s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0804


 31%|███       | 704/2304 [2:35:55<2:25:02,  5.44s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0829


 31%|███       | 705/2304 [2:36:00<2:18:43,  5.21s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0826


 31%|███       | 706/2304 [2:36:07<2:33:40,  5.77s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1254


 31%|███       | 707/2304 [2:36:13<2:40:11,  6.02s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0645


 31%|███       | 708/2304 [2:36:20<2:47:24,  6.29s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0709


 31%|███       | 709/2304 [2:36:25<2:34:21,  5.81s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0736


 31%|███       | 710/2304 [2:36:30<2:24:49,  5.45s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0764


 31%|███       | 711/2304 [2:36:34<2:19:43,  5.26s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0735


 31%|███       | 712/2304 [2:36:41<2:31:43,  5.72s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0770


 31%|███       | 713/2304 [2:36:48<2:39:00,  6.00s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1000


 31%|███       | 714/2304 [2:36:55<2:44:53,  6.22s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0787


 31%|███       | 715/2304 [2:36:59<2:32:14,  5.75s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0725


 31%|███       | 716/2304 [2:37:04<2:24:51,  5.47s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0694


 31%|███       | 717/2304 [2:37:09<2:19:01,  5.26s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0749


 31%|███       | 718/2304 [2:37:16<2:30:28,  5.69s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0784


 31%|███       | 719/2304 [2:37:22<2:39:11,  6.03s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0719


 31%|███▏      | 720/2304 [2:37:29<2:45:07,  6.25s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0785


 31%|███▏      | 721/2304 [2:37:34<2:33:58,  5.84s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0794


 31%|███▏      | 722/2304 [2:37:39<2:23:49,  5.46s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0882


 31%|███▏      | 723/2304 [2:37:43<2:16:56,  5.20s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0731


 31%|███▏      | 724/2304 [2:37:50<2:28:46,  5.65s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0743


 31%|███▏      | 725/2304 [2:37:56<2:36:03,  5.93s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0661


 32%|███▏      | 726/2304 [2:38:03<2:42:41,  6.19s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0751


 32%|███▏      | 727/2304 [2:38:08<2:32:00,  5.78s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0712


 32%|███▏      | 728/2304 [2:38:13<2:21:51,  5.40s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0737


 32%|███▏      | 729/2304 [2:38:17<2:16:44,  5.21s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0815


 32%|███▏      | 730/2304 [2:38:24<2:28:21,  5.66s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0748


 32%|███▏      | 731/2304 [2:38:31<2:35:36,  5.94s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0673


 32%|███▏      | 732/2304 [2:38:38<2:43:48,  6.25s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0901


 32%|███▏      | 733/2304 [2:38:42<2:31:42,  5.79s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0703


 32%|███▏      | 734/2304 [2:38:47<2:21:30,  5.41s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0700


 32%|███▏      | 735/2304 [2:38:52<2:15:53,  5.20s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0715


 32%|███▏      | 736/2304 [2:38:58<2:27:13,  5.63s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0785


 32%|███▏      | 737/2304 [2:39:05<2:36:45,  6.00s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0811


 32%|███▏      | 738/2304 [2:39:12<2:43:29,  6.26s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0766


 32%|███▏      | 739/2304 [2:39:17<2:31:03,  5.79s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0722


 32%|███▏      | 740/2304 [2:39:21<2:23:19,  5.50s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0702


 32%|███▏      | 741/2304 [2:39:26<2:16:53,  5.26s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0729


 32%|███▏      | 742/2304 [2:39:33<2:28:18,  5.70s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0904


 32%|███▏      | 743/2304 [2:39:40<2:37:10,  6.04s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0695


 32%|███▏      | 744/2304 [2:39:47<2:43:03,  6.27s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0896


 32%|███▏      | 745/2304 [2:39:51<2:31:26,  5.83s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0983


 32%|███▏      | 746/2304 [2:39:56<2:21:51,  5.46s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0827


 32%|███▏      | 747/2304 [2:40:01<2:15:53,  5.24s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0928


 32%|███▏      | 748/2304 [2:40:08<2:29:45,  5.77s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0970


 33%|███▎      | 749/2304 [2:40:14<2:35:46,  6.01s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0674


 33%|███▎      | 750/2304 [2:40:21<2:42:47,  6.29s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0799


 33%|███▎      | 751/2304 [2:40:26<2:30:12,  5.80s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0748


 33%|███▎      | 752/2304 [2:40:30<2:21:05,  5.45s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0724


 33%|███▎      | 753/2304 [2:40:35<2:16:00,  5.26s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0765


 33%|███▎      | 754/2304 [2:40:42<2:28:38,  5.75s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0794


 33%|███▎      | 755/2304 [2:40:49<2:35:42,  6.03s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0667


 33%|███▎      | 756/2304 [2:40:56<2:43:40,  6.34s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0876


 33%|███▎      | 757/2304 [2:41:01<2:30:16,  5.83s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0807


 33%|███▎      | 758/2304 [2:41:05<2:22:31,  5.53s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0653


 33%|███▎      | 759/2304 [2:41:10<2:16:18,  5.29s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0851


 33%|███▎      | 760/2304 [2:41:17<2:28:24,  5.77s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0833


 33%|███▎      | 761/2304 [2:41:24<2:36:20,  6.08s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0816


 33%|███▎      | 762/2304 [2:41:30<2:40:47,  6.26s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0912


 33%|███▎      | 763/2304 [2:41:35<2:28:46,  5.79s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0885


 33%|███▎      | 764/2304 [2:41:40<2:20:52,  5.49s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0751


 33%|███▎      | 765/2304 [2:41:45<2:14:30,  5.24s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0752


 33%|███▎      | 766/2304 [2:41:51<2:26:31,  5.72s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0779


 33%|███▎      | 767/2304 [2:41:58<2:33:39,  6.00s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0669


 33%|███▎      | 768/2304 [2:42:05<2:39:45,  6.24s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0733


 33%|███▎      | 769/2304 [2:42:10<2:28:18,  5.80s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0744


 33%|███▎      | 770/2304 [2:42:14<2:18:16,  5.41s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0818


 33%|███▎      | 771/2304 [2:42:19<2:12:24,  5.18s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0739


 34%|███▎      | 772/2304 [2:42:26<2:24:43,  5.67s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0655


 34%|███▎      | 773/2304 [2:42:32<2:32:01,  5.96s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0683


 34%|███▎      | 774/2304 [2:42:39<2:39:07,  6.24s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0894


 34%|███▎      | 775/2304 [2:42:44<2:26:54,  5.76s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0762


 34%|███▎      | 776/2304 [2:42:48<2:17:14,  5.39s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0768


 34%|███▎      | 777/2304 [2:42:53<2:12:27,  5.20s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0866


 34%|███▍      | 778/2304 [2:43:00<2:23:37,  5.65s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0762


 34%|███▍      | 779/2304 [2:43:07<2:33:24,  6.04s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0687


 34%|███▍      | 780/2304 [2:43:14<2:39:31,  6.28s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0811


 34%|███▍      | 781/2304 [2:43:18<2:27:36,  5.82s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0790


 34%|███▍      | 782/2304 [2:43:23<2:19:12,  5.49s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0772


 34%|███▍      | 783/2304 [2:43:28<2:13:07,  5.25s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0700


 34%|███▍      | 784/2304 [2:43:35<2:25:12,  5.73s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0648


 34%|███▍      | 785/2304 [2:43:42<2:33:59,  6.08s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0732


 34%|███▍      | 786/2304 [2:43:48<2:38:47,  6.28s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0685


 34%|███▍      | 787/2304 [2:43:53<2:28:16,  5.86s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0703


 34%|███▍      | 788/2304 [2:43:58<2:18:40,  5.49s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0781


 34%|███▍      | 789/2304 [2:44:03<2:12:56,  5.26s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0828


 34%|███▍      | 790/2304 [2:44:09<2:25:23,  5.76s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1128


 34%|███▍      | 791/2304 [2:44:16<2:31:32,  6.01s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0677


 34%|███▍      | 792/2304 [2:44:23<2:37:54,  6.27s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0756


 34%|███▍      | 793/2304 [2:44:28<2:27:19,  5.85s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0886


 34%|███▍      | 794/2304 [2:44:32<2:18:02,  5.48s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0849


 35%|███▍      | 795/2304 [2:44:37<2:13:21,  5.30s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0889


 35%|███▍      | 796/2304 [2:44:44<2:24:43,  5.76s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0630


 35%|███▍      | 797/2304 [2:44:51<2:32:00,  6.05s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0681


 35%|███▍      | 798/2304 [2:44:58<2:39:56,  6.37s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0657


 35%|███▍      | 799/2304 [2:45:03<2:27:32,  5.88s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0738


 35%|███▍      | 800/2304 [2:45:07<2:18:17,  5.52s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0821


 35%|███▍      | 801/2304 [2:45:12<2:12:28,  5.29s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0759


 35%|███▍      | 802/2304 [2:45:19<2:23:42,  5.74s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0836


 35%|███▍      | 803/2304 [2:45:26<2:31:38,  6.06s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0686


 35%|███▍      | 804/2304 [2:45:33<2:37:08,  6.29s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0697


 35%|███▍      | 805/2304 [2:45:37<2:24:14,  5.77s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0759


 35%|███▍      | 806/2304 [2:45:42<2:16:47,  5.48s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0690


 35%|███▌      | 807/2304 [2:45:47<2:10:24,  5.23s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0703


 35%|███▌      | 808/2304 [2:45:53<2:22:19,  5.71s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0800


 35%|███▌      | 809/2304 [2:46:00<2:29:52,  6.01s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0736


 35%|███▌      | 810/2304 [2:46:07<2:36:52,  6.30s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0658


 35%|███▌      | 811/2304 [2:46:12<2:26:06,  5.87s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0773


 35%|███▌      | 812/2304 [2:46:17<2:17:23,  5.52s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0911


 35%|███▌      | 813/2304 [2:46:21<2:11:17,  5.28s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0691


 35%|███▌      | 814/2304 [2:46:28<2:24:35,  5.82s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0863


 35%|███▌      | 815/2304 [2:46:35<2:30:39,  6.07s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1008


 35%|███▌      | 816/2304 [2:46:42<2:37:14,  6.34s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0791


 35%|███▌      | 817/2304 [2:46:47<2:24:45,  5.84s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0966


 36%|███▌      | 818/2304 [2:46:51<2:15:21,  5.47s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0690


 36%|███▌      | 819/2304 [2:46:56<2:10:59,  5.29s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0809


 36%|███▌      | 820/2304 [2:47:03<2:22:58,  5.78s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0647


 36%|███▌      | 821/2304 [2:47:10<2:29:46,  6.06s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0687


 36%|███▌      | 822/2304 [2:47:17<2:36:28,  6.33s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0682


 36%|███▌      | 823/2304 [2:47:21<2:23:18,  5.81s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0689


 36%|███▌      | 824/2304 [2:47:26<2:14:30,  5.45s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0715


 36%|███▌      | 825/2304 [2:47:31<2:09:13,  5.24s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0757


 36%|███▌      | 826/2304 [2:47:38<2:20:39,  5.71s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0812


 36%|███▌      | 827/2304 [2:47:44<2:29:17,  6.06s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0692


 36%|███▌      | 828/2304 [2:47:51<2:34:36,  6.28s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0677


 36%|███▌      | 829/2304 [2:47:56<2:23:09,  5.82s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0766


 36%|███▌      | 830/2304 [2:48:01<2:14:58,  5.49s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0766


 36%|███▌      | 831/2304 [2:48:06<2:09:48,  5.29s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0927


 36%|███▌      | 832/2304 [2:48:13<2:22:06,  5.79s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0733


 36%|███▌      | 833/2304 [2:48:19<2:29:43,  6.11s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0680


 36%|███▌      | 834/2304 [2:48:26<2:34:56,  6.32s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0699


 36%|███▌      | 835/2304 [2:48:31<2:24:01,  5.88s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0709


 36%|███▋      | 836/2304 [2:48:36<2:14:03,  5.48s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0825


 36%|███▋      | 837/2304 [2:48:40<2:08:23,  5.25s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0758


 36%|███▋      | 838/2304 [2:48:47<2:20:34,  5.75s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0761


 36%|███▋      | 839/2304 [2:48:54<2:27:43,  6.05s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0674


 36%|███▋      | 840/2304 [2:49:01<2:34:18,  6.32s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0713


 37%|███▋      | 841/2304 [2:49:06<2:23:11,  5.87s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0778


 37%|███▋      | 842/2304 [2:49:10<2:13:46,  5.49s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0649


 37%|███▋      | 843/2304 [2:49:15<2:09:32,  5.32s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0767


 37%|███▋      | 844/2304 [2:49:22<2:19:52,  5.75s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0830


 37%|███▋      | 845/2304 [2:49:29<2:25:47,  6.00s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0689


 37%|███▋      | 846/2304 [2:49:35<2:32:10,  6.26s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0995


 37%|███▋      | 847/2304 [2:49:40<2:21:44,  5.84s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0828


 37%|███▋      | 848/2304 [2:49:45<2:14:01,  5.52s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0737


 37%|███▋      | 849/2304 [2:49:50<2:07:13,  5.25s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0848


 37%|███▋      | 850/2304 [2:49:57<2:18:39,  5.72s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0825


 37%|███▋      | 851/2304 [2:50:03<2:27:32,  6.09s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0684


 37%|███▋      | 852/2304 [2:50:10<2:33:31,  6.34s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0773


 37%|███▋      | 853/2304 [2:50:15<2:21:57,  5.87s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0758


 37%|███▋      | 854/2304 [2:50:20<2:14:01,  5.55s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0869


 37%|███▋      | 855/2304 [2:50:25<2:07:09,  5.27s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0901


 37%|███▋      | 856/2304 [2:50:32<2:19:53,  5.80s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0815


 37%|███▋      | 857/2304 [2:50:38<2:26:30,  6.07s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0696


 37%|███▋      | 858/2304 [2:50:45<2:31:13,  6.28s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0820


 37%|███▋      | 859/2304 [2:50:50<2:20:54,  5.85s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0847


 37%|███▋      | 860/2304 [2:50:55<2:11:40,  5.47s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0788


 37%|███▋      | 861/2304 [2:50:59<2:07:12,  5.29s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0788


 37%|███▋      | 862/2304 [2:51:06<2:18:35,  5.77s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0783


 37%|███▋      | 863/2304 [2:51:13<2:24:33,  6.02s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0677


 38%|███▊      | 864/2304 [2:51:20<2:31:36,  6.32s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0769


 38%|███▊      | 865/2304 [2:51:23<2:06:47,  5.29s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0677


 38%|███▊      | 866/2304 [2:51:26<1:49:58,  4.59s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0718


 38%|███▊      | 867/2304 [2:51:29<1:37:35,  4.07s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0687


 38%|███▊      | 868/2304 [2:51:33<1:37:19,  4.07s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0766


 38%|███▊      | 869/2304 [2:51:37<1:36:19,  4.03s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0720


 38%|███▊      | 870/2304 [2:51:41<1:36:52,  4.05s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0658


 38%|███▊      | 871/2304 [2:51:44<1:28:17,  3.70s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0689


 38%|███▊      | 872/2304 [2:51:47<1:23:02,  3.48s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0686


 38%|███▊      | 873/2304 [2:51:49<1:18:27,  3.29s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0690


 38%|███▊      | 874/2304 [2:51:54<1:24:28,  3.54s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0658


 38%|███▊      | 875/2304 [2:51:58<1:27:45,  3.68s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0685


 38%|███▊      | 876/2304 [2:52:02<1:30:52,  3.82s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0743


 38%|███▊      | 877/2304 [2:52:05<1:23:49,  3.52s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0706


 38%|███▊      | 878/2304 [2:52:07<1:19:39,  3.35s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0716


 38%|███▊      | 879/2304 [2:52:10<1:15:41,  3.19s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0689


 38%|███▊      | 880/2304 [2:52:14<1:22:15,  3.47s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0692


 38%|███▊      | 881/2304 [2:52:18<1:26:30,  3.65s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0689


 38%|███▊      | 882/2304 [2:52:23<1:29:42,  3.79s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0688


 38%|███▊      | 883/2304 [2:52:25<1:22:54,  3.50s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0707


 38%|███▊      | 884/2304 [2:52:28<1:19:45,  3.37s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0723


 38%|███▊      | 885/2304 [2:52:31<1:16:10,  3.22s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0702


 38%|███▊      | 886/2304 [2:52:35<1:21:55,  3.47s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0699


 38%|███▊      | 887/2304 [2:52:40<1:26:36,  3.67s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0679


 39%|███▊      | 888/2304 [2:52:44<1:29:46,  3.80s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0674


 39%|███▊      | 889/2304 [2:52:47<1:23:13,  3.53s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0701


 39%|███▊      | 890/2304 [2:52:50<1:19:39,  3.38s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0696


 39%|███▊      | 891/2304 [2:52:52<1:16:18,  3.24s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0749


 39%|███▊      | 892/2304 [2:52:57<1:22:30,  3.51s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0698


 39%|███▉      | 893/2304 [2:53:01<1:26:41,  3.69s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0691


 39%|███▉      | 894/2304 [2:53:05<1:30:32,  3.85s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0674


 39%|███▉      | 895/2304 [2:53:08<1:23:51,  3.57s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0704


 39%|███▉      | 896/2304 [2:53:11<1:20:00,  3.41s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0740


 39%|███▉      | 897/2304 [2:53:14<1:16:37,  3.27s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0768


 39%|███▉      | 898/2304 [2:53:18<1:23:00,  3.54s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0719


 39%|███▉      | 899/2304 [2:53:22<1:26:45,  3.71s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0682


 39%|███▉      | 900/2304 [2:53:26<1:29:14,  3.81s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0732


 39%|███▉      | 901/2304 [2:53:29<1:24:11,  3.60s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0700


 39%|███▉      | 902/2304 [2:53:32<1:18:37,  3.36s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0682


 39%|███▉      | 903/2304 [2:53:35<1:16:18,  3.27s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0696


 39%|███▉      | 904/2304 [2:53:39<1:22:04,  3.52s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0702


 39%|███▉      | 905/2304 [2:53:43<1:25:43,  3.68s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0737


 39%|███▉      | 906/2304 [2:53:47<1:27:40,  3.76s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0700


 39%|███▉      | 907/2304 [2:53:50<1:22:53,  3.56s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0703


 39%|███▉      | 908/2304 [2:53:53<1:17:41,  3.34s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0679


 39%|███▉      | 909/2304 [2:53:56<1:15:41,  3.26s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0670


 39%|███▉      | 910/2304 [2:54:00<1:21:45,  3.52s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0672


 40%|███▉      | 911/2304 [2:54:04<1:25:13,  3.67s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0696


 40%|███▉      | 912/2304 [2:54:09<1:28:40,  3.82s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0680


 40%|███▉      | 913/2304 [2:54:12<1:23:12,  3.59s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0726


 40%|███▉      | 914/2304 [2:54:14<1:18:15,  3.38s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0785


 40%|███▉      | 915/2304 [2:54:17<1:15:37,  3.27s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0696


 40%|███▉      | 916/2304 [2:54:22<1:21:15,  3.51s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0834


 40%|███▉      | 917/2304 [2:54:26<1:24:52,  3.67s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0701


 40%|███▉      | 918/2304 [2:54:30<1:26:56,  3.76s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0684


 40%|███▉      | 919/2304 [2:54:33<1:21:35,  3.53s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0712


 40%|███▉      | 920/2304 [2:54:35<1:16:43,  3.33s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0703


 40%|███▉      | 921/2304 [2:54:39<1:15:06,  3.26s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0706


 40%|████      | 922/2304 [2:54:43<1:21:14,  3.53s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0749


 40%|████      | 923/2304 [2:54:47<1:24:29,  3.67s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0767


 40%|████      | 924/2304 [2:54:51<1:26:45,  3.77s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0687


 40%|████      | 925/2304 [2:54:54<1:21:16,  3.54s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0704


 40%|████      | 926/2304 [2:54:57<1:16:14,  3.32s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0692


 40%|████      | 927/2304 [2:55:00<1:14:09,  3.23s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0680


 40%|████      | 928/2304 [2:55:04<1:20:51,  3.53s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0714


 40%|████      | 929/2304 [2:55:08<1:23:14,  3.63s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0735


 40%|████      | 930/2304 [2:55:12<1:26:56,  3.80s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0736


 40%|████      | 931/2304 [2:55:15<1:22:07,  3.59s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0688


 40%|████      | 932/2304 [2:55:18<1:17:19,  3.38s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0697


 40%|████      | 933/2304 [2:55:21<1:15:24,  3.30s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0700


 41%|████      | 934/2304 [2:55:25<1:21:23,  3.56s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0712


 41%|████      | 935/2304 [2:55:29<1:23:54,  3.68s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0677


 41%|████      | 936/2304 [2:55:33<1:26:45,  3.81s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0709


 41%|████      | 937/2304 [2:55:36<1:22:01,  3.60s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0724


 41%|████      | 938/2304 [2:55:39<1:17:30,  3.40s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0684


 41%|████      | 939/2304 [2:55:42<1:15:11,  3.30s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0702


 41%|████      | 940/2304 [2:55:46<1:21:05,  3.57s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0725


 41%|████      | 941/2304 [2:55:50<1:24:04,  3.70s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0668


 41%|████      | 942/2304 [2:55:55<1:27:02,  3.83s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0696


 41%|████      | 943/2304 [2:55:58<1:21:38,  3.60s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0741


 41%|████      | 944/2304 [2:56:01<1:16:50,  3.39s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0744


 41%|████      | 945/2304 [2:56:04<1:15:02,  3.31s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0706


 41%|████      | 946/2304 [2:56:08<1:21:03,  3.58s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0719


 41%|████      | 947/2304 [2:56:12<1:23:29,  3.69s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0674


 41%|████      | 948/2304 [2:56:16<1:27:23,  3.87s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0670


 41%|████      | 949/2304 [2:56:19<1:21:54,  3.63s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0716


 41%|████      | 950/2304 [2:56:22<1:16:44,  3.40s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0717


 41%|████▏     | 951/2304 [2:56:25<1:14:35,  3.31s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0707


 41%|████▏     | 952/2304 [2:56:29<1:18:58,  3.50s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0687


 41%|████▏     | 953/2304 [2:56:33<1:22:26,  3.66s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0727


 41%|████▏     | 954/2304 [2:56:37<1:25:58,  3.82s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0701


 41%|████▏     | 955/2304 [2:56:40<1:20:47,  3.59s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0722


 41%|████▏     | 956/2304 [2:56:43<1:16:03,  3.39s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0683


 42%|████▏     | 957/2304 [2:56:46<1:13:50,  3.29s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0695


 42%|████▏     | 958/2304 [2:56:50<1:18:44,  3.51s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0717


 42%|████▏     | 959/2304 [2:56:55<1:22:48,  3.69s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0715


 42%|████▏     | 960/2304 [2:56:59<1:25:52,  3.83s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0681


 42%|████▏     | 961/2304 [2:57:02<1:20:09,  3.58s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0774


 42%|████▏     | 962/2304 [2:57:05<1:16:27,  3.42s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0782


 42%|████▏     | 963/2304 [2:57:08<1:12:38,  3.25s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0776


 42%|████▏     | 964/2304 [2:57:12<1:17:58,  3.49s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0761


 42%|████▏     | 965/2304 [2:57:16<1:22:05,  3.68s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0690


 42%|████▏     | 966/2304 [2:57:20<1:25:08,  3.82s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0709


 42%|████▏     | 967/2304 [2:57:23<1:18:58,  3.54s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0742


 42%|████▏     | 968/2304 [2:57:26<1:15:54,  3.41s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0756


 42%|████▏     | 969/2304 [2:57:29<1:12:31,  3.26s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0701


 42%|████▏     | 970/2304 [2:57:33<1:18:01,  3.51s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0716


 42%|████▏     | 971/2304 [2:57:37<1:21:53,  3.69s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0685


 42%|████▏     | 972/2304 [2:57:41<1:24:57,  3.83s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0717


 42%|████▏     | 973/2304 [2:57:44<1:18:28,  3.54s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0717


 42%|████▏     | 974/2304 [2:57:47<1:14:39,  3.37s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0713


 42%|████▏     | 975/2304 [2:57:50<1:11:47,  3.24s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0688


 42%|████▏     | 976/2304 [2:57:54<1:17:25,  3.50s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0746


 42%|████▏     | 977/2304 [2:57:58<1:21:00,  3.66s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0700


 42%|████▏     | 978/2304 [2:58:02<1:24:22,  3.82s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0717


 42%|████▏     | 979/2304 [2:58:05<1:18:07,  3.54s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0715


 43%|████▎     | 980/2304 [2:58:08<1:14:21,  3.37s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0742


 43%|████▎     | 981/2304 [2:58:11<1:10:56,  3.22s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0705


 43%|████▎     | 982/2304 [2:58:15<1:17:15,  3.51s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0696


 43%|████▎     | 983/2304 [2:58:19<1:20:38,  3.66s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0704


 43%|████▎     | 984/2304 [2:58:23<1:22:35,  3.75s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0691


 43%|████▎     | 985/2304 [2:58:26<1:18:25,  3.57s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0744


 43%|████▎     | 986/2304 [2:58:29<1:14:38,  3.40s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0737


 43%|████▎     | 987/2304 [2:58:32<1:11:12,  3.24s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0866


 43%|████▎     | 988/2304 [2:58:36<1:17:21,  3.53s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0872


 43%|████▎     | 989/2304 [2:58:40<1:20:44,  3.68s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0704


 43%|████▎     | 990/2304 [2:58:44<1:22:16,  3.76s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0704


 43%|████▎     | 991/2304 [2:58:47<1:17:44,  3.55s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0742


 43%|████▎     | 992/2304 [2:58:50<1:13:11,  3.35s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0729


 43%|████▎     | 993/2304 [2:58:53<1:11:25,  3.27s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0777


 43%|████▎     | 994/2304 [2:58:57<1:17:03,  3.53s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0728


 43%|████▎     | 995/2304 [2:59:01<1:19:42,  3.65s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0728


 43%|████▎     | 996/2304 [2:59:06<1:22:44,  3.80s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0683


 43%|████▎     | 997/2304 [2:59:09<1:17:52,  3.57s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0724


 43%|████▎     | 998/2304 [2:59:12<1:13:18,  3.37s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0746


 43%|████▎     | 999/2304 [2:59:15<1:11:32,  3.29s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0681


 43%|████▎     | 1000/2304 [2:59:19<1:16:50,  3.54s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0739


 43%|████▎     | 1001/2304 [2:59:23<1:20:11,  3.69s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0702


 43%|████▎     | 1002/2304 [2:59:27<1:22:35,  3.81s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0708


 44%|████▎     | 1003/2304 [2:59:30<1:17:32,  3.58s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0767


 44%|████▎     | 1004/2304 [2:59:33<1:13:06,  3.37s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0687


 44%|████▎     | 1005/2304 [2:59:36<1:11:06,  3.28s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0741


 44%|████▎     | 1006/2304 [2:59:40<1:16:58,  3.56s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0736


 44%|████▎     | 1007/2304 [2:59:44<1:19:33,  3.68s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0730


 44%|████▍     | 1008/2304 [2:59:48<1:22:37,  3.83s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0728


 44%|████▍     | 1009/2304 [2:59:51<1:17:51,  3.61s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0732


 44%|████▍     | 1010/2304 [2:59:54<1:13:00,  3.38s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0734


 44%|████▍     | 1011/2304 [2:59:57<1:10:51,  3.29s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0846


 44%|████▍     | 1012/2304 [3:00:01<1:16:46,  3.57s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1218


 44%|████▍     | 1013/2304 [3:00:05<1:19:06,  3.68s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0686


 44%|████▍     | 1014/2304 [3:00:09<1:21:43,  3.80s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0775


 44%|████▍     | 1015/2304 [3:00:13<1:17:17,  3.60s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0792


 44%|████▍     | 1016/2304 [3:00:15<1:12:46,  3.39s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0782


 44%|████▍     | 1017/2304 [3:00:19<1:10:32,  3.29s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0730


 44%|████▍     | 1018/2304 [3:00:23<1:16:04,  3.55s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0770


 44%|████▍     | 1019/2304 [3:00:27<1:18:28,  3.66s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0683


 44%|████▍     | 1020/2304 [3:00:31<1:21:26,  3.81s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0800


 44%|████▍     | 1021/2304 [3:00:34<1:16:22,  3.57s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0715


 44%|████▍     | 1022/2304 [3:00:37<1:11:36,  3.35s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0731


 44%|████▍     | 1023/2304 [3:00:40<1:10:00,  3.28s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0716


 44%|████▍     | 1024/2304 [3:00:44<1:15:27,  3.54s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0738


 44%|████▍     | 1025/2304 [3:00:48<1:17:59,  3.66s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0699


 45%|████▍     | 1026/2304 [3:00:52<1:21:25,  3.82s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0715


 45%|████▍     | 1027/2304 [3:00:55<1:16:24,  3.59s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0722


 45%|████▍     | 1028/2304 [3:00:58<1:11:54,  3.38s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0732


 45%|████▍     | 1029/2304 [3:01:01<1:10:26,  3.31s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0706


 45%|████▍     | 1030/2304 [3:01:05<1:15:01,  3.53s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0723


 45%|████▍     | 1031/2304 [3:01:09<1:18:32,  3.70s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0712


 45%|████▍     | 1032/2304 [3:01:14<1:22:04,  3.87s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0677


 45%|████▍     | 1033/2304 [3:01:17<1:16:49,  3.63s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0757


 45%|████▍     | 1034/2304 [3:01:19<1:11:57,  3.40s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0756


 45%|████▍     | 1035/2304 [3:01:23<1:09:55,  3.31s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0686


 45%|████▍     | 1036/2304 [3:01:27<1:14:37,  3.53s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0798


 45%|████▌     | 1037/2304 [3:01:31<1:17:40,  3.68s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0675


 45%|████▌     | 1038/2304 [3:01:35<1:20:16,  3.80s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0746


 45%|████▌     | 1039/2304 [3:01:38<1:15:39,  3.59s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0728


 45%|████▌     | 1040/2304 [3:01:41<1:11:10,  3.38s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0737


 45%|████▌     | 1041/2304 [3:01:44<1:09:16,  3.29s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0826


 45%|████▌     | 1042/2304 [3:01:48<1:14:00,  3.52s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0757


 45%|████▌     | 1043/2304 [3:01:52<1:17:17,  3.68s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0680


 45%|████▌     | 1044/2304 [3:01:56<1:19:49,  3.80s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0759


 45%|████▌     | 1045/2304 [3:01:59<1:14:03,  3.53s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0711


 45%|████▌     | 1046/2304 [3:02:02<1:11:11,  3.40s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0731


 45%|████▌     | 1047/2304 [3:02:05<1:09:13,  3.30s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0689


 45%|████▌     | 1048/2304 [3:02:09<1:13:18,  3.50s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0751


 46%|████▌     | 1049/2304 [3:02:13<1:17:26,  3.70s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0715


 46%|████▌     | 1050/2304 [3:02:17<1:20:18,  3.84s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0692


 46%|████▌     | 1051/2304 [3:02:20<1:14:11,  3.55s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0749


 46%|████▌     | 1052/2304 [3:02:23<1:10:34,  3.38s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0751


 46%|████▌     | 1053/2304 [3:02:26<1:07:42,  3.25s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0735


 46%|████▌     | 1054/2304 [3:02:30<1:13:22,  3.52s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0755


 46%|████▌     | 1055/2304 [3:02:34<1:17:02,  3.70s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0686


 46%|████▌     | 1056/2304 [3:02:39<1:20:23,  3.87s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0767


 46%|████▌     | 1057/2304 [3:02:42<1:14:26,  3.58s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0839


 46%|████▌     | 1058/2304 [3:02:45<1:11:03,  3.42s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0744


 46%|████▌     | 1059/2304 [3:02:48<1:08:09,  3.28s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0728


 46%|████▌     | 1060/2304 [3:02:52<1:13:53,  3.56s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0826


 46%|████▌     | 1061/2304 [3:02:56<1:16:56,  3.71s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0696


 46%|████▌     | 1062/2304 [3:03:00<1:20:02,  3.87s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0689


 46%|████▌     | 1063/2304 [3:03:03<1:13:53,  3.57s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0706


 46%|████▌     | 1064/2304 [3:03:06<1:10:26,  3.41s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0757


 46%|████▌     | 1065/2304 [3:03:09<1:07:20,  3.26s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0767


 46%|████▋     | 1066/2304 [3:03:13<1:13:19,  3.55s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0669


 46%|████▋     | 1067/2304 [3:03:17<1:16:20,  3.70s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0693


 46%|████▋     | 1068/2304 [3:03:21<1:18:39,  3.82s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0926


 46%|████▋     | 1069/2304 [3:03:24<1:13:23,  3.57s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0753


 46%|████▋     | 1070/2304 [3:03:27<1:09:54,  3.40s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0732


 46%|████▋     | 1071/2304 [3:03:30<1:07:41,  3.29s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0690


 47%|████▋     | 1072/2304 [3:03:34<1:11:24,  3.48s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0760


 47%|████▋     | 1073/2304 [3:03:38<1:15:47,  3.69s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0724


 47%|████▋     | 1074/2304 [3:03:43<1:18:33,  3.83s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1018


 47%|████▋     | 1075/2304 [3:03:45<1:12:41,  3.55s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0726


 47%|████▋     | 1076/2304 [3:03:49<1:09:32,  3.40s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0741


 47%|████▋     | 1077/2304 [3:03:51<1:06:15,  3.24s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0695


 47%|████▋     | 1078/2304 [3:03:55<1:11:15,  3.49s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0747


 47%|████▋     | 1079/2304 [3:04:00<1:14:39,  3.66s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0808


 47%|████▋     | 1080/2304 [3:04:04<1:17:12,  3.78s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0682


 47%|████▋     | 1081/2304 [3:04:07<1:12:46,  3.57s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0753


 47%|████▋     | 1082/2304 [3:04:10<1:10:13,  3.45s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0716


 47%|████▋     | 1083/2304 [3:04:13<1:08:00,  3.34s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0737


 47%|████▋     | 1084/2304 [3:04:17<1:12:59,  3.59s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0765


 47%|████▋     | 1085/2304 [3:04:21<1:15:54,  3.74s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0694


 47%|████▋     | 1086/2304 [3:04:25<1:18:34,  3.87s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0867


 47%|████▋     | 1087/2304 [3:04:28<1:13:21,  3.62s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0750


 47%|████▋     | 1088/2304 [3:04:32<1:10:30,  3.48s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0783


 47%|████▋     | 1089/2304 [3:04:35<1:07:36,  3.34s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0751


 47%|████▋     | 1090/2304 [3:04:39<1:13:02,  3.61s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0819


 47%|████▋     | 1091/2304 [3:04:43<1:16:33,  3.79s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0703


 47%|████▋     | 1092/2304 [3:04:47<1:18:26,  3.88s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0759


 47%|████▋     | 1093/2304 [3:04:50<1:14:14,  3.68s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0712


 47%|████▋     | 1094/2304 [3:04:53<1:10:16,  3.48s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0698


 48%|████▊     | 1095/2304 [3:04:57<1:08:32,  3.40s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0806


 48%|████▊     | 1096/2304 [3:05:01<1:14:11,  3.69s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0816


 48%|████▊     | 1097/2304 [3:05:05<1:16:49,  3.82s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0721


 48%|████▊     | 1098/2304 [3:05:09<1:18:07,  3.89s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0667


 48%|████▊     | 1099/2304 [3:05:12<1:14:08,  3.69s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0743


 48%|████▊     | 1100/2304 [3:05:15<1:09:54,  3.48s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0700


 48%|████▊     | 1101/2304 [3:05:18<1:07:59,  3.39s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0725


 48%|████▊     | 1102/2304 [3:05:23<1:12:59,  3.64s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0740


 48%|████▊     | 1103/2304 [3:05:27<1:15:16,  3.76s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0737


 48%|████▊     | 1104/2304 [3:05:31<1:17:52,  3.89s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0715


 48%|████▊     | 1105/2304 [3:05:34<1:12:38,  3.63s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0711


 48%|████▊     | 1106/2304 [3:05:37<1:08:03,  3.41s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0766


 48%|████▊     | 1107/2304 [3:05:40<1:05:33,  3.29s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0875


 48%|████▊     | 1108/2304 [3:05:44<1:10:20,  3.53s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0826


 48%|████▊     | 1109/2304 [3:05:48<1:12:38,  3.65s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0700


 48%|████▊     | 1110/2304 [3:05:52<1:15:25,  3.79s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0758


 48%|████▊     | 1111/2304 [3:05:55<1:10:51,  3.56s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0827


 48%|████▊     | 1112/2304 [3:05:58<1:06:38,  3.35s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0784


 48%|████▊     | 1113/2304 [3:06:01<1:04:49,  3.27s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0722


 48%|████▊     | 1114/2304 [3:06:05<1:09:32,  3.51s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0684


 48%|████▊     | 1115/2304 [3:06:09<1:11:20,  3.60s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0698


 48%|████▊     | 1116/2304 [3:06:13<1:14:08,  3.74s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0847


 48%|████▊     | 1117/2304 [3:06:16<1:09:41,  3.52s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0802


 49%|████▊     | 1118/2304 [3:06:19<1:05:28,  3.31s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0764


 49%|████▊     | 1119/2304 [3:06:22<1:03:43,  3.23s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0717


 49%|████▊     | 1120/2304 [3:06:26<1:09:17,  3.51s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0689


 49%|████▊     | 1121/2304 [3:06:30<1:11:21,  3.62s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0826


 49%|████▊     | 1122/2304 [3:06:34<1:14:08,  3.76s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0770


 49%|████▊     | 1123/2304 [3:06:37<1:10:09,  3.56s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0702


 49%|████▉     | 1124/2304 [3:06:40<1:05:39,  3.34s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0771


 49%|████▉     | 1125/2304 [3:06:43<1:04:00,  3.26s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0759


 49%|████▉     | 1126/2304 [3:06:47<1:09:20,  3.53s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0720


 49%|████▉     | 1127/2304 [3:06:51<1:11:47,  3.66s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0782


 49%|████▉     | 1128/2304 [3:06:55<1:14:35,  3.81s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0808


 49%|████▉     | 1129/2304 [3:06:58<1:11:00,  3.63s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0778


 49%|████▉     | 1130/2304 [3:07:02<1:08:02,  3.48s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0715


 49%|████▉     | 1131/2304 [3:07:05<1:06:26,  3.40s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0916


 49%|████▉     | 1132/2304 [3:07:09<1:11:21,  3.65s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0727


 49%|████▉     | 1133/2304 [3:07:13<1:14:01,  3.79s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0694


 49%|████▉     | 1134/2304 [3:07:17<1:16:27,  3.92s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0729


 49%|████▉     | 1135/2304 [3:07:21<1:12:11,  3.71s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0865


 49%|████▉     | 1136/2304 [3:07:24<1:08:35,  3.52s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0759


 49%|████▉     | 1137/2304 [3:07:27<1:06:33,  3.42s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0731


 49%|████▉     | 1138/2304 [3:07:31<1:10:20,  3.62s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0735


 49%|████▉     | 1139/2304 [3:07:35<1:14:03,  3.81s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0706


 49%|████▉     | 1140/2304 [3:07:39<1:16:36,  3.95s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0731


 50%|████▉     | 1141/2304 [3:07:43<1:12:23,  3.73s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0748


 50%|████▉     | 1142/2304 [3:07:46<1:08:41,  3.55s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0717


 50%|████▉     | 1143/2304 [3:07:49<1:07:04,  3.47s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0734


 50%|████▉     | 1144/2304 [3:07:53<1:10:34,  3.65s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0716


 50%|████▉     | 1145/2304 [3:07:57<1:13:38,  3.81s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0824


 50%|████▉     | 1146/2304 [3:08:02<1:16:15,  3.95s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0758


 50%|████▉     | 1147/2304 [3:08:05<1:11:49,  3.72s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0775


 50%|████▉     | 1148/2304 [3:08:08<1:07:52,  3.52s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0737


 50%|████▉     | 1149/2304 [3:08:11<1:06:07,  3.43s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0759


 50%|████▉     | 1150/2304 [3:08:15<1:09:42,  3.62s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0753


 50%|████▉     | 1151/2304 [3:08:19<1:12:50,  3.79s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0799


 50%|█████     | 1152/2304 [3:08:24<1:15:34,  3.94s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0748


 50%|█████     | 1153/2304 [3:08:39<2:18:54,  7.24s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0735


 50%|█████     | 1154/2304 [3:08:53<3:01:29,  9.47s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0763


 50%|█████     | 1155/2304 [3:09:08<3:34:35, 11.21s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0855


 50%|█████     | 1156/2304 [3:09:32<4:43:13, 14.80s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0786


 50%|█████     | 1157/2304 [3:09:54<5:28:33, 17.19s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0834


 50%|█████     | 1158/2304 [3:10:18<6:03:35, 19.04s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0744


 50%|█████     | 1159/2304 [3:10:33<5:41:05, 17.87s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0706


 50%|█████     | 1160/2304 [3:10:48<5:23:14, 16.95s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0669


 50%|█████     | 1161/2304 [3:11:03<5:12:28, 16.40s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1176


 50%|█████     | 1162/2304 [3:11:26<5:51:04, 18.44s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0759


 50%|█████     | 1163/2304 [3:11:49<6:14:28, 19.69s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0701


 51%|█████     | 1164/2304 [3:12:12<6:35:08, 20.80s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0667


 51%|█████     | 1165/2304 [3:12:27<6:03:42, 19.16s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0801


 51%|█████     | 1166/2304 [3:12:42<5:37:45, 17.81s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0757


 51%|█████     | 1167/2304 [3:12:57<5:22:18, 17.01s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0839


 51%|█████     | 1168/2304 [3:13:21<5:59:02, 18.96s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0760


 51%|█████     | 1169/2304 [3:13:44<6:21:12, 20.15s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0794


 51%|█████     | 1170/2304 [3:14:07<6:37:57, 21.06s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1023


 51%|█████     | 1171/2304 [3:14:22<6:06:13, 19.39s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0830


 51%|█████     | 1172/2304 [3:14:38<5:43:08, 18.19s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0738


 51%|█████     | 1173/2304 [3:14:53<5:27:07, 17.35s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0778


 51%|█████     | 1174/2304 [3:15:17<6:05:37, 19.41s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0805


 51%|█████     | 1175/2304 [3:15:40<6:26:12, 20.52s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0848


 51%|█████     | 1176/2304 [3:16:04<6:44:15, 21.50s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0692


 51%|█████     | 1177/2304 [3:16:20<6:09:06, 19.65s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0982


 51%|█████     | 1178/2304 [3:16:34<5:42:03, 18.23s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0844


 51%|█████     | 1179/2304 [3:16:50<5:27:02, 17.44s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0811


 51%|█████     | 1180/2304 [3:17:14<6:01:52, 19.32s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0678


 51%|█████▏    | 1181/2304 [3:17:37<6:22:32, 20.44s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0795


 51%|█████▏    | 1182/2304 [3:18:01<6:41:54, 21.49s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0752


 51%|█████▏    | 1183/2304 [3:18:16<6:07:02, 19.65s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0843


 51%|█████▏    | 1184/2304 [3:18:31<5:39:53, 18.21s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0815


 51%|█████▏    | 1185/2304 [3:18:46<5:24:39, 17.41s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0919


 51%|█████▏    | 1186/2304 [3:19:10<5:58:55, 19.26s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0679


 52%|█████▏    | 1187/2304 [3:19:33<6:19:20, 20.38s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1029


 52%|█████▏    | 1188/2304 [3:19:57<6:38:03, 21.40s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0803


 52%|█████▏    | 1189/2304 [3:20:12<6:04:44, 19.63s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0768


 52%|█████▏    | 1190/2304 [3:20:27<5:38:35, 18.24s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0724


 52%|█████▏    | 1191/2304 [3:20:43<5:22:59, 17.41s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0741


 52%|█████▏    | 1192/2304 [3:21:06<5:56:49, 19.25s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0748


 52%|█████▏    | 1193/2304 [3:21:30<6:18:43, 20.45s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0798


 52%|█████▏    | 1194/2304 [3:21:54<6:38:12, 21.52s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0845


 52%|█████▏    | 1195/2304 [3:22:09<6:03:29, 19.67s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0744


 52%|█████▏    | 1196/2304 [3:22:24<5:38:34, 18.33s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0797


 52%|█████▏    | 1197/2304 [3:22:40<5:21:32, 17.43s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0822


 52%|█████▏    | 1198/2304 [3:23:03<5:55:05, 19.26s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0870


 52%|█████▏    | 1199/2304 [3:23:26<6:16:22, 20.44s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0770


 52%|█████▏    | 1200/2304 [3:23:50<6:35:36, 21.50s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0831


 52%|█████▏    | 1201/2304 [3:24:05<6:00:25, 19.61s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0839


 52%|█████▏    | 1202/2304 [3:24:20<5:34:29, 18.21s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0798


 52%|█████▏    | 1203/2304 [3:24:36<5:17:34, 17.31s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0817


 52%|█████▏    | 1204/2304 [3:24:59<5:50:36, 19.12s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0835


 52%|█████▏    | 1205/2304 [3:25:22<6:11:38, 20.29s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0806


 52%|█████▏    | 1206/2304 [3:25:46<6:29:52, 21.30s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0670


 52%|█████▏    | 1207/2304 [3:26:01<5:55:49, 19.46s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0916


 52%|█████▏    | 1208/2304 [3:26:16<5:31:11, 18.13s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0749


 52%|█████▏    | 1209/2304 [3:26:31<5:17:06, 17.38s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0752


 53%|█████▎    | 1210/2304 [3:26:55<5:50:05, 19.20s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0791


 53%|█████▎    | 1211/2304 [3:27:18<6:10:35, 20.34s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0800


 53%|█████▎    | 1212/2304 [3:27:41<6:26:59, 21.26s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0848


 53%|█████▎    | 1213/2304 [3:27:57<5:54:04, 19.47s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0769


 53%|█████▎    | 1214/2304 [3:28:11<5:27:48, 18.04s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0853


 53%|█████▎    | 1215/2304 [3:28:27<5:12:27, 17.22s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0703


 53%|█████▎    | 1216/2304 [3:28:50<5:45:38, 19.06s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0783


 53%|█████▎    | 1217/2304 [3:29:13<6:05:57, 20.20s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0787


 53%|█████▎    | 1218/2304 [3:29:36<6:24:09, 21.22s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0808


 53%|█████▎    | 1219/2304 [3:29:52<5:50:48, 19.40s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0931


 53%|█████▎    | 1220/2304 [3:30:06<5:25:50, 18.04s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0732


 53%|█████▎    | 1221/2304 [3:30:22<5:10:37, 17.21s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0766


 53%|█████▎    | 1222/2304 [3:30:45<5:42:55, 19.02s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0816


 53%|█████▎    | 1223/2304 [3:31:08<6:03:34, 20.18s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0766


 53%|█████▎    | 1224/2304 [3:31:31<6:21:57, 21.22s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0813


 53%|█████▎    | 1225/2304 [3:31:47<5:50:26, 19.49s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0890


 53%|█████▎    | 1226/2304 [3:32:02<5:25:46, 18.13s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0916


 53%|█████▎    | 1227/2304 [3:32:17<5:10:26, 17.29s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0891


 53%|█████▎    | 1228/2304 [3:32:41<5:43:57, 19.18s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1013


 53%|█████▎    | 1229/2304 [3:33:04<6:04:00, 20.32s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0941


 53%|█████▎    | 1230/2304 [3:33:28<6:23:26, 21.42s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0792


 53%|█████▎    | 1231/2304 [3:33:43<5:51:39, 19.66s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0866


 53%|█████▎    | 1232/2304 [3:33:58<5:25:53, 18.24s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0719


 54%|█████▎    | 1233/2304 [3:34:14<5:11:29, 17.45s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0755


 54%|█████▎    | 1234/2304 [3:34:38<5:46:22, 19.42s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0818


 54%|█████▎    | 1235/2304 [3:35:02<6:09:40, 20.75s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0800


 54%|█████▎    | 1236/2304 [3:35:26<6:26:55, 21.74s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0984


 54%|█████▎    | 1237/2304 [3:35:41<5:54:16, 19.92s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0761


 54%|█████▎    | 1238/2304 [3:35:57<5:30:29, 18.60s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0760


 54%|█████▍    | 1239/2304 [3:36:13<5:14:25, 17.71s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1008


 54%|█████▍    | 1240/2304 [3:36:37<5:48:26, 19.65s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0844


 54%|█████▍    | 1241/2304 [3:37:00<6:09:36, 20.86s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0969


 54%|█████▍    | 1242/2304 [3:37:25<6:26:19, 21.83s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0817


 54%|█████▍    | 1243/2304 [3:37:40<5:53:02, 19.96s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0814


 54%|█████▍    | 1244/2304 [3:37:55<5:26:44, 18.49s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0818


 54%|█████▍    | 1245/2304 [3:38:11<5:10:46, 17.61s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0765


 54%|█████▍    | 1246/2304 [3:38:35<5:42:51, 19.44s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0854


 54%|█████▍    | 1247/2304 [3:38:58<6:02:08, 20.56s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0869


 54%|█████▍    | 1248/2304 [3:39:21<6:18:10, 21.49s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0764


 54%|█████▍    | 1249/2304 [3:39:37<5:45:28, 19.65s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0697


 54%|█████▍    | 1250/2304 [3:39:52<5:21:32, 18.30s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0854


 54%|█████▍    | 1251/2304 [3:40:07<5:06:04, 17.44s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0662


 54%|█████▍    | 1252/2304 [3:40:31<5:39:20, 19.35s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0826


 54%|█████▍    | 1253/2304 [3:40:54<5:59:57, 20.55s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0714


 54%|█████▍    | 1254/2304 [3:41:18<6:16:39, 21.52s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0672


 54%|█████▍    | 1255/2304 [3:41:34<5:45:49, 19.78s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0714


 55%|█████▍    | 1256/2304 [3:41:49<5:21:20, 18.40s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0905


 55%|█████▍    | 1257/2304 [3:42:05<5:07:03, 17.60s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0773


 55%|█████▍    | 1258/2304 [3:42:29<5:40:26, 19.53s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0675


 55%|█████▍    | 1259/2304 [3:42:52<6:00:46, 20.71s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0767


 55%|█████▍    | 1260/2304 [3:43:16<6:17:29, 21.69s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0775


 55%|█████▍    | 1261/2304 [3:43:32<5:44:39, 19.83s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0874


 55%|█████▍    | 1262/2304 [3:43:47<5:19:27, 18.40s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0766


 55%|█████▍    | 1263/2304 [3:44:02<5:03:39, 17.50s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0742


 55%|█████▍    | 1264/2304 [3:44:26<5:35:31, 19.36s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0724


 55%|█████▍    | 1265/2304 [3:44:50<5:56:59, 20.62s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0754


 55%|█████▍    | 1266/2304 [3:45:13<6:13:01, 21.56s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0757


 55%|█████▍    | 1267/2304 [3:45:29<5:41:01, 19.73s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0742


 55%|█████▌    | 1268/2304 [3:45:44<5:16:08, 18.31s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0722


 55%|█████▌    | 1269/2304 [3:45:59<4:59:35, 17.37s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0782


 55%|█████▌    | 1270/2304 [3:46:23<5:32:49, 19.31s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0722


 55%|█████▌    | 1271/2304 [3:46:46<5:52:11, 20.46s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0779


 55%|█████▌    | 1272/2304 [3:47:10<6:11:46, 21.61s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0722


 55%|█████▌    | 1273/2304 [3:47:26<5:39:38, 19.77s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0773


 55%|█████▌    | 1274/2304 [3:47:41<5:14:34, 18.33s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0888


 55%|█████▌    | 1275/2304 [3:47:56<5:00:16, 17.51s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0881


 55%|█████▌    | 1276/2304 [3:48:20<5:31:09, 19.33s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0697


 55%|█████▌    | 1277/2304 [3:48:43<5:50:25, 20.47s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0737


 55%|█████▌    | 1278/2304 [3:49:07<6:07:52, 21.51s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0755


 56%|█████▌    | 1279/2304 [3:49:22<5:37:03, 19.73s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0799


 56%|█████▌    | 1280/2304 [3:49:37<5:12:35, 18.32s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0892


 56%|█████▌    | 1281/2304 [3:49:53<4:57:27, 17.45s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0764


 56%|█████▌    | 1282/2304 [3:50:17<5:29:15, 19.33s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0894


 56%|█████▌    | 1283/2304 [3:50:40<5:48:12, 20.46s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0667


 56%|█████▌    | 1284/2304 [3:51:04<6:05:06, 21.48s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0838


 56%|█████▌    | 1285/2304 [3:51:19<5:34:08, 19.67s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1111


 56%|█████▌    | 1286/2304 [3:51:34<5:11:34, 18.36s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0781


 56%|█████▌    | 1287/2304 [3:51:50<4:57:47, 17.57s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0767


 56%|█████▌    | 1288/2304 [3:52:14<5:29:49, 19.48s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0739


 56%|█████▌    | 1289/2304 [3:52:37<5:48:26, 20.60s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0735


 56%|█████▌    | 1290/2304 [3:53:01<6:03:38, 21.52s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0741


 56%|█████▌    | 1291/2304 [3:53:16<5:31:17, 19.62s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0875


 56%|█████▌    | 1292/2304 [3:53:31<5:09:06, 18.33s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0805


 56%|█████▌    | 1293/2304 [3:53:47<4:55:19, 17.53s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0727


 56%|█████▌    | 1294/2304 [3:54:11<5:25:27, 19.33s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0889


 56%|█████▌    | 1295/2304 [3:54:34<5:44:03, 20.46s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0771


 56%|█████▋    | 1296/2304 [3:54:58<6:01:46, 21.53s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0792


 56%|█████▋    | 1297/2304 [3:55:13<5:31:09, 19.73s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0671


 56%|█████▋    | 1298/2304 [3:55:28<5:07:45, 18.36s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0856


 56%|█████▋    | 1299/2304 [3:55:44<4:53:07, 17.50s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0830


 56%|█████▋    | 1300/2304 [3:56:08<5:26:08, 19.49s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0703


 56%|█████▋    | 1301/2304 [3:56:31<5:45:51, 20.69s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0661


 57%|█████▋    | 1302/2304 [3:56:56<6:03:15, 21.75s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0672


 57%|█████▋    | 1303/2304 [3:57:11<5:33:01, 19.96s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0778


 57%|█████▋    | 1304/2304 [3:57:27<5:09:54, 18.59s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0823


 57%|█████▋    | 1305/2304 [3:57:43<4:56:05, 17.78s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0857


 57%|█████▋    | 1306/2304 [3:58:07<5:27:09, 19.67s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0793


 57%|█████▋    | 1307/2304 [3:58:30<5:45:29, 20.79s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0775


 57%|█████▋    | 1308/2304 [3:58:54<5:58:23, 21.59s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0790


 57%|█████▋    | 1309/2304 [3:59:09<5:26:27, 19.69s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0811


 57%|█████▋    | 1310/2304 [3:59:24<5:00:53, 18.16s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1420


 57%|█████▋    | 1311/2304 [3:59:39<4:46:24, 17.31s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0803


 57%|█████▋    | 1312/2304 [4:00:02<5:16:52, 19.17s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0814


 57%|█████▋    | 1313/2304 [4:00:25<5:33:57, 20.22s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0878


 57%|█████▋    | 1314/2304 [4:00:49<5:49:47, 21.20s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0827


 57%|█████▋    | 1315/2304 [4:01:04<5:22:40, 19.58s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0713


 57%|█████▋    | 1316/2304 [4:01:20<5:01:42, 18.32s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0751


 57%|█████▋    | 1317/2304 [4:01:35<4:46:56, 17.44s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0898


 57%|█████▋    | 1318/2304 [4:01:59<5:16:02, 19.23s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0780


 57%|█████▋    | 1319/2304 [4:02:22<5:34:55, 20.40s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0836


 57%|█████▋    | 1320/2304 [4:02:46<5:51:29, 21.43s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0787


 57%|█████▋    | 1321/2304 [4:03:01<5:21:19, 19.61s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1070


 57%|█████▋    | 1322/2304 [4:03:16<4:58:47, 18.26s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0749


 57%|█████▋    | 1323/2304 [4:03:31<4:44:26, 17.40s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0896


 57%|█████▋    | 1324/2304 [4:03:55<5:12:46, 19.15s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0733


 58%|█████▊    | 1325/2304 [4:04:18<5:32:38, 20.39s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0761


 58%|█████▊    | 1326/2304 [4:04:42<5:48:28, 21.38s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0887


 58%|█████▊    | 1327/2304 [4:04:57<5:18:49, 19.58s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0914


 58%|█████▊    | 1328/2304 [4:05:12<4:56:37, 18.24s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0856


 58%|█████▊    | 1329/2304 [4:05:28<4:42:52, 17.41s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0817


 58%|█████▊    | 1330/2304 [4:05:51<5:12:37, 19.26s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0729


 58%|█████▊    | 1331/2304 [4:06:14<5:30:54, 20.41s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0840


 58%|█████▊    | 1332/2304 [4:06:38<5:45:41, 21.34s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0796


 58%|█████▊    | 1333/2304 [4:06:53<5:17:43, 19.63s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0837


 58%|█████▊    | 1334/2304 [4:07:09<4:56:25, 18.34s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0958


 58%|█████▊    | 1335/2304 [4:07:24<4:41:34, 17.43s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0835


 58%|█████▊    | 1336/2304 [4:07:47<5:10:36, 19.25s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0849


 58%|█████▊    | 1337/2304 [4:08:10<5:27:50, 20.34s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0839


 58%|█████▊    | 1338/2304 [4:08:34<5:42:52, 21.30s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0918


 58%|█████▊    | 1339/2304 [4:08:49<5:13:54, 19.52s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0711


 58%|█████▊    | 1340/2304 [4:09:04<4:51:29, 18.14s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0813


 58%|█████▊    | 1341/2304 [4:09:20<4:37:40, 17.30s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0796


 58%|█████▊    | 1342/2304 [4:09:43<5:06:37, 19.12s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0773


 58%|█████▊    | 1343/2304 [4:10:06<5:24:53, 20.28s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0991


 58%|█████▊    | 1344/2304 [4:10:29<5:39:12, 21.20s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0758


 58%|█████▊    | 1345/2304 [4:10:45<5:11:16, 19.47s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0643


 58%|█████▊    | 1346/2304 [4:11:00<4:49:45, 18.15s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0816


 58%|█████▊    | 1347/2304 [4:11:15<4:36:43, 17.35s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0833


 59%|█████▊    | 1348/2304 [4:11:39<5:05:51, 19.20s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0682


 59%|█████▊    | 1349/2304 [4:12:02<5:24:09, 20.37s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0729


 59%|█████▊    | 1350/2304 [4:12:26<5:40:00, 21.38s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0677


 59%|█████▊    | 1351/2304 [4:12:41<5:10:09, 19.53s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0746


 59%|█████▊    | 1352/2304 [4:12:56<4:47:56, 18.15s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0845


 59%|█████▊    | 1353/2304 [4:13:11<4:34:36, 17.33s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0843


 59%|█████▉    | 1354/2304 [4:13:35<5:03:17, 19.16s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0676


 59%|█████▉    | 1355/2304 [4:13:57<5:20:26, 20.26s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0730


 59%|█████▉    | 1356/2304 [4:14:21<5:36:14, 21.28s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0687


 59%|█████▉    | 1357/2304 [4:14:36<5:07:20, 19.47s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0725


 59%|█████▉    | 1358/2304 [4:14:51<4:46:22, 18.16s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0874


 59%|█████▉    | 1359/2304 [4:15:07<4:32:39, 17.31s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0779


 59%|█████▉    | 1360/2304 [4:15:31<5:03:31, 19.29s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0856


 59%|█████▉    | 1361/2304 [4:15:54<5:21:23, 20.45s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0812


 59%|█████▉    | 1362/2304 [4:16:18<5:37:31, 21.50s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0777


 59%|█████▉    | 1363/2304 [4:16:33<5:09:12, 19.72s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0688


 59%|█████▉    | 1364/2304 [4:16:48<4:46:27, 18.28s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0824


 59%|█████▉    | 1365/2304 [4:17:04<4:34:16, 17.53s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0741


 59%|█████▉    | 1366/2304 [4:17:28<5:04:50, 19.50s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0667


 59%|█████▉    | 1367/2304 [4:17:52<5:24:27, 20.78s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0694


 59%|█████▉    | 1368/2304 [4:18:16<5:41:01, 21.86s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1007


 59%|█████▉    | 1369/2304 [4:18:32<5:12:20, 20.04s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0876


 59%|█████▉    | 1370/2304 [4:18:48<4:51:39, 18.74s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1342


 60%|█████▉    | 1371/2304 [4:19:04<4:38:21, 17.90s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0789


 60%|█████▉    | 1372/2304 [4:19:28<5:08:21, 19.85s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0682


 60%|█████▉    | 1373/2304 [4:19:51<5:24:26, 20.91s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0678


 60%|█████▉    | 1374/2304 [4:20:16<5:39:38, 21.91s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0674


 60%|█████▉    | 1375/2304 [4:20:32<5:12:11, 20.16s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0825


 60%|█████▉    | 1376/2304 [4:20:47<4:48:30, 18.65s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0763


 60%|█████▉    | 1377/2304 [4:21:02<4:32:47, 17.66s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0990


 60%|█████▉    | 1378/2304 [4:21:26<5:02:23, 19.59s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0674


 60%|█████▉    | 1379/2304 [4:21:50<5:21:06, 20.83s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0661


 60%|█████▉    | 1380/2304 [4:22:14<5:36:54, 21.88s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0701


 60%|█████▉    | 1381/2304 [4:22:30<5:07:10, 19.97s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0747


 60%|█████▉    | 1382/2304 [4:22:45<4:44:20, 18.50s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0848


 60%|██████    | 1383/2304 [4:23:01<4:30:43, 17.64s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0825


 60%|██████    | 1384/2304 [4:23:25<5:00:14, 19.58s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0875


 60%|██████    | 1385/2304 [4:23:48<5:17:08, 20.71s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0803


 60%|██████    | 1386/2304 [4:24:12<5:33:51, 21.82s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0865


 60%|██████    | 1387/2304 [4:24:28<5:05:11, 19.97s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0789


 60%|██████    | 1388/2304 [4:24:43<4:43:39, 18.58s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0741


 60%|██████    | 1389/2304 [4:24:59<4:30:32, 17.74s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0788


 60%|██████    | 1390/2304 [4:25:24<5:01:55, 19.82s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0676


 60%|██████    | 1391/2304 [4:25:49<5:25:28, 21.39s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0793


 60%|██████    | 1392/2304 [4:26:14<5:43:29, 22.60s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0771


 60%|██████    | 1393/2304 [4:26:31<5:15:42, 20.79s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0791


 61%|██████    | 1394/2304 [4:26:47<4:53:16, 19.34s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0827


 61%|██████    | 1395/2304 [4:27:03<4:37:36, 18.32s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0836


 61%|██████    | 1396/2304 [4:27:28<5:07:43, 20.33s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0706


 61%|██████    | 1397/2304 [4:27:52<5:25:33, 21.54s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0730


 61%|██████    | 1398/2304 [4:28:17<5:39:26, 22.48s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1608


 61%|██████    | 1399/2304 [4:28:33<5:12:21, 20.71s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0751


 61%|██████    | 1400/2304 [4:28:49<4:50:22, 19.27s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0960


 61%|██████    | 1401/2304 [4:29:06<4:36:47, 18.39s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0806


 61%|██████    | 1402/2304 [4:29:31<5:07:52, 20.48s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0678


 61%|██████    | 1403/2304 [4:29:56<5:28:36, 21.88s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0781


 61%|██████    | 1404/2304 [4:30:22<5:45:07, 23.01s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0695


 61%|██████    | 1405/2304 [4:30:38<5:15:42, 21.07s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0823


 61%|██████    | 1406/2304 [4:30:55<4:53:36, 19.62s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0710


 61%|██████    | 1407/2304 [4:31:10<4:36:14, 18.48s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0961


 61%|██████    | 1408/2304 [4:31:35<5:03:28, 20.32s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0674


 61%|██████    | 1409/2304 [4:31:59<5:19:29, 21.42s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0712


 61%|██████    | 1410/2304 [4:32:24<5:32:29, 22.31s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0718


 61%|██████    | 1411/2304 [4:32:39<5:01:59, 20.29s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0732


 61%|██████▏   | 1412/2304 [4:32:54<4:39:17, 18.79s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0714


 61%|██████▏   | 1413/2304 [4:33:10<4:24:03, 17.78s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0683


 61%|██████▏   | 1414/2304 [4:33:33<4:49:57, 19.55s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0847


 61%|██████▏   | 1415/2304 [4:33:57<5:07:01, 20.72s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0689


 61%|██████▏   | 1416/2304 [4:34:21<5:21:07, 21.70s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0677


 62%|██████▏   | 1417/2304 [4:34:37<4:54:07, 19.90s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0857


 62%|██████▏   | 1418/2304 [4:34:52<4:32:20, 18.44s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0804


 62%|██████▏   | 1419/2304 [4:35:07<4:20:00, 17.63s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0745


 62%|██████▏   | 1420/2304 [4:35:32<4:49:08, 19.62s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0785


 62%|██████▏   | 1421/2304 [4:35:56<5:09:31, 21.03s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0674


 62%|██████▏   | 1422/2304 [4:36:21<5:25:16, 22.13s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0765


 62%|██████▏   | 1423/2304 [4:36:36<4:56:25, 20.19s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0703


 62%|██████▏   | 1424/2304 [4:36:51<4:33:33, 18.65s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0835


 62%|██████▏   | 1425/2304 [4:37:07<4:20:12, 17.76s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0866


 62%|██████▏   | 1426/2304 [4:37:31<4:47:12, 19.63s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0676


 62%|██████▏   | 1427/2304 [4:37:54<5:03:33, 20.77s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0678


 62%|██████▏   | 1428/2304 [4:38:18<5:16:49, 21.70s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0699


 62%|██████▏   | 1429/2304 [4:38:34<4:48:11, 19.76s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0805


 62%|██████▏   | 1430/2304 [4:38:49<4:27:48, 18.39s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0802


 62%|██████▏   | 1431/2304 [4:39:04<4:15:43, 17.58s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0779


 62%|██████▏   | 1432/2304 [4:39:28<4:42:52, 19.46s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0985


 62%|██████▏   | 1433/2304 [4:39:52<5:02:36, 20.85s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0929


 62%|██████▏   | 1434/2304 [4:40:17<5:17:50, 21.92s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0763


 62%|██████▏   | 1435/2304 [4:40:33<4:50:50, 20.08s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0786


 62%|██████▏   | 1436/2304 [4:40:48<4:29:58, 18.66s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0817


 62%|██████▏   | 1437/2304 [4:41:03<4:15:44, 17.70s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0758


 62%|██████▏   | 1438/2304 [4:41:27<4:43:07, 19.62s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0820


 62%|██████▏   | 1439/2304 [4:41:51<5:00:13, 20.82s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0826


 62%|██████▎   | 1440/2304 [4:42:15<5:14:27, 21.84s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0840


 63%|██████▎   | 1441/2304 [4:42:24<4:15:49, 17.79s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0786


 63%|██████▎   | 1442/2304 [4:42:32<3:34:10, 14.91s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0658


 63%|██████▎   | 1443/2304 [4:42:40<3:04:24, 12.85s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0843


 63%|██████▎   | 1444/2304 [4:42:52<3:02:05, 12.70s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0801


 63%|██████▎   | 1445/2304 [4:43:04<2:58:37, 12.48s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0726


 63%|██████▎   | 1446/2304 [4:43:17<2:58:26, 12.48s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0827


 63%|██████▎   | 1447/2304 [4:43:25<2:40:09, 11.21s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0872


 63%|██████▎   | 1448/2304 [4:43:33<2:26:47, 10.29s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0742


 63%|██████▎   | 1449/2304 [4:43:41<2:18:27,  9.72s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0696


 63%|██████▎   | 1450/2304 [4:43:54<2:31:02, 10.61s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0756


 63%|██████▎   | 1451/2304 [4:44:07<2:38:39, 11.16s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0759


 63%|██████▎   | 1452/2304 [4:44:19<2:44:48, 11.61s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0739


 63%|██████▎   | 1453/2304 [4:44:28<2:30:26, 10.61s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0711


 63%|██████▎   | 1454/2304 [4:44:36<2:19:50,  9.87s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0681


 63%|██████▎   | 1455/2304 [4:44:44<2:12:43,  9.38s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0743


 63%|██████▎   | 1456/2304 [4:44:56<2:25:23, 10.29s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0678


 63%|██████▎   | 1457/2304 [4:45:08<2:32:58, 10.84s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0682


 63%|██████▎   | 1458/2304 [4:45:21<2:39:21, 11.30s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0770


 63%|██████▎   | 1459/2304 [4:45:29<2:26:06, 10.37s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0716


 63%|██████▎   | 1460/2304 [4:45:37<2:16:12,  9.68s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0741


 63%|██████▎   | 1461/2304 [4:45:45<2:10:18,  9.27s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0829


 63%|██████▎   | 1462/2304 [4:45:58<2:23:13, 10.21s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0711


 63%|██████▎   | 1463/2304 [4:46:10<2:32:09, 10.86s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0668


 64%|██████▎   | 1464/2304 [4:46:23<2:40:32, 11.47s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0752


 64%|██████▎   | 1465/2304 [4:46:32<2:27:46, 10.57s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0750


 64%|██████▎   | 1466/2304 [4:46:40<2:17:50,  9.87s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0746


 64%|██████▎   | 1467/2304 [4:46:48<2:11:33,  9.43s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0894


 64%|██████▎   | 1468/2304 [4:47:01<2:23:58, 10.33s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0763


 64%|██████▍   | 1469/2304 [4:47:13<2:31:53, 10.91s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0766


 64%|██████▍   | 1470/2304 [4:47:25<2:38:03, 11.37s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0733


 64%|██████▍   | 1471/2304 [4:47:34<2:25:04, 10.45s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0783


 64%|██████▍   | 1472/2304 [4:47:42<2:15:34,  9.78s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0851


 64%|██████▍   | 1473/2304 [4:47:50<2:09:49,  9.37s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0754


 64%|██████▍   | 1474/2304 [4:48:03<2:22:10, 10.28s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0833


 64%|██████▍   | 1475/2304 [4:48:15<2:30:15, 10.87s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0756


 64%|██████▍   | 1476/2304 [4:48:28<2:38:10, 11.46s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0743


 64%|██████▍   | 1477/2304 [4:48:36<2:25:05, 10.53s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0775


 64%|██████▍   | 1478/2304 [4:48:44<2:15:10,  9.82s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0751


 64%|██████▍   | 1479/2304 [4:48:53<2:08:54,  9.38s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0725


 64%|██████▍   | 1480/2304 [4:49:05<2:22:27, 10.37s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0689


 64%|██████▍   | 1481/2304 [4:49:18<2:30:47, 10.99s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0702


 64%|██████▍   | 1482/2304 [4:49:30<2:37:36, 11.50s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0841


 64%|██████▍   | 1483/2304 [4:49:39<2:24:14, 10.54s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0696


 64%|██████▍   | 1484/2304 [4:49:47<2:14:06,  9.81s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0787


 64%|██████▍   | 1485/2304 [4:49:55<2:07:36,  9.35s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0742


 64%|██████▍   | 1486/2304 [4:50:08<2:21:19, 10.37s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0774


 65%|██████▍   | 1487/2304 [4:50:20<2:28:33, 10.91s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0914


 65%|██████▍   | 1488/2304 [4:50:33<2:35:20, 11.42s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0772


 65%|██████▍   | 1489/2304 [4:50:41<2:22:18, 10.48s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1097


 65%|██████▍   | 1490/2304 [4:50:49<2:11:44,  9.71s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0736


 65%|██████▍   | 1491/2304 [4:50:57<2:06:21,  9.33s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0732


 65%|██████▍   | 1492/2304 [4:51:10<2:19:19, 10.29s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1274


 65%|██████▍   | 1493/2304 [4:51:22<2:26:31, 10.84s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1193


 65%|██████▍   | 1494/2304 [4:51:34<2:32:35, 11.30s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0750


 65%|██████▍   | 1495/2304 [4:51:43<2:20:00, 10.38s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0732


 65%|██████▍   | 1496/2304 [4:51:50<2:09:40,  9.63s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0693


 65%|██████▍   | 1497/2304 [4:51:59<2:04:19,  9.24s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0734


 65%|██████▌   | 1498/2304 [4:52:11<2:17:27, 10.23s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0794


 65%|██████▌   | 1499/2304 [4:52:24<2:25:35, 10.85s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0706


 65%|██████▌   | 1500/2304 [4:52:36<2:32:05, 11.35s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0865


 65%|██████▌   | 1501/2304 [4:52:44<2:19:35, 10.43s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0766


 65%|██████▌   | 1502/2304 [4:52:52<2:09:54,  9.72s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0745


 65%|██████▌   | 1503/2304 [4:53:01<2:04:06,  9.30s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0707


 65%|██████▌   | 1504/2304 [4:53:13<2:16:27, 10.23s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0666


 65%|██████▌   | 1505/2304 [4:53:25<2:23:35, 10.78s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0735


 65%|██████▌   | 1506/2304 [4:53:38<2:30:03, 11.28s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0799


 65%|██████▌   | 1507/2304 [4:53:46<2:17:36, 10.36s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0687


 65%|██████▌   | 1508/2304 [4:53:54<2:08:07,  9.66s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0690


 65%|██████▌   | 1509/2304 [4:54:02<2:02:29,  9.24s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0694


 66%|██████▌   | 1510/2304 [4:54:15<2:14:25, 10.16s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0781


 66%|██████▌   | 1511/2304 [4:54:27<2:23:11, 10.83s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0752


 66%|██████▌   | 1512/2304 [4:54:39<2:28:55, 11.28s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0782


 66%|██████▌   | 1513/2304 [4:54:48<2:17:20, 10.42s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0847


 66%|██████▌   | 1514/2304 [4:54:56<2:08:26,  9.75s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0732


 66%|██████▌   | 1515/2304 [4:55:04<2:02:36,  9.32s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0862


 66%|██████▌   | 1516/2304 [4:55:17<2:14:43, 10.26s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0800


 66%|██████▌   | 1517/2304 [4:55:29<2:22:50, 10.89s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0808


 66%|██████▌   | 1518/2304 [4:55:42<2:29:17, 11.40s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0696


 66%|██████▌   | 1519/2304 [4:55:50<2:17:08, 10.48s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0778


 66%|██████▌   | 1520/2304 [4:55:58<2:07:34,  9.76s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0757


 66%|██████▌   | 1521/2304 [4:56:06<2:02:21,  9.38s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0704


 66%|██████▌   | 1522/2304 [4:56:19<2:15:06, 10.37s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0746


 66%|██████▌   | 1523/2304 [4:56:31<2:22:04, 10.91s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0701


 66%|██████▌   | 1524/2304 [4:56:44<2:28:40, 11.44s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0679


 66%|██████▌   | 1525/2304 [4:56:53<2:17:05, 10.56s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0789


 66%|██████▌   | 1526/2304 [4:57:01<2:08:11,  9.89s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0776


 66%|██████▋   | 1527/2304 [4:57:09<2:02:25,  9.45s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0718


 66%|██████▋   | 1528/2304 [4:57:22<2:14:53, 10.43s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0745


 66%|██████▋   | 1529/2304 [4:57:34<2:21:46, 10.98s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0804


 66%|██████▋   | 1530/2304 [4:57:47<2:27:52, 11.46s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0808


 66%|██████▋   | 1531/2304 [4:57:55<2:15:56, 10.55s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0748


 66%|██████▋   | 1532/2304 [4:58:04<2:06:50,  9.86s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0739


 67%|██████▋   | 1533/2304 [4:58:12<2:01:16,  9.44s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0759


 67%|██████▋   | 1534/2304 [4:58:24<2:12:57, 10.36s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0755


 67%|██████▋   | 1535/2304 [4:58:37<2:19:57, 10.92s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0694


 67%|██████▋   | 1536/2304 [4:58:49<2:26:28, 11.44s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0808


 67%|██████▋   | 1537/2304 [4:58:58<2:14:01, 10.48s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0787


 67%|██████▋   | 1538/2304 [4:59:06<2:04:34,  9.76s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0832


 67%|██████▋   | 1539/2304 [4:59:14<1:58:25,  9.29s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0816


 67%|██████▋   | 1540/2304 [4:59:26<2:10:43, 10.27s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0679


 67%|██████▋   | 1541/2304 [4:59:39<2:18:02, 10.85s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0768


 67%|██████▋   | 1542/2304 [4:59:51<2:24:17, 11.36s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0971


 67%|██████▋   | 1543/2304 [5:00:00<2:12:37, 10.46s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0709


 67%|██████▋   | 1544/2304 [5:00:08<2:04:26,  9.82s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0886


 67%|██████▋   | 1545/2304 [5:00:16<1:59:20,  9.43s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0838


 67%|██████▋   | 1546/2304 [5:00:29<2:11:30, 10.41s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0670


 67%|██████▋   | 1547/2304 [5:00:41<2:18:47, 11.00s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0717


 67%|██████▋   | 1548/2304 [5:00:54<2:24:09, 11.44s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0670


 67%|██████▋   | 1549/2304 [5:01:02<2:11:44, 10.47s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0699


 67%|██████▋   | 1550/2304 [5:01:10<2:01:54,  9.70s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0745


 67%|██████▋   | 1551/2304 [5:01:18<1:55:44,  9.22s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0728


 67%|██████▋   | 1552/2304 [5:01:30<2:06:58, 10.13s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0759


 67%|██████▋   | 1553/2304 [5:01:43<2:14:44, 10.77s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0757


 67%|██████▋   | 1554/2304 [5:01:55<2:22:08, 11.37s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0639


 67%|██████▋   | 1555/2304 [5:02:04<2:10:39, 10.47s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0715


 68%|██████▊   | 1556/2304 [5:02:12<2:01:42,  9.76s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0728


 68%|██████▊   | 1557/2304 [5:02:20<1:55:30,  9.28s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0762


 68%|██████▊   | 1558/2304 [5:02:32<2:06:53, 10.21s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0816


 68%|██████▊   | 1559/2304 [5:02:45<2:14:11, 10.81s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0751


 68%|██████▊   | 1560/2304 [5:02:57<2:19:42, 11.27s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0701


 68%|██████▊   | 1561/2304 [5:03:05<2:08:46, 10.40s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0894


 68%|██████▊   | 1562/2304 [5:03:13<1:59:56,  9.70s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0804


 68%|██████▊   | 1563/2304 [5:03:22<1:54:52,  9.30s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0713


 68%|██████▊   | 1564/2304 [5:03:34<2:07:15, 10.32s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0722


 68%|██████▊   | 1565/2304 [5:03:47<2:14:18, 10.90s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0789


 68%|██████▊   | 1566/2304 [5:03:59<2:19:27, 11.34s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0740


 68%|██████▊   | 1567/2304 [5:04:07<2:07:51, 10.41s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0738


 68%|██████▊   | 1568/2304 [5:04:15<1:58:58,  9.70s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0805


 68%|██████▊   | 1569/2304 [5:04:24<1:53:32,  9.27s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0799


 68%|██████▊   | 1570/2304 [5:04:36<2:05:40, 10.27s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0792


 68%|██████▊   | 1571/2304 [5:04:49<2:13:27, 10.92s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.2402


 68%|██████▊   | 1572/2304 [5:05:01<2:19:22, 11.42s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0776


 68%|██████▊   | 1573/2304 [5:05:10<2:07:33, 10.47s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0823


 68%|██████▊   | 1574/2304 [5:05:18<1:58:56,  9.78s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0818


 68%|██████▊   | 1575/2304 [5:05:26<1:53:36,  9.35s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0728


 68%|██████▊   | 1576/2304 [5:05:39<2:04:56, 10.30s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0748


 68%|██████▊   | 1577/2304 [5:05:51<2:11:57, 10.89s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0775


 68%|██████▊   | 1578/2304 [5:06:04<2:18:38, 11.46s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0849


 69%|██████▊   | 1579/2304 [5:06:12<2:07:16, 10.53s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0762


 69%|██████▊   | 1580/2304 [5:06:20<1:58:19,  9.81s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0838


 69%|██████▊   | 1581/2304 [5:06:28<1:52:22,  9.33s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0738


 69%|██████▊   | 1582/2304 [5:06:41<2:02:48, 10.21s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0757


 69%|██████▊   | 1583/2304 [5:06:53<2:10:15, 10.84s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0749


 69%|██████▉   | 1584/2304 [5:07:06<2:16:39, 11.39s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0714


 69%|██████▉   | 1585/2304 [5:07:14<2:04:52, 10.42s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0738


 69%|██████▉   | 1586/2304 [5:07:22<1:55:47,  9.68s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0705


 69%|██████▉   | 1587/2304 [5:07:30<1:50:17,  9.23s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0725


 69%|██████▉   | 1588/2304 [5:07:42<2:01:20, 10.17s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0672


 69%|██████▉   | 1589/2304 [5:07:54<2:08:28, 10.78s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0711


 69%|██████▉   | 1590/2304 [5:08:07<2:15:09, 11.36s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0706


 69%|██████▉   | 1591/2304 [5:08:15<2:03:53, 10.43s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0734


 69%|██████▉   | 1592/2304 [5:08:23<1:54:51,  9.68s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0803


 69%|██████▉   | 1593/2304 [5:08:32<1:49:29,  9.24s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0781


 69%|██████▉   | 1594/2304 [5:08:44<2:00:05, 10.15s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0671


 69%|██████▉   | 1595/2304 [5:08:56<2:07:02, 10.75s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0782


 69%|██████▉   | 1596/2304 [5:09:08<2:12:56, 11.27s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0667


 69%|██████▉   | 1597/2304 [5:09:17<2:02:35, 10.40s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0836


 69%|██████▉   | 1598/2304 [5:09:25<1:54:00,  9.69s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0661


 69%|██████▉   | 1599/2304 [5:09:33<1:48:43,  9.25s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0798


 69%|██████▉   | 1600/2304 [5:09:46<1:59:58, 10.23s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1241


 69%|██████▉   | 1601/2304 [5:09:58<2:07:54, 10.92s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0752


 70%|██████▉   | 1602/2304 [5:10:11<2:14:02, 11.46s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0824


 70%|██████▉   | 1603/2304 [5:10:19<2:03:02, 10.53s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0787


 70%|██████▉   | 1604/2304 [5:10:27<1:54:43,  9.83s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0772


 70%|██████▉   | 1605/2304 [5:10:36<1:48:57,  9.35s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0766


 70%|██████▉   | 1606/2304 [5:10:48<1:59:09, 10.24s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0736


 70%|██████▉   | 1607/2304 [5:11:00<2:05:27, 10.80s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0836


 70%|██████▉   | 1608/2304 [5:11:13<2:11:59, 11.38s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0776


 70%|██████▉   | 1609/2304 [5:11:21<2:01:12, 10.46s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0860


 70%|██████▉   | 1610/2304 [5:11:29<1:53:21,  9.80s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0824


 70%|██████▉   | 1611/2304 [5:11:38<1:48:02,  9.35s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0850


 70%|██████▉   | 1612/2304 [5:11:50<1:58:12, 10.25s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0674


 70%|███████   | 1613/2304 [5:12:02<2:04:45, 10.83s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0671


 70%|███████   | 1614/2304 [5:12:15<2:10:08, 11.32s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0669


 70%|███████   | 1615/2304 [5:12:23<1:58:56, 10.36s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0754


 70%|███████   | 1616/2304 [5:12:31<1:51:07,  9.69s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0835


 70%|███████   | 1617/2304 [5:12:39<1:46:38,  9.31s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0812


 70%|███████   | 1618/2304 [5:12:52<1:57:35, 10.29s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0824


 70%|███████   | 1619/2304 [5:13:04<2:05:08, 10.96s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0665


 70%|███████   | 1620/2304 [5:13:17<2:10:42, 11.46s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0674


 70%|███████   | 1621/2304 [5:13:26<2:00:56, 10.62s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0852


 70%|███████   | 1622/2304 [5:13:34<1:53:10,  9.96s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0686


 70%|███████   | 1623/2304 [5:13:43<1:47:44,  9.49s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0737


 70%|███████   | 1624/2304 [5:13:55<1:58:46, 10.48s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0726


 71%|███████   | 1625/2304 [5:14:08<2:05:34, 11.10s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0765


 71%|███████   | 1626/2304 [5:14:21<2:11:15, 11.62s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0698


 71%|███████   | 1627/2304 [5:14:29<2:00:03, 10.64s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0769


 71%|███████   | 1628/2304 [5:14:37<1:51:48,  9.92s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0688


 71%|███████   | 1629/2304 [5:14:46<1:46:08,  9.43s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0762


 71%|███████   | 1630/2304 [5:14:58<1:56:34, 10.38s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0731


 71%|███████   | 1631/2304 [5:15:11<2:02:58, 10.96s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0790


 71%|███████   | 1632/2304 [5:15:23<2:08:27, 11.47s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0794


 71%|███████   | 1633/2304 [5:15:31<1:57:18, 10.49s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0873


 71%|███████   | 1634/2304 [5:15:39<1:49:14,  9.78s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0870


 71%|███████   | 1635/2304 [5:15:48<1:43:35,  9.29s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0813


 71%|███████   | 1636/2304 [5:16:00<1:53:11, 10.17s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0809


 71%|███████   | 1637/2304 [5:16:12<1:59:35, 10.76s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0678


 71%|███████   | 1638/2304 [5:16:24<2:04:50, 11.25s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0712


 71%|███████   | 1639/2304 [5:16:32<1:54:14, 10.31s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0822


 71%|███████   | 1640/2304 [5:16:41<1:46:57,  9.67s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0628


 71%|███████   | 1641/2304 [5:16:49<1:41:57,  9.23s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0761


 71%|███████▏  | 1642/2304 [5:17:01<1:52:06, 10.16s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0814


 71%|███████▏  | 1643/2304 [5:17:14<1:59:01, 10.80s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0674


 71%|███████▏  | 1644/2304 [5:17:26<2:04:48, 11.35s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0676


 71%|███████▏  | 1645/2304 [5:17:34<1:53:49, 10.36s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0819


 71%|███████▏  | 1646/2304 [5:17:42<1:45:34,  9.63s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0804


 71%|███████▏  | 1647/2304 [5:17:50<1:40:58,  9.22s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0836


 72%|███████▏  | 1648/2304 [5:18:03<1:51:26, 10.19s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0710


 72%|███████▏  | 1649/2304 [5:18:15<1:57:39, 10.78s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0656


 72%|███████▏  | 1650/2304 [5:18:27<2:03:03, 11.29s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1595


 72%|███████▏  | 1651/2304 [5:18:36<1:52:51, 10.37s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0808


 72%|███████▏  | 1652/2304 [5:18:44<1:45:36,  9.72s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0828


 72%|███████▏  | 1653/2304 [5:18:52<1:41:15,  9.33s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0745


 72%|███████▏  | 1654/2304 [5:19:05<1:51:41, 10.31s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0724


 72%|███████▏  | 1655/2304 [5:19:17<1:58:33, 10.96s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0720


 72%|███████▏  | 1656/2304 [5:19:30<2:03:48, 11.46s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0736


 72%|███████▏  | 1657/2304 [5:19:38<1:53:02, 10.48s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0695


 72%|███████▏  | 1658/2304 [5:19:46<1:45:13,  9.77s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0892


 72%|███████▏  | 1659/2304 [5:19:55<1:40:37,  9.36s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0745


 72%|███████▏  | 1660/2304 [5:20:07<1:50:54, 10.33s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0834


 72%|███████▏  | 1661/2304 [5:20:20<1:57:15, 10.94s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0677


 72%|███████▏  | 1662/2304 [5:20:32<2:02:02, 11.41s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0677


 72%|███████▏  | 1663/2304 [5:20:40<1:51:23, 10.43s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0874


 72%|███████▏  | 1664/2304 [5:20:48<1:43:59,  9.75s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0667


 72%|███████▏  | 1665/2304 [5:20:57<1:39:01,  9.30s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0777


 72%|███████▏  | 1666/2304 [5:21:09<1:48:36, 10.21s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0692


 72%|███████▏  | 1667/2304 [5:21:21<1:55:10, 10.85s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0676


 72%|███████▏  | 1668/2304 [5:21:34<2:01:18, 11.44s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0813


 72%|███████▏  | 1669/2304 [5:21:43<1:51:44, 10.56s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0954


 72%|███████▏  | 1670/2304 [5:21:51<1:45:17,  9.96s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0779


 73%|███████▎  | 1671/2304 [5:22:00<1:40:11,  9.50s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0810


 73%|███████▎  | 1672/2304 [5:22:12<1:49:47, 10.42s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0915


 73%|███████▎  | 1673/2304 [5:22:25<1:56:02, 11.03s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0788


 73%|███████▎  | 1674/2304 [5:22:37<1:59:50, 11.41s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0730


 73%|███████▎  | 1675/2304 [5:22:45<1:49:28, 10.44s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0787


 73%|███████▎  | 1676/2304 [5:22:53<1:41:54,  9.74s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0748


 73%|███████▎  | 1677/2304 [5:23:02<1:37:11,  9.30s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0753


 73%|███████▎  | 1678/2304 [5:23:14<1:46:49, 10.24s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0777


 73%|███████▎  | 1679/2304 [5:23:26<1:52:39, 10.82s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0745


 73%|███████▎  | 1680/2304 [5:23:39<1:57:13, 11.27s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0781


 73%|███████▎  | 1681/2304 [5:23:47<1:47:06, 10.32s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0869


 73%|███████▎  | 1682/2304 [5:23:55<1:39:32,  9.60s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0672


 73%|███████▎  | 1683/2304 [5:24:03<1:34:41,  9.15s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0809


 73%|███████▎  | 1684/2304 [5:24:15<1:43:59, 10.06s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0672


 73%|███████▎  | 1685/2304 [5:24:27<1:49:21, 10.60s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0671


 73%|███████▎  | 1686/2304 [5:24:39<1:54:49, 11.15s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0673


 73%|███████▎  | 1687/2304 [5:24:47<1:45:48, 10.29s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0709


 73%|███████▎  | 1688/2304 [5:24:55<1:38:41,  9.61s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0677


 73%|███████▎  | 1689/2304 [5:25:04<1:33:58,  9.17s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0845


 73%|███████▎  | 1690/2304 [5:25:16<1:43:12, 10.09s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0675


 73%|███████▎  | 1691/2304 [5:25:28<1:49:39, 10.73s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0680


 73%|███████▎  | 1692/2304 [5:25:41<1:55:09, 11.29s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0762


 73%|███████▎  | 1693/2304 [5:25:49<1:46:14, 10.43s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0718


 74%|███████▎  | 1694/2304 [5:25:57<1:38:32,  9.69s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0766


 74%|███████▎  | 1695/2304 [5:26:06<1:34:54,  9.35s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0742


 74%|███████▎  | 1696/2304 [5:26:18<1:44:48, 10.34s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0692


 74%|███████▎  | 1697/2304 [5:26:30<1:50:10, 10.89s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0689


 74%|███████▎  | 1698/2304 [5:26:43<1:53:45, 11.26s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0996


 74%|███████▎  | 1699/2304 [5:26:51<1:44:14, 10.34s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0774


 74%|███████▍  | 1700/2304 [5:26:59<1:36:58,  9.63s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0728


 74%|███████▍  | 1701/2304 [5:27:07<1:32:36,  9.21s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0704


 74%|███████▍  | 1702/2304 [5:27:19<1:41:56, 10.16s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0723


 74%|███████▍  | 1703/2304 [5:27:31<1:47:07, 10.69s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0798


 74%|███████▍  | 1704/2304 [5:27:44<1:51:52, 11.19s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0778


 74%|███████▍  | 1705/2304 [5:27:52<1:43:20, 10.35s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0689


 74%|███████▍  | 1706/2304 [5:28:00<1:36:08,  9.65s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0846


 74%|███████▍  | 1707/2304 [5:28:08<1:31:47,  9.22s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0673


 74%|███████▍  | 1708/2304 [5:28:21<1:40:41, 10.14s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0671


 74%|███████▍  | 1709/2304 [5:28:33<1:47:06, 10.80s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0675


 74%|███████▍  | 1710/2304 [5:28:45<1:52:11, 11.33s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0675


 74%|███████▍  | 1711/2304 [5:28:54<1:43:13, 10.44s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0764


 74%|███████▍  | 1712/2304 [5:29:02<1:36:14,  9.75s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0741


 74%|███████▍  | 1713/2304 [5:29:10<1:31:59,  9.34s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0731


 74%|███████▍  | 1714/2304 [5:29:23<1:41:21, 10.31s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0675


 74%|███████▍  | 1715/2304 [5:29:35<1:46:33, 10.86s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0675


 74%|███████▍  | 1716/2304 [5:29:48<1:51:24, 11.37s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0672


 75%|███████▍  | 1717/2304 [5:29:56<1:42:09, 10.44s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0851


 75%|███████▍  | 1718/2304 [5:30:04<1:35:28,  9.77s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0865


 75%|███████▍  | 1719/2304 [5:30:13<1:31:33,  9.39s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0777


 75%|███████▍  | 1720/2304 [5:30:25<1:41:38, 10.44s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0715


 75%|███████▍  | 1721/2304 [5:30:38<1:48:37, 11.18s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0671


 75%|███████▍  | 1722/2304 [5:30:51<1:53:51, 11.74s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0693


 75%|███████▍  | 1723/2304 [5:31:00<1:44:43, 10.82s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0807


 75%|███████▍  | 1724/2304 [5:31:08<1:37:18, 10.07s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0741


 75%|███████▍  | 1725/2304 [5:31:17<1:32:33,  9.59s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0846


 75%|███████▍  | 1726/2304 [5:31:30<1:43:06, 10.70s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0934


 75%|███████▍  | 1727/2304 [5:31:43<1:48:10, 11.25s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0808


 75%|███████▌  | 1728/2304 [5:31:56<1:52:49, 11.75s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0682


 75%|███████▌  | 1729/2304 [5:32:00<1:32:26,  9.65s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0719


 75%|███████▌  | 1730/2304 [5:32:05<1:17:49,  8.14s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0775


 75%|███████▌  | 1731/2304 [5:32:10<1:08:32,  7.18s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0742


 75%|███████▌  | 1732/2304 [5:32:17<1:07:47,  7.11s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0824


 75%|███████▌  | 1733/2304 [5:32:24<1:07:34,  7.10s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0688


 75%|███████▌  | 1734/2304 [5:32:31<1:06:46,  7.03s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0772


 75%|███████▌  | 1735/2304 [5:32:36<1:00:24,  6.37s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0743


 75%|███████▌  | 1736/2304 [5:32:40<55:44,  5.89s/it]  

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0696


 75%|███████▌  | 1737/2304 [5:32:45<52:29,  5.55s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0702


 75%|███████▌  | 1738/2304 [5:32:52<57:15,  6.07s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0776


 75%|███████▌  | 1739/2304 [5:32:59<59:28,  6.32s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0669


 76%|███████▌  | 1740/2304 [5:33:06<1:01:14,  6.52s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0916


 76%|███████▌  | 1741/2304 [5:33:11<56:51,  6.06s/it]  

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0705


 76%|███████▌  | 1742/2304 [5:33:16<53:01,  5.66s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0687


 76%|███████▌  | 1743/2304 [5:33:21<50:52,  5.44s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0675


 76%|███████▌  | 1744/2304 [5:33:28<55:39,  5.96s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0745


 76%|███████▌  | 1745/2304 [5:33:35<58:00,  6.23s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0761


 76%|███████▌  | 1746/2304 [5:33:42<1:00:12,  6.47s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0724


 76%|███████▌  | 1747/2304 [5:33:47<55:27,  5.97s/it]  

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0680


 76%|███████▌  | 1748/2304 [5:33:52<51:52,  5.60s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0666


 76%|███████▌  | 1749/2304 [5:33:57<49:59,  5.41s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0690


 76%|███████▌  | 1750/2304 [5:34:04<54:24,  5.89s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0666


 76%|███████▌  | 1751/2304 [5:34:10<56:37,  6.14s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0708


 76%|███████▌  | 1752/2304 [5:34:17<58:38,  6.37s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0860


 76%|███████▌  | 1753/2304 [5:34:22<54:09,  5.90s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0835


 76%|███████▌  | 1754/2304 [5:34:27<51:05,  5.57s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0678


 76%|███████▌  | 1755/2304 [5:34:32<48:40,  5.32s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0740


 76%|███████▌  | 1756/2304 [5:34:38<53:03,  5.81s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0720


 76%|███████▋  | 1757/2304 [5:34:45<55:59,  6.14s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0833


 76%|███████▋  | 1758/2304 [5:34:52<57:50,  6.36s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0841


 76%|███████▋  | 1759/2304 [5:34:57<53:34,  5.90s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0736


 76%|███████▋  | 1760/2304 [5:35:02<50:42,  5.59s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0753


 76%|███████▋  | 1761/2304 [5:35:07<48:33,  5.37s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0726


 76%|███████▋  | 1762/2304 [5:35:14<53:19,  5.90s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0926


 77%|███████▋  | 1763/2304 [5:35:21<55:02,  6.10s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0720


 77%|███████▋  | 1764/2304 [5:35:27<56:43,  6.30s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0734


 77%|███████▋  | 1765/2304 [5:35:32<52:41,  5.86s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1135


 77%|███████▋  | 1766/2304 [5:35:37<49:18,  5.50s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0700


 77%|███████▋  | 1767/2304 [5:35:42<47:33,  5.31s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0737


 77%|███████▋  | 1768/2304 [5:35:49<51:48,  5.80s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0732


 77%|███████▋  | 1769/2304 [5:35:55<54:22,  6.10s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0700


 77%|███████▋  | 1770/2304 [5:36:03<57:26,  6.45s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0762


 77%|███████▋  | 1771/2304 [5:36:08<53:09,  5.98s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0677


 77%|███████▋  | 1772/2304 [5:36:12<49:51,  5.62s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0717


 77%|███████▋  | 1773/2304 [5:36:17<47:50,  5.41s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0660


 77%|███████▋  | 1774/2304 [5:36:24<52:02,  5.89s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0726


 77%|███████▋  | 1775/2304 [5:36:31<54:42,  6.21s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0800


 77%|███████▋  | 1776/2304 [5:36:38<56:41,  6.44s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0745


 77%|███████▋  | 1777/2304 [5:36:43<52:16,  5.95s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0709


 77%|███████▋  | 1778/2304 [5:36:48<49:26,  5.64s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0721


 77%|███████▋  | 1779/2304 [5:36:53<46:55,  5.36s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0698


 77%|███████▋  | 1780/2304 [5:36:59<50:42,  5.81s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0682


 77%|███████▋  | 1781/2304 [5:37:06<53:44,  6.17s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0695


 77%|███████▋  | 1782/2304 [5:37:13<55:50,  6.42s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0837


 77%|███████▋  | 1783/2304 [5:37:18<51:21,  5.91s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0789


 77%|███████▋  | 1784/2304 [5:37:23<47:55,  5.53s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0702


 77%|███████▋  | 1785/2304 [5:37:28<45:41,  5.28s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0779


 78%|███████▊  | 1786/2304 [5:37:35<49:59,  5.79s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0749


 78%|███████▊  | 1787/2304 [5:37:41<52:22,  6.08s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0745


 78%|███████▊  | 1788/2304 [5:37:48<54:42,  6.36s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0847


 78%|███████▊  | 1789/2304 [5:37:53<50:39,  5.90s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0716


 78%|███████▊  | 1790/2304 [5:37:58<47:13,  5.51s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0662


 78%|███████▊  | 1791/2304 [5:38:03<46:16,  5.41s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0700


 78%|███████▊  | 1792/2304 [5:38:10<50:37,  5.93s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0705


 78%|███████▊  | 1793/2304 [5:38:17<53:29,  6.28s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0679


 78%|███████▊  | 1794/2304 [5:38:25<56:09,  6.61s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0762


 78%|███████▊  | 1795/2304 [5:38:29<51:17,  6.05s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0722


 78%|███████▊  | 1796/2304 [5:38:34<47:43,  5.64s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0738


 78%|███████▊  | 1797/2304 [5:38:39<46:19,  5.48s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0694


 78%|███████▊  | 1798/2304 [5:38:46<49:45,  5.90s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0841


 78%|███████▊  | 1799/2304 [5:38:53<52:39,  6.26s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0695


 78%|███████▊  | 1800/2304 [5:39:00<54:32,  6.49s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0753


 78%|███████▊  | 1801/2304 [5:39:05<50:30,  6.02s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0903


 78%|███████▊  | 1802/2304 [5:39:10<47:26,  5.67s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0709


 78%|███████▊  | 1803/2304 [5:39:15<45:38,  5.47s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0901


 78%|███████▊  | 1804/2304 [5:39:22<49:36,  5.95s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0761


 78%|███████▊  | 1805/2304 [5:39:29<52:54,  6.36s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0689


 78%|███████▊  | 1806/2304 [5:39:36<54:32,  6.57s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0833


 78%|███████▊  | 1807/2304 [5:39:41<50:24,  6.09s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0752


 78%|███████▊  | 1808/2304 [5:39:46<46:45,  5.66s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0731


 79%|███████▊  | 1809/2304 [5:39:51<44:53,  5.44s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0709


 79%|███████▊  | 1810/2304 [5:39:58<49:07,  5.97s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0902


 79%|███████▊  | 1811/2304 [5:40:05<51:14,  6.24s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0676


 79%|███████▊  | 1812/2304 [5:40:12<53:01,  6.47s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.2050


 79%|███████▊  | 1813/2304 [5:40:17<48:51,  5.97s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0764


 79%|███████▊  | 1814/2304 [5:40:21<45:34,  5.58s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0719


 79%|███████▉  | 1815/2304 [5:40:26<43:55,  5.39s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0755


 79%|███████▉  | 1816/2304 [5:40:33<47:31,  5.84s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0823


 79%|███████▉  | 1817/2304 [5:40:40<50:36,  6.24s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0777


 79%|███████▉  | 1818/2304 [5:40:48<53:26,  6.60s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0713


 79%|███████▉  | 1819/2304 [5:40:53<49:34,  6.13s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0733


 79%|███████▉  | 1820/2304 [5:40:58<46:56,  5.82s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0710


 79%|███████▉  | 1821/2304 [5:41:03<45:18,  5.63s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0713


 79%|███████▉  | 1822/2304 [5:41:10<48:38,  6.05s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0757


 79%|███████▉  | 1823/2304 [5:41:17<50:44,  6.33s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0742


 79%|███████▉  | 1824/2304 [5:41:24<51:54,  6.49s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0710


 79%|███████▉  | 1825/2304 [5:41:29<47:34,  5.96s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0844


 79%|███████▉  | 1826/2304 [5:41:34<44:50,  5.63s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0741


 79%|███████▉  | 1827/2304 [5:41:38<42:37,  5.36s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0731


 79%|███████▉  | 1828/2304 [5:41:45<46:14,  5.83s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0933


 79%|███████▉  | 1829/2304 [5:41:52<48:15,  6.10s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0673


 79%|███████▉  | 1830/2304 [5:41:59<49:44,  6.30s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0718


 79%|███████▉  | 1831/2304 [5:42:04<46:08,  5.85s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0727


 80%|███████▉  | 1832/2304 [5:42:08<43:11,  5.49s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0768


 80%|███████▉  | 1833/2304 [5:42:13<41:02,  5.23s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0783


 80%|███████▉  | 1834/2304 [5:42:20<44:40,  5.70s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0808


 80%|███████▉  | 1835/2304 [5:42:26<46:56,  6.01s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0668


 80%|███████▉  | 1836/2304 [5:42:33<49:10,  6.30s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0673


 80%|███████▉  | 1837/2304 [5:42:38<45:23,  5.83s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0770


 80%|███████▉  | 1838/2304 [5:42:43<42:30,  5.47s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0744


 80%|███████▉  | 1839/2304 [5:42:48<40:57,  5.29s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0779


 80%|███████▉  | 1840/2304 [5:42:55<44:48,  5.79s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0730


 80%|███████▉  | 1841/2304 [5:43:02<47:33,  6.16s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0688


 80%|███████▉  | 1842/2304 [5:43:09<49:55,  6.48s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0709


 80%|███████▉  | 1843/2304 [5:43:14<45:58,  5.98s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0717


 80%|████████  | 1844/2304 [5:43:19<43:27,  5.67s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0744


 80%|████████  | 1845/2304 [5:43:23<41:35,  5.44s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0694


 80%|████████  | 1846/2304 [5:43:31<45:13,  5.93s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1002


 80%|████████  | 1847/2304 [5:43:38<47:58,  6.30s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1066


 80%|████████  | 1848/2304 [5:43:45<49:32,  6.52s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0746


 80%|████████  | 1849/2304 [5:43:50<46:00,  6.07s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0743


 80%|████████  | 1850/2304 [5:43:54<42:54,  5.67s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0690


 80%|████████  | 1851/2304 [5:43:59<40:49,  5.41s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0784


 80%|████████  | 1852/2304 [5:44:06<44:36,  5.92s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0698


 80%|████████  | 1853/2304 [5:44:13<46:28,  6.18s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0672


 80%|████████  | 1854/2304 [5:44:20<47:47,  6.37s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0812


 81%|████████  | 1855/2304 [5:44:25<44:27,  5.94s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0724


 81%|████████  | 1856/2304 [5:44:29<41:14,  5.52s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0772


 81%|████████  | 1857/2304 [5:44:34<39:43,  5.33s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0722


 81%|████████  | 1858/2304 [5:44:41<43:19,  5.83s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0845


 81%|████████  | 1859/2304 [5:44:48<45:50,  6.18s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0660


 81%|████████  | 1860/2304 [5:44:56<48:18,  6.53s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0700


 81%|████████  | 1861/2304 [5:45:01<44:50,  6.07s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0691


 81%|████████  | 1862/2304 [5:45:05<41:45,  5.67s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0717


 81%|████████  | 1863/2304 [5:45:10<40:18,  5.48s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0799


 81%|████████  | 1864/2304 [5:45:18<43:46,  5.97s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0849


 81%|████████  | 1865/2304 [5:45:25<46:07,  6.30s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0681


 81%|████████  | 1866/2304 [5:45:32<47:28,  6.50s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0806


 81%|████████  | 1867/2304 [5:45:37<43:46,  6.01s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0703


 81%|████████  | 1868/2304 [5:45:41<41:00,  5.64s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0686


 81%|████████  | 1869/2304 [5:45:46<39:02,  5.39s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0763


 81%|████████  | 1870/2304 [5:45:53<42:19,  5.85s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0737


 81%|████████  | 1871/2304 [5:46:00<44:44,  6.20s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0731


 81%|████████▏ | 1872/2304 [5:46:07<46:07,  6.41s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0704


 81%|████████▏ | 1873/2304 [5:46:12<42:50,  5.96s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0910


 81%|████████▏ | 1874/2304 [5:46:17<39:54,  5.57s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0753


 81%|████████▏ | 1875/2304 [5:46:21<37:48,  5.29s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0750


 81%|████████▏ | 1876/2304 [5:46:28<41:25,  5.81s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0677


 81%|████████▏ | 1877/2304 [5:46:35<43:23,  6.10s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0676


 82%|████████▏ | 1878/2304 [5:46:42<45:23,  6.39s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0676


 82%|████████▏ | 1879/2304 [5:46:47<41:33,  5.87s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0765


 82%|████████▏ | 1880/2304 [5:46:51<39:01,  5.52s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0971


 82%|████████▏ | 1881/2304 [5:46:56<37:32,  5.33s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0722


 82%|████████▏ | 1882/2304 [5:47:03<41:05,  5.84s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0681


 82%|████████▏ | 1883/2304 [5:47:10<42:42,  6.09s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0673


 82%|████████▏ | 1884/2304 [5:47:17<44:59,  6.43s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0780


 82%|████████▏ | 1885/2304 [5:47:22<41:21,  5.92s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0669


 82%|████████▏ | 1886/2304 [5:47:27<39:07,  5.62s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0715


 82%|████████▏ | 1887/2304 [5:47:32<37:18,  5.37s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0702


 82%|████████▏ | 1888/2304 [5:47:39<40:33,  5.85s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0890


 82%|████████▏ | 1889/2304 [5:47:45<42:37,  6.16s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0676


 82%|████████▏ | 1890/2304 [5:47:53<44:35,  6.46s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0861


 82%|████████▏ | 1891/2304 [5:47:57<41:07,  5.97s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0745


 82%|████████▏ | 1892/2304 [5:48:02<38:59,  5.68s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0772


 82%|████████▏ | 1893/2304 [5:48:07<37:04,  5.41s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0693


 82%|████████▏ | 1894/2304 [5:48:14<40:29,  5.93s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0745


 82%|████████▏ | 1895/2304 [5:48:21<42:04,  6.17s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0726


 82%|████████▏ | 1896/2304 [5:48:28<43:49,  6.45s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0758


 82%|████████▏ | 1897/2304 [5:48:33<40:41,  6.00s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0668


 82%|████████▏ | 1898/2304 [5:48:38<38:04,  5.63s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0827


 82%|████████▏ | 1899/2304 [5:48:43<36:17,  5.38s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0771


 82%|████████▏ | 1900/2304 [5:48:50<39:42,  5.90s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0835


 83%|████████▎ | 1901/2304 [5:48:56<41:06,  6.12s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0638


 83%|████████▎ | 1902/2304 [5:49:04<42:56,  6.41s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0676


 83%|████████▎ | 1903/2304 [5:49:08<39:31,  5.91s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0772


 83%|████████▎ | 1904/2304 [5:49:13<37:04,  5.56s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0717


 83%|████████▎ | 1905/2304 [5:49:18<36:00,  5.41s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0752


 83%|████████▎ | 1906/2304 [5:49:25<39:09,  5.90s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0676


 83%|████████▎ | 1907/2304 [5:49:32<40:56,  6.19s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0869


 83%|████████▎ | 1908/2304 [5:49:39<43:10,  6.54s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0941


 83%|████████▎ | 1909/2304 [5:49:44<39:37,  6.02s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0683


 83%|████████▎ | 1910/2304 [5:49:49<37:23,  5.69s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0788


 83%|████████▎ | 1911/2304 [5:49:54<35:35,  5.43s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0782


 83%|████████▎ | 1912/2304 [5:50:01<38:13,  5.85s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0759


 83%|████████▎ | 1913/2304 [5:50:08<40:16,  6.18s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0700


 83%|████████▎ | 1914/2304 [5:50:15<41:48,  6.43s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0900


 83%|████████▎ | 1915/2304 [5:50:20<38:29,  5.94s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0731


 83%|████████▎ | 1916/2304 [5:50:24<36:22,  5.63s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0700


 83%|████████▎ | 1917/2304 [5:50:29<34:35,  5.36s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0767


 83%|████████▎ | 1918/2304 [5:50:36<37:45,  5.87s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0680


 83%|████████▎ | 1919/2304 [5:50:43<39:28,  6.15s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0705


 83%|████████▎ | 1920/2304 [5:50:50<41:09,  6.43s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0828


 83%|████████▎ | 1921/2304 [5:50:55<38:19,  6.00s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0738


 83%|████████▎ | 1922/2304 [5:51:00<35:51,  5.63s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0679


 83%|████████▎ | 1923/2304 [5:51:05<35:04,  5.52s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0850


 84%|████████▎ | 1924/2304 [5:51:12<37:46,  5.96s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0676


 84%|████████▎ | 1925/2304 [5:51:19<39:17,  6.22s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0685


 84%|████████▎ | 1926/2304 [5:51:26<41:12,  6.54s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0707


 84%|████████▎ | 1927/2304 [5:51:31<38:01,  6.05s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0789


 84%|████████▎ | 1928/2304 [5:51:36<35:35,  5.68s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0761


 84%|████████▎ | 1929/2304 [5:51:41<34:03,  5.45s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0754


 84%|████████▍ | 1930/2304 [5:51:48<36:27,  5.85s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1546


 84%|████████▍ | 1931/2304 [5:51:55<38:27,  6.19s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0687


 84%|████████▍ | 1932/2304 [5:52:02<39:36,  6.39s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0626


 84%|████████▍ | 1933/2304 [5:52:06<36:35,  5.92s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0720


 84%|████████▍ | 1934/2304 [5:52:11<34:28,  5.59s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0861


 84%|████████▍ | 1935/2304 [5:52:16<32:55,  5.35s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0708


 84%|████████▍ | 1936/2304 [5:52:23<35:44,  5.83s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0922


 84%|████████▍ | 1937/2304 [5:52:30<37:30,  6.13s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0772


 84%|████████▍ | 1938/2304 [5:52:37<38:50,  6.37s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0916


 84%|████████▍ | 1939/2304 [5:52:42<36:07,  5.94s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0680


 84%|████████▍ | 1940/2304 [5:52:46<33:39,  5.55s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0768


 84%|████████▍ | 1941/2304 [5:52:51<32:05,  5.30s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0804


 84%|████████▍ | 1942/2304 [5:52:58<35:08,  5.82s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0715


 84%|████████▍ | 1943/2304 [5:53:05<36:49,  6.12s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0644


 84%|████████▍ | 1944/2304 [5:53:12<38:06,  6.35s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0738


 84%|████████▍ | 1945/2304 [5:53:17<35:30,  5.93s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0784


 84%|████████▍ | 1946/2304 [5:53:21<33:12,  5.57s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0758


 85%|████████▍ | 1947/2304 [5:53:26<31:48,  5.35s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0718


 85%|████████▍ | 1948/2304 [5:53:33<34:27,  5.81s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0761


 85%|████████▍ | 1949/2304 [5:53:40<36:00,  6.09s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0686


 85%|████████▍ | 1950/2304 [5:53:47<37:42,  6.39s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0711


 85%|████████▍ | 1951/2304 [5:53:52<34:49,  5.92s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0720


 85%|████████▍ | 1952/2304 [5:53:56<32:37,  5.56s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0788


 85%|████████▍ | 1953/2304 [5:54:01<31:27,  5.38s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0743


 85%|████████▍ | 1954/2304 [5:54:08<34:09,  5.86s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0748


 85%|████████▍ | 1955/2304 [5:54:15<35:59,  6.19s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0688


 85%|████████▍ | 1956/2304 [5:54:22<37:11,  6.41s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0679


 85%|████████▍ | 1957/2304 [5:54:27<34:11,  5.91s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0653


 85%|████████▍ | 1958/2304 [5:54:32<32:11,  5.58s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0882


 85%|████████▌ | 1959/2304 [5:54:37<30:47,  5.36s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0754


 85%|████████▌ | 1960/2304 [5:54:44<33:33,  5.85s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0780


 85%|████████▌ | 1961/2304 [5:54:51<35:07,  6.15s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0666


 85%|████████▌ | 1962/2304 [5:54:57<36:25,  6.39s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0683


 85%|████████▌ | 1963/2304 [5:55:02<33:57,  5.97s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0710


 85%|████████▌ | 1964/2304 [5:55:07<31:47,  5.61s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0931


 85%|████████▌ | 1965/2304 [5:55:12<30:18,  5.36s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0738


 85%|████████▌ | 1966/2304 [5:55:19<33:16,  5.91s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0801


 85%|████████▌ | 1967/2304 [5:55:26<34:45,  6.19s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0721


 85%|████████▌ | 1968/2304 [5:55:33<36:18,  6.48s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0812


 85%|████████▌ | 1969/2304 [5:55:38<33:31,  6.00s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0944


 86%|████████▌ | 1970/2304 [5:55:43<31:06,  5.59s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0807


 86%|████████▌ | 1971/2304 [5:55:48<29:42,  5.35s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0708


 86%|████████▌ | 1972/2304 [5:55:54<31:55,  5.77s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0675


 86%|████████▌ | 1973/2304 [5:56:01<33:29,  6.07s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0690


 86%|████████▌ | 1974/2304 [5:56:08<34:57,  6.36s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0673


 86%|████████▌ | 1975/2304 [5:56:13<32:12,  5.87s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0791


 86%|████████▌ | 1976/2304 [5:56:18<30:16,  5.54s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0676


 86%|████████▌ | 1977/2304 [5:56:22<28:53,  5.30s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0728


 86%|████████▌ | 1978/2304 [5:56:29<31:21,  5.77s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0678


 86%|████████▌ | 1979/2304 [5:56:36<33:27,  6.18s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0693


 86%|████████▌ | 1980/2304 [5:56:43<34:15,  6.34s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0844


 86%|████████▌ | 1981/2304 [5:56:48<31:41,  5.89s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0711


 86%|████████▌ | 1982/2304 [5:56:53<29:49,  5.56s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0855


 86%|████████▌ | 1983/2304 [5:56:57<28:32,  5.33s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0918


 86%|████████▌ | 1984/2304 [5:57:05<31:31,  5.91s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0905


 86%|████████▌ | 1985/2304 [5:57:12<32:50,  6.18s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0676


 86%|████████▌ | 1986/2304 [5:57:18<33:58,  6.41s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0789


 86%|████████▌ | 1987/2304 [5:57:23<31:36,  5.98s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0692


 86%|████████▋ | 1988/2304 [5:57:28<29:18,  5.56s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0773


 86%|████████▋ | 1989/2304 [5:57:33<27:56,  5.32s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0687


 86%|████████▋ | 1990/2304 [5:57:40<30:42,  5.87s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0790


 86%|████████▋ | 1991/2304 [5:57:47<32:01,  6.14s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0921


 86%|████████▋ | 1992/2304 [5:57:54<33:22,  6.42s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0716


 87%|████████▋ | 1993/2304 [5:57:59<30:46,  5.94s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0963


 87%|████████▋ | 1994/2304 [5:58:03<28:48,  5.57s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0689


 87%|████████▋ | 1995/2304 [5:58:08<27:37,  5.36s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0668


 87%|████████▋ | 1996/2304 [5:58:15<30:07,  5.87s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0678


 87%|████████▋ | 1997/2304 [5:58:22<31:59,  6.25s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0686


 87%|████████▋ | 1998/2304 [5:58:29<33:07,  6.49s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0754


 87%|████████▋ | 1999/2304 [5:58:34<30:27,  5.99s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0710


 87%|████████▋ | 2000/2304 [5:58:39<28:42,  5.67s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0691


 87%|████████▋ | 2001/2304 [5:58:44<27:19,  5.41s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0738


 87%|████████▋ | 2002/2304 [5:58:51<29:33,  5.87s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0683


 87%|████████▋ | 2003/2304 [5:58:58<30:52,  6.16s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0684


 87%|████████▋ | 2004/2304 [5:59:05<32:16,  6.46s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0681


 87%|████████▋ | 2005/2304 [5:59:10<29:45,  5.97s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0813


 87%|████████▋ | 2006/2304 [5:59:14<27:48,  5.60s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0821


 87%|████████▋ | 2007/2304 [5:59:19<26:30,  5.36s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0742


 87%|████████▋ | 2008/2304 [5:59:26<29:00,  5.88s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0716


 87%|████████▋ | 2009/2304 [5:59:33<30:08,  6.13s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0682


 87%|████████▋ | 2010/2304 [5:59:40<31:19,  6.39s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0678


 87%|████████▋ | 2011/2304 [5:59:45<28:57,  5.93s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0709


 87%|████████▋ | 2012/2304 [5:59:50<27:07,  5.57s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0769


 87%|████████▋ | 2013/2304 [5:59:55<26:03,  5.37s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0761


 87%|████████▋ | 2014/2304 [6:00:02<28:17,  5.85s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0846


 87%|████████▋ | 2015/2304 [6:00:08<29:23,  6.10s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0711


 88%|████████▊ | 2016/2304 [6:00:15<30:49,  6.42s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0778


 88%|████████▊ | 2017/2304 [6:00:18<25:39,  5.36s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0693


 88%|████████▊ | 2018/2304 [6:00:21<22:21,  4.69s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0704


 88%|████████▊ | 2019/2304 [6:00:25<20:00,  4.21s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0688


 88%|████████▊ | 2020/2304 [6:00:29<19:59,  4.22s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0667


 88%|████████▊ | 2021/2304 [6:00:33<19:50,  4.21s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0685


 88%|████████▊ | 2022/2304 [6:00:37<20:08,  4.28s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0696


 88%|████████▊ | 2023/2304 [6:00:40<18:17,  3.90s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0717


 88%|████████▊ | 2024/2304 [6:00:44<17:06,  3.67s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0691


 88%|████████▊ | 2025/2304 [6:00:47<16:06,  3.46s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0704


 88%|████████▊ | 2026/2304 [6:00:51<17:15,  3.72s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0698


 88%|████████▊ | 2027/2304 [6:00:55<17:45,  3.85s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0687


 88%|████████▊ | 2028/2304 [6:00:59<18:14,  3.97s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0673


 88%|████████▊ | 2029/2304 [6:01:02<16:50,  3.68s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0694


 88%|████████▊ | 2030/2304 [6:01:05<15:56,  3.49s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0694


 88%|████████▊ | 2031/2304 [6:01:08<15:06,  3.32s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0682


 88%|████████▊ | 2032/2304 [6:01:13<16:22,  3.61s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0691


 88%|████████▊ | 2033/2304 [6:01:17<16:58,  3.76s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0687


 88%|████████▊ | 2034/2304 [6:01:21<17:31,  3.89s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0667


 88%|████████▊ | 2035/2304 [6:01:24<16:11,  3.61s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0689


 88%|████████▊ | 2036/2304 [6:01:27<15:26,  3.46s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0702


 88%|████████▊ | 2037/2304 [6:01:30<14:40,  3.30s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0694


 88%|████████▊ | 2038/2304 [6:01:34<15:50,  3.57s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0672


 88%|████████▊ | 2039/2304 [6:01:38<16:43,  3.79s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0707


 89%|████████▊ | 2040/2304 [6:01:43<17:13,  3.92s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0694


 89%|████████▊ | 2041/2304 [6:01:46<15:57,  3.64s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0701


 89%|████████▊ | 2042/2304 [6:01:49<15:14,  3.49s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0736


 89%|████████▊ | 2043/2304 [6:01:52<14:44,  3.39s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0706


 89%|████████▊ | 2044/2304 [6:01:56<15:30,  3.58s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0751


 89%|████████▉ | 2045/2304 [6:02:00<16:13,  3.76s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0705


 89%|████████▉ | 2046/2304 [6:02:04<16:58,  3.95s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0678


 89%|████████▉ | 2047/2304 [6:02:07<15:39,  3.65s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0719


 89%|████████▉ | 2048/2304 [6:02:10<14:48,  3.47s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0685


 89%|████████▉ | 2049/2304 [6:02:13<14:08,  3.33s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0693


 89%|████████▉ | 2050/2304 [6:02:18<15:08,  3.58s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0731


 89%|████████▉ | 2051/2304 [6:02:22<15:44,  3.73s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0835


 89%|████████▉ | 2052/2304 [6:02:26<16:21,  3.89s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0706


 89%|████████▉ | 2053/2304 [6:02:29<15:06,  3.61s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0698


 89%|████████▉ | 2054/2304 [6:02:32<14:21,  3.44s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0701


 89%|████████▉ | 2055/2304 [6:02:35<13:43,  3.31s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0682


 89%|████████▉ | 2056/2304 [6:02:39<14:57,  3.62s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0700


 89%|████████▉ | 2057/2304 [6:02:43<15:32,  3.78s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0689


 89%|████████▉ | 2058/2304 [6:02:48<16:04,  3.92s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0673


 89%|████████▉ | 2059/2304 [6:02:51<14:51,  3.64s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0692


 89%|████████▉ | 2060/2304 [6:02:54<14:03,  3.46s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0661


 89%|████████▉ | 2061/2304 [6:02:57<13:23,  3.31s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0720


 89%|████████▉ | 2062/2304 [6:03:01<14:36,  3.62s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0673


 90%|████████▉ | 2063/2304 [6:03:05<15:20,  3.82s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0677


 90%|████████▉ | 2064/2304 [6:03:10<15:51,  3.96s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0732


 90%|████████▉ | 2065/2304 [6:03:13<14:40,  3.68s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0698


 90%|████████▉ | 2066/2304 [6:03:16<13:58,  3.52s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0725


 90%|████████▉ | 2067/2304 [6:03:19<13:15,  3.36s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0683


 90%|████████▉ | 2068/2304 [6:03:23<14:14,  3.62s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0708


 90%|████████▉ | 2069/2304 [6:03:27<14:57,  3.82s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0701


 90%|████████▉ | 2070/2304 [6:03:31<15:23,  3.95s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0722


 90%|████████▉ | 2071/2304 [6:03:34<14:08,  3.64s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0676


 90%|████████▉ | 2072/2304 [6:03:37<13:24,  3.47s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0739


 90%|████████▉ | 2073/2304 [6:03:40<12:43,  3.31s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0692


 90%|█████████ | 2074/2304 [6:03:45<13:39,  3.56s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0707


 90%|█████████ | 2075/2304 [6:03:49<14:19,  3.75s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0693


 90%|█████████ | 2076/2304 [6:03:53<14:40,  3.86s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1211


 90%|█████████ | 2077/2304 [6:03:56<13:43,  3.63s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0679


 90%|█████████ | 2078/2304 [6:03:59<13:01,  3.46s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0695


 90%|█████████ | 2079/2304 [6:04:02<12:25,  3.32s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0734


 90%|█████████ | 2080/2304 [6:04:06<13:23,  3.59s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0704


 90%|█████████ | 2081/2304 [6:04:10<13:56,  3.75s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0691


 90%|█████████ | 2082/2304 [6:04:15<14:20,  3.88s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0697


 90%|█████████ | 2083/2304 [6:04:18<13:24,  3.64s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0688


 90%|█████████ | 2084/2304 [6:04:21<12:39,  3.45s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0709


 90%|█████████ | 2085/2304 [6:04:24<12:00,  3.29s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0690


 91%|█████████ | 2086/2304 [6:04:28<12:58,  3.57s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0702


 91%|█████████ | 2087/2304 [6:04:32<13:31,  3.74s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0699


 91%|█████████ | 2088/2304 [6:04:36<13:51,  3.85s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0710


 91%|█████████ | 2089/2304 [6:04:39<12:59,  3.63s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0710


 91%|█████████ | 2090/2304 [6:04:42<12:12,  3.42s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0725


 91%|█████████ | 2091/2304 [6:04:45<11:51,  3.34s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0681


 91%|█████████ | 2092/2304 [6:04:50<12:49,  3.63s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0729


 91%|█████████ | 2093/2304 [6:04:54<13:23,  3.81s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0689


 91%|█████████ | 2094/2304 [6:04:58<13:38,  3.90s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0700


 91%|█████████ | 2095/2304 [6:05:01<12:51,  3.69s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0767


 91%|█████████ | 2096/2304 [6:05:04<12:14,  3.53s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0728


 91%|█████████ | 2097/2304 [6:05:07<11:38,  3.37s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0738


 91%|█████████ | 2098/2304 [6:05:12<12:32,  3.65s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0770


 91%|█████████ | 2099/2304 [6:05:16<13:09,  3.85s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0679


 91%|█████████ | 2100/2304 [6:05:20<13:26,  3.95s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0759


 91%|█████████ | 2101/2304 [6:05:23<12:36,  3.73s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0702


 91%|█████████ | 2102/2304 [6:05:26<11:50,  3.52s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0706


 91%|█████████▏| 2103/2304 [6:05:30<11:31,  3.44s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0715


 91%|█████████▏| 2104/2304 [6:05:34<12:22,  3.71s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0691


 91%|█████████▏| 2105/2304 [6:05:38<12:41,  3.83s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0747


 91%|█████████▏| 2106/2304 [6:05:42<13:07,  3.98s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0700


 91%|█████████▏| 2107/2304 [6:05:45<12:13,  3.72s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0678


 91%|█████████▏| 2108/2304 [6:05:48<11:24,  3.49s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0690


 92%|█████████▏| 2109/2304 [6:05:52<11:00,  3.39s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0704


 92%|█████████▏| 2110/2304 [6:05:56<11:48,  3.65s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0726


 92%|█████████▏| 2111/2304 [6:06:00<12:08,  3.77s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0705


 92%|█████████▏| 2112/2304 [6:06:04<12:37,  3.94s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0736


 92%|█████████▏| 2113/2304 [6:06:07<11:49,  3.71s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0737


 92%|█████████▏| 2114/2304 [6:06:10<10:59,  3.47s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0680


 92%|█████████▏| 2115/2304 [6:06:14<10:41,  3.39s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0698


 92%|█████████▏| 2116/2304 [6:06:18<11:29,  3.67s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0757


 92%|█████████▏| 2117/2304 [6:06:22<11:48,  3.79s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0680


 92%|█████████▏| 2118/2304 [6:06:26<12:17,  3.96s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0736


 92%|█████████▏| 2119/2304 [6:06:29<11:26,  3.71s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0734


 92%|█████████▏| 2120/2304 [6:06:32<10:42,  3.49s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0710


 92%|█████████▏| 2121/2304 [6:06:36<10:21,  3.40s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0755


 92%|█████████▏| 2122/2304 [6:06:40<10:54,  3.60s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0721


 92%|█████████▏| 2123/2304 [6:06:44<11:22,  3.77s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0682


 92%|█████████▏| 2124/2304 [6:06:48<11:43,  3.91s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0736


 92%|█████████▏| 2125/2304 [6:06:51<10:57,  3.68s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0696


 92%|█████████▏| 2126/2304 [6:06:54<10:16,  3.46s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0713


 92%|█████████▏| 2127/2304 [6:06:57<09:57,  3.37s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0691


 92%|█████████▏| 2128/2304 [6:07:01<10:38,  3.63s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0718


 92%|█████████▏| 2129/2304 [6:07:06<11:01,  3.78s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0715


 92%|█████████▏| 2130/2304 [6:07:10<11:16,  3.89s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0707


 92%|█████████▏| 2131/2304 [6:07:13<10:21,  3.59s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0744


 93%|█████████▎| 2132/2304 [6:07:16<09:50,  3.43s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0708


 93%|█████████▎| 2133/2304 [6:07:19<09:31,  3.34s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0736


 93%|█████████▎| 2134/2304 [6:07:23<10:02,  3.54s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0715


 93%|█████████▎| 2135/2304 [6:07:27<10:30,  3.73s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0701


 93%|█████████▎| 2136/2304 [6:07:31<10:51,  3.88s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0694


 93%|█████████▎| 2137/2304 [6:07:34<10:07,  3.64s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0684


 93%|█████████▎| 2138/2304 [6:07:37<09:28,  3.43s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0710


 93%|█████████▎| 2139/2304 [6:07:40<09:09,  3.33s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0765


 93%|█████████▎| 2140/2304 [6:07:44<09:43,  3.56s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0745


 93%|█████████▎| 2141/2304 [6:07:49<10:13,  3.76s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0721


 93%|█████████▎| 2142/2304 [6:07:53<10:31,  3.90s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0769


 93%|█████████▎| 2143/2304 [6:07:56<09:47,  3.65s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0728


 93%|█████████▎| 2144/2304 [6:07:59<09:18,  3.49s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0722


 93%|█████████▎| 2145/2304 [6:08:02<09:05,  3.43s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0796


 93%|█████████▎| 2146/2304 [6:08:06<09:34,  3.63s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0733


 93%|█████████▎| 2147/2304 [6:08:11<09:55,  3.79s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0713


 93%|█████████▎| 2148/2304 [6:08:15<10:15,  3.94s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0713


 93%|█████████▎| 2149/2304 [6:08:18<09:24,  3.64s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0793


 93%|█████████▎| 2150/2304 [6:08:21<08:52,  3.46s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0736


 93%|█████████▎| 2151/2304 [6:08:24<08:27,  3.31s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0712


 93%|█████████▎| 2152/2304 [6:08:28<09:04,  3.58s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0723


 93%|█████████▎| 2153/2304 [6:08:32<09:26,  3.75s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0727


 93%|█████████▎| 2154/2304 [6:08:37<09:45,  3.90s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0676


 94%|█████████▎| 2155/2304 [6:08:39<08:59,  3.62s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0721


 94%|█████████▎| 2156/2304 [6:08:43<08:30,  3.45s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0709


 94%|█████████▎| 2157/2304 [6:08:45<08:04,  3.29s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0701


 94%|█████████▎| 2158/2304 [6:08:50<08:41,  3.57s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0735


 94%|█████████▎| 2159/2304 [6:08:54<09:00,  3.73s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0745


 94%|█████████▍| 2160/2304 [6:08:58<09:18,  3.88s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0740


 94%|█████████▍| 2161/2304 [6:09:01<08:36,  3.61s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0829


 94%|█████████▍| 2162/2304 [6:09:04<08:05,  3.42s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0744


 94%|█████████▍| 2163/2304 [6:09:07<07:40,  3.27s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0763


 94%|█████████▍| 2164/2304 [6:09:11<08:14,  3.54s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0786


 94%|█████████▍| 2165/2304 [6:09:15<08:36,  3.72s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0687


 94%|█████████▍| 2166/2304 [6:09:19<08:44,  3.80s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0840


 94%|█████████▍| 2167/2304 [6:09:22<08:08,  3.57s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0733


 94%|█████████▍| 2168/2304 [6:09:25<07:41,  3.40s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0737


 94%|█████████▍| 2169/2304 [6:09:28<07:17,  3.24s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0753


 94%|█████████▍| 2170/2304 [6:09:32<07:50,  3.51s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0798


 94%|█████████▍| 2171/2304 [6:09:36<08:11,  3.69s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0680


 94%|█████████▍| 2172/2304 [6:09:40<08:18,  3.78s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0768


 94%|█████████▍| 2173/2304 [6:09:43<07:44,  3.55s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0727


 94%|█████████▍| 2174/2304 [6:09:46<07:18,  3.37s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0729


 94%|█████████▍| 2175/2304 [6:09:49<06:58,  3.24s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0727


 94%|█████████▍| 2176/2304 [6:09:53<07:27,  3.50s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0712


 94%|█████████▍| 2177/2304 [6:09:57<07:44,  3.66s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0707


 95%|█████████▍| 2178/2304 [6:10:02<08:03,  3.84s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0716


 95%|█████████▍| 2179/2304 [6:10:05<07:27,  3.58s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0746


 95%|█████████▍| 2180/2304 [6:10:08<07:03,  3.42s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0730


 95%|█████████▍| 2181/2304 [6:10:11<06:43,  3.28s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0710


 95%|█████████▍| 2182/2304 [6:10:15<07:17,  3.59s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0692


 95%|█████████▍| 2183/2304 [6:10:19<07:34,  3.76s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0698


 95%|█████████▍| 2184/2304 [6:10:23<07:52,  3.93s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0736


 95%|█████████▍| 2185/2304 [6:10:26<07:15,  3.66s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0774


 95%|█████████▍| 2186/2304 [6:10:29<06:49,  3.47s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0732


 95%|█████████▍| 2187/2304 [6:10:32<06:29,  3.33s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0758


 95%|█████████▍| 2188/2304 [6:10:37<07:01,  3.63s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0850


 95%|█████████▌| 2189/2304 [6:10:41<07:13,  3.77s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0693


 95%|█████████▌| 2190/2304 [6:10:45<07:23,  3.89s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0707


 95%|█████████▌| 2191/2304 [6:10:48<06:47,  3.60s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0762


 95%|█████████▌| 2192/2304 [6:10:51<06:23,  3.43s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0675


 95%|█████████▌| 2193/2304 [6:10:54<06:01,  3.26s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0731


 95%|█████████▌| 2194/2304 [6:10:58<06:27,  3.52s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0708


 95%|█████████▌| 2195/2304 [6:11:02<06:42,  3.69s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0677


 95%|█████████▌| 2196/2304 [6:11:06<06:55,  3.84s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0841


 95%|█████████▌| 2197/2304 [6:11:09<06:21,  3.57s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0730


 95%|█████████▌| 2198/2304 [6:11:12<06:02,  3.42s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0771


 95%|█████████▌| 2199/2304 [6:11:15<05:47,  3.31s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0812


 95%|█████████▌| 2200/2304 [6:11:19<06:02,  3.48s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0782


 96%|█████████▌| 2201/2304 [6:11:23<06:20,  3.69s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0711


 96%|█████████▌| 2202/2304 [6:11:27<06:28,  3.81s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0734


 96%|█████████▌| 2203/2304 [6:11:30<05:55,  3.52s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0744


 96%|█████████▌| 2204/2304 [6:11:33<05:39,  3.39s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0746


 96%|█████████▌| 2205/2304 [6:11:36<05:22,  3.26s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0733


 96%|█████████▌| 2206/2304 [6:11:41<05:46,  3.54s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0769


 96%|█████████▌| 2207/2304 [6:11:45<06:02,  3.74s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0711


 96%|█████████▌| 2208/2304 [6:11:49<06:12,  3.89s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0726


 96%|█████████▌| 2209/2304 [6:11:52<05:39,  3.58s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0746


 96%|█████████▌| 2210/2304 [6:11:55<05:21,  3.42s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0705


 96%|█████████▌| 2211/2304 [6:11:58<05:05,  3.29s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0879


 96%|█████████▌| 2212/2304 [6:12:02<05:27,  3.56s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0749


 96%|█████████▌| 2213/2304 [6:12:06<05:40,  3.74s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0692


 96%|█████████▌| 2214/2304 [6:12:10<05:49,  3.88s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0814


 96%|█████████▌| 2215/2304 [6:12:13<05:20,  3.61s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0758


 96%|█████████▌| 2216/2304 [6:12:16<05:01,  3.42s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0680


 96%|█████████▌| 2217/2304 [6:12:19<04:44,  3.27s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0736


 96%|█████████▋| 2218/2304 [6:12:23<05:05,  3.55s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0695


 96%|█████████▋| 2219/2304 [6:12:28<05:15,  3.72s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0692


 96%|█████████▋| 2220/2304 [6:12:32<05:19,  3.80s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0770


 96%|█████████▋| 2221/2304 [6:12:35<04:59,  3.60s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0834


 96%|█████████▋| 2222/2304 [6:12:38<04:42,  3.44s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0677


 96%|█████████▋| 2223/2304 [6:12:41<04:24,  3.27s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0753


 97%|█████████▋| 2224/2304 [6:12:45<04:42,  3.53s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0800


 97%|█████████▋| 2225/2304 [6:12:49<04:54,  3.73s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0719


 97%|█████████▋| 2226/2304 [6:12:53<04:57,  3.81s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0712


 97%|█████████▋| 2227/2304 [6:12:56<04:36,  3.59s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0719


 97%|█████████▋| 2228/2304 [6:12:59<04:18,  3.40s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0767


 97%|█████████▋| 2229/2304 [6:13:02<04:09,  3.33s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0704


 97%|█████████▋| 2230/2304 [6:13:06<04:24,  3.58s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0805


 97%|█████████▋| 2231/2304 [6:13:10<04:33,  3.74s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0924


 97%|█████████▋| 2232/2304 [6:13:15<04:37,  3.85s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0703


 97%|█████████▋| 2233/2304 [6:13:18<04:19,  3.66s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0807


 97%|█████████▋| 2234/2304 [6:13:21<04:02,  3.47s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0751


 97%|█████████▋| 2235/2304 [6:13:24<03:56,  3.43s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0764


 97%|█████████▋| 2236/2304 [6:13:28<04:09,  3.67s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0792


 97%|█████████▋| 2237/2304 [6:13:32<04:12,  3.77s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0692


 97%|█████████▋| 2238/2304 [6:13:37<04:22,  3.97s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0697


 97%|█████████▋| 2239/2304 [6:13:40<04:03,  3.74s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0785


 97%|█████████▋| 2240/2304 [6:13:43<03:46,  3.53s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0703


 97%|█████████▋| 2241/2304 [6:13:46<03:36,  3.44s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0780


 97%|█████████▋| 2242/2304 [6:13:51<03:48,  3.68s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0697


 97%|█████████▋| 2243/2304 [6:13:55<03:53,  3.84s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0693


 97%|█████████▋| 2244/2304 [6:13:59<03:54,  3.91s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0711


 97%|█████████▋| 2245/2304 [6:14:02<03:39,  3.72s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0703


 97%|█████████▋| 2246/2304 [6:14:05<03:24,  3.53s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0721


 98%|█████████▊| 2247/2304 [6:14:08<03:15,  3.44s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0739


 98%|█████████▊| 2248/2304 [6:14:13<03:27,  3.71s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0746


 98%|█████████▊| 2249/2304 [6:14:17<03:32,  3.86s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0716


 98%|█████████▊| 2250/2304 [6:14:21<03:32,  3.93s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0675


 98%|█████████▊| 2251/2304 [6:14:24<03:18,  3.75s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0727


 98%|█████████▊| 2252/2304 [6:14:27<03:03,  3.53s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0719


 98%|█████████▊| 2253/2304 [6:14:31<02:55,  3.44s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0784


 98%|█████████▊| 2254/2304 [6:14:35<03:04,  3.69s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0796


 98%|█████████▊| 2255/2304 [6:14:39<03:07,  3.83s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0719


 98%|█████████▊| 2256/2304 [6:14:43<03:10,  3.97s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0769


 98%|█████████▊| 2257/2304 [6:14:47<02:55,  3.73s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0803


 98%|█████████▊| 2258/2304 [6:14:49<02:40,  3.48s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0710


 98%|█████████▊| 2259/2304 [6:14:52<02:30,  3.34s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0774


 98%|█████████▊| 2260/2304 [6:14:57<02:37,  3.57s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0704


 98%|█████████▊| 2261/2304 [6:15:01<02:38,  3.69s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0702


 98%|█████████▊| 2262/2304 [6:15:05<02:40,  3.82s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0876


 98%|█████████▊| 2263/2304 [6:15:08<02:27,  3.59s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0698


 98%|█████████▊| 2264/2304 [6:15:11<02:15,  3.38s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0759


 98%|█████████▊| 2265/2304 [6:15:14<02:08,  3.30s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0821


 98%|█████████▊| 2266/2304 [6:15:18<02:14,  3.54s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0797


 98%|█████████▊| 2267/2304 [6:15:22<02:14,  3.64s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0696


 98%|█████████▊| 2268/2304 [6:15:26<02:17,  3.83s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0845


 98%|█████████▊| 2269/2304 [6:15:29<02:05,  3.58s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0743


 99%|█████████▊| 2270/2304 [6:15:32<01:53,  3.34s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0670


 99%|█████████▊| 2271/2304 [6:15:35<01:47,  3.25s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0795


 99%|█████████▊| 2272/2304 [6:15:39<01:52,  3.53s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0782


 99%|█████████▊| 2273/2304 [6:15:43<01:52,  3.63s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0749


 99%|█████████▊| 2274/2304 [6:15:47<01:53,  3.79s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0774


 99%|█████████▊| 2275/2304 [6:15:50<01:43,  3.57s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0741


 99%|█████████▉| 2276/2304 [6:15:53<01:33,  3.35s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0741


 99%|█████████▉| 2277/2304 [6:15:56<01:27,  3.24s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0741


 99%|█████████▉| 2278/2304 [6:16:00<01:31,  3.50s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0765


 99%|█████████▉| 2279/2304 [6:16:04<01:30,  3.63s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0674


 99%|█████████▉| 2280/2304 [6:16:08<01:30,  3.78s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0709


 99%|█████████▉| 2281/2304 [6:16:11<01:23,  3.62s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0739


 99%|█████████▉| 2282/2304 [6:16:14<01:15,  3.45s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0698


 99%|█████████▉| 2283/2304 [6:16:18<01:10,  3.38s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0698


 99%|█████████▉| 2284/2304 [6:16:22<01:11,  3.58s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0776


 99%|█████████▉| 2285/2304 [6:16:26<01:12,  3.79s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0699


 99%|█████████▉| 2286/2304 [6:16:30<01:10,  3.91s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0629


 99%|█████████▉| 2287/2304 [6:16:33<01:02,  3.70s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0801


 99%|█████████▉| 2288/2304 [6:16:36<00:56,  3.52s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0729


 99%|█████████▉| 2289/2304 [6:16:40<00:51,  3.43s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0749


 99%|█████████▉| 2290/2304 [6:16:44<00:50,  3.62s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0824


 99%|█████████▉| 2291/2304 [6:16:48<00:49,  3.81s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0696


 99%|█████████▉| 2292/2304 [6:16:52<00:47,  3.94s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0721


100%|█████████▉| 2293/2304 [6:16:55<00:40,  3.67s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0763


100%|█████████▉| 2294/2304 [6:16:58<00:35,  3.53s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0738


100%|█████████▉| 2295/2304 [6:17:02<00:30,  3.42s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0726


100%|█████████▉| 2296/2304 [6:17:06<00:29,  3.68s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0759


100%|█████████▉| 2297/2304 [6:17:10<00:26,  3.83s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0739


100%|█████████▉| 2298/2304 [6:17:14<00:23,  3.97s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0771


100%|█████████▉| 2299/2304 [6:17:17<00:18,  3.70s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0768


100%|█████████▉| 2300/2304 [6:17:21<00:14,  3.55s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0695


100%|█████████▉| 2301/2304 [6:17:24<00:10,  3.42s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0743


100%|█████████▉| 2302/2304 [6:17:28<00:07,  3.66s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0753


100%|█████████▉| 2303/2304 [6:17:32<00:03,  3.82s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0715


100%|██████████| 2304/2304 [6:17:36<00:00,  9.83s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0717

✅ Best Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}
✅ Best Loss: 0.0626





##### 2.3 Test

In [None]:
def evaluate_model(model, test_loader, device='cuda'):
    model.eval()
    model.to(device)

    preds, targets = [], []
    with torch.no_grad():
        for xb, yb in test_loader:
            xb, yb = xb.to(device), yb.to(device)
            pred = model(xb)
            preds.append(pred)
            targets.append(yb)

    # (B*T, 8, 8) 텐서 형태로 합치기
    preds_tensor = torch.cat(preds, dim=0)
    targets_tensor = torch.cat(targets, dim=0)

    return preds_tensor, targets_tensor

with open(f'{model_save_path}/best_model_window10per30_WCT_config.json', 'r') as f:
    best_config = json.load(f)

best_model = CorrPredictorCNNTransformer(
    kernel_size=best_config['kernel_size'],
    d_model=best_config['d_model'],
    nhead=best_config['nhead'],
    num_layers=best_config['num_layers'],
    dim_feedforward=best_config['dim_feedforward'],
    activation=best_config['activation']
)
best_model.load_state_dict(torch.load(f"{model_save_path}/best_model_window10per30_WCT_weights.pth"))

test_loader = DataLoader(test_ds, batch_size=best_config['batch_size'], shuffle=False)
preds_tensor, targets_tensor = evaluate_model(best_model, test_loader, device=device)

# 저장
torch.save({
    'preds': preds_tensor,
    'targets': targets_tensor
}, f"{model_save_path}/best_model_window10per30_WCT_result.pt")

In [None]:
# Performance metrics

preds_flat = preds_tensor.view(preds_tensor.size(0), -1).cpu().numpy()
targets_flat = targets_tensor.view(targets_tensor.size(0), -1).cpu().numpy()

mse = mean_squared_error(targets_flat, preds_flat)
mae = mean_absolute_error(targets_flat, preds_flat)
rmse = np.sqrt(mse)

# frobenius_loss
cos_sim = cosine_similarity(targets_flat, preds_flat)
mean_cos_sim = np.diag(cos_sim).mean()

# frobenius_loss
diff = preds_tensor - targets_tensor
frobenius_per_sample = torch.norm(diff, p='fro', dim=(1, 2))
mean_frobenius = frobenius_per_sample.mean().item()

print(f"\n📊 Evaluation Results:")
print(f"MSE               : {mse:.5f}")
print(f"MAE               : {mae:.5f}")
print(f"RMSE              : {rmse:.5f}")
print(f"Cosine Similarity : {mean_cos_sim:.5f}")
print(f"Frobenius Norm    : {mean_frobenius:.5f}")


📊 Evaluation Results:
MSE               : 0.06203
MAE               : 0.17628
RMSE              : 0.24907
Cosine Similarity : 0.94605
Frobenius Norm    : 1.79540
