### 0. Setting

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, root_mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import ParameterGrid

from torch.utils.data import DataLoader, TensorDataset
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tqdm import tqdm

import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn
import torch

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import random
import pywt
import copy
import json

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [4]:
def set_seed(seed=42):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

In [5]:
df = pd.read_csv('/content/drive/MyDrive/WaveletFrequencyDecomposed_CNN_Transformer/data/train_data.csv')
df.set_index('timestamp', inplace=True)

### 1. Data Preprocessing

##### 1.1 Correlation Transform

In [6]:
def corr_transform(df, input_window_width=30, label_window_width=10):
    X, Y = [], []
    data = df.values

    for t in range(input_window_width, len(df)-label_window_width+1):
        window_data = data[t-input_window_width : t]
        window_corr = np.corrcoef(window_data.T)
        X.append(torch.tensor(window_corr, dtype=torch.float32))

        label_window = data[t : t+label_window_width]
        corr_next = np.corrcoef(label_window.T)
        Y.append(torch.tensor(corr_next, dtype=torch.float32))

    return torch.stack(X), torch.stack(Y)

In [7]:
X_tensor, Y_tensor = corr_transform(df)

In [8]:
X_tensor.shape, Y_tensor.shape

(torch.Size([2520, 8, 8]), torch.Size([2520, 8, 8]))

In [9]:
total_size = len(X_tensor)
train_size = int(total_size * 0.8)
val_size   = int(total_size * 0.1)
test_size  = total_size - train_size - val_size

X_train = X_tensor[:train_size]
Y_train = Y_tensor[:train_size]

X_val = X_tensor[train_size:train_size + val_size]
Y_val = Y_tensor[train_size:train_size + val_size]

X_test = X_tensor[train_size + val_size:]
Y_test = Y_tensor[train_size + val_size:]

train_ds = TensorDataset(X_train, Y_train)
val_ds   = TensorDataset(X_val, Y_val)
test_ds  = TensorDataset(X_test, Y_test)

### 2. Modeling

##### 2.1 Model Structure Setting

In [10]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]

        return x

In [11]:
class CorrPredictorCNNTransformer(nn.Module):
    def __init__(
            self,
            num_channels=1,
            conv_channels=32,
            kernel_size=3,
            d_model=128,
            nhead=4,
            num_layers=2,
            dim_feedforward=256,
            activation='relu',
            ):

        super().__init__()

        self.cnn = nn.Sequential(
            nn.Conv2d(num_channels, 32, kernel_size, padding=kernel_size // 2),
            nn.ReLU(),
            nn.BatchNorm2d(32),

            nn.Conv2d(32, d_model, kernel_size, padding=kernel_size // 2),
            nn.ReLU(),
            nn.BatchNorm2d(d_model)
        )

        self.flatten = nn.Flatten(start_dim=2)
        self.pool = nn.AdaptiveAvgPool1d(1)
        self.positional_encoding = PositionalEncoding(d_model)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            activation=activation,
            batch_first=True,
        )

        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        self.fc = nn.Sequential(
            nn.Linear(d_model, 64),
            nn.ReLU(),
            nn.Linear(64, 64)
            )

    def forward(self, x):
        """
        CB: CNN Batch
        TB: Transformer Batch
        T: Sequence Length
        C: Channel
        H: Height
        W: Width
        d_model: Dimension of model
        """

        if x.dim() == 3:
            x = x.unsqueeze(1)

        B, C, H, W = x.shape

        x = self.cnn(x)

        x = self.flatten(x)
        x = self.pool(x).squeeze(-1)
        x = x.unsqueeze(0)

        x = self.positional_encoding(x)
        x = self.transformer(x)
        x = x.squeeze(0)

        output = self.fc(x)
        output = output.view(-1, 8, 8)

        return output

##### 2.2 Training

In [12]:
def train_model(model, train_loader, val_loader, optimizer_name='Adam', lr=5e-4, epochs=70, device='cuda'):
    model.to(device)

    # Optimizer 선택
    if optimizer_name == 'Adam':
        opt = torch.optim.Adam(model.parameters(), lr=lr)
    elif optimizer_name == 'RMSprop':
        opt = torch.optim.RMSprop(model.parameters(), lr=lr)
    elif optimizer_name == 'AdamW':
        opt = torch.optim.AdamW(model.parameters(), lr=lr)
    else:
        raise ValueError(f"Unsupported optimizer: {optimizer_name}")

    # Loss & LR Scheduler
    criterion = nn.MSELoss()
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        opt, mode='min', factor=0.5, patience=5
    )

    best_val_loss = float('inf')

    for epoch in range(epochs):
        model.train()
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            opt.zero_grad()

            pred = model(xb)
            loss = criterion(pred, yb)
            loss.backward()
            opt.step()

        # Validation
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(device), yb.to(device)
                val_loss += criterion(model(xb), yb).item()
        val_loss /= len(val_loader)

        # 스케줄러 적용
        scheduler.step(val_loss)

    return val_loss

In [13]:
model_save_path = '/content/drive/MyDrive/WaveletFrequencyDecomposed_CNN_Transformer/best_model'

In [None]:
# Grid Search
param_grid = {
    'kernel_size': [3, 5],
    'd_model': [32, 64, 128],
    'nhead': [2, 4],
    'num_layers': [2, 4],
    'dim_feedforward': [256, 512],
    'activation': ['relu', 'gelu'],
    'lr': [0.001, 5e-4],
    'optimizer': ['Adam', 'RMSprop', 'AdamW'],
    'batch_size': [64, 128, 256, 512]
}

best_loss = float('inf')
best_config = None
best_model = None

for config in tqdm(ParameterGrid(param_grid)):
    train_loader = DataLoader(train_ds,  batch_size=config['batch_size'], shuffle=False)
    val_loader   = DataLoader(val_ds,  batch_size=config['batch_size'], shuffle=False)

    model = CorrPredictorCNNTransformer(
        kernel_size=config['kernel_size'],
        d_model=config['d_model'],
        nhead=config['nhead'],
        num_layers=config['num_layers'],
        dim_feedforward=config['dim_feedforward'],
        activation=config['activation'],
    )
    loss = train_model(model, train_loader, val_loader,
                       optimizer_name=config['optimizer'],
                       lr=config['lr'], device=device)

    print(f"Config: {config}, Loss: {loss:.4f}")
    if loss < best_loss:
        best_loss = loss
        best_config = config

        torch.save(model.state_dict(), f"{model_save_path}/best_model_window10per30_CT2_weights.pth")
        with open(f'{model_save_path}/best_model_window10per30_CT2_config.json', 'w') as f:
            json.dump(best_config, f, indent=4)

# 최종 결과
print(f"\n✅ Best Config: {best_config}")
print(f"✅ Best Loss: {best_loss:.4f}")

  0%|          | 0/2304 [00:00<?, ?it/s]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0870


  0%|          | 2/2304 [00:38<11:49:24, 18.49s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0975


  0%|          | 3/2304 [00:53<10:56:00, 17.11s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0964


  0%|          | 4/2304 [01:17<12:40:32, 19.84s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0787


  0%|          | 5/2304 [01:41<13:32:49, 21.21s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0861


  0%|          | 6/2304 [02:05<14:09:33, 22.18s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0765


  0%|          | 7/2304 [02:20<12:42:16, 19.91s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0843


  0%|          | 8/2304 [02:35<11:41:16, 18.33s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1051


  0%|          | 9/2304 [02:51<11:04:33, 17.37s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1273


  0%|          | 10/2304 [03:15<12:23:01, 19.43s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0816


  0%|          | 11/2304 [03:38<13:08:44, 20.64s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0796


  1%|          | 12/2304 [04:02<13:47:01, 21.65s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1132


  1%|          | 13/2304 [04:17<12:33:11, 19.73s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0908


  1%|          | 14/2304 [04:32<11:38:29, 18.30s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0956


  1%|          | 15/2304 [04:48<11:04:52, 17.43s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0857


  1%|          | 16/2304 [05:12<12:19:42, 19.40s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1327


  1%|          | 17/2304 [05:35<13:05:35, 20.61s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0872


  1%|          | 18/2304 [05:59<13:44:33, 21.64s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0939


  1%|          | 19/2304 [06:15<12:32:32, 19.76s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0839


  1%|          | 20/2304 [06:29<11:36:42, 18.30s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1154


  1%|          | 21/2304 [06:45<11:02:19, 17.41s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0723


  1%|          | 22/2304 [07:09<12:16:38, 19.37s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0762


  1%|          | 23/2304 [07:32<13:02:50, 20.59s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1396


  1%|          | 24/2304 [07:56<13:40:42, 21.60s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0779


  1%|          | 25/2304 [08:11<12:29:47, 19.74s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0968


  1%|          | 26/2304 [08:26<11:35:34, 18.32s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1137


  1%|          | 27/2304 [08:42<11:00:35, 17.41s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0842


  1%|          | 28/2304 [09:06<12:14:49, 19.37s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0870


  1%|▏         | 29/2304 [09:29<13:00:55, 20.60s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0833


  1%|▏         | 30/2304 [09:53<13:40:25, 21.65s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0806


  1%|▏         | 31/2304 [10:09<12:27:53, 19.74s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0943


  1%|▏         | 32/2304 [10:23<11:32:38, 18.29s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0889


  1%|▏         | 33/2304 [10:39<10:59:56, 17.44s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0959


  1%|▏         | 34/2304 [11:03<12:13:41, 19.39s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0887


  2%|▏         | 35/2304 [11:26<12:59:12, 20.61s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0861


  2%|▏         | 36/2304 [11:50<13:36:33, 21.60s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0922


  2%|▏         | 37/2304 [12:06<12:24:50, 19.71s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1384


  2%|▏         | 38/2304 [12:21<11:31:42, 18.32s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0904


  2%|▏         | 39/2304 [12:36<10:57:05, 17.41s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0878


  2%|▏         | 40/2304 [13:00<12:10:56, 19.37s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0872


  2%|▏         | 41/2304 [13:23<12:57:58, 20.63s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0866


  2%|▏         | 42/2304 [13:47<13:36:07, 21.65s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0805


  2%|▏         | 43/2304 [14:03<12:25:47, 19.79s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1194


  2%|▏         | 44/2304 [14:18<11:31:10, 18.35s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1495


  2%|▏         | 45/2304 [14:33<10:56:02, 17.42s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1023


  2%|▏         | 46/2304 [14:57<12:09:52, 19.39s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0964


  2%|▏         | 47/2304 [15:21<12:54:54, 20.60s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1018


  2%|▏         | 48/2304 [15:45<13:32:42, 21.61s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0902


  2%|▏         | 49/2304 [16:00<12:22:44, 19.76s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0921


  2%|▏         | 50/2304 [16:15<11:28:58, 18.34s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0991


  2%|▏         | 51/2304 [16:30<10:55:14, 17.45s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1403


  2%|▏         | 52/2304 [16:54<12:09:14, 19.43s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0962


  2%|▏         | 53/2304 [17:18<12:54:38, 20.65s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0863


  2%|▏         | 54/2304 [17:42<13:32:25, 21.66s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0812


  2%|▏         | 55/2304 [17:57<12:21:34, 19.78s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0865


  2%|▏         | 56/2304 [18:12<11:27:38, 18.35s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1267


  2%|▏         | 57/2304 [18:28<10:54:39, 17.48s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0928


  3%|▎         | 58/2304 [18:52<12:07:20, 19.43s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1228


  3%|▎         | 59/2304 [19:15<12:53:53, 20.68s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0760


  3%|▎         | 60/2304 [19:40<13:33:02, 21.74s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0736


  3%|▎         | 61/2304 [19:55<12:22:26, 19.86s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0750


  3%|▎         | 62/2304 [20:10<11:29:25, 18.45s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0876


  3%|▎         | 63/2304 [20:26<10:53:56, 17.51s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0905


  3%|▎         | 64/2304 [20:50<12:06:08, 19.45s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1028


  3%|▎         | 65/2304 [21:13<12:52:46, 20.71s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0819


  3%|▎         | 66/2304 [21:37<13:29:07, 21.69s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1258


  3%|▎         | 67/2304 [21:52<12:17:38, 19.78s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1310


  3%|▎         | 68/2304 [22:07<11:23:34, 18.34s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0819


  3%|▎         | 69/2304 [22:23<10:50:09, 17.45s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0853


  3%|▎         | 70/2304 [22:47<12:04:11, 19.45s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0809


  3%|▎         | 71/2304 [23:10<12:48:25, 20.65s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0737


  3%|▎         | 72/2304 [23:34<13:26:20, 21.68s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0873


  3%|▎         | 73/2304 [23:50<12:15:05, 19.77s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0955


  3%|▎         | 74/2304 [24:05<11:20:45, 18.32s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0771


  3%|▎         | 75/2304 [24:20<10:48:05, 17.45s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0841


  3%|▎         | 76/2304 [24:44<11:59:01, 19.36s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0914


  3%|▎         | 77/2304 [25:07<12:43:06, 20.56s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0911


  3%|▎         | 78/2304 [25:31<13:22:53, 21.64s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0699


  3%|▎         | 79/2304 [25:47<12:12:07, 19.74s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0814


  3%|▎         | 80/2304 [26:02<11:20:17, 18.35s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0849


  4%|▎         | 81/2304 [26:17<10:48:18, 17.50s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0864


  4%|▎         | 82/2304 [26:41<12:00:17, 19.45s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0769


  4%|▎         | 83/2304 [27:05<12:45:42, 20.69s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0758


  4%|▎         | 84/2304 [27:29<13:23:28, 21.72s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0818


  4%|▎         | 85/2304 [27:44<12:11:23, 19.78s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0923


  4%|▎         | 86/2304 [27:59<11:17:31, 18.33s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0916


  4%|▍         | 87/2304 [28:15<10:44:45, 17.45s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0946


  4%|▍         | 88/2304 [28:39<11:56:11, 19.39s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0749


  4%|▍         | 89/2304 [29:02<12:39:27, 20.57s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1014


  4%|▍         | 90/2304 [29:26<13:16:45, 21.59s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0951


  4%|▍         | 91/2304 [29:41<12:07:35, 19.73s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0880


  4%|▍         | 92/2304 [29:56<11:15:46, 18.33s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0976


  4%|▍         | 93/2304 [30:12<10:41:49, 17.42s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0959


  4%|▍         | 94/2304 [30:36<11:53:54, 19.38s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0867


  4%|▍         | 95/2304 [30:59<12:37:55, 20.59s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0886


  4%|▍         | 96/2304 [31:23<13:16:54, 21.66s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0711


  4%|▍         | 97/2304 [31:39<12:07:15, 19.77s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1007


  4%|▍         | 98/2304 [31:54<11:14:36, 18.35s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1198


  4%|▍         | 99/2304 [32:09<10:41:53, 17.47s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0819


  4%|▍         | 100/2304 [32:33<11:53:46, 19.43s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0809


  4%|▍         | 101/2304 [32:56<12:38:08, 20.65s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0828


  4%|▍         | 102/2304 [33:21<13:16:34, 21.71s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0685


  4%|▍         | 103/2304 [33:36<12:06:26, 19.80s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0860


  5%|▍         | 104/2304 [33:51<11:11:55, 18.33s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1195


  5%|▍         | 105/2304 [34:06<10:40:12, 17.47s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0742


  5%|▍         | 106/2304 [34:30<11:51:49, 19.43s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0687


  5%|▍         | 107/2304 [34:54<12:36:15, 20.65s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1165


  5%|▍         | 108/2304 [35:18<13:15:17, 21.73s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0849


  5%|▍         | 109/2304 [35:34<12:05:19, 19.83s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0893


  5%|▍         | 110/2304 [35:48<11:11:42, 18.37s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1570


  5%|▍         | 111/2304 [36:04<10:37:01, 17.43s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0954


  5%|▍         | 112/2304 [36:28<11:49:16, 19.41s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0895


  5%|▍         | 113/2304 [36:51<12:33:36, 20.64s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0976


  5%|▍         | 114/2304 [37:15<13:11:00, 21.67s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0968


  5%|▍         | 115/2304 [37:31<12:02:07, 19.79s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0845


  5%|▌         | 116/2304 [37:46<11:08:26, 18.33s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0817


  5%|▌         | 117/2304 [38:01<10:34:50, 17.42s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0753


  5%|▌         | 118/2304 [38:25<11:47:19, 19.41s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0750


  5%|▌         | 119/2304 [38:49<12:33:38, 20.70s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0852


  5%|▌         | 120/2304 [39:13<13:10:14, 21.71s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0879


  5%|▌         | 121/2304 [39:28<12:00:34, 19.81s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0946


  5%|▌         | 122/2304 [39:43<11:08:52, 18.39s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1281


  5%|▌         | 123/2304 [39:59<10:36:07, 17.50s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0911


  5%|▌         | 124/2304 [40:23<11:48:18, 19.49s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0869


  5%|▌         | 125/2304 [40:46<12:32:49, 20.73s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0828


  5%|▌         | 126/2304 [41:11<13:10:57, 21.79s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0718


  6%|▌         | 127/2304 [41:26<12:01:29, 19.88s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0895


  6%|▌         | 128/2304 [41:41<11:08:06, 18.42s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1070


  6%|▌         | 129/2304 [41:56<10:33:58, 17.49s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0913


  6%|▌         | 130/2304 [42:21<11:45:15, 19.46s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0754


  6%|▌         | 131/2304 [42:44<12:30:00, 20.71s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0877


  6%|▌         | 132/2304 [43:08<13:07:43, 21.76s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0817


  6%|▌         | 133/2304 [43:24<11:58:33, 19.86s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0821


  6%|▌         | 134/2304 [43:39<11:06:16, 18.42s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1094


  6%|▌         | 135/2304 [43:54<10:32:42, 17.50s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0820


  6%|▌         | 136/2304 [44:18<11:43:36, 19.47s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0904


  6%|▌         | 137/2304 [44:42<12:28:06, 20.71s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0853


  6%|▌         | 138/2304 [45:06<13:03:28, 21.70s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0942


  6%|▌         | 139/2304 [45:21<11:54:11, 19.79s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0787


  6%|▌         | 140/2304 [45:36<11:02:21, 18.36s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0879


  6%|▌         | 141/2304 [45:52<10:30:19, 17.48s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1074


  6%|▌         | 142/2304 [46:16<11:41:38, 19.47s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0852


  6%|▌         | 143/2304 [46:39<12:26:23, 20.72s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0834


  6%|▋         | 144/2304 [47:04<13:02:48, 21.74s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0959


  6%|▋         | 145/2304 [47:19<11:52:02, 19.79s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1207


  6%|▋         | 146/2304 [47:34<10:59:02, 18.32s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0930


  6%|▋         | 147/2304 [47:49<10:26:14, 17.42s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0841


  6%|▋         | 148/2304 [48:13<11:34:51, 19.34s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0782


  6%|▋         | 149/2304 [48:36<12:17:23, 20.53s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0726


  7%|▋         | 150/2304 [49:00<12:53:37, 21.55s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0726


  7%|▋         | 151/2304 [49:15<11:46:45, 19.70s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0912


  7%|▋         | 152/2304 [49:30<10:55:29, 18.28s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0989


  7%|▋         | 153/2304 [49:46<10:24:27, 17.42s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0895


  7%|▋         | 154/2304 [50:10<11:34:28, 19.38s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0893


  7%|▋         | 155/2304 [50:33<12:18:12, 20.61s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0708


  7%|▋         | 156/2304 [50:57<12:53:55, 21.62s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0866


  7%|▋         | 157/2304 [51:12<11:44:43, 19.69s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0790


  7%|▋         | 158/2304 [51:27<10:53:28, 18.27s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1307


  7%|▋         | 159/2304 [51:43<10:21:49, 17.39s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0677


  7%|▋         | 160/2304 [52:07<11:31:48, 19.36s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0815


  7%|▋         | 161/2304 [52:30<12:15:15, 20.59s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1137


  7%|▋         | 162/2304 [52:54<12:50:45, 21.59s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0936


  7%|▋         | 163/2304 [53:09<11:44:19, 19.74s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0825


  7%|▋         | 164/2304 [53:24<10:51:52, 18.28s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0865


  7%|▋         | 165/2304 [53:40<10:20:54, 17.42s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1019


  7%|▋         | 166/2304 [54:04<11:31:44, 19.41s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0893


  7%|▋         | 167/2304 [54:27<12:14:26, 20.62s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0839


  7%|▋         | 168/2304 [54:51<12:50:13, 21.64s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0758


  7%|▋         | 169/2304 [55:07<11:42:21, 19.74s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0843


  7%|▋         | 170/2304 [55:22<10:52:50, 18.36s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0915


  7%|▋         | 171/2304 [55:37<10:21:09, 17.47s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0753


  7%|▋         | 172/2304 [56:01<11:30:00, 19.42s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0763


  8%|▊         | 173/2304 [56:24<12:12:20, 20.62s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0979


  8%|▊         | 174/2304 [56:49<12:48:12, 21.64s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0942


  8%|▊         | 175/2304 [57:04<11:40:15, 19.73s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0752


  8%|▊         | 176/2304 [57:19<10:48:15, 18.28s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0856


  8%|▊         | 177/2304 [57:34<10:17:24, 17.42s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0924


  8%|▊         | 178/2304 [57:58<11:26:43, 19.38s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0938


  8%|▊         | 179/2304 [58:21<12:08:26, 20.57s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0984


  8%|▊         | 180/2304 [58:45<12:43:52, 21.58s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0997


  8%|▊         | 181/2304 [59:01<11:37:06, 19.70s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0703


  8%|▊         | 182/2304 [59:16<10:45:48, 18.26s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1025


  8%|▊         | 183/2304 [59:31<10:14:49, 17.39s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1372


  8%|▊         | 184/2304 [59:55<11:23:42, 19.35s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0906


  8%|▊         | 185/2304 [1:00:18<12:05:45, 20.55s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0828


  8%|▊         | 186/2304 [1:00:42<12:41:04, 21.56s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0908


  8%|▊         | 187/2304 [1:00:57<11:33:56, 19.67s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0897


  8%|▊         | 188/2304 [1:01:12<10:43:51, 18.26s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0922


  8%|▊         | 189/2304 [1:01:28<10:12:15, 17.37s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0842


  8%|▊         | 190/2304 [1:01:52<11:21:42, 19.35s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0861


  8%|▊         | 191/2304 [1:02:15<12:04:16, 20.57s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0942


  8%|▊         | 192/2304 [1:02:39<12:40:09, 21.60s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0829


  8%|▊         | 193/2304 [1:02:54<11:34:07, 19.73s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0903


  8%|▊         | 194/2304 [1:03:09<10:44:53, 18.34s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0809


  8%|▊         | 195/2304 [1:03:25<10:12:44, 17.43s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0912


  9%|▊         | 196/2304 [1:03:49<11:19:40, 19.35s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0884


  9%|▊         | 197/2304 [1:04:12<12:03:16, 20.60s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0753


  9%|▊         | 198/2304 [1:04:36<12:39:31, 21.64s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0876


  9%|▊         | 199/2304 [1:04:52<11:34:23, 19.79s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0856


  9%|▊         | 200/2304 [1:05:07<10:43:31, 18.35s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0895


  9%|▊         | 201/2304 [1:05:22<10:12:42, 17.48s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0702


  9%|▉         | 202/2304 [1:05:46<11:20:54, 19.44s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0702


  9%|▉         | 203/2304 [1:06:10<12:02:51, 20.64s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0742


  9%|▉         | 204/2304 [1:06:34<12:38:01, 21.66s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0881


  9%|▉         | 205/2304 [1:06:49<11:31:24, 19.76s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0819


  9%|▉         | 206/2304 [1:07:04<10:39:33, 18.29s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0928


  9%|▉         | 207/2304 [1:07:19<10:07:45, 17.39s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1249


  9%|▉         | 208/2304 [1:07:43<11:16:54, 19.38s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0840


  9%|▉         | 209/2304 [1:08:07<12:00:07, 20.62s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0762


  9%|▉         | 210/2304 [1:08:31<12:35:25, 21.65s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0681


  9%|▉         | 211/2304 [1:08:46<11:28:24, 19.73s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0756


  9%|▉         | 212/2304 [1:09:01<10:38:13, 18.30s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0892


  9%|▉         | 213/2304 [1:09:16<10:07:27, 17.43s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0967


  9%|▉         | 214/2304 [1:09:40<11:16:09, 19.41s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0841


  9%|▉         | 215/2304 [1:10:04<11:58:15, 20.63s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0996


  9%|▉         | 216/2304 [1:10:28<12:33:06, 21.64s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0825


  9%|▉         | 217/2304 [1:10:43<11:27:15, 19.76s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0796


  9%|▉         | 218/2304 [1:10:58<10:37:35, 18.34s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0887


 10%|▉         | 219/2304 [1:11:14<10:07:02, 17.47s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0798


 10%|▉         | 220/2304 [1:11:38<11:14:51, 19.43s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0681


 10%|▉         | 221/2304 [1:12:01<11:56:10, 20.63s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0779


 10%|▉         | 222/2304 [1:12:25<12:32:09, 21.68s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0674


 10%|▉         | 223/2304 [1:12:41<11:26:29, 19.79s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0958


 10%|▉         | 224/2304 [1:12:55<10:35:36, 18.34s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0868


 10%|▉         | 225/2304 [1:13:11<10:05:05, 17.46s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0933


 10%|▉         | 226/2304 [1:13:35<11:12:16, 19.41s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0787


 10%|▉         | 227/2304 [1:13:58<11:55:25, 20.67s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0944


 10%|▉         | 228/2304 [1:14:23<12:30:58, 21.70s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0790


 10%|▉         | 229/2304 [1:14:38<11:25:25, 19.82s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0830


 10%|▉         | 230/2304 [1:14:53<10:35:13, 18.38s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0990


 10%|█         | 231/2304 [1:15:08<10:04:29, 17.50s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1101


 10%|█         | 232/2304 [1:15:33<11:12:12, 19.47s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0826


 10%|█         | 233/2304 [1:15:56<11:54:41, 20.71s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0798


 10%|█         | 234/2304 [1:16:20<12:29:23, 21.72s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0977


 10%|█         | 235/2304 [1:16:36<11:24:02, 19.84s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0747


 10%|█         | 236/2304 [1:16:51<10:34:36, 18.41s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0914


 10%|█         | 237/2304 [1:17:06<10:03:38, 17.52s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0912


 10%|█         | 238/2304 [1:17:30<11:11:38, 19.51s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0729


 10%|█         | 239/2304 [1:17:54<11:53:45, 20.74s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0903


 10%|█         | 240/2304 [1:18:18<12:30:19, 21.81s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0798


 10%|█         | 241/2304 [1:18:34<11:23:15, 19.87s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0800


 11%|█         | 242/2304 [1:18:49<10:31:53, 18.39s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0748


 11%|█         | 243/2304 [1:19:04<9:59:31, 17.45s/it] 

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0764


 11%|█         | 244/2304 [1:19:28<11:05:57, 19.40s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0667


 11%|█         | 245/2304 [1:19:51<11:48:19, 20.64s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0831


 11%|█         | 246/2304 [1:20:15<12:23:48, 21.69s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0856


 11%|█         | 247/2304 [1:20:31<11:18:32, 19.79s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1075


 11%|█         | 248/2304 [1:20:46<10:29:00, 18.36s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1053


 11%|█         | 249/2304 [1:21:01<9:58:21, 17.47s/it] 

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0794


 11%|█         | 250/2304 [1:21:25<11:04:42, 19.42s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0646


 11%|█         | 251/2304 [1:21:48<11:43:54, 20.57s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0764


 11%|█         | 252/2304 [1:22:12<12:17:37, 21.57s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0678


 11%|█         | 253/2304 [1:22:28<11:14:00, 19.72s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0880


 11%|█         | 254/2304 [1:22:43<10:24:58, 18.29s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0950


 11%|█         | 255/2304 [1:22:58<9:53:34, 17.38s/it] 

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0922


 11%|█         | 256/2304 [1:23:22<10:59:54, 19.33s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0933


 11%|█         | 257/2304 [1:23:45<11:40:33, 20.53s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0850


 11%|█         | 258/2304 [1:24:09<12:14:17, 21.53s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0855


 11%|█         | 259/2304 [1:24:24<11:09:16, 19.64s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1087


 11%|█▏        | 260/2304 [1:24:39<10:22:40, 18.28s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0906


 11%|█▏        | 261/2304 [1:24:55<9:52:02, 17.39s/it] 

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0818


 11%|█▏        | 262/2304 [1:25:19<10:58:31, 19.35s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0886


 11%|█▏        | 263/2304 [1:25:42<11:38:21, 20.53s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0789


 11%|█▏        | 264/2304 [1:26:06<12:13:30, 21.57s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0683


 12%|█▏        | 265/2304 [1:26:21<11:10:13, 19.72s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0789


 12%|█▏        | 266/2304 [1:26:36<10:22:09, 18.32s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0826


 12%|█▏        | 267/2304 [1:26:52<9:51:18, 17.42s/it] 

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0958


 12%|█▏        | 268/2304 [1:27:16<10:57:55, 19.39s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0779


 12%|█▏        | 269/2304 [1:27:39<11:39:00, 20.61s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0807


 12%|█▏        | 270/2304 [1:28:03<12:13:37, 21.64s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0676


 12%|█▏        | 271/2304 [1:28:18<11:08:58, 19.74s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0776


 12%|█▏        | 272/2304 [1:28:33<10:20:34, 18.32s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0961


 12%|█▏        | 273/2304 [1:28:49<9:51:05, 17.46s/it] 

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0872


 12%|█▏        | 274/2304 [1:29:13<10:56:47, 19.41s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0822


 12%|█▏        | 275/2304 [1:29:36<11:37:54, 20.64s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0804


 12%|█▏        | 276/2304 [1:30:00<12:11:26, 21.64s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0769


 12%|█▏        | 277/2304 [1:30:16<11:07:17, 19.75s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0816


 12%|█▏        | 278/2304 [1:30:31<10:17:59, 18.30s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0913


 12%|█▏        | 279/2304 [1:30:46<9:48:03, 17.42s/it] 

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0900


 12%|█▏        | 280/2304 [1:31:10<10:53:35, 19.38s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0826


 12%|█▏        | 281/2304 [1:31:33<11:34:52, 20.61s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0947


 12%|█▏        | 282/2304 [1:31:57<12:08:22, 21.61s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0878


 12%|█▏        | 283/2304 [1:32:13<11:05:03, 19.74s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1031


 12%|█▏        | 284/2304 [1:32:28<10:16:54, 18.32s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0946


 12%|█▏        | 285/2304 [1:32:43<9:47:24, 17.46s/it] 

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0918


 12%|█▏        | 286/2304 [1:33:07<10:54:12, 19.45s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0803


 12%|█▏        | 287/2304 [1:33:31<11:35:37, 20.69s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0783


 12%|█▎        | 288/2304 [1:33:55<12:08:33, 21.68s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0907


 13%|█▎        | 289/2304 [1:34:03<9:52:45, 17.65s/it] 

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0903


 13%|█▎        | 290/2304 [1:34:11<8:15:07, 14.75s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0788


 13%|█▎        | 291/2304 [1:34:19<7:09:01, 12.79s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1062


 13%|█▎        | 292/2304 [1:34:32<7:06:09, 12.71s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0863


 13%|█▎        | 293/2304 [1:34:44<7:01:45, 12.58s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0803


 13%|█▎        | 294/2304 [1:34:57<7:02:00, 12.60s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0760


 13%|█▎        | 295/2304 [1:35:05<6:17:38, 11.28s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0800


 13%|█▎        | 296/2304 [1:35:13<5:44:42, 10.30s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0922


 13%|█▎        | 297/2304 [1:35:21<5:24:11,  9.69s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0755


 13%|█▎        | 298/2304 [1:35:34<5:53:20, 10.57s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0719


 13%|█▎        | 299/2304 [1:35:46<6:10:31, 11.09s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1187


 13%|█▎        | 300/2304 [1:35:59<6:25:33, 11.54s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0937


 13%|█▎        | 301/2304 [1:36:07<5:52:07, 10.55s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0799


 13%|█▎        | 302/2304 [1:36:15<5:26:47,  9.79s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0727


 13%|█▎        | 303/2304 [1:36:23<5:10:16,  9.30s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0705


 13%|█▎        | 304/2304 [1:36:36<5:43:35, 10.31s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0957


 13%|█▎        | 305/2304 [1:36:48<6:03:35, 10.91s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0801


 13%|█▎        | 306/2304 [1:37:01<6:20:16, 11.42s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1593


 13%|█▎        | 307/2304 [1:37:09<5:48:09, 10.46s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0792


 13%|█▎        | 308/2304 [1:37:17<5:23:42,  9.73s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0693


 13%|█▎        | 309/2304 [1:37:25<5:08:06,  9.27s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0740


 13%|█▎        | 310/2304 [1:37:38<5:41:10, 10.27s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0767


 13%|█▎        | 311/2304 [1:37:50<6:01:06, 10.87s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0688


 14%|█▎        | 312/2304 [1:38:03<6:18:03, 11.39s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1681


 14%|█▎        | 313/2304 [1:38:11<5:46:03, 10.43s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0970


 14%|█▎        | 314/2304 [1:38:19<5:22:21,  9.72s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0833


 14%|█▎        | 315/2304 [1:38:27<5:07:46,  9.28s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0893


 14%|█▎        | 316/2304 [1:38:40<5:40:49, 10.29s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1061


 14%|█▍        | 317/2304 [1:38:52<6:00:35, 10.89s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0844


 14%|█▍        | 318/2304 [1:39:05<6:19:38, 11.47s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0833


 14%|█▍        | 319/2304 [1:39:13<5:47:31, 10.50s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1078


 14%|█▍        | 320/2304 [1:39:21<5:22:53,  9.77s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0774


 14%|█▍        | 321/2304 [1:39:29<5:07:05,  9.29s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0989


 14%|█▍        | 322/2304 [1:39:42<5:40:12, 10.30s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0785


 14%|█▍        | 323/2304 [1:39:54<6:01:22, 10.95s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0724


 14%|█▍        | 324/2304 [1:40:07<6:18:24, 11.47s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1406


 14%|█▍        | 325/2304 [1:40:15<5:46:02, 10.49s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0788


 14%|█▍        | 326/2304 [1:40:23<5:21:09,  9.74s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0785


 14%|█▍        | 327/2304 [1:40:32<5:06:01,  9.29s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0785


 14%|█▍        | 328/2304 [1:40:44<5:38:44, 10.29s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0828


 14%|█▍        | 329/2304 [1:40:57<5:58:29, 10.89s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1031


 14%|█▍        | 330/2304 [1:41:09<6:15:44, 11.42s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0952


 14%|█▍        | 331/2304 [1:41:17<5:44:11, 10.47s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0825


 14%|█▍        | 332/2304 [1:41:25<5:19:28,  9.72s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0731


 14%|█▍        | 333/2304 [1:41:34<5:04:41,  9.28s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0956


 14%|█▍        | 334/2304 [1:41:46<5:37:15, 10.27s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0752


 15%|█▍        | 335/2304 [1:41:59<5:56:45, 10.87s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0751


 15%|█▍        | 336/2304 [1:42:11<6:13:04, 11.37s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0735


 15%|█▍        | 337/2304 [1:42:19<5:41:40, 10.42s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0837


 15%|█▍        | 338/2304 [1:42:27<5:17:41,  9.70s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0799


 15%|█▍        | 339/2304 [1:42:35<5:02:48,  9.25s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0791


 15%|█▍        | 340/2304 [1:42:48<5:34:51, 10.23s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0919


 15%|█▍        | 341/2304 [1:43:00<5:54:11, 10.83s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1083


 15%|█▍        | 342/2304 [1:43:13<6:10:52, 11.34s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0768


 15%|█▍        | 343/2304 [1:43:21<5:40:05, 10.41s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0680


 15%|█▍        | 344/2304 [1:43:29<5:16:38,  9.69s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0696


 15%|█▍        | 345/2304 [1:43:37<5:01:29,  9.23s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0962


 15%|█▌        | 346/2304 [1:43:50<5:33:53, 10.23s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0864


 15%|█▌        | 347/2304 [1:44:02<5:54:06, 10.86s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0933


 15%|█▌        | 348/2304 [1:44:15<6:11:56, 11.41s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0783


 15%|█▌        | 349/2304 [1:44:23<5:40:00, 10.43s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1246


 15%|█▌        | 350/2304 [1:44:31<5:16:21,  9.71s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0755


 15%|█▌        | 351/2304 [1:44:39<5:01:10,  9.25s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0782


 15%|█▌        | 352/2304 [1:44:52<5:33:49, 10.26s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0809


 15%|█▌        | 353/2304 [1:45:04<5:53:04, 10.86s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0733


 15%|█▌        | 354/2304 [1:45:17<6:09:48, 11.38s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0759


 15%|█▌        | 355/2304 [1:45:25<5:38:18, 10.41s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1224


 15%|█▌        | 356/2304 [1:45:33<5:14:57,  9.70s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0740


 15%|█▌        | 357/2304 [1:45:41<4:59:51,  9.24s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0776


 16%|█▌        | 358/2304 [1:45:53<5:32:03, 10.24s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0771


 16%|█▌        | 359/2304 [1:46:06<5:52:55, 10.89s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0735


 16%|█▌        | 360/2304 [1:46:19<6:10:19, 11.43s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0746


 16%|█▌        | 361/2304 [1:46:27<5:39:02, 10.47s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0900


 16%|█▌        | 362/2304 [1:46:35<5:15:59,  9.76s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0727


 16%|█▌        | 363/2304 [1:46:43<5:01:13,  9.31s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0887


 16%|█▌        | 364/2304 [1:46:56<5:32:14, 10.28s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0895


 16%|█▌        | 365/2304 [1:47:08<5:51:23, 10.87s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0727


 16%|█▌        | 366/2304 [1:47:21<6:07:23, 11.37s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0768


 16%|█▌        | 367/2304 [1:47:29<5:35:54, 10.41s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0909


 16%|█▌        | 368/2304 [1:47:37<5:12:55,  9.70s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0793


 16%|█▌        | 369/2304 [1:47:45<4:58:11,  9.25s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0840


 16%|█▌        | 370/2304 [1:47:57<5:29:38, 10.23s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0856


 16%|█▌        | 371/2304 [1:48:10<5:48:50, 10.83s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0756


 16%|█▌        | 372/2304 [1:48:22<6:05:30, 11.35s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0652


 16%|█▌        | 373/2304 [1:48:30<5:35:19, 10.42s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0921


 16%|█▌        | 374/2304 [1:48:39<5:12:43,  9.72s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0778


 16%|█▋        | 375/2304 [1:48:47<4:58:00,  9.27s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0870


 16%|█▋        | 376/2304 [1:48:59<5:29:38, 10.26s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0801


 16%|█▋        | 377/2304 [1:49:12<5:50:37, 10.92s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0724


 16%|█▋        | 378/2304 [1:49:25<6:07:51, 11.46s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0882


 16%|█▋        | 379/2304 [1:49:33<5:36:15, 10.48s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0860


 16%|█▋        | 380/2304 [1:49:41<5:12:11,  9.74s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0675


 17%|█▋        | 381/2304 [1:49:49<4:57:34,  9.28s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1152


 17%|█▋        | 382/2304 [1:50:02<5:28:51, 10.27s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0762


 17%|█▋        | 383/2304 [1:50:14<5:48:15, 10.88s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0745


 17%|█▋        | 384/2304 [1:50:26<6:04:12, 11.38s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0797


 17%|█▋        | 385/2304 [1:50:35<5:34:14, 10.45s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0697


 17%|█▋        | 386/2304 [1:50:43<5:11:16,  9.74s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0811


 17%|█▋        | 387/2304 [1:50:51<4:56:16,  9.27s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0927


 17%|█▋        | 388/2304 [1:51:04<5:29:06, 10.31s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1434


 17%|█▋        | 389/2304 [1:51:16<5:48:48, 10.93s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0765


 17%|█▋        | 390/2304 [1:51:29<6:05:23, 11.45s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0830


 17%|█▋        | 391/2304 [1:51:37<5:34:14, 10.48s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0872


 17%|█▋        | 392/2304 [1:51:45<5:10:45,  9.75s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0840


 17%|█▋        | 393/2304 [1:51:53<4:57:48,  9.35s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0755


 17%|█▋        | 394/2304 [1:52:06<5:30:06, 10.37s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0852


 17%|█▋        | 395/2304 [1:52:19<5:49:41, 10.99s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0704


 17%|█▋        | 396/2304 [1:52:31<6:06:08, 11.51s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0672


 17%|█▋        | 397/2304 [1:52:40<5:35:13, 10.55s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0777


 17%|█▋        | 398/2304 [1:52:48<5:11:56,  9.82s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0797


 17%|█▋        | 399/2304 [1:52:56<4:57:15,  9.36s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0928


 17%|█▋        | 400/2304 [1:53:09<5:29:21, 10.38s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0789


 17%|█▋        | 401/2304 [1:53:21<5:49:21, 11.02s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1200


 17%|█▋        | 402/2304 [1:53:34<6:04:50, 11.51s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0851


 17%|█▋        | 403/2304 [1:53:42<5:33:11, 10.52s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0820


 18%|█▊        | 404/2304 [1:53:50<5:09:07,  9.76s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0775


 18%|█▊        | 405/2304 [1:53:58<4:55:12,  9.33s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1238


 18%|█▊        | 406/2304 [1:54:11<5:26:10, 10.31s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0733


 18%|█▊        | 407/2304 [1:54:23<5:45:01, 10.91s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0762


 18%|█▊        | 408/2304 [1:54:36<6:01:13, 11.43s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1347


 18%|█▊        | 409/2304 [1:54:44<5:31:09, 10.49s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0794


 18%|█▊        | 410/2304 [1:54:52<5:08:15,  9.77s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0938


 18%|█▊        | 411/2304 [1:55:01<4:53:47,  9.31s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0912


 18%|█▊        | 412/2304 [1:55:13<5:25:45, 10.33s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0891


 18%|█▊        | 413/2304 [1:55:26<5:44:47, 10.94s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0720


 18%|█▊        | 414/2304 [1:55:38<6:00:03, 11.43s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0876


 18%|█▊        | 415/2304 [1:55:47<5:29:45, 10.47s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0718


 18%|█▊        | 416/2304 [1:55:55<5:07:01,  9.76s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0870


 18%|█▊        | 417/2304 [1:56:03<4:52:22,  9.30s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0979


 18%|█▊        | 418/2304 [1:56:16<5:24:21, 10.32s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0682


 18%|█▊        | 419/2304 [1:56:28<5:43:20, 10.93s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0766


 18%|█▊        | 420/2304 [1:56:41<6:00:11, 11.47s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0785


 18%|█▊        | 421/2304 [1:56:49<5:29:42, 10.51s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1147


 18%|█▊        | 422/2304 [1:56:57<5:06:24,  9.77s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0842


 18%|█▊        | 423/2304 [1:57:05<4:51:52,  9.31s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0937


 18%|█▊        | 424/2304 [1:57:18<5:22:45, 10.30s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0895


 18%|█▊        | 425/2304 [1:57:30<5:42:35, 10.94s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1638


 18%|█▊        | 426/2304 [1:57:43<5:59:06, 11.47s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0794


 19%|█▊        | 427/2304 [1:57:51<5:28:21, 10.50s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1149


 19%|█▊        | 428/2304 [1:57:59<5:05:06,  9.76s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0851


 19%|█▊        | 429/2304 [1:58:07<4:50:46,  9.30s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0776


 19%|█▊        | 430/2304 [1:58:20<5:21:51, 10.31s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1359


 19%|█▊        | 431/2304 [1:58:32<5:40:21, 10.90s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0767


 19%|█▉        | 432/2304 [1:58:45<5:56:03, 11.41s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0768


 19%|█▉        | 433/2304 [1:58:53<5:26:32, 10.47s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0879


 19%|█▉        | 434/2304 [1:59:01<5:03:16,  9.73s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0757


 19%|█▉        | 435/2304 [1:59:09<4:49:20,  9.29s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1004


 19%|█▉        | 436/2304 [1:59:22<5:20:28, 10.29s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0794


 19%|█▉        | 437/2304 [1:59:34<5:39:26, 10.91s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0760


 19%|█▉        | 438/2304 [1:59:47<5:55:47, 11.44s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0844


 19%|█▉        | 439/2304 [1:59:55<5:26:02, 10.49s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0830


 19%|█▉        | 440/2304 [2:00:03<5:03:12,  9.76s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0881


 19%|█▉        | 441/2304 [2:00:12<4:49:58,  9.34s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0768


 19%|█▉        | 442/2304 [2:00:24<5:20:48, 10.34s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0830


 19%|█▉        | 443/2304 [2:00:37<5:39:29, 10.95s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0749


 19%|█▉        | 444/2304 [2:00:49<5:54:14, 11.43s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0785


 19%|█▉        | 445/2304 [2:00:58<5:24:19, 10.47s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0871


 19%|█▉        | 446/2304 [2:01:06<5:01:40,  9.74s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0833


 19%|█▉        | 447/2304 [2:01:14<4:47:02,  9.27s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1084


 19%|█▉        | 448/2304 [2:01:26<5:17:41, 10.27s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0917


 19%|█▉        | 449/2304 [2:01:39<5:36:09, 10.87s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0800


 20%|█▉        | 450/2304 [2:01:51<5:52:29, 11.41s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0786


 20%|█▉        | 451/2304 [2:02:00<5:23:15, 10.47s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0934


 20%|█▉        | 452/2304 [2:02:08<5:00:22,  9.73s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0748


 20%|█▉        | 453/2304 [2:02:16<4:46:14,  9.28s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0860


 20%|█▉        | 454/2304 [2:02:29<5:17:32, 10.30s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0793


 20%|█▉        | 455/2304 [2:02:41<5:37:11, 10.94s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0784


 20%|█▉        | 456/2304 [2:02:54<5:53:41, 11.48s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0761


 20%|█▉        | 457/2304 [2:03:02<5:22:20, 10.47s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0916


 20%|█▉        | 458/2304 [2:03:10<4:59:28,  9.73s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0793


 20%|█▉        | 459/2304 [2:03:18<4:45:25,  9.28s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0831


 20%|█▉        | 460/2304 [2:03:31<5:15:29, 10.27s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0797


 20%|██        | 461/2304 [2:03:43<5:33:42, 10.86s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0717


 20%|██        | 462/2304 [2:03:56<5:49:38, 11.39s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0767


 20%|██        | 463/2304 [2:04:04<5:19:30, 10.41s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0944


 20%|██        | 464/2304 [2:04:12<4:58:02,  9.72s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0790


 20%|██        | 465/2304 [2:04:20<4:44:18,  9.28s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0702


 20%|██        | 466/2304 [2:04:33<5:14:19, 10.26s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0978


 20%|██        | 467/2304 [2:04:45<5:32:51, 10.87s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0742


 20%|██        | 468/2304 [2:04:57<5:48:19, 11.38s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0943


 20%|██        | 469/2304 [2:05:06<5:19:25, 10.44s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0855


 20%|██        | 470/2304 [2:05:14<4:57:26,  9.73s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0819


 20%|██        | 471/2304 [2:05:22<4:43:10,  9.27s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0780


 20%|██        | 472/2304 [2:05:35<5:13:01, 10.25s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0819


 21%|██        | 473/2304 [2:05:47<5:31:20, 10.86s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1108


 21%|██        | 474/2304 [2:05:59<5:46:29, 11.36s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0716


 21%|██        | 475/2304 [2:06:08<5:17:12, 10.41s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0829


 21%|██        | 476/2304 [2:06:16<4:55:05,  9.69s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0802


 21%|██        | 477/2304 [2:06:24<4:42:19,  9.27s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0946


 21%|██        | 478/2304 [2:06:36<5:12:48, 10.28s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0750


 21%|██        | 479/2304 [2:06:49<5:30:39, 10.87s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0709


 21%|██        | 480/2304 [2:07:01<5:45:54, 11.38s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0817


 21%|██        | 481/2304 [2:07:09<5:16:30, 10.42s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0701


 21%|██        | 482/2304 [2:07:17<4:53:44,  9.67s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0767


 21%|██        | 483/2304 [2:07:26<4:40:04,  9.23s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0730


 21%|██        | 484/2304 [2:07:38<5:08:34, 10.17s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0667


 21%|██        | 485/2304 [2:07:50<5:26:48, 10.78s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0725


 21%|██        | 486/2304 [2:08:03<5:41:15, 11.26s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0677


 21%|██        | 487/2304 [2:08:11<5:12:09, 10.31s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0744


 21%|██        | 488/2304 [2:08:19<4:50:41,  9.60s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0765


 21%|██        | 489/2304 [2:08:27<4:37:21,  9.17s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0835


 21%|██▏       | 490/2304 [2:08:39<5:08:07, 10.19s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0752


 21%|██▏       | 491/2304 [2:08:52<5:26:51, 10.82s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0746


 21%|██▏       | 492/2304 [2:09:04<5:41:19, 11.30s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0774


 21%|██▏       | 493/2304 [2:09:12<5:13:06, 10.37s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0777


 21%|██▏       | 494/2304 [2:09:20<4:51:31,  9.66s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0737


 21%|██▏       | 495/2304 [2:09:28<4:37:56,  9.22s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1101


 22%|██▏       | 496/2304 [2:09:41<5:06:24, 10.17s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0693


 22%|██▏       | 497/2304 [2:09:53<5:23:36, 10.75s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0770


 22%|██▏       | 498/2304 [2:10:05<5:37:55, 11.23s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0835


 22%|██▏       | 499/2304 [2:10:13<5:09:30, 10.29s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0701


 22%|██▏       | 500/2304 [2:10:21<4:48:31,  9.60s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0797


 22%|██▏       | 501/2304 [2:10:30<4:35:45,  9.18s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0737


 22%|██▏       | 502/2304 [2:10:42<5:04:46, 10.15s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0850


 22%|██▏       | 503/2304 [2:10:54<5:22:47, 10.75s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0738


 22%|██▏       | 504/2304 [2:11:07<5:38:06, 11.27s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0790


 22%|██▏       | 505/2304 [2:11:15<5:09:22, 10.32s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0798


 22%|██▏       | 506/2304 [2:11:23<4:48:00,  9.61s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0778


 22%|██▏       | 507/2304 [2:11:31<4:35:12,  9.19s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0745


 22%|██▏       | 508/2304 [2:11:43<5:03:39, 10.14s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0783


 22%|██▏       | 509/2304 [2:11:55<5:22:10, 10.77s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0766


 22%|██▏       | 510/2304 [2:12:08<5:36:53, 11.27s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0841


 22%|██▏       | 511/2304 [2:12:16<5:08:09, 10.31s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0774


 22%|██▏       | 512/2304 [2:12:24<4:47:30,  9.63s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0889


 22%|██▏       | 513/2304 [2:12:32<4:34:17,  9.19s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0762


 22%|██▏       | 514/2304 [2:12:45<5:03:05, 10.16s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0761


 22%|██▏       | 515/2304 [2:12:57<5:20:58, 10.77s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0672


 22%|██▏       | 516/2304 [2:13:09<5:35:47, 11.27s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0673


 22%|██▏       | 517/2304 [2:13:17<5:07:40, 10.33s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0964


 22%|██▏       | 518/2304 [2:13:25<4:47:42,  9.67s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0667


 23%|██▎       | 519/2304 [2:13:34<4:34:24,  9.22s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1201


 23%|██▎       | 520/2304 [2:13:46<5:02:12, 10.16s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0767


 23%|██▎       | 521/2304 [2:13:58<5:19:46, 10.76s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0712


 23%|██▎       | 522/2304 [2:14:11<5:34:50, 11.27s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0965


 23%|██▎       | 523/2304 [2:14:19<5:06:41, 10.33s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1047


 23%|██▎       | 524/2304 [2:14:27<4:45:34,  9.63s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1042


 23%|██▎       | 525/2304 [2:14:35<4:32:34,  9.19s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0812


 23%|██▎       | 526/2304 [2:14:47<5:00:57, 10.16s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0870


 23%|██▎       | 527/2304 [2:14:59<5:18:25, 10.75s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0757


 23%|██▎       | 528/2304 [2:15:12<5:33:18, 11.26s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0793


 23%|██▎       | 529/2304 [2:15:20<5:05:32, 10.33s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0807


 23%|██▎       | 530/2304 [2:15:28<4:44:22,  9.62s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0865


 23%|██▎       | 531/2304 [2:15:36<4:31:38,  9.19s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0800


 23%|██▎       | 532/2304 [2:15:49<5:01:59, 10.23s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0861


 23%|██▎       | 533/2304 [2:16:01<5:20:35, 10.86s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0765


 23%|██▎       | 534/2304 [2:16:14<5:35:04, 11.36s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0885


 23%|██▎       | 535/2304 [2:16:22<5:06:47, 10.41s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0844


 23%|██▎       | 536/2304 [2:16:30<4:44:57,  9.67s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0944


 23%|██▎       | 537/2304 [2:16:38<4:31:23,  9.22s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0819


 23%|██▎       | 538/2304 [2:16:50<5:00:05, 10.20s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0776


 23%|██▎       | 539/2304 [2:17:03<5:17:46, 10.80s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0672


 23%|██▎       | 540/2304 [2:17:15<5:32:25, 11.31s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0815


 23%|██▎       | 541/2304 [2:17:23<5:04:38, 10.37s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0775


 24%|██▎       | 542/2304 [2:17:31<4:43:28,  9.65s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0940


 24%|██▎       | 543/2304 [2:17:39<4:30:01,  9.20s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1007


 24%|██▎       | 544/2304 [2:17:52<4:58:24, 10.17s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0717


 24%|██▎       | 545/2304 [2:18:04<5:15:57, 10.78s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0865


 24%|██▎       | 546/2304 [2:18:17<5:30:56, 11.29s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0797


 24%|██▎       | 547/2304 [2:18:25<5:02:54, 10.34s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0885


 24%|██▍       | 548/2304 [2:18:33<4:42:31,  9.65s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0910


 24%|██▍       | 549/2304 [2:18:41<4:29:20,  9.21s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0795


 24%|██▍       | 550/2304 [2:18:54<4:58:26, 10.21s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0850


 24%|██▍       | 551/2304 [2:19:06<5:16:37, 10.84s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0738


 24%|██▍       | 552/2304 [2:19:18<5:31:46, 11.36s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0699


 24%|██▍       | 553/2304 [2:19:27<5:03:58, 10.42s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0976


 24%|██▍       | 554/2304 [2:19:35<4:42:27,  9.68s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0796


 24%|██▍       | 555/2304 [2:19:43<4:28:47,  9.22s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0717


 24%|██▍       | 556/2304 [2:19:55<4:57:34, 10.21s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0873


 24%|██▍       | 557/2304 [2:20:07<5:14:40, 10.81s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0710


 24%|██▍       | 558/2304 [2:20:20<5:28:53, 11.30s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0737


 24%|██▍       | 559/2304 [2:20:28<5:01:23, 10.36s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0854


 24%|██▍       | 560/2304 [2:20:36<4:41:10,  9.67s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0814


 24%|██▍       | 561/2304 [2:20:44<4:28:11,  9.23s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0850


 24%|██▍       | 562/2304 [2:20:57<4:56:31, 10.21s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0711


 24%|██▍       | 563/2304 [2:21:09<5:14:39, 10.84s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0712


 24%|██▍       | 564/2304 [2:21:22<5:29:25, 11.36s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0692


 25%|██▍       | 565/2304 [2:21:30<5:01:45, 10.41s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0884


 25%|██▍       | 566/2304 [2:21:38<4:40:23,  9.68s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0941


 25%|██▍       | 567/2304 [2:21:46<4:27:18,  9.23s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0788


 25%|██▍       | 568/2304 [2:21:59<4:55:44, 10.22s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0806


 25%|██▍       | 569/2304 [2:22:11<5:13:12, 10.83s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0786


 25%|██▍       | 570/2304 [2:22:23<5:27:35, 11.34s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1141


 25%|██▍       | 571/2304 [2:22:32<5:00:40, 10.41s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0830


 25%|██▍       | 572/2304 [2:22:40<4:39:17,  9.68s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0754


 25%|██▍       | 573/2304 [2:22:48<4:26:29,  9.24s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0836


 25%|██▍       | 574/2304 [2:23:00<4:54:05, 10.20s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0915


 25%|██▍       | 575/2304 [2:23:13<5:11:45, 10.82s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0791


 25%|██▌       | 576/2304 [2:23:25<5:26:48, 11.35s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0667


 25%|██▌       | 577/2304 [2:23:30<4:28:29,  9.33s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0782


 25%|██▌       | 578/2304 [2:23:34<3:48:31,  7.94s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0823


 25%|██▌       | 579/2304 [2:23:39<3:20:05,  6.96s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1303


 25%|██▌       | 580/2304 [2:23:46<3:20:42,  6.99s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0762


 25%|██▌       | 581/2304 [2:23:53<3:18:09,  6.90s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0712


 25%|██▌       | 582/2304 [2:24:00<3:17:44,  6.89s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0913


 25%|██▌       | 583/2304 [2:24:04<2:59:26,  6.26s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0801


 25%|██▌       | 584/2304 [2:24:09<2:44:41,  5.75s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0826


 25%|██▌       | 585/2304 [2:24:14<2:34:48,  5.40s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0736


 25%|██▌       | 586/2304 [2:24:21<2:48:06,  5.87s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1488


 25%|██▌       | 587/2304 [2:24:27<2:54:30,  6.10s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0712


 26%|██▌       | 588/2304 [2:24:34<3:01:57,  6.36s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1023


 26%|██▌       | 589/2304 [2:24:39<2:46:50,  5.84s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0825


 26%|██▌       | 590/2304 [2:24:43<2:35:32,  5.44s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0727


 26%|██▌       | 591/2304 [2:24:48<2:29:41,  5.24s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0775


 26%|██▌       | 592/2304 [2:24:55<2:43:05,  5.72s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0984


 26%|██▌       | 593/2304 [2:25:02<2:51:42,  6.02s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0719


 26%|██▌       | 594/2304 [2:25:09<3:00:14,  6.32s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0749


 26%|██▌       | 595/2304 [2:25:13<2:45:36,  5.81s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0917


 26%|██▌       | 596/2304 [2:25:18<2:34:24,  5.42s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0734


 26%|██▌       | 597/2304 [2:25:23<2:28:56,  5.24s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0748


 26%|██▌       | 598/2304 [2:25:29<2:42:01,  5.70s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0713


 26%|██▌       | 599/2304 [2:25:36<2:51:42,  6.04s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0737


 26%|██▌       | 600/2304 [2:25:43<2:58:05,  6.27s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0777


 26%|██▌       | 601/2304 [2:25:48<2:44:27,  5.79s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0770


 26%|██▌       | 602/2304 [2:25:52<2:34:36,  5.45s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0744


 26%|██▌       | 603/2304 [2:25:57<2:27:25,  5.20s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0832


 26%|██▌       | 604/2304 [2:26:04<2:40:30,  5.66s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0913


 26%|██▋       | 605/2304 [2:26:11<2:50:56,  6.04s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0752


 26%|██▋       | 606/2304 [2:26:18<2:57:47,  6.28s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1121


 26%|██▋       | 607/2304 [2:26:22<2:45:39,  5.86s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0920


 26%|██▋       | 608/2304 [2:26:27<2:34:15,  5.46s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0729


 26%|██▋       | 609/2304 [2:26:32<2:27:30,  5.22s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0920


 26%|██▋       | 610/2304 [2:26:39<2:42:21,  5.75s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1248


 27%|██▋       | 611/2304 [2:26:45<2:50:27,  6.04s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0698


 27%|██▋       | 612/2304 [2:26:52<2:56:34,  6.26s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0880


 27%|██▋       | 613/2304 [2:26:57<2:43:47,  5.81s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0879


 27%|██▋       | 614/2304 [2:27:01<2:32:26,  5.41s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0720


 27%|██▋       | 615/2304 [2:27:06<2:26:53,  5.22s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0757


 27%|██▋       | 616/2304 [2:27:13<2:40:41,  5.71s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0755


 27%|██▋       | 617/2304 [2:27:20<2:48:58,  6.01s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0878


 27%|██▋       | 618/2304 [2:27:27<2:56:35,  6.28s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1217


 27%|██▋       | 619/2304 [2:27:31<2:42:47,  5.80s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0833


 27%|██▋       | 620/2304 [2:27:36<2:32:07,  5.42s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0714


 27%|██▋       | 621/2304 [2:27:41<2:26:25,  5.22s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0791


 27%|██▋       | 622/2304 [2:27:47<2:39:53,  5.70s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0781


 27%|██▋       | 623/2304 [2:27:54<2:49:30,  6.05s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0707


 27%|██▋       | 624/2304 [2:28:01<2:56:33,  6.31s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0949


 27%|██▋       | 625/2304 [2:28:06<2:42:30,  5.81s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1012


 27%|██▋       | 626/2304 [2:28:10<2:33:28,  5.49s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0846


 27%|██▋       | 627/2304 [2:28:15<2:26:10,  5.23s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0760


 27%|██▋       | 628/2304 [2:28:22<2:39:41,  5.72s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0780


 27%|██▋       | 629/2304 [2:28:29<2:49:07,  6.06s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0667


 27%|██▋       | 630/2304 [2:28:36<2:55:54,  6.31s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1143


 27%|██▋       | 631/2304 [2:28:40<2:41:43,  5.80s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0741


 27%|██▋       | 632/2304 [2:28:45<2:32:20,  5.47s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0793


 27%|██▋       | 633/2304 [2:28:50<2:25:05,  5.21s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0770


 28%|██▊       | 634/2304 [2:28:57<2:39:48,  5.74s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1117


 28%|██▊       | 635/2304 [2:29:03<2:47:44,  6.03s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0671


 28%|██▊       | 636/2304 [2:29:10<2:55:00,  6.30s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0887


 28%|██▊       | 637/2304 [2:29:15<2:42:17,  5.84s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0737


 28%|██▊       | 638/2304 [2:29:20<2:31:50,  5.47s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0707


 28%|██▊       | 639/2304 [2:29:24<2:25:05,  5.23s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0724


 28%|██▊       | 640/2304 [2:29:31<2:39:24,  5.75s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0794


 28%|██▊       | 641/2304 [2:29:38<2:48:04,  6.06s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0882


 28%|██▊       | 642/2304 [2:29:45<2:55:43,  6.34s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0883


 28%|██▊       | 643/2304 [2:29:50<2:41:47,  5.84s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0718


 28%|██▊       | 644/2304 [2:29:54<2:30:53,  5.45s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0728


 28%|██▊       | 645/2304 [2:29:59<2:25:47,  5.27s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0751


 28%|██▊       | 646/2304 [2:30:06<2:39:06,  5.76s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0904


 28%|██▊       | 647/2304 [2:30:13<2:47:15,  6.06s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0717


 28%|██▊       | 648/2304 [2:30:20<2:55:04,  6.34s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0748


 28%|██▊       | 649/2304 [2:30:24<2:40:48,  5.83s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1004


 28%|██▊       | 650/2304 [2:30:29<2:31:09,  5.48s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0954


 28%|██▊       | 651/2304 [2:30:34<2:24:31,  5.25s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1040


 28%|██▊       | 652/2304 [2:30:41<2:37:16,  5.71s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0854


 28%|██▊       | 653/2304 [2:30:47<2:46:47,  6.06s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0670


 28%|██▊       | 654/2304 [2:30:54<2:53:22,  6.30s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0900


 28%|██▊       | 655/2304 [2:30:59<2:39:26,  5.80s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0734


 28%|██▊       | 656/2304 [2:31:04<2:30:09,  5.47s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0781


 29%|██▊       | 657/2304 [2:31:08<2:23:13,  5.22s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0841


 29%|██▊       | 658/2304 [2:31:15<2:37:37,  5.75s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0961


 29%|██▊       | 659/2304 [2:31:22<2:45:39,  6.04s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0694


 29%|██▊       | 660/2304 [2:31:29<2:51:57,  6.28s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0926


 29%|██▊       | 661/2304 [2:31:34<2:39:49,  5.84s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0771


 29%|██▊       | 662/2304 [2:31:38<2:29:07,  5.45s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0722


 29%|██▉       | 663/2304 [2:31:43<2:22:22,  5.21s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0732


 29%|██▉       | 664/2304 [2:31:50<2:37:15,  5.75s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0876


 29%|██▉       | 665/2304 [2:31:57<2:45:14,  6.05s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0749


 29%|██▉       | 666/2304 [2:32:03<2:51:45,  6.29s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1027


 29%|██▉       | 667/2304 [2:32:08<2:39:47,  5.86s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0735


 29%|██▉       | 668/2304 [2:32:13<2:29:13,  5.47s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0777


 29%|██▉       | 669/2304 [2:32:18<2:23:41,  5.27s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0717


 29%|██▉       | 670/2304 [2:32:25<2:36:37,  5.75s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0972


 29%|██▉       | 671/2304 [2:32:31<2:44:11,  6.03s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0706


 29%|██▉       | 672/2304 [2:32:38<2:52:01,  6.32s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0748


 29%|██▉       | 673/2304 [2:32:43<2:38:23,  5.83s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0999


 29%|██▉       | 674/2304 [2:32:47<2:27:56,  5.45s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0861


 29%|██▉       | 675/2304 [2:32:52<2:22:17,  5.24s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0983


 29%|██▉       | 676/2304 [2:32:59<2:35:44,  5.74s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0847


 29%|██▉       | 677/2304 [2:33:06<2:44:58,  6.08s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0743


 29%|██▉       | 678/2304 [2:33:13<2:52:27,  6.36s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0765


 29%|██▉       | 679/2304 [2:33:18<2:38:41,  5.86s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0780


 30%|██▉       | 680/2304 [2:33:22<2:29:46,  5.53s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0738


 30%|██▉       | 681/2304 [2:33:27<2:22:42,  5.28s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0965


 30%|██▉       | 682/2304 [2:33:34<2:35:39,  5.76s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0876


 30%|██▉       | 683/2304 [2:33:41<2:44:38,  6.09s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0747


 30%|██▉       | 684/2304 [2:33:48<2:50:57,  6.33s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0953


 30%|██▉       | 685/2304 [2:33:53<2:38:05,  5.86s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0810


 30%|██▉       | 686/2304 [2:33:57<2:27:18,  5.46s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0845


 30%|██▉       | 687/2304 [2:34:02<2:20:40,  5.22s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0802


 30%|██▉       | 688/2304 [2:34:09<2:34:42,  5.74s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1105


 30%|██▉       | 689/2304 [2:34:15<2:42:07,  6.02s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0809


 30%|██▉       | 690/2304 [2:34:22<2:48:46,  6.27s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0810


 30%|██▉       | 691/2304 [2:34:27<2:36:46,  5.83s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0783


 30%|███       | 692/2304 [2:34:32<2:26:34,  5.46s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0726


 30%|███       | 693/2304 [2:34:36<2:21:01,  5.25s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0728


 30%|███       | 694/2304 [2:34:43<2:33:34,  5.72s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0828


 30%|███       | 695/2304 [2:34:50<2:41:24,  6.02s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0740


 30%|███       | 696/2304 [2:34:57<2:49:25,  6.32s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1045


 30%|███       | 697/2304 [2:35:02<2:35:41,  5.81s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1069


 30%|███       | 698/2304 [2:35:06<2:25:13,  5.43s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0779


 30%|███       | 699/2304 [2:35:11<2:20:17,  5.24s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0974


 30%|███       | 700/2304 [2:35:18<2:33:10,  5.73s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0832


 30%|███       | 701/2304 [2:35:25<2:41:11,  6.03s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0837


 30%|███       | 702/2304 [2:35:31<2:48:27,  6.31s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0690


 31%|███       | 703/2304 [2:35:36<2:35:04,  5.81s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0834


 31%|███       | 704/2304 [2:35:41<2:25:59,  5.47s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0753


 31%|███       | 705/2304 [2:35:45<2:19:16,  5.23s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1055


 31%|███       | 706/2304 [2:35:52<2:31:57,  5.71s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1175


 31%|███       | 707/2304 [2:35:59<2:41:43,  6.08s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1225


 31%|███       | 708/2304 [2:36:06<2:48:09,  6.32s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0928


 31%|███       | 709/2304 [2:36:11<2:35:12,  5.84s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0857


 31%|███       | 710/2304 [2:36:16<2:26:12,  5.50s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0781


 31%|███       | 711/2304 [2:36:20<2:19:27,  5.25s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0761


 31%|███       | 712/2304 [2:36:27<2:33:07,  5.77s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1081


 31%|███       | 713/2304 [2:36:34<2:40:48,  6.06s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0776


 31%|███       | 714/2304 [2:36:41<2:47:06,  6.31s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0881


 31%|███       | 715/2304 [2:36:46<2:35:24,  5.87s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0745


 31%|███       | 716/2304 [2:36:50<2:24:33,  5.46s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0811


 31%|███       | 717/2304 [2:36:55<2:17:50,  5.21s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1197


 31%|███       | 718/2304 [2:37:02<2:32:21,  5.76s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0862


 31%|███       | 719/2304 [2:37:09<2:40:01,  6.06s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0756


 31%|███▏      | 720/2304 [2:37:16<2:47:30,  6.34s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1079


 31%|███▏      | 721/2304 [2:37:20<2:34:07,  5.84s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0770


 31%|███▏      | 722/2304 [2:37:25<2:23:53,  5.46s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0747


 31%|███▏      | 723/2304 [2:37:30<2:18:22,  5.25s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0905


 31%|███▏      | 724/2304 [2:37:36<2:31:10,  5.74s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0871


 31%|███▏      | 725/2304 [2:37:43<2:38:29,  6.02s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0673


 32%|███▏      | 726/2304 [2:37:50<2:46:05,  6.32s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0836


 32%|███▏      | 727/2304 [2:37:55<2:32:27,  5.80s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1227


 32%|███▏      | 728/2304 [2:38:00<2:24:03,  5.48s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0786


 32%|███▏      | 729/2304 [2:38:04<2:17:04,  5.22s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0892


 32%|███▏      | 730/2304 [2:38:11<2:29:53,  5.71s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0668


 32%|███▏      | 731/2304 [2:38:18<2:38:42,  6.05s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0673


 32%|███▏      | 732/2304 [2:38:25<2:45:13,  6.31s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0910


 32%|███▏      | 733/2304 [2:38:29<2:31:53,  5.80s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0764


 32%|███▏      | 734/2304 [2:38:34<2:23:24,  5.48s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0806


 32%|███▏      | 735/2304 [2:38:39<2:16:54,  5.24s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0818


 32%|███▏      | 736/2304 [2:38:46<2:30:04,  5.74s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0981


 32%|███▏      | 737/2304 [2:38:53<2:39:17,  6.10s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0741


 32%|███▏      | 738/2304 [2:38:59<2:45:22,  6.34s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1157


 32%|███▏      | 739/2304 [2:39:04<2:32:55,  5.86s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0727


 32%|███▏      | 740/2304 [2:39:09<2:22:45,  5.48s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0761


 32%|███▏      | 741/2304 [2:39:13<2:16:03,  5.22s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0807


 32%|███▏      | 742/2304 [2:39:20<2:29:39,  5.75s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0806


 32%|███▏      | 743/2304 [2:39:27<2:37:01,  6.04s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0699


 32%|███▏      | 744/2304 [2:39:34<2:43:50,  6.30s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0798


 32%|███▏      | 745/2304 [2:39:39<2:31:54,  5.85s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1013


 32%|███▏      | 746/2304 [2:39:43<2:21:35,  5.45s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0769


 32%|███▏      | 747/2304 [2:39:48<2:16:22,  5.26s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0828


 32%|███▏      | 748/2304 [2:39:55<2:28:17,  5.72s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0881


 33%|███▎      | 749/2304 [2:40:02<2:36:11,  6.03s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0671


 33%|███▎      | 750/2304 [2:40:09<2:43:43,  6.32s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0941


 33%|███▎      | 751/2304 [2:40:13<2:30:44,  5.82s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1154


 33%|███▎      | 752/2304 [2:40:18<2:20:33,  5.43s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0949


 33%|███▎      | 753/2304 [2:40:23<2:15:59,  5.26s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0891


 33%|███▎      | 754/2304 [2:40:30<2:27:59,  5.73s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0890


 33%|███▎      | 755/2304 [2:40:37<2:37:05,  6.08s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0677


 33%|███▎      | 756/2304 [2:40:43<2:42:37,  6.30s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0878


 33%|███▎      | 757/2304 [2:40:48<2:29:52,  5.81s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0830


 33%|███▎      | 758/2304 [2:40:53<2:21:09,  5.48s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0808


 33%|███▎      | 759/2304 [2:40:57<2:14:56,  5.24s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1011


 33%|███▎      | 760/2304 [2:41:04<2:26:59,  5.71s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1002


 33%|███▎      | 761/2304 [2:41:11<2:36:30,  6.09s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0781


 33%|███▎      | 762/2304 [2:41:18<2:42:07,  6.31s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1040


 33%|███▎      | 763/2304 [2:41:23<2:29:19,  5.81s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0821


 33%|███▎      | 764/2304 [2:41:27<2:20:41,  5.48s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0766


 33%|███▎      | 765/2304 [2:41:32<2:14:41,  5.25s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0822


 33%|███▎      | 766/2304 [2:41:39<2:28:21,  5.79s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1018


 33%|███▎      | 767/2304 [2:41:46<2:35:44,  6.08s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0900


 33%|███▎      | 768/2304 [2:41:53<2:41:32,  6.31s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0913


 33%|███▎      | 769/2304 [2:41:58<2:29:58,  5.86s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0775


 33%|███▎      | 770/2304 [2:42:02<2:19:40,  5.46s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0831


 33%|███▎      | 771/2304 [2:42:07<2:13:05,  5.21s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1193


 34%|███▎      | 772/2304 [2:42:14<2:26:38,  5.74s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0763


 34%|███▎      | 773/2304 [2:42:20<2:33:54,  6.03s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0683


 34%|███▎      | 774/2304 [2:42:27<2:41:35,  6.34s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1005


 34%|███▎      | 775/2304 [2:42:32<2:28:31,  5.83s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0962


 34%|███▎      | 776/2304 [2:42:37<2:18:46,  5.45s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0811


 34%|███▎      | 777/2304 [2:42:41<2:13:16,  5.24s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1008


 34%|███▍      | 778/2304 [2:42:48<2:25:22,  5.72s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0975


 34%|███▍      | 779/2304 [2:42:55<2:32:36,  6.00s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0683


 34%|███▍      | 780/2304 [2:43:02<2:40:15,  6.31s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0918


 34%|███▍      | 781/2304 [2:43:07<2:27:21,  5.81s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0797


 34%|███▍      | 782/2304 [2:43:11<2:18:50,  5.47s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0712


 34%|███▍      | 783/2304 [2:43:16<2:12:24,  5.22s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1356


 34%|███▍      | 784/2304 [2:43:23<2:24:58,  5.72s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0822


 34%|███▍      | 785/2304 [2:43:30<2:32:55,  6.04s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0803


 34%|███▍      | 786/2304 [2:43:36<2:39:03,  6.29s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0990


 34%|███▍      | 787/2304 [2:43:41<2:26:36,  5.80s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1530


 34%|███▍      | 788/2304 [2:43:46<2:18:15,  5.47s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0742


 34%|███▍      | 789/2304 [2:43:50<2:11:32,  5.21s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0799


 34%|███▍      | 790/2304 [2:43:57<2:24:36,  5.73s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0892


 34%|███▍      | 791/2304 [2:44:04<2:31:31,  6.01s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0839


 34%|███▍      | 792/2304 [2:44:11<2:37:39,  6.26s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1034


 34%|███▍      | 793/2304 [2:44:16<2:26:09,  5.80s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0874


 34%|███▍      | 794/2304 [2:44:20<2:16:27,  5.42s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0745


 35%|███▍      | 795/2304 [2:44:25<2:10:24,  5.19s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0916


 35%|███▍      | 796/2304 [2:44:32<2:23:12,  5.70s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0809


 35%|███▍      | 797/2304 [2:44:38<2:30:41,  6.00s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0682


 35%|███▍      | 798/2304 [2:44:45<2:36:38,  6.24s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0822


 35%|███▍      | 799/2304 [2:44:50<2:25:26,  5.80s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0738


 35%|███▍      | 800/2304 [2:44:54<2:15:33,  5.41s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0806


 35%|███▍      | 801/2304 [2:44:59<2:10:58,  5.23s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0865


 35%|███▍      | 802/2304 [2:45:06<2:23:06,  5.72s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0848


 35%|███▍      | 803/2304 [2:45:13<2:30:38,  6.02s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0687


 35%|███▍      | 804/2304 [2:45:20<2:37:32,  6.30s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0882


 35%|███▍      | 805/2304 [2:45:24<2:24:57,  5.80s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0838


 35%|███▍      | 806/2304 [2:45:29<2:15:09,  5.41s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0865


 35%|███▌      | 807/2304 [2:45:34<2:10:36,  5.23s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0985


 35%|███▌      | 808/2304 [2:45:41<2:22:35,  5.72s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1000


 35%|███▌      | 809/2304 [2:45:47<2:31:10,  6.07s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0715


 35%|███▌      | 810/2304 [2:45:54<2:36:30,  6.29s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0847


 35%|███▌      | 811/2304 [2:45:59<2:24:41,  5.81s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1034


 35%|███▌      | 812/2304 [2:46:04<2:16:24,  5.49s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0796


 35%|███▌      | 813/2304 [2:46:08<2:10:13,  5.24s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0946


 35%|███▌      | 814/2304 [2:46:15<2:22:14,  5.73s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0941


 35%|███▌      | 815/2304 [2:46:22<2:30:36,  6.07s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0878


 35%|███▌      | 816/2304 [2:46:29<2:35:54,  6.29s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0695


 35%|███▌      | 817/2304 [2:46:34<2:24:52,  5.85s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0913


 36%|███▌      | 818/2304 [2:46:38<2:14:52,  5.45s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0763


 36%|███▌      | 819/2304 [2:46:43<2:08:32,  5.19s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0817


 36%|███▌      | 820/2304 [2:46:50<2:21:29,  5.72s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0948


 36%|███▌      | 821/2304 [2:46:56<2:28:13,  6.00s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0692


 36%|███▌      | 822/2304 [2:47:03<2:33:58,  6.23s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0897


 36%|███▌      | 823/2304 [2:47:08<2:22:47,  5.79s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0974


 36%|███▌      | 824/2304 [2:47:12<2:13:30,  5.41s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0873


 36%|███▌      | 825/2304 [2:47:17<2:08:32,  5.21s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0881


 36%|███▌      | 826/2304 [2:47:24<2:20:32,  5.71s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0987


 36%|███▌      | 827/2304 [2:47:31<2:27:28,  5.99s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0691


 36%|███▌      | 828/2304 [2:47:38<2:34:51,  6.29s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0960


 36%|███▌      | 829/2304 [2:47:42<2:22:06,  5.78s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0716


 36%|███▌      | 830/2304 [2:47:47<2:13:02,  5.42s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0761


 36%|███▌      | 831/2304 [2:47:52<2:08:44,  5.24s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0792


 36%|███▌      | 832/2304 [2:47:59<2:20:22,  5.72s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0878


 36%|███▌      | 833/2304 [2:48:05<2:27:25,  6.01s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0677


 36%|███▌      | 834/2304 [2:48:12<2:34:21,  6.30s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0927


 36%|███▌      | 835/2304 [2:48:17<2:21:51,  5.79s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1059


 36%|███▋      | 836/2304 [2:48:22<2:14:02,  5.48s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0795


 36%|███▋      | 837/2304 [2:48:26<2:07:42,  5.22s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0881


 36%|███▋      | 838/2304 [2:48:33<2:19:10,  5.70s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0717


 36%|███▋      | 839/2304 [2:48:40<2:27:14,  6.03s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0877


 36%|███▋      | 840/2304 [2:48:47<2:32:48,  6.26s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0830


 37%|███▋      | 841/2304 [2:48:51<2:20:55,  5.78s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0988


 37%|███▋      | 842/2304 [2:48:56<2:12:39,  5.44s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0729


 37%|███▋      | 843/2304 [2:49:01<2:06:46,  5.21s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0877


 37%|███▋      | 844/2304 [2:49:07<2:19:06,  5.72s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0869


 37%|███▋      | 845/2304 [2:49:14<2:26:07,  6.01s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0691


 37%|███▋      | 846/2304 [2:49:21<2:31:52,  6.25s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0791


 37%|███▋      | 847/2304 [2:49:26<2:21:24,  5.82s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0883


 37%|███▋      | 848/2304 [2:49:30<2:12:10,  5.45s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0732


 37%|███▋      | 849/2304 [2:49:35<2:06:27,  5.21s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1104


 37%|███▋      | 850/2304 [2:49:42<2:19:25,  5.75s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0676


 37%|███▋      | 851/2304 [2:49:49<2:26:33,  6.05s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0688


 37%|███▋      | 852/2304 [2:49:56<2:33:05,  6.33s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0786


 37%|███▋      | 853/2304 [2:50:00<2:21:04,  5.83s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1231


 37%|███▋      | 854/2304 [2:50:05<2:11:42,  5.45s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0794


 37%|███▋      | 855/2304 [2:50:10<2:06:45,  5.25s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1111


 37%|███▋      | 856/2304 [2:50:17<2:17:59,  5.72s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0896


 37%|███▋      | 857/2304 [2:50:23<2:25:14,  6.02s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0721


 37%|███▋      | 858/2304 [2:50:30<2:31:48,  6.30s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0848


 37%|███▋      | 859/2304 [2:50:35<2:19:32,  5.79s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0900


 37%|███▋      | 860/2304 [2:50:40<2:11:10,  5.45s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0785


 37%|███▋      | 861/2304 [2:50:44<2:05:02,  5.20s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0770


 37%|███▋      | 862/2304 [2:50:51<2:16:37,  5.68s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0910


 37%|███▋      | 863/2304 [2:50:58<2:24:37,  6.02s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0797


 38%|███▊      | 864/2304 [2:51:05<2:30:03,  6.25s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1022


 38%|███▊      | 865/2304 [2:51:08<2:06:41,  5.28s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0698


 38%|███▊      | 866/2304 [2:51:10<1:49:16,  4.56s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0696


 38%|███▊      | 867/2304 [2:51:13<1:38:03,  4.09s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0714


 38%|███▊      | 868/2304 [2:51:17<1:37:13,  4.06s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0804


 38%|███▊      | 869/2304 [2:51:22<1:37:18,  4.07s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0673


 38%|███▊      | 870/2304 [2:51:26<1:37:57,  4.10s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0717


 38%|███▊      | 871/2304 [2:51:29<1:29:00,  3.73s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0707


 38%|███▊      | 872/2304 [2:51:32<1:23:29,  3.50s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0741


 38%|███▊      | 873/2304 [2:51:35<1:20:06,  3.36s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0669


 38%|███▊      | 874/2304 [2:51:39<1:24:27,  3.54s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0761


 38%|███▊      | 875/2304 [2:51:43<1:28:06,  3.70s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0669


 38%|███▊      | 876/2304 [2:51:47<1:31:31,  3.85s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0721


 38%|███▊      | 877/2304 [2:51:50<1:24:23,  3.55s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0709


 38%|███▊      | 878/2304 [2:51:53<1:20:11,  3.37s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0689


 38%|███▊      | 879/2304 [2:51:55<1:16:27,  3.22s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0695


 38%|███▊      | 880/2304 [2:52:00<1:23:18,  3.51s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0700


 38%|███▊      | 881/2304 [2:52:04<1:27:18,  3.68s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0699


 38%|███▊      | 882/2304 [2:52:08<1:30:37,  3.82s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0673


 38%|███▊      | 883/2304 [2:52:11<1:24:07,  3.55s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0690


 38%|███▊      | 884/2304 [2:52:14<1:20:07,  3.39s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0698


 38%|███▊      | 885/2304 [2:52:17<1:16:23,  3.23s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0696


 38%|███▊      | 886/2304 [2:52:21<1:22:49,  3.50s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0720


 38%|███▊      | 887/2304 [2:52:25<1:26:56,  3.68s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0726


 39%|███▊      | 888/2304 [2:52:29<1:30:03,  3.82s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0713


 39%|███▊      | 889/2304 [2:52:32<1:23:20,  3.53s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0679


 39%|███▊      | 890/2304 [2:52:35<1:19:42,  3.38s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0708


 39%|███▊      | 891/2304 [2:52:38<1:16:08,  3.23s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0795


 39%|███▊      | 892/2304 [2:52:42<1:22:41,  3.51s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0809


 39%|███▉      | 893/2304 [2:52:46<1:26:43,  3.69s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0717


 39%|███▉      | 894/2304 [2:52:50<1:30:13,  3.84s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0728


 39%|███▉      | 895/2304 [2:52:53<1:23:28,  3.55s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0715


 39%|███▉      | 896/2304 [2:52:56<1:19:26,  3.39s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0681


 39%|███▉      | 897/2304 [2:52:59<1:16:01,  3.24s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0738


 39%|███▉      | 898/2304 [2:53:03<1:22:13,  3.51s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0719


 39%|███▉      | 899/2304 [2:53:07<1:26:02,  3.67s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0725


 39%|███▉      | 900/2304 [2:53:11<1:28:38,  3.79s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0678


 39%|███▉      | 901/2304 [2:53:14<1:23:07,  3.55s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0702


 39%|███▉      | 902/2304 [2:53:17<1:17:58,  3.34s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0736


 39%|███▉      | 903/2304 [2:53:20<1:15:41,  3.24s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0677


 39%|███▉      | 904/2304 [2:53:24<1:21:58,  3.51s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0724


 39%|███▉      | 905/2304 [2:53:28<1:24:50,  3.64s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0699


 39%|███▉      | 906/2304 [2:53:32<1:28:15,  3.79s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0748


 39%|███▉      | 907/2304 [2:53:35<1:23:02,  3.57s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0671


 39%|███▉      | 908/2304 [2:53:38<1:17:41,  3.34s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0695


 39%|███▉      | 909/2304 [2:53:41<1:15:19,  3.24s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0711


 39%|███▉      | 910/2304 [2:53:45<1:21:36,  3.51s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0695


 40%|███▉      | 911/2304 [2:53:49<1:24:40,  3.65s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0791


 40%|███▉      | 912/2304 [2:53:54<1:28:15,  3.80s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0671


 40%|███▉      | 913/2304 [2:53:57<1:22:47,  3.57s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0728


 40%|███▉      | 914/2304 [2:53:59<1:17:50,  3.36s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0715


 40%|███▉      | 915/2304 [2:54:02<1:15:17,  3.25s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0766


 40%|███▉      | 916/2304 [2:54:06<1:20:21,  3.47s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0697


 40%|███▉      | 917/2304 [2:54:11<1:24:30,  3.66s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0688


 40%|███▉      | 918/2304 [2:54:15<1:27:47,  3.80s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0656


 40%|███▉      | 919/2304 [2:54:18<1:21:13,  3.52s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0721


 40%|███▉      | 920/2304 [2:54:20<1:17:22,  3.35s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0708


 40%|███▉      | 921/2304 [2:54:23<1:14:12,  3.22s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0676


 40%|████      | 922/2304 [2:54:28<1:20:43,  3.50s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0720


 40%|████      | 923/2304 [2:54:32<1:24:31,  3.67s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0738


 40%|████      | 924/2304 [2:54:36<1:28:03,  3.83s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0766


 40%|████      | 925/2304 [2:54:39<1:21:29,  3.55s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0712


 40%|████      | 926/2304 [2:54:42<1:17:38,  3.38s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0688


 40%|████      | 927/2304 [2:54:45<1:14:13,  3.23s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0678


 40%|████      | 928/2304 [2:54:49<1:20:55,  3.53s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0721


 40%|████      | 929/2304 [2:54:53<1:24:47,  3.70s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0734


 40%|████      | 930/2304 [2:54:57<1:28:03,  3.85s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0689


 40%|████      | 931/2304 [2:55:00<1:21:37,  3.57s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0699


 40%|████      | 932/2304 [2:55:03<1:17:34,  3.39s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0691


 40%|████      | 933/2304 [2:55:06<1:14:00,  3.24s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0701


 41%|████      | 934/2304 [2:55:10<1:20:10,  3.51s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0707


 41%|████      | 935/2304 [2:55:14<1:24:15,  3.69s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0734


 41%|████      | 936/2304 [2:55:18<1:26:17,  3.78s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0688


 41%|████      | 937/2304 [2:55:21<1:20:59,  3.55s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0722


 41%|████      | 938/2304 [2:55:24<1:17:11,  3.39s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0694


 41%|████      | 939/2304 [2:55:27<1:13:36,  3.24s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0783


 41%|████      | 940/2304 [2:55:31<1:19:45,  3.51s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0767


 41%|████      | 941/2304 [2:55:35<1:23:52,  3.69s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0739


 41%|████      | 942/2304 [2:55:39<1:26:00,  3.79s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0734


 41%|████      | 943/2304 [2:55:42<1:20:38,  3.56s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0746


 41%|████      | 944/2304 [2:55:45<1:15:41,  3.34s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0710


 41%|████      | 945/2304 [2:55:48<1:13:40,  3.25s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0702


 41%|████      | 946/2304 [2:55:52<1:19:39,  3.52s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0769


 41%|████      | 947/2304 [2:55:56<1:22:26,  3.65s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0679


 41%|████      | 948/2304 [2:56:00<1:25:59,  3.80s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0681


 41%|████      | 949/2304 [2:56:03<1:20:26,  3.56s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0697


 41%|████      | 950/2304 [2:56:06<1:15:13,  3.33s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0712


 41%|████▏     | 951/2304 [2:56:09<1:13:00,  3.24s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0725


 41%|████▏     | 952/2304 [2:56:13<1:19:28,  3.53s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0703


 41%|████▏     | 953/2304 [2:56:17<1:22:03,  3.64s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0752


 41%|████▏     | 954/2304 [2:56:22<1:25:19,  3.79s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0739


 41%|████▏     | 955/2304 [2:56:25<1:20:16,  3.57s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0745


 41%|████▏     | 956/2304 [2:56:27<1:15:08,  3.34s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0760


 42%|████▏     | 957/2304 [2:56:30<1:12:53,  3.25s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0733


 42%|████▏     | 958/2304 [2:56:35<1:19:16,  3.53s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0736


 42%|████▏     | 959/2304 [2:56:39<1:22:24,  3.68s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0704


 42%|████▏     | 960/2304 [2:56:43<1:25:38,  3.82s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0740


 42%|████▏     | 961/2304 [2:56:46<1:20:11,  3.58s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0822


 42%|████▏     | 962/2304 [2:56:49<1:15:09,  3.36s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0751


 42%|████▏     | 963/2304 [2:56:52<1:12:52,  3.26s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0827


 42%|████▏     | 964/2304 [2:56:56<1:18:46,  3.53s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0770


 42%|████▏     | 965/2304 [2:57:00<1:21:36,  3.66s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0724


 42%|████▏     | 966/2304 [2:57:04<1:24:52,  3.81s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0769


 42%|████▏     | 967/2304 [2:57:07<1:19:34,  3.57s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0777


 42%|████▏     | 968/2304 [2:57:10<1:14:38,  3.35s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0701


 42%|████▏     | 969/2304 [2:57:13<1:12:33,  3.26s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0733


 42%|████▏     | 970/2304 [2:57:17<1:18:27,  3.53s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0835


 42%|████▏     | 971/2304 [2:57:21<1:21:08,  3.65s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0691


 42%|████▏     | 972/2304 [2:57:25<1:24:37,  3.81s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0724


 42%|████▏     | 973/2304 [2:57:28<1:19:20,  3.58s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0733


 42%|████▏     | 974/2304 [2:57:31<1:14:15,  3.35s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0697


 42%|████▏     | 975/2304 [2:57:34<1:12:05,  3.25s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0828


 42%|████▏     | 976/2304 [2:57:38<1:16:58,  3.48s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0738


 42%|████▏     | 977/2304 [2:57:42<1:20:43,  3.65s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0709


 42%|████▏     | 978/2304 [2:57:46<1:24:01,  3.80s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0724


 42%|████▏     | 979/2304 [2:57:49<1:18:01,  3.53s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0720


 43%|████▎     | 980/2304 [2:57:52<1:14:09,  3.36s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0758


 43%|████▎     | 981/2304 [2:57:55<1:11:50,  3.26s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0718


 43%|████▎     | 982/2304 [2:57:59<1:16:56,  3.49s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0733


 43%|████▎     | 983/2304 [2:58:03<1:20:53,  3.67s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0726


 43%|████▎     | 984/2304 [2:58:07<1:23:52,  3.81s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0747


 43%|████▎     | 985/2304 [2:58:10<1:17:42,  3.53s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0768


 43%|████▎     | 986/2304 [2:58:13<1:13:59,  3.37s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0714


 43%|████▎     | 987/2304 [2:58:16<1:10:37,  3.22s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0833


 43%|████▎     | 988/2304 [2:58:20<1:16:31,  3.49s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0679


 43%|████▎     | 989/2304 [2:58:24<1:20:35,  3.68s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0725


 43%|████▎     | 990/2304 [2:58:28<1:23:37,  3.82s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0812


 43%|████▎     | 991/2304 [2:58:31<1:17:15,  3.53s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0773


 43%|████▎     | 992/2304 [2:58:34<1:13:31,  3.36s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0768


 43%|████▎     | 993/2304 [2:58:37<1:10:19,  3.22s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0672


 43%|████▎     | 994/2304 [2:58:41<1:16:14,  3.49s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0773


 43%|████▎     | 995/2304 [2:58:45<1:20:05,  3.67s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0713


 43%|████▎     | 996/2304 [2:58:50<1:23:47,  3.84s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0757


 43%|████▎     | 997/2304 [2:58:53<1:17:24,  3.55s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0709


 43%|████▎     | 998/2304 [2:58:56<1:13:47,  3.39s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0805


 43%|████▎     | 999/2304 [2:58:58<1:10:28,  3.24s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0715


 43%|████▎     | 1000/2304 [2:59:03<1:16:39,  3.53s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0820


 43%|████▎     | 1001/2304 [2:59:07<1:20:15,  3.70s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0702


 43%|████▎     | 1002/2304 [2:59:11<1:22:14,  3.79s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0797


 44%|████▎     | 1003/2304 [2:59:14<1:17:09,  3.56s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0701


 44%|████▎     | 1004/2304 [2:59:17<1:13:08,  3.38s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0725


 44%|████▎     | 1005/2304 [2:59:20<1:09:38,  3.22s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0734


 44%|████▎     | 1006/2304 [2:59:24<1:15:40,  3.50s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0715


 44%|████▎     | 1007/2304 [2:59:28<1:19:16,  3.67s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0713


 44%|████▍     | 1008/2304 [2:59:32<1:21:19,  3.76s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0746


 44%|████▍     | 1009/2304 [2:59:35<1:16:30,  3.55s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0672


 44%|████▍     | 1010/2304 [2:59:38<1:12:10,  3.35s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0758


 44%|████▍     | 1011/2304 [2:59:41<1:10:12,  3.26s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0768


 44%|████▍     | 1012/2304 [2:59:45<1:15:56,  3.53s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0743


 44%|████▍     | 1013/2304 [2:59:49<1:18:55,  3.67s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0705


 44%|████▍     | 1014/2304 [2:59:53<1:22:06,  3.82s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0806


 44%|████▍     | 1015/2304 [2:59:56<1:16:57,  3.58s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0825


 44%|████▍     | 1016/2304 [2:59:59<1:12:06,  3.36s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0710


 44%|████▍     | 1017/2304 [3:00:02<1:10:02,  3.27s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0828


 44%|████▍     | 1018/2304 [3:00:06<1:15:32,  3.52s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0855


 44%|████▍     | 1019/2304 [3:00:10<1:18:12,  3.65s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0677


 44%|████▍     | 1020/2304 [3:00:14<1:21:20,  3.80s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0770


 44%|████▍     | 1021/2304 [3:00:17<1:16:14,  3.57s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0721


 44%|████▍     | 1022/2304 [3:00:20<1:11:27,  3.34s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0796


 44%|████▍     | 1023/2304 [3:00:23<1:09:16,  3.24s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0716


 44%|████▍     | 1024/2304 [3:00:27<1:14:10,  3.48s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0689


 44%|████▍     | 1025/2304 [3:00:31<1:17:51,  3.65s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0683


 45%|████▍     | 1026/2304 [3:00:35<1:21:05,  3.81s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0732


 45%|████▍     | 1027/2304 [3:00:38<1:16:02,  3.57s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0714


 45%|████▍     | 1028/2304 [3:00:41<1:11:11,  3.35s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0752


 45%|████▍     | 1029/2304 [3:00:44<1:08:04,  3.20s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0754


 45%|████▍     | 1030/2304 [3:00:48<1:14:11,  3.49s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0740


 45%|████▍     | 1031/2304 [3:00:52<1:17:39,  3.66s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0731


 45%|████▍     | 1032/2304 [3:00:56<1:20:47,  3.81s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0877


 45%|████▍     | 1033/2304 [3:00:59<1:14:51,  3.53s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0731


 45%|████▍     | 1034/2304 [3:01:02<1:11:10,  3.36s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0766


 45%|████▍     | 1035/2304 [3:01:05<1:07:52,  3.21s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0797


 45%|████▍     | 1036/2304 [3:01:09<1:13:36,  3.48s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0809


 45%|████▌     | 1037/2304 [3:01:13<1:17:21,  3.66s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0679


 45%|████▌     | 1038/2304 [3:01:17<1:20:13,  3.80s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0824


 45%|████▌     | 1039/2304 [3:01:20<1:14:12,  3.52s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0731


 45%|████▌     | 1040/2304 [3:01:23<1:10:39,  3.35s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0761


 45%|████▌     | 1041/2304 [3:01:26<1:07:33,  3.21s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0890


 45%|████▌     | 1042/2304 [3:01:30<1:13:20,  3.49s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0851


 45%|████▌     | 1043/2304 [3:01:34<1:17:05,  3.67s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0744


 45%|████▌     | 1044/2304 [3:01:38<1:19:54,  3.81s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0805


 45%|████▌     | 1045/2304 [3:01:41<1:13:51,  3.52s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0721


 45%|████▌     | 1046/2304 [3:01:44<1:10:17,  3.35s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0742


 45%|████▌     | 1047/2304 [3:01:47<1:07:17,  3.21s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0766


 45%|████▌     | 1048/2304 [3:01:51<1:13:09,  3.49s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0883


 46%|████▌     | 1049/2304 [3:01:55<1:16:40,  3.67s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0768


 46%|████▌     | 1050/2304 [3:02:00<1:19:50,  3.82s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0739


 46%|████▌     | 1051/2304 [3:02:02<1:13:51,  3.54s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0706


 46%|████▌     | 1052/2304 [3:02:05<1:10:10,  3.36s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0794


 46%|████▌     | 1053/2304 [3:02:08<1:06:57,  3.21s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0761


 46%|████▌     | 1054/2304 [3:02:12<1:13:03,  3.51s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0744


 46%|████▌     | 1055/2304 [3:02:17<1:16:22,  3.67s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0702


 46%|████▌     | 1056/2304 [3:02:20<1:18:13,  3.76s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0769


 46%|████▌     | 1057/2304 [3:02:24<1:13:34,  3.54s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0707


 46%|████▌     | 1058/2304 [3:02:26<1:08:59,  3.32s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0815


 46%|████▌     | 1059/2304 [3:02:29<1:06:57,  3.23s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0780


 46%|████▌     | 1060/2304 [3:02:33<1:12:27,  3.50s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0788


 46%|████▌     | 1061/2304 [3:02:37<1:15:21,  3.64s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0687


 46%|████▌     | 1062/2304 [3:02:42<1:18:16,  3.78s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0787


 46%|████▌     | 1063/2304 [3:02:45<1:13:22,  3.55s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0780


 46%|████▌     | 1064/2304 [3:02:47<1:08:47,  3.33s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0711


 46%|████▌     | 1065/2304 [3:02:50<1:06:45,  3.23s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0787


 46%|████▋     | 1066/2304 [3:02:54<1:12:13,  3.50s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0839


 46%|████▋     | 1067/2304 [3:02:58<1:14:55,  3.63s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0691


 46%|████▋     | 1068/2304 [3:03:03<1:18:24,  3.81s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1008


 46%|████▋     | 1069/2304 [3:03:06<1:13:26,  3.57s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0696


 46%|████▋     | 1070/2304 [3:03:09<1:08:53,  3.35s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0743


 46%|████▋     | 1071/2304 [3:03:12<1:06:47,  3.25s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0743


 47%|████▋     | 1072/2304 [3:03:16<1:11:16,  3.47s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0848


 47%|████▋     | 1073/2304 [3:03:20<1:14:49,  3.65s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0807


 47%|████▋     | 1074/2304 [3:03:24<1:17:48,  3.80s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0687


 47%|████▋     | 1075/2304 [3:03:27<1:12:08,  3.52s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0729


 47%|████▋     | 1076/2304 [3:03:30<1:08:45,  3.36s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0773


 47%|████▋     | 1077/2304 [3:03:32<1:05:43,  3.21s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0755


 47%|████▋     | 1078/2304 [3:03:37<1:11:27,  3.50s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0764


 47%|████▋     | 1079/2304 [3:03:41<1:14:54,  3.67s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0725


 47%|████▋     | 1080/2304 [3:03:45<1:17:46,  3.81s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0739


 47%|████▋     | 1081/2304 [3:03:48<1:11:59,  3.53s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0755


 47%|████▋     | 1082/2304 [3:03:51<1:08:31,  3.36s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0702


 47%|████▋     | 1083/2304 [3:03:54<1:05:23,  3.21s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0763


 47%|████▋     | 1084/2304 [3:03:58<1:10:52,  3.49s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0731


 47%|████▋     | 1085/2304 [3:04:02<1:14:38,  3.67s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0690


 47%|████▋     | 1086/2304 [3:04:06<1:17:30,  3.82s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0798


 47%|████▋     | 1087/2304 [3:04:09<1:11:37,  3.53s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0717


 47%|████▋     | 1088/2304 [3:04:12<1:08:17,  3.37s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0754


 47%|████▋     | 1089/2304 [3:04:15<1:05:15,  3.22s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0757


 47%|████▋     | 1090/2304 [3:04:19<1:10:42,  3.49s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0863


 47%|████▋     | 1091/2304 [3:04:23<1:14:11,  3.67s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0693


 47%|████▋     | 1092/2304 [3:04:27<1:17:07,  3.82s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0967


 47%|████▋     | 1093/2304 [3:04:30<1:11:20,  3.53s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0785


 47%|████▋     | 1094/2304 [3:04:33<1:07:50,  3.36s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0742


 48%|████▊     | 1095/2304 [3:04:36<1:04:51,  3.22s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0803


 48%|████▊     | 1096/2304 [3:04:40<1:10:24,  3.50s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0695


 48%|████▊     | 1097/2304 [3:04:44<1:13:49,  3.67s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0699


 48%|████▊     | 1098/2304 [3:04:48<1:16:04,  3.79s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0650


 48%|████▊     | 1099/2304 [3:04:51<1:11:26,  3.56s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0747


 48%|████▊     | 1100/2304 [3:04:54<1:06:55,  3.34s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0726


 48%|████▊     | 1101/2304 [3:04:57<1:04:55,  3.24s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0712


 48%|████▊     | 1102/2304 [3:05:01<1:10:23,  3.51s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0745


 48%|████▊     | 1103/2304 [3:05:05<1:13:41,  3.68s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0733


 48%|████▊     | 1104/2304 [3:05:09<1:15:30,  3.78s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0801


 48%|████▊     | 1105/2304 [3:05:12<1:11:00,  3.55s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0750


 48%|████▊     | 1106/2304 [3:05:15<1:06:53,  3.35s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0634


 48%|████▊     | 1107/2304 [3:05:18<1:04:56,  3.25s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0818


 48%|████▊     | 1108/2304 [3:05:22<1:10:12,  3.52s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1045


 48%|████▊     | 1109/2304 [3:05:26<1:12:50,  3.66s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0698


 48%|████▊     | 1110/2304 [3:05:30<1:15:54,  3.81s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0762


 48%|████▊     | 1111/2304 [3:05:33<1:11:02,  3.57s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0726


 48%|████▊     | 1112/2304 [3:05:36<1:06:44,  3.36s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0775


 48%|████▊     | 1113/2304 [3:05:39<1:04:43,  3.26s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0770


 48%|████▊     | 1114/2304 [3:05:43<1:09:08,  3.49s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0796


 48%|████▊     | 1115/2304 [3:05:47<1:12:46,  3.67s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0744


 48%|████▊     | 1116/2304 [3:05:52<1:15:33,  3.82s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0730


 48%|████▊     | 1117/2304 [3:05:54<1:09:51,  3.53s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0705


 49%|████▊     | 1118/2304 [3:05:57<1:06:30,  3.37s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0662


 49%|████▊     | 1119/2304 [3:06:00<1:03:43,  3.23s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0759


 49%|████▊     | 1120/2304 [3:06:04<1:09:20,  3.51s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0679


 49%|████▊     | 1121/2304 [3:06:09<1:12:50,  3.69s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0794


 49%|████▊     | 1122/2304 [3:06:13<1:15:49,  3.85s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0713


 49%|████▊     | 1123/2304 [3:06:16<1:10:00,  3.56s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0753


 49%|████▉     | 1124/2304 [3:06:19<1:06:30,  3.38s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0743


 49%|████▉     | 1125/2304 [3:06:21<1:03:24,  3.23s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0719


 49%|████▉     | 1126/2304 [3:06:26<1:08:53,  3.51s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0787


 49%|████▉     | 1127/2304 [3:06:30<1:12:12,  3.68s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0783


 49%|████▉     | 1128/2304 [3:06:34<1:14:54,  3.82s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0711


 49%|████▉     | 1129/2304 [3:06:37<1:09:31,  3.55s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0756


 49%|████▉     | 1130/2304 [3:06:40<1:06:02,  3.38s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0694


 49%|████▉     | 1131/2304 [3:06:43<1:02:58,  3.22s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0783


 49%|████▉     | 1132/2304 [3:06:47<1:08:20,  3.50s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1019


 49%|████▉     | 1133/2304 [3:06:51<1:11:45,  3.68s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0704


 49%|████▉     | 1134/2304 [3:06:55<1:14:28,  3.82s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0720


 49%|████▉     | 1135/2304 [3:06:58<1:08:45,  3.53s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0746


 49%|████▉     | 1136/2304 [3:07:01<1:05:41,  3.37s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0730


 49%|████▉     | 1137/2304 [3:07:04<1:02:43,  3.23s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0758


 49%|████▉     | 1138/2304 [3:07:08<1:08:02,  3.50s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0755


 49%|████▉     | 1139/2304 [3:07:12<1:11:22,  3.68s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0695


 49%|████▉     | 1140/2304 [3:07:16<1:13:10,  3.77s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0890


 50%|████▉     | 1141/2304 [3:07:19<1:08:37,  3.54s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0739


 50%|████▉     | 1142/2304 [3:07:22<1:05:13,  3.37s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0853


 50%|████▉     | 1143/2304 [3:07:25<1:02:52,  3.25s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0802


 50%|████▉     | 1144/2304 [3:07:29<1:08:13,  3.53s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0782


 50%|████▉     | 1145/2304 [3:07:33<1:11:26,  3.70s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0756


 50%|████▉     | 1146/2304 [3:07:37<1:13:28,  3.81s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0732


 50%|████▉     | 1147/2304 [3:07:40<1:08:50,  3.57s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0732


 50%|████▉     | 1148/2304 [3:07:43<1:05:23,  3.39s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0755


 50%|████▉     | 1149/2304 [3:07:46<1:02:23,  3.24s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0697


 50%|████▉     | 1150/2304 [3:07:50<1:08:00,  3.54s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0801


 50%|████▉     | 1151/2304 [3:07:54<1:11:01,  3.70s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0740


 50%|█████     | 1152/2304 [3:07:58<1:12:44,  3.79s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0659


 50%|█████     | 1153/2304 [3:08:14<2:20:59,  7.35s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1212


 50%|█████     | 1154/2304 [3:08:29<3:05:26,  9.68s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0851


 50%|█████     | 1155/2304 [3:08:45<3:39:40, 11.47s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0925


 50%|█████     | 1156/2304 [3:09:09<4:52:43, 15.30s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0920


 50%|█████     | 1157/2304 [3:09:33<5:41:22, 17.86s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0798


 50%|█████     | 1158/2304 [3:09:57<6:18:10, 19.80s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0817


 50%|█████     | 1159/2304 [3:10:13<5:53:27, 18.52s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1190


 50%|█████     | 1160/2304 [3:10:28<5:34:05, 17.52s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0858


 50%|█████     | 1161/2304 [3:10:43<5:22:13, 16.91s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0970


 50%|█████     | 1162/2304 [3:11:08<6:02:48, 19.06s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0883


 50%|█████     | 1163/2304 [3:11:31<6:27:49, 20.39s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0933


 51%|█████     | 1164/2304 [3:11:55<6:49:46, 21.57s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0748


 51%|█████     | 1165/2304 [3:12:11<6:15:59, 19.81s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1435


 51%|█████     | 1166/2304 [3:12:26<5:50:20, 18.47s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1119


 51%|█████     | 1167/2304 [3:12:42<5:34:02, 17.63s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0992


 51%|█████     | 1168/2304 [3:13:06<6:10:55, 19.59s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0861


 51%|█████     | 1169/2304 [3:13:30<6:34:13, 20.84s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0825


 51%|█████     | 1170/2304 [3:13:54<6:53:10, 21.86s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0782


 51%|█████     | 1171/2304 [3:14:10<6:16:51, 19.96s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1360


 51%|█████     | 1172/2304 [3:14:25<5:49:49, 18.54s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1305


 51%|█████     | 1173/2304 [3:14:41<5:32:43, 17.65s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1739


 51%|█████     | 1174/2304 [3:15:05<6:09:52, 19.64s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0832


 51%|█████     | 1175/2304 [3:15:28<6:31:46, 20.82s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0786


 51%|█████     | 1176/2304 [3:15:53<6:50:09, 21.82s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1192


 51%|█████     | 1177/2304 [3:16:08<6:14:34, 19.94s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1324


 51%|█████     | 1178/2304 [3:16:23<5:47:00, 18.49s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0899


 51%|█████     | 1179/2304 [3:16:39<5:29:43, 17.58s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1117


 51%|█████     | 1180/2304 [3:17:03<6:06:05, 19.54s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1186


 51%|█████▏    | 1181/2304 [3:17:27<6:29:00, 20.78s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0953


 51%|█████▏    | 1182/2304 [3:17:51<6:47:39, 21.80s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1840


 51%|█████▏    | 1183/2304 [3:18:06<6:11:27, 19.88s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0965


 51%|█████▏    | 1184/2304 [3:18:21<5:45:12, 18.49s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1398


 51%|█████▏    | 1185/2304 [3:18:37<5:28:21, 17.61s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0918


 51%|█████▏    | 1186/2304 [3:19:01<6:04:31, 19.56s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0689


 52%|█████▏    | 1187/2304 [3:19:25<6:26:48, 20.78s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1128


 52%|█████▏    | 1188/2304 [3:19:49<6:45:51, 21.82s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0866


 52%|█████▏    | 1189/2304 [3:20:04<6:09:57, 19.91s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0993


 52%|█████▏    | 1190/2304 [3:20:20<5:43:33, 18.50s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1484


 52%|█████▏    | 1191/2304 [3:20:35<5:26:43, 17.61s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1214


 52%|█████▏    | 1192/2304 [3:20:59<6:03:25, 19.61s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0902


 52%|█████▏    | 1193/2304 [3:21:23<6:26:27, 20.87s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1210


 52%|█████▏    | 1194/2304 [3:21:48<6:45:26, 21.92s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0900


 52%|█████▏    | 1195/2304 [3:22:03<6:09:47, 20.01s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0865


 52%|█████▏    | 1196/2304 [3:22:18<5:42:33, 18.55s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0929


 52%|█████▏    | 1197/2304 [3:22:34<5:26:13, 17.68s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1164


 52%|█████▏    | 1198/2304 [3:22:58<6:02:19, 19.66s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0884


 52%|█████▏    | 1199/2304 [3:23:22<6:24:30, 20.88s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0885


 52%|█████▏    | 1200/2304 [3:23:46<6:43:12, 21.91s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1664


 52%|█████▏    | 1201/2304 [3:24:02<6:07:59, 20.02s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1122


 52%|█████▏    | 1202/2304 [3:24:17<5:40:54, 18.56s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1651


 52%|█████▏    | 1203/2304 [3:24:33<5:24:18, 17.67s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0964


 52%|█████▏    | 1204/2304 [3:24:57<6:00:14, 19.65s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0726


 52%|█████▏    | 1205/2304 [3:25:21<6:22:19, 20.87s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0794


 52%|█████▏    | 1206/2304 [3:25:45<6:40:44, 21.90s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0964


 52%|█████▏    | 1207/2304 [3:26:00<6:05:33, 19.99s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0804


 52%|█████▏    | 1208/2304 [3:26:16<5:39:17, 18.57s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1376


 52%|█████▏    | 1209/2304 [3:26:31<5:22:36, 17.68s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0938


 53%|█████▎    | 1210/2304 [3:26:55<5:57:16, 19.59s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0940


 53%|█████▎    | 1211/2304 [3:27:19<6:20:11, 20.87s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0997


 53%|█████▎    | 1212/2304 [3:27:44<6:39:13, 21.94s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0763


 53%|█████▎    | 1213/2304 [3:27:59<6:03:38, 20.00s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1480


 53%|█████▎    | 1214/2304 [3:28:14<5:37:28, 18.58s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0873


 53%|█████▎    | 1215/2304 [3:28:30<5:21:05, 17.69s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1243


 53%|█████▎    | 1216/2304 [3:28:54<5:56:41, 19.67s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0831


 53%|█████▎    | 1217/2304 [3:29:18<6:18:34, 20.90s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0862


 53%|█████▎    | 1218/2304 [3:29:42<6:37:29, 21.96s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0872


 53%|█████▎    | 1219/2304 [3:29:58<6:02:29, 20.05s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1435


 53%|█████▎    | 1220/2304 [3:30:13<5:35:54, 18.59s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0894


 53%|█████▎    | 1221/2304 [3:30:29<5:19:23, 17.70s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0929


 53%|█████▎    | 1222/2304 [3:30:53<5:55:12, 19.70s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0776


 53%|█████▎    | 1223/2304 [3:31:17<6:16:41, 20.91s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.2011


 53%|█████▎    | 1224/2304 [3:31:41<6:34:09, 21.90s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0688


 53%|█████▎    | 1225/2304 [3:31:57<5:59:09, 19.97s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1069


 53%|█████▎    | 1226/2304 [3:32:12<5:33:34, 18.57s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1453


 53%|█████▎    | 1227/2304 [3:32:28<5:17:56, 17.71s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0914


 53%|█████▎    | 1228/2304 [3:32:52<5:53:57, 19.74s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0672


 53%|█████▎    | 1229/2304 [3:33:16<6:15:18, 20.95s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1278


 53%|█████▎    | 1230/2304 [3:33:40<6:32:36, 21.93s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0891


 53%|█████▎    | 1231/2304 [3:33:56<5:57:48, 20.01s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1041


 53%|█████▎    | 1232/2304 [3:34:11<5:31:14, 18.54s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1253


 54%|█████▎    | 1233/2304 [3:34:26<5:14:48, 17.64s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1053


 54%|█████▎    | 1234/2304 [3:34:51<5:50:27, 19.65s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0836


 54%|█████▎    | 1235/2304 [3:35:14<6:11:24, 20.85s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0967


 54%|█████▎    | 1236/2304 [3:35:39<6:29:53, 21.90s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0680


 54%|█████▎    | 1237/2304 [3:35:54<5:55:44, 20.00s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0992


 54%|█████▎    | 1238/2304 [3:36:09<5:29:39, 18.55s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1019


 54%|█████▍    | 1239/2304 [3:36:25<5:13:16, 17.65s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1645


 54%|█████▍    | 1240/2304 [3:36:49<5:48:35, 19.66s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0870


 54%|█████▍    | 1241/2304 [3:37:13<6:10:35, 20.92s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0821


 54%|█████▍    | 1242/2304 [3:37:37<6:28:05, 21.93s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0864


 54%|█████▍    | 1243/2304 [3:37:53<5:53:44, 20.00s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0880


 54%|█████▍    | 1244/2304 [3:38:08<5:27:50, 18.56s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1989


 54%|█████▍    | 1245/2304 [3:38:24<5:11:33, 17.65s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1115


 54%|█████▍    | 1246/2304 [3:38:48<5:46:21, 19.64s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0857


 54%|█████▍    | 1247/2304 [3:39:12<6:07:08, 20.84s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1076


 54%|█████▍    | 1248/2304 [3:39:36<6:25:07, 21.88s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0751


 54%|█████▍    | 1249/2304 [3:39:51<5:51:14, 19.98s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1158


 54%|█████▍    | 1250/2304 [3:40:07<5:26:02, 18.56s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0908


 54%|█████▍    | 1251/2304 [3:40:22<5:10:04, 17.67s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0933


 54%|█████▍    | 1252/2304 [3:40:47<5:45:21, 19.70s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0790


 54%|█████▍    | 1253/2304 [3:41:10<6:06:42, 20.93s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0774


 54%|█████▍    | 1254/2304 [3:41:35<6:24:48, 21.99s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0738


 54%|█████▍    | 1255/2304 [3:41:50<5:50:22, 20.04s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0994


 55%|█████▍    | 1256/2304 [3:42:06<5:24:35, 18.58s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0734


 55%|█████▍    | 1257/2304 [3:42:21<5:07:41, 17.63s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1484


 55%|█████▍    | 1258/2304 [3:42:45<5:41:47, 19.61s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0674


 55%|█████▍    | 1259/2304 [3:43:09<6:03:54, 20.89s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0658


 55%|█████▍    | 1260/2304 [3:43:33<6:21:27, 21.92s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0737


 55%|█████▍    | 1261/2304 [3:43:49<5:48:28, 20.05s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1413


 55%|█████▍    | 1262/2304 [3:44:04<5:23:03, 18.60s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0750


 55%|█████▍    | 1263/2304 [3:44:20<5:06:20, 17.66s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1019


 55%|█████▍    | 1264/2304 [3:44:44<5:40:08, 19.62s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1183


 55%|█████▍    | 1265/2304 [3:45:08<6:02:00, 20.91s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0905


 55%|█████▍    | 1266/2304 [3:45:32<6:19:56, 21.96s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0759


 55%|█████▍    | 1267/2304 [3:45:48<5:46:49, 20.07s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0705


 55%|█████▌    | 1268/2304 [3:46:03<5:21:20, 18.61s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0989


 55%|█████▌    | 1269/2304 [3:46:19<5:04:57, 17.68s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1327


 55%|█████▌    | 1270/2304 [3:46:43<5:38:58, 19.67s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0858


 55%|█████▌    | 1271/2304 [3:47:07<5:59:32, 20.88s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0815


 55%|█████▌    | 1272/2304 [3:47:31<6:16:52, 21.91s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0866


 55%|█████▌    | 1273/2304 [3:47:47<5:43:35, 20.00s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0919


 55%|█████▌    | 1274/2304 [3:48:02<5:18:33, 18.56s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0707


 55%|█████▌    | 1275/2304 [3:48:17<5:03:17, 17.68s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0838


 55%|█████▌    | 1276/2304 [3:48:42<5:36:59, 19.67s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0893


 55%|█████▌    | 1277/2304 [3:49:05<5:57:55, 20.91s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0709


 55%|█████▌    | 1278/2304 [3:49:30<6:15:22, 21.95s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0888


 56%|█████▌    | 1279/2304 [3:49:45<5:42:13, 20.03s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0879


 56%|█████▌    | 1280/2304 [3:50:01<5:17:14, 18.59s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0794


 56%|█████▌    | 1281/2304 [3:50:16<5:01:47, 17.70s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1421


 56%|█████▌    | 1282/2304 [3:50:41<5:35:03, 19.67s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0871


 56%|█████▌    | 1283/2304 [3:51:04<5:55:44, 20.91s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1067


 56%|█████▌    | 1284/2304 [3:51:29<6:13:11, 21.95s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1008


 56%|█████▌    | 1285/2304 [3:51:44<5:39:59, 20.02s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1472


 56%|█████▌    | 1286/2304 [3:51:59<5:14:49, 18.56s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0874


 56%|█████▌    | 1287/2304 [3:52:15<4:59:00, 17.64s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0778


 56%|█████▌    | 1288/2304 [3:52:39<5:31:43, 19.59s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1833


 56%|█████▌    | 1289/2304 [3:53:03<5:53:00, 20.87s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1022


 56%|█████▌    | 1290/2304 [3:53:27<6:10:25, 21.92s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0908


 56%|█████▌    | 1291/2304 [3:53:43<5:38:07, 20.03s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1173


 56%|█████▌    | 1292/2304 [3:53:58<5:13:26, 18.58s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0959


 56%|█████▌    | 1293/2304 [3:54:14<4:57:29, 17.66s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0982


 56%|█████▌    | 1294/2304 [3:54:38<5:30:21, 19.63s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0878


 56%|█████▌    | 1295/2304 [3:55:02<5:51:32, 20.90s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0963


 56%|█████▋    | 1296/2304 [3:55:26<6:08:20, 21.93s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0849


 56%|█████▋    | 1297/2304 [3:55:41<5:35:07, 19.97s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0899


 56%|█████▋    | 1298/2304 [3:55:56<5:10:07, 18.50s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1596


 56%|█████▋    | 1299/2304 [3:56:12<4:55:06, 17.62s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0954


 56%|█████▋    | 1300/2304 [3:56:36<5:27:31, 19.57s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0674


 56%|█████▋    | 1301/2304 [3:57:00<5:47:46, 20.80s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1032


 57%|█████▋    | 1302/2304 [3:57:24<6:04:35, 21.83s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0674


 57%|█████▋    | 1303/2304 [3:57:40<5:32:33, 19.93s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1076


 57%|█████▋    | 1304/2304 [3:57:55<5:08:26, 18.51s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1198


 57%|█████▋    | 1305/2304 [3:58:10<4:53:07, 17.61s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0856


 57%|█████▋    | 1306/2304 [3:58:34<5:24:48, 19.53s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0840


 57%|█████▋    | 1307/2304 [3:58:58<5:44:28, 20.73s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1005


 57%|█████▋    | 1308/2304 [3:59:22<6:01:22, 21.77s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0869


 57%|█████▋    | 1309/2304 [3:59:37<5:29:39, 19.88s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0947


 57%|█████▋    | 1310/2304 [3:59:53<5:05:45, 18.46s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1558


 57%|█████▋    | 1311/2304 [4:00:08<4:50:54, 17.58s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1034


 57%|█████▋    | 1312/2304 [4:00:32<5:23:17, 19.55s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0863


 57%|█████▋    | 1313/2304 [4:00:56<5:44:12, 20.84s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1274


 57%|█████▋    | 1314/2304 [4:01:20<6:00:32, 21.85s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0752


 57%|█████▋    | 1315/2304 [4:01:36<5:28:13, 19.91s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0879


 57%|█████▋    | 1316/2304 [4:01:51<5:04:10, 18.47s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0853


 57%|█████▋    | 1317/2304 [4:02:06<4:49:37, 17.61s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1245


 57%|█████▋    | 1318/2304 [4:02:31<5:22:34, 19.63s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0683


 57%|█████▋    | 1319/2304 [4:02:54<5:42:21, 20.85s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0808


 57%|█████▋    | 1320/2304 [4:03:19<5:59:21, 21.91s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0799


 57%|█████▋    | 1321/2304 [4:03:34<5:27:12, 19.97s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1081


 57%|█████▋    | 1322/2304 [4:03:49<5:03:13, 18.53s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0928


 57%|█████▋    | 1323/2304 [4:04:05<4:47:56, 17.61s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0789


 57%|█████▋    | 1324/2304 [4:04:29<5:19:32, 19.56s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0936


 58%|█████▊    | 1325/2304 [4:04:53<5:39:44, 20.82s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0868


 58%|█████▊    | 1326/2304 [4:05:17<5:56:22, 21.86s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0671


 58%|█████▊    | 1327/2304 [4:05:33<5:24:41, 19.94s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0871


 58%|█████▊    | 1328/2304 [4:05:48<5:00:57, 18.50s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0914


 58%|█████▊    | 1329/2304 [4:06:03<4:46:22, 17.62s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0892


 58%|█████▊    | 1330/2304 [4:06:27<5:17:38, 19.57s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0934


 58%|█████▊    | 1331/2304 [4:06:51<5:37:01, 20.78s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0993


 58%|█████▊    | 1332/2304 [4:07:15<5:52:45, 21.77s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.2548


 58%|█████▊    | 1333/2304 [4:07:31<5:21:56, 19.89s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0939


 58%|█████▊    | 1334/2304 [4:07:46<4:59:06, 18.50s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1117


 58%|█████▊    | 1335/2304 [4:08:01<4:44:30, 17.62s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1102


 58%|█████▊    | 1336/2304 [4:08:25<5:15:34, 19.56s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1126


 58%|█████▊    | 1337/2304 [4:08:49<5:34:42, 20.77s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1260


 58%|█████▊    | 1338/2304 [4:09:13<5:51:05, 21.81s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0904


 58%|█████▊    | 1339/2304 [4:09:29<5:20:08, 19.91s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0992


 58%|█████▊    | 1340/2304 [4:09:44<4:57:08, 18.49s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1779


 58%|█████▊    | 1341/2304 [4:09:59<4:41:45, 17.56s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0976


 58%|█████▊    | 1342/2304 [4:10:23<5:12:41, 19.50s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0840


 58%|█████▊    | 1343/2304 [4:10:47<5:32:06, 20.73s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0899


 58%|█████▊    | 1344/2304 [4:11:11<5:48:25, 21.78s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0890


 58%|█████▊    | 1345/2304 [4:11:27<5:17:48, 19.88s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0847


 58%|█████▊    | 1346/2304 [4:11:42<4:54:34, 18.45s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0850


 58%|█████▊    | 1347/2304 [4:11:57<4:40:08, 17.56s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0781


 59%|█████▊    | 1348/2304 [4:12:21<5:11:10, 19.53s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0726


 59%|█████▊    | 1349/2304 [4:12:45<5:29:55, 20.73s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0677


 59%|█████▊    | 1350/2304 [4:13:09<5:45:58, 21.76s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0702


 59%|█████▊    | 1351/2304 [4:13:25<5:15:42, 19.88s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0831


 59%|█████▊    | 1352/2304 [4:13:40<4:52:44, 18.45s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1196


 59%|█████▊    | 1353/2304 [4:13:55<4:38:54, 17.60s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0900


 59%|█████▉    | 1354/2304 [4:14:19<5:09:36, 19.55s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0676


 59%|█████▉    | 1355/2304 [4:14:43<5:28:51, 20.79s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0770


 59%|█████▉    | 1356/2304 [4:15:07<5:45:05, 21.84s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0665


 59%|█████▉    | 1357/2304 [4:15:23<5:14:43, 19.94s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1145


 59%|█████▉    | 1358/2304 [4:15:38<4:51:16, 18.47s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0787


 59%|█████▉    | 1359/2304 [4:15:53<4:36:37, 17.56s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0897


 59%|█████▉    | 1360/2304 [4:16:17<5:07:10, 19.52s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0677


 59%|█████▉    | 1361/2304 [4:16:41<5:26:27, 20.77s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0892


 59%|█████▉    | 1362/2304 [4:17:05<5:42:23, 21.81s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0798


 59%|█████▉    | 1363/2304 [4:17:21<5:13:07, 19.97s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0936


 59%|█████▉    | 1364/2304 [4:17:36<4:49:54, 18.50s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0775


 59%|█████▉    | 1365/2304 [4:17:52<4:35:41, 17.62s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0820


 59%|█████▉    | 1366/2304 [4:18:16<5:06:02, 19.58s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0937


 59%|█████▉    | 1367/2304 [4:18:40<5:25:00, 20.81s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0707


 59%|█████▉    | 1368/2304 [4:19:04<5:41:25, 21.89s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0769


 59%|█████▉    | 1369/2304 [4:19:19<5:11:08, 19.97s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0801


 59%|█████▉    | 1370/2304 [4:19:35<4:48:04, 18.51s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0792


 60%|█████▉    | 1371/2304 [4:19:50<4:33:56, 17.62s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0950


 60%|█████▉    | 1372/2304 [4:20:14<5:04:07, 19.58s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0803


 60%|█████▉    | 1373/2304 [4:20:38<5:23:26, 20.84s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0668


 60%|█████▉    | 1374/2304 [4:21:02<5:39:11, 21.88s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0845


 60%|█████▉    | 1375/2304 [4:21:18<5:08:48, 19.94s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0773


 60%|█████▉    | 1376/2304 [4:21:33<4:46:12, 18.50s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1048


 60%|█████▉    | 1377/2304 [4:21:48<4:32:21, 17.63s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0993


 60%|█████▉    | 1378/2304 [4:22:13<5:02:32, 19.60s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0673


 60%|█████▉    | 1379/2304 [4:22:36<5:21:09, 20.83s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0707


 60%|█████▉    | 1380/2304 [4:23:01<5:36:28, 21.85s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1430


 60%|█████▉    | 1381/2304 [4:23:16<5:06:56, 19.95s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0948


 60%|█████▉    | 1382/2304 [4:23:31<4:44:48, 18.53s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0933


 60%|██████    | 1383/2304 [4:23:47<4:30:46, 17.64s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1334


 60%|██████    | 1384/2304 [4:24:11<5:01:10, 19.64s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0742


 60%|██████    | 1385/2304 [4:24:35<5:19:25, 20.86s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1874


 60%|██████    | 1386/2304 [4:24:59<5:34:19, 21.85s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0940


 60%|██████    | 1387/2304 [4:25:15<5:04:48, 19.94s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0840


 60%|██████    | 1388/2304 [4:25:30<4:42:41, 18.52s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0897


 60%|██████    | 1389/2304 [4:25:45<4:29:09, 17.65s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0786


 60%|██████    | 1390/2304 [4:26:10<4:59:04, 19.63s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0894


 60%|██████    | 1391/2304 [4:26:33<5:16:58, 20.83s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0960


 60%|██████    | 1392/2304 [4:26:58<5:32:11, 21.85s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0900


 60%|██████    | 1393/2304 [4:27:13<5:03:10, 19.97s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0713


 61%|██████    | 1394/2304 [4:27:28<4:40:44, 18.51s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0793


 61%|██████    | 1395/2304 [4:27:44<4:26:25, 17.59s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0760


 61%|██████    | 1396/2304 [4:28:08<4:55:34, 19.53s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0682


 61%|██████    | 1397/2304 [4:28:31<5:13:25, 20.73s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0690


 61%|██████    | 1398/2304 [4:28:55<5:28:16, 21.74s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0677


 61%|██████    | 1399/2304 [4:29:11<4:59:30, 19.86s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0757


 61%|██████    | 1400/2304 [4:29:26<4:37:29, 18.42s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0758


 61%|██████    | 1401/2304 [4:29:41<4:23:28, 17.51s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0882


 61%|██████    | 1402/2304 [4:30:05<4:53:20, 19.51s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0674


 61%|██████    | 1403/2304 [4:30:29<5:11:11, 20.72s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0650


 61%|██████    | 1404/2304 [4:30:53<5:26:12, 21.75s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0676


 61%|██████    | 1405/2304 [4:31:08<4:57:12, 19.84s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0835


 61%|██████    | 1406/2304 [4:31:24<4:35:55, 18.44s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1166


 61%|██████    | 1407/2304 [4:31:39<4:22:36, 17.57s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0825


 61%|██████    | 1408/2304 [4:32:03<4:52:10, 19.56s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.2458


 61%|██████    | 1409/2304 [4:32:27<5:10:31, 20.82s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0682


 61%|██████    | 1410/2304 [4:32:51<5:25:09, 21.82s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0879


 61%|██████    | 1411/2304 [4:33:07<4:56:44, 19.94s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0790


 61%|██████▏   | 1412/2304 [4:33:22<4:35:00, 18.50s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0932


 61%|██████▏   | 1413/2304 [4:33:37<4:21:18, 17.60s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0821


 61%|██████▏   | 1414/2304 [4:34:02<4:50:26, 19.58s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0757


 61%|██████▏   | 1415/2304 [4:34:25<5:07:47, 20.77s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0981


 61%|██████▏   | 1416/2304 [4:34:49<5:22:42, 21.80s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0769


 62%|██████▏   | 1417/2304 [4:35:05<4:54:10, 19.90s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0779


 62%|██████▏   | 1418/2304 [4:35:20<4:33:05, 18.49s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0854


 62%|██████▏   | 1419/2304 [4:35:36<4:20:05, 17.63s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0864


 62%|██████▏   | 1420/2304 [4:36:00<4:48:27, 19.58s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.2054


 62%|██████▏   | 1421/2304 [4:36:24<5:06:19, 20.81s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0680


 62%|██████▏   | 1422/2304 [4:36:48<5:21:26, 21.87s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0677


 62%|██████▏   | 1423/2304 [4:37:03<4:53:09, 19.97s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0916


 62%|██████▏   | 1424/2304 [4:37:19<4:31:29, 18.51s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0708


 62%|██████▏   | 1425/2304 [4:37:34<4:18:35, 17.65s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0992


 62%|██████▏   | 1426/2304 [4:37:58<4:47:15, 19.63s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0694


 62%|██████▏   | 1427/2304 [4:38:22<5:04:31, 20.83s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0920


 62%|██████▏   | 1428/2304 [4:38:46<5:18:33, 21.82s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0682


 62%|██████▏   | 1429/2304 [4:39:02<4:50:31, 19.92s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0795


 62%|██████▏   | 1430/2304 [4:39:17<4:29:47, 18.52s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0972


 62%|██████▏   | 1431/2304 [4:39:33<4:16:40, 17.64s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0915


 62%|██████▏   | 1432/2304 [4:39:57<4:44:38, 19.59s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0846


 62%|██████▏   | 1433/2304 [4:40:20<5:02:04, 20.81s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0821


 62%|██████▏   | 1434/2304 [4:40:44<5:16:15, 21.81s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0809


 62%|██████▏   | 1435/2304 [4:41:00<4:48:44, 19.94s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0933


 62%|██████▏   | 1436/2304 [4:41:15<4:27:47, 18.51s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1001


 62%|██████▏   | 1437/2304 [4:41:31<4:15:33, 17.69s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0857


 62%|██████▏   | 1438/2304 [4:41:55<4:44:22, 19.70s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1092


 62%|██████▏   | 1439/2304 [4:42:19<5:00:49, 20.87s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0897


 62%|██████▎   | 1440/2304 [4:42:43<5:14:37, 21.85s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0809


 63%|██████▎   | 1441/2304 [4:42:51<4:15:51, 17.79s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0970


 63%|██████▎   | 1442/2304 [4:42:59<3:33:41, 14.87s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0809


 63%|██████▎   | 1443/2304 [4:43:08<3:05:03, 12.90s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0909


 63%|██████▎   | 1444/2304 [4:43:20<3:03:49, 12.83s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0766


 63%|██████▎   | 1445/2304 [4:43:33<3:02:01, 12.71s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1787


 63%|██████▎   | 1446/2304 [4:43:46<3:01:25, 12.69s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0866


 63%|██████▎   | 1447/2304 [4:43:54<2:42:52, 11.40s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0914


 63%|██████▎   | 1448/2304 [4:44:02<2:29:06, 10.45s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0712


 63%|██████▎   | 1449/2304 [4:44:10<2:19:28,  9.79s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0786


 63%|██████▎   | 1450/2304 [4:44:23<2:31:23, 10.64s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0837


 63%|██████▎   | 1451/2304 [4:44:35<2:38:36, 11.16s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0729


 63%|██████▎   | 1452/2304 [4:44:48<2:44:38, 11.59s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0833


 63%|██████▎   | 1453/2304 [4:44:56<2:31:00, 10.65s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0956


 63%|██████▎   | 1454/2304 [4:45:05<2:20:21,  9.91s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0662


 63%|██████▎   | 1455/2304 [4:45:13<2:13:31,  9.44s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1003


 63%|██████▎   | 1456/2304 [4:45:26<2:26:59, 10.40s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1875


 63%|██████▎   | 1457/2304 [4:45:38<2:35:29, 11.02s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0775


 63%|██████▎   | 1458/2304 [4:45:51<2:42:22, 11.52s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0752


 63%|██████▎   | 1459/2304 [4:45:59<2:28:27, 10.54s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.2116


 63%|██████▎   | 1460/2304 [4:46:07<2:18:02,  9.81s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0753


 63%|██████▎   | 1461/2304 [4:46:15<2:11:26,  9.36s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0769


 63%|██████▎   | 1462/2304 [4:46:28<2:25:17, 10.35s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0798


 63%|██████▎   | 1463/2304 [4:46:41<2:34:00, 10.99s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0676


 64%|██████▎   | 1464/2304 [4:46:53<2:41:13, 11.52s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0890


 64%|██████▎   | 1465/2304 [4:47:02<2:27:53, 10.58s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0966


 64%|██████▎   | 1466/2304 [4:47:10<2:17:10,  9.82s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0781


 64%|██████▎   | 1467/2304 [4:47:18<2:10:35,  9.36s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0891


 64%|██████▎   | 1468/2304 [4:47:31<2:24:25, 10.37s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0839


 64%|██████▍   | 1469/2304 [4:47:43<2:32:25, 10.95s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0732


 64%|██████▍   | 1470/2304 [4:47:56<2:39:24, 11.47s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0741


 64%|██████▍   | 1471/2304 [4:48:04<2:25:57, 10.51s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0796


 64%|██████▍   | 1472/2304 [4:48:12<2:15:34,  9.78s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0769


 64%|██████▍   | 1473/2304 [4:48:20<2:08:56,  9.31s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0927


 64%|██████▍   | 1474/2304 [4:48:33<2:22:48, 10.32s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1574


 64%|██████▍   | 1475/2304 [4:48:45<2:31:27, 10.96s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0704


 64%|██████▍   | 1476/2304 [4:48:58<2:38:46, 11.50s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0784


 64%|██████▍   | 1477/2304 [4:49:06<2:25:15, 10.54s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0735


 64%|██████▍   | 1478/2304 [4:49:15<2:15:01,  9.81s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0688


 64%|██████▍   | 1479/2304 [4:49:23<2:08:33,  9.35s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0709


 64%|██████▍   | 1480/2304 [4:49:36<2:22:15, 10.36s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0669


 64%|██████▍   | 1481/2304 [4:49:48<2:30:49, 11.00s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0709


 64%|██████▍   | 1482/2304 [4:50:01<2:37:33, 11.50s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1107


 64%|██████▍   | 1483/2304 [4:50:09<2:24:04, 10.53s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0787


 64%|██████▍   | 1484/2304 [4:50:17<2:14:05,  9.81s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0740


 64%|██████▍   | 1485/2304 [4:50:25<2:07:32,  9.34s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0890


 64%|██████▍   | 1486/2304 [4:50:38<2:20:44, 10.32s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0908


 65%|██████▍   | 1487/2304 [4:50:50<2:29:03, 10.95s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0765


 65%|██████▍   | 1488/2304 [4:51:03<2:35:46, 11.45s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0821


 65%|██████▍   | 1489/2304 [4:51:11<2:22:54, 10.52s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0874


 65%|██████▍   | 1490/2304 [4:51:19<2:12:30,  9.77s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0798


 65%|██████▍   | 1491/2304 [4:51:28<2:06:06,  9.31s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0937


 65%|██████▍   | 1492/2304 [4:51:40<2:19:13, 10.29s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0765


 65%|██████▍   | 1493/2304 [4:51:53<2:27:33, 10.92s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0713


 65%|██████▍   | 1494/2304 [4:52:05<2:34:58, 11.48s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1193


 65%|██████▍   | 1495/2304 [4:52:14<2:22:08, 10.54s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0881


 65%|██████▍   | 1496/2304 [4:52:22<2:12:01,  9.80s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0802


 65%|██████▍   | 1497/2304 [4:52:30<2:05:34,  9.34s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1120


 65%|██████▌   | 1498/2304 [4:52:43<2:18:39, 10.32s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0777


 65%|██████▌   | 1499/2304 [4:52:55<2:26:28, 10.92s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0745


 65%|██████▌   | 1500/2304 [4:53:08<2:33:14, 11.44s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0666


 65%|██████▌   | 1501/2304 [4:53:16<2:20:29, 10.50s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1115


 65%|██████▌   | 1502/2304 [4:53:24<2:10:44,  9.78s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0758


 65%|██████▌   | 1503/2304 [4:53:32<2:04:23,  9.32s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0869


 65%|██████▌   | 1504/2304 [4:53:45<2:17:39, 10.32s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0829


 65%|██████▌   | 1505/2304 [4:53:57<2:25:25, 10.92s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1457


 65%|██████▌   | 1506/2304 [4:54:10<2:32:03, 11.43s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0751


 65%|██████▌   | 1507/2304 [4:54:18<2:19:06, 10.47s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0726


 65%|██████▌   | 1508/2304 [4:54:26<2:09:29,  9.76s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0750


 65%|██████▌   | 1509/2304 [4:54:35<2:03:38,  9.33s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0746


 66%|██████▌   | 1510/2304 [4:54:47<2:16:31, 10.32s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0988


 66%|██████▌   | 1511/2304 [4:54:59<2:23:49, 10.88s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0697


 66%|██████▌   | 1512/2304 [4:55:12<2:30:47, 11.42s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0819


 66%|██████▌   | 1513/2304 [4:55:20<2:18:04, 10.47s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0756


 66%|██████▌   | 1514/2304 [4:55:28<2:08:38,  9.77s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0800


 66%|██████▌   | 1515/2304 [4:55:37<2:02:41,  9.33s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0909


 66%|██████▌   | 1516/2304 [4:55:49<2:15:48, 10.34s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0857


 66%|██████▌   | 1517/2304 [4:56:02<2:23:43, 10.96s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0727


 66%|██████▌   | 1518/2304 [4:56:14<2:30:05, 11.46s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0659


 66%|██████▌   | 1519/2304 [4:56:23<2:17:24, 10.50s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0796


 66%|██████▌   | 1520/2304 [4:56:31<2:07:43,  9.78s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0729


 66%|██████▌   | 1521/2304 [4:56:39<2:01:43,  9.33s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0801


 66%|██████▌   | 1522/2304 [4:56:52<2:14:17, 10.30s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0817


 66%|██████▌   | 1523/2304 [4:57:04<2:22:11, 10.92s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0768


 66%|██████▌   | 1524/2304 [4:57:17<2:28:46, 11.44s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0810


 66%|██████▌   | 1525/2304 [4:57:25<2:16:15, 10.49s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1092


 66%|██████▌   | 1526/2304 [4:57:33<2:06:30,  9.76s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0825


 66%|██████▋   | 1527/2304 [4:57:41<2:00:39,  9.32s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1463


 66%|██████▋   | 1528/2304 [4:57:54<2:13:14, 10.30s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1041


 66%|██████▋   | 1529/2304 [4:58:06<2:20:52, 10.91s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0747


 66%|██████▋   | 1530/2304 [4:58:19<2:27:38, 11.45s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0856


 66%|██████▋   | 1531/2304 [4:58:27<2:15:11, 10.49s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0794


 66%|██████▋   | 1532/2304 [4:58:35<2:05:32,  9.76s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0767


 67%|██████▋   | 1533/2304 [4:58:43<1:59:30,  9.30s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1129


 67%|██████▋   | 1534/2304 [4:58:56<2:12:23, 10.32s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0998


 67%|██████▋   | 1535/2304 [4:59:08<2:19:45, 10.90s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0725


 67%|██████▋   | 1536/2304 [4:59:21<2:26:19, 11.43s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0957


 67%|██████▋   | 1537/2304 [4:59:29<2:14:22, 10.51s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0920


 67%|██████▋   | 1538/2304 [4:59:38<2:05:20,  9.82s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0699


 67%|██████▋   | 1539/2304 [4:59:46<1:59:26,  9.37s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0731


 67%|██████▋   | 1540/2304 [4:59:59<2:11:47, 10.35s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0648


 67%|██████▋   | 1541/2304 [5:00:11<2:19:30, 10.97s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0680


 67%|██████▋   | 1542/2304 [5:00:24<2:26:10, 11.51s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0894


 67%|██████▋   | 1543/2304 [5:00:32<2:13:34, 10.53s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0779


 67%|██████▋   | 1544/2304 [5:00:40<2:04:20,  9.82s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0880


 67%|██████▋   | 1545/2304 [5:00:49<1:58:27,  9.36s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0778


 67%|██████▋   | 1546/2304 [5:01:01<2:10:49, 10.36s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0671


 67%|██████▋   | 1547/2304 [5:01:14<2:18:09, 10.95s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0741


 67%|██████▋   | 1548/2304 [5:01:26<2:24:49, 11.49s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0653


 67%|██████▋   | 1549/2304 [5:01:35<2:12:35, 10.54s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1369


 67%|██████▋   | 1550/2304 [5:01:43<2:03:37,  9.84s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0811


 67%|██████▋   | 1551/2304 [5:01:51<1:57:46,  9.38s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1341


 67%|██████▋   | 1552/2304 [5:02:04<2:10:23, 10.40s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0806


 67%|██████▋   | 1553/2304 [5:02:16<2:18:04, 11.03s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0720


 67%|██████▋   | 1554/2304 [5:02:29<2:24:09, 11.53s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1316


 67%|██████▋   | 1555/2304 [5:02:37<2:11:48, 10.56s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0801


 68%|██████▊   | 1556/2304 [5:02:46<2:02:30,  9.83s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0767


 68%|██████▊   | 1557/2304 [5:02:54<1:56:48,  9.38s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1172


 68%|██████▊   | 1558/2304 [5:03:07<2:09:10, 10.39s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0701


 68%|██████▊   | 1559/2304 [5:03:19<2:16:39, 11.01s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0731


 68%|██████▊   | 1560/2304 [5:03:32<2:22:46, 11.51s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0732


 68%|██████▊   | 1561/2304 [5:03:40<2:10:54, 10.57s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0819


 68%|██████▊   | 1562/2304 [5:03:48<2:01:23,  9.82s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0834


 68%|██████▊   | 1563/2304 [5:03:57<1:55:41,  9.37s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0935


 68%|██████▊   | 1564/2304 [5:04:09<2:07:48, 10.36s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0672


 68%|██████▊   | 1565/2304 [5:04:22<2:15:02, 10.96s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0738


 68%|██████▊   | 1566/2304 [5:04:34<2:21:15, 11.48s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0930


 68%|██████▊   | 1567/2304 [5:04:43<2:09:20, 10.53s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1132


 68%|██████▊   | 1568/2304 [5:04:51<2:00:20,  9.81s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0940


 68%|██████▊   | 1569/2304 [5:04:59<1:54:25,  9.34s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0726


 68%|██████▊   | 1570/2304 [5:05:12<2:06:32, 10.34s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0673


 68%|██████▊   | 1571/2304 [5:05:24<2:13:43, 10.95s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0804


 68%|██████▊   | 1572/2304 [5:05:37<2:20:08, 11.49s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0674


 68%|██████▊   | 1573/2304 [5:05:45<2:08:05, 10.51s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0878


 68%|██████▊   | 1574/2304 [5:05:53<1:59:14,  9.80s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0733


 68%|██████▊   | 1575/2304 [5:06:01<1:53:44,  9.36s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0975


 68%|██████▊   | 1576/2304 [5:06:14<2:05:37, 10.35s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0734


 68%|██████▊   | 1577/2304 [5:06:27<2:12:54, 10.97s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0756


 68%|██████▊   | 1578/2304 [5:06:39<2:19:10, 11.50s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0863


 69%|██████▊   | 1579/2304 [5:06:48<2:07:11, 10.53s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0907


 69%|██████▊   | 1580/2304 [5:06:56<1:58:01,  9.78s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1285


 69%|██████▊   | 1581/2304 [5:07:04<1:52:17,  9.32s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0707


 69%|██████▊   | 1582/2304 [5:07:16<2:03:53, 10.30s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0923


 69%|██████▊   | 1583/2304 [5:07:29<2:11:42, 10.96s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0748


 69%|██████▉   | 1584/2304 [5:07:42<2:17:59, 11.50s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0960


 69%|██████▉   | 1585/2304 [5:07:50<2:06:25, 10.55s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0790


 69%|██████▉   | 1586/2304 [5:07:58<1:57:08,  9.79s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0790


 69%|██████▉   | 1587/2304 [5:08:06<1:51:32,  9.33s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0732


 69%|██████▉   | 1588/2304 [5:08:19<2:03:01, 10.31s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0797


 69%|██████▉   | 1589/2304 [5:08:31<2:10:22, 10.94s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0697


 69%|██████▉   | 1590/2304 [5:08:44<2:16:30, 11.47s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0674


 69%|██████▉   | 1591/2304 [5:08:52<2:05:13, 10.54s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1217


 69%|██████▉   | 1592/2304 [5:09:00<1:56:12,  9.79s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0772


 69%|██████▉   | 1593/2304 [5:09:09<1:51:00,  9.37s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0927


 69%|██████▉   | 1594/2304 [5:09:21<2:02:33, 10.36s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0929


 69%|██████▉   | 1595/2304 [5:09:34<2:09:39, 10.97s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0717


 69%|██████▉   | 1596/2304 [5:09:47<2:16:22, 11.56s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0670


 69%|██████▉   | 1597/2304 [5:09:55<2:04:46, 10.59s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0750


 69%|██████▉   | 1598/2304 [5:10:03<1:56:06,  9.87s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0775


 69%|██████▉   | 1599/2304 [5:10:11<1:50:08,  9.37s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.2181


 69%|██████▉   | 1600/2304 [5:10:24<2:01:25, 10.35s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0921


 69%|██████▉   | 1601/2304 [5:10:37<2:08:31, 10.97s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0755


 70%|██████▉   | 1602/2304 [5:10:49<2:13:57, 11.45s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0755


 70%|██████▉   | 1603/2304 [5:10:57<2:02:29, 10.48s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1359


 70%|██████▉   | 1604/2304 [5:11:05<1:53:49,  9.76s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0831


 70%|██████▉   | 1605/2304 [5:11:14<1:48:08,  9.28s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0817


 70%|██████▉   | 1606/2304 [5:11:26<1:59:37, 10.28s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0749


 70%|██████▉   | 1607/2304 [5:11:39<2:06:50, 10.92s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0725


 70%|██████▉   | 1608/2304 [5:11:51<2:12:47, 11.45s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0959


 70%|██████▉   | 1609/2304 [5:12:00<2:01:31, 10.49s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1064


 70%|██████▉   | 1610/2304 [5:12:08<1:53:05,  9.78s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0702


 70%|██████▉   | 1611/2304 [5:12:16<1:47:38,  9.32s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1071


 70%|██████▉   | 1612/2304 [5:12:29<1:59:16, 10.34s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0674


 70%|███████   | 1613/2304 [5:12:41<2:05:59, 10.94s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0799


 70%|███████   | 1614/2304 [5:12:54<2:11:44, 11.46s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0761


 70%|███████   | 1615/2304 [5:13:02<2:00:39, 10.51s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0820


 70%|███████   | 1616/2304 [5:13:10<1:52:08,  9.78s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0760


 70%|███████   | 1617/2304 [5:13:18<1:46:39,  9.32s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0698


 70%|███████   | 1618/2304 [5:13:31<1:58:05, 10.33s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0834


 70%|███████   | 1619/2304 [5:13:43<2:04:57, 10.94s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0742


 70%|███████   | 1620/2304 [5:13:56<2:10:34, 11.45s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0671


 70%|███████   | 1621/2304 [5:14:04<1:59:39, 10.51s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0732


 70%|███████   | 1622/2304 [5:14:12<1:51:14,  9.79s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0767


 70%|███████   | 1623/2304 [5:14:21<1:45:44,  9.32s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0859


 70%|███████   | 1624/2304 [5:14:33<1:56:40, 10.30s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1640


 71%|███████   | 1625/2304 [5:14:45<2:03:28, 10.91s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0735


 71%|███████   | 1626/2304 [5:14:58<2:09:20, 11.45s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0785


 71%|███████   | 1627/2304 [5:15:07<1:58:40, 10.52s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0811


 71%|███████   | 1628/2304 [5:15:15<1:50:19,  9.79s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0837


 71%|███████   | 1629/2304 [5:15:23<1:44:57,  9.33s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0776


 71%|███████   | 1630/2304 [5:15:36<1:56:20, 10.36s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0844


 71%|███████   | 1631/2304 [5:15:48<2:03:02, 10.97s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0647


 71%|███████   | 1632/2304 [5:16:01<2:08:12, 11.45s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0794


 71%|███████   | 1633/2304 [5:16:09<1:56:57, 10.46s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0999


 71%|███████   | 1634/2304 [5:16:17<1:48:36,  9.73s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0877


 71%|███████   | 1635/2304 [5:16:25<1:43:09,  9.25s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0725


 71%|███████   | 1636/2304 [5:16:37<1:53:38, 10.21s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0674


 71%|███████   | 1637/2304 [5:16:50<2:00:01, 10.80s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1604


 71%|███████   | 1638/2304 [5:17:02<2:05:41, 11.32s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0676


 71%|███████   | 1639/2304 [5:17:10<1:55:15, 10.40s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0950


 71%|███████   | 1640/2304 [5:17:19<1:47:44,  9.74s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0699


 71%|███████   | 1641/2304 [5:17:27<1:42:25,  9.27s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0714


 71%|███████▏  | 1642/2304 [5:17:39<1:52:43, 10.22s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0719


 71%|███████▏  | 1643/2304 [5:17:51<1:59:03, 10.81s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1874


 71%|███████▏  | 1644/2304 [5:18:04<2:04:34, 11.32s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0677


 71%|███████▏  | 1645/2304 [5:18:12<1:53:48, 10.36s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0845


 71%|███████▏  | 1646/2304 [5:18:20<1:45:46,  9.64s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0859


 71%|███████▏  | 1647/2304 [5:18:28<1:40:43,  9.20s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0743


 72%|███████▏  | 1648/2304 [5:18:41<1:51:29, 10.20s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0758


 72%|███████▏  | 1649/2304 [5:18:53<1:58:01, 10.81s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0783


 72%|███████▏  | 1650/2304 [5:19:05<2:03:48, 11.36s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0702


 72%|███████▏  | 1651/2304 [5:19:14<1:53:20, 10.41s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0824


 72%|███████▏  | 1652/2304 [5:19:22<1:45:47,  9.74s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0917


 72%|███████▏  | 1653/2304 [5:19:30<1:40:38,  9.28s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0706


 72%|███████▏  | 1654/2304 [5:19:43<1:51:03, 10.25s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1163


 72%|███████▏  | 1655/2304 [5:19:55<1:57:14, 10.84s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0789


 72%|███████▏  | 1656/2304 [5:20:07<2:02:23, 11.33s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0775


 72%|███████▏  | 1657/2304 [5:20:15<1:51:53, 10.38s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0800


 72%|███████▏  | 1658/2304 [5:20:23<1:44:00,  9.66s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0828


 72%|███████▏  | 1659/2304 [5:20:32<1:39:13,  9.23s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0795


 72%|███████▏  | 1660/2304 [5:20:44<1:49:21, 10.19s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0755


 72%|███████▏  | 1661/2304 [5:20:57<1:56:29, 10.87s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0681


 72%|███████▏  | 1662/2304 [5:21:09<2:01:36, 11.37s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1150


 72%|███████▏  | 1663/2304 [5:21:17<1:51:03, 10.40s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0909


 72%|███████▏  | 1664/2304 [5:21:25<1:43:13,  9.68s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0708


 72%|███████▏  | 1665/2304 [5:21:33<1:38:20,  9.23s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0794


 72%|███████▏  | 1666/2304 [5:21:46<1:48:32, 10.21s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0676


 72%|███████▏  | 1667/2304 [5:21:58<1:54:38, 10.80s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0662


 72%|███████▏  | 1668/2304 [5:22:11<1:59:51, 11.31s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0675


 72%|███████▏  | 1669/2304 [5:22:19<1:49:50, 10.38s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0712


 72%|███████▏  | 1670/2304 [5:22:27<1:42:06,  9.66s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0831


 73%|███████▎  | 1671/2304 [5:22:35<1:37:19,  9.23s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0689


 73%|███████▎  | 1672/2304 [5:22:48<1:48:03, 10.26s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1442


 73%|███████▎  | 1673/2304 [5:23:00<1:54:29, 10.89s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0843


 73%|███████▎  | 1674/2304 [5:23:12<1:59:23, 11.37s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0727


 73%|███████▎  | 1675/2304 [5:23:21<1:49:15, 10.42s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1019


 73%|███████▎  | 1676/2304 [5:23:29<1:41:43,  9.72s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1125


 73%|███████▎  | 1677/2304 [5:23:37<1:36:49,  9.27s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0914


 73%|███████▎  | 1678/2304 [5:23:49<1:46:48, 10.24s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0810


 73%|███████▎  | 1679/2304 [5:24:02<1:53:05, 10.86s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0737


 73%|███████▎  | 1680/2304 [5:24:14<1:58:13, 11.37s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0673


 73%|███████▎  | 1681/2304 [5:24:23<1:48:07, 10.41s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0929


 73%|███████▎  | 1682/2304 [5:24:31<1:40:35,  9.70s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0715


 73%|███████▎  | 1683/2304 [5:24:39<1:36:05,  9.28s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0690


 73%|███████▎  | 1684/2304 [5:24:51<1:46:07, 10.27s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0983


 73%|███████▎  | 1685/2304 [5:25:04<1:52:03, 10.86s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0674


 73%|███████▎  | 1686/2304 [5:25:16<1:57:05, 11.37s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0671


 73%|███████▎  | 1687/2304 [5:25:24<1:46:58, 10.40s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0736


 73%|███████▎  | 1688/2304 [5:25:32<1:39:41,  9.71s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0798


 73%|███████▎  | 1689/2304 [5:25:41<1:36:07,  9.38s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0747


 73%|███████▎  | 1690/2304 [5:25:54<1:46:47, 10.44s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0687


 73%|███████▎  | 1691/2304 [5:26:06<1:52:24, 11.00s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0900


 73%|███████▎  | 1692/2304 [5:26:19<1:56:53, 11.46s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0673


 73%|███████▎  | 1693/2304 [5:26:27<1:46:47, 10.49s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0998


 74%|███████▎  | 1694/2304 [5:26:35<1:39:10,  9.76s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0912


 74%|███████▎  | 1695/2304 [5:26:43<1:34:18,  9.29s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0875


 74%|███████▎  | 1696/2304 [5:26:56<1:44:01, 10.27s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0728


 74%|███████▎  | 1697/2304 [5:27:08<1:49:49, 10.86s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0763


 74%|███████▎  | 1698/2304 [5:27:21<1:54:55, 11.38s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0673


 74%|███████▎  | 1699/2304 [5:27:29<1:45:09, 10.43s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0755


 74%|███████▍  | 1700/2304 [5:27:37<1:37:46,  9.71s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0719


 74%|███████▍  | 1701/2304 [5:27:45<1:33:00,  9.25s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0838


 74%|███████▍  | 1702/2304 [5:27:58<1:42:33, 10.22s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0814


 74%|███████▍  | 1703/2304 [5:28:10<1:48:14, 10.81s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0946


 74%|███████▍  | 1704/2304 [5:28:22<1:53:22, 11.34s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1299


 74%|███████▍  | 1705/2304 [5:28:31<1:43:53, 10.41s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0771


 74%|███████▍  | 1706/2304 [5:28:39<1:36:38,  9.70s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0880


 74%|███████▍  | 1707/2304 [5:28:47<1:32:16,  9.27s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0827


 74%|███████▍  | 1708/2304 [5:28:59<1:41:53, 10.26s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0672


 74%|███████▍  | 1709/2304 [5:29:12<1:48:44, 10.97s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0681


 74%|███████▍  | 1710/2304 [5:29:25<1:53:30, 11.47s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0884


 74%|███████▍  | 1711/2304 [5:29:33<1:43:51, 10.51s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0868


 74%|███████▍  | 1712/2304 [5:29:41<1:36:28,  9.78s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0727


 74%|███████▍  | 1713/2304 [5:29:49<1:31:35,  9.30s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0679


 74%|███████▍  | 1714/2304 [5:30:02<1:41:06, 10.28s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0682


 74%|███████▍  | 1715/2304 [5:30:14<1:47:10, 10.92s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0677


 74%|███████▍  | 1716/2304 [5:30:27<1:51:59, 11.43s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0872


 75%|███████▍  | 1717/2304 [5:30:35<1:42:30, 10.48s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0783


 75%|███████▍  | 1718/2304 [5:30:43<1:35:17,  9.76s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0739


 75%|███████▍  | 1719/2304 [5:30:51<1:30:36,  9.29s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0720


 75%|███████▍  | 1720/2304 [5:31:04<1:39:52, 10.26s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0745


 75%|███████▍  | 1721/2304 [5:31:16<1:45:41, 10.88s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0711


 75%|███████▍  | 1722/2304 [5:31:29<1:50:29, 11.39s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0673


 75%|███████▍  | 1723/2304 [5:31:37<1:41:01, 10.43s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0726


 75%|███████▍  | 1724/2304 [5:31:45<1:33:46,  9.70s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0696


 75%|███████▍  | 1725/2304 [5:31:53<1:29:19,  9.26s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0811


 75%|███████▍  | 1726/2304 [5:32:06<1:38:35, 10.24s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.3087


 75%|███████▍  | 1727/2304 [5:32:18<1:44:03, 10.82s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0709


 75%|███████▌  | 1728/2304 [5:32:30<1:48:42, 11.32s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0713


 75%|███████▌  | 1729/2304 [5:32:35<1:29:40,  9.36s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0769


 75%|███████▌  | 1730/2304 [5:32:40<1:15:40,  7.91s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0725


 75%|███████▌  | 1731/2304 [5:32:45<1:06:47,  6.99s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0860


 75%|███████▌  | 1732/2304 [5:32:51<1:06:05,  6.93s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0826


 75%|███████▌  | 1733/2304 [5:32:58<1:05:20,  6.87s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0726


 75%|███████▌  | 1734/2304 [5:33:05<1:05:27,  6.89s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0733


 75%|███████▌  | 1735/2304 [5:33:10<58:53,  6.21s/it]  

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0691


 75%|███████▌  | 1736/2304 [5:33:14<53:57,  5.70s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0819


 75%|███████▌  | 1737/2304 [5:33:19<51:21,  5.43s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0831


 75%|███████▌  | 1738/2304 [5:33:26<55:05,  5.84s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0890


 75%|███████▌  | 1739/2304 [5:33:33<57:45,  6.13s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0702


 76%|███████▌  | 1740/2304 [5:33:39<59:42,  6.35s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0863


 76%|███████▌  | 1741/2304 [5:33:44<54:54,  5.85s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0779


 76%|███████▌  | 1742/2304 [5:33:49<51:37,  5.51s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0725


 76%|███████▌  | 1743/2304 [5:33:53<49:09,  5.26s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0793


 76%|███████▌  | 1744/2304 [5:34:00<53:24,  5.72s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0799


 76%|███████▌  | 1745/2304 [5:34:07<56:32,  6.07s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0721


 76%|███████▌  | 1746/2304 [5:34:14<58:28,  6.29s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0764


 76%|███████▌  | 1747/2304 [5:34:19<54:15,  5.84s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0716


 76%|███████▌  | 1748/2304 [5:34:23<50:32,  5.45s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0707


 76%|███████▌  | 1749/2304 [5:34:28<48:12,  5.21s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0934


 76%|███████▌  | 1750/2304 [5:34:35<53:05,  5.75s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0748


 76%|███████▌  | 1751/2304 [5:34:42<55:54,  6.07s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0730


 76%|███████▌  | 1752/2304 [5:34:49<57:51,  6.29s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0745


 76%|███████▌  | 1753/2304 [5:34:53<53:34,  5.83s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1033


 76%|███████▌  | 1754/2304 [5:34:58<49:52,  5.44s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0705


 76%|███████▌  | 1755/2304 [5:35:03<47:33,  5.20s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0884


 76%|███████▌  | 1756/2304 [5:35:09<52:19,  5.73s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0674


 76%|███████▋  | 1757/2304 [5:35:16<54:51,  6.02s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0755


 76%|███████▋  | 1758/2304 [5:35:23<57:23,  6.31s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0774


 76%|███████▋  | 1759/2304 [5:35:28<52:46,  5.81s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0851


 76%|███████▋  | 1760/2304 [5:35:32<49:18,  5.44s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1000


 76%|███████▋  | 1761/2304 [5:35:37<47:32,  5.25s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0792


 76%|███████▋  | 1762/2304 [5:35:44<51:50,  5.74s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0720


 77%|███████▋  | 1763/2304 [5:35:51<54:15,  6.02s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0730


 77%|███████▋  | 1764/2304 [5:35:58<56:46,  6.31s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0801


 77%|███████▋  | 1765/2304 [5:36:02<52:07,  5.80s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0692


 77%|███████▋  | 1766/2304 [5:36:07<49:07,  5.48s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0730


 77%|███████▋  | 1767/2304 [5:36:12<46:42,  5.22s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1167


 77%|███████▋  | 1768/2304 [5:36:19<50:57,  5.70s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0810


 77%|███████▋  | 1769/2304 [5:36:25<53:48,  6.03s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0801


 77%|███████▋  | 1770/2304 [5:36:32<55:53,  6.28s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0737


 77%|███████▋  | 1771/2304 [5:36:37<51:23,  5.78s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0820


 77%|███████▋  | 1772/2304 [5:36:42<48:35,  5.48s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0719


 77%|███████▋  | 1773/2304 [5:36:46<46:20,  5.24s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0728


 77%|███████▋  | 1774/2304 [5:36:53<50:57,  5.77s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0723


 77%|███████▋  | 1775/2304 [5:37:00<53:32,  6.07s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0712


 77%|███████▋  | 1776/2304 [5:37:07<56:02,  6.37s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0760


 77%|███████▋  | 1777/2304 [5:37:12<51:54,  5.91s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0848


 77%|███████▋  | 1778/2304 [5:37:17<48:25,  5.52s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0779


 77%|███████▋  | 1779/2304 [5:37:21<46:11,  5.28s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0880


 77%|███████▋  | 1780/2304 [5:37:28<50:42,  5.81s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0794


 77%|███████▋  | 1781/2304 [5:37:35<53:07,  6.09s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0968


 77%|███████▋  | 1782/2304 [5:37:42<55:34,  6.39s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0821


 77%|███████▋  | 1783/2304 [5:37:47<50:55,  5.86s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0819


 77%|███████▋  | 1784/2304 [5:37:51<47:22,  5.47s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0728


 77%|███████▋  | 1785/2304 [5:37:56<45:33,  5.27s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1043


 78%|███████▊  | 1786/2304 [5:38:03<49:30,  5.73s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0829


 78%|███████▊  | 1787/2304 [5:38:10<52:02,  6.04s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0921


 78%|███████▊  | 1788/2304 [5:38:17<54:25,  6.33s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0709


 78%|███████▊  | 1789/2304 [5:38:21<50:00,  5.83s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0728


 78%|███████▊  | 1790/2304 [5:38:26<46:37,  5.44s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0891


 78%|███████▊  | 1791/2304 [5:38:31<44:55,  5.25s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0700


 78%|███████▊  | 1792/2304 [5:38:38<48:47,  5.72s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0783


 78%|███████▊  | 1793/2304 [5:38:44<51:42,  6.07s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0749


 78%|███████▊  | 1794/2304 [5:38:51<53:30,  6.29s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0850


 78%|███████▊  | 1795/2304 [5:38:56<49:16,  5.81s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0688


 78%|███████▊  | 1796/2304 [5:39:01<46:24,  5.48s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0714


 78%|███████▊  | 1797/2304 [5:39:05<44:14,  5.24s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0758


 78%|███████▊  | 1798/2304 [5:39:12<48:17,  5.73s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0777


 78%|███████▊  | 1799/2304 [5:39:19<51:15,  6.09s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0739


 78%|███████▊  | 1800/2304 [5:39:26<53:12,  6.34s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0745


 78%|███████▊  | 1801/2304 [5:39:31<49:24,  5.89s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1089


 78%|███████▊  | 1802/2304 [5:39:35<46:05,  5.51s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1146


 78%|███████▊  | 1803/2304 [5:39:40<44:02,  5.27s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1159


 78%|███████▊  | 1804/2304 [5:39:47<48:24,  5.81s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1258


 78%|███████▊  | 1805/2304 [5:39:54<50:37,  6.09s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0668


 78%|███████▊  | 1806/2304 [5:40:01<52:26,  6.32s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0744


 78%|███████▊  | 1807/2304 [5:40:06<48:38,  5.87s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0795


 78%|███████▊  | 1808/2304 [5:40:10<45:18,  5.48s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1116


 79%|███████▊  | 1809/2304 [5:40:15<43:29,  5.27s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0818


 79%|███████▊  | 1810/2304 [5:40:22<47:19,  5.75s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0677


 79%|███████▊  | 1811/2304 [5:40:29<49:44,  6.05s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0694


 79%|███████▊  | 1812/2304 [5:40:36<52:07,  6.36s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0812


 79%|███████▊  | 1813/2304 [5:40:40<47:49,  5.84s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0769


 79%|███████▊  | 1814/2304 [5:40:45<44:40,  5.47s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0742


 79%|███████▉  | 1815/2304 [5:40:50<42:54,  5.27s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0756


 79%|███████▉  | 1816/2304 [5:40:57<46:47,  5.75s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0803


 79%|███████▉  | 1817/2304 [5:41:03<49:19,  6.08s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0846


 79%|███████▉  | 1818/2304 [5:41:10<51:04,  6.31s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0719


 79%|███████▉  | 1819/2304 [5:41:15<46:55,  5.81s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0943


 79%|███████▉  | 1820/2304 [5:41:20<44:10,  5.48s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0826


 79%|███████▉  | 1821/2304 [5:41:24<42:03,  5.22s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0742


 79%|███████▉  | 1822/2304 [5:41:31<45:52,  5.71s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1165


 79%|███████▉  | 1823/2304 [5:41:38<48:33,  6.06s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0739


 79%|███████▉  | 1824/2304 [5:41:45<50:21,  6.30s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0967


 79%|███████▉  | 1825/2304 [5:41:49<46:12,  5.79s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0760


 79%|███████▉  | 1826/2304 [5:41:54<43:35,  5.47s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0803


 79%|███████▉  | 1827/2304 [5:41:59<41:25,  5.21s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0930


 79%|███████▉  | 1828/2304 [5:42:06<45:38,  5.75s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1092


 79%|███████▉  | 1829/2304 [5:42:13<47:51,  6.05s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0746


 79%|███████▉  | 1830/2304 [5:42:19<49:43,  6.30s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0929


 79%|███████▉  | 1831/2304 [5:42:24<46:05,  5.85s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1039


 80%|███████▉  | 1832/2304 [5:42:29<42:58,  5.46s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0816


 80%|███████▉  | 1833/2304 [5:42:33<41:03,  5.23s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0833


 80%|███████▉  | 1834/2304 [5:42:41<45:13,  5.77s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0861


 80%|███████▉  | 1835/2304 [5:42:47<47:25,  6.07s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0892


 80%|███████▉  | 1836/2304 [5:42:54<49:41,  6.37s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0809


 80%|███████▉  | 1837/2304 [5:42:59<45:34,  5.86s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1803


 80%|███████▉  | 1838/2304 [5:43:04<42:29,  5.47s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0929


 80%|███████▉  | 1839/2304 [5:43:08<40:51,  5.27s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0846


 80%|███████▉  | 1840/2304 [5:43:15<44:19,  5.73s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0953


 80%|███████▉  | 1841/2304 [5:43:22<46:34,  6.03s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0723


 80%|███████▉  | 1842/2304 [5:43:29<48:36,  6.31s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0997


 80%|███████▉  | 1843/2304 [5:43:34<44:40,  5.81s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1404


 80%|████████  | 1844/2304 [5:43:38<41:57,  5.47s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0731


 80%|████████  | 1845/2304 [5:43:43<39:57,  5.22s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0848


 80%|████████  | 1846/2304 [5:43:50<43:31,  5.70s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0756


 80%|████████  | 1847/2304 [5:43:57<46:03,  6.05s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0657


 80%|████████  | 1848/2304 [5:44:03<47:40,  6.27s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0735


 80%|████████  | 1849/2304 [5:44:08<43:52,  5.79s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0763


 80%|████████  | 1850/2304 [5:44:13<41:19,  5.46s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0826


 80%|████████  | 1851/2304 [5:44:17<39:28,  5.23s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0772


 80%|████████  | 1852/2304 [5:44:24<43:23,  5.76s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1087


 80%|████████  | 1853/2304 [5:44:31<45:30,  6.06s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0674


 80%|████████  | 1854/2304 [5:44:38<47:16,  6.30s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0625


 81%|████████  | 1855/2304 [5:44:43<43:52,  5.86s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0826


 81%|████████  | 1856/2304 [5:44:47<40:53,  5.48s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0853


 81%|████████  | 1857/2304 [5:44:52<38:57,  5.23s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1159


 81%|████████  | 1858/2304 [5:44:59<42:54,  5.77s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1195


 81%|████████  | 1859/2304 [5:45:06<44:54,  6.05s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0669


 81%|████████  | 1860/2304 [5:45:13<46:31,  6.29s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0770


 81%|████████  | 1861/2304 [5:45:17<43:03,  5.83s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0995


 81%|████████  | 1862/2304 [5:45:22<40:14,  5.46s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0876


 81%|████████  | 1863/2304 [5:45:27<38:38,  5.26s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0839


 81%|████████  | 1864/2304 [5:45:34<42:08,  5.75s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0816


 81%|████████  | 1865/2304 [5:45:40<44:06,  6.03s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0712


 81%|████████  | 1866/2304 [5:45:47<46:16,  6.34s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0726


 81%|████████  | 1867/2304 [5:45:52<42:32,  5.84s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0750


 81%|████████  | 1868/2304 [5:45:57<39:51,  5.49s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0741


 81%|████████  | 1869/2304 [5:46:02<38:20,  5.29s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0810


 81%|████████  | 1870/2304 [5:46:09<41:43,  5.77s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0931


 81%|████████  | 1871/2304 [5:46:15<44:00,  6.10s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0719


 81%|████████▏ | 1872/2304 [5:46:22<45:40,  6.34s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0968


 81%|████████▏ | 1873/2304 [5:46:27<41:58,  5.84s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0889


 81%|████████▏ | 1874/2304 [5:46:32<39:29,  5.51s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0777


 81%|████████▏ | 1875/2304 [5:46:36<37:31,  5.25s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1097


 81%|████████▏ | 1876/2304 [5:46:43<40:49,  5.72s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0758


 81%|████████▏ | 1877/2304 [5:46:50<43:10,  6.07s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0677


 82%|████████▏ | 1878/2304 [5:46:57<44:48,  6.31s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0886


 82%|████████▏ | 1879/2304 [5:47:02<41:28,  5.85s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0980


 82%|████████▏ | 1880/2304 [5:47:06<38:35,  5.46s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0726


 82%|████████▏ | 1881/2304 [5:47:11<36:50,  5.23s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0801


 82%|████████▏ | 1882/2304 [5:47:18<40:31,  5.76s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0914


 82%|████████▏ | 1883/2304 [5:47:25<42:31,  6.06s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0712


 82%|████████▏ | 1884/2304 [5:47:32<44:14,  6.32s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1045


 82%|████████▏ | 1885/2304 [5:47:36<40:52,  5.85s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0827


 82%|████████▏ | 1886/2304 [5:47:41<38:01,  5.46s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0778


 82%|████████▏ | 1887/2304 [5:47:46<36:21,  5.23s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0884


 82%|████████▏ | 1888/2304 [5:47:53<39:55,  5.76s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0900


 82%|████████▏ | 1889/2304 [5:47:59<41:55,  6.06s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0719


 82%|████████▏ | 1890/2304 [5:48:06<43:47,  6.35s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0859


 82%|████████▏ | 1891/2304 [5:48:11<40:12,  5.84s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0756


 82%|████████▏ | 1892/2304 [5:48:16<37:23,  5.45s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0741


 82%|████████▏ | 1893/2304 [5:48:20<36:03,  5.27s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1043


 82%|████████▏ | 1894/2304 [5:48:27<39:11,  5.73s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0749


 82%|████████▏ | 1895/2304 [5:48:34<41:05,  6.03s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0711


 82%|████████▏ | 1896/2304 [5:48:41<42:58,  6.32s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1008


 82%|████████▏ | 1897/2304 [5:48:46<39:34,  5.83s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1315


 82%|████████▏ | 1898/2304 [5:48:50<37:16,  5.51s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0766


 82%|████████▏ | 1899/2304 [5:48:55<35:35,  5.27s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0993


 82%|████████▏ | 1900/2304 [5:49:02<38:40,  5.74s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0770


 83%|████████▎ | 1901/2304 [5:49:09<40:55,  6.09s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0675


 83%|████████▎ | 1902/2304 [5:49:16<42:18,  6.31s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0788


 83%|████████▎ | 1903/2304 [5:49:20<38:54,  5.82s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1196


 83%|████████▎ | 1904/2304 [5:49:25<36:31,  5.48s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0817


 83%|████████▎ | 1905/2304 [5:49:30<34:45,  5.23s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0836


 83%|████████▎ | 1906/2304 [5:49:37<38:14,  5.77s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0679


 83%|████████▎ | 1907/2304 [5:49:43<40:08,  6.07s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0673


 83%|████████▎ | 1908/2304 [5:49:50<41:44,  6.33s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0787


 83%|████████▎ | 1909/2304 [5:49:55<38:47,  5.89s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0834


 83%|████████▎ | 1910/2304 [5:50:00<36:05,  5.50s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0742


 83%|████████▎ | 1911/2304 [5:50:05<34:21,  5.25s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0775


 83%|████████▎ | 1912/2304 [5:50:12<37:51,  5.79s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0929


 83%|████████▎ | 1913/2304 [5:50:18<39:36,  6.08s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0683


 83%|████████▎ | 1914/2304 [5:50:25<41:23,  6.37s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1214


 83%|████████▎ | 1915/2304 [5:50:30<37:57,  5.86s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0766


 83%|████████▎ | 1916/2304 [5:50:35<35:27,  5.48s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0736


 83%|████████▎ | 1917/2304 [5:50:39<34:01,  5.28s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0770


 83%|████████▎ | 1918/2304 [5:50:46<37:02,  5.76s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0767


 83%|████████▎ | 1919/2304 [5:50:53<38:49,  6.05s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0709


 83%|████████▎ | 1920/2304 [5:51:00<40:37,  6.35s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1443


 83%|████████▎ | 1921/2304 [5:51:05<37:13,  5.83s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0766


 83%|████████▎ | 1922/2304 [5:51:09<34:41,  5.45s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0699


 83%|████████▎ | 1923/2304 [5:51:14<33:14,  5.24s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1158


 84%|████████▎ | 1924/2304 [5:51:21<36:32,  5.77s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0716


 84%|████████▎ | 1925/2304 [5:51:28<38:31,  6.10s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0690


 84%|████████▎ | 1926/2304 [5:51:35<39:53,  6.33s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0680


 84%|████████▎ | 1927/2304 [5:51:39<36:34,  5.82s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1020


 84%|████████▎ | 1928/2304 [5:51:44<34:23,  5.49s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0827


 84%|████████▎ | 1929/2304 [5:51:49<32:43,  5.24s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1233


 84%|████████▍ | 1930/2304 [5:51:56<35:38,  5.72s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0798


 84%|████████▍ | 1931/2304 [5:52:02<37:35,  6.05s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0686


 84%|████████▍ | 1932/2304 [5:52:09<38:57,  6.28s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0924


 84%|████████▍ | 1933/2304 [5:52:14<36:01,  5.83s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1069


 84%|████████▍ | 1934/2304 [5:52:19<33:33,  5.44s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0901


 84%|████████▍ | 1935/2304 [5:52:23<32:02,  5.21s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0959


 84%|████████▍ | 1936/2304 [5:52:30<35:28,  5.78s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0769


 84%|████████▍ | 1937/2304 [5:52:37<37:01,  6.05s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0812


 84%|████████▍ | 1938/2304 [5:52:44<38:20,  6.29s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0814


 84%|████████▍ | 1939/2304 [5:52:49<35:26,  5.83s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0842


 84%|████████▍ | 1940/2304 [5:52:53<32:59,  5.44s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0815


 84%|████████▍ | 1941/2304 [5:52:58<31:44,  5.25s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0944


 84%|████████▍ | 1942/2304 [5:53:05<34:32,  5.73s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0989


 84%|████████▍ | 1943/2304 [5:53:12<36:17,  6.03s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0665


 84%|████████▍ | 1944/2304 [5:53:19<37:52,  6.31s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1094


 84%|████████▍ | 1945/2304 [5:53:23<34:45,  5.81s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0705


 84%|████████▍ | 1946/2304 [5:53:28<32:20,  5.42s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0698


 85%|████████▍ | 1947/2304 [5:53:32<31:09,  5.24s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0791


 85%|████████▍ | 1948/2304 [5:53:39<33:49,  5.70s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0678


 85%|████████▍ | 1949/2304 [5:53:46<35:49,  6.05s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0691


 85%|████████▍ | 1950/2304 [5:53:53<37:07,  6.29s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0679


 85%|████████▍ | 1951/2304 [5:53:58<34:09,  5.81s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0977


 85%|████████▍ | 1952/2304 [5:54:02<32:02,  5.46s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0832


 85%|████████▍ | 1953/2304 [5:54:07<30:31,  5.22s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0869


 85%|████████▍ | 1954/2304 [5:54:14<33:16,  5.71s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0679


 85%|████████▍ | 1955/2304 [5:54:21<35:11,  6.05s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0689


 85%|████████▍ | 1956/2304 [5:54:27<36:25,  6.28s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0679


 85%|████████▍ | 1957/2304 [5:54:32<33:30,  5.79s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0855


 85%|████████▍ | 1958/2304 [5:54:37<31:30,  5.46s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0663


 85%|████████▌ | 1959/2304 [5:54:41<29:58,  5.21s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1121


 85%|████████▌ | 1960/2304 [5:54:48<32:56,  5.74s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0912


 85%|████████▌ | 1961/2304 [5:54:55<34:30,  6.04s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0871


 85%|████████▌ | 1962/2304 [5:55:02<35:46,  6.28s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0734


 85%|████████▌ | 1963/2304 [5:55:07<33:09,  5.83s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0923


 85%|████████▌ | 1964/2304 [5:55:11<31:00,  5.47s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0853


 85%|████████▌ | 1965/2304 [5:55:16<29:31,  5.23s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0861


 85%|████████▌ | 1966/2304 [5:55:23<32:27,  5.76s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0723


 85%|████████▌ | 1967/2304 [5:55:30<33:51,  6.03s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0774


 85%|████████▌ | 1968/2304 [5:55:37<35:20,  6.31s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0725


 85%|████████▌ | 1969/2304 [5:55:41<32:25,  5.81s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0694


 86%|████████▌ | 1970/2304 [5:55:46<30:18,  5.44s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0833


 86%|████████▌ | 1971/2304 [5:55:51<29:09,  5.25s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0791


 86%|████████▌ | 1972/2304 [5:55:58<31:46,  5.74s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0908


 86%|████████▌ | 1973/2304 [5:56:04<33:20,  6.04s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0693


 86%|████████▌ | 1974/2304 [5:56:11<34:59,  6.36s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0680


 86%|████████▌ | 1975/2304 [5:56:16<32:03,  5.85s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1041


 86%|████████▌ | 1976/2304 [5:56:21<30:03,  5.50s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0740


 86%|████████▌ | 1977/2304 [5:56:25<28:36,  5.25s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0791


 86%|████████▌ | 1978/2304 [5:56:32<31:08,  5.73s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0873


 86%|████████▌ | 1979/2304 [5:56:39<32:54,  6.07s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0688


 86%|████████▌ | 1980/2304 [5:56:46<34:05,  6.31s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0663


 86%|████████▌ | 1981/2304 [5:56:51<31:13,  5.80s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1012


 86%|████████▌ | 1982/2304 [5:56:55<29:19,  5.47s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0792


 86%|████████▌ | 1983/2304 [5:57:00<27:57,  5.23s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1103


 86%|████████▌ | 1984/2304 [5:57:07<30:40,  5.75s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0750


 86%|████████▌ | 1985/2304 [5:57:14<32:06,  6.04s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0684


 86%|████████▌ | 1986/2304 [5:57:21<33:13,  6.27s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0806


 86%|████████▌ | 1987/2304 [5:57:25<30:48,  5.83s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0824


 86%|████████▋ | 1988/2304 [5:57:30<28:38,  5.44s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0737


 86%|████████▋ | 1989/2304 [5:57:35<27:20,  5.21s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0810


 86%|████████▋ | 1990/2304 [5:57:41<30:00,  5.73s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0852


 86%|████████▋ | 1991/2304 [5:57:48<31:21,  6.01s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0707


 86%|████████▋ | 1992/2304 [5:57:55<32:32,  6.26s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0900


 87%|████████▋ | 1993/2304 [5:58:00<30:15,  5.84s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0958


 87%|████████▋ | 1994/2304 [5:58:04<28:09,  5.45s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0872


 87%|████████▋ | 1995/2304 [5:58:09<27:03,  5.26s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0758


 87%|████████▋ | 1996/2304 [5:58:16<29:18,  5.71s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0678


 87%|████████▋ | 1997/2304 [5:58:23<30:39,  5.99s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0692


 87%|████████▋ | 1998/2304 [5:58:30<32:00,  6.28s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0678


 87%|████████▋ | 1999/2304 [5:58:34<29:22,  5.78s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1071


 87%|████████▋ | 2000/2304 [5:58:39<27:20,  5.40s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1409


 87%|████████▋ | 2001/2304 [5:58:43<26:17,  5.21s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0686


 87%|████████▋ | 2002/2304 [5:58:50<28:34,  5.68s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0711


 87%|████████▋ | 2003/2304 [5:58:57<30:11,  6.02s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0694


 87%|████████▋ | 2004/2304 [5:59:04<31:14,  6.25s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0677


 87%|████████▋ | 2005/2304 [5:59:08<28:40,  5.75s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0826


 87%|████████▋ | 2006/2304 [5:59:13<26:59,  5.43s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0757


 87%|████████▋ | 2007/2304 [5:59:18<25:43,  5.20s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1020


 87%|████████▋ | 2008/2304 [5:59:25<28:04,  5.69s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0955


 87%|████████▋ | 2009/2304 [5:59:31<29:35,  6.02s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0686


 87%|████████▋ | 2010/2304 [5:59:38<30:46,  6.28s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0945


 87%|████████▋ | 2011/2304 [5:59:43<28:30,  5.84s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0726


 87%|████████▋ | 2012/2304 [5:59:48<26:37,  5.47s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1054


 87%|████████▋ | 2013/2304 [5:59:52<25:20,  5.23s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0777


 87%|████████▋ | 2014/2304 [5:59:59<27:50,  5.76s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0780


 87%|████████▋ | 2015/2304 [6:00:06<29:02,  6.03s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0711


 88%|████████▊ | 2016/2304 [6:00:13<30:09,  6.28s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0975


 88%|████████▊ | 2017/2304 [6:00:16<25:24,  5.31s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0734


 88%|████████▊ | 2018/2304 [6:00:19<21:48,  4.57s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0707


 88%|████████▊ | 2019/2304 [6:00:22<19:36,  4.13s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0710


 88%|████████▊ | 2020/2304 [6:00:26<19:34,  4.13s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0725


 88%|████████▊ | 2021/2304 [6:00:30<19:26,  4.12s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0692


 88%|████████▊ | 2022/2304 [6:00:34<19:18,  4.11s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0719


 88%|████████▊ | 2023/2304 [6:00:37<17:44,  3.79s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0727


 88%|████████▊ | 2024/2304 [6:00:40<16:21,  3.51s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0708


 88%|████████▊ | 2025/2304 [6:00:43<15:38,  3.36s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0691


 88%|████████▊ | 2026/2304 [6:00:47<16:42,  3.61s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0720


 88%|████████▊ | 2027/2304 [6:00:51<17:05,  3.70s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0686


 88%|████████▊ | 2028/2304 [6:00:55<17:40,  3.84s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0740


 88%|████████▊ | 2029/2304 [6:00:58<16:32,  3.61s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0671


 88%|████████▊ | 2030/2304 [6:01:01<15:25,  3.38s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0702


 88%|████████▊ | 2031/2304 [6:01:04<14:53,  3.27s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0693


 88%|████████▊ | 2032/2304 [6:01:09<16:04,  3.55s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0673


 88%|████████▊ | 2033/2304 [6:01:12<16:34,  3.67s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0710


 88%|████████▊ | 2034/2304 [6:01:17<17:12,  3.83s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0675


 88%|████████▊ | 2035/2304 [6:01:20<16:05,  3.59s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0679


 88%|████████▊ | 2036/2304 [6:01:23<15:02,  3.37s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0663


 88%|████████▊ | 2037/2304 [6:01:26<14:30,  3.26s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0686


 88%|████████▊ | 2038/2304 [6:01:30<15:27,  3.49s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0693


 88%|████████▊ | 2039/2304 [6:01:34<16:14,  3.68s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0725


 89%|████████▊ | 2040/2304 [6:01:38<16:49,  3.82s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0685


 89%|████████▊ | 2041/2304 [6:01:41<15:39,  3.57s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0694


 89%|████████▊ | 2042/2304 [6:01:44<14:42,  3.37s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0740


 89%|████████▊ | 2043/2304 [6:01:47<14:16,  3.28s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0707


 89%|████████▊ | 2044/2304 [6:01:51<15:10,  3.50s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0748


 89%|████████▉ | 2045/2304 [6:01:55<15:54,  3.69s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0691


 89%|████████▉ | 2046/2304 [6:01:59<16:28,  3.83s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0675


 89%|████████▉ | 2047/2304 [6:02:02<15:11,  3.55s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0665


 89%|████████▉ | 2048/2304 [6:02:05<14:23,  3.37s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0664


 89%|████████▉ | 2049/2304 [6:02:08<13:54,  3.27s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0698


 89%|████████▉ | 2050/2304 [6:02:12<14:47,  3.50s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0711


 89%|████████▉ | 2051/2304 [6:02:16<15:28,  3.67s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0694


 89%|████████▉ | 2052/2304 [6:02:20<16:01,  3.82s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0701


 89%|████████▉ | 2053/2304 [6:02:23<14:48,  3.54s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0729


 89%|████████▉ | 2054/2304 [6:02:26<14:03,  3.37s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0702


 89%|████████▉ | 2055/2304 [6:02:29<13:21,  3.22s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0729


 89%|████████▉ | 2056/2304 [6:02:33<14:30,  3.51s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0685


 89%|████████▉ | 2057/2304 [6:02:37<15:08,  3.68s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0728


 89%|████████▉ | 2058/2304 [6:02:41<15:40,  3.82s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0744


 89%|████████▉ | 2059/2304 [6:02:44<14:27,  3.54s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0724


 89%|████████▉ | 2060/2304 [6:02:47<13:44,  3.38s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0701


 89%|████████▉ | 2061/2304 [6:02:50<13:04,  3.23s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0692


 89%|████████▉ | 2062/2304 [6:02:54<14:07,  3.50s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0698


 90%|████████▉ | 2063/2304 [6:02:58<14:49,  3.69s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0695


 90%|████████▉ | 2064/2304 [6:03:03<15:17,  3.82s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0701


 90%|████████▉ | 2065/2304 [6:03:05<14:04,  3.53s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0698


 90%|████████▉ | 2066/2304 [6:03:08<13:22,  3.37s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0706


 90%|████████▉ | 2067/2304 [6:03:11<12:45,  3.23s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0681


 90%|████████▉ | 2068/2304 [6:03:15<13:46,  3.50s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0700


 90%|████████▉ | 2069/2304 [6:03:20<14:24,  3.68s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0679


 90%|████████▉ | 2070/2304 [6:03:24<14:55,  3.83s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0722


 90%|████████▉ | 2071/2304 [6:03:27<13:45,  3.54s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0788


 90%|████████▉ | 2072/2304 [6:03:30<13:01,  3.37s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0724


 90%|████████▉ | 2073/2304 [6:03:32<12:26,  3.23s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0720


 90%|█████████ | 2074/2304 [6:03:37<13:28,  3.51s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0721


 90%|█████████ | 2075/2304 [6:03:41<14:06,  3.70s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0655


 90%|█████████ | 2076/2304 [6:03:45<14:26,  3.80s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0692


 90%|█████████ | 2077/2304 [6:03:48<13:30,  3.57s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0697


 90%|█████████ | 2078/2304 [6:03:51<12:46,  3.39s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0712


 90%|█████████ | 2079/2304 [6:03:54<12:09,  3.24s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0734


 90%|█████████ | 2080/2304 [6:03:58<13:12,  3.54s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0670


 90%|█████████ | 2081/2304 [6:04:02<13:47,  3.71s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0697


 90%|█████████ | 2082/2304 [6:04:06<14:04,  3.80s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0681


 90%|█████████ | 2083/2304 [6:04:09<13:10,  3.58s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0720


 90%|█████████ | 2084/2304 [6:04:12<12:29,  3.41s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0722


 90%|█████████ | 2085/2304 [6:04:15<11:52,  3.25s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0697


 91%|█████████ | 2086/2304 [6:04:19<12:46,  3.52s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0731


 91%|█████████ | 2087/2304 [6:04:23<13:24,  3.71s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0702


 91%|█████████ | 2088/2304 [6:04:27<13:41,  3.80s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0686


 91%|█████████ | 2089/2304 [6:04:30<12:46,  3.57s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0728


 91%|█████████ | 2090/2304 [6:04:33<11:58,  3.36s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0714


 91%|█████████ | 2091/2304 [6:04:36<11:33,  3.25s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0715


 91%|█████████ | 2092/2304 [6:04:40<12:25,  3.51s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0790


 91%|█████████ | 2093/2304 [6:04:44<12:58,  3.69s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0734


 91%|█████████ | 2094/2304 [6:04:48<13:17,  3.80s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0784


 91%|█████████ | 2095/2304 [6:04:52<12:23,  3.56s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0710


 91%|█████████ | 2096/2304 [6:04:54<11:33,  3.34s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0686


 91%|█████████ | 2097/2304 [6:04:57<11:14,  3.26s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0786


 91%|█████████ | 2098/2304 [6:05:02<12:06,  3.53s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0704


 91%|█████████ | 2099/2304 [6:05:05<12:27,  3.65s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0713


 91%|█████████ | 2100/2304 [6:05:10<12:56,  3.80s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0732


 91%|█████████ | 2101/2304 [6:05:13<12:04,  3.57s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0670


 91%|█████████ | 2102/2304 [6:05:15<11:15,  3.34s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0731


 91%|█████████▏| 2103/2304 [6:05:18<10:51,  3.24s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0705


 91%|█████████▏| 2104/2304 [6:05:23<11:45,  3.53s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0712


 91%|█████████▏| 2105/2304 [6:05:27<12:05,  3.65s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0699


 91%|█████████▏| 2106/2304 [6:05:31<12:35,  3.81s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0690


 91%|█████████▏| 2107/2304 [6:05:34<11:47,  3.59s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0692


 91%|█████████▏| 2108/2304 [6:05:37<10:59,  3.37s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0725


 92%|█████████▏| 2109/2304 [6:05:40<10:37,  3.27s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0711


 92%|█████████▏| 2110/2304 [6:05:44<11:18,  3.50s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0724


 92%|█████████▏| 2111/2304 [6:05:48<11:51,  3.68s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0729


 92%|█████████▏| 2112/2304 [6:05:52<12:14,  3.83s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0736


 92%|█████████▏| 2113/2304 [6:05:55<11:24,  3.58s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0787


 92%|█████████▏| 2114/2304 [6:05:58<10:40,  3.37s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0698


 92%|█████████▏| 2115/2304 [6:06:01<10:17,  3.27s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0870


 92%|█████████▏| 2116/2304 [6:06:05<10:56,  3.49s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0687


 92%|█████████▏| 2117/2304 [6:06:09<11:28,  3.68s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0726


 92%|█████████▏| 2118/2304 [6:06:13<11:51,  3.82s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0753


 92%|█████████▏| 2119/2304 [6:06:16<11:02,  3.58s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0793


 92%|█████████▏| 2120/2304 [6:06:19<10:17,  3.35s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0689


 92%|█████████▏| 2121/2304 [6:06:22<09:56,  3.26s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0742


 92%|█████████▏| 2122/2304 [6:06:26<10:32,  3.47s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0844


 92%|█████████▏| 2123/2304 [6:06:30<11:01,  3.65s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0715


 92%|█████████▏| 2124/2304 [6:06:34<11:27,  3.82s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0727


 92%|█████████▏| 2125/2304 [6:06:37<10:32,  3.53s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0685


 92%|█████████▏| 2126/2304 [6:06:40<09:58,  3.37s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0697


 92%|█████████▏| 2127/2304 [6:06:43<09:37,  3.26s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0713


 92%|█████████▏| 2128/2304 [6:06:47<10:15,  3.50s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0681


 92%|█████████▏| 2129/2304 [6:06:51<10:43,  3.67s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0726


 92%|█████████▏| 2130/2304 [6:06:56<11:04,  3.82s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0686


 92%|█████████▏| 2131/2304 [6:06:59<10:19,  3.58s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0734


 93%|█████████▎| 2132/2304 [6:07:01<09:36,  3.35s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0709


 93%|█████████▎| 2133/2304 [6:07:04<09:16,  3.26s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0719


 93%|█████████▎| 2134/2304 [6:07:08<09:54,  3.49s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0673


 93%|█████████▎| 2135/2304 [6:07:13<10:23,  3.69s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0703


 93%|█████████▎| 2136/2304 [6:07:17<10:43,  3.83s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0727


 93%|█████████▎| 2137/2304 [6:07:20<09:51,  3.54s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0739


 93%|█████████▎| 2138/2304 [6:07:23<09:21,  3.38s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0789


 93%|█████████▎| 2139/2304 [6:07:26<08:52,  3.22s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0719


 93%|█████████▎| 2140/2304 [6:07:30<09:34,  3.50s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0768


 93%|█████████▎| 2141/2304 [6:07:34<10:02,  3.70s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0657


 93%|█████████▎| 2142/2304 [6:07:38<10:22,  3.84s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0709


 93%|█████████▎| 2143/2304 [6:07:41<09:32,  3.56s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0761


 93%|█████████▎| 2144/2304 [6:07:44<09:01,  3.39s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0744


 93%|█████████▎| 2145/2304 [6:07:47<08:34,  3.24s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0748


 93%|█████████▎| 2146/2304 [6:07:51<09:14,  3.51s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0747


 93%|█████████▎| 2147/2304 [6:07:55<09:37,  3.68s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0691


 93%|█████████▎| 2148/2304 [6:07:59<09:58,  3.84s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0772


 93%|█████████▎| 2149/2304 [6:08:02<09:09,  3.55s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0692


 93%|█████████▎| 2150/2304 [6:08:05<08:40,  3.38s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0684


 93%|█████████▎| 2151/2304 [6:08:08<08:13,  3.23s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0702


 93%|█████████▎| 2152/2304 [6:08:12<08:54,  3.51s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0693


 93%|█████████▎| 2153/2304 [6:08:16<09:16,  3.69s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0717


 93%|█████████▎| 2154/2304 [6:08:20<09:34,  3.83s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0729


 94%|█████████▎| 2155/2304 [6:08:23<08:48,  3.55s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0688


 94%|█████████▎| 2156/2304 [6:08:26<08:18,  3.37s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0697


 94%|█████████▎| 2157/2304 [6:08:29<07:52,  3.21s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0731


 94%|█████████▎| 2158/2304 [6:08:33<08:29,  3.49s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0735


 94%|█████████▎| 2159/2304 [6:08:37<08:52,  3.67s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0714


 94%|█████████▍| 2160/2304 [6:08:41<09:01,  3.76s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1076


 94%|█████████▍| 2161/2304 [6:08:44<08:24,  3.53s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0764


 94%|█████████▍| 2162/2304 [6:08:47<07:57,  3.36s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0738


 94%|█████████▍| 2163/2304 [6:08:50<07:33,  3.21s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0775


 94%|█████████▍| 2164/2304 [6:08:54<08:07,  3.49s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0714


 94%|█████████▍| 2165/2304 [6:08:58<08:30,  3.67s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0683


 94%|█████████▍| 2166/2304 [6:09:02<08:40,  3.77s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0792


 94%|█████████▍| 2167/2304 [6:09:05<08:05,  3.54s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0735


 94%|█████████▍| 2168/2304 [6:09:08<07:31,  3.32s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0793


 94%|█████████▍| 2169/2304 [6:09:11<07:16,  3.24s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0752


 94%|█████████▍| 2170/2304 [6:09:15<07:50,  3.51s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0991


 94%|█████████▍| 2171/2304 [6:09:19<08:10,  3.69s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0700


 94%|█████████▍| 2172/2304 [6:09:23<08:20,  3.79s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0816


 94%|█████████▍| 2173/2304 [6:09:26<07:46,  3.56s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0764


 94%|█████████▍| 2174/2304 [6:09:29<07:14,  3.34s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0744


 94%|█████████▍| 2175/2304 [6:09:32<06:58,  3.24s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0724


 94%|█████████▍| 2176/2304 [6:09:36<07:30,  3.52s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0787


 94%|█████████▍| 2177/2304 [6:09:41<07:48,  3.69s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0725


 95%|█████████▍| 2178/2304 [6:09:45<07:58,  3.80s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0748


 95%|█████████▍| 2179/2304 [6:09:48<07:26,  3.57s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0737


 95%|█████████▍| 2180/2304 [6:09:50<06:55,  3.35s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0710


 95%|█████████▍| 2181/2304 [6:09:54<06:39,  3.25s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0682


 95%|█████████▍| 2182/2304 [6:09:58<07:10,  3.53s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0751


 95%|█████████▍| 2183/2304 [6:10:02<07:22,  3.66s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0726


 95%|█████████▍| 2184/2304 [6:10:06<07:36,  3.81s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0723


 95%|█████████▍| 2185/2304 [6:10:09<07:05,  3.57s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0759


 95%|█████████▍| 2186/2304 [6:10:12<06:35,  3.36s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0705


 95%|█████████▍| 2187/2304 [6:10:15<06:20,  3.25s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0848


 95%|█████████▍| 2188/2304 [6:10:19<06:43,  3.48s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0722


 95%|█████████▌| 2189/2304 [6:10:23<07:03,  3.68s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0686


 95%|█████████▌| 2190/2304 [6:10:27<07:15,  3.82s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0846


 95%|█████████▌| 2191/2304 [6:10:30<06:39,  3.53s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0792


 95%|█████████▌| 2192/2304 [6:10:33<06:17,  3.37s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0757


 95%|█████████▌| 2193/2304 [6:10:36<06:02,  3.27s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0737


 95%|█████████▌| 2194/2304 [6:10:40<06:22,  3.48s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0697


 95%|█████████▌| 2195/2304 [6:10:44<06:38,  3.66s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0682


 95%|█████████▌| 2196/2304 [6:10:48<06:52,  3.82s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0724


 95%|█████████▌| 2197/2304 [6:10:51<06:17,  3.53s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0773


 95%|█████████▌| 2198/2304 [6:10:54<05:56,  3.36s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0867


 95%|█████████▌| 2199/2304 [6:10:57<05:38,  3.22s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0761


 95%|█████████▌| 2200/2304 [6:11:01<06:06,  3.52s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0770


 96%|█████████▌| 2201/2304 [6:11:05<06:20,  3.69s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0707


 96%|█████████▌| 2202/2304 [6:11:09<06:31,  3.84s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0779


 96%|█████████▌| 2203/2304 [6:11:12<05:59,  3.56s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0768


 96%|█████████▌| 2204/2304 [6:11:15<05:38,  3.38s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0746


 96%|█████████▌| 2205/2304 [6:11:18<05:19,  3.23s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0734


 96%|█████████▌| 2206/2304 [6:11:22<05:43,  3.51s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0829


 96%|█████████▌| 2207/2304 [6:11:26<05:56,  3.68s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0731


 96%|█████████▌| 2208/2304 [6:11:30<06:06,  3.82s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0746


 96%|█████████▌| 2209/2304 [6:11:33<05:36,  3.54s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0824


 96%|█████████▌| 2210/2304 [6:11:36<05:17,  3.37s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0699


 96%|█████████▌| 2211/2304 [6:11:39<04:59,  3.22s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0832


 96%|█████████▌| 2212/2304 [6:11:43<05:21,  3.50s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0722


 96%|█████████▌| 2213/2304 [6:11:47<05:35,  3.69s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0689


 96%|█████████▌| 2214/2304 [6:11:52<05:43,  3.82s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0752


 96%|█████████▌| 2215/2304 [6:11:54<05:14,  3.53s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0838


 96%|█████████▌| 2216/2304 [6:11:57<04:55,  3.36s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0715


 96%|█████████▌| 2217/2304 [6:12:00<04:40,  3.22s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0698


 96%|█████████▋| 2218/2304 [6:12:04<05:00,  3.50s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0719


 96%|█████████▋| 2219/2304 [6:12:09<05:12,  3.67s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0684


 96%|█████████▋| 2220/2304 [6:12:13<05:16,  3.77s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0685


 96%|█████████▋| 2221/2304 [6:12:16<04:53,  3.54s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0790


 96%|█████████▋| 2222/2304 [6:12:19<04:36,  3.37s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0688


 96%|█████████▋| 2223/2304 [6:12:21<04:20,  3.22s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0821


 97%|█████████▋| 2224/2304 [6:12:26<04:39,  3.50s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0649


 97%|█████████▋| 2225/2304 [6:12:30<04:49,  3.66s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0688


 97%|█████████▋| 2226/2304 [6:12:34<04:53,  3.76s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0897


 97%|█████████▋| 2227/2304 [6:12:37<04:33,  3.55s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0711


 97%|█████████▋| 2228/2304 [6:12:40<04:15,  3.37s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0715


 97%|█████████▋| 2229/2304 [6:12:42<04:00,  3.21s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0716


 97%|█████████▋| 2230/2304 [6:12:47<04:18,  3.50s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0668


 97%|█████████▋| 2231/2304 [6:12:51<04:27,  3.67s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0795


 97%|█████████▋| 2232/2304 [6:12:55<04:30,  3.76s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0688


 97%|█████████▋| 2233/2304 [6:12:58<04:10,  3.53s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0721


 97%|█████████▋| 2234/2304 [6:13:00<03:52,  3.32s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0683


 97%|█████████▋| 2235/2304 [6:13:03<03:43,  3.23s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0754


 97%|█████████▋| 2236/2304 [6:13:08<03:57,  3.50s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0826


 97%|█████████▋| 2237/2304 [6:13:12<04:03,  3.64s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0683


 97%|█████████▋| 2238/2304 [6:13:16<04:09,  3.79s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0754


 97%|█████████▋| 2239/2304 [6:13:19<03:50,  3.55s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0781


 97%|█████████▋| 2240/2304 [6:13:22<03:33,  3.33s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0738


 97%|█████████▋| 2241/2304 [6:13:25<03:24,  3.25s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0728


 97%|█████████▋| 2242/2304 [6:13:29<03:38,  3.52s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0717


 97%|█████████▋| 2243/2304 [6:13:33<03:45,  3.70s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0690


 97%|█████████▋| 2244/2304 [6:13:37<03:52,  3.87s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0715


 97%|█████████▋| 2245/2304 [6:13:40<03:33,  3.62s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0765


 97%|█████████▋| 2246/2304 [6:13:43<03:16,  3.38s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0749


 98%|█████████▊| 2247/2304 [6:13:46<03:06,  3.28s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0781


 98%|█████████▊| 2248/2304 [6:13:50<03:16,  3.50s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0927


 98%|█████████▊| 2249/2304 [6:13:54<03:21,  3.67s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0787


 98%|█████████▊| 2250/2304 [6:13:58<03:26,  3.82s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0700


 98%|█████████▊| 2251/2304 [6:14:01<03:07,  3.53s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0715


 98%|█████████▊| 2252/2304 [6:14:04<02:54,  3.36s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0697


 98%|█████████▊| 2253/2304 [6:14:07<02:43,  3.21s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0851


 98%|█████████▊| 2254/2304 [6:14:11<02:54,  3.49s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0704


 98%|█████████▊| 2255/2304 [6:14:15<02:59,  3.66s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0821


 98%|█████████▊| 2256/2304 [6:14:19<03:02,  3.80s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0703


 98%|█████████▊| 2257/2304 [6:14:22<02:46,  3.54s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0771


 98%|█████████▊| 2258/2304 [6:14:25<02:35,  3.38s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0733


 98%|█████████▊| 2259/2304 [6:14:28<02:24,  3.22s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0797


 98%|█████████▊| 2260/2304 [6:14:32<02:33,  3.50s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0732


 98%|█████████▊| 2261/2304 [6:14:36<02:38,  3.69s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0699


 98%|█████████▊| 2262/2304 [6:14:41<02:40,  3.83s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0724


 98%|█████████▊| 2263/2304 [6:14:43<02:24,  3.53s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0836


 98%|█████████▊| 2264/2304 [6:14:46<02:14,  3.37s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0730


 98%|█████████▊| 2265/2304 [6:14:49<02:05,  3.21s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0908


 98%|█████████▊| 2266/2304 [6:14:53<02:12,  3.49s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1069


 98%|█████████▊| 2267/2304 [6:14:57<02:15,  3.67s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0699


 98%|█████████▊| 2268/2304 [6:15:02<02:17,  3.82s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0737


 98%|█████████▊| 2269/2304 [6:15:04<02:03,  3.53s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0756


 99%|█████████▊| 2270/2304 [6:15:07<01:54,  3.37s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0735


 99%|█████████▊| 2271/2304 [6:15:10<01:46,  3.22s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0697


 99%|█████████▊| 2272/2304 [6:15:14<01:51,  3.50s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0755


 99%|█████████▊| 2273/2304 [6:15:19<01:53,  3.67s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0742


 99%|█████████▊| 2274/2304 [6:15:23<01:53,  3.79s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0727


 99%|█████████▊| 2275/2304 [6:15:26<01:42,  3.55s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0929


 99%|█████████▉| 2276/2304 [6:15:29<01:34,  3.37s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0756


 99%|█████████▉| 2277/2304 [6:15:31<01:26,  3.22s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0735


 99%|█████████▉| 2278/2304 [6:15:36<01:30,  3.50s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0731


 99%|█████████▉| 2279/2304 [6:15:40<01:31,  3.67s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0671


 99%|█████████▉| 2280/2304 [6:15:44<01:30,  3.77s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0765


 99%|█████████▉| 2281/2304 [6:15:47<01:21,  3.55s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0712


 99%|█████████▉| 2282/2304 [6:15:50<01:14,  3.37s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0741


 99%|█████████▉| 2283/2304 [6:15:52<01:07,  3.22s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0708


 99%|█████████▉| 2284/2304 [6:15:57<01:09,  3.50s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0698


 99%|█████████▉| 2285/2304 [6:16:01<01:09,  3.68s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0704


 99%|█████████▉| 2286/2304 [6:16:05<01:07,  3.77s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0674


 99%|█████████▉| 2287/2304 [6:16:08<01:00,  3.54s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0780


 99%|█████████▉| 2288/2304 [6:16:11<00:53,  3.32s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0747


 99%|█████████▉| 2289/2304 [6:16:14<00:48,  3.22s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0755


 99%|█████████▉| 2290/2304 [6:16:18<00:48,  3.50s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0717


 99%|█████████▉| 2291/2304 [6:16:22<00:47,  3.67s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0716


 99%|█████████▉| 2292/2304 [6:16:26<00:45,  3.78s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0664


100%|█████████▉| 2293/2304 [6:16:29<00:38,  3.54s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0900


100%|█████████▉| 2294/2304 [6:16:32<00:33,  3.37s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0696


100%|█████████▉| 2295/2304 [6:16:35<00:29,  3.24s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0726


100%|█████████▉| 2296/2304 [6:16:39<00:28,  3.51s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0677


100%|█████████▉| 2297/2304 [6:16:43<00:25,  3.67s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0647


100%|█████████▉| 2298/2304 [6:16:47<00:22,  3.78s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0724


100%|█████████▉| 2299/2304 [6:16:50<00:17,  3.55s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0779


100%|█████████▉| 2300/2304 [6:16:53<00:13,  3.37s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0799


100%|█████████▉| 2301/2304 [6:16:56<00:09,  3.21s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0735


100%|█████████▉| 2302/2304 [6:17:00<00:07,  3.51s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0717


100%|█████████▉| 2303/2304 [6:17:04<00:03,  3.68s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0671


100%|██████████| 2304/2304 [6:17:08<00:00,  9.82s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0689

✅ Best Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}
✅ Best Loss: 0.0625





##### 2.3 Test

In [None]:
def evaluate_model(model, test_loader, device='cuda'):
    model.eval()
    model.to(device)

    preds, targets = [], []
    with torch.no_grad():
        for xb, yb in test_loader:
            xb, yb = xb.to(device), yb.to(device)
            pred = model(xb)
            preds.append(pred)
            targets.append(yb)

    # (B*T, 8, 8) 텐서 형태로 합치기
    preds_tensor = torch.cat(preds, dim=0)
    targets_tensor = torch.cat(targets, dim=0)

    return preds_tensor, targets_tensor

with open(f'{model_save_path}/best_model_window10per30_CT2_config.json', 'r') as f:
    best_config = json.load(f)

best_model = CorrPredictorCNNTransformer(
    kernel_size=best_config['kernel_size'],
    d_model=best_config['d_model'],
    nhead=best_config['nhead'],
    num_layers=best_config['num_layers'],
    dim_feedforward=best_config['dim_feedforward'],
    activation=best_config['activation']
)
best_model.load_state_dict(torch.load(f"{model_save_path}/best_model_window10per30_CT2_weights.pth"))

test_loader = DataLoader(test_ds, batch_size=best_config['batch_size'], shuffle=False)
preds_tensor, targets_tensor = evaluate_model(best_model, test_loader, device=device)

# 저장
torch.save({
    'preds': preds_tensor,
    'targets': targets_tensor
}, f"{model_save_path}/best_model_window10per30_CT2_result.pt")

In [None]:
# Performance metrics

preds_flat = preds_tensor.view(preds_tensor.size(0), -1).cpu().numpy()
targets_flat = targets_tensor.view(targets_tensor.size(0), -1).cpu().numpy()

mse = mean_squared_error(targets_flat, preds_flat)
mae = mean_absolute_error(targets_flat, preds_flat)
rmse = np.sqrt(mse)

# frobenius_loss
cos_sim = cosine_similarity(targets_flat, preds_flat)
mean_cos_sim = np.diag(cos_sim).mean()

# frobenius_loss
diff = preds_tensor - targets_tensor
frobenius_per_sample = torch.norm(diff, p='fro', dim=(1, 2))
mean_frobenius = frobenius_per_sample.mean().item()

print(f"\n📊 Evaluation Results:")
print(f"MSE               : {mse:.5f}")
print(f"MAE               : {mae:.5f}")
print(f"RMSE              : {rmse:.5f}")
print(f"Cosine Similarity : {mean_cos_sim:.5f}")
print(f"Frobenius Norm    : {mean_frobenius:.5f}")


📊 Evaluation Results:
MSE               : 0.06231
MAE               : 0.18891
RMSE              : 0.24961
Cosine Similarity : 0.94513
Frobenius Norm    : 1.85180
