### 0. Setting

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, root_mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import ParameterGrid

from torch.utils.data import DataLoader, TensorDataset
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tqdm import tqdm

import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn
import torch

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import random
import pywt
import copy
import json

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [None]:
def set_seed(seed=42):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

In [None]:
df = pd.read_csv('/content/drive/MyDrive/WaveletFrequencyDecomposed_CNN_Transformer/data/train_data.csv')
df.set_index('timestamp', inplace=True)

### 1. Data Preprocessing

##### 1.1 Correlation Transform

In [None]:
def corr_transform(df, input_window_width=30, label_window_width=10):
    X, Y = [], []
    data = df.values

    for t in range(input_window_width, len(df) - 1):
        window_data = data[t-input_window_width : t]
        window_corr = np.corrcoef(window_data.T)
        X.append(torch.tensor(window_corr, dtype=torch.float32))

        label_window = data[t : t+label_window_width]
        corr_next = np.corrcoef(label_window.T)
        Y.append(torch.tensor(corr_next, dtype=torch.float32))

    return torch.stack(X), torch.stack(Y)

In [None]:
X_tensor, Y_tensor = corr_transform(df)

In [None]:
X_tensor.shape, Y_tensor.shape

(torch.Size([2528, 8, 8]), torch.Size([2528, 8, 8]))

In [None]:
total_size = len(X_tensor)
train_size = int(total_size * 0.8)
val_size   = int(total_size * 0.1)
test_size  = total_size - train_size - val_size

X_train = X_tensor[:train_size]
Y_train = Y_tensor[:train_size]

X_val = X_tensor[train_size:train_size + val_size]
Y_val = Y_tensor[train_size:train_size + val_size]

X_test = X_tensor[train_size + val_size:]
Y_test = Y_tensor[train_size + val_size:]

train_ds = TensorDataset(X_train, Y_train)
val_ds   = TensorDataset(X_val, Y_val)
test_ds  = TensorDataset(X_test, Y_test)

### 2. Modeling

##### 2.1 Model Structure Setting

In [None]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]

        return x

In [None]:
class CorrPredictorCNNTransformer(nn.Module):
    def __init__(
            self,
            num_channels=1,
            conv_channels=32,
            kernel_size=3,
            d_model=128,
            nhead=4,
            num_layers=2,
            dim_feedforward=256,
            activation='relu',
            ):

        super().__init__()

        self.cnn = nn.Sequential(
            nn.Conv2d(num_channels, 32, kernel_size, padding=kernel_size // 2),
            nn.ReLU(),
            nn.BatchNorm2d(32),

            nn.Conv2d(32, d_model, kernel_size, padding=kernel_size // 2),
            nn.ReLU(),
            nn.BatchNorm2d(d_model)
        )

        self.flatten = nn.Flatten(start_dim=2)
        self.pool = nn.AdaptiveAvgPool1d(1)
        self.positional_encoding = PositionalEncoding(d_model)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            activation=activation,
            batch_first=True,
        )

        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        self.fc = nn.Sequential(
            nn.Linear(d_model, 64),
            nn.ReLU(),
            nn.Linear(64, 64)
            )

    def forward(self, x):
        """
        CB: CNN Batch
        TB: Transformer Batch
        T: Sequence Length
        C: Channel
        H: Height
        W: Width
        d_model: Dimension of model
        """

        if x.dim() == 3:
            x = x.unsqueeze(1)

        B, C, H, W = x.shape

        x = self.cnn(x)

        x = self.flatten(x)
        x = self.pool(x).squeeze(-1)
        x = x.unsqueeze(0)

        x = self.positional_encoding(x)
        x = self.transformer(x)
        x = x.squeeze(0)

        output = self.fc(x)
        output = output.view(-1, 8, 8)

        return output

##### 2.2 Training

In [None]:
def train_model(model, train_loader, val_loader, optimizer_name='Adam', lr=5e-4, epochs=70, device='cuda'):
    model.to(device)

    # Optimizer 선택
    if optimizer_name == 'Adam':
        opt = torch.optim.Adam(model.parameters(), lr=lr)
    elif optimizer_name == 'RMSprop':
        opt = torch.optim.RMSprop(model.parameters(), lr=lr)
    elif optimizer_name == 'AdamW':
        opt = torch.optim.AdamW(model.parameters(), lr=lr)
    else:
        raise ValueError(f"Unsupported optimizer: {optimizer_name}")

    # Loss & LR Scheduler
    criterion = nn.MSELoss()
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        opt, mode='min', factor=0.5, patience=5, verbose=True
    )

    best_val_loss = float('inf')

    for epoch in range(epochs):
        model.train()
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            opt.zero_grad()

            pred = model(xb)
            loss = criterion(pred, yb)
            loss.backward()
            opt.step()

        # Validation
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(device), yb.to(device)
                val_loss += criterion(model(xb), yb).item()
        val_loss /= len(val_loader)

        # 스케줄러 적용
        scheduler.step(val_loss)

    return val_loss

In [None]:
model_save_path = '/content/drive/MyDrive/WaveletFrequencyDecomposed_CNN_Transformer/best_model'

In [None]:
# Grid Search
param_grid = {
    'kernel_size': [3, 5],
    'd_model': [32, 64, 128],
    'nhead': [2, 4],
    'num_layers': [2, 4],
    'dim_feedforward': [256, 512],
    'activation': ['relu', 'gelu'],
    'lr': [0.001, 5e-4],
    'optimizer': ['Adam', 'RMSprop', 'AdamW'],
    'batch_size': [64, 128, 256, 512]
}

best_loss = float('inf')
best_config = None
best_model = None

for config in tqdm(ParameterGrid(param_grid)):
    train_loader = DataLoader(train_ds,  batch_size=config['batch_size'], shuffle=False)
    val_loader   = DataLoader(val_ds,  batch_size=config['batch_size'], shuffle=False)

    model = CorrPredictorCNNTransformer(
        kernel_size=config['kernel_size'],
        d_model=config['d_model'],
        nhead=config['nhead'],
        num_layers=config['num_layers'],
        dim_feedforward=config['dim_feedforward'],
        activation=config['activation'],
    )
    loss = train_model(model, train_loader, val_loader,
                       optimizer_name=config['optimizer'],
                       lr=config['lr'], device=device)

    print(f"Config: {config}, Loss: {loss:.4f}")
    if loss < best_loss:
        best_loss = loss
        best_config = config

        torch.save(model.state_dict(), f"{model_save_path}/best_model_window10per30_CT_weights.pth")
        with open(f'{model_save_path}/best_model_window10per30_CT_config.json', 'w') as f:
            json.dump(best_config, f, indent=4)

# 최종 결과
print(f"\n✅ Best Config: {best_config}")
print(f"✅ Best Loss: {best_loss:.4f}")



Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0990


  0%|          | 2/2304 [00:40<12:21:35, 19.33s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0902


  0%|          | 3/2304 [00:56<11:28:51, 17.96s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0907


  0%|          | 4/2304 [01:21<13:17:50, 20.81s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0742


  0%|          | 5/2304 [01:46<14:15:13, 22.32s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0846


  0%|          | 6/2304 [02:12<14:58:57, 23.47s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0836


  0%|          | 7/2304 [02:28<13:29:15, 21.14s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0904


  0%|          | 8/2304 [02:44<12:23:41, 19.43s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1058


  0%|          | 9/2304 [03:00<11:45:08, 18.44s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1251


  0%|          | 10/2304 [03:25<13:05:22, 20.54s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0785


  0%|          | 11/2304 [03:50<13:55:58, 21.87s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0839


  1%|          | 12/2304 [04:16<14:39:35, 23.03s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1046


  1%|          | 13/2304 [04:32<13:19:49, 20.95s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0741


  1%|          | 14/2304 [04:48<12:21:08, 19.42s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0802


  1%|          | 15/2304 [05:04<11:42:20, 18.41s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0876


  1%|          | 16/2304 [05:30<13:02:11, 20.51s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1408


  1%|          | 17/2304 [05:54<13:51:55, 21.83s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0766


  1%|          | 18/2304 [06:20<14:33:43, 22.93s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0987


  1%|          | 19/2304 [06:36<13:17:25, 20.94s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0784


  1%|          | 20/2304 [06:52<12:21:05, 19.47s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1023


  1%|          | 21/2304 [07:09<11:44:52, 18.53s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0709


  1%|          | 22/2304 [07:34<13:04:03, 20.61s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0958


  1%|          | 23/2304 [07:59<13:52:07, 21.89s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1056


  1%|          | 24/2304 [08:24<14:29:45, 22.89s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0785


  1%|          | 25/2304 [08:40<13:13:17, 20.89s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0977


  1%|          | 26/2304 [08:56<12:17:14, 19.42s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1039


  1%|          | 27/2304 [09:13<11:43:13, 18.53s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0840


  1%|          | 28/2304 [09:38<13:00:13, 20.57s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1021


  1%|▏         | 29/2304 [10:03<13:46:27, 21.80s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1033


  1%|▏         | 30/2304 [10:28<14:27:33, 22.89s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0951


  1%|▏         | 31/2304 [10:45<13:14:07, 20.96s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0904


  1%|▏         | 32/2304 [11:01<12:15:38, 19.43s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0867


  1%|▏         | 33/2304 [11:17<11:38:52, 18.46s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0907


  1%|▏         | 34/2304 [11:42<12:56:42, 20.53s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0903


  2%|▏         | 35/2304 [12:07<13:46:01, 21.84s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0802


  2%|▏         | 36/2304 [12:33<14:31:39, 23.06s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1011


  2%|▏         | 37/2304 [12:49<13:15:30, 21.05s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1268


  2%|▏         | 38/2304 [13:05<12:14:59, 19.46s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1059


  2%|▏         | 39/2304 [13:21<11:39:21, 18.53s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0816


  2%|▏         | 40/2304 [13:47<12:57:02, 20.59s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0906


  2%|▏         | 41/2304 [14:12<13:47:07, 21.93s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0862


  2%|▏         | 42/2304 [14:37<14:26:16, 22.98s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0787


  2%|▏         | 43/2304 [14:54<13:09:37, 20.95s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1201


  2%|▏         | 44/2304 [15:09<12:11:32, 19.42s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1305


  2%|▏         | 45/2304 [15:26<11:36:03, 18.49s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1273


  2%|▏         | 46/2304 [15:51<12:58:24, 20.68s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0858


  2%|▏         | 47/2304 [16:16<13:45:30, 21.95s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1098


  2%|▏         | 48/2304 [16:42<14:24:46, 23.00s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1065


  2%|▏         | 49/2304 [16:58<13:08:42, 20.99s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0855


  2%|▏         | 50/2304 [17:14<12:16:05, 19.59s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1026


  2%|▏         | 51/2304 [17:31<11:40:39, 18.66s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1328


  2%|▏         | 52/2304 [17:56<12:54:50, 20.64s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0887


  2%|▏         | 53/2304 [18:21<13:42:02, 21.91s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0879


  2%|▏         | 54/2304 [18:47<14:23:21, 23.02s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0855


  2%|▏         | 55/2304 [19:03<13:06:07, 20.97s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0938


  2%|▏         | 56/2304 [19:19<12:08:40, 19.45s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1090


  2%|▏         | 57/2304 [19:35<11:31:06, 18.45s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0884


  3%|▎         | 58/2304 [20:00<12:46:22, 20.47s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0758


  3%|▎         | 59/2304 [20:25<13:38:22, 21.87s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0886


  3%|▎         | 60/2304 [20:51<14:18:03, 22.94s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0823


  3%|▎         | 61/2304 [21:07<13:02:59, 20.94s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0807


  3%|▎         | 62/2304 [21:23<12:04:02, 19.38s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1007


  3%|▎         | 63/2304 [21:39<11:26:14, 18.37s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0892


  3%|▎         | 64/2304 [22:04<12:46:19, 20.53s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1138


  3%|▎         | 65/2304 [22:29<13:35:13, 21.85s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0802


  3%|▎         | 66/2304 [22:55<14:16:16, 22.96s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1543


  3%|▎         | 67/2304 [23:11<13:00:40, 20.94s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0846


  3%|▎         | 68/2304 [23:27<12:03:40, 19.42s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0746


  3%|▎         | 69/2304 [23:43<11:29:33, 18.51s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0853


  3%|▎         | 70/2304 [24:09<12:45:23, 20.56s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1005


  3%|▎         | 71/2304 [24:33<13:32:04, 21.82s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0806


  3%|▎         | 72/2304 [24:59<14:12:54, 22.93s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1023


  3%|▎         | 73/2304 [25:15<13:00:29, 20.99s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0919


  3%|▎         | 74/2304 [25:31<12:04:51, 19.50s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1011


  3%|▎         | 75/2304 [25:47<11:27:07, 18.50s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0888


  3%|▎         | 76/2304 [26:13<12:43:57, 20.57s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0899


  3%|▎         | 77/2304 [26:38<13:30:57, 21.85s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0780


  3%|▎         | 78/2304 [27:04<14:14:26, 23.03s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0869


  3%|▎         | 79/2304 [27:20<12:58:24, 20.99s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0917


  3%|▎         | 80/2304 [27:36<12:00:59, 19.45s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0776


  4%|▎         | 81/2304 [27:52<11:25:53, 18.51s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0905


  4%|▎         | 82/2304 [28:17<12:41:43, 20.57s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0788


  4%|▎         | 83/2304 [28:42<13:28:35, 21.84s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0772


  4%|▎         | 84/2304 [29:08<14:08:56, 22.94s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0776


  4%|▎         | 85/2304 [29:24<12:54:27, 20.94s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1143


  4%|▎         | 86/2304 [29:40<11:58:28, 19.44s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0871


  4%|▍         | 87/2304 [29:56<11:22:03, 18.46s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1222


  4%|▍         | 88/2304 [30:22<12:41:19, 20.61s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1048


  4%|▍         | 89/2304 [30:47<13:28:15, 21.89s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1169


  4%|▍         | 90/2304 [31:12<14:07:50, 22.98s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1004


  4%|▍         | 91/2304 [31:28<12:52:20, 20.94s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1120


  4%|▍         | 92/2304 [31:44<11:55:26, 19.41s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0869


  4%|▍         | 93/2304 [32:00<11:20:51, 18.48s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0913


  4%|▍         | 94/2304 [32:26<12:37:09, 20.56s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0985


  4%|▍         | 95/2304 [32:51<13:23:57, 21.84s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0855


  4%|▍         | 96/2304 [33:16<14:04:20, 22.94s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0781


  4%|▍         | 97/2304 [33:32<12:50:03, 20.93s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0994


  4%|▍         | 98/2304 [33:48<11:52:40, 19.38s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1049


  4%|▍         | 99/2304 [34:04<11:16:03, 18.40s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0928


  4%|▍         | 100/2304 [34:29<12:30:50, 20.44s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0892


  4%|▍         | 101/2304 [34:54<13:19:45, 21.78s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0738


  4%|▍         | 102/2304 [35:20<14:00:32, 22.90s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0752


  4%|▍         | 103/2304 [35:36<12:45:34, 20.87s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0885


  5%|▍         | 104/2304 [35:52<11:50:29, 19.38s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0982


  5%|▍         | 105/2304 [36:08<11:14:37, 18.41s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0754


  5%|▍         | 106/2304 [36:33<12:29:32, 20.46s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0714


  5%|▍         | 107/2304 [36:58<13:15:57, 21.74s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0771


  5%|▍         | 108/2304 [37:23<13:56:01, 22.84s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0700


  5%|▍         | 109/2304 [37:40<12:42:09, 20.83s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0881


  5%|▍         | 110/2304 [37:55<11:46:24, 19.32s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1355


  5%|▍         | 111/2304 [38:12<11:13:34, 18.43s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0899


  5%|▍         | 112/2304 [38:37<12:28:38, 20.49s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0945


  5%|▍         | 113/2304 [39:02<13:15:20, 21.78s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1137


  5%|▍         | 114/2304 [39:27<13:55:32, 22.89s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0901


  5%|▍         | 115/2304 [39:44<12:43:17, 20.92s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0905


  5%|▌         | 116/2304 [39:59<11:46:56, 19.39s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0831


  5%|▌         | 117/2304 [40:16<11:11:23, 18.42s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0747


  5%|▌         | 118/2304 [40:41<12:25:59, 20.48s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0729


  5%|▌         | 119/2304 [41:06<13:12:39, 21.77s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0888


  5%|▌         | 120/2304 [41:31<13:51:41, 22.85s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0820


  5%|▌         | 121/2304 [41:47<12:38:30, 20.85s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0837


  5%|▌         | 122/2304 [42:03<11:43:34, 19.35s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0826


  5%|▌         | 123/2304 [42:19<11:10:20, 18.44s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0797


  5%|▌         | 124/2304 [42:45<12:23:56, 20.48s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0924


  5%|▌         | 125/2304 [43:09<13:11:13, 21.79s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0764


  5%|▌         | 126/2304 [43:35<13:49:52, 22.86s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0655


  6%|▌         | 127/2304 [43:51<12:36:10, 20.84s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0773


  6%|▌         | 128/2304 [44:07<11:40:48, 19.32s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0980


  6%|▌         | 129/2304 [44:23<11:08:10, 18.43s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1004


  6%|▌         | 130/2304 [44:48<12:23:38, 20.52s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0826


  6%|▌         | 131/2304 [45:13<13:09:32, 21.80s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0882


  6%|▌         | 132/2304 [45:39<13:48:44, 22.89s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0845


  6%|▌         | 133/2304 [45:55<12:37:11, 20.93s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0861


  6%|▌         | 134/2304 [46:11<11:43:41, 19.46s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0875


  6%|▌         | 135/2304 [46:27<11:09:03, 18.51s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0775


  6%|▌         | 136/2304 [46:53<12:22:18, 20.54s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0935


  6%|▌         | 137/2304 [47:17<13:09:08, 21.85s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1027


  6%|▌         | 138/2304 [47:43<13:48:17, 22.94s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0860


  6%|▌         | 139/2304 [47:59<12:35:52, 20.95s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0837


  6%|▌         | 140/2304 [48:15<11:41:29, 19.45s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0864


  6%|▌         | 141/2304 [48:31<11:05:28, 18.46s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1079


  6%|▌         | 142/2304 [48:57<12:17:30, 20.47s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0793


  6%|▌         | 143/2304 [49:21<13:04:21, 21.78s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0774


  6%|▋         | 144/2304 [49:47<13:43:02, 22.86s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0975


  6%|▋         | 145/2304 [50:03<12:29:53, 20.84s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1310


  6%|▋         | 146/2304 [50:19<11:34:34, 19.31s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1006


  6%|▋         | 147/2304 [50:35<11:00:31, 18.37s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0942


  6%|▋         | 148/2304 [51:00<12:12:00, 20.37s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0748


  6%|▋         | 149/2304 [51:24<12:56:25, 21.62s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0827


  7%|▋         | 150/2304 [51:50<13:36:29, 22.74s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0841


  7%|▋         | 151/2304 [52:06<12:25:24, 20.77s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0991


  7%|▋         | 152/2304 [52:22<11:30:58, 19.26s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1069


  7%|▋         | 153/2304 [52:38<10:57:44, 18.35s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0930


  7%|▋         | 154/2304 [53:03<12:09:52, 20.37s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0932


  7%|▋         | 155/2304 [53:28<12:54:37, 21.63s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0949


  7%|▋         | 156/2304 [53:53<13:31:29, 22.67s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0837


  7%|▋         | 157/2304 [54:09<12:20:34, 20.70s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0940


  7%|▋         | 158/2304 [54:25<11:28:08, 19.24s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1148


  7%|▋         | 159/2304 [54:41<10:56:26, 18.36s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0848


  7%|▋         | 160/2304 [55:06<12:07:18, 20.35s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0748


  7%|▋         | 161/2304 [55:31<12:54:05, 21.67s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1627


  7%|▋         | 162/2304 [55:56<13:33:46, 22.79s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0946


  7%|▋         | 163/2304 [56:12<12:23:15, 20.83s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1109


  7%|▋         | 164/2304 [56:28<11:28:50, 19.31s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0967


  7%|▋         | 165/2304 [56:44<10:55:30, 18.39s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0803


  7%|▋         | 166/2304 [57:10<12:09:15, 20.47s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0755


  7%|▋         | 167/2304 [57:34<12:54:01, 21.73s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0803


  7%|▋         | 168/2304 [58:00<13:33:37, 22.85s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0693


  7%|▋         | 169/2304 [58:16<12:24:36, 20.93s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0807


  7%|▋         | 170/2304 [58:32<11:31:03, 19.43s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0962


  7%|▋         | 171/2304 [58:48<10:55:52, 18.45s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1114


  7%|▋         | 172/2304 [59:14<12:08:46, 20.51s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0708


  8%|▊         | 173/2304 [59:39<12:57:33, 21.89s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0770


  8%|▊         | 174/2304 [1:00:04<13:38:11, 23.05s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0753


  8%|▊         | 175/2304 [1:00:21<12:26:05, 21.03s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0768


  8%|▊         | 176/2304 [1:00:37<11:30:41, 19.47s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0946


  8%|▊         | 177/2304 [1:00:53<10:55:53, 18.50s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0859


  8%|▊         | 178/2304 [1:01:18<12:08:09, 20.55s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1056


  8%|▊         | 179/2304 [1:01:43<12:53:45, 21.85s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0852


  8%|▊         | 180/2304 [1:02:08<13:30:33, 22.90s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0812


  8%|▊         | 181/2304 [1:02:25<12:18:33, 20.87s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0738


  8%|▊         | 182/2304 [1:02:40<11:25:00, 19.37s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1067


  8%|▊         | 183/2304 [1:02:57<10:51:37, 18.43s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1075


  8%|▊         | 184/2304 [1:03:22<12:04:38, 20.51s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0963


  8%|▊         | 185/2304 [1:03:47<12:50:06, 21.81s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0860


  8%|▊         | 186/2304 [1:04:12<13:27:28, 22.87s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1018


  8%|▊         | 187/2304 [1:04:29<12:17:29, 20.90s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0780


  8%|▊         | 188/2304 [1:04:44<11:23:14, 19.37s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0831


  8%|▊         | 189/2304 [1:05:00<10:48:47, 18.41s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0946


  8%|▊         | 190/2304 [1:05:26<12:01:57, 20.49s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0986


  8%|▊         | 191/2304 [1:05:51<12:49:49, 21.86s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0928


  8%|▊         | 192/2304 [1:06:16<13:26:46, 22.92s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0934


  8%|▊         | 193/2304 [1:06:32<12:15:30, 20.90s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0989


  8%|▊         | 194/2304 [1:06:48<11:20:34, 19.35s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0705


  8%|▊         | 195/2304 [1:07:04<10:45:41, 18.37s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0808


  9%|▊         | 196/2304 [1:07:30<11:59:19, 20.47s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0800


  9%|▊         | 197/2304 [1:07:55<12:44:52, 21.78s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0771


  9%|▊         | 198/2304 [1:08:20<13:23:40, 22.90s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0681


  9%|▊         | 199/2304 [1:08:36<12:13:20, 20.90s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0793


  9%|▊         | 200/2304 [1:08:52<11:21:38, 19.44s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1084


  9%|▊         | 201/2304 [1:09:08<10:46:57, 18.46s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0697


  9%|▉         | 202/2304 [1:09:34<11:58:28, 20.51s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0660


  9%|▉         | 203/2304 [1:09:58<12:42:39, 21.78s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0790


  9%|▉         | 204/2304 [1:10:24<13:20:28, 22.87s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0778


  9%|▉         | 205/2304 [1:10:40<12:09:53, 20.86s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0958


  9%|▉         | 206/2304 [1:10:56<11:15:23, 19.32s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0723


  9%|▉         | 207/2304 [1:11:12<10:42:37, 18.39s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0873


  9%|▉         | 208/2304 [1:11:37<11:56:02, 20.50s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0840


  9%|▉         | 209/2304 [1:12:02<12:41:04, 21.80s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0803


  9%|▉         | 210/2304 [1:12:28<13:20:48, 22.95s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0773


  9%|▉         | 211/2304 [1:12:44<12:12:18, 20.99s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0755


  9%|▉         | 212/2304 [1:13:00<11:18:14, 19.45s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0857


  9%|▉         | 213/2304 [1:13:16<10:43:51, 18.48s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0859


  9%|▉         | 214/2304 [1:13:42<11:53:09, 20.47s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0897


  9%|▉         | 215/2304 [1:14:06<12:38:42, 21.79s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0903


  9%|▉         | 216/2304 [1:14:32<13:17:58, 22.93s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0796


  9%|▉         | 217/2304 [1:14:48<12:09:43, 20.98s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0837


  9%|▉         | 218/2304 [1:15:04<11:16:35, 19.46s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0834


 10%|▉         | 219/2304 [1:15:20<10:41:47, 18.47s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0939


 10%|▉         | 220/2304 [1:15:46<11:53:37, 20.55s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0818


 10%|▉         | 221/2304 [1:16:11<12:37:08, 21.81s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0760


 10%|▉         | 222/2304 [1:16:36<13:14:42, 22.90s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0782


 10%|▉         | 223/2304 [1:16:52<12:05:16, 20.91s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0889


 10%|▉         | 224/2304 [1:17:08<11:11:51, 19.38s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0756


 10%|▉         | 225/2304 [1:17:24<10:39:51, 18.47s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0897


 10%|▉         | 226/2304 [1:17:50<11:51:58, 20.56s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0795


 10%|▉         | 227/2304 [1:18:15<12:38:36, 21.91s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0638


 10%|▉         | 228/2304 [1:18:40<13:15:28, 22.99s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0653


 10%|▉         | 229/2304 [1:18:57<12:04:57, 20.96s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0817


 10%|▉         | 230/2304 [1:19:13<11:11:36, 19.43s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0935


 10%|█         | 231/2304 [1:19:29<10:37:35, 18.45s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0792


 10%|█         | 232/2304 [1:19:54<11:49:03, 20.53s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0853


 10%|█         | 233/2304 [1:20:19<12:34:16, 21.85s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0957


 10%|█         | 234/2304 [1:20:45<13:12:56, 22.98s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0814


 10%|█         | 235/2304 [1:21:01<12:03:14, 20.97s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0818


 10%|█         | 236/2304 [1:21:17<11:09:45, 19.43s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0920


 10%|█         | 237/2304 [1:21:33<10:38:55, 18.55s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0961


 10%|█         | 238/2304 [1:21:59<11:50:14, 20.63s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0867


 10%|█         | 239/2304 [1:22:24<12:33:01, 21.88s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0913


 10%|█         | 240/2304 [1:22:49<13:08:25, 22.92s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0783


 10%|█         | 241/2304 [1:23:05<11:58:56, 20.91s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0936


 11%|█         | 242/2304 [1:23:21<11:07:40, 19.43s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0945


 11%|█         | 243/2304 [1:23:37<10:32:43, 18.42s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0867


 11%|█         | 244/2304 [1:24:02<11:41:41, 20.44s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0778


 11%|█         | 245/2304 [1:24:27<12:25:14, 21.72s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0706


 11%|█         | 246/2304 [1:24:53<13:04:59, 22.89s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0658


 11%|█         | 247/2304 [1:25:09<11:55:56, 20.88s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0906


 11%|█         | 248/2304 [1:25:25<11:02:14, 19.33s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0713


 11%|█         | 249/2304 [1:25:41<10:30:22, 18.41s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0815


 11%|█         | 250/2304 [1:26:06<11:40:01, 20.45s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0741


 11%|█         | 251/2304 [1:26:31<12:24:17, 21.75s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0728


 11%|█         | 252/2304 [1:26:56<13:01:30, 22.85s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0658


 11%|█         | 253/2304 [1:27:12<11:51:06, 20.80s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0811


 11%|█         | 254/2304 [1:27:28<10:58:05, 19.26s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0959


 11%|█         | 255/2304 [1:27:44<10:26:45, 18.35s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1041


 11%|█         | 256/2304 [1:28:09<11:37:34, 20.44s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0959


 11%|█         | 257/2304 [1:28:34<12:21:22, 21.73s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0818


 11%|█         | 258/2304 [1:28:59<12:55:22, 22.74s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0851


 11%|█         | 259/2304 [1:29:15<11:48:11, 20.78s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0965


 11%|█▏        | 260/2304 [1:29:31<10:55:58, 19.26s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0842


 11%|█▏        | 261/2304 [1:29:48<10:25:49, 18.38s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0803


 11%|█▏        | 262/2304 [1:30:13<11:34:29, 20.41s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0646


 11%|█▏        | 263/2304 [1:30:37<12:18:23, 21.71s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0712


 11%|█▏        | 264/2304 [1:31:03<12:55:13, 22.80s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0790


 12%|█▏        | 265/2304 [1:31:19<11:47:12, 20.81s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0969


 12%|█▏        | 266/2304 [1:31:35<10:56:13, 19.32s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0865


 12%|█▏        | 267/2304 [1:31:51<10:24:10, 18.39s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0826


 12%|█▏        | 268/2304 [1:32:16<11:34:10, 20.46s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0733


 12%|█▏        | 269/2304 [1:32:41<12:18:18, 21.77s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0740


 12%|█▏        | 270/2304 [1:33:06<12:53:54, 22.83s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0661


 12%|█▏        | 271/2304 [1:33:23<11:45:42, 20.83s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0862


 12%|█▏        | 272/2304 [1:33:39<10:57:25, 19.41s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0708


 12%|█▏        | 273/2304 [1:33:55<10:26:00, 18.49s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0844


 12%|█▏        | 274/2304 [1:34:20<11:34:25, 20.52s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0726


 12%|█▏        | 275/2304 [1:34:45<12:18:25, 21.84s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0755


 12%|█▏        | 276/2304 [1:35:10<12:53:21, 22.88s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0868


 12%|█▏        | 277/2304 [1:35:27<11:45:36, 20.89s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0752


 12%|█▏        | 278/2304 [1:35:43<10:54:08, 19.37s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0975


 12%|█▏        | 279/2304 [1:35:59<10:23:06, 18.46s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0925


 12%|█▏        | 280/2304 [1:36:24<11:33:06, 20.55s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0735


 12%|█▏        | 281/2304 [1:36:49<12:15:42, 21.82s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0826


 12%|█▏        | 282/2304 [1:37:14<12:50:50, 22.87s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0898


 12%|█▏        | 283/2304 [1:37:31<11:42:43, 20.86s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0963


 12%|█▏        | 284/2304 [1:37:46<10:51:50, 19.36s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1090


 12%|█▏        | 285/2304 [1:38:03<10:20:41, 18.45s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0902


 12%|█▏        | 286/2304 [1:38:28<11:28:16, 20.46s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0839


 12%|█▏        | 287/2304 [1:38:53<12:09:43, 21.71s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0865


 12%|█▎        | 288/2304 [1:39:18<12:47:52, 22.85s/it]

Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1019


 13%|█▎        | 289/2304 [1:39:27<10:25:02, 18.61s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0829


 13%|█▎        | 290/2304 [1:39:35<8:42:35, 15.57s/it] 

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0724


 13%|█▎        | 291/2304 [1:39:44<7:32:49, 13.50s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1090


 13%|█▎        | 292/2304 [1:39:57<7:29:40, 13.41s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0808


 13%|█▎        | 293/2304 [1:40:10<7:27:17, 13.35s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0804


 13%|█▎        | 294/2304 [1:40:24<7:27:18, 13.35s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0779


 13%|█▎        | 295/2304 [1:40:32<6:40:10, 11.95s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0762


 13%|█▎        | 296/2304 [1:40:41<6:05:57, 10.93s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0884


 13%|█▎        | 297/2304 [1:40:50<5:43:36, 10.27s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0720


 13%|█▎        | 298/2304 [1:41:03<6:13:24, 11.17s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0682


 13%|█▎        | 299/2304 [1:41:16<6:31:05, 11.70s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0836


 13%|█▎        | 300/2304 [1:41:29<6:47:31, 12.20s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0754


 13%|█▎        | 301/2304 [1:41:38<6:14:06, 11.21s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0771


 13%|█▎        | 302/2304 [1:41:47<5:45:41, 10.36s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0732


 13%|█▎        | 303/2304 [1:41:55<5:27:56,  9.83s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0730


 13%|█▎        | 304/2304 [1:42:08<6:01:05, 10.83s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0907


 13%|█▎        | 305/2304 [1:42:21<6:22:11, 11.47s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0712


 13%|█▎        | 306/2304 [1:42:35<6:41:19, 12.05s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1585


 13%|█▎        | 307/2304 [1:42:43<6:08:14, 11.06s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0777


 13%|█▎        | 308/2304 [1:42:52<5:42:07, 10.28s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0684


 13%|█▎        | 309/2304 [1:43:01<5:26:35,  9.82s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0764


 13%|█▎        | 310/2304 [1:43:14<6:00:50, 10.86s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0776


 13%|█▎        | 311/2304 [1:43:27<6:20:34, 11.46s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0683


 14%|█▎        | 312/2304 [1:43:40<6:38:46, 12.01s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1577


 14%|█▎        | 313/2304 [1:43:49<6:06:48, 11.05s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0943


 14%|█▎        | 314/2304 [1:43:57<5:42:22, 10.32s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0929


 14%|█▎        | 315/2304 [1:44:06<5:28:16,  9.90s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1032


 14%|█▎        | 316/2304 [1:44:20<6:01:50, 10.92s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0751


 14%|█▍        | 317/2304 [1:44:33<6:23:13, 11.57s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0779


 14%|█▍        | 318/2304 [1:44:46<6:41:56, 12.14s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0780


 14%|█▍        | 319/2304 [1:44:55<6:08:03, 11.13s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1020


 14%|█▍        | 320/2304 [1:45:04<5:41:53, 10.34s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0787


 14%|█▍        | 321/2304 [1:45:12<5:26:33,  9.88s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0779


 14%|█▍        | 322/2304 [1:45:26<6:01:06, 10.93s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0772


 14%|█▍        | 323/2304 [1:45:39<6:22:56, 11.60s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0757


 14%|█▍        | 324/2304 [1:45:52<6:40:28, 12.14s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1519


 14%|█▍        | 325/2304 [1:46:01<6:06:04, 11.10s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0814


 14%|█▍        | 326/2304 [1:46:09<5:40:20, 10.32s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0764


 14%|█▍        | 327/2304 [1:46:18<5:26:35,  9.91s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0846


 14%|█▍        | 328/2304 [1:46:32<6:02:23, 11.00s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1010


 14%|█▍        | 329/2304 [1:46:45<6:23:55, 11.66s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0792


 14%|█▍        | 330/2304 [1:46:59<6:41:53, 12.22s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0867


 14%|█▍        | 331/2304 [1:47:07<6:08:14, 11.20s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0799


 14%|█▍        | 332/2304 [1:47:16<5:42:20, 10.42s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0814


 14%|█▍        | 333/2304 [1:47:25<5:26:04,  9.93s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0944


 14%|█▍        | 334/2304 [1:47:38<5:58:46, 10.93s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0734


 15%|█▍        | 335/2304 [1:47:51<6:19:24, 11.56s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0692


 15%|█▍        | 336/2304 [1:48:05<6:38:00, 12.13s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0731


 15%|█▍        | 337/2304 [1:48:13<6:03:02, 11.07s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0889


 15%|█▍        | 338/2304 [1:48:22<5:36:38, 10.27s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1027


 15%|█▍        | 339/2304 [1:48:30<5:20:43,  9.79s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0806


 15%|█▍        | 340/2304 [1:48:44<5:56:01, 10.88s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0758


 15%|█▍        | 341/2304 [1:48:57<6:17:11, 11.53s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0852


 15%|█▍        | 342/2304 [1:49:10<6:34:07, 12.05s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0808


 15%|█▍        | 343/2304 [1:49:19<6:00:37, 11.03s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0746


 15%|█▍        | 344/2304 [1:49:27<5:35:02, 10.26s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0733


 15%|█▍        | 345/2304 [1:49:36<5:19:14,  9.78s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0945


 15%|█▌        | 346/2304 [1:49:49<5:53:10, 10.82s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1045


 15%|█▌        | 347/2304 [1:50:02<6:14:18, 11.48s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0821


 15%|█▌        | 348/2304 [1:50:15<6:31:51, 12.02s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0821


 15%|█▌        | 349/2304 [1:50:24<5:59:15, 11.03s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1225


 15%|█▌        | 350/2304 [1:50:33<5:34:59, 10.29s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0734


 15%|█▌        | 351/2304 [1:50:41<5:19:21,  9.81s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0812


 15%|█▌        | 352/2304 [1:50:55<5:52:41, 10.84s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0793


 15%|█▌        | 353/2304 [1:51:08<6:13:51, 11.50s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0757


 15%|█▌        | 354/2304 [1:51:21<6:30:56, 12.03s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0706


 15%|█▌        | 355/2304 [1:51:30<5:58:10, 11.03s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0953


 15%|█▌        | 356/2304 [1:51:38<5:33:42, 10.28s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0742


 15%|█▌        | 357/2304 [1:51:47<5:18:35,  9.82s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0790


 16%|█▌        | 358/2304 [1:52:00<5:52:40, 10.87s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0841


 16%|█▌        | 359/2304 [1:52:13<6:13:00, 11.51s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0751


 16%|█▌        | 360/2304 [1:52:27<6:31:19, 12.08s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0759


 16%|█▌        | 361/2304 [1:52:35<5:58:26, 11.07s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0819


 16%|█▌        | 362/2304 [1:52:44<5:35:05, 10.35s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0868


 16%|█▌        | 363/2304 [1:52:53<5:21:07,  9.93s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1120


 16%|█▌        | 364/2304 [1:53:06<5:54:28, 10.96s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0897


 16%|█▌        | 365/2304 [1:53:19<6:13:29, 11.56s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0805


 16%|█▌        | 366/2304 [1:53:32<6:29:44, 12.07s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0754


 16%|█▌        | 367/2304 [1:53:41<5:56:57, 11.06s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0911


 16%|█▌        | 368/2304 [1:53:50<5:32:13, 10.30s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0778


 16%|█▌        | 369/2304 [1:53:58<5:17:22,  9.84s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0760


 16%|█▌        | 370/2304 [1:54:12<5:50:19, 10.87s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0897


 16%|█▌        | 371/2304 [1:54:25<6:10:51, 11.51s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0737


 16%|█▌        | 372/2304 [1:54:38<6:28:32, 12.07s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0780


 16%|█▌        | 373/2304 [1:54:47<5:57:07, 11.10s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1048


 16%|█▌        | 374/2304 [1:54:56<5:32:55, 10.35s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0800


 16%|█▋        | 375/2304 [1:55:04<5:17:55,  9.89s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0833


 16%|█▋        | 376/2304 [1:55:18<5:52:19, 10.96s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0774


 16%|█▋        | 377/2304 [1:55:31<6:11:16, 11.56s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0762


 16%|█▋        | 378/2304 [1:55:44<6:29:19, 12.13s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0795


 16%|█▋        | 379/2304 [1:55:53<5:58:16, 11.17s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0968


 16%|█▋        | 380/2304 [1:56:02<5:33:20, 10.40s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0702


 17%|█▋        | 381/2304 [1:56:11<5:18:18,  9.93s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1084


 17%|█▋        | 382/2304 [1:56:24<5:51:19, 10.97s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0827


 17%|█▋        | 383/2304 [1:56:37<6:10:26, 11.57s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0767


 17%|█▋        | 384/2304 [1:56:50<6:26:59, 12.09s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0763


 17%|█▋        | 385/2304 [1:56:59<5:54:44, 11.09s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0647


 17%|█▋        | 386/2304 [1:57:08<5:30:50, 10.35s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0805


 17%|█▋        | 387/2304 [1:57:16<5:15:05,  9.86s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0766


 17%|█▋        | 388/2304 [1:57:30<5:46:30, 10.85s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1050


 17%|█▋        | 389/2304 [1:57:43<6:07:37, 11.52s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0705


 17%|█▋        | 390/2304 [1:57:56<6:26:36, 12.12s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0825


 17%|█▋        | 391/2304 [1:58:06<6:01:05, 11.33s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0793


 17%|█▋        | 392/2304 [1:58:15<5:43:29, 10.78s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0868


 17%|█▋        | 393/2304 [1:58:25<5:33:14, 10.46s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0849


 17%|█▋        | 394/2304 [1:58:40<6:15:12, 11.79s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0927


 17%|█▋        | 395/2304 [1:58:54<6:38:30, 12.53s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0650


 17%|█▋        | 396/2304 [1:59:09<6:59:02, 13.18s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0660


 17%|█▋        | 397/2304 [1:59:18<6:23:24, 12.06s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0871


 17%|█▋        | 398/2304 [1:59:27<5:52:55, 11.11s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0753


 17%|█▋        | 399/2304 [1:59:36<5:29:45, 10.39s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0998


 17%|█▋        | 400/2304 [1:59:49<5:57:32, 11.27s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0771


 17%|█▋        | 401/2304 [2:00:02<6:14:31, 11.81s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1081


 17%|█▋        | 402/2304 [2:00:16<6:30:23, 12.31s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0731


 17%|█▋        | 403/2304 [2:00:24<5:55:45, 11.23s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0777


 18%|█▊        | 404/2304 [2:00:33<5:30:25, 10.43s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0811


 18%|█▊        | 405/2304 [2:00:42<5:14:55,  9.95s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0982


 18%|█▊        | 406/2304 [2:00:55<5:47:03, 10.97s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0760


 18%|█▊        | 407/2304 [2:01:08<6:06:08, 11.58s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0830


 18%|█▊        | 408/2304 [2:01:21<6:22:58, 12.12s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0728


 18%|█▊        | 409/2304 [2:01:30<5:51:24, 11.13s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0804


 18%|█▊        | 410/2304 [2:01:39<5:27:15, 10.37s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1005


 18%|█▊        | 411/2304 [2:01:48<5:11:40,  9.88s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0873


 18%|█▊        | 412/2304 [2:02:01<5:44:03, 10.91s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0839


 18%|█▊        | 413/2304 [2:02:14<6:06:27, 11.63s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0745


 18%|█▊        | 414/2304 [2:02:28<6:25:03, 12.22s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0957


 18%|█▊        | 415/2304 [2:02:37<5:52:31, 11.20s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0788


 18%|█▊        | 416/2304 [2:02:45<5:29:27, 10.47s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0877


 18%|█▊        | 417/2304 [2:02:54<5:13:34,  9.97s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1178


 18%|█▊        | 418/2304 [2:03:07<5:44:44, 10.97s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0798


 18%|█▊        | 419/2304 [2:03:21<6:06:15, 11.66s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0813


 18%|█▊        | 420/2304 [2:03:34<6:22:56, 12.20s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0753


 18%|█▊        | 421/2304 [2:03:43<5:50:42, 11.17s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1095


 18%|█▊        | 422/2304 [2:03:52<5:26:29, 10.41s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0837


 18%|█▊        | 423/2304 [2:04:00<5:11:01,  9.92s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0938


 18%|█▊        | 424/2304 [2:04:14<5:43:05, 10.95s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0896


 18%|█▊        | 425/2304 [2:04:27<6:03:19, 11.60s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0985


 18%|█▊        | 426/2304 [2:04:40<6:21:01, 12.17s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0833


 19%|█▊        | 427/2304 [2:04:49<5:49:18, 11.17s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1235


 19%|█▊        | 428/2304 [2:04:58<5:24:00, 10.36s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0718


 19%|█▊        | 429/2304 [2:05:06<5:08:33,  9.87s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0855


 19%|█▊        | 430/2304 [2:05:20<5:40:45, 10.91s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1116


 19%|█▊        | 431/2304 [2:05:33<6:00:23, 11.54s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0723


 19%|█▉        | 432/2304 [2:05:46<6:19:27, 12.16s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0776


 19%|█▉        | 433/2304 [2:05:55<5:47:30, 11.14s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0939


 19%|█▉        | 434/2304 [2:06:04<5:23:01, 10.36s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0804


 19%|█▉        | 435/2304 [2:06:12<5:07:39,  9.88s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0900


 19%|█▉        | 436/2304 [2:06:26<5:42:01, 10.99s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0792


 19%|█▉        | 437/2304 [2:06:39<6:02:22, 11.65s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0841


 19%|█▉        | 438/2304 [2:06:53<6:18:43, 12.18s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0854


 19%|█▉        | 439/2304 [2:07:01<5:45:01, 11.10s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0835


 19%|█▉        | 440/2304 [2:07:10<5:20:52, 10.33s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0932


 19%|█▉        | 441/2304 [2:07:19<5:06:49,  9.88s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0779


 19%|█▉        | 442/2304 [2:07:32<5:40:03, 10.96s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1145


 19%|█▉        | 443/2304 [2:07:45<5:59:32, 11.59s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0875


 19%|█▉        | 444/2304 [2:07:59<6:16:15, 12.14s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0903


 19%|█▉        | 445/2304 [2:08:07<5:44:28, 11.12s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0888


 19%|█▉        | 446/2304 [2:08:16<5:20:13, 10.34s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0849


 19%|█▉        | 447/2304 [2:08:25<5:05:25,  9.87s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0862


 19%|█▉        | 448/2304 [2:08:38<5:37:54, 10.92s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1306


 19%|█▉        | 449/2304 [2:08:51<5:57:21, 11.56s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0780


 20%|█▉        | 450/2304 [2:09:05<6:15:36, 12.16s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1046


 20%|█▉        | 451/2304 [2:09:13<5:43:37, 11.13s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0878


 20%|█▉        | 452/2304 [2:09:22<5:20:22, 10.38s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0789


 20%|█▉        | 453/2304 [2:09:31<5:05:06,  9.89s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0698


 20%|█▉        | 454/2304 [2:09:44<5:37:31, 10.95s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0731


 20%|█▉        | 455/2304 [2:09:57<5:56:25, 11.57s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0790


 20%|█▉        | 456/2304 [2:10:10<6:12:47, 12.10s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0786


 20%|█▉        | 457/2304 [2:10:19<5:42:25, 11.12s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0946


 20%|█▉        | 458/2304 [2:10:28<5:17:31, 10.32s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0800


 20%|█▉        | 459/2304 [2:10:36<5:01:42,  9.81s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0847


 20%|█▉        | 460/2304 [2:10:50<5:33:23, 10.85s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0714


 20%|██        | 461/2304 [2:11:03<5:54:01, 11.53s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0664


 20%|██        | 462/2304 [2:11:16<6:09:51, 12.05s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0723


 20%|██        | 463/2304 [2:11:25<5:39:23, 11.06s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0902


 20%|██        | 464/2304 [2:11:33<5:16:20, 10.32s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0742


 20%|██        | 465/2304 [2:11:42<5:02:14,  9.86s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0775


 20%|██        | 466/2304 [2:11:55<5:33:26, 10.89s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0872


 20%|██        | 467/2304 [2:12:08<5:53:32, 11.55s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0702


 20%|██        | 468/2304 [2:12:22<6:10:51, 12.12s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0775


 20%|██        | 469/2304 [2:12:31<5:40:07, 11.12s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0879


 20%|██        | 470/2304 [2:12:39<5:16:36, 10.36s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0811


 20%|██        | 471/2304 [2:12:48<5:01:45,  9.88s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0766


 20%|██        | 472/2304 [2:13:01<5:33:06, 10.91s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0834


 21%|██        | 473/2304 [2:13:14<5:52:33, 11.55s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0874


 21%|██        | 474/2304 [2:13:28<6:08:41, 12.09s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0757


 21%|██        | 475/2304 [2:13:37<5:38:14, 11.10s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0766


 21%|██        | 476/2304 [2:13:45<5:14:35, 10.33s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0837


 21%|██        | 477/2304 [2:13:54<4:59:26,  9.83s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0940


 21%|██        | 478/2304 [2:14:07<5:32:48, 10.94s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0758


 21%|██        | 479/2304 [2:14:21<5:55:17, 11.68s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0799


 21%|██        | 480/2304 [2:14:34<6:09:46, 12.16s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0748


 21%|██        | 481/2304 [2:14:43<5:38:06, 11.13s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0726


 21%|██        | 482/2304 [2:14:51<5:13:43, 10.33s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0656


 21%|██        | 483/2304 [2:15:00<4:58:30,  9.84s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0798


 21%|██        | 484/2304 [2:15:13<5:29:37, 10.87s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0677


 21%|██        | 485/2304 [2:15:26<5:49:28, 11.53s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0842


 21%|██        | 486/2304 [2:15:39<6:04:56, 12.04s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0693


 21%|██        | 487/2304 [2:15:48<5:33:56, 11.03s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0706


 21%|██        | 488/2304 [2:15:57<5:10:39, 10.26s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0740


 21%|██        | 489/2304 [2:16:05<4:55:52,  9.78s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0690


 21%|██▏       | 490/2304 [2:16:18<5:25:45, 10.77s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0699


 21%|██▏       | 491/2304 [2:16:31<5:45:49, 11.44s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0641


 21%|██▏       | 492/2304 [2:16:44<6:00:46, 11.95s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0749


 21%|██▏       | 493/2304 [2:16:53<5:30:39, 10.95s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0777


 21%|██▏       | 494/2304 [2:17:02<5:08:02, 10.21s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0824


 21%|██▏       | 495/2304 [2:17:10<4:53:47,  9.74s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0882


 22%|██▏       | 496/2304 [2:17:23<5:23:20, 10.73s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0648


 22%|██▏       | 497/2304 [2:17:36<5:42:09, 11.36s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0806


 22%|██▏       | 498/2304 [2:17:49<5:58:26, 11.91s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0810


 22%|██▏       | 499/2304 [2:17:58<5:29:08, 10.94s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0832


 22%|██▏       | 500/2304 [2:18:06<5:06:22, 10.19s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0785


 22%|██▏       | 501/2304 [2:18:15<4:52:58,  9.75s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0751


 22%|██▏       | 502/2304 [2:18:28<5:23:15, 10.76s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0820


 22%|██▏       | 503/2304 [2:18:41<5:43:54, 11.46s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0761


 22%|██▏       | 504/2304 [2:18:55<5:59:55, 12.00s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0755


 22%|██▏       | 505/2304 [2:19:03<5:30:36, 11.03s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0746


 22%|██▏       | 506/2304 [2:19:12<5:07:28, 10.26s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0675


 22%|██▏       | 507/2304 [2:19:20<4:52:35,  9.77s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0691


 22%|██▏       | 508/2304 [2:19:34<5:23:04, 10.79s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0658


 22%|██▏       | 509/2304 [2:19:46<5:41:07, 11.40s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0739


 22%|██▏       | 510/2304 [2:20:00<5:56:47, 11.93s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0865


 22%|██▏       | 511/2304 [2:20:08<5:28:13, 10.98s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0789


 22%|██▏       | 512/2304 [2:20:17<5:05:33, 10.23s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0669


 22%|██▏       | 513/2304 [2:20:25<4:50:59,  9.75s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0742


 22%|██▏       | 514/2304 [2:20:39<5:22:05, 10.80s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0668


 22%|██▏       | 515/2304 [2:20:52<5:41:45, 11.46s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0690


 22%|██▏       | 516/2304 [2:21:05<5:57:20, 11.99s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0654


 22%|██▏       | 517/2304 [2:21:14<5:27:17, 10.99s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0873


 22%|██▏       | 518/2304 [2:21:22<5:04:39, 10.23s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0743


 23%|██▎       | 519/2304 [2:21:31<4:50:17,  9.76s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1232


 23%|██▎       | 520/2304 [2:21:44<5:21:29, 10.81s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0722


 23%|██▎       | 521/2304 [2:21:57<5:42:20, 11.52s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0790


 23%|██▎       | 522/2304 [2:22:11<5:58:57, 12.09s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0708


 23%|██▎       | 523/2304 [2:22:19<5:27:39, 11.04s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1035


 23%|██▎       | 524/2304 [2:22:28<5:04:36, 10.27s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1039


 23%|██▎       | 525/2304 [2:22:36<4:50:00,  9.78s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0740


 23%|██▎       | 526/2304 [2:22:49<5:19:36, 10.79s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0792


 23%|██▎       | 527/2304 [2:23:03<5:40:29, 11.50s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0845


 23%|██▎       | 528/2304 [2:23:16<5:55:54, 12.02s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0793


 23%|██▎       | 529/2304 [2:23:24<5:26:06, 11.02s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0760


 23%|██▎       | 530/2304 [2:23:33<5:03:45, 10.27s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0835


 23%|██▎       | 531/2304 [2:23:42<4:49:22,  9.79s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0719


 23%|██▎       | 532/2304 [2:23:55<5:20:48, 10.86s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0740


 23%|██▎       | 533/2304 [2:24:08<5:39:53, 11.52s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0758


 23%|██▎       | 534/2304 [2:24:22<5:56:54, 12.10s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0658


 23%|██▎       | 535/2304 [2:24:30<5:26:43, 11.08s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0769


 23%|██▎       | 536/2304 [2:24:39<5:04:47, 10.34s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0869


 23%|██▎       | 537/2304 [2:24:48<4:51:31,  9.90s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0962


 23%|██▎       | 538/2304 [2:25:01<5:22:23, 10.95s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0679


 23%|██▎       | 539/2304 [2:25:14<5:40:22, 11.57s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0649


 23%|██▎       | 540/2304 [2:25:27<5:55:00, 12.08s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0661


 23%|██▎       | 541/2304 [2:25:36<5:24:52, 11.06s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0758


 24%|██▎       | 542/2304 [2:25:45<5:01:36, 10.27s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0904


 24%|██▎       | 543/2304 [2:25:53<4:46:06,  9.75s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0873


 24%|██▎       | 544/2304 [2:26:06<5:15:32, 10.76s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0727


 24%|██▎       | 545/2304 [2:26:19<5:34:35, 11.41s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0826


 24%|██▎       | 546/2304 [2:26:32<5:51:15, 11.99s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0776


 24%|██▎       | 547/2304 [2:26:41<5:21:09, 10.97s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0888


 24%|██▍       | 548/2304 [2:26:50<4:59:25, 10.23s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0857


 24%|██▍       | 549/2304 [2:26:58<4:45:40,  9.77s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0816


 24%|██▍       | 550/2304 [2:27:11<5:14:54, 10.77s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0847


 24%|██▍       | 551/2304 [2:27:24<5:34:18, 11.44s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0743


 24%|██▍       | 552/2304 [2:27:38<5:50:51, 12.02s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0740


 24%|██▍       | 553/2304 [2:27:47<5:22:29, 11.05s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0776


 24%|██▍       | 554/2304 [2:27:55<5:00:18, 10.30s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0756


 24%|██▍       | 555/2304 [2:28:04<4:45:50,  9.81s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0990


 24%|██▍       | 556/2304 [2:28:17<5:15:33, 10.83s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0658


 24%|██▍       | 557/2304 [2:28:30<5:32:52, 11.43s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0767


 24%|██▍       | 558/2304 [2:28:43<5:48:23, 11.97s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0748


 24%|██▍       | 559/2304 [2:28:52<5:19:56, 11.00s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0931


 24%|██▍       | 560/2304 [2:29:00<4:57:56, 10.25s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0684


 24%|██▍       | 561/2304 [2:29:09<4:44:24,  9.79s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0819


 24%|██▍       | 562/2304 [2:29:22<5:15:04, 10.85s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0780


 24%|██▍       | 563/2304 [2:29:35<5:34:18, 11.52s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0662


 24%|██▍       | 564/2304 [2:29:49<5:51:11, 12.11s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0688


 25%|██▍       | 565/2304 [2:29:58<5:20:59, 11.08s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0844


 25%|██▍       | 566/2304 [2:30:06<4:58:04, 10.29s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0953


 25%|██▍       | 567/2304 [2:30:15<4:44:07,  9.81s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0872


 25%|██▍       | 568/2304 [2:30:28<5:13:31, 10.84s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0801


 25%|██▍       | 569/2304 [2:30:41<5:31:35, 11.47s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0891


 25%|██▍       | 570/2304 [2:30:54<5:47:24, 12.02s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1184


 25%|██▍       | 571/2304 [2:31:03<5:18:00, 11.01s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0783


 25%|██▍       | 572/2304 [2:31:11<4:56:10, 10.26s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0729


 25%|██▍       | 573/2304 [2:31:20<4:42:58,  9.81s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0841


 25%|██▍       | 574/2304 [2:31:33<5:11:42, 10.81s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0900


 25%|██▍       | 575/2304 [2:31:46<5:29:40, 11.44s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0763


 25%|██▌       | 576/2304 [2:31:59<5:44:51, 11.97s/it]

Config: {'activation': 'relu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0938


 25%|██▌       | 577/2304 [2:32:04<4:43:22,  9.85s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0856


 25%|██▌       | 578/2304 [2:32:09<3:59:41,  8.33s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0762


 25%|██▌       | 579/2304 [2:32:14<3:31:45,  7.37s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1266


 25%|██▌       | 580/2304 [2:32:21<3:30:05,  7.31s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0794


 25%|██▌       | 581/2304 [2:32:29<3:30:06,  7.32s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0695


 25%|██▌       | 582/2304 [2:32:36<3:29:39,  7.31s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1058


 25%|██▌       | 583/2304 [2:32:41<3:09:03,  6.59s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0810


 25%|██▌       | 584/2304 [2:32:46<2:55:24,  6.12s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0922


 25%|██▌       | 585/2304 [2:32:51<2:45:11,  5.77s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0750


 25%|██▌       | 586/2304 [2:32:58<2:59:12,  6.26s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1571


 25%|██▌       | 587/2304 [2:33:05<3:06:07,  6.50s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0759


 26%|██▌       | 588/2304 [2:33:13<3:12:58,  6.75s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1018


 26%|██▌       | 589/2304 [2:33:18<2:58:24,  6.24s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0852


 26%|██▌       | 590/2304 [2:33:23<2:46:59,  5.85s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0708


 26%|██▌       | 591/2304 [2:33:27<2:38:23,  5.55s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0759


 26%|██▌       | 592/2304 [2:33:35<2:54:04,  6.10s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0967


 26%|██▌       | 593/2304 [2:33:42<3:01:38,  6.37s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0705


 26%|██▌       | 594/2304 [2:33:49<3:10:44,  6.69s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0785


 26%|██▌       | 595/2304 [2:33:54<2:55:47,  6.17s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0912


 26%|██▌       | 596/2304 [2:33:59<2:44:49,  5.79s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0735


 26%|██▌       | 597/2304 [2:34:04<2:38:29,  5.57s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0747


 26%|██▌       | 598/2304 [2:34:12<2:53:20,  6.10s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0717


 26%|██▌       | 599/2304 [2:34:19<3:01:40,  6.39s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0763


 26%|██▌       | 600/2304 [2:34:26<3:11:17,  6.74s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0822


 26%|██▌       | 601/2304 [2:34:31<2:56:50,  6.23s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0814


 26%|██▌       | 602/2304 [2:34:36<2:46:58,  5.89s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0814


 26%|██▌       | 603/2304 [2:34:41<2:38:48,  5.60s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0856


 26%|██▌       | 604/2304 [2:34:48<2:53:02,  6.11s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0956


 26%|██▋       | 605/2304 [2:34:56<3:02:17,  6.44s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0714


 26%|██▋       | 606/2304 [2:35:03<3:09:15,  6.69s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1052


 26%|██▋       | 607/2304 [2:35:08<2:55:43,  6.21s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0914


 26%|██▋       | 608/2304 [2:35:13<2:44:16,  5.81s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0837


 26%|██▋       | 609/2304 [2:35:18<2:36:48,  5.55s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0867


 26%|██▋       | 610/2304 [2:35:25<2:52:53,  6.12s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1461


 27%|██▋       | 611/2304 [2:35:33<3:01:40,  6.44s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0740


 27%|██▋       | 612/2304 [2:35:40<3:08:14,  6.68s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0894


 27%|██▋       | 613/2304 [2:35:45<2:54:37,  6.20s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0784


 27%|██▋       | 614/2304 [2:35:50<2:42:47,  5.78s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0746


 27%|██▋       | 615/2304 [2:35:55<2:36:24,  5.56s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0755


 27%|██▋       | 616/2304 [2:36:02<2:50:30,  6.06s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0725


 27%|██▋       | 617/2304 [2:36:09<3:00:01,  6.40s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0778


 27%|██▋       | 618/2304 [2:36:17<3:09:06,  6.73s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1276


 27%|██▋       | 619/2304 [2:36:22<2:54:12,  6.20s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0691


 27%|██▋       | 620/2304 [2:36:26<2:43:07,  5.81s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0732


 27%|██▋       | 621/2304 [2:36:31<2:36:23,  5.58s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0772


 27%|██▋       | 622/2304 [2:36:39<2:50:46,  6.09s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0793


 27%|██▋       | 623/2304 [2:36:46<3:01:20,  6.47s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0779


 27%|██▋       | 624/2304 [2:36:53<3:08:00,  6.71s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1117


 27%|██▋       | 625/2304 [2:36:58<2:52:51,  6.18s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1065


 27%|██▋       | 626/2304 [2:37:03<2:43:38,  5.85s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0807


 27%|██▋       | 627/2304 [2:37:08<2:36:08,  5.59s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0781


 27%|██▋       | 628/2304 [2:37:16<2:53:26,  6.21s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0814


 27%|██▋       | 629/2304 [2:37:23<3:01:22,  6.50s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0645


 27%|██▋       | 630/2304 [2:37:31<3:09:02,  6.78s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1075


 27%|██▋       | 631/2304 [2:37:36<2:54:42,  6.27s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0779


 27%|██▋       | 632/2304 [2:37:41<2:42:30,  5.83s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0815


 27%|██▋       | 633/2304 [2:37:45<2:34:35,  5.55s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0857


 28%|██▊       | 634/2304 [2:37:53<2:51:09,  6.15s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0915


 28%|██▊       | 635/2304 [2:38:00<2:59:38,  6.46s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0656


 28%|██▊       | 636/2304 [2:38:08<3:08:22,  6.78s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0861


 28%|██▊       | 637/2304 [2:38:13<2:53:45,  6.25s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0744


 28%|██▊       | 638/2304 [2:38:18<2:41:51,  5.83s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0719


 28%|██▊       | 639/2304 [2:38:23<2:36:06,  5.63s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0732


 28%|██▊       | 640/2304 [2:38:30<2:49:57,  6.13s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0797


 28%|██▊       | 641/2304 [2:38:37<2:59:18,  6.47s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0915


 28%|██▊       | 642/2304 [2:38:45<3:06:53,  6.75s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0900


 28%|██▊       | 643/2304 [2:38:50<2:52:03,  6.22s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0737


 28%|██▊       | 644/2304 [2:38:55<2:42:02,  5.86s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0735


 28%|██▊       | 645/2304 [2:39:00<2:34:44,  5.60s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0787


 28%|██▊       | 646/2304 [2:39:07<2:48:18,  6.09s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0854


 28%|██▊       | 647/2304 [2:39:14<2:58:45,  6.47s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0711


 28%|██▊       | 648/2304 [2:39:22<3:05:50,  6.73s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0710


 28%|██▊       | 649/2304 [2:39:27<2:52:51,  6.27s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0798


 28%|██▊       | 650/2304 [2:39:32<2:40:53,  5.84s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0900


 28%|██▊       | 651/2304 [2:39:37<2:34:15,  5.60s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0888


 28%|██▊       | 652/2304 [2:39:44<2:50:22,  6.19s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0973


 28%|██▊       | 653/2304 [2:39:52<2:59:13,  6.51s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0655


 28%|██▊       | 654/2304 [2:39:59<3:07:05,  6.80s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0918


 28%|██▊       | 655/2304 [2:40:04<2:53:09,  6.30s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0716


 28%|██▊       | 656/2304 [2:40:09<2:40:53,  5.86s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0762


 29%|██▊       | 657/2304 [2:40:14<2:35:15,  5.66s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1039


 29%|██▊       | 658/2304 [2:40:21<2:48:43,  6.15s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0908


 29%|██▊       | 659/2304 [2:40:29<2:57:07,  6.46s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0714


 29%|██▊       | 660/2304 [2:40:36<3:06:02,  6.79s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0858


 29%|██▊       | 661/2304 [2:40:41<2:50:53,  6.24s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0809


 29%|██▊       | 662/2304 [2:40:46<2:39:07,  5.81s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0756


 29%|██▉       | 663/2304 [2:40:51<2:34:01,  5.63s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0742


 29%|██▉       | 664/2304 [2:40:58<2:47:25,  6.13s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0869


 29%|██▉       | 665/2304 [2:41:06<2:56:50,  6.47s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0757


 29%|██▉       | 666/2304 [2:41:13<3:03:41,  6.73s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1091


 29%|██▉       | 667/2304 [2:41:18<2:48:59,  6.19s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0720


 29%|██▉       | 668/2304 [2:41:23<2:40:15,  5.88s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0754


 29%|██▉       | 669/2304 [2:41:28<2:33:09,  5.62s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0702


 29%|██▉       | 670/2304 [2:41:36<2:47:43,  6.16s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0970


 29%|██▉       | 671/2304 [2:41:43<2:55:40,  6.45s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0692


 29%|██▉       | 672/2304 [2:41:50<3:03:03,  6.73s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0736


 29%|██▉       | 673/2304 [2:41:55<2:49:46,  6.25s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0971


 29%|██▉       | 674/2304 [2:42:00<2:38:40,  5.84s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0835


 29%|██▉       | 675/2304 [2:42:05<2:31:36,  5.58s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1161


 29%|██▉       | 676/2304 [2:42:13<2:47:43,  6.18s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0772


 29%|██▉       | 677/2304 [2:42:20<2:55:33,  6.47s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0770


 29%|██▉       | 678/2304 [2:42:27<3:03:36,  6.78s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0743


 29%|██▉       | 679/2304 [2:42:32<2:48:28,  6.22s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0791


 30%|██▉       | 680/2304 [2:42:37<2:37:20,  5.81s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0817


 30%|██▉       | 681/2304 [2:42:42<2:31:31,  5.60s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0933


 30%|██▉       | 682/2304 [2:42:49<2:44:59,  6.10s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0801


 30%|██▉       | 683/2304 [2:42:57<2:52:55,  6.40s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1011


 30%|██▉       | 684/2304 [2:43:04<3:01:36,  6.73s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0907


 30%|██▉       | 685/2304 [2:43:09<2:46:45,  6.18s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0826


 30%|██▉       | 686/2304 [2:43:14<2:37:13,  5.83s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0838


 30%|██▉       | 687/2304 [2:43:19<2:30:05,  5.57s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0790


 30%|██▉       | 688/2304 [2:43:26<2:44:09,  6.09s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1174


 30%|██▉       | 689/2304 [2:43:34<2:53:37,  6.45s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0795


 30%|██▉       | 690/2304 [2:43:41<3:00:59,  6.73s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1038


 30%|██▉       | 691/2304 [2:43:46<2:47:36,  6.23s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0810


 30%|███       | 692/2304 [2:43:51<2:37:28,  5.86s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0817


 30%|███       | 693/2304 [2:43:56<2:29:44,  5.58s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0717


 30%|███       | 694/2304 [2:44:03<2:44:59,  6.15s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0835


 30%|███       | 695/2304 [2:44:10<2:51:53,  6.41s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0723


 30%|███       | 696/2304 [2:44:18<2:59:09,  6.69s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1078


 30%|███       | 697/2304 [2:44:23<2:46:22,  6.21s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1116


 30%|███       | 698/2304 [2:44:28<2:35:28,  5.81s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0826


 30%|███       | 699/2304 [2:44:33<2:29:56,  5.61s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0940


 30%|███       | 700/2304 [2:44:40<2:44:43,  6.16s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0868


 30%|███       | 701/2304 [2:44:47<2:52:51,  6.47s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0772


 30%|███       | 702/2304 [2:44:55<3:00:36,  6.76s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0953


 31%|███       | 703/2304 [2:45:00<2:45:57,  6.22s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0955


 31%|███       | 704/2304 [2:45:05<2:35:55,  5.85s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0746


 31%|███       | 705/2304 [2:45:10<2:29:48,  5.62s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1061


 31%|███       | 706/2304 [2:45:17<2:42:49,  6.11s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0968


 31%|███       | 707/2304 [2:45:25<2:52:08,  6.47s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0949


 31%|███       | 708/2304 [2:45:32<2:58:27,  6.71s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0860


 31%|███       | 709/2304 [2:45:37<2:44:17,  6.18s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0835


 31%|███       | 710/2304 [2:45:42<2:35:37,  5.86s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0744


 31%|███       | 711/2304 [2:45:47<2:28:43,  5.60s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0765


 31%|███       | 712/2304 [2:45:54<2:43:54,  6.18s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0864


 31%|███       | 713/2304 [2:46:01<2:51:21,  6.46s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0749


 31%|███       | 714/2304 [2:46:09<2:57:20,  6.69s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0867


 31%|███       | 715/2304 [2:46:14<2:44:46,  6.22s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0734


 31%|███       | 716/2304 [2:46:19<2:34:14,  5.83s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0789


 31%|███       | 717/2304 [2:46:24<2:27:11,  5.56s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1189


 31%|███       | 718/2304 [2:46:31<2:42:08,  6.13s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1031


 31%|███       | 719/2304 [2:46:38<2:50:06,  6.44s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0772


 31%|███▏      | 720/2304 [2:46:46<2:57:48,  6.74s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1057


 31%|███▏      | 721/2304 [2:46:51<2:43:52,  6.21s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0745


 31%|███▏      | 722/2304 [2:46:56<2:32:54,  5.80s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0768


 31%|███▏      | 723/2304 [2:47:01<2:27:12,  5.59s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0877


 31%|███▏      | 724/2304 [2:47:08<2:40:50,  6.11s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0887


 31%|███▏      | 725/2304 [2:47:15<2:49:20,  6.43s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0660


 32%|███▏      | 726/2304 [2:47:23<2:57:52,  6.76s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0787


 32%|███▏      | 727/2304 [2:47:28<2:43:54,  6.24s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1030


 32%|███▏      | 728/2304 [2:47:33<2:34:37,  5.89s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0945


 32%|███▏      | 729/2304 [2:47:38<2:27:36,  5.62s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0912


 32%|███▏      | 730/2304 [2:47:45<2:41:53,  6.17s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0899


 32%|███▏      | 731/2304 [2:47:53<2:51:35,  6.55s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0658


 32%|███▏      | 732/2304 [2:48:00<2:57:26,  6.77s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0898


 32%|███▏      | 733/2304 [2:48:05<2:44:21,  6.28s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0787


 32%|███▏      | 734/2304 [2:48:10<2:32:59,  5.85s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0806


 32%|███▏      | 735/2304 [2:48:15<2:25:59,  5.58s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0871


 32%|███▏      | 736/2304 [2:48:22<2:40:32,  6.14s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0928


 32%|███▏      | 737/2304 [2:48:29<2:48:05,  6.44s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0805


 32%|███▏      | 738/2304 [2:48:37<2:54:29,  6.69s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1091


 32%|███▏      | 739/2304 [2:48:42<2:41:44,  6.20s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0739


 32%|███▏      | 740/2304 [2:48:47<2:31:05,  5.80s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0759


 32%|███▏      | 741/2304 [2:48:52<2:25:48,  5.60s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0791


 32%|███▏      | 742/2304 [2:48:59<2:38:50,  6.10s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0809


 32%|███▏      | 743/2304 [2:49:06<2:46:32,  6.40s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0699


 32%|███▏      | 744/2304 [2:49:14<2:55:00,  6.73s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0864


 32%|███▏      | 745/2304 [2:49:19<2:41:49,  6.23s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1121


 32%|███▏      | 746/2304 [2:49:24<2:31:22,  5.83s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0757


 32%|███▏      | 747/2304 [2:49:29<2:25:55,  5.62s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0916


 32%|███▏      | 748/2304 [2:49:36<2:39:23,  6.15s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0835


 33%|███▎      | 749/2304 [2:49:43<2:48:46,  6.51s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0655


 33%|███▎      | 750/2304 [2:49:51<2:55:39,  6.78s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0984


 33%|███▎      | 751/2304 [2:49:56<2:42:01,  6.26s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1024


 33%|███▎      | 752/2304 [2:50:01<2:32:08,  5.88s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0885


 33%|███▎      | 753/2304 [2:50:06<2:25:37,  5.63s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0867


 33%|███▎      | 754/2304 [2:50:14<2:40:16,  6.20s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0965


 33%|███▎      | 755/2304 [2:50:21<2:47:23,  6.48s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0662


 33%|███▎      | 756/2304 [2:50:28<2:53:38,  6.73s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1009


 33%|███▎      | 757/2304 [2:50:33<2:40:56,  6.24s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0878


 33%|███▎      | 758/2304 [2:50:38<2:30:20,  5.83s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0793


 33%|███▎      | 759/2304 [2:50:43<2:23:42,  5.58s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0898


 33%|███▎      | 760/2304 [2:50:51<2:38:52,  6.17s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0998


 33%|███▎      | 761/2304 [2:50:58<2:46:45,  6.48s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0797


 33%|███▎      | 762/2304 [2:51:05<2:54:35,  6.79s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0995


 33%|███▎      | 763/2304 [2:51:10<2:39:57,  6.23s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0807


 33%|███▎      | 764/2304 [2:51:15<2:29:42,  5.83s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0789


 33%|███▎      | 765/2304 [2:51:20<2:24:25,  5.63s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0887


 33%|███▎      | 766/2304 [2:51:28<2:38:06,  6.17s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1013


 33%|███▎      | 767/2304 [2:51:35<2:46:17,  6.49s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0892


 33%|███▎      | 768/2304 [2:51:42<2:54:49,  6.83s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0877


 33%|███▎      | 769/2304 [2:51:47<2:40:21,  6.27s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0718


 33%|███▎      | 770/2304 [2:51:53<2:31:11,  5.91s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0731


 33%|███▎      | 771/2304 [2:51:57<2:23:41,  5.62s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0970


 34%|███▎      | 772/2304 [2:52:05<2:36:21,  6.12s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0791


 34%|███▎      | 773/2304 [2:52:12<2:45:03,  6.47s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0668


 34%|███▎      | 774/2304 [2:52:19<2:51:19,  6.72s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1076


 34%|███▎      | 775/2304 [2:52:25<2:39:26,  6.26s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0893


 34%|███▎      | 776/2304 [2:52:30<2:29:34,  5.87s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0801


 34%|███▎      | 777/2304 [2:52:35<2:23:11,  5.63s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1215


 34%|███▍      | 778/2304 [2:52:42<2:37:14,  6.18s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0660


 34%|███▍      | 779/2304 [2:52:49<2:44:35,  6.48s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0667


 34%|███▍      | 780/2304 [2:52:57<2:51:19,  6.74s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0920


 34%|███▍      | 781/2304 [2:53:02<2:38:27,  6.24s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0785


 34%|███▍      | 782/2304 [2:53:06<2:27:40,  5.82s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0731


 34%|███▍      | 783/2304 [2:53:12<2:21:45,  5.59s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1317


 34%|███▍      | 784/2304 [2:53:19<2:34:45,  6.11s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0834


 34%|███▍      | 785/2304 [2:53:26<2:42:15,  6.41s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0793


 34%|███▍      | 786/2304 [2:53:33<2:50:00,  6.72s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0867


 34%|███▍      | 787/2304 [2:53:38<2:36:44,  6.20s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1383


 34%|███▍      | 788/2304 [2:53:43<2:26:46,  5.81s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0746


 34%|███▍      | 789/2304 [2:53:48<2:21:27,  5.60s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0822


 34%|███▍      | 790/2304 [2:53:56<2:34:14,  6.11s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0911


 34%|███▍      | 791/2304 [2:54:03<2:43:15,  6.47s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0810


 34%|███▍      | 792/2304 [2:54:10<2:48:45,  6.70s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1131


 34%|███▍      | 793/2304 [2:54:15<2:35:33,  6.18s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0867


 34%|███▍      | 794/2304 [2:54:20<2:26:47,  5.83s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0709


 35%|███▍      | 795/2304 [2:54:25<2:20:30,  5.59s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0989


 35%|███▍      | 796/2304 [2:54:33<2:34:36,  6.15s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0829


 35%|███▍      | 797/2304 [2:54:40<2:43:02,  6.49s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0664


 35%|███▍      | 798/2304 [2:54:47<2:49:17,  6.74s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0773


 35%|███▍      | 799/2304 [2:54:52<2:37:03,  6.26s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0802


 35%|███▍      | 800/2304 [2:54:57<2:25:51,  5.82s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0728


 35%|███▍      | 801/2304 [2:55:02<2:18:52,  5.54s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0910


 35%|███▍      | 802/2304 [2:55:10<2:32:35,  6.10s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0750


 35%|███▍      | 803/2304 [2:55:17<2:40:20,  6.41s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0670


 35%|███▍      | 804/2304 [2:55:24<2:48:21,  6.73s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0867


 35%|███▍      | 805/2304 [2:55:29<2:35:37,  6.23s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0886


 35%|███▍      | 806/2304 [2:55:34<2:25:08,  5.81s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0840


 35%|███▌      | 807/2304 [2:55:39<2:19:41,  5.60s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0995


 35%|███▌      | 808/2304 [2:55:46<2:32:05,  6.10s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0661


 35%|███▌      | 809/2304 [2:55:54<2:39:44,  6.41s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0792


 35%|███▌      | 810/2304 [2:56:01<2:47:33,  6.73s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0738


 35%|███▌      | 811/2304 [2:56:06<2:34:26,  6.21s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1011


 35%|███▌      | 812/2304 [2:56:11<2:24:57,  5.83s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0735


 35%|███▌      | 813/2304 [2:56:16<2:18:36,  5.58s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0918


 35%|███▌      | 814/2304 [2:56:23<2:30:27,  6.06s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0899


 35%|███▌      | 815/2304 [2:56:30<2:39:38,  6.43s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0838


 35%|███▌      | 816/2304 [2:56:38<2:46:06,  6.70s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0719


 35%|███▌      | 817/2304 [2:56:43<2:34:14,  6.22s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0906


 36%|███▌      | 818/2304 [2:56:48<2:23:39,  5.80s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0750


 36%|███▌      | 819/2304 [2:56:53<2:17:38,  5.56s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0808


 36%|███▌      | 820/2304 [2:57:00<2:30:51,  6.10s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0683


 36%|███▌      | 821/2304 [2:57:07<2:39:29,  6.45s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0676


 36%|███▌      | 822/2304 [2:57:15<2:44:33,  6.66s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0887


 36%|███▌      | 823/2304 [2:57:20<2:32:57,  6.20s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0884


 36%|███▌      | 824/2304 [2:57:24<2:23:04,  5.80s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0829


 36%|███▌      | 825/2304 [2:57:30<2:18:01,  5.60s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0924


 36%|███▌      | 826/2304 [2:57:37<2:29:49,  6.08s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0811


 36%|███▌      | 827/2304 [2:57:44<2:37:21,  6.39s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0670


 36%|███▌      | 828/2304 [2:57:51<2:43:56,  6.66s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0957


 36%|███▌      | 829/2304 [2:57:56<2:31:13,  6.15s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0733


 36%|███▌      | 830/2304 [2:58:01<2:21:27,  5.76s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0738


 36%|███▌      | 831/2304 [2:58:06<2:16:33,  5.56s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0780


 36%|███▌      | 832/2304 [2:58:13<2:28:06,  6.04s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0968


 36%|███▌      | 833/2304 [2:58:21<2:37:16,  6.41s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0661


 36%|███▌      | 834/2304 [2:58:28<2:43:02,  6.65s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0872


 36%|███▌      | 835/2304 [2:58:33<2:30:22,  6.14s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1016


 36%|███▋      | 836/2304 [2:58:38<2:21:37,  5.79s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0818


 36%|███▋      | 837/2304 [2:58:43<2:15:17,  5.53s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0893


 36%|███▋      | 838/2304 [2:58:50<2:27:05,  6.02s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0719


 36%|███▋      | 839/2304 [2:58:57<2:36:07,  6.39s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1024


 36%|███▋      | 840/2304 [2:59:04<2:42:25,  6.66s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0842


 37%|███▋      | 841/2304 [2:59:09<2:30:45,  6.18s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0893


 37%|███▋      | 842/2304 [2:59:14<2:20:31,  5.77s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0799


 37%|███▋      | 843/2304 [2:59:19<2:14:36,  5.53s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0849


 37%|███▋      | 844/2304 [2:59:27<2:28:26,  6.10s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0717


 37%|███▋      | 845/2304 [2:59:34<2:36:12,  6.42s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0678


 37%|███▋      | 846/2304 [2:59:41<2:44:41,  6.78s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0662


 37%|███▋      | 847/2304 [2:59:46<2:30:56,  6.22s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0890


 37%|███▋      | 848/2304 [2:59:51<2:20:39,  5.80s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0734


 37%|███▋      | 849/2304 [2:59:56<2:15:46,  5.60s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0972


 37%|███▋      | 850/2304 [3:00:04<2:27:44,  6.10s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0843


 37%|███▋      | 851/2304 [3:00:11<2:34:57,  6.40s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0668


 37%|███▋      | 852/2304 [3:00:18<2:42:20,  6.71s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0747


 37%|███▋      | 853/2304 [3:00:23<2:29:23,  6.18s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0772


 37%|███▋      | 854/2304 [3:00:28<2:20:59,  5.83s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0760


 37%|███▋      | 855/2304 [3:00:33<2:15:02,  5.59s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1025


 37%|███▋      | 856/2304 [3:00:40<2:27:14,  6.10s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0971


 37%|███▋      | 857/2304 [3:00:48<2:35:52,  6.46s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0741


 37%|███▋      | 858/2304 [3:00:55<2:43:16,  6.78s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0772


 37%|███▋      | 859/2304 [3:01:00<2:29:22,  6.20s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0880


 37%|███▋      | 860/2304 [3:01:05<2:20:42,  5.85s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0877


 37%|███▋      | 861/2304 [3:01:10<2:13:33,  5.55s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0795


 37%|███▋      | 862/2304 [3:01:17<2:26:27,  6.09s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0867


 37%|███▋      | 863/2304 [3:01:24<2:33:27,  6.39s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0687


 38%|███▊      | 864/2304 [3:01:32<2:39:34,  6.65s/it]

Config: {'activation': 'relu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0993


 38%|███▊      | 865/2304 [3:01:35<2:14:41,  5.62s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0680


 38%|███▊      | 866/2304 [3:01:38<1:56:03,  4.84s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0692


 38%|███▊      | 867/2304 [3:01:41<1:44:37,  4.37s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0698


 38%|███▊      | 868/2304 [3:01:46<1:45:00,  4.39s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0854


 38%|███▊      | 869/2304 [3:01:50<1:44:34,  4.37s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0669


 38%|███▊      | 870/2304 [3:01:54<1:44:39,  4.38s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0710


 38%|███▊      | 871/2304 [3:01:58<1:36:56,  4.06s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0693


 38%|███▊      | 872/2304 [3:02:01<1:29:27,  3.75s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0775


 38%|███▊      | 873/2304 [3:02:04<1:25:38,  3.59s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0656


 38%|███▊      | 874/2304 [3:02:08<1:31:52,  3.85s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0737


 38%|███▊      | 875/2304 [3:02:13<1:35:22,  4.00s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0659


 38%|███▊      | 876/2304 [3:02:17<1:37:45,  4.11s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0713


 38%|███▊      | 877/2304 [3:02:20<1:31:39,  3.85s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0694


 38%|███▊      | 878/2304 [3:02:23<1:25:19,  3.59s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0669


 38%|███▊      | 879/2304 [3:02:26<1:22:41,  3.48s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0676


 38%|███▊      | 880/2304 [3:02:31<1:29:32,  3.77s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0690


 38%|███▊      | 881/2304 [3:02:35<1:33:27,  3.94s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0688


 38%|███▊      | 882/2304 [3:02:40<1:35:38,  4.04s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0663


 38%|███▊      | 883/2304 [3:02:43<1:30:01,  3.80s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0680


 38%|███▊      | 884/2304 [3:02:46<1:25:18,  3.60s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0690


 38%|███▊      | 885/2304 [3:02:49<1:21:19,  3.44s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0682


 38%|███▊      | 886/2304 [3:02:53<1:29:01,  3.77s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0704


 38%|███▊      | 887/2304 [3:02:58<1:33:16,  3.95s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0711


 39%|███▊      | 888/2304 [3:03:02<1:35:37,  4.05s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0682


 39%|███▊      | 889/2304 [3:03:05<1:30:17,  3.83s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0669


 39%|███▊      | 890/2304 [3:03:09<1:26:03,  3.65s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0678


 39%|███▊      | 891/2304 [3:03:12<1:22:10,  3.49s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0764


 39%|███▊      | 892/2304 [3:03:16<1:29:22,  3.80s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0710


 39%|███▉      | 893/2304 [3:03:21<1:33:58,  4.00s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0702


 39%|███▉      | 894/2304 [3:03:25<1:36:24,  4.10s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0719


 39%|███▉      | 895/2304 [3:03:28<1:30:33,  3.86s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0719


 39%|███▉      | 896/2304 [3:03:32<1:25:36,  3.65s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0682


 39%|███▉      | 897/2304 [3:03:35<1:22:26,  3.52s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0743


 39%|███▉      | 898/2304 [3:03:39<1:28:19,  3.77s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0737


 39%|███▉      | 899/2304 [3:03:44<1:32:47,  3.96s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0706


 39%|███▉      | 900/2304 [3:03:48<1:34:47,  4.05s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0681


 39%|███▉      | 901/2304 [3:03:51<1:28:48,  3.80s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0683


 39%|███▉      | 902/2304 [3:03:54<1:23:25,  3.57s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0721


 39%|███▉      | 903/2304 [3:03:57<1:20:51,  3.46s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0671


 39%|███▉      | 904/2304 [3:04:02<1:27:35,  3.75s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0715


 39%|███▉      | 905/2304 [3:04:06<1:32:18,  3.96s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0710


 39%|███▉      | 906/2304 [3:04:10<1:34:39,  4.06s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0731


 39%|███▉      | 907/2304 [3:04:14<1:28:42,  3.81s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0665


 39%|███▉      | 908/2304 [3:04:17<1:24:12,  3.62s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0677


 39%|███▉      | 909/2304 [3:04:20<1:20:20,  3.46s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0713


 39%|███▉      | 910/2304 [3:04:24<1:26:50,  3.74s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0696


 40%|███▉      | 911/2304 [3:04:29<1:30:53,  3.92s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0757


 40%|███▉      | 912/2304 [3:04:33<1:33:50,  4.05s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0683


 40%|███▉      | 913/2304 [3:04:36<1:28:10,  3.80s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0736


 40%|███▉      | 914/2304 [3:04:39<1:23:49,  3.62s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0694


 40%|███▉      | 915/2304 [3:04:43<1:20:11,  3.46s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0765


 40%|███▉      | 916/2304 [3:04:47<1:26:40,  3.75s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0674


 40%|███▉      | 917/2304 [3:04:51<1:30:49,  3.93s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0673


 40%|███▉      | 918/2304 [3:04:56<1:33:29,  4.05s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0696


 40%|███▉      | 919/2304 [3:04:59<1:27:18,  3.78s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0709


 40%|███▉      | 920/2304 [3:05:02<1:22:35,  3.58s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0710


 40%|███▉      | 921/2304 [3:05:05<1:19:04,  3.43s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0655


 40%|████      | 922/2304 [3:05:09<1:26:13,  3.74s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0703


 40%|████      | 923/2304 [3:05:14<1:30:08,  3.92s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0728


 40%|████      | 924/2304 [3:05:18<1:32:57,  4.04s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0774


 40%|████      | 925/2304 [3:05:21<1:27:16,  3.80s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0691


 40%|████      | 926/2304 [3:05:25<1:23:17,  3.63s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0687


 40%|████      | 927/2304 [3:05:28<1:19:38,  3.47s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0693


 40%|████      | 928/2304 [3:05:32<1:26:31,  3.77s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0687


 40%|████      | 929/2304 [3:05:36<1:30:30,  3.95s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0727


 40%|████      | 930/2304 [3:05:41<1:32:43,  4.05s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0686


 40%|████      | 931/2304 [3:05:44<1:27:36,  3.83s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0681


 40%|████      | 932/2304 [3:05:47<1:22:11,  3.59s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0676


 40%|████      | 933/2304 [3:05:50<1:19:36,  3.48s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0685


 41%|████      | 934/2304 [3:05:55<1:26:14,  3.78s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0684


 41%|████      | 935/2304 [3:05:59<1:30:01,  3.95s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0702


 41%|████      | 936/2304 [3:06:03<1:32:16,  4.05s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0687


 41%|████      | 937/2304 [3:06:07<1:27:09,  3.83s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0670


 41%|████      | 938/2304 [3:06:10<1:21:45,  3.59s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0727


 41%|████      | 939/2304 [3:06:13<1:19:33,  3.50s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0792


 41%|████      | 940/2304 [3:06:18<1:26:20,  3.80s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0805


 41%|████      | 941/2304 [3:06:22<1:30:25,  3.98s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0747


 41%|████      | 942/2304 [3:06:26<1:32:17,  4.07s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0738


 41%|████      | 943/2304 [3:06:30<1:26:40,  3.82s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0697


 41%|████      | 944/2304 [3:06:33<1:21:35,  3.60s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0730


 41%|████      | 945/2304 [3:06:36<1:19:14,  3.50s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0681


 41%|████      | 946/2304 [3:06:40<1:25:28,  3.78s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0701


 41%|████      | 947/2304 [3:06:45<1:30:06,  3.98s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0656


 41%|████      | 948/2304 [3:06:49<1:32:22,  4.09s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0693


 41%|████      | 949/2304 [3:06:52<1:26:45,  3.84s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0676


 41%|████      | 950/2304 [3:06:55<1:22:05,  3.64s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0691


 41%|████▏     | 951/2304 [3:06:59<1:19:20,  3.52s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0714


 41%|████▏     | 952/2304 [3:07:03<1:25:52,  3.81s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0682


 41%|████▏     | 953/2304 [3:07:08<1:30:06,  4.00s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0763


 41%|████▏     | 954/2304 [3:07:12<1:32:04,  4.09s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0720


 41%|████▏     | 955/2304 [3:07:15<1:26:09,  3.83s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0694


 41%|████▏     | 956/2304 [3:07:18<1:20:56,  3.60s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0753


 42%|████▏     | 957/2304 [3:07:22<1:18:27,  3.49s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0733


 42%|████▏     | 958/2304 [3:07:26<1:25:15,  3.80s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0761


 42%|████▏     | 959/2304 [3:07:30<1:29:04,  3.97s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0702


 42%|████▏     | 960/2304 [3:07:35<1:30:55,  4.06s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0710


 42%|████▏     | 961/2304 [3:07:38<1:25:09,  3.80s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0792


 42%|████▏     | 962/2304 [3:07:41<1:19:31,  3.56s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0708


 42%|████▏     | 963/2304 [3:07:44<1:17:17,  3.46s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0841


 42%|████▏     | 964/2304 [3:07:49<1:23:41,  3.75s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0796


 42%|████▏     | 965/2304 [3:07:53<1:26:56,  3.90s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0754


 42%|████▏     | 966/2304 [3:07:57<1:30:33,  4.06s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0772


 42%|████▏     | 967/2304 [3:08:00<1:24:42,  3.80s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0777


 42%|████▏     | 968/2304 [3:08:03<1:19:35,  3.57s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0696


 42%|████▏     | 969/2304 [3:08:07<1:17:09,  3.47s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0771


 42%|████▏     | 970/2304 [3:08:11<1:23:12,  3.74s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0693


 42%|████▏     | 971/2304 [3:08:15<1:25:58,  3.87s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0702


 42%|████▏     | 972/2304 [3:08:20<1:29:38,  4.04s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0776


 42%|████▏     | 973/2304 [3:08:23<1:23:56,  3.78s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0697


 42%|████▏     | 974/2304 [3:08:26<1:18:45,  3.55s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0687


 42%|████▏     | 975/2304 [3:08:29<1:16:23,  3.45s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0893


 42%|████▏     | 976/2304 [3:08:34<1:23:40,  3.78s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0725


 42%|████▏     | 977/2304 [3:08:38<1:26:07,  3.89s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0705


 42%|████▏     | 978/2304 [3:08:42<1:29:56,  4.07s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0747


 42%|████▏     | 979/2304 [3:08:45<1:24:13,  3.81s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0705


 43%|████▎     | 980/2304 [3:08:48<1:18:48,  3.57s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0743


 43%|████▎     | 981/2304 [3:08:52<1:16:13,  3.46s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0708


 43%|████▎     | 982/2304 [3:08:56<1:22:46,  3.76s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0739


 43%|████▎     | 983/2304 [3:09:00<1:25:56,  3.90s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0822


 43%|████▎     | 984/2304 [3:09:05<1:29:35,  4.07s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0734


 43%|████▎     | 985/2304 [3:09:08<1:24:19,  3.84s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0745


 43%|████▎     | 986/2304 [3:09:11<1:19:13,  3.61s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0726


 43%|████▎     | 987/2304 [3:09:14<1:16:42,  3.49s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0845


 43%|████▎     | 988/2304 [3:09:19<1:21:56,  3.74s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0661


 43%|████▎     | 989/2304 [3:09:23<1:26:24,  3.94s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0723


 43%|████▎     | 990/2304 [3:09:28<1:29:59,  4.11s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0786


 43%|████▎     | 991/2304 [3:09:31<1:24:32,  3.86s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0757


 43%|████▎     | 992/2304 [3:09:34<1:19:01,  3.61s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0775


 43%|████▎     | 993/2304 [3:09:37<1:17:03,  3.53s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0677


 43%|████▎     | 994/2304 [3:09:42<1:22:27,  3.78s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0746


 43%|████▎     | 995/2304 [3:09:46<1:26:55,  3.98s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0729


 43%|████▎     | 996/2304 [3:09:51<1:30:12,  4.14s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0776


 43%|████▎     | 997/2304 [3:09:54<1:23:21,  3.83s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0697


 43%|████▎     | 998/2304 [3:09:57<1:19:46,  3.67s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0771


 43%|████▎     | 999/2304 [3:10:00<1:17:01,  3.54s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0706


 43%|████▎     | 1000/2304 [3:10:05<1:21:45,  3.76s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0790


 43%|████▎     | 1001/2304 [3:10:09<1:25:41,  3.95s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0713


 43%|████▎     | 1002/2304 [3:10:13<1:28:50,  4.09s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0818


 44%|████▎     | 1003/2304 [3:10:16<1:22:33,  3.81s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0690


 44%|████▎     | 1004/2304 [3:10:20<1:19:11,  3.65s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0708


 44%|████▎     | 1005/2304 [3:10:23<1:17:08,  3.56s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0729


 44%|████▎     | 1006/2304 [3:10:27<1:22:08,  3.80s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0691


 44%|████▎     | 1007/2304 [3:10:32<1:26:36,  4.01s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0707


 44%|████▍     | 1008/2304 [3:10:36<1:29:34,  4.15s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0731


 44%|████▍     | 1009/2304 [3:10:40<1:23:48,  3.88s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0791


 44%|████▍     | 1010/2304 [3:10:43<1:18:33,  3.64s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0765


 44%|████▍     | 1011/2304 [3:10:46<1:15:58,  3.53s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0759


 44%|████▍     | 1012/2304 [3:10:50<1:20:51,  3.76s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0769


 44%|████▍     | 1013/2304 [3:10:55<1:24:34,  3.93s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0712


 44%|████▍     | 1014/2304 [3:10:59<1:27:25,  4.07s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0767


 44%|████▍     | 1015/2304 [3:11:02<1:20:35,  3.75s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0802


 44%|████▍     | 1016/2304 [3:11:05<1:17:03,  3.59s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0675


 44%|████▍     | 1017/2304 [3:11:09<1:14:54,  3.49s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0874


 44%|████▍     | 1018/2304 [3:11:13<1:19:50,  3.73s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0986


 44%|████▍     | 1019/2304 [3:11:17<1:23:35,  3.90s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0661


 44%|████▍     | 1020/2304 [3:11:22<1:26:47,  4.06s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0789


 44%|████▍     | 1021/2304 [3:11:25<1:20:51,  3.78s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0708


 44%|████▍     | 1022/2304 [3:11:28<1:16:35,  3.58s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0831


 44%|████▍     | 1023/2304 [3:11:31<1:14:24,  3.49s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0718


 44%|████▍     | 1024/2304 [3:11:35<1:19:05,  3.71s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0679


 44%|████▍     | 1025/2304 [3:11:40<1:22:59,  3.89s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0711


 45%|████▍     | 1026/2304 [3:11:44<1:26:33,  4.06s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0732


 45%|████▍     | 1027/2304 [3:11:47<1:20:09,  3.77s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0705


 45%|████▍     | 1028/2304 [3:11:50<1:16:32,  3.60s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0746


 45%|████▍     | 1029/2304 [3:11:53<1:13:08,  3.44s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0702


 45%|████▍     | 1030/2304 [3:11:58<1:19:40,  3.75s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0731


 45%|████▍     | 1031/2304 [3:12:02<1:23:37,  3.94s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0719


 45%|████▍     | 1032/2304 [3:12:07<1:26:56,  4.10s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0717


 45%|████▍     | 1033/2304 [3:12:10<1:20:12,  3.79s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0746


 45%|████▍     | 1034/2304 [3:12:13<1:16:12,  3.60s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0773


 45%|████▍     | 1035/2304 [3:12:16<1:12:38,  3.43s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0816


 45%|████▍     | 1036/2304 [3:12:20<1:19:01,  3.74s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0822


 45%|████▌     | 1037/2304 [3:12:25<1:22:52,  3.92s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0663


 45%|████▌     | 1038/2304 [3:12:29<1:26:10,  4.08s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0862


 45%|████▌     | 1039/2304 [3:12:32<1:19:46,  3.78s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0758


 45%|████▌     | 1040/2304 [3:12:36<1:15:32,  3.59s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0759


 45%|████▌     | 1041/2304 [3:12:39<1:12:00,  3.42s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0795


 45%|████▌     | 1042/2304 [3:12:43<1:18:06,  3.71s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0793


 45%|████▌     | 1043/2304 [3:12:47<1:21:47,  3.89s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0732


 45%|████▌     | 1044/2304 [3:12:52<1:24:54,  4.04s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0810


 45%|████▌     | 1045/2304 [3:12:55<1:18:44,  3.75s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0711


 45%|████▌     | 1046/2304 [3:12:58<1:15:14,  3.59s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0781


 45%|████▌     | 1047/2304 [3:13:01<1:11:56,  3.43s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0740


 45%|████▌     | 1048/2304 [3:13:05<1:17:58,  3.72s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0885


 46%|████▌     | 1049/2304 [3:13:10<1:21:47,  3.91s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0729


 46%|████▌     | 1050/2304 [3:13:14<1:24:27,  4.04s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0699


 46%|████▌     | 1051/2304 [3:13:17<1:18:09,  3.74s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0712


 46%|████▌     | 1052/2304 [3:13:20<1:14:46,  3.58s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0752


 46%|████▌     | 1053/2304 [3:13:23<1:11:16,  3.42s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0774


 46%|████▌     | 1054/2304 [3:13:28<1:17:17,  3.71s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0732


 46%|████▌     | 1055/2304 [3:13:32<1:21:11,  3.90s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0708


 46%|████▌     | 1056/2304 [3:13:37<1:24:12,  4.05s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0754


 46%|████▌     | 1057/2304 [3:13:40<1:17:52,  3.75s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0716


 46%|████▌     | 1058/2304 [3:13:43<1:14:01,  3.56s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0662


 46%|████▌     | 1059/2304 [3:13:46<1:11:05,  3.43s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0754


 46%|████▌     | 1060/2304 [3:13:50<1:17:27,  3.74s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0851


 46%|████▌     | 1061/2304 [3:13:55<1:21:18,  3.92s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0705


 46%|████▌     | 1062/2304 [3:13:59<1:24:21,  4.07s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0849


 46%|████▌     | 1063/2304 [3:14:02<1:18:03,  3.77s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0774


 46%|████▌     | 1064/2304 [3:14:05<1:14:15,  3.59s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0738


 46%|████▌     | 1065/2304 [3:14:08<1:11:14,  3.45s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0748


 46%|████▋     | 1066/2304 [3:14:13<1:17:16,  3.75s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0803


 46%|████▋     | 1067/2304 [3:14:17<1:21:28,  3.95s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0666


 46%|████▋     | 1068/2304 [3:14:22<1:24:44,  4.11s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0997


 46%|████▋     | 1069/2304 [3:14:25<1:18:27,  3.81s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0696


 46%|████▋     | 1070/2304 [3:14:28<1:14:31,  3.62s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0698


 46%|████▋     | 1071/2304 [3:14:31<1:11:22,  3.47s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0728


 47%|████▋     | 1072/2304 [3:14:36<1:18:27,  3.82s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0819


 47%|████▋     | 1073/2304 [3:14:40<1:21:54,  3.99s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0791


 47%|████▋     | 1074/2304 [3:14:45<1:25:16,  4.16s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0700


 47%|████▋     | 1075/2304 [3:14:48<1:18:31,  3.83s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0757


 47%|████▋     | 1076/2304 [3:14:51<1:14:30,  3.64s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0764


 47%|████▋     | 1077/2304 [3:14:54<1:11:27,  3.49s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0745


 47%|████▋     | 1078/2304 [3:14:59<1:17:17,  3.78s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0758


 47%|████▋     | 1079/2304 [3:15:03<1:20:33,  3.95s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0755


 47%|████▋     | 1080/2304 [3:15:07<1:22:37,  4.05s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0722


 47%|████▋     | 1081/2304 [3:15:11<1:17:51,  3.82s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0727


 47%|████▋     | 1082/2304 [3:15:14<1:12:53,  3.58s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0728


 47%|████▋     | 1083/2304 [3:15:17<1:10:32,  3.47s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0767


 47%|████▋     | 1084/2304 [3:15:21<1:16:09,  3.75s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0734


 47%|████▋     | 1085/2304 [3:15:26<1:19:51,  3.93s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0676


 47%|████▋     | 1086/2304 [3:15:30<1:21:46,  4.03s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0808


 47%|████▋     | 1087/2304 [3:15:33<1:16:59,  3.80s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0673


 47%|████▋     | 1088/2304 [3:15:36<1:11:58,  3.55s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0781


 47%|████▋     | 1089/2304 [3:15:39<1:09:55,  3.45s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0794


 47%|████▋     | 1090/2304 [3:15:44<1:16:02,  3.76s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0796


 47%|████▋     | 1091/2304 [3:15:48<1:19:20,  3.92s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0678


 47%|████▋     | 1092/2304 [3:15:52<1:21:31,  4.04s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0818


 47%|████▋     | 1093/2304 [3:15:55<1:16:23,  3.78s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0769


 47%|████▋     | 1094/2304 [3:15:59<1:11:42,  3.56s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0767


 48%|████▊     | 1095/2304 [3:16:02<1:09:49,  3.47s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0774


 48%|████▊     | 1096/2304 [3:16:06<1:15:20,  3.74s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0652


 48%|████▊     | 1097/2304 [3:16:11<1:19:09,  3.93s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0828


 48%|████▊     | 1098/2304 [3:16:15<1:21:06,  4.03s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0683


 48%|████▊     | 1099/2304 [3:16:18<1:15:58,  3.78s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0716


 48%|████▊     | 1100/2304 [3:16:21<1:11:35,  3.57s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0711


 48%|████▊     | 1101/2304 [3:16:24<1:09:28,  3.47s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0743


 48%|████▊     | 1102/2304 [3:16:29<1:14:52,  3.74s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0749


 48%|████▊     | 1103/2304 [3:16:33<1:17:30,  3.87s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0714


 48%|████▊     | 1104/2304 [3:16:37<1:20:48,  4.04s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0855


 48%|████▊     | 1105/2304 [3:16:41<1:15:44,  3.79s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0744


 48%|████▊     | 1106/2304 [3:16:44<1:11:00,  3.56s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0669


 48%|████▊     | 1107/2304 [3:16:47<1:09:28,  3.48s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0800


 48%|████▊     | 1108/2304 [3:16:51<1:14:40,  3.75s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0859


 48%|████▊     | 1109/2304 [3:16:55<1:17:18,  3.88s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0678


 48%|████▊     | 1110/2304 [3:17:00<1:20:34,  4.05s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0744


 48%|████▊     | 1111/2304 [3:17:03<1:15:43,  3.81s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0711


 48%|████▊     | 1112/2304 [3:17:06<1:10:57,  3.57s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0750


 48%|████▊     | 1113/2304 [3:17:09<1:08:52,  3.47s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0760


 48%|████▊     | 1114/2304 [3:17:14<1:14:09,  3.74s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0763


 48%|████▊     | 1115/2304 [3:17:18<1:16:32,  3.86s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0680


 48%|████▊     | 1116/2304 [3:17:22<1:20:33,  4.07s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0747


 48%|████▊     | 1117/2304 [3:17:26<1:15:58,  3.84s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0693


 49%|████▊     | 1118/2304 [3:17:29<1:11:09,  3.60s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0705


 49%|████▊     | 1119/2304 [3:17:32<1:08:52,  3.49s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0715


 49%|████▊     | 1120/2304 [3:17:36<1:14:51,  3.79s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0684


 49%|████▊     | 1121/2304 [3:17:41<1:17:17,  3.92s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0757


 49%|████▊     | 1122/2304 [3:17:45<1:20:21,  4.08s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0728


 49%|████▊     | 1123/2304 [3:17:48<1:15:07,  3.82s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0745


 49%|████▉     | 1124/2304 [3:17:51<1:10:24,  3.58s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0783


 49%|████▉     | 1125/2304 [3:17:55<1:08:11,  3.47s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0711


 49%|████▉     | 1126/2304 [3:17:59<1:13:04,  3.72s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0807


 49%|████▉     | 1127/2304 [3:18:03<1:16:48,  3.92s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0773


 49%|████▉     | 1128/2304 [3:18:08<1:20:06,  4.09s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0723


 49%|████▉     | 1129/2304 [3:18:11<1:15:10,  3.84s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0868


 49%|████▉     | 1130/2304 [3:18:14<1:10:16,  3.59s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0736


 49%|████▉     | 1131/2304 [3:18:17<1:08:25,  3.50s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0756


 49%|████▉     | 1132/2304 [3:18:22<1:13:05,  3.74s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0968


 49%|████▉     | 1133/2304 [3:18:26<1:16:50,  3.94s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0684


 49%|████▉     | 1134/2304 [3:18:30<1:19:51,  4.10s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0832


 49%|████▉     | 1135/2304 [3:18:34<1:14:53,  3.84s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0774


 49%|████▉     | 1136/2304 [3:18:37<1:09:56,  3.59s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0737


 49%|████▉     | 1137/2304 [3:18:40<1:07:30,  3.47s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0712


 49%|████▉     | 1138/2304 [3:18:44<1:12:05,  3.71s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0771


 49%|████▉     | 1139/2304 [3:18:49<1:15:53,  3.91s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0678


 49%|████▉     | 1140/2304 [3:18:53<1:18:34,  4.05s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0919


 50%|████▉     | 1141/2304 [3:18:56<1:13:39,  3.80s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0762


 50%|████▉     | 1142/2304 [3:18:59<1:09:28,  3.59s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0774


 50%|████▉     | 1143/2304 [3:19:02<1:07:16,  3.48s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0803


 50%|████▉     | 1144/2304 [3:19:07<1:11:52,  3.72s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0797


 50%|████▉     | 1145/2304 [3:19:11<1:15:34,  3.91s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0739


 50%|████▉     | 1146/2304 [3:19:16<1:18:46,  4.08s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0697


 50%|████▉     | 1147/2304 [3:19:19<1:13:34,  3.82s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0929


 50%|████▉     | 1148/2304 [3:19:22<1:09:16,  3.60s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0795


 50%|████▉     | 1149/2304 [3:19:25<1:07:04,  3.48s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0728


 50%|████▉     | 1150/2304 [3:19:29<1:11:29,  3.72s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0834


 50%|████▉     | 1151/2304 [3:19:34<1:15:07,  3.91s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0728


 50%|█████     | 1152/2304 [3:19:38<1:18:09,  4.07s/it]

Config: {'activation': 'relu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0736


 50%|█████     | 1153/2304 [3:19:55<2:29:10,  7.78s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1278


 50%|█████     | 1154/2304 [3:20:11<3:16:27, 10.25s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0937


 50%|█████     | 1155/2304 [3:20:27<3:51:04, 12.07s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0925


 50%|█████     | 1156/2304 [3:20:53<5:08:54, 16.15s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0778


 50%|█████     | 1157/2304 [3:21:18<6:01:31, 18.91s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0900


 50%|█████     | 1158/2304 [3:21:44<6:41:09, 21.00s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0910


 50%|█████     | 1159/2304 [3:22:00<6:15:14, 19.66s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0974


 50%|█████     | 1160/2304 [3:22:16<5:53:38, 18.55s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0919


 50%|█████     | 1161/2304 [3:22:33<5:41:03, 17.90s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0999


 50%|█████     | 1162/2304 [3:22:58<6:24:25, 20.20s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0653


 50%|█████     | 1163/2304 [3:23:23<6:51:11, 21.62s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0902


 51%|█████     | 1164/2304 [3:23:49<7:14:05, 22.85s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0833


 51%|█████     | 1165/2304 [3:24:05<6:37:00, 20.91s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1673


 51%|█████     | 1166/2304 [3:24:21<6:09:29, 19.48s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1370


 51%|█████     | 1167/2304 [3:24:38<5:52:07, 18.58s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1022


 51%|█████     | 1168/2304 [3:25:04<6:31:36, 20.68s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0862


 51%|█████     | 1169/2304 [3:25:29<6:56:15, 22.00s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0932


 51%|█████     | 1170/2304 [3:25:54<7:16:09, 23.08s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0796


 51%|█████     | 1171/2304 [3:26:11<6:39:08, 21.14s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1346


 51%|█████     | 1172/2304 [3:26:27<6:10:23, 19.63s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1112


 51%|█████     | 1173/2304 [3:26:43<5:52:13, 18.69s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0987


 51%|█████     | 1174/2304 [3:27:09<6:29:09, 20.66s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0894


 51%|█████     | 1175/2304 [3:27:34<6:54:40, 22.04s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0733


 51%|█████     | 1176/2304 [3:28:00<7:15:08, 23.15s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0905


 51%|█████     | 1177/2304 [3:28:16<6:36:19, 21.10s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0828


 51%|█████     | 1178/2304 [3:28:32<6:07:28, 19.58s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1102


 51%|█████     | 1179/2304 [3:28:48<5:49:32, 18.64s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1177


 51%|█████     | 1180/2304 [3:29:14<6:28:06, 20.72s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0900


 51%|█████▏    | 1181/2304 [3:29:39<6:53:09, 22.07s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1012


 51%|█████▏    | 1182/2304 [3:30:05<7:12:52, 23.15s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1036


 51%|█████▏    | 1183/2304 [3:30:21<6:35:02, 21.14s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0993


 51%|█████▏    | 1184/2304 [3:30:38<6:07:28, 19.69s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1045


 51%|█████▏    | 1185/2304 [3:30:54<5:48:58, 18.71s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0972


 51%|█████▏    | 1186/2304 [3:31:20<6:29:21, 20.90s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0788


 52%|█████▏    | 1187/2304 [3:31:45<6:53:52, 22.23s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0908


 52%|█████▏    | 1188/2304 [3:32:11<7:14:47, 23.38s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0814


 52%|█████▏    | 1189/2304 [3:32:28<6:35:55, 21.31s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0961


 52%|█████▏    | 1190/2304 [3:32:44<6:05:56, 19.71s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1013


 52%|█████▏    | 1191/2304 [3:33:01<5:48:14, 18.77s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1047


 52%|█████▏    | 1192/2304 [3:33:26<6:26:49, 20.87s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0865


 52%|█████▏    | 1193/2304 [3:33:52<6:50:36, 22.18s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1340


 52%|█████▏    | 1194/2304 [3:34:17<7:09:54, 23.24s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0971


 52%|█████▏    | 1195/2304 [3:34:34<6:32:17, 21.22s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0935


 52%|█████▏    | 1196/2304 [3:34:50<6:04:36, 19.74s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0942


 52%|█████▏    | 1197/2304 [3:35:06<5:45:29, 18.73s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1165


 52%|█████▏    | 1198/2304 [3:35:32<6:23:55, 20.83s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0880


 52%|█████▏    | 1199/2304 [3:35:57<6:46:35, 22.08s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0833


 52%|█████▏    | 1200/2304 [3:36:23<7:07:45, 23.25s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1084


 52%|█████▏    | 1201/2304 [3:36:40<6:30:30, 21.24s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0887


 52%|█████▏    | 1202/2304 [3:36:56<6:02:09, 19.72s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1399


 52%|█████▏    | 1203/2304 [3:37:12<5:44:16, 18.76s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1072


 52%|█████▏    | 1204/2304 [3:37:38<6:23:08, 20.90s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0928


 52%|█████▏    | 1205/2304 [3:38:04<6:46:48, 22.21s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0828


 52%|█████▏    | 1206/2304 [3:38:29<7:06:34, 23.31s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0922


 52%|█████▏    | 1207/2304 [3:38:46<6:28:54, 21.27s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0871


 52%|█████▏    | 1208/2304 [3:39:02<6:00:14, 19.72s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1559


 52%|█████▏    | 1209/2304 [3:39:18<5:41:10, 18.69s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0940


 53%|█████▎    | 1210/2304 [3:39:44<6:18:43, 20.77s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0888


 53%|█████▎    | 1211/2304 [3:40:09<6:41:40, 22.05s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0676


 53%|█████▎    | 1212/2304 [3:40:35<7:01:50, 23.18s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0798


 53%|█████▎    | 1213/2304 [3:40:51<6:24:44, 21.16s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1556


 53%|█████▎    | 1214/2304 [3:41:07<5:56:03, 19.60s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1003


 53%|█████▎    | 1215/2304 [3:41:24<5:38:10, 18.63s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1252


 53%|█████▎    | 1216/2304 [3:41:49<6:16:57, 20.79s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0824


 53%|█████▎    | 1217/2304 [3:42:14<6:39:46, 22.07s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0897


 53%|█████▎    | 1218/2304 [3:42:40<6:59:12, 23.16s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0841


 53%|█████▎    | 1219/2304 [3:42:57<6:22:19, 21.14s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1515


 53%|█████▎    | 1220/2304 [3:43:13<5:53:47, 19.58s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0890


 53%|█████▎    | 1221/2304 [3:43:29<5:35:45, 18.60s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0728


 53%|█████▎    | 1222/2304 [3:43:54<6:13:27, 20.71s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0788


 53%|█████▎    | 1223/2304 [3:44:20<6:36:54, 22.03s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.2091


 53%|█████▎    | 1224/2304 [3:44:46<6:58:32, 23.25s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0696


 53%|█████▎    | 1225/2304 [3:45:02<6:22:15, 21.26s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0928


 53%|█████▎    | 1226/2304 [3:45:18<5:53:26, 19.67s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1305


 53%|█████▎    | 1227/2304 [3:45:35<5:35:27, 18.69s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0928


 53%|█████▎    | 1228/2304 [3:46:01<6:13:48, 20.84s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0797


 53%|█████▎    | 1229/2304 [3:46:26<6:36:32, 22.13s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1068


 53%|█████▎    | 1230/2304 [3:46:51<6:55:51, 23.23s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1029


 53%|█████▎    | 1231/2304 [3:47:08<6:19:52, 21.24s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1062


 53%|█████▎    | 1232/2304 [3:47:24<5:51:53, 19.70s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1097


 54%|█████▎    | 1233/2304 [3:47:41<5:34:35, 18.74s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0985


 54%|█████▎    | 1234/2304 [3:48:06<6:11:26, 20.83s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0777


 54%|█████▎    | 1235/2304 [3:48:31<6:33:19, 22.08s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1086


 54%|█████▎    | 1236/2304 [3:48:57<6:52:44, 23.19s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0818


 54%|█████▎    | 1237/2304 [3:49:14<6:16:04, 21.15s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0927


 54%|█████▎    | 1238/2304 [3:49:30<5:49:09, 19.65s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1027


 54%|█████▍    | 1239/2304 [3:49:46<5:32:17, 18.72s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1493


 54%|█████▍    | 1240/2304 [3:50:12<6:09:37, 20.84s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0848


 54%|█████▍    | 1241/2304 [3:50:37<6:32:36, 22.16s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0829


 54%|█████▍    | 1242/2304 [3:51:03<6:50:34, 23.20s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0958


 54%|█████▍    | 1243/2304 [3:51:19<6:13:54, 21.14s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0969


 54%|█████▍    | 1244/2304 [3:51:35<5:46:28, 19.61s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.2115


 54%|█████▍    | 1245/2304 [3:51:52<5:29:16, 18.66s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1232


 54%|█████▍    | 1246/2304 [3:52:17<6:05:38, 20.74s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0807


 54%|█████▍    | 1247/2304 [3:52:42<6:28:26, 22.05s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1010


 54%|█████▍    | 1248/2304 [3:53:08<6:47:05, 23.13s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0800


 54%|█████▍    | 1249/2304 [3:53:24<6:10:51, 21.09s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1009


 54%|█████▍    | 1250/2304 [3:53:40<5:44:02, 19.58s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0910


 54%|█████▍    | 1251/2304 [3:53:57<5:27:14, 18.65s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0850


 54%|█████▍    | 1252/2304 [3:54:22<6:03:15, 20.72s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0716


 54%|█████▍    | 1253/2304 [3:54:47<6:25:36, 22.01s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0873


 54%|█████▍    | 1254/2304 [3:55:13<6:43:19, 23.05s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0821


 54%|█████▍    | 1255/2304 [3:55:29<6:08:21, 21.07s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0940


 55%|█████▍    | 1256/2304 [3:55:45<5:41:17, 19.54s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0889


 55%|█████▍    | 1257/2304 [3:56:02<5:24:28, 18.59s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0874


 55%|█████▍    | 1258/2304 [3:56:27<6:00:03, 20.65s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0933


 55%|█████▍    | 1259/2304 [3:56:52<6:21:48, 21.92s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0726


 55%|█████▍    | 1260/2304 [3:57:18<6:40:40, 23.03s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0843


 55%|█████▍    | 1261/2304 [3:57:34<6:05:26, 21.02s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1258


 55%|█████▍    | 1262/2304 [3:57:50<5:38:45, 19.51s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0724


 55%|█████▍    | 1263/2304 [3:58:06<5:22:26, 18.58s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0913


 55%|█████▍    | 1264/2304 [3:58:32<5:57:48, 20.64s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0731


 55%|█████▍    | 1265/2304 [3:58:57<6:19:20, 21.91s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0969


 55%|█████▍    | 1266/2304 [3:59:22<6:37:44, 22.99s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0823


 55%|█████▍    | 1267/2304 [3:59:39<6:02:43, 20.99s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0738


 55%|█████▌    | 1268/2304 [3:59:55<5:36:38, 19.50s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0981


 55%|█████▌    | 1269/2304 [4:00:11<5:20:38, 18.59s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1481


 55%|█████▌    | 1270/2304 [4:00:37<5:56:59, 20.72s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0722


 55%|█████▌    | 1271/2304 [4:01:02<6:18:45, 22.00s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0847


 55%|█████▌    | 1272/2304 [4:01:27<6:37:29, 23.11s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0924


 55%|█████▌    | 1273/2304 [4:01:44<6:02:21, 21.09s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0995


 55%|█████▌    | 1274/2304 [4:02:00<5:36:40, 19.61s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1080


 55%|█████▌    | 1275/2304 [4:02:16<5:19:40, 18.64s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0943


 55%|█████▌    | 1276/2304 [4:02:42<5:54:34, 20.70s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0751


 55%|█████▌    | 1277/2304 [4:03:07<6:16:52, 22.02s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.2128


 55%|█████▌    | 1278/2304 [4:03:33<6:35:17, 23.12s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0657


 56%|█████▌    | 1279/2304 [4:03:49<6:01:05, 21.14s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1087


 56%|█████▌    | 1280/2304 [4:04:05<5:35:31, 19.66s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1089


 56%|█████▌    | 1281/2304 [4:04:22<5:19:05, 18.71s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0726


 56%|█████▌    | 1282/2304 [4:04:48<5:54:50, 20.83s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0825


 56%|█████▌    | 1283/2304 [4:05:13<6:15:55, 22.09s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0869


 56%|█████▌    | 1284/2304 [4:05:38<6:33:19, 23.14s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0870


 56%|█████▌    | 1285/2304 [4:05:55<5:58:55, 21.13s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1363


 56%|█████▌    | 1286/2304 [4:06:11<5:33:10, 19.64s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0782


 56%|█████▌    | 1287/2304 [4:06:27<5:17:15, 18.72s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0882


 56%|█████▌    | 1288/2304 [4:06:53<5:51:57, 20.79s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0833


 56%|█████▌    | 1289/2304 [4:07:18<6:13:15, 22.06s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0801


 56%|█████▌    | 1290/2304 [4:07:44<6:32:35, 23.23s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0955


 56%|█████▌    | 1291/2304 [4:08:01<5:58:11, 21.22s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1122


 56%|█████▌    | 1292/2304 [4:08:17<5:31:48, 19.67s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0896


 56%|█████▌    | 1293/2304 [4:08:33<5:15:31, 18.73s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0849


 56%|█████▌    | 1294/2304 [4:08:59<5:49:17, 20.75s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0908


 56%|█████▌    | 1295/2304 [4:09:24<6:11:00, 22.06s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0880


 56%|█████▋    | 1296/2304 [4:09:49<6:27:44, 23.08s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0941


 56%|█████▋    | 1297/2304 [4:10:05<5:52:33, 21.01s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0858


 56%|█████▋    | 1298/2304 [4:10:21<5:25:57, 19.44s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0929


 56%|█████▋    | 1299/2304 [4:10:37<5:09:59, 18.51s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0750


 56%|█████▋    | 1300/2304 [4:11:03<5:44:16, 20.57s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1497


 56%|█████▋    | 1301/2304 [4:11:28<6:05:48, 21.88s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1003


 57%|█████▋    | 1302/2304 [4:11:53<6:23:48, 22.98s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0763


 57%|█████▋    | 1303/2304 [4:12:10<5:50:43, 21.02s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0907


 57%|█████▋    | 1304/2304 [4:12:26<5:25:54, 19.55s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1278


 57%|█████▋    | 1305/2304 [4:12:42<5:10:27, 18.65s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0869


 57%|█████▋    | 1306/2304 [4:13:08<5:44:06, 20.69s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0813


 57%|█████▋    | 1307/2304 [4:13:33<6:03:56, 21.90s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0686


 57%|█████▋    | 1308/2304 [4:13:58<6:20:53, 22.95s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0657


 57%|█████▋    | 1309/2304 [4:14:15<5:48:27, 21.01s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0838


 57%|█████▋    | 1310/2304 [4:14:31<5:23:18, 19.52s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1830


 57%|█████▋    | 1311/2304 [4:14:47<5:08:29, 18.64s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1058


 57%|█████▋    | 1312/2304 [4:15:13<5:42:41, 20.73s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1645


 57%|█████▋    | 1313/2304 [4:15:38<6:04:37, 22.08s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0937


 57%|█████▋    | 1314/2304 [4:16:03<6:20:54, 23.09s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0889


 57%|█████▋    | 1315/2304 [4:16:20<5:46:37, 21.03s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0974


 57%|█████▋    | 1316/2304 [4:16:36<5:20:53, 19.49s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0910


 57%|█████▋    | 1317/2304 [4:16:53<5:11:24, 18.93s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1274


 57%|█████▋    | 1318/2304 [4:17:21<5:53:10, 21.49s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0747


 57%|█████▋    | 1319/2304 [4:17:46<6:13:51, 22.77s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0936


 57%|█████▋    | 1320/2304 [4:18:12<6:27:31, 23.63s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0794


 57%|█████▋    | 1321/2304 [4:18:28<5:51:33, 21.46s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0706


 57%|█████▋    | 1322/2304 [4:18:45<5:25:07, 19.86s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0967


 57%|█████▋    | 1323/2304 [4:19:01<5:08:06, 18.85s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0979


 57%|█████▋    | 1324/2304 [4:19:26<5:40:15, 20.83s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0656


 58%|█████▊    | 1325/2304 [4:19:52<6:01:18, 22.14s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0727


 58%|█████▊    | 1326/2304 [4:20:17<6:17:51, 23.18s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0655


 58%|█████▊    | 1327/2304 [4:20:34<5:44:28, 21.16s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1063


 58%|█████▊    | 1328/2304 [4:20:50<5:21:48, 19.78s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1082


 58%|█████▊    | 1329/2304 [4:21:07<5:06:23, 18.85s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0829


 58%|█████▊    | 1330/2304 [4:21:33<5:41:31, 21.04s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.2722


 58%|█████▊    | 1331/2304 [4:21:59<6:03:20, 22.41s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0854


 58%|█████▊    | 1332/2304 [4:22:25<6:19:36, 23.43s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0737


 58%|█████▊    | 1333/2304 [4:22:41<5:45:16, 21.34s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0841


 58%|█████▊    | 1334/2304 [4:22:57<5:19:35, 19.77s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0878


 58%|█████▊    | 1335/2304 [4:23:14<5:02:53, 18.76s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1076


 58%|█████▊    | 1336/2304 [4:23:39<5:36:32, 20.86s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0942


 58%|█████▊    | 1337/2304 [4:24:04<5:56:34, 22.12s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1541


 58%|█████▊    | 1338/2304 [4:24:30<6:13:52, 23.22s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0994


 58%|█████▊    | 1339/2304 [4:24:47<5:42:27, 21.29s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0885


 58%|█████▊    | 1340/2304 [4:25:03<5:16:09, 19.68s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1289


 58%|█████▊    | 1341/2304 [4:25:19<5:00:04, 18.70s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1071


 58%|█████▊    | 1342/2304 [4:25:45<5:33:20, 20.79s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1329


 58%|█████▊    | 1343/2304 [4:26:10<5:54:42, 22.15s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1304


 58%|█████▊    | 1344/2304 [4:26:36<6:12:21, 23.27s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0873


 58%|█████▊    | 1345/2304 [4:26:53<5:38:52, 21.20s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1127


 58%|█████▊    | 1346/2304 [4:27:09<5:14:20, 19.69s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0925


 58%|█████▊    | 1347/2304 [4:27:25<4:58:16, 18.70s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1183


 59%|█████▊    | 1348/2304 [4:27:50<5:30:00, 20.71s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0661


 59%|█████▊    | 1349/2304 [4:28:16<5:50:38, 22.03s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0696


 59%|█████▊    | 1350/2304 [4:28:41<6:07:28, 23.11s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0657


 59%|█████▊    | 1351/2304 [4:28:57<5:34:29, 21.06s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0859


 59%|█████▊    | 1352/2304 [4:29:13<5:10:01, 19.54s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0893


 59%|█████▊    | 1353/2304 [4:29:30<4:55:16, 18.63s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0767


 59%|█████▉    | 1354/2304 [4:29:56<5:29:31, 20.81s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0663


 59%|█████▉    | 1355/2304 [4:30:21<5:50:22, 22.15s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1745


 59%|█████▉    | 1356/2304 [4:30:47<6:06:40, 23.21s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0660


 59%|█████▉    | 1357/2304 [4:31:03<5:33:57, 21.16s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0983


 59%|█████▉    | 1358/2304 [4:31:19<5:08:48, 19.59s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0991


 59%|█████▉    | 1359/2304 [4:31:35<4:53:08, 18.61s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0985


 59%|█████▉    | 1360/2304 [4:32:01<5:25:51, 20.71s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0981


 59%|█████▉    | 1361/2304 [4:32:26<5:46:50, 22.07s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0879


 59%|█████▉    | 1362/2304 [4:32:52<6:04:19, 23.21s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0728


 59%|█████▉    | 1363/2304 [4:33:09<5:31:39, 21.15s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0885


 59%|█████▉    | 1364/2304 [4:33:25<5:07:11, 19.61s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1341


 59%|█████▉    | 1365/2304 [4:33:41<4:51:57, 18.66s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0769


 59%|█████▉    | 1366/2304 [4:34:07<5:24:42, 20.77s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0677


 59%|█████▉    | 1367/2304 [4:34:32<5:44:50, 22.08s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0812


 59%|█████▉    | 1368/2304 [4:34:57<6:00:52, 23.13s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0728


 59%|█████▉    | 1369/2304 [4:35:14<5:29:25, 21.14s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0832


 59%|█████▉    | 1370/2304 [4:35:30<5:05:23, 19.62s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0824


 60%|█████▉    | 1371/2304 [4:35:46<4:50:08, 18.66s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0874


 60%|█████▉    | 1372/2304 [4:36:12<5:22:16, 20.75s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0725


 60%|█████▉    | 1373/2304 [4:36:37<5:43:05, 22.11s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0661


 60%|█████▉    | 1374/2304 [4:37:03<5:59:31, 23.19s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0664


 60%|█████▉    | 1375/2304 [4:37:19<5:27:48, 21.17s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0940


 60%|█████▉    | 1376/2304 [4:37:35<5:03:19, 19.61s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0816


 60%|█████▉    | 1377/2304 [4:37:52<4:47:51, 18.63s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0953


 60%|█████▉    | 1378/2304 [4:38:17<5:19:39, 20.71s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0775


 60%|█████▉    | 1379/2304 [4:38:42<5:38:39, 21.97s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0678


 60%|█████▉    | 1380/2304 [4:39:08<5:55:32, 23.09s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0661


 60%|█████▉    | 1381/2304 [4:39:24<5:23:50, 21.05s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0925


 60%|█████▉    | 1382/2304 [4:39:40<5:00:14, 19.54s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0862


 60%|██████    | 1383/2304 [4:39:57<4:45:16, 18.58s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1132


 60%|██████    | 1384/2304 [4:40:22<5:18:17, 20.76s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0850


 60%|██████    | 1385/2304 [4:40:47<5:37:16, 22.02s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1481


 60%|██████    | 1386/2304 [4:41:13<5:53:28, 23.10s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0944


 60%|██████    | 1387/2304 [4:41:29<5:21:57, 21.07s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0729


 60%|██████    | 1388/2304 [4:41:45<4:58:15, 19.54s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0760


 60%|██████    | 1389/2304 [4:42:02<4:42:50, 18.55s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0908


 60%|██████    | 1390/2304 [4:42:27<5:14:17, 20.63s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0971


 60%|██████    | 1391/2304 [4:42:52<5:33:27, 21.91s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0843


 60%|██████    | 1392/2304 [4:43:18<5:50:15, 23.04s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0922


 60%|██████    | 1393/2304 [4:43:34<5:19:27, 21.04s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0653


 61%|██████    | 1394/2304 [4:43:50<4:56:37, 19.56s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0923


 61%|██████    | 1395/2304 [4:44:07<4:42:11, 18.63s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0753


 61%|██████    | 1396/2304 [4:44:32<5:12:38, 20.66s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0665


 61%|██████    | 1397/2304 [4:44:57<5:32:16, 21.98s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0667


 61%|██████    | 1398/2304 [4:45:23<5:48:37, 23.09s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1356


 61%|██████    | 1399/2304 [4:45:39<5:17:56, 21.08s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0812


 61%|██████    | 1400/2304 [4:45:55<4:54:56, 19.58s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0826


 61%|██████    | 1401/2304 [4:46:11<4:39:55, 18.60s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0829


 61%|██████    | 1402/2304 [4:46:38<5:14:23, 20.91s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0657


 61%|██████    | 1403/2304 [4:47:03<5:35:10, 22.32s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.2157


 61%|██████    | 1404/2304 [4:47:30<5:51:57, 23.46s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0662


 61%|██████    | 1405/2304 [4:47:46<5:20:16, 21.37s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0827


 61%|██████    | 1406/2304 [4:48:02<4:56:44, 19.83s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0920


 61%|██████    | 1407/2304 [4:48:19<4:42:34, 18.90s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0709


 61%|██████    | 1408/2304 [4:48:45<5:14:08, 21.04s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0873


 61%|██████    | 1409/2304 [4:49:11<5:34:07, 22.40s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0669


 61%|██████    | 1410/2304 [4:49:37<5:50:28, 23.52s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0901


 61%|██████    | 1411/2304 [4:49:53<5:19:36, 21.47s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0747


 61%|██████▏   | 1412/2304 [4:50:09<4:54:51, 19.83s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1297


 61%|██████▏   | 1413/2304 [4:50:26<4:40:42, 18.90s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0948


 61%|██████▏   | 1414/2304 [4:50:52<5:12:26, 21.06s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0869


 61%|██████▏   | 1415/2304 [4:51:18<5:31:06, 22.35s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0828


 61%|██████▏   | 1416/2304 [4:51:44<5:46:50, 23.44s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1861


 62%|██████▏   | 1417/2304 [4:52:00<5:15:54, 21.37s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0953


 62%|██████▏   | 1418/2304 [4:52:16<4:53:21, 19.87s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0790


 62%|██████▏   | 1419/2304 [4:52:33<4:38:28, 18.88s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0710


 62%|██████▏   | 1420/2304 [4:52:59<5:09:15, 20.99s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0661


 62%|██████▏   | 1421/2304 [4:53:25<5:29:27, 22.39s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0882


 62%|██████▏   | 1422/2304 [4:53:51<5:45:13, 23.48s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0662


 62%|██████▏   | 1423/2304 [4:54:07<5:14:40, 21.43s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1385


 62%|██████▏   | 1424/2304 [4:54:24<4:51:43, 19.89s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0808


 62%|██████▏   | 1425/2304 [4:54:40<4:36:52, 18.90s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1050


 62%|██████▏   | 1426/2304 [4:55:06<5:07:51, 21.04s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0664


 62%|██████▏   | 1427/2304 [4:55:32<5:27:18, 22.39s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0701


 62%|██████▏   | 1428/2304 [4:55:58<5:41:40, 23.40s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0660


 62%|██████▏   | 1429/2304 [4:56:14<5:11:31, 21.36s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0742


 62%|██████▏   | 1430/2304 [4:56:30<4:48:14, 19.79s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1203


 62%|██████▏   | 1431/2304 [4:56:47<4:34:21, 18.86s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1186


 62%|██████▏   | 1432/2304 [4:57:13<5:04:38, 20.96s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0864


 62%|██████▏   | 1433/2304 [4:57:38<5:23:53, 22.31s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0927


 62%|██████▏   | 1434/2304 [4:58:04<5:40:04, 23.45s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0662


 62%|██████▏   | 1435/2304 [4:58:21<5:10:04, 21.41s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1011


 62%|██████▏   | 1436/2304 [4:58:37<4:47:06, 19.85s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0917


 62%|██████▏   | 1437/2304 [4:58:54<4:32:56, 18.89s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0808


 62%|██████▏   | 1438/2304 [4:59:20<5:02:56, 20.99s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0898


 62%|██████▏   | 1439/2304 [4:59:45<5:21:48, 22.32s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0764


 62%|██████▎   | 1440/2304 [5:00:11<5:37:21, 23.43s/it]

Config: {'activation': 'gelu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0930


 63%|██████▎   | 1441/2304 [5:00:20<4:34:28, 19.08s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0971


 63%|██████▎   | 1442/2304 [5:00:29<3:49:05, 15.95s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0919


 63%|██████▎   | 1443/2304 [5:00:38<3:18:39, 13.84s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0892


 63%|██████▎   | 1444/2304 [5:00:51<3:17:12, 13.76s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0852


 63%|██████▎   | 1445/2304 [5:01:05<3:15:04, 13.63s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1754


 63%|██████▎   | 1446/2304 [5:01:18<3:14:26, 13.60s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0861


 63%|██████▎   | 1447/2304 [5:01:27<2:53:16, 12.13s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1049


 63%|██████▎   | 1448/2304 [5:01:35<2:37:32, 11.04s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0727


 63%|██████▎   | 1449/2304 [5:01:44<2:27:01, 10.32s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0876


 63%|██████▎   | 1450/2304 [5:01:57<2:39:56, 11.24s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0719


 63%|██████▎   | 1451/2304 [5:02:10<2:47:59, 11.82s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0734


 63%|██████▎   | 1452/2304 [5:02:24<2:54:20, 12.28s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0841


 63%|██████▎   | 1453/2304 [5:02:33<2:39:16, 11.23s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0911


 63%|██████▎   | 1454/2304 [5:02:41<2:27:51, 10.44s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0663


 63%|██████▎   | 1455/2304 [5:02:50<2:20:37,  9.94s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0671


 63%|██████▎   | 1456/2304 [5:03:03<2:34:38, 10.94s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.2214


 63%|██████▎   | 1457/2304 [5:03:17<2:45:35, 11.73s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0786


 63%|██████▎   | 1458/2304 [5:03:31<2:53:58, 12.34s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0722


 63%|██████▎   | 1459/2304 [5:03:39<2:39:08, 11.30s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1256


 63%|██████▎   | 1460/2304 [5:03:48<2:28:06, 10.53s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0736


 63%|██████▎   | 1461/2304 [5:03:57<2:20:45, 10.02s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0739


 63%|██████▎   | 1462/2304 [5:04:11<2:35:47, 11.10s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1085


 63%|██████▎   | 1463/2304 [5:04:24<2:44:55, 11.77s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0666


 64%|██████▎   | 1464/2304 [5:04:38<2:53:28, 12.39s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1123


 64%|██████▎   | 1465/2304 [5:04:47<2:39:22, 11.40s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0897


 64%|██████▎   | 1466/2304 [5:04:56<2:27:59, 10.60s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0814


 64%|██████▎   | 1467/2304 [5:05:05<2:21:01, 10.11s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0856


 64%|██████▎   | 1468/2304 [5:05:18<2:35:41, 11.17s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0836


 64%|██████▍   | 1469/2304 [5:05:32<2:45:16, 11.88s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0745


 64%|██████▍   | 1470/2304 [5:05:46<2:52:53, 12.44s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0753


 64%|██████▍   | 1471/2304 [5:05:54<2:38:10, 11.39s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0748


 64%|██████▍   | 1472/2304 [5:06:03<2:27:17, 10.62s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0821


 64%|██████▍   | 1473/2304 [5:06:12<2:20:07, 10.12s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0761


 64%|██████▍   | 1474/2304 [5:06:26<2:34:42, 11.18s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0719


 64%|██████▍   | 1475/2304 [5:06:39<2:43:47, 11.85s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0706


 64%|██████▍   | 1476/2304 [5:06:53<2:51:36, 12.43s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0811


 64%|██████▍   | 1477/2304 [5:07:02<2:37:19, 11.41s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0715


 64%|██████▍   | 1478/2304 [5:07:11<2:26:26, 10.64s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0706


 64%|██████▍   | 1479/2304 [5:07:20<2:19:32, 10.15s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0689


 64%|██████▍   | 1480/2304 [5:07:34<2:33:55, 11.21s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1967


 64%|██████▍   | 1481/2304 [5:07:47<2:42:53, 11.87s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0691


 64%|██████▍   | 1482/2304 [5:08:01<2:50:15, 12.43s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0823


 64%|██████▍   | 1483/2304 [5:08:10<2:35:41, 11.38s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0844


 64%|██████▍   | 1484/2304 [5:08:19<2:24:57, 10.61s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0752


 64%|██████▍   | 1485/2304 [5:08:27<2:17:55, 10.10s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1004


 64%|██████▍   | 1486/2304 [5:08:41<2:32:19, 11.17s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0935


 65%|██████▍   | 1487/2304 [5:08:54<2:40:51, 11.81s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0764


 65%|██████▍   | 1488/2304 [5:09:08<2:48:42, 12.41s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0793


 65%|██████▍   | 1489/2304 [5:09:17<2:34:13, 11.35s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0847


 65%|██████▍   | 1490/2304 [5:09:26<2:23:03, 10.55s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0878


 65%|██████▍   | 1491/2304 [5:09:35<2:16:13, 10.05s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0800


 65%|██████▍   | 1492/2304 [5:09:48<2:30:21, 11.11s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0774


 65%|██████▍   | 1493/2304 [5:10:02<2:38:55, 11.76s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0665


 65%|██████▍   | 1494/2304 [5:10:15<2:45:44, 12.28s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0645


 65%|██████▍   | 1495/2304 [5:10:24<2:32:10, 11.29s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0868


 65%|██████▍   | 1496/2304 [5:10:33<2:20:58, 10.47s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0795


 65%|██████▍   | 1497/2304 [5:10:41<2:14:16,  9.98s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0898


 65%|██████▌   | 1498/2304 [5:10:55<2:28:27, 11.05s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0810


 65%|██████▌   | 1499/2304 [5:11:08<2:37:00, 11.70s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0781


 65%|██████▌   | 1500/2304 [5:11:22<2:44:21, 12.27s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0652


 65%|██████▌   | 1501/2304 [5:11:31<2:30:38, 11.26s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1121


 65%|██████▌   | 1502/2304 [5:11:39<2:20:02, 10.48s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0874


 65%|██████▌   | 1503/2304 [5:11:48<2:13:38, 10.01s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0819


 65%|██████▌   | 1504/2304 [5:12:02<2:27:41, 11.08s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1218


 65%|██████▌   | 1505/2304 [5:12:15<2:36:07, 11.72s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0901


 65%|██████▌   | 1506/2304 [5:12:29<2:42:51, 12.25s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0800


 65%|██████▌   | 1507/2304 [5:12:37<2:29:32, 11.26s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0730


 65%|██████▌   | 1508/2304 [5:12:46<2:18:53, 10.47s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0732


 65%|██████▌   | 1509/2304 [5:12:55<2:12:09,  9.97s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0739


 66%|██████▌   | 1510/2304 [5:13:08<2:25:52, 11.02s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0881


 66%|██████▌   | 1511/2304 [5:13:22<2:34:23, 11.68s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0701


 66%|██████▌   | 1512/2304 [5:13:35<2:41:29, 12.23s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0831


 66%|██████▌   | 1513/2304 [5:13:44<2:28:29, 11.26s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0832


 66%|██████▌   | 1514/2304 [5:13:53<2:18:46, 10.54s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0695


 66%|██████▌   | 1515/2304 [5:14:02<2:12:23, 10.07s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1007


 66%|██████▌   | 1516/2304 [5:14:16<2:26:54, 11.19s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0807


 66%|██████▌   | 1517/2304 [5:14:29<2:35:19, 11.84s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0752


 66%|██████▌   | 1518/2304 [5:14:43<2:42:39, 12.42s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0640


 66%|██████▌   | 1519/2304 [5:14:52<2:28:57, 11.39s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0775


 66%|██████▌   | 1520/2304 [5:15:01<2:18:27, 10.60s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0973


 66%|██████▌   | 1521/2304 [5:15:10<2:11:39, 10.09s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0831


 66%|██████▌   | 1522/2304 [5:15:23<2:25:34, 11.17s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0745


 66%|██████▌   | 1523/2304 [5:15:37<2:34:05, 11.84s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0672


 66%|██████▌   | 1524/2304 [5:15:50<2:40:48, 12.37s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0796


 66%|██████▌   | 1525/2304 [5:15:59<2:27:07, 11.33s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1139


 66%|██████▌   | 1526/2304 [5:16:08<2:17:11, 10.58s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0684


 66%|██████▋   | 1527/2304 [5:16:17<2:10:42, 10.09s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1433


 66%|██████▋   | 1528/2304 [5:16:31<2:24:27, 11.17s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0921


 66%|██████▋   | 1529/2304 [5:16:44<2:33:10, 11.86s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0752


 66%|██████▋   | 1530/2304 [5:16:58<2:39:26, 12.36s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0886


 66%|██████▋   | 1531/2304 [5:17:07<2:26:40, 11.38s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0787


 66%|██████▋   | 1532/2304 [5:17:15<2:16:27, 10.61s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0746


 67%|██████▋   | 1533/2304 [5:17:24<2:10:00, 10.12s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0899


 67%|██████▋   | 1534/2304 [5:17:38<2:23:02, 11.15s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0761


 67%|██████▋   | 1535/2304 [5:17:51<2:30:59, 11.78s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0781


 67%|██████▋   | 1536/2304 [5:18:05<2:38:20, 12.37s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0905


 67%|██████▋   | 1537/2304 [5:18:14<2:25:08, 11.35s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0849


 67%|██████▋   | 1538/2304 [5:18:23<2:15:21, 10.60s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0693


 67%|██████▋   | 1539/2304 [5:18:32<2:09:31, 10.16s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0920


 67%|██████▋   | 1540/2304 [5:18:46<2:22:55, 11.22s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0679


 67%|██████▋   | 1541/2304 [5:18:59<2:30:44, 11.85s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0704


 67%|██████▋   | 1542/2304 [5:19:13<2:37:57, 12.44s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0849


 67%|██████▋   | 1543/2304 [5:19:22<2:24:29, 11.39s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0754


 67%|██████▋   | 1544/2304 [5:19:31<2:14:34, 10.62s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0768


 67%|██████▋   | 1545/2304 [5:19:40<2:08:06, 10.13s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0817


 67%|██████▋   | 1546/2304 [5:19:53<2:21:31, 11.20s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0763


 67%|██████▋   | 1547/2304 [5:20:07<2:29:51, 11.88s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0731


 67%|██████▋   | 1548/2304 [5:20:21<2:36:56, 12.46s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0662


 67%|██████▋   | 1549/2304 [5:20:29<2:23:24, 11.40s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1456


 67%|██████▋   | 1550/2304 [5:20:38<2:13:14, 10.60s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0819


 67%|██████▋   | 1551/2304 [5:20:47<2:06:34, 10.09s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1415


 67%|██████▋   | 1552/2304 [5:21:01<2:20:39, 11.22s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0788


 67%|██████▋   | 1553/2304 [5:21:14<2:28:18, 11.85s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0734


 67%|██████▋   | 1554/2304 [5:21:28<2:35:06, 12.41s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1204


 67%|██████▋   | 1555/2304 [5:21:37<2:21:45, 11.36s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0782


 68%|██████▊   | 1556/2304 [5:21:46<2:11:28, 10.55s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1036


 68%|██████▊   | 1557/2304 [5:21:54<2:05:20, 10.07s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1259


 68%|██████▊   | 1558/2304 [5:22:08<2:18:07, 11.11s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0757


 68%|██████▊   | 1559/2304 [5:22:21<2:25:58, 11.76s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0823


 68%|██████▊   | 1560/2304 [5:22:35<2:32:38, 12.31s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0726


 68%|██████▊   | 1561/2304 [5:22:44<2:20:15, 11.33s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0724


 68%|██████▊   | 1562/2304 [5:22:53<2:10:17, 10.54s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0872


 68%|██████▊   | 1563/2304 [5:23:02<2:04:08, 10.05s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1006


 68%|██████▊   | 1564/2304 [5:23:15<2:17:27, 11.14s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0656


 68%|██████▊   | 1565/2304 [5:23:29<2:25:48, 11.84s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0715


 68%|██████▊   | 1566/2304 [5:23:42<2:32:10, 12.37s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0918


 68%|██████▊   | 1567/2304 [5:23:51<2:19:13, 11.34s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0876


 68%|██████▊   | 1568/2304 [5:24:00<2:09:25, 10.55s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0950


 68%|██████▊   | 1569/2304 [5:24:09<2:03:58, 10.12s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0763


 68%|██████▊   | 1570/2304 [5:24:23<2:17:06, 11.21s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0656


 68%|██████▊   | 1571/2304 [5:24:36<2:25:16, 11.89s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0799


 68%|██████▊   | 1572/2304 [5:24:50<2:32:23, 12.49s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0654


 68%|██████▊   | 1573/2304 [5:24:59<2:18:57, 11.41s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1237


 68%|██████▊   | 1574/2304 [5:25:08<2:08:45, 10.58s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0674


 68%|██████▊   | 1575/2304 [5:25:17<2:02:28, 10.08s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1326


 68%|██████▊   | 1576/2304 [5:25:31<2:16:27, 11.25s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0683


 68%|██████▊   | 1577/2304 [5:25:44<2:24:20, 11.91s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0756


 68%|██████▊   | 1578/2304 [5:25:58<2:31:09, 12.49s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0733


 69%|██████▊   | 1579/2304 [5:26:07<2:18:04, 11.43s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0849


 69%|██████▊   | 1580/2304 [5:26:16<2:08:23, 10.64s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1023


 69%|██████▊   | 1581/2304 [5:26:25<2:01:56, 10.12s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0735


 69%|██████▊   | 1582/2304 [5:26:39<2:16:36, 11.35s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0912


 69%|██████▊   | 1583/2304 [5:26:53<2:26:08, 12.16s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0751


 69%|██████▉   | 1584/2304 [5:27:07<2:33:31, 12.79s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0696


 69%|██████▉   | 1585/2304 [5:27:16<2:20:30, 11.72s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0822


 69%|██████▉   | 1586/2304 [5:27:25<2:10:23, 10.90s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0826


 69%|██████▉   | 1587/2304 [5:27:34<2:03:45, 10.36s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0711


 69%|██████▉   | 1588/2304 [5:27:48<2:16:27, 11.44s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0849


 69%|██████▉   | 1589/2304 [5:28:02<2:23:18, 12.03s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0939


 69%|██████▉   | 1590/2304 [5:28:15<2:28:59, 12.52s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0761


 69%|██████▉   | 1591/2304 [5:28:24<2:15:47, 11.43s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1154


 69%|██████▉   | 1592/2304 [5:28:33<2:05:53, 10.61s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0785


 69%|██████▉   | 1593/2304 [5:28:42<1:59:47, 10.11s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0973


 69%|██████▉   | 1594/2304 [5:28:56<2:12:37, 11.21s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0658


 69%|██████▉   | 1595/2304 [5:29:10<2:23:17, 12.13s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0687


 69%|██████▉   | 1596/2304 [5:29:25<2:33:39, 13.02s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0655


 69%|██████▉   | 1597/2304 [5:29:34<2:18:41, 11.77s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0715


 69%|██████▉   | 1598/2304 [5:29:43<2:07:55, 10.87s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0803


 69%|██████▉   | 1599/2304 [5:29:52<2:00:38, 10.27s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1860


 69%|██████▉   | 1600/2304 [5:30:05<2:13:13, 11.36s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0945


 69%|██████▉   | 1601/2304 [5:30:21<2:26:02, 12.46s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0713


 70%|██████▉   | 1602/2304 [5:30:35<2:32:26, 13.03s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0763


 70%|██████▉   | 1603/2304 [5:30:44<2:17:47, 11.79s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1280


 70%|██████▉   | 1604/2304 [5:30:53<2:06:57, 10.88s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0771


 70%|██████▉   | 1605/2304 [5:31:01<1:59:47, 10.28s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0808


 70%|██████▉   | 1606/2304 [5:31:15<2:11:24, 11.30s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0791


 70%|██████▉   | 1607/2304 [5:31:28<2:18:28, 11.92s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0731


 70%|██████▉   | 1608/2304 [5:31:42<2:24:27, 12.45s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0940


 70%|██████▉   | 1609/2304 [5:31:51<2:12:39, 11.45s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1056


 70%|██████▉   | 1610/2304 [5:32:01<2:04:49, 10.79s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0902


 70%|██████▉   | 1611/2304 [5:32:11<2:02:18, 10.59s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1038


 70%|██████▉   | 1612/2304 [5:32:25<2:16:21, 11.82s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0657


 70%|███████   | 1613/2304 [5:32:39<2:21:56, 12.32s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0716


 70%|███████   | 1614/2304 [5:32:54<2:32:25, 13.25s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0750


 70%|███████   | 1615/2304 [5:33:04<2:20:12, 12.21s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0780


 70%|███████   | 1616/2304 [5:33:13<2:08:14, 11.18s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0868


 70%|███████   | 1617/2304 [5:33:22<2:01:07, 10.58s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0786


 70%|███████   | 1618/2304 [5:33:36<2:11:56, 11.54s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0655


 70%|███████   | 1619/2304 [5:33:49<2:18:25, 12.12s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0744


 70%|███████   | 1620/2304 [5:34:03<2:23:07, 12.56s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0770


 70%|███████   | 1621/2304 [5:34:12<2:10:17, 11.45s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0744


 70%|███████   | 1622/2304 [5:34:20<2:00:36, 10.61s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1144


 70%|███████   | 1623/2304 [5:34:29<1:54:40, 10.10s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0912


 70%|███████   | 1624/2304 [5:34:43<2:07:09, 11.22s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1539


 71%|███████   | 1625/2304 [5:34:56<2:13:59, 11.84s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0884


 71%|███████   | 1626/2304 [5:35:11<2:24:12, 12.76s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0819


 71%|███████   | 1627/2304 [5:35:21<2:15:13, 11.98s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0744


 71%|███████   | 1628/2304 [5:35:31<2:08:08, 11.37s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1006


 71%|███████   | 1629/2304 [5:35:41<2:00:55, 10.75s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0744


 71%|███████   | 1630/2304 [5:35:55<2:12:25, 11.79s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0880


 71%|███████   | 1631/2304 [5:36:08<2:17:57, 12.30s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0754


 71%|███████   | 1632/2304 [5:36:22<2:22:55, 12.76s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0845


 71%|███████   | 1633/2304 [5:36:31<2:09:45, 11.60s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0751


 71%|███████   | 1634/2304 [5:36:40<1:59:20, 10.69s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0738


 71%|███████   | 1635/2304 [5:36:49<1:53:20, 10.16s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0772


 71%|███████   | 1636/2304 [5:37:02<2:03:59, 11.14s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0658


 71%|███████   | 1637/2304 [5:37:15<2:10:43, 11.76s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0645


 71%|███████   | 1638/2304 [5:37:29<2:16:22, 12.29s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0659


 71%|███████   | 1639/2304 [5:37:38<2:04:47, 11.26s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0987


 71%|███████   | 1640/2304 [5:37:46<1:56:16, 10.51s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0841


 71%|███████   | 1641/2304 [5:37:55<1:50:47, 10.03s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0677


 71%|███████▏  | 1642/2304 [5:38:09<2:02:09, 11.07s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0658


 71%|███████▏  | 1643/2304 [5:38:22<2:09:16, 11.73s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0700


 71%|███████▏  | 1644/2304 [5:38:36<2:15:08, 12.29s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0660


 71%|███████▏  | 1645/2304 [5:38:45<2:03:57, 11.29s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0958


 71%|███████▏  | 1646/2304 [5:38:53<1:55:09, 10.50s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0838


 71%|███████▏  | 1647/2304 [5:39:02<1:49:30, 10.00s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0730


 72%|███████▏  | 1648/2304 [5:39:16<2:00:44, 11.04s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0845


 72%|███████▏  | 1649/2304 [5:39:29<2:07:15, 11.66s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0840


 72%|███████▏  | 1650/2304 [5:39:42<2:13:31, 12.25s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0819


 72%|███████▏  | 1651/2304 [5:39:51<2:02:20, 11.24s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0791


 72%|███████▏  | 1652/2304 [5:40:00<1:54:04, 10.50s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0965


 72%|███████▏  | 1653/2304 [5:40:09<1:48:26,  9.99s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0727


 72%|███████▏  | 1654/2304 [5:40:22<1:59:35, 11.04s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0783


 72%|███████▏  | 1655/2304 [5:40:35<2:06:08, 11.66s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0777


 72%|███████▏  | 1656/2304 [5:40:49<2:12:22, 12.26s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0768


 72%|███████▏  | 1657/2304 [5:40:58<2:01:06, 11.23s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0952


 72%|███████▏  | 1658/2304 [5:41:07<1:52:48, 10.48s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0724


 72%|███████▏  | 1659/2304 [5:41:15<1:47:00,  9.95s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0668


 72%|███████▏  | 1660/2304 [5:41:29<1:58:43, 11.06s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0660


 72%|███████▏  | 1661/2304 [5:41:42<2:05:36, 11.72s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0666


 72%|███████▏  | 1662/2304 [5:41:56<2:10:49, 12.23s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0695


 72%|███████▏  | 1663/2304 [5:42:05<1:59:58, 11.23s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0820


 72%|███████▏  | 1664/2304 [5:42:13<1:51:16, 10.43s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0740


 72%|███████▏  | 1665/2304 [5:42:22<1:45:57,  9.95s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0751


 72%|███████▏  | 1666/2304 [5:42:35<1:57:04, 11.01s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0837


 72%|███████▏  | 1667/2304 [5:42:49<2:04:02, 11.68s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0700


 72%|███████▏  | 1668/2304 [5:43:02<2:09:48, 12.25s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0662


 72%|███████▏  | 1669/2304 [5:43:11<1:58:55, 11.24s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0721


 72%|███████▏  | 1670/2304 [5:43:20<1:50:34, 10.46s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0924


 73%|███████▎  | 1671/2304 [5:43:29<1:45:38, 10.01s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0702


 73%|███████▎  | 1672/2304 [5:43:42<1:56:48, 11.09s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1401


 73%|███████▎  | 1673/2304 [5:43:56<2:03:46, 11.77s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0897


 73%|███████▎  | 1674/2304 [5:44:09<2:09:23, 12.32s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0695


 73%|███████▎  | 1675/2304 [5:44:18<1:58:14, 11.28s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0884


 73%|███████▎  | 1676/2304 [5:44:27<1:49:54, 10.50s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1025


 73%|███████▎  | 1677/2304 [5:44:36<1:45:02, 10.05s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0847


 73%|███████▎  | 1678/2304 [5:44:49<1:56:03, 11.12s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0750


 73%|███████▎  | 1679/2304 [5:45:03<2:02:25, 11.75s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0759


 73%|███████▎  | 1680/2304 [5:45:16<2:08:37, 12.37s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0658


 73%|███████▎  | 1681/2304 [5:45:25<1:57:42, 11.34s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0789


 73%|███████▎  | 1682/2304 [5:45:34<1:50:30, 10.66s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0776


 73%|███████▎  | 1683/2304 [5:45:44<1:46:23, 10.28s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0684


 73%|███████▎  | 1684/2304 [5:45:59<2:02:32, 11.86s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0774


 73%|███████▎  | 1685/2304 [5:46:14<2:11:14, 12.72s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0658


 73%|███████▎  | 1686/2304 [5:46:29<2:16:23, 13.24s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0658


 73%|███████▎  | 1687/2304 [5:46:38<2:03:36, 12.02s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0763


 73%|███████▎  | 1688/2304 [5:46:47<1:53:27, 11.05s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0783


 73%|███████▎  | 1689/2304 [5:46:55<1:46:42, 10.41s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0792


 73%|███████▎  | 1690/2304 [5:47:09<1:56:37, 11.40s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0659


 73%|███████▎  | 1691/2304 [5:47:22<2:01:35, 11.90s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0658


 73%|███████▎  | 1692/2304 [5:47:36<2:05:27, 12.30s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0818


 73%|███████▎  | 1693/2304 [5:47:44<1:54:41, 11.26s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0874


 74%|███████▎  | 1694/2304 [5:47:53<1:46:00, 10.43s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0971


 74%|███████▎  | 1695/2304 [5:48:02<1:40:56,  9.94s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0846


 74%|███████▎  | 1696/2304 [5:48:15<1:52:17, 11.08s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0720


 74%|███████▎  | 1697/2304 [5:48:28<1:58:06, 11.67s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0670


 74%|███████▎  | 1698/2304 [5:48:42<2:02:43, 12.15s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1338


 74%|███████▎  | 1699/2304 [5:48:50<1:52:14, 11.13s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0763


 74%|███████▍  | 1700/2304 [5:48:59<1:44:20, 10.37s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0719


 74%|███████▍  | 1701/2304 [5:49:08<1:39:14,  9.88s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0847


 74%|███████▍  | 1702/2304 [5:49:21<1:49:41, 10.93s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0788


 74%|███████▍  | 1703/2304 [5:49:34<1:56:00, 11.58s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0716


 74%|███████▍  | 1704/2304 [5:49:48<2:00:57, 12.10s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0833


 74%|███████▍  | 1705/2304 [5:49:56<1:50:31, 11.07s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0665


 74%|███████▍  | 1706/2304 [5:50:05<1:42:42, 10.30s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0744


 74%|███████▍  | 1707/2304 [5:50:13<1:37:40,  9.82s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0745


 74%|███████▍  | 1708/2304 [5:50:27<1:47:44, 10.85s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0656


 74%|███████▍  | 1709/2304 [5:50:40<1:54:09, 11.51s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0663


 74%|███████▍  | 1710/2304 [5:50:53<1:59:04, 12.03s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0657


 74%|███████▍  | 1711/2304 [5:51:02<1:49:30, 11.08s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0673


 74%|███████▍  | 1712/2304 [5:51:10<1:41:52, 10.33s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0892


 74%|███████▍  | 1713/2304 [5:51:19<1:36:43,  9.82s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0807


 74%|███████▍  | 1714/2304 [5:51:32<1:47:06, 10.89s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0734


 74%|███████▍  | 1715/2304 [5:51:46<1:53:19, 11.54s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0661


 74%|███████▍  | 1716/2304 [5:51:59<1:58:26, 12.09s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0658


 75%|███████▍  | 1717/2304 [5:52:08<1:48:27, 11.09s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0734


 75%|███████▍  | 1718/2304 [5:52:16<1:41:06, 10.35s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0807


 75%|███████▍  | 1719/2304 [5:52:25<1:36:22,  9.88s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0677


 75%|███████▍  | 1720/2304 [5:52:38<1:46:10, 10.91s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1048


 75%|███████▍  | 1721/2304 [5:52:51<1:52:07, 11.54s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0707


 75%|███████▍  | 1722/2304 [5:53:05<1:57:17, 12.09s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0672


 75%|███████▍  | 1723/2304 [5:53:14<1:47:34, 11.11s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0752


 75%|███████▍  | 1724/2304 [5:53:22<1:40:15, 10.37s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0664


 75%|███████▍  | 1725/2304 [5:53:31<1:35:18,  9.88s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0820


 75%|███████▍  | 1726/2304 [5:53:44<1:45:03, 10.91s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0655


 75%|███████▍  | 1727/2304 [5:53:57<1:50:40, 11.51s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0758


 75%|███████▌  | 1728/2304 [5:54:10<1:55:36, 12.04s/it]

Config: {'activation': 'gelu', 'batch_size': 128, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0756


 75%|███████▌  | 1729/2304 [5:54:15<1:34:55,  9.91s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0797


 75%|███████▌  | 1730/2304 [5:54:20<1:20:16,  8.39s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0719


 75%|███████▌  | 1731/2304 [5:54:25<1:10:44,  7.41s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0869


 75%|███████▌  | 1732/2304 [5:54:33<1:10:05,  7.35s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0797


 75%|███████▌  | 1733/2304 [5:54:40<1:09:45,  7.33s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0744


 75%|███████▌  | 1734/2304 [5:54:47<1:09:29,  7.31s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0741


 75%|███████▌  | 1735/2304 [5:54:52<1:02:40,  6.61s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0756


 75%|███████▌  | 1736/2304 [5:54:57<58:04,  6.14s/it]  

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0872


 75%|███████▌  | 1737/2304 [5:55:02<54:33,  5.77s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0944


 75%|███████▌  | 1738/2304 [5:55:09<58:35,  6.21s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0883


 75%|███████▌  | 1739/2304 [5:55:16<1:01:24,  6.52s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0833


 76%|███████▌  | 1740/2304 [5:55:24<1:03:43,  6.78s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0863


 76%|███████▌  | 1741/2304 [5:55:29<58:45,  6.26s/it]  

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0806


 76%|███████▌  | 1742/2304 [5:55:34<54:45,  5.85s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0718


 76%|███████▌  | 1743/2304 [5:55:39<52:22,  5.60s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0765


 76%|███████▌  | 1744/2304 [5:55:46<57:36,  6.17s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0737


 76%|███████▌  | 1745/2304 [5:55:53<1:00:14,  6.47s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0713


 76%|███████▌  | 1746/2304 [5:56:01<1:02:52,  6.76s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0802


 76%|███████▌  | 1747/2304 [5:56:06<57:32,  6.20s/it]  

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0697


 76%|███████▌  | 1748/2304 [5:56:11<53:55,  5.82s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0729


 76%|███████▌  | 1749/2304 [5:56:16<51:53,  5.61s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0693


 76%|███████▌  | 1750/2304 [5:56:23<56:25,  6.11s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0787


 76%|███████▌  | 1751/2304 [5:56:30<58:54,  6.39s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0851


 76%|███████▌  | 1752/2304 [5:56:38<1:01:49,  6.72s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0740


 76%|███████▌  | 1753/2304 [5:56:43<57:02,  6.21s/it]  

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0916


 76%|███████▌  | 1754/2304 [5:56:48<53:40,  5.86s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0693


 76%|███████▌  | 1755/2304 [5:56:53<51:17,  5.61s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1064


 76%|███████▌  | 1756/2304 [5:57:00<55:51,  6.12s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0769


 76%|███████▋  | 1757/2304 [5:57:07<58:58,  6.47s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0734


 76%|███████▋  | 1758/2304 [5:57:15<1:01:14,  6.73s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0774


 76%|███████▋  | 1759/2304 [5:57:20<56:20,  6.20s/it]  

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0864


 76%|███████▋  | 1760/2304 [5:57:25<53:15,  5.87s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0848


 76%|███████▋  | 1761/2304 [5:57:30<50:39,  5.60s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0818


 76%|███████▋  | 1762/2304 [5:57:37<56:08,  6.21s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0715


 77%|███████▋  | 1763/2304 [5:57:45<58:53,  6.53s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0723


 77%|███████▋  | 1764/2304 [5:57:52<1:01:09,  6.80s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0816


 77%|███████▋  | 1765/2304 [5:57:57<56:31,  6.29s/it]  

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0695


 77%|███████▋  | 1766/2304 [5:58:02<52:34,  5.86s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0724


 77%|███████▋  | 1767/2304 [5:58:07<50:24,  5.63s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1064


 77%|███████▋  | 1768/2304 [5:58:14<54:48,  6.13s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0767


 77%|███████▋  | 1769/2304 [5:58:22<57:30,  6.45s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0807


 77%|███████▋  | 1770/2304 [5:58:29<1:00:20,  6.78s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0664


 77%|███████▋  | 1771/2304 [5:58:34<55:26,  6.24s/it]  

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0962


 77%|███████▋  | 1772/2304 [5:58:39<51:48,  5.84s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0740


 77%|███████▋  | 1773/2304 [5:58:44<49:47,  5.63s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0723


 77%|███████▋  | 1774/2304 [5:58:52<54:16,  6.14s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0684


 77%|███████▋  | 1775/2304 [5:58:59<57:15,  6.49s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0708


 77%|███████▋  | 1776/2304 [5:59:06<58:58,  6.70s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0724


 77%|███████▋  | 1777/2304 [5:59:11<54:13,  6.17s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0904


 77%|███████▋  | 1778/2304 [5:59:16<51:03,  5.82s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1111


 77%|███████▋  | 1779/2304 [5:59:21<48:29,  5.54s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0872


 77%|███████▋  | 1780/2304 [5:59:28<53:09,  6.09s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0785


 77%|███████▋  | 1781/2304 [5:59:36<56:19,  6.46s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1083


 77%|███████▋  | 1782/2304 [5:59:43<58:19,  6.70s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0805


 77%|███████▋  | 1783/2304 [5:59:48<54:09,  6.24s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0823


 77%|███████▋  | 1784/2304 [5:59:53<50:36,  5.84s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0810


 77%|███████▋  | 1785/2304 [5:59:58<48:16,  5.58s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1159


 78%|███████▊  | 1786/2304 [6:00:05<52:53,  6.13s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0820


 78%|███████▊  | 1787/2304 [6:00:12<55:16,  6.42s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0895


 78%|███████▊  | 1788/2304 [6:00:20<58:02,  6.75s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0755


 78%|███████▊  | 1789/2304 [6:00:25<53:17,  6.21s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0735


 78%|███████▊  | 1790/2304 [6:00:30<49:40,  5.80s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0961


 78%|███████▊  | 1791/2304 [6:00:35<47:51,  5.60s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0688


 78%|███████▊  | 1792/2304 [6:00:42<52:11,  6.12s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0769


 78%|███████▊  | 1793/2304 [6:00:49<55:12,  6.48s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0844


 78%|███████▊  | 1794/2304 [6:00:57<57:43,  6.79s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0834


 78%|███████▊  | 1795/2304 [6:01:02<53:01,  6.25s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0694


 78%|███████▊  | 1796/2304 [6:01:07<49:39,  5.87s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0705


 78%|███████▊  | 1797/2304 [6:01:12<47:18,  5.60s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0753


 78%|███████▊  | 1798/2304 [6:01:19<51:34,  6.12s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0825


 78%|███████▊  | 1799/2304 [6:01:27<54:55,  6.53s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0830


 78%|███████▊  | 1800/2304 [6:01:34<56:37,  6.74s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0717


 78%|███████▊  | 1801/2304 [6:01:39<52:07,  6.22s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1117


 78%|███████▊  | 1802/2304 [6:01:44<49:12,  5.88s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0808


 78%|███████▊  | 1803/2304 [6:01:49<47:04,  5.64s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1125


 78%|███████▊  | 1804/2304 [6:01:56<51:14,  6.15s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1175


 78%|███████▊  | 1805/2304 [6:02:04<53:38,  6.45s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0660


 78%|███████▊  | 1806/2304 [6:02:11<55:47,  6.72s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0939


 78%|███████▊  | 1807/2304 [6:02:16<51:44,  6.25s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0866


 78%|███████▊  | 1808/2304 [6:02:21<48:24,  5.86s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.1035


 79%|███████▊  | 1809/2304 [6:02:26<46:28,  5.63s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0837


 79%|███████▊  | 1810/2304 [6:02:34<50:31,  6.14s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0851


 79%|███████▊  | 1811/2304 [6:02:41<52:52,  6.43s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0688


 79%|███████▊  | 1812/2304 [6:02:48<55:21,  6.75s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1808


 79%|███████▊  | 1813/2304 [6:02:53<50:56,  6.23s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0796


 79%|███████▊  | 1814/2304 [6:02:58<47:39,  5.84s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0830


 79%|███████▉  | 1815/2304 [6:03:03<45:59,  5.64s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0765


 79%|███████▉  | 1816/2304 [6:03:11<49:54,  6.14s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0795


 79%|███████▉  | 1817/2304 [6:03:18<52:46,  6.50s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0826


 79%|███████▉  | 1818/2304 [6:03:25<54:45,  6.76s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0718


 79%|███████▉  | 1819/2304 [6:03:30<50:19,  6.23s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1020


 79%|███████▉  | 1820/2304 [6:03:35<47:24,  5.88s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0863


 79%|███████▉  | 1821/2304 [6:03:40<45:20,  5.63s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0735


 79%|███████▉  | 1822/2304 [6:03:48<49:26,  6.16s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1347


 79%|███████▉  | 1823/2304 [6:03:55<52:08,  6.50s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0707


 79%|███████▉  | 1824/2304 [6:04:02<54:09,  6.77s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1111


 79%|███████▉  | 1825/2304 [6:04:08<50:02,  6.27s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0798


 79%|███████▉  | 1826/2304 [6:04:12<46:33,  5.84s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0756


 79%|███████▉  | 1827/2304 [6:04:17<44:19,  5.58s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0965


 79%|███████▉  | 1828/2304 [6:04:25<48:41,  6.14s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1045


 79%|███████▉  | 1829/2304 [6:04:32<50:54,  6.43s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0753


 79%|███████▉  | 1830/2304 [6:04:39<53:27,  6.77s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0944


 79%|███████▉  | 1831/2304 [6:04:44<49:05,  6.23s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0938


 80%|███████▉  | 1832/2304 [6:04:49<45:51,  5.83s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0716


 80%|███████▉  | 1833/2304 [6:04:54<44:05,  5.62s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0804


 80%|███████▉  | 1834/2304 [6:05:02<47:57,  6.12s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0982


 80%|███████▉  | 1835/2304 [6:05:09<50:17,  6.43s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0978


 80%|███████▉  | 1836/2304 [6:05:16<52:39,  6.75s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0763


 80%|███████▉  | 1837/2304 [6:05:21<48:26,  6.22s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1693


 80%|███████▉  | 1838/2304 [6:05:26<45:26,  5.85s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0894


 80%|███████▉  | 1839/2304 [6:05:31<43:11,  5.57s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0997


 80%|███████▉  | 1840/2304 [6:05:39<46:59,  6.08s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0969


 80%|███████▉  | 1841/2304 [6:05:46<49:44,  6.45s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0688


 80%|███████▉  | 1842/2304 [6:05:53<51:42,  6.72s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0959


 80%|███████▉  | 1843/2304 [6:05:58<47:27,  6.18s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1549


 80%|████████  | 1844/2304 [6:06:03<44:41,  5.83s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0700


 80%|████████  | 1845/2304 [6:06:08<42:43,  5.59s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0816


 80%|████████  | 1846/2304 [6:06:16<46:52,  6.14s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0751


 80%|████████  | 1847/2304 [6:06:23<48:57,  6.43s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.1274


 80%|████████  | 1848/2304 [6:06:30<50:45,  6.68s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0749


 80%|████████  | 1849/2304 [6:06:35<47:06,  6.21s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0783


 80%|████████  | 1850/2304 [6:06:40<43:57,  5.81s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0814


 80%|████████  | 1851/2304 [6:06:45<42:16,  5.60s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0791


 80%|████████  | 1852/2304 [6:06:52<45:57,  6.10s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0775


 80%|████████  | 1853/2304 [6:06:59<48:10,  6.41s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0674


 80%|████████  | 1854/2304 [6:07:07<50:43,  6.76s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0661


 81%|████████  | 1855/2304 [6:07:12<46:38,  6.23s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0724


 81%|████████  | 1856/2304 [6:07:17<43:34,  5.84s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0817


 81%|████████  | 1857/2304 [6:07:22<41:51,  5.62s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1075


 81%|████████  | 1858/2304 [6:07:29<45:30,  6.12s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0646


 81%|████████  | 1859/2304 [6:07:37<48:07,  6.49s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0660


 81%|████████  | 1860/2304 [6:07:44<49:56,  6.75s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0801


 81%|████████  | 1861/2304 [6:07:49<45:50,  6.21s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0945


 81%|████████  | 1862/2304 [6:07:54<43:16,  5.87s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0781


 81%|████████  | 1863/2304 [6:07:59<41:04,  5.59s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0862


 81%|████████  | 1864/2304 [6:08:06<44:35,  6.08s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0860


 81%|████████  | 1865/2304 [6:08:14<47:04,  6.43s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0697


 81%|████████  | 1866/2304 [6:08:21<48:44,  6.68s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0727


 81%|████████  | 1867/2304 [6:08:26<45:24,  6.23s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0739


 81%|████████  | 1868/2304 [6:08:31<42:28,  5.84s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0786


 81%|████████  | 1869/2304 [6:08:36<40:32,  5.59s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0829


 81%|████████  | 1870/2304 [6:08:44<44:50,  6.20s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0840


 81%|████████  | 1871/2304 [6:08:51<47:06,  6.53s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0677


 81%|████████▏ | 1872/2304 [6:08:58<49:03,  6.81s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1137


 81%|████████▏ | 1873/2304 [6:09:03<45:07,  6.28s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0996


 81%|████████▏ | 1874/2304 [6:09:08<41:55,  5.85s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0769


 81%|████████▏ | 1875/2304 [6:09:13<40:28,  5.66s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0851


 81%|████████▏ | 1876/2304 [6:09:21<43:43,  6.13s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0948


 81%|████████▏ | 1877/2304 [6:09:28<45:55,  6.45s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0686


 82%|████████▏ | 1878/2304 [6:09:35<48:00,  6.76s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0744


 82%|████████▏ | 1879/2304 [6:09:40<44:04,  6.22s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1177


 82%|████████▏ | 1880/2304 [6:09:45<41:19,  5.85s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0731


 82%|████████▏ | 1881/2304 [6:09:50<39:21,  5.58s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0795


 82%|████████▏ | 1882/2304 [6:09:57<42:41,  6.07s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0648


 82%|████████▏ | 1883/2304 [6:10:05<45:19,  6.46s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0674


 82%|████████▏ | 1884/2304 [6:10:12<47:10,  6.74s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1180


 82%|████████▏ | 1885/2304 [6:10:17<43:25,  6.22s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0817


 82%|████████▏ | 1886/2304 [6:10:22<40:52,  5.87s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0864


 82%|████████▏ | 1887/2304 [6:10:27<38:58,  5.61s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0867


 82%|████████▏ | 1888/2304 [6:10:35<42:40,  6.16s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0874


 82%|████████▏ | 1889/2304 [6:10:42<44:45,  6.47s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0723


 82%|████████▏ | 1890/2304 [6:10:49<46:20,  6.72s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0928


 82%|████████▏ | 1891/2304 [6:10:54<42:55,  6.24s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0752


 82%|████████▏ | 1892/2304 [6:10:59<40:02,  5.83s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0739


 82%|████████▏ | 1893/2304 [6:11:04<38:48,  5.67s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1122


 82%|████████▏ | 1894/2304 [6:11:12<42:09,  6.17s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0758


 82%|████████▏ | 1895/2304 [6:11:19<44:08,  6.48s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0766


 82%|████████▏ | 1896/2304 [6:11:27<46:18,  6.81s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1304


 82%|████████▏ | 1897/2304 [6:11:32<42:30,  6.27s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0708


 82%|████████▏ | 1898/2304 [6:11:36<39:37,  5.86s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0790


 82%|████████▏ | 1899/2304 [6:11:42<37:59,  5.63s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0980


 82%|████████▏ | 1900/2304 [6:11:49<41:22,  6.14s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0752


 83%|████████▎ | 1901/2304 [6:11:56<43:42,  6.51s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0660


 83%|████████▎ | 1902/2304 [6:12:04<45:19,  6.76s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0787


 83%|████████▎ | 1903/2304 [6:12:09<41:34,  6.22s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0950


 83%|████████▎ | 1904/2304 [6:12:14<39:10,  5.88s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0741


 83%|████████▎ | 1905/2304 [6:12:19<37:21,  5.62s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0881


 83%|████████▎ | 1906/2304 [6:12:26<40:51,  6.16s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0648


 83%|████████▎ | 1907/2304 [6:12:34<43:36,  6.59s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0656


 83%|████████▎ | 1908/2304 [6:12:41<45:25,  6.88s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0863


 83%|████████▎ | 1909/2304 [6:12:46<41:52,  6.36s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0975


 83%|████████▎ | 1910/2304 [6:12:51<38:57,  5.93s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0799


 83%|████████▎ | 1911/2304 [6:12:56<36:53,  5.63s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0745


 83%|████████▎ | 1912/2304 [6:13:04<40:34,  6.21s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1432


 83%|████████▎ | 1913/2304 [6:13:11<42:21,  6.50s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0705


 83%|████████▎ | 1914/2304 [6:13:18<44:06,  6.79s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1462


 83%|████████▎ | 1915/2304 [6:13:23<40:26,  6.24s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0838


 83%|████████▎ | 1916/2304 [6:13:28<38:01,  5.88s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0818


 83%|████████▎ | 1917/2304 [6:13:34<36:33,  5.67s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0814


 83%|████████▎ | 1918/2304 [6:13:41<39:51,  6.20s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0749


 83%|████████▎ | 1919/2304 [6:13:48<41:57,  6.54s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0668


 83%|████████▎ | 1920/2304 [6:13:56<43:43,  6.83s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1288


 83%|████████▎ | 1921/2304 [6:14:01<40:11,  6.30s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0796


 83%|████████▎ | 1922/2304 [6:14:06<37:44,  5.93s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0672


 83%|████████▎ | 1923/2304 [6:14:11<36:00,  5.67s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1420


 84%|████████▎ | 1924/2304 [6:14:18<39:06,  6.18s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0767


 84%|████████▎ | 1925/2304 [6:14:26<41:14,  6.53s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0671


 84%|████████▎ | 1926/2304 [6:14:33<42:45,  6.79s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0765


 84%|████████▎ | 1927/2304 [6:14:38<39:40,  6.32s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0976


 84%|████████▎ | 1928/2304 [6:14:43<37:07,  5.92s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0711


 84%|████████▎ | 1929/2304 [6:14:48<35:18,  5.65s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1173


 84%|████████▍ | 1930/2304 [6:14:56<38:36,  6.19s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0670


 84%|████████▍ | 1931/2304 [6:15:03<40:17,  6.48s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0671


 84%|████████▍ | 1932/2304 [6:15:10<41:46,  6.74s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0885


 84%|████████▍ | 1933/2304 [6:15:16<38:47,  6.27s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1068


 84%|████████▍ | 1934/2304 [6:15:20<36:04,  5.85s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0807


 84%|████████▍ | 1935/2304 [6:15:26<34:44,  5.65s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0956


 84%|████████▍ | 1936/2304 [6:15:33<37:45,  6.16s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0660


 84%|████████▍ | 1937/2304 [6:15:40<39:29,  6.46s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0774


 84%|████████▍ | 1938/2304 [6:15:47<41:05,  6.74s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0873


 84%|████████▍ | 1939/2304 [6:15:52<37:49,  6.22s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0937


 84%|████████▍ | 1940/2304 [6:15:57<35:17,  5.82s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0789


 84%|████████▍ | 1941/2304 [6:16:03<33:58,  5.62s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0905


 84%|████████▍ | 1942/2304 [6:16:10<36:56,  6.12s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0985


 84%|████████▍ | 1943/2304 [6:16:17<39:13,  6.52s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0710


 84%|████████▍ | 1944/2304 [6:16:25<40:40,  6.78s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0971


 84%|████████▍ | 1945/2304 [6:16:30<37:23,  6.25s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0722


 84%|████████▍ | 1946/2304 [6:16:35<35:08,  5.89s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0939


 85%|████████▍ | 1947/2304 [6:16:40<33:28,  5.63s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0847


 85%|████████▍ | 1948/2304 [6:16:47<36:18,  6.12s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0661


 85%|████████▍ | 1949/2304 [6:16:54<38:31,  6.51s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0673


 85%|████████▍ | 1950/2304 [6:17:02<39:53,  6.76s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0852


 85%|████████▍ | 1951/2304 [6:17:07<37:00,  6.29s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0825


 85%|████████▍ | 1952/2304 [6:17:12<34:31,  5.89s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0771


 85%|████████▍ | 1953/2304 [6:17:17<32:56,  5.63s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0811


 85%|████████▍ | 1954/2304 [6:17:24<36:02,  6.18s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0651


 85%|████████▍ | 1955/2304 [6:17:32<37:54,  6.52s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0671


 85%|████████▍ | 1956/2304 [6:17:39<39:27,  6.80s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0662


 85%|████████▍ | 1957/2304 [6:17:44<36:10,  6.26s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0822


 85%|████████▍ | 1958/2304 [6:17:49<33:37,  5.83s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0753


 85%|████████▌ | 1959/2304 [6:17:54<32:16,  5.61s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1085


 85%|████████▌ | 1960/2304 [6:18:01<34:58,  6.10s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0935


 85%|████████▌ | 1961/2304 [6:18:09<36:43,  6.42s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0819


 85%|████████▌ | 1962/2304 [6:18:16<38:24,  6.74s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0700


 85%|████████▌ | 1963/2304 [6:18:21<35:16,  6.21s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0986


 85%|████████▌ | 1964/2304 [6:18:26<33:09,  5.85s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0830


 85%|████████▌ | 1965/2304 [6:18:31<31:34,  5.59s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0843


 85%|████████▌ | 1966/2304 [6:18:38<34:18,  6.09s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0711


 85%|████████▌ | 1967/2304 [6:18:46<36:23,  6.48s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0734


 85%|████████▌ | 1968/2304 [6:18:53<37:40,  6.73s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0805


 85%|████████▌ | 1969/2304 [6:18:58<34:43,  6.22s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0738


 86%|████████▌ | 1970/2304 [6:19:03<32:40,  5.87s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0881


 86%|████████▌ | 1971/2304 [6:19:08<31:01,  5.59s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0795


 86%|████████▌ | 1972/2304 [6:19:15<33:54,  6.13s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.1000


 86%|████████▌ | 1973/2304 [6:19:22<35:24,  6.42s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0680


 86%|████████▌ | 1974/2304 [6:19:30<36:40,  6.67s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0919


 86%|████████▌ | 1975/2304 [6:19:35<34:03,  6.21s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0970


 86%|████████▌ | 1976/2304 [6:19:40<31:48,  5.82s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0675


 86%|████████▌ | 1977/2304 [6:19:45<30:32,  5.60s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0649


 86%|████████▌ | 1978/2304 [6:19:52<33:02,  6.08s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0659


 86%|████████▌ | 1979/2304 [6:19:59<34:32,  6.38s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0674


 86%|████████▌ | 1980/2304 [6:20:06<36:08,  6.69s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0661


 86%|████████▌ | 1981/2304 [6:20:11<33:11,  6.16s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0840


 86%|████████▌ | 1982/2304 [6:20:16<30:55,  5.76s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0793


 86%|████████▌ | 1983/2304 [6:20:21<29:44,  5.56s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1006


 86%|████████▌ | 1984/2304 [6:20:29<32:16,  6.05s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0813


 86%|████████▌ | 1985/2304 [6:20:36<34:07,  6.42s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0688


 86%|████████▌ | 1986/2304 [6:20:43<35:32,  6.71s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0768


 86%|████████▌ | 1987/2304 [6:20:48<32:36,  6.17s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0804


 86%|████████▋ | 1988/2304 [6:20:53<30:42,  5.83s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0761


 86%|████████▋ | 1989/2304 [6:20:58<29:12,  5.56s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0777


 86%|████████▋ | 1990/2304 [6:21:05<31:43,  6.06s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0827


 86%|████████▋ | 1991/2304 [6:21:13<33:28,  6.42s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0742


 86%|████████▋ | 1992/2304 [6:21:20<34:43,  6.68s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0843


 87%|████████▋ | 1993/2304 [6:21:25<32:05,  6.19s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0947


 87%|████████▋ | 1994/2304 [6:21:30<29:54,  5.79s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0699


 87%|████████▋ | 1995/2304 [6:21:35<28:37,  5.56s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0742


 87%|████████▋ | 1996/2304 [6:21:42<31:26,  6.13s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0654


 87%|████████▋ | 1997/2304 [6:21:49<32:50,  6.42s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0676


 87%|████████▋ | 1998/2304 [6:21:57<34:19,  6.73s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0850


 87%|████████▋ | 1999/2304 [6:22:02<31:30,  6.20s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.1054


 87%|████████▋ | 2000/2304 [6:22:07<29:25,  5.81s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0649


 87%|████████▋ | 2001/2304 [6:22:12<28:13,  5.59s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.1019


 87%|████████▋ | 2002/2304 [6:22:19<30:40,  6.09s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0655


 87%|████████▋ | 2003/2304 [6:22:26<31:58,  6.37s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0673


 87%|████████▋ | 2004/2304 [6:22:33<33:28,  6.70s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0666


 87%|████████▋ | 2005/2304 [6:22:38<30:41,  6.16s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0783


 87%|████████▋ | 2006/2304 [6:22:43<28:49,  5.80s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0744


 87%|████████▋ | 2007/2304 [6:22:48<27:26,  5.54s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0997


 87%|████████▋ | 2008/2304 [6:22:55<29:47,  6.04s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0644


 87%|████████▋ | 2009/2304 [6:23:03<31:33,  6.42s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0696


 87%|████████▋ | 2010/2304 [6:23:10<32:37,  6.66s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0850


 87%|████████▋ | 2011/2304 [6:23:15<30:03,  6.16s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0725


 87%|████████▋ | 2012/2304 [6:23:20<28:21,  5.83s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0857


 87%|████████▋ | 2013/2304 [6:23:25<27:01,  5.57s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0829


 87%|████████▋ | 2014/2304 [6:23:32<29:24,  6.09s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0660


 87%|████████▋ | 2015/2304 [6:23:39<30:47,  6.39s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0700


 88%|████████▊ | 2016/2304 [6:23:47<31:59,  6.67s/it]

Config: {'activation': 'gelu', 'batch_size': 256, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1015


 88%|████████▊ | 2017/2304 [6:23:50<27:00,  5.65s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0726


 88%|████████▊ | 2018/2304 [6:23:53<23:19,  4.89s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0695


 88%|████████▊ | 2019/2304 [6:23:56<20:54,  4.40s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0737


 88%|████████▊ | 2020/2304 [6:24:01<20:52,  4.41s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0693


 88%|████████▊ | 2021/2304 [6:24:05<20:47,  4.41s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0659


 88%|████████▊ | 2022/2304 [6:24:09<20:33,  4.37s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0720


 88%|████████▊ | 2023/2304 [6:24:13<18:56,  4.04s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0722


 88%|████████▊ | 2024/2304 [6:24:16<17:26,  3.74s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0686


 88%|████████▊ | 2025/2304 [6:24:19<16:37,  3.58s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0689


 88%|████████▊ | 2026/2304 [6:24:23<17:41,  3.82s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0676


 88%|████████▊ | 2027/2304 [6:24:28<18:19,  3.97s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0722


 88%|████████▊ | 2028/2304 [6:24:32<18:44,  4.07s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0740


 88%|████████▊ | 2029/2304 [6:24:35<17:38,  3.85s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0672


 88%|████████▊ | 2030/2304 [6:24:38<16:36,  3.64s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0688


 88%|████████▊ | 2031/2304 [6:24:42<16:07,  3.54s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0685


 88%|████████▊ | 2032/2304 [6:24:46<17:18,  3.82s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0655


 88%|████████▊ | 2033/2304 [6:24:51<17:58,  3.98s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0689


 88%|████████▊ | 2034/2304 [6:24:55<18:23,  4.09s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0682


 88%|████████▊ | 2035/2304 [6:24:58<17:10,  3.83s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0677


 88%|████████▊ | 2036/2304 [6:25:01<16:03,  3.59s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0666


 88%|████████▊ | 2037/2304 [6:25:04<15:32,  3.49s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0721


 88%|████████▊ | 2038/2304 [6:25:09<16:40,  3.76s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0710


 88%|████████▊ | 2039/2304 [6:25:13<17:09,  3.88s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0715


 89%|████████▊ | 2040/2304 [6:25:18<17:56,  4.08s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0666


 89%|████████▊ | 2041/2304 [6:25:21<16:45,  3.82s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0685


 89%|████████▊ | 2042/2304 [6:25:24<15:38,  3.58s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0717


 89%|████████▊ | 2043/2304 [6:25:27<15:04,  3.47s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0722


 89%|████████▊ | 2044/2304 [6:25:32<16:21,  3.78s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0784


 89%|████████▉ | 2045/2304 [6:25:36<16:53,  3.91s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0733


 89%|████████▉ | 2046/2304 [6:25:40<17:39,  4.11s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0669


 89%|████████▉ | 2047/2304 [6:25:44<16:28,  3.85s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0684


 89%|████████▉ | 2048/2304 [6:25:47<15:23,  3.61s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0683


 89%|████████▉ | 2049/2304 [6:25:50<14:51,  3.50s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0690


 89%|████████▉ | 2050/2304 [6:25:54<16:05,  3.80s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0700


 89%|████████▉ | 2051/2304 [6:25:59<16:38,  3.95s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0677


 89%|████████▉ | 2052/2304 [6:26:03<17:13,  4.10s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0693


 89%|████████▉ | 2053/2304 [6:26:06<16:12,  3.87s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0742


 89%|████████▉ | 2054/2304 [6:26:10<15:51,  3.81s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0699


 89%|████████▉ | 2055/2304 [6:26:14<15:53,  3.83s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0723


 89%|████████▉ | 2056/2304 [6:26:19<17:38,  4.27s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0680


 89%|████████▉ | 2057/2304 [6:26:24<17:37,  4.28s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0721


 89%|████████▉ | 2058/2304 [6:26:28<17:48,  4.34s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0768


 89%|████████▉ | 2059/2304 [6:26:31<16:27,  4.03s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0706


 89%|████████▉ | 2060/2304 [6:26:34<15:17,  3.76s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0688


 89%|████████▉ | 2061/2304 [6:26:38<14:35,  3.60s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0662


 89%|████████▉ | 2062/2304 [6:26:42<15:34,  3.86s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0726


 90%|████████▉ | 2063/2304 [6:26:46<16:01,  3.99s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0690


 90%|████████▉ | 2064/2304 [6:26:51<16:31,  4.13s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0708


 90%|████████▉ | 2065/2304 [6:26:54<15:22,  3.86s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0697


 90%|████████▉ | 2066/2304 [6:26:57<14:15,  3.59s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0777


 90%|████████▉ | 2067/2304 [6:27:00<13:44,  3.48s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0675


 90%|████████▉ | 2068/2304 [6:27:05<14:38,  3.72s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0704


 90%|████████▉ | 2069/2304 [6:27:09<15:21,  3.92s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0649


 90%|████████▉ | 2070/2304 [6:27:13<15:53,  4.07s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0701


 90%|████████▉ | 2071/2304 [6:27:17<14:56,  3.85s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0808


 90%|████████▉ | 2072/2304 [6:27:20<13:54,  3.60s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0718


 90%|████████▉ | 2073/2304 [6:27:23<13:27,  3.50s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0711


 90%|█████████ | 2074/2304 [6:27:27<14:18,  3.73s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0729


 90%|█████████ | 2075/2304 [6:27:32<15:02,  3.94s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0696


 90%|█████████ | 2076/2304 [6:27:36<15:35,  4.10s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0700


 90%|█████████ | 2077/2304 [6:27:39<14:23,  3.80s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0690


 90%|█████████ | 2078/2304 [6:27:43<13:44,  3.65s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0704


 90%|█████████ | 2079/2304 [6:27:46<13:10,  3.51s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0714


 90%|█████████ | 2080/2304 [6:27:50<13:58,  3.75s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0652


 90%|█████████ | 2081/2304 [6:27:55<14:40,  3.95s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0754


 90%|█████████ | 2082/2304 [6:27:59<15:08,  4.09s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0674


 90%|█████████ | 2083/2304 [6:28:02<14:02,  3.81s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0714


 90%|█████████ | 2084/2304 [6:28:06<13:54,  3.80s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0741


 90%|█████████ | 2085/2304 [6:28:10<13:41,  3.75s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0691


 91%|█████████ | 2086/2304 [6:28:15<15:05,  4.15s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0703


 91%|█████████ | 2087/2304 [6:28:20<15:53,  4.39s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0706


 91%|█████████ | 2088/2304 [6:28:24<16:04,  4.47s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0678


 91%|█████████ | 2089/2304 [6:28:27<14:37,  4.08s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0734


 91%|█████████ | 2090/2304 [6:28:31<13:39,  3.83s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0751


 91%|█████████ | 2091/2304 [6:28:34<12:48,  3.61s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0719


 91%|█████████ | 2092/2304 [6:28:38<13:39,  3.87s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0701


 91%|█████████ | 2093/2304 [6:28:43<14:10,  4.03s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0734


 91%|█████████ | 2094/2304 [6:28:47<14:34,  4.17s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0774


 91%|█████████ | 2095/2304 [6:28:50<13:21,  3.84s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0716


 91%|█████████ | 2096/2304 [6:28:53<12:41,  3.66s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0694


 91%|█████████ | 2097/2304 [6:28:57<12:03,  3.49s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0767


 91%|█████████ | 2098/2304 [6:29:01<12:53,  3.76s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0681


 91%|█████████ | 2099/2304 [6:29:05<13:29,  3.95s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0733


 91%|█████████ | 2100/2304 [6:29:10<13:58,  4.11s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0720


 91%|█████████ | 2101/2304 [6:29:13<12:56,  3.82s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0663


 91%|█████████ | 2102/2304 [6:29:16<12:14,  3.64s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0721


 91%|█████████▏| 2103/2304 [6:29:19<11:39,  3.48s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0697


 91%|█████████▏| 2104/2304 [6:29:24<12:41,  3.81s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0769


 91%|█████████▏| 2105/2304 [6:29:28<13:08,  3.96s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0717


 91%|█████████▏| 2106/2304 [6:29:33<13:36,  4.13s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0687


 91%|█████████▏| 2107/2304 [6:29:36<12:36,  3.84s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0686


 91%|█████████▏| 2108/2304 [6:29:40<12:30,  3.83s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0731


 92%|█████████▏| 2109/2304 [6:29:43<12:21,  3.80s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0702


 92%|█████████▏| 2110/2304 [6:29:49<13:42,  4.24s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0706


 92%|█████████▏| 2111/2304 [6:29:54<14:26,  4.49s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0714


 92%|█████████▏| 2112/2304 [6:29:58<14:30,  4.54s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 32, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0702


 92%|█████████▏| 2113/2304 [6:30:01<13:05,  4.11s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0779


 92%|█████████▏| 2114/2304 [6:30:05<12:07,  3.83s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0692


 92%|█████████▏| 2115/2304 [6:30:08<11:24,  3.62s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0795


 92%|█████████▏| 2116/2304 [6:30:12<12:05,  3.86s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0685


 92%|█████████▏| 2117/2304 [6:30:17<12:30,  4.01s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0709


 92%|█████████▏| 2118/2304 [6:30:21<12:48,  4.13s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0694


 92%|█████████▏| 2119/2304 [6:30:24<11:43,  3.80s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0825


 92%|█████████▏| 2120/2304 [6:30:27<11:07,  3.63s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0686


 92%|█████████▏| 2121/2304 [6:30:30<10:43,  3.52s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0715


 92%|█████████▏| 2122/2304 [6:30:35<11:28,  3.78s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0911


 92%|█████████▏| 2123/2304 [6:30:39<11:56,  3.96s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0659


 92%|█████████▏| 2124/2304 [6:30:44<12:24,  4.13s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0746


 92%|█████████▏| 2125/2304 [6:30:47<11:21,  3.81s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0673


 92%|█████████▏| 2126/2304 [6:30:50<10:47,  3.64s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0693


 92%|█████████▏| 2127/2304 [6:30:53<10:16,  3.48s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0710


 92%|█████████▏| 2128/2304 [6:30:58<11:07,  3.79s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0678


 92%|█████████▏| 2129/2304 [6:31:02<11:32,  3.96s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0706


 92%|█████████▏| 2130/2304 [6:31:06<11:53,  4.10s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0692


 92%|█████████▏| 2131/2304 [6:31:10<10:55,  3.79s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0716


 93%|█████████▎| 2132/2304 [6:31:13<10:17,  3.59s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0700


 93%|█████████▎| 2133/2304 [6:31:16<09:44,  3.42s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0706


 93%|█████████▎| 2134/2304 [6:31:20<10:32,  3.72s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0669


 93%|█████████▎| 2135/2304 [6:31:25<11:02,  3.92s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0696


 93%|█████████▎| 2136/2304 [6:31:29<11:28,  4.10s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0725


 93%|█████████▎| 2137/2304 [6:31:32<10:35,  3.81s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0712


 93%|█████████▎| 2138/2304 [6:31:35<09:59,  3.61s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0793


 93%|█████████▎| 2139/2304 [6:31:38<09:28,  3.45s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0724


 93%|█████████▎| 2140/2304 [6:31:43<10:14,  3.75s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0775


 93%|█████████▎| 2141/2304 [6:31:47<10:41,  3.93s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0654


 93%|█████████▎| 2142/2304 [6:31:52<11:05,  4.11s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0686


 93%|█████████▎| 2143/2304 [6:31:55<10:13,  3.81s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0765


 93%|█████████▎| 2144/2304 [6:31:58<09:39,  3.62s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0738


 93%|█████████▎| 2145/2304 [6:32:01<09:12,  3.48s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0748


 93%|█████████▎| 2146/2304 [6:32:06<10:01,  3.81s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0737


 93%|█████████▎| 2147/2304 [6:32:11<10:57,  4.19s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0703


 93%|█████████▎| 2148/2304 [6:32:16<11:33,  4.45s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0704


 93%|█████████▎| 2149/2304 [6:32:20<11:01,  4.27s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0682


 93%|█████████▎| 2150/2304 [6:32:23<10:31,  4.10s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0682


 93%|█████████▎| 2151/2304 [6:32:27<09:47,  3.84s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0688


 93%|█████████▎| 2152/2304 [6:32:31<10:14,  4.04s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0691


 93%|█████████▎| 2153/2304 [6:32:36<10:26,  4.15s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0711


 93%|█████████▎| 2154/2304 [6:32:40<10:27,  4.18s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0716


 94%|█████████▎| 2155/2304 [6:32:43<09:41,  3.90s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0676


 94%|█████████▎| 2156/2304 [6:32:46<09:04,  3.68s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0704


 94%|█████████▎| 2157/2304 [6:32:49<08:34,  3.50s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0723


 94%|█████████▎| 2158/2304 [6:32:54<09:11,  3.78s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0741


 94%|█████████▎| 2159/2304 [6:32:58<09:33,  3.96s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0713


 94%|█████████▍| 2160/2304 [6:33:03<09:50,  4.10s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.1186


 94%|█████████▍| 2161/2304 [6:33:06<09:03,  3.80s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0807


 94%|█████████▍| 2162/2304 [6:33:09<08:34,  3.62s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0774


 94%|█████████▍| 2163/2304 [6:33:12<08:06,  3.45s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0782


 94%|█████████▍| 2164/2304 [6:33:16<08:43,  3.74s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0715


 94%|█████████▍| 2165/2304 [6:33:21<09:05,  3.92s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0665


 94%|█████████▍| 2166/2304 [6:33:25<09:21,  4.07s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0849


 94%|█████████▍| 2167/2304 [6:33:28<08:34,  3.76s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0756


 94%|█████████▍| 2168/2304 [6:33:31<08:09,  3.60s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0790


 94%|█████████▍| 2169/2304 [6:33:34<07:42,  3.43s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0761


 94%|█████████▍| 2170/2304 [6:33:39<08:17,  3.71s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0923


 94%|█████████▍| 2171/2304 [6:33:43<08:39,  3.91s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0667


 94%|█████████▍| 2172/2304 [6:33:47<08:54,  4.05s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0813


 94%|█████████▍| 2173/2304 [6:33:51<08:10,  3.75s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0749


 94%|█████████▍| 2174/2304 [6:33:54<07:45,  3.58s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0728


 94%|█████████▍| 2175/2304 [6:33:57<07:21,  3.42s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0721


 94%|█████████▍| 2176/2304 [6:34:01<07:57,  3.73s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0771


 94%|█████████▍| 2177/2304 [6:34:06<08:17,  3.92s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0712


 95%|█████████▍| 2178/2304 [6:34:10<08:34,  4.09s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0715


 95%|█████████▍| 2179/2304 [6:34:13<07:51,  3.78s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0723


 95%|█████████▍| 2180/2304 [6:34:16<07:23,  3.58s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0699


 95%|█████████▍| 2181/2304 [6:34:19<07:05,  3.46s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0667


 95%|█████████▍| 2182/2304 [6:34:24<07:37,  3.75s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0711


 95%|█████████▍| 2183/2304 [6:34:28<07:54,  3.92s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0734


 95%|█████████▍| 2184/2304 [6:34:32<08:04,  4.04s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0683


 95%|█████████▍| 2185/2304 [6:34:36<07:35,  3.83s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0768


 95%|█████████▍| 2186/2304 [6:34:39<07:10,  3.65s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0715


 95%|█████████▍| 2187/2304 [6:34:42<06:47,  3.48s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0868


 95%|█████████▍| 2188/2304 [6:34:47<07:18,  3.78s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0730


 95%|█████████▌| 2189/2304 [6:34:51<07:34,  3.95s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0667


 95%|█████████▌| 2190/2304 [6:34:55<07:42,  4.06s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0885


 95%|█████████▌| 2191/2304 [6:34:58<07:11,  3.82s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0772


 95%|█████████▌| 2192/2304 [6:35:02<06:45,  3.62s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0743


 95%|█████████▌| 2193/2304 [6:35:05<06:22,  3.44s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0732


 95%|█████████▌| 2194/2304 [6:35:09<06:51,  3.74s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0716


 95%|█████████▌| 2195/2304 [6:35:13<07:06,  3.91s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0684


 95%|█████████▌| 2196/2304 [6:35:18<07:14,  4.02s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0871


 95%|█████████▌| 2197/2304 [6:35:21<06:44,  3.78s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0758


 95%|█████████▌| 2198/2304 [6:35:24<06:21,  3.60s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0795


 95%|█████████▌| 2199/2304 [6:35:27<06:00,  3.43s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0741


 95%|█████████▌| 2200/2304 [6:35:32<06:27,  3.72s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0769


 96%|█████████▌| 2201/2304 [6:35:36<06:43,  3.92s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0729


 96%|█████████▌| 2202/2304 [6:35:40<06:51,  4.03s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0775


 96%|█████████▌| 2203/2304 [6:35:43<06:23,  3.80s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0773


 96%|█████████▌| 2204/2304 [6:35:46<05:55,  3.56s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0751


 96%|█████████▌| 2205/2304 [6:35:50<05:41,  3.45s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0728


 96%|█████████▌| 2206/2304 [6:35:54<06:05,  3.73s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0892


 96%|█████████▌| 2207/2304 [6:35:58<06:19,  3.92s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0735


 96%|█████████▌| 2208/2304 [6:36:03<06:26,  4.03s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 64, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0738


 96%|█████████▌| 2209/2304 [6:36:06<06:01,  3.80s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0814


 96%|█████████▌| 2210/2304 [6:36:09<05:36,  3.57s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0711


 96%|█████████▌| 2211/2304 [6:36:12<05:21,  3.46s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0755


 96%|█████████▌| 2212/2304 [6:36:17<05:43,  3.73s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0713


 96%|█████████▌| 2213/2304 [6:36:21<05:55,  3.91s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0673


 96%|█████████▌| 2214/2304 [6:36:25<05:59,  4.00s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0798


 96%|█████████▌| 2215/2304 [6:36:28<05:34,  3.75s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0925


 96%|█████████▌| 2216/2304 [6:36:31<05:15,  3.59s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0666


 96%|█████████▌| 2217/2304 [6:36:35<04:59,  3.44s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0701


 96%|█████████▋| 2218/2304 [6:36:39<05:21,  3.74s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0741


 96%|█████████▋| 2219/2304 [6:36:43<05:35,  3.94s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0672


 96%|█████████▋| 2220/2304 [6:36:48<05:41,  4.06s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0698


 96%|█████████▋| 2221/2304 [6:36:51<05:15,  3.81s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0780


 96%|█████████▋| 2222/2304 [6:36:54<04:58,  3.64s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0665


 96%|█████████▋| 2223/2304 [6:36:57<04:40,  3.46s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0829


 97%|█████████▋| 2224/2304 [6:37:02<05:00,  3.75s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0684


 97%|█████████▋| 2225/2304 [6:37:06<05:11,  3.95s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0713


 97%|█████████▋| 2226/2304 [6:37:10<05:17,  4.07s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0880


 97%|█████████▋| 2227/2304 [6:37:14<04:54,  3.82s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0709


 97%|█████████▋| 2228/2304 [6:37:17<04:35,  3.62s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0747


 97%|█████████▋| 2229/2304 [6:37:20<04:20,  3.47s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0702


 97%|█████████▋| 2230/2304 [6:37:24<04:37,  3.75s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0663


 97%|█████████▋| 2231/2304 [6:37:29<04:46,  3.92s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0749


 97%|█████████▋| 2232/2304 [6:37:33<04:49,  4.03s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0680


 97%|█████████▋| 2233/2304 [6:37:36<04:28,  3.78s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0766


 97%|█████████▋| 2234/2304 [6:37:39<04:11,  3.60s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0677


 97%|█████████▋| 2235/2304 [6:37:42<03:56,  3.43s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0732


 97%|█████████▋| 2236/2304 [6:37:47<04:14,  3.74s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0819


 97%|█████████▋| 2237/2304 [6:37:51<04:23,  3.93s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0740


 97%|█████████▋| 2238/2304 [6:37:56<04:28,  4.07s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0692


 97%|█████████▋| 2239/2304 [6:37:59<04:08,  3.82s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0782


 97%|█████████▋| 2240/2304 [6:38:02<03:52,  3.63s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0743


 97%|█████████▋| 2241/2304 [6:38:05<03:38,  3.47s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0724


 97%|█████████▋| 2242/2304 [6:38:10<03:52,  3.74s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0827


 97%|█████████▋| 2243/2304 [6:38:14<04:00,  3.95s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0672


 97%|█████████▋| 2244/2304 [6:38:18<04:02,  4.04s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0711


 97%|█████████▋| 2245/2304 [6:38:21<03:43,  3.79s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0756


 97%|█████████▋| 2246/2304 [6:38:25<03:29,  3.62s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0712


 98%|█████████▊| 2247/2304 [6:38:28<03:16,  3.45s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0794


 98%|█████████▊| 2248/2304 [6:38:32<03:29,  3.74s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0933


 98%|█████████▊| 2249/2304 [6:38:36<03:34,  3.91s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0755


 98%|█████████▊| 2250/2304 [6:38:41<03:36,  4.02s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0694


 98%|█████████▊| 2251/2304 [6:38:44<03:20,  3.79s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0720


 98%|█████████▊| 2252/2304 [6:38:47<03:07,  3.61s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0723


 98%|█████████▊| 2253/2304 [6:38:50<02:55,  3.44s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0897


 98%|█████████▊| 2254/2304 [6:38:55<03:06,  3.74s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0701


 98%|█████████▊| 2255/2304 [6:38:59<03:13,  3.94s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0871


 98%|█████████▊| 2256/2304 [6:39:03<03:14,  4.05s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0703


 98%|█████████▊| 2257/2304 [6:39:07<02:58,  3.81s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0861


 98%|█████████▊| 2258/2304 [6:39:10<02:44,  3.58s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0724


 98%|█████████▊| 2259/2304 [6:39:13<02:36,  3.48s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0861


 98%|█████████▊| 2260/2304 [6:39:17<02:45,  3.75s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0786


 98%|█████████▊| 2261/2304 [6:39:22<02:49,  3.93s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0682


 98%|█████████▊| 2262/2304 [6:39:26<02:49,  4.04s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0702


 98%|█████████▊| 2263/2304 [6:39:29<02:35,  3.79s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0820


 98%|█████████▊| 2264/2304 [6:39:32<02:22,  3.56s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0710


 98%|█████████▊| 2265/2304 [6:39:35<02:15,  3.46s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0879


 98%|█████████▊| 2266/2304 [6:39:40<02:22,  3.76s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0809


 98%|█████████▊| 2267/2304 [6:39:44<02:25,  3.93s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0683


 98%|█████████▊| 2268/2304 [6:39:49<02:27,  4.09s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0724


 98%|█████████▊| 2269/2304 [6:39:52<02:13,  3.83s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0742


 99%|█████████▊| 2270/2304 [6:39:55<02:02,  3.59s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0728


 99%|█████████▊| 2271/2304 [6:39:58<01:56,  3.53s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0695


 99%|█████████▊| 2272/2304 [6:40:03<02:02,  3.83s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0752


 99%|█████████▊| 2273/2304 [6:40:07<02:02,  3.96s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0718


 99%|█████████▊| 2274/2304 [6:40:12<02:03,  4.13s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0760


 99%|█████████▊| 2275/2304 [6:40:15<01:51,  3.85s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0918


 99%|█████████▉| 2276/2304 [6:40:18<01:40,  3.60s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0738


 99%|█████████▉| 2277/2304 [6:40:21<01:34,  3.50s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0735


 99%|█████████▉| 2278/2304 [6:40:25<01:38,  3.77s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0723


 99%|█████████▉| 2279/2304 [6:40:30<01:37,  3.89s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0714


 99%|█████████▉| 2280/2304 [6:40:34<01:37,  4.06s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 3, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0835


 99%|█████████▉| 2281/2304 [6:40:37<01:27,  3.81s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0681


 99%|█████████▉| 2282/2304 [6:40:40<01:18,  3.59s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0722


 99%|█████████▉| 2283/2304 [6:40:44<01:13,  3.49s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0717


 99%|█████████▉| 2284/2304 [6:40:48<01:15,  3.78s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0735


 99%|█████████▉| 2285/2304 [6:40:52<01:14,  3.91s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0686


 99%|█████████▉| 2286/2304 [6:40:57<01:13,  4.09s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0652


 99%|█████████▉| 2287/2304 [6:41:00<01:05,  3.83s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0727


 99%|█████████▉| 2288/2304 [6:41:03<00:57,  3.60s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0760


 99%|█████████▉| 2289/2304 [6:41:06<00:52,  3.49s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0794


 99%|█████████▉| 2290/2304 [6:41:11<00:52,  3.77s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0719


 99%|█████████▉| 2291/2304 [6:41:15<00:50,  3.90s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0701


 99%|█████████▉| 2292/2304 [6:41:19<00:48,  4.06s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0674


100%|█████████▉| 2293/2304 [6:41:23<00:41,  3.82s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0879


100%|█████████▉| 2294/2304 [6:41:26<00:35,  3.58s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0695


100%|█████████▉| 2295/2304 [6:41:29<00:31,  3.46s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0726


100%|█████████▉| 2296/2304 [6:41:33<00:30,  3.76s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0697


100%|█████████▉| 2297/2304 [6:41:37<00:27,  3.89s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0666


100%|█████████▉| 2298/2304 [6:41:42<00:24,  4.06s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 2, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0727


100%|█████████▉| 2299/2304 [6:41:45<00:19,  3.82s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'Adam'}, Loss: 0.0767


100%|█████████▉| 2300/2304 [6:41:48<00:14,  3.59s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'RMSprop'}, Loss: 0.0831


100%|█████████▉| 2301/2304 [6:41:51<00:10,  3.48s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 2, 'optimizer': 'AdamW'}, Loss: 0.0719


100%|█████████▉| 2302/2304 [6:41:56<00:07,  3.79s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'Adam'}, Loss: 0.0754


100%|█████████▉| 2303/2304 [6:42:00<00:03,  3.90s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}, Loss: 0.0663


100%|██████████| 2304/2304 [6:42:05<00:00, 10.47s/it]

Config: {'activation': 'gelu', 'batch_size': 512, 'd_model': 128, 'dim_feedforward': 512, 'kernel_size': 5, 'lr': 0.0005, 'nhead': 4, 'num_layers': 4, 'optimizer': 'AdamW'}, Loss: 0.0712

✅ Best Config: {'activation': 'relu', 'batch_size': 64, 'd_model': 128, 'dim_feedforward': 256, 'kernel_size': 5, 'lr': 0.001, 'nhead': 4, 'num_layers': 4, 'optimizer': 'RMSprop'}
✅ Best Loss: 0.0638





##### 2.3 Test

In [None]:
def evaluate_model(model, test_loader, device='cuda'):
    model.eval()
    model.to(device)

    preds, targets = [], []
    with torch.no_grad():
        for xb, yb in test_loader:
            xb, yb = xb.to(device), yb.to(device)
            pred = model(xb)
            preds.append(pred)
            targets.append(yb)

    # (B*T, 8, 8) 텐서 형태로 합치기
    preds_tensor = torch.cat(preds, dim=0)
    targets_tensor = torch.cat(targets, dim=0)

    return preds_tensor, targets_tensor

with open(f'{model_save_path}/best_model_window10per30_CT_config.json', 'r') as f:
    best_config = json.load(f)

best_model = CorrPredictorCNNTransformer(
    kernel_size=best_config['kernel_size'],
    d_model=best_config['d_model'],
    nhead=best_config['nhead'],
    num_layers=best_config['num_layers'],
    dim_feedforward=best_config['dim_feedforward'],
    activation=best_config['activation']
)
best_model.load_state_dict(torch.load(f"{model_save_path}/best_model_window10per30_CT_weights.pth"))

test_loader = DataLoader(test_ds, batch_size=best_config['batch_size'], shuffle=False)
preds_tensor, targets_tensor = evaluate_model(best_model, test_loader, device=device)

# 저장
torch.save({
    'preds': preds_tensor,
    'targets': targets_tensor
}, f"{model_save_path}/best_model_window10per30_CT_result.pt")

In [None]:
# Performance metrics

preds_flat = preds_tensor.view(preds_tensor.size(0), -1).cpu().numpy()
targets_flat = targets_tensor.view(targets_tensor.size(0), -1).cpu().numpy()

mse = mean_squared_error(targets_flat, preds_flat)
mae = mean_absolute_error(targets_flat, preds_flat)
rmse = np.sqrt(mse)

# frobenius_loss
cos_sim = cosine_similarity(targets_flat, preds_flat)
mean_cos_sim = np.diag(cos_sim).mean()

# frobenius_loss
diff = preds_tensor - targets_tensor
frobenius_per_sample = torch.norm(diff, p='fro', dim=(1, 2))
mean_frobenius = frobenius_per_sample.mean().item()

print(f"\n📊 Evaluation Results:")
print(f"MSE               : {mse:.5f}")
print(f"MAE               : {mae:.5f}")
print(f"RMSE              : {rmse:.5f}")
print(f"Cosine Similarity : {mean_cos_sim:.5f}")
print(f"Frobenius Norm    : {mean_frobenius:.5f}")


📊 Evaluation Results:
MSE               : 0.07216
MAE               : 0.18775
RMSE              : 0.26862
Cosine Similarity : 0.93739
Frobenius Norm    : 1.91125
