In [1]:
import optuna
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import random

In [2]:
# CUDA
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE.type

'cuda'

In [3]:
# Fix random seed
seed = 42
torch.manual_seed(seed)
if DEVICE.type == 'cuda':
	torch.cuda.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Pre-processing input data

In [4]:
def bits_to_MiB(row):
	# verify if has string ' MiB'
	if 'MiB' in str(row):
		row = row.replace(' MiB', '')
		row = float(row)
	else:
		row = float(row) / np.power(2, 20)
	return row


def MHz_to_GHz(row):
	# verify if has string ' GHz'
	if 'GHz' in str(row):
		row = row.replace(' GHz', '')
		# convert to float
		row = float(row)
	else:
		row = row.replace(' MHz', '')
		row = float(row) / 1000
	return row

In [5]:
results_df = pd.read_csv('../results/execution_time.csv')
results_savio_df = pd.read_csv('../results_savio/execution_time.csv')
results_df = pd.concat([results_df, results_savio_df], ignore_index=True)
# preprocessing
results_df['total_cpu_usage'] = results_df['total_cpu_usage'].str.replace('%', '').astype(float) / 100
results_df['max_ram_usage'] = results_df['max_ram_usage'] / 1024
results_df['l2_cache_size'] = results_df['l2_cache_size'].apply(bits_to_MiB)
results_df['ghz_actual_friendly'] = results_df['hz_actual_friendly'].apply(MHz_to_GHz)
results_df['ghz_advertised_friendly'] = results_df['hz_advertised_friendly'].str.replace('GHz', '').astype(float)
results_df = results_df.drop(columns=['hz_actual_friendly', 'hz_advertised_friendly', 'arch', 'vendor_id_raw'])

In [6]:
# remove one computer for testing
g_train = results_df[results_df['brand_raw'] != '13th Gen Intel(R) Core(TM) i5-1335U'].drop(columns=['benchmark','brand_raw'])
g_test = results_df[results_df['brand_raw'] == '13th Gen Intel(R) Core(TM) i5-1335U'].drop(columns=['benchmark','brand_raw'])

In [7]:
mm_df = results_df[results_df['benchmark']=='MATRIX_MULT'].drop(columns=['benchmark'])
# remove one computer for testing
mm_train = mm_df[mm_df['brand_raw'] != '13th Gen Intel(R) Core(TM) i5-1335U'].drop(columns=['brand_raw'])
mm_test = mm_df[mm_df['brand_raw'] == '13th Gen Intel(R) Core(TM) i5-1335U'].drop(columns=['brand_raw'])

In [8]:
st_df = results_df[results_df['benchmark']!='MATRIX_MULT'].drop(columns=['benchmark'])
# remove one computer for testing
st_train = st_df[st_df['brand_raw'] != '13th Gen Intel(R) Core(TM) i5-1335U'].drop(columns=['brand_raw'])
st_test = st_df[st_df['brand_raw'] == '13th Gen Intel(R) Core(TM) i5-1335U'].drop(columns=['brand_raw'])

In [9]:
features = mm_test.columns[1:]
target = mm_test.columns[0]

In [10]:
# general data
## split data
X_g_train = g_train[features]
y_g_train = g_train[target]

X_g_test = g_test[features]
y_g_test = g_test[target]

## normalize data
scaler = StandardScaler()
X_g_train = scaler.fit_transform(X_g_train)
X_g_test = scaler.transform(X_g_test)

## convert to tensor
X_g_train = torch.tensor(X_g_train, dtype=torch.float32).unsqueeze(1)
X_g_test = torch.tensor(X_g_test, dtype=torch.float32).unsqueeze(1)
y_g_train = torch.tensor(y_g_train.values, dtype=torch.float32).view(-1, 1)
y_g_test = torch.tensor(y_g_test.values, dtype=torch.float32).view(-1, 1)

In [11]:
# single thread data
## split data
X_st_train = st_train[features]
y_st_train = st_train[target]

X_st_test = st_test[features]
y_st_test = st_test[target]

## normalize data
scaler = StandardScaler()
X_st_train = scaler.fit_transform(X_st_train)
X_st_test = scaler.transform(X_st_test)

## convert to tensor
X_st_train = torch.tensor(X_st_train, dtype=torch.float32).unsqueeze(1)
X_st_test = torch.tensor(X_st_test, dtype=torch.float32).unsqueeze(1)
y_st_train = torch.tensor(y_st_train.values, dtype=torch.float32).view(-1, 1)
y_st_test = torch.tensor(y_st_test.values, dtype=torch.float32).view(-1, 1)

In [12]:
# multi thread data
## split data
X_mm_train = st_train[features]
y_mm_train = st_train[target]

X_mm_test = st_test[features]
y_mm_test = st_test[target]

## normalize data
scaler = StandardScaler()
X_mm_train = scaler.fit_transform(X_mm_train)
X_mm_test = scaler.transform(X_mm_test)

## convert to tensor
X_mm_train = torch.tensor(X_mm_train, dtype=torch.float32).unsqueeze(1)
X_mm_test = torch.tensor(X_mm_test, dtype=torch.float32).unsqueeze(1)
y_mm_train = torch.tensor(y_mm_train.values, dtype=torch.float32).view(-1, 1)
y_mm_test = torch.tensor(y_mm_test.values, dtype=torch.float32).view(-1, 1)

if DEVICE.type == 'cuda':
	# move to DEVICE
	X_g_train = X_g_train.to(DEVICE)
	y_g_train = y_g_train.to(DEVICE)
	X_g_test = X_g_test.to(DEVICE)
	y_g_test = y_g_test.to(DEVICE)

	X_st_train = X_st_train.to(DEVICE)
	y_st_train = y_st_train.to(DEVICE)
	X_st_test = X_st_test.to(DEVICE)
	y_st_test = y_st_test.to(DEVICE)

	X_mm_train = X_mm_train.to(DEVICE)
	y_mm_train = y_mm_train.to(DEVICE)
	X_mm_test = X_mm_test.to(DEVICE)
	y_mm_test = y_mm_test.to(DEVICE)

# Model

In [13]:
class TransformerModel(nn.Module):
	def __init__(self, input_dim, model_dim, num_heads, num_layers, output_dim, dropout=0.1):
		super(TransformerModel, self).__init__()
		# layers
		self.embedding = nn.Linear(input_dim, model_dim)
		encoder_layer = nn.TransformerEncoderLayer(d_model=model_dim, nhead=num_heads, batch_first=True)
		self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
		self.fc = nn.Linear(model_dim, output_dim)
		self.dropout = nn.Dropout(dropout)
	
	def forward(self, x):
		x = self.embedding(x)
		x = self.dropout(x)
		x = self.transformer(x)
		x = self.fc(x.mean(dim=1))
		return x

In [14]:
def objective(trial: optuna.Trial, X_train, y_train, X_test, y_test, input_dim, output_dim):
	# Definimos los hiperparámetros a buscar
	num_heads = trial.suggest_int('num_heads', 1, 8)
	model_dim = trial.suggest_int('model_dim', num_heads * 4, num_heads * 64, step=num_heads)
	num_layers = trial.suggest_int('num_layers', 1, 6)
	dropout = trial.suggest_float('dropout', 0.1, 0.5)
	learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True)
	weight_decay = trial.suggest_float('weight_decay', 1e-5, 1e-2, log=True)
	num_epochs = trial.suggest_int('num_epochs', 10, 100)

	# model initialization 
	model = TransformerModel(input_dim, model_dim, num_heads, num_layers, output_dim, dropout)
	if DEVICE.type == 'cuda':
		model = model.to(DEVICE)
	criterion = nn.MSELoss()
	optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
	# training
	model.train()
	for epoch in range(num_epochs):
		optimizer.zero_grad()
		output = model(X_train)
		loss = criterion(output, y_train)
		loss.backward()
		optimizer.step()
	# evaluation
	model.eval()
	with torch.no_grad():
		predictions = model(X_test)
		val_loss = criterion(predictions, y_test)

		# trial.report(val_loss.item(), epoch+1)
		# if trial.should_prune():
		# 	raise optuna.TrialPruned()
	print(f"Trial: {trial.number} - Loss: {loss.item()} - Val Loss: {val_loss.item()}")
	return val_loss.item()

# Hyperparameters Optimization

In [15]:
n_trials = 100

## General

In [16]:
# configuration optuna
study_g = optuna.create_study(direction='minimize')
study_g.optimize(lambda trial: objective(trial, X_g_train, y_g_train, X_g_test, y_g_test, len(features), len(target)), n_trials=n_trials)

[I 2024-06-20 01:22:53,010] A new study created in memory with name: no-name-b9e37e17-8f00-40f5-b00a-982e98fe202f
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
[I 2024-06-20 01:22:53,852] Trial 0 finished with value: 382.2214660644531 and parameters: {'num_heads': 6, 'model_dim': 366, 'num_layers': 2, 'dropout': 0.2436427146319615, 'learning_rate': 0.0005434352201447981, 'weight_decay': 0.0001711173563561157, 'num_epochs': 44}. Best is trial 0 with value: 382.2214660644531.


Trial: 0 - Loss: 116.48963928222656 - Val Loss: 382.2214660644531


[I 2024-06-20 01:22:54,277] Trial 1 finished with value: 346.6871032714844 and parameters: {'num_heads': 1, 'model_dim': 40, 'num_layers': 3, 'dropout': 0.21070024518744457, 'learning_rate': 0.0023484676838076667, 'weight_decay': 0.00012671613027483546, 'num_epochs': 84}. Best is trial 1 with value: 346.6871032714844.


Trial: 1 - Loss: 97.26270294189453 - Val Loss: 346.6871032714844


[I 2024-06-20 01:22:54,521] Trial 2 finished with value: 677.4373779296875 and parameters: {'num_heads': 4, 'model_dim': 236, 'num_layers': 2, 'dropout': 0.2059426149895794, 'learning_rate': 0.00013121736594080206, 'weight_decay': 0.003127410642160959, 'num_epochs': 56}. Best is trial 1 with value: 346.6871032714844.
[I 2024-06-20 01:22:54,710] Trial 3 finished with value: 712.9075317382812 and parameters: {'num_heads': 6, 'model_dim': 264, 'num_layers': 2, 'dropout': 0.16539281254007077, 'learning_rate': 0.00011141973874125393, 'weight_decay': 2.3163982904876242e-05, 'num_epochs': 34}. Best is trial 1 with value: 346.6871032714844.


Trial: 2 - Loss: 270.51531982421875 - Val Loss: 677.4373779296875
Trial: 3 - Loss: 291.061279296875 - Val Loss: 712.9075317382812


[I 2024-06-20 01:22:55,042] Trial 4 finished with value: 668.4429321289062 and parameters: {'num_heads': 7, 'model_dim': 441, 'num_layers': 2, 'dropout': 0.2783689247764861, 'learning_rate': 5.871055820930133e-05, 'weight_decay': 0.0050477928561349274, 'num_epochs': 41}. Best is trial 1 with value: 346.6871032714844.
[I 2024-06-20 01:22:55,169] Trial 5 finished with value: 846.0787353515625 and parameters: {'num_heads': 5, 'model_dim': 155, 'num_layers': 3, 'dropout': 0.4337607569074007, 'learning_rate': 2.389518825613386e-05, 'weight_decay': 0.0004407702669435401, 'num_epochs': 19}. Best is trial 1 with value: 346.6871032714844.


Trial: 4 - Loss: 264.83489990234375 - Val Loss: 668.4429321289062
Trial: 5 - Loss: 370.369384765625 - Val Loss: 846.0787353515625


[I 2024-06-20 01:22:55,651] Trial 6 finished with value: 144.23526000976562 and parameters: {'num_heads': 8, 'model_dim': 192, 'num_layers': 3, 'dropout': 0.24201800817037752, 'learning_rate': 0.003507485885537309, 'weight_decay': 0.002933467775265353, 'num_epochs': 61}. Best is trial 6 with value: 144.23526000976562.


Trial: 6 - Loss: 46.50959396362305 - Val Loss: 144.23526000976562


[I 2024-06-20 01:22:56,378] Trial 7 finished with value: 652.7785034179688 and parameters: {'num_heads': 7, 'model_dim': 252, 'num_layers': 3, 'dropout': 0.4694437329987222, 'learning_rate': 9.878950202709404e-05, 'weight_decay': 0.0007710148711054703, 'num_epochs': 90}. Best is trial 6 with value: 144.23526000976562.
[I 2024-06-20 01:22:56,546] Trial 8 finished with value: 833.9682006835938 and parameters: {'num_heads': 1, 'model_dim': 15, 'num_layers': 1, 'dropout': 0.11469312435163298, 'learning_rate': 0.0006261790103204518, 'weight_decay': 4.4684173909434455e-05, 'num_epochs': 73}. Best is trial 6 with value: 144.23526000976562.


Trial: 7 - Loss: 255.1532745361328 - Val Loss: 652.7785034179688
Trial: 8 - Loss: 364.9342346191406 - Val Loss: 833.9682006835938


[I 2024-06-20 01:22:56,935] Trial 9 finished with value: 127.44278717041016 and parameters: {'num_heads': 5, 'model_dim': 320, 'num_layers': 6, 'dropout': 0.30556118701978763, 'learning_rate': 0.005127987355304439, 'weight_decay': 0.0028048543251298877, 'num_epochs': 23}. Best is trial 9 with value: 127.44278717041016.
[I 2024-06-20 01:22:57,098] Trial 10 finished with value: 396.8204650878906 and parameters: {'num_heads': 3, 'model_dim': 78, 'num_layers': 6, 'dropout': 0.3631514416091684, 'learning_rate': 0.009155140121025591, 'weight_decay': 0.000926830211552893, 'num_epochs': 13}. Best is trial 9 with value: 127.44278717041016.


Trial: 9 - Loss: 51.32682418823242 - Val Loss: 127.44278717041016
Trial: 10 - Loss: 139.64601135253906 - Val Loss: 396.8204650878906


[I 2024-06-20 01:22:57,719] Trial 11 finished with value: 139.70346069335938 and parameters: {'num_heads': 3, 'model_dim': 129, 'num_layers': 6, 'dropout': 0.34106702508366854, 'learning_rate': 0.00875108686493953, 'weight_decay': 0.009370760557594605, 'num_epochs': 63}. Best is trial 9 with value: 127.44278717041016.


Trial: 11 - Loss: 46.67820739746094 - Val Loss: 139.70346069335938


[I 2024-06-20 01:22:58,012] Trial 12 finished with value: 89.84608459472656 and parameters: {'num_heads': 3, 'model_dim': 99, 'num_layers': 6, 'dropout': 0.35180216011737625, 'learning_rate': 0.009485675348989793, 'weight_decay': 0.00907179485236277, 'num_epochs': 27}. Best is trial 12 with value: 89.84608459472656.


Trial: 12 - Loss: 59.850921630859375 - Val Loss: 89.84608459472656


[I 2024-06-20 01:22:58,286] Trial 13 finished with value: 587.6739501953125 and parameters: {'num_heads': 3, 'model_dim': 81, 'num_layers': 5, 'dropout': 0.3917120051222738, 'learning_rate': 0.002028250820899622, 'weight_decay': 0.0016834732273117005, 'num_epochs': 28}. Best is trial 12 with value: 89.84608459472656.


Trial: 13 - Loss: 222.51686096191406 - Val Loss: 587.6739501953125


[I 2024-06-20 01:22:58,526] Trial 14 finished with value: 681.9595947265625 and parameters: {'num_heads': 4, 'model_dim': 108, 'num_layers': 5, 'dropout': 0.31391755122842463, 'learning_rate': 0.0010067181059187618, 'weight_decay': 0.008507460283687922, 'num_epochs': 25}. Best is trial 12 with value: 89.84608459472656.
[I 2024-06-20 01:22:58,651] Trial 15 finished with value: 708.5595703125 and parameters: {'num_heads': 2, 'model_dim': 50, 'num_layers': 5, 'dropout': 0.3938816430397712, 'learning_rate': 0.004706231636998248, 'weight_decay': 0.001681578791644038, 'num_epochs': 11}. Best is trial 12 with value: 89.84608459472656.


Trial: 14 - Loss: 275.30718994140625 - Val Loss: 681.9595947265625
Trial: 15 - Loss: 297.164794921875 - Val Loss: 708.5595703125


[I 2024-06-20 01:22:59,234] Trial 16 finished with value: 296.49822998046875 and parameters: {'num_heads': 5, 'model_dim': 195, 'num_layers': 6, 'dropout': 0.49628170527169124, 'learning_rate': 0.001388548623423311, 'weight_decay': 0.00421340283171166, 'num_epochs': 43}. Best is trial 12 with value: 89.84608459472656.


Trial: 16 - Loss: 81.08552551269531 - Val Loss: 296.49822998046875


[I 2024-06-20 01:22:59,465] Trial 17 finished with value: 581.573974609375 and parameters: {'num_heads': 2, 'model_dim': 30, 'num_layers': 4, 'dropout': 0.2858131029575635, 'learning_rate': 0.005024856728777846, 'weight_decay': 0.009367447608454033, 'num_epochs': 29}. Best is trial 12 with value: 89.84608459472656.


Trial: 17 - Loss: 222.49642944335938 - Val Loss: 581.573974609375


[I 2024-06-20 01:22:59,849] Trial 18 finished with value: 658.3374633789062 and parameters: {'num_heads': 4, 'model_dim': 160, 'num_layers': 4, 'dropout': 0.32708527543503285, 'learning_rate': 0.0003045530509468868, 'weight_decay': 0.0017176547146728702, 'num_epochs': 48}. Best is trial 12 with value: 89.84608459472656.


Trial: 18 - Loss: 258.9593200683594 - Val Loss: 658.3374633789062


[I 2024-06-20 01:23:00,164] Trial 19 finished with value: 680.8934936523438 and parameters: {'num_heads': 6, 'model_dim': 294, 'num_layers': 5, 'dropout': 0.4157955483920545, 'learning_rate': 0.00029884165822210464, 'weight_decay': 0.000310197712508431, 'num_epochs': 20}. Best is trial 12 with value: 89.84608459472656.


Trial: 19 - Loss: 273.32611083984375 - Val Loss: 680.8934936523438


[I 2024-06-20 01:23:00,543] Trial 20 finished with value: 853.978271484375 and parameters: {'num_heads': 2, 'model_dim': 66, 'num_layers': 6, 'dropout': 0.36072988830346114, 'learning_rate': 1.4241299774196948e-05, 'weight_decay': 0.000845272335797247, 'num_epochs': 37}. Best is trial 12 with value: 89.84608459472656.


Trial: 20 - Loss: 374.3424987792969 - Val Loss: 853.978271484375


[I 2024-06-20 01:23:01,210] Trial 21 finished with value: 141.52268981933594 and parameters: {'num_heads': 3, 'model_dim': 105, 'num_layers': 6, 'dropout': 0.3426511133771928, 'learning_rate': 0.008680705924140015, 'weight_decay': 0.008850674402045325, 'num_epochs': 67}. Best is trial 12 with value: 89.84608459472656.


Trial: 21 - Loss: 47.21173858642578 - Val Loss: 141.52268981933594


[I 2024-06-20 01:23:02,166] Trial 22 finished with value: 142.47239685058594 and parameters: {'num_heads': 3, 'model_dim': 129, 'num_layers': 6, 'dropout': 0.29364077307222675, 'learning_rate': 0.009315852292395085, 'weight_decay': 0.005262461932388368, 'num_epochs': 100}. Best is trial 12 with value: 89.84608459472656.


Trial: 22 - Loss: 47.160011291503906 - Val Loss: 142.47239685058594


[I 2024-06-20 01:23:02,696] Trial 23 finished with value: 154.204833984375 and parameters: {'num_heads': 5, 'model_dim': 145, 'num_layers': 5, 'dropout': 0.373238344204779, 'learning_rate': 0.00442527761225344, 'weight_decay': 0.0026466443669197442, 'num_epochs': 50}. Best is trial 12 with value: 89.84608459472656.


Trial: 23 - Loss: 47.274322509765625 - Val Loss: 154.204833984375


[I 2024-06-20 01:23:03,451] Trial 24 finished with value: 156.89434814453125 and parameters: {'num_heads': 4, 'model_dim': 112, 'num_layers': 6, 'dropout': 0.2572641949600063, 'learning_rate': 0.0026490172109101748, 'weight_decay': 0.005935184176687782, 'num_epochs': 76}. Best is trial 12 with value: 89.84608459472656.


Trial: 24 - Loss: 46.65596389770508 - Val Loss: 156.89434814453125


[I 2024-06-20 01:23:03,856] Trial 25 finished with value: 143.1973876953125 and parameters: {'num_heads': 2, 'model_dim': 62, 'num_layers': 4, 'dropout': 0.3263887702628615, 'learning_rate': 0.005992524768176793, 'weight_decay': 0.00962026769023481, 'num_epochs': 58}. Best is trial 12 with value: 89.84608459472656.


Trial: 25 - Loss: 47.010494232177734 - Val Loss: 143.1973876953125


[I 2024-06-20 01:23:04,401] Trial 26 finished with value: 369.79205322265625 and parameters: {'num_heads': 3, 'model_dim': 90, 'num_layers': 5, 'dropout': 0.4088019843250828, 'learning_rate': 0.0014351606809896694, 'weight_decay': 0.0022564538355451156, 'num_epochs': 66}. Best is trial 12 with value: 89.84608459472656.


Trial: 26 - Loss: 108.38531494140625 - Val Loss: 369.79205322265625


[I 2024-06-20 01:23:04,675] Trial 27 finished with value: 118.2438735961914 and parameters: {'num_heads': 5, 'model_dim': 175, 'num_layers': 6, 'dropout': 0.44573675246702527, 'learning_rate': 0.006451630768430231, 'weight_decay': 1.1369422220406084e-05, 'num_epochs': 19}. Best is trial 12 with value: 89.84608459472656.


Trial: 27 - Loss: 47.60029220581055 - Val Loss: 118.2438735961914


[I 2024-06-20 01:23:04,913] Trial 28 finished with value: 666.2779541015625 and parameters: {'num_heads': 5, 'model_dim': 190, 'num_layers': 5, 'dropout': 0.44960782856389625, 'learning_rate': 0.0007792582199061189, 'weight_decay': 5.548611921660877e-05, 'num_epochs': 18}. Best is trial 12 with value: 89.84608459472656.


Trial: 28 - Loss: 267.1192626953125 - Val Loss: 666.2779541015625


[I 2024-06-20 01:23:05,428] Trial 29 finished with value: 119.0051498413086 and parameters: {'num_heads': 6, 'model_dim': 366, 'num_layers': 6, 'dropout': 0.4919620080884355, 'learning_rate': 0.0030533976335566132, 'weight_decay': 1.3603731457907405e-05, 'num_epochs': 24}. Best is trial 12 with value: 89.84608459472656.


Trial: 29 - Loss: 51.00620651245117 - Val Loss: 119.0051498413086


[I 2024-06-20 01:23:05,958] Trial 30 finished with value: 146.95462036132812 and parameters: {'num_heads': 7, 'model_dim': 385, 'num_layers': 4, 'dropout': 0.48898742193415784, 'learning_rate': 0.0029914354191437928, 'weight_decay': 1.2169758731013782e-05, 'num_epochs': 33}. Best is trial 12 with value: 89.84608459472656.


Trial: 30 - Loss: 46.80778884887695 - Val Loss: 146.95462036132812


[I 2024-06-20 01:23:06,431] Trial 31 finished with value: 156.01815795898438 and parameters: {'num_heads': 6, 'model_dim': 330, 'num_layers': 6, 'dropout': 0.4595022994270122, 'learning_rate': 0.00604375945506149, 'weight_decay': 1.4364829324784578e-05, 'num_epochs': 24}. Best is trial 12 with value: 89.84608459472656.


Trial: 31 - Loss: 49.944644927978516 - Val Loss: 156.01815795898438


[I 2024-06-20 01:23:06,716] Trial 32 finished with value: 449.8370666503906 and parameters: {'num_heads': 5, 'model_dim': 305, 'num_layers': 6, 'dropout': 0.4358827479648171, 'learning_rate': 0.0017235203260215532, 'weight_decay': 2.41107085963968e-05, 'num_epochs': 15}. Best is trial 12 with value: 89.84608459472656.


Trial: 32 - Loss: 157.1891632080078 - Val Loss: 449.8370666503906


[I 2024-06-20 01:23:06,950] Trial 33 finished with value: 356.88134765625 and parameters: {'num_heads': 6, 'model_dim': 348, 'num_layers': 6, 'dropout': 0.4730077282240165, 'learning_rate': 0.0031908645646947804, 'weight_decay': 5.428613914389718e-05, 'num_epochs': 10}. Best is trial 12 with value: 89.84608459472656.


Trial: 33 - Loss: 122.27954864501953 - Val Loss: 356.88134765625


[I 2024-06-20 01:23:07,264] Trial 34 finished with value: 92.53144073486328 and parameters: {'num_heads': 4, 'model_dim': 216, 'num_layers': 5, 'dropout': 0.4993285856318103, 'learning_rate': 0.006274518276431132, 'weight_decay': 0.00012813782967791368, 'num_epochs': 24}. Best is trial 12 with value: 89.84608459472656.


Trial: 34 - Loss: 60.8966178894043 - Val Loss: 92.53144073486328


[I 2024-06-20 01:23:07,582] Trial 35 finished with value: 155.70040893554688 and parameters: {'num_heads': 4, 'model_dim': 176, 'num_layers': 5, 'dropout': 0.48980611224224696, 'learning_rate': 0.00630428556791755, 'weight_decay': 0.000134815823395981, 'num_epochs': 31}. Best is trial 12 with value: 89.84608459472656.


Trial: 35 - Loss: 48.61876678466797 - Val Loss: 155.70040893554688


[I 2024-06-20 01:23:08,321] Trial 36 finished with value: 133.79452514648438 and parameters: {'num_heads': 7, 'model_dim': 399, 'num_layers': 5, 'dropout': 0.4459619407292268, 'learning_rate': 0.00218544973857031, 'weight_decay': 2.282609851890581e-05, 'num_epochs': 38}. Best is trial 12 with value: 89.84608459472656.
[I 2024-06-20 01:23:08,472] Trial 37 finished with value: 302.8251953125 and parameters: {'num_heads': 4, 'model_dim': 208, 'num_layers': 1, 'dropout': 0.4180056063329012, 'learning_rate': 0.0038303802775913297, 'weight_decay': 7.91585808501392e-05, 'num_epochs': 50}. Best is trial 12 with value: 89.84608459472656.


Trial: 36 - Loss: 47.05313491821289 - Val Loss: 133.79452514648438
Trial: 37 - Loss: 16.431171417236328 - Val Loss: 302.8251953125


[I 2024-06-20 01:23:08,782] Trial 38 finished with value: 690.6179809570312 and parameters: {'num_heads': 6, 'model_dim': 222, 'num_layers': 6, 'dropout': 0.4739810837250868, 'learning_rate': 0.0004866357949269841, 'weight_decay': 0.00024826490314750247, 'num_epochs': 18}. Best is trial 12 with value: 89.84608459472656.


Trial: 38 - Loss: 279.7718811035156 - Val Loss: 690.6179809570312


[I 2024-06-20 01:23:09,287] Trial 39 finished with value: 669.5341186523438 and parameters: {'num_heads': 6, 'model_dim': 276, 'num_layers': 5, 'dropout': 0.20185302276395733, 'learning_rate': 0.00019691074354299268, 'weight_decay': 3.1941496552191574e-05, 'num_epochs': 35}. Best is trial 12 with value: 89.84608459472656.


Trial: 39 - Loss: 265.55938720703125 - Val Loss: 669.5341186523438


[I 2024-06-20 01:23:09,521] Trial 40 finished with value: 556.6016845703125 and parameters: {'num_heads': 4, 'model_dim': 168, 'num_layers': 4, 'dropout': 0.4288839003103745, 'learning_rate': 0.0011160203836476292, 'weight_decay': 1.5548999745682105e-05, 'num_epochs': 27}. Best is trial 12 with value: 89.84608459472656.


Trial: 40 - Loss: 204.63330078125 - Val Loss: 556.6016845703125


[I 2024-06-20 01:23:09,861] Trial 41 finished with value: 98.05553436279297 and parameters: {'num_heads': 5, 'model_dim': 240, 'num_layers': 6, 'dropout': 0.2681437361804495, 'learning_rate': 0.006888827343127268, 'weight_decay': 1.0097725426931753e-05, 'num_epochs': 22}. Best is trial 12 with value: 89.84608459472656.


Trial: 41 - Loss: 60.00043487548828 - Val Loss: 98.05553436279297


[I 2024-06-20 01:23:10,249] Trial 42 finished with value: 87.11537170410156 and parameters: {'num_heads': 8, 'model_dim': 232, 'num_layers': 6, 'dropout': 0.25706160003021056, 'learning_rate': 0.006870505482736272, 'weight_decay': 1.0648603121045677e-05, 'num_epochs': 21}. Best is trial 42 with value: 87.11537170410156.


Trial: 42 - Loss: 63.906585693359375 - Val Loss: 87.11537170410156


[I 2024-06-20 01:23:10,560] Trial 43 finished with value: 88.7005844116211 and parameters: {'num_heads': 8, 'model_dim': 240, 'num_layers': 6, 'dropout': 0.22968934520520667, 'learning_rate': 0.006969784310833724, 'weight_decay': 1.9694652001923183e-05, 'num_epochs': 16}. Best is trial 42 with value: 87.11537170410156.


Trial: 43 - Loss: 53.976749420166016 - Val Loss: 88.7005844116211


[I 2024-06-20 01:23:10,857] Trial 44 finished with value: 83.18446350097656 and parameters: {'num_heads': 8, 'model_dim': 248, 'num_layers': 6, 'dropout': 0.2335146118112467, 'learning_rate': 0.007509929249371125, 'weight_decay': 1.0082972724121124e-05, 'num_epochs': 15}. Best is trial 44 with value: 83.18446350097656.


Trial: 44 - Loss: 57.421287536621094 - Val Loss: 83.18446350097656


[I 2024-06-20 01:23:11,127] Trial 45 finished with value: 770.6041870117188 and parameters: {'num_heads': 8, 'model_dim': 264, 'num_layers': 5, 'dropout': 0.21163563190113438, 'learning_rate': 5.7776073762417083e-05, 'weight_decay': 9.189030599175897e-05, 'num_epochs': 15}. Best is trial 44 with value: 83.18446350097656.


Trial: 45 - Loss: 327.36883544921875 - Val Loss: 770.6041870117188


[I 2024-06-20 01:23:11,416] Trial 46 finished with value: 71.9397201538086 and parameters: {'num_heads': 8, 'model_dim': 224, 'num_layers': 6, 'dropout': 0.2320146403887001, 'learning_rate': 0.009731200416128362, 'weight_decay': 1.8538654992265227e-05, 'num_epochs': 14}. Best is trial 46 with value: 71.9397201538086.


Trial: 46 - Loss: 70.12247467041016 - Val Loss: 71.9397201538086


[I 2024-06-20 01:23:11,701] Trial 47 finished with value: 73.51943969726562 and parameters: {'num_heads': 8, 'model_dim': 248, 'num_layers': 6, 'dropout': 0.2320151418703012, 'learning_rate': 0.009013311232107968, 'weight_decay': 2.304396563168003e-05, 'num_epochs': 14}. Best is trial 46 with value: 71.9397201538086.
[I 2024-06-20 01:23:11,818] Trial 48 finished with value: 301.9877014160156 and parameters: {'num_heads': 8, 'model_dim': 240, 'num_layers': 2, 'dropout': 0.22667277093908805, 'learning_rate': 0.0037741048009397424, 'weight_decay': 1.933389349969881e-05, 'num_epochs': 14}. Best is trial 46 with value: 71.9397201538086.


Trial: 47 - Loss: 68.08429718017578 - Val Loss: 73.51943969726562
Trial: 48 - Loss: 94.56229400634766 - Val Loss: 301.9877014160156


[I 2024-06-20 01:23:12,035] Trial 49 finished with value: 102.4902114868164 and parameters: {'num_heads': 8, 'model_dim': 256, 'num_layers': 6, 'dropout': 0.16657593900312925, 'learning_rate': 0.009750901383229621, 'weight_decay': 3.758859871530922e-05, 'num_epochs': 10}. Best is trial 46 with value: 71.9397201538086.


Trial: 49 - Loss: 48.010650634765625 - Val Loss: 102.4902114868164


[I 2024-06-20 01:23:12,486] Trial 50 finished with value: 79.85980987548828 and parameters: {'num_heads': 8, 'model_dim': 480, 'num_layers': 6, 'dropout': 0.18092235559298342, 'learning_rate': 0.0043461653311652505, 'weight_decay': 3.029982905005861e-05, 'num_epochs': 16}. Best is trial 46 with value: 71.9397201538086.


Trial: 50 - Loss: 65.95068359375 - Val Loss: 79.85980987548828


[I 2024-06-20 01:23:12,925] Trial 51 finished with value: 77.91217041015625 and parameters: {'num_heads': 8, 'model_dim': 504, 'num_layers': 6, 'dropout': 0.1811583631293731, 'learning_rate': 0.004480175604977052, 'weight_decay': 3.0071628315009285e-05, 'num_epochs': 15}. Best is trial 46 with value: 71.9397201538086.


Trial: 51 - Loss: 66.35456085205078 - Val Loss: 77.91217041015625


[I 2024-06-20 01:23:13,260] Trial 52 finished with value: 108.11568450927734 and parameters: {'num_heads': 7, 'model_dim': 441, 'num_layers': 6, 'dropout': 0.17323574742941536, 'learning_rate': 0.004609407013388938, 'weight_decay': 2.931185910303472e-05, 'num_epochs': 12}. Best is trial 46 with value: 71.9397201538086.


Trial: 52 - Loss: 47.45988082885742 - Val Loss: 108.11568450927734


[I 2024-06-20 01:23:13,845] Trial 53 finished with value: 117.0226058959961 and parameters: {'num_heads': 8, 'model_dim': 504, 'num_layers': 6, 'dropout': 0.12857357836600164, 'learning_rate': 0.008106957834632608, 'weight_decay': 4.396350111987341e-05, 'num_epochs': 21}. Best is trial 46 with value: 71.9397201538086.


Trial: 53 - Loss: 49.35973358154297 - Val Loss: 117.0226058959961


[I 2024-06-20 01:23:14,264] Trial 54 finished with value: 196.21429443359375 and parameters: {'num_heads': 7, 'model_dim': 448, 'num_layers': 6, 'dropout': 0.1903696233165338, 'learning_rate': 0.002444718775631109, 'weight_decay': 1.7174405088413187e-05, 'num_epochs': 16}. Best is trial 46 with value: 71.9397201538086.


Trial: 54 - Loss: 57.108848571777344 - Val Loss: 196.21429443359375


[I 2024-06-20 01:23:14,540] Trial 55 finished with value: 215.98565673828125 and parameters: {'num_heads': 7, 'model_dim': 406, 'num_layers': 6, 'dropout': 0.14772577801194467, 'learning_rate': 0.004224520238561217, 'weight_decay': 2.6925317730368044e-05, 'num_epochs': 10}. Best is trial 46 with value: 71.9397201538086.


Trial: 55 - Loss: 68.58917999267578 - Val Loss: 215.98565673828125


[I 2024-06-20 01:23:15,303] Trial 56 finished with value: 157.86663818359375 and parameters: {'num_heads': 8, 'model_dim': 480, 'num_layers': 6, 'dropout': 0.24660839196663065, 'learning_rate': 0.005063914261327443, 'weight_decay': 6.169656261950395e-05, 'num_epochs': 29}. Best is trial 46 with value: 71.9397201538086.


Trial: 56 - Loss: 47.14401626586914 - Val Loss: 157.86663818359375


[I 2024-06-20 01:23:15,696] Trial 57 finished with value: 168.83160400390625 and parameters: {'num_heads': 8, 'model_dim': 280, 'num_layers': 5, 'dropout': 0.18765227875467835, 'learning_rate': 0.00753517081630478, 'weight_decay': 1.0086625690441312e-05, 'num_epochs': 21}. Best is trial 46 with value: 71.9397201538086.


Trial: 57 - Loss: 47.546119689941406 - Val Loss: 168.83160400390625


[I 2024-06-20 01:23:16,082] Trial 58 finished with value: 128.0482940673828 and parameters: {'num_heads': 8, 'model_dim': 488, 'num_layers': 6, 'dropout': 0.14765734730154068, 'learning_rate': 0.0035458782864617295, 'weight_decay': 3.709367441152408e-05, 'num_epochs': 13}. Best is trial 46 with value: 71.9397201538086.


Trial: 58 - Loss: 47.086029052734375 - Val Loss: 128.0482940673828


[I 2024-06-20 01:23:16,288] Trial 59 finished with value: 366.9762878417969 and parameters: {'num_heads': 7, 'model_dim': 301, 'num_layers': 3, 'dropout': 0.22016254502453927, 'learning_rate': 0.0018346713895373695, 'weight_decay': 2.1154049825803756e-05, 'num_epochs': 18}. Best is trial 46 with value: 71.9397201538086.


Trial: 59 - Loss: 115.88435363769531 - Val Loss: 366.9762878417969


[I 2024-06-20 01:23:17,016] Trial 60 finished with value: 672.2609252929688 and parameters: {'num_heads': 8, 'model_dim': 512, 'num_layers': 6, 'dropout': 0.2727843606414705, 'learning_rate': 4.547495698123527e-05, 'weight_decay': 0.0006226250478289958, 'num_epochs': 27}. Best is trial 46 with value: 71.9397201538086.


Trial: 60 - Loss: 267.16424560546875 - Val Loss: 672.2609252929688


[I 2024-06-20 01:23:17,324] Trial 61 finished with value: 89.25370788574219 and parameters: {'num_heads': 8, 'model_dim': 232, 'num_layers': 6, 'dropout': 0.23443553522876814, 'learning_rate': 0.00998461679634406, 'weight_decay': 1.7561061596370928e-05, 'num_epochs': 16}. Best is trial 46 with value: 71.9397201538086.


Trial: 61 - Loss: 68.8281021118164 - Val Loss: 89.25370788574219


[I 2024-06-20 01:23:17,721] Trial 62 finished with value: 81.2283706665039 and parameters: {'num_heads': 8, 'model_dim': 464, 'num_layers': 6, 'dropout': 0.25705696787073173, 'learning_rate': 0.005566453255276593, 'weight_decay': 1.4100051484848994e-05, 'num_epochs': 14}. Best is trial 46 with value: 71.9397201538086.


Trial: 62 - Loss: 67.2727279663086 - Val Loss: 81.2283706665039


[I 2024-06-20 01:23:18,076] Trial 63 finished with value: 81.88780975341797 and parameters: {'num_heads': 7, 'model_dim': 427, 'num_layers': 6, 'dropout': 0.2510138767514275, 'learning_rate': 0.005324655212585489, 'weight_decay': 1.4910923338570192e-05, 'num_epochs': 13}. Best is trial 46 with value: 71.9397201538086.


Trial: 63 - Loss: 59.1094970703125 - Val Loss: 81.88780975341797


[I 2024-06-20 01:23:18,418] Trial 64 finished with value: 96.63201904296875 and parameters: {'num_heads': 7, 'model_dim': 420, 'num_layers': 6, 'dropout': 0.1949138257076128, 'learning_rate': 0.004852885424185084, 'weight_decay': 1.3878586182843137e-05, 'num_epochs': 13}. Best is trial 46 with value: 71.9397201538086.


Trial: 64 - Loss: 49.94083786010742 - Val Loss: 96.63201904296875


[I 2024-06-20 01:23:18,881] Trial 65 finished with value: 143.41590881347656 and parameters: {'num_heads': 7, 'model_dim': 434, 'num_layers': 6, 'dropout': 0.242948321370756, 'learning_rate': 0.005228849907832887, 'weight_decay': 2.8527773978070185e-05, 'num_epochs': 18}. Best is trial 46 with value: 71.9397201538086.


Trial: 65 - Loss: 48.4722785949707 - Val Loss: 143.41590881347656


[I 2024-06-20 01:23:20,702] Trial 66 finished with value: 141.6833953857422 and parameters: {'num_heads': 8, 'model_dim': 472, 'num_layers': 5, 'dropout': 0.21318199372233132, 'learning_rate': 0.0026991787150171674, 'weight_decay': 3.638237470122387e-05, 'num_epochs': 89}. Best is trial 46 with value: 71.9397201538086.


Trial: 66 - Loss: 46.90270233154297 - Val Loss: 141.6833953857422


[I 2024-06-20 01:23:21,075] Trial 67 finished with value: 171.0959930419922 and parameters: {'num_heads': 8, 'model_dim': 464, 'num_layers': 6, 'dropout': 0.29522078340473584, 'learning_rate': 0.00855466224759933, 'weight_decay': 1.5777216512166172e-05, 'num_epochs': 13}. Best is trial 46 with value: 71.9397201538086.


Trial: 67 - Loss: 48.088600158691406 - Val Loss: 171.0959930419922


[I 2024-06-20 01:23:21,667] Trial 68 finished with value: 134.72125244140625 and parameters: {'num_heads': 8, 'model_dim': 496, 'num_layers': 5, 'dropout': 0.258153657924856, 'learning_rate': 0.005327192421325443, 'weight_decay': 4.7341361676649485e-05, 'num_epochs': 26}. Best is trial 46 with value: 71.9397201538086.


Trial: 68 - Loss: 47.65730285644531 - Val Loss: 134.72125244140625


[I 2024-06-20 01:23:22,414] Trial 69 finished with value: 155.68043518066406 and parameters: {'num_heads': 7, 'model_dim': 427, 'num_layers': 6, 'dropout': 0.2782756688920612, 'learning_rate': 0.0035979419817234754, 'weight_decay': 7.112894991829357e-05, 'num_epochs': 31}. Best is trial 46 with value: 71.9397201538086.


Trial: 69 - Loss: 47.082862854003906 - Val Loss: 155.68043518066406


[I 2024-06-20 01:23:22,923] Trial 70 finished with value: 298.0950622558594 and parameters: {'num_heads': 8, 'model_dim': 456, 'num_layers': 6, 'dropout': 0.1508349279813247, 'learning_rate': 0.0014600045509456953, 'weight_decay': 2.4518430773879856e-05, 'num_epochs': 19}. Best is trial 46 with value: 71.9397201538086.


Trial: 70 - Loss: 86.78245544433594 - Val Loss: 298.0950622558594


[I 2024-06-20 01:23:23,307] Trial 71 finished with value: 85.22594451904297 and parameters: {'num_heads': 8, 'model_dim': 200, 'num_layers': 6, 'dropout': 0.25854274845056424, 'learning_rate': 0.007505994807634402, 'weight_decay': 1.1596603255790975e-05, 'num_epochs': 21}. Best is trial 46 with value: 71.9397201538086.


Trial: 71 - Loss: 63.8339958190918 - Val Loss: 85.22594451904297


[I 2024-06-20 01:23:23,529] Trial 72 finished with value: 191.57188415527344 and parameters: {'num_heads': 8, 'model_dim': 200, 'num_layers': 6, 'dropout': 0.17659603802005683, 'learning_rate': 0.008145034941082319, 'weight_decay': 1.2800285038993083e-05, 'num_epochs': 11}. Best is trial 46 with value: 71.9397201538086.


Trial: 72 - Loss: 60.96486282348633 - Val Loss: 191.57188415527344


[I 2024-06-20 01:23:23,936] Trial 73 finished with value: 77.83956909179688 and parameters: {'num_heads': 7, 'model_dim': 413, 'num_layers': 6, 'dropout': 0.3146631269097918, 'learning_rate': 0.005542413257227574, 'weight_decay': 2.074962134849982e-05, 'num_epochs': 16}. Best is trial 46 with value: 71.9397201538086.


Trial: 73 - Loss: 70.00279235839844 - Val Loss: 77.83956909179688


[I 2024-06-20 01:23:24,341] Trial 74 finished with value: 102.1731948852539 and parameters: {'num_heads': 7, 'model_dim': 420, 'num_layers': 6, 'dropout': 0.3137601599283763, 'learning_rate': 0.00562348777711105, 'weight_decay': 2.099871688581573e-05, 'num_epochs': 16}. Best is trial 46 with value: 71.9397201538086.


Trial: 74 - Loss: 58.848995208740234 - Val Loss: 102.1731948852539


[I 2024-06-20 01:23:24,888] Trial 75 finished with value: 167.9213104248047 and parameters: {'num_heads': 7, 'model_dim': 406, 'num_layers': 6, 'dropout': 0.28350148451673995, 'learning_rate': 0.004177339132000419, 'weight_decay': 1.7314283687497583e-05, 'num_epochs': 23}. Best is trial 46 with value: 71.9397201538086.


Trial: 75 - Loss: 47.77314376831055 - Val Loss: 167.9213104248047


[I 2024-06-20 01:23:25,161] Trial 76 finished with value: 238.10679626464844 and parameters: {'num_heads': 7, 'model_dim': 378, 'num_layers': 5, 'dropout': 0.23976617268314313, 'learning_rate': 0.0030449389976202664, 'weight_decay': 1.4027503426562655e-05, 'num_epochs': 13}. Best is trial 46 with value: 71.9397201538086.


Trial: 76 - Loss: 71.64600372314453 - Val Loss: 238.10679626464844


[I 2024-06-20 01:23:27,076] Trial 77 finished with value: 143.69224548339844 and parameters: {'num_heads': 8, 'model_dim': 480, 'num_layers': 6, 'dropout': 0.20307551223134143, 'learning_rate': 0.002074177972252937, 'weight_decay': 2.5120113284418318e-05, 'num_epochs': 75}. Best is trial 46 with value: 71.9397201538086.


Trial: 77 - Loss: 47.278717041015625 - Val Loss: 143.69224548339844


[I 2024-06-20 01:23:27,538] Trial 78 finished with value: 120.79502868652344 and parameters: {'num_heads': 8, 'model_dim': 456, 'num_layers': 6, 'dropout': 0.10975156635385823, 'learning_rate': 0.009975544146212064, 'weight_decay': 3.3604322237761436e-05, 'num_epochs': 17}. Best is trial 46 with value: 71.9397201538086.


Trial: 78 - Loss: 47.245887756347656 - Val Loss: 120.79502868652344


[I 2024-06-20 01:23:27,801] Trial 79 finished with value: 162.25259399414062 and parameters: {'num_heads': 8, 'model_dim': 336, 'num_layers': 6, 'dropout': 0.3036968475083005, 'learning_rate': 0.006053883152436517, 'weight_decay': 1.9866798127358087e-05, 'num_epochs': 10}. Best is trial 46 with value: 71.9397201538086.


Trial: 79 - Loss: 53.98870086669922 - Val Loss: 162.25259399414062


[I 2024-06-20 01:23:28,044] Trial 80 finished with value: 812.1373901367188 and parameters: {'num_heads': 1, 'model_dim': 47, 'num_layers': 5, 'dropout': 0.22339095293697617, 'learning_rate': 0.0004594781780304374, 'weight_decay': 1.2408124760473144e-05, 'num_epochs': 25}. Best is trial 46 with value: 71.9397201538086.


Trial: 80 - Loss: 351.125732421875 - Val Loss: 812.1373901367188


[I 2024-06-20 01:23:28,603] Trial 81 finished with value: 113.63508605957031 and parameters: {'num_heads': 8, 'model_dim': 512, 'num_layers': 6, 'dropout': 0.266789096464045, 'learning_rate': 0.007280009771251937, 'weight_decay': 1.59232807663453e-05, 'num_epochs': 20}. Best is trial 46 with value: 71.9397201538086.


Trial: 81 - Loss: 48.009891510009766 - Val Loss: 113.63508605957031


[I 2024-06-20 01:23:28,879] Trial 82 finished with value: 122.99073791503906 and parameters: {'num_heads': 8, 'model_dim': 216, 'num_layers': 6, 'dropout': 0.25442248508335025, 'learning_rate': 0.007199153137515235, 'weight_decay': 1.2408335526298123e-05, 'num_epochs': 14}. Best is trial 46 with value: 71.9397201538086.


Trial: 82 - Loss: 47.255863189697266 - Val Loss: 122.99073791503906


[I 2024-06-20 01:23:29,436] Trial 83 finished with value: 143.98712158203125 and parameters: {'num_heads': 7, 'model_dim': 434, 'num_layers': 6, 'dropout': 0.24740200438639207, 'learning_rate': 0.008334605770550113, 'weight_decay': 4.352396892499596e-05, 'num_epochs': 22}. Best is trial 46 with value: 71.9397201538086.
[I 2024-06-20 01:23:29,588] Trial 84 finished with value: 99.55308532714844 and parameters: {'num_heads': 8, 'model_dim': 496, 'num_layers': 2, 'dropout': 0.32309693509088067, 'learning_rate': 0.004275612110386559, 'weight_decay': 2.3451864820778442e-05, 'num_epochs': 12}. Best is trial 46 with value: 71.9397201538086.


Trial: 83 - Loss: 47.7265625 - Val Loss: 143.98712158203125
Trial: 84 - Loss: 48.86138916015625 - Val Loss: 99.55308532714844


[I 2024-06-20 01:23:29,905] Trial 85 finished with value: 182.57044982910156 and parameters: {'num_heads': 8, 'model_dim': 144, 'num_layers': 6, 'dropout': 0.2173315356307179, 'learning_rate': 0.00601730407564199, 'weight_decay': 1.128613526183653e-05, 'num_epochs': 19}. Best is trial 46 with value: 71.9397201538086.
[I 2024-06-20 01:23:29,987] Trial 86 finished with value: 396.8157043457031 and parameters: {'num_heads': 8, 'model_dim': 184, 'num_layers': 1, 'dropout': 0.2914411181332559, 'learning_rate': 0.0032064013603500334, 'weight_decay': 1.8639132122944108e-05, 'num_epochs': 16}. Best is trial 46 with value: 71.9397201538086.


Trial: 85 - Loss: 54.5338249206543 - Val Loss: 182.57044982910156
Trial: 86 - Loss: 135.4421844482422 - Val Loss: 396.8157043457031


[I 2024-06-20 01:23:30,361] Trial 87 finished with value: 77.3382568359375 and parameters: {'num_heads': 7, 'model_dim': 434, 'num_layers': 6, 'dropout': 0.18275284949474188, 'learning_rate': 0.005257688350445903, 'weight_decay': 1.5533251937174897e-05, 'num_epochs': 14}. Best is trial 46 with value: 71.9397201538086.


Trial: 87 - Loss: 64.19159698486328 - Val Loss: 77.3382568359375


[I 2024-06-20 01:23:30,685] Trial 88 finished with value: 273.7021789550781 and parameters: {'num_heads': 6, 'model_dim': 372, 'num_layers': 6, 'dropout': 0.18668994158292745, 'learning_rate': 0.0026847674833200635, 'weight_decay': 2.741123916023878e-05, 'num_epochs': 14}. Best is trial 46 with value: 71.9397201538086.


Trial: 88 - Loss: 83.55420684814453 - Val Loss: 273.7021789550781


[I 2024-06-20 01:23:30,966] Trial 89 finished with value: 686.6776123046875 and parameters: {'num_heads': 7, 'model_dim': 420, 'num_layers': 6, 'dropout': 0.13209649720883904, 'learning_rate': 0.00015469058398127593, 'weight_decay': 3.137438239898729e-05, 'num_epochs': 10}. Best is trial 46 with value: 71.9397201538086.


Trial: 89 - Loss: 276.5738220214844 - Val Loss: 686.6776123046875


[I 2024-06-20 01:23:31,842] Trial 90 finished with value: 137.255126953125 and parameters: {'num_heads': 6, 'model_dim': 360, 'num_layers': 5, 'dropout': 0.15509296901266287, 'learning_rate': 0.0046862167766255226, 'weight_decay': 1.5042023532062332e-05, 'num_epochs': 55}. Best is trial 46 with value: 71.9397201538086.


Trial: 90 - Loss: 46.70164108276367 - Val Loss: 137.255126953125


[I 2024-06-20 01:23:32,352] Trial 91 finished with value: 124.21066284179688 and parameters: {'num_heads': 7, 'model_dim': 434, 'num_layers': 6, 'dropout': 0.23567547181906828, 'learning_rate': 0.00783156235398773, 'weight_decay': 1.1961520167324666e-05, 'num_epochs': 20}. Best is trial 46 with value: 71.9397201538086.


Trial: 91 - Loss: 47.21823501586914 - Val Loss: 124.21066284179688


[I 2024-06-20 01:23:32,784] Trial 92 finished with value: 102.95225524902344 and parameters: {'num_heads': 7, 'model_dim': 413, 'num_layers': 6, 'dropout': 0.20608929397405892, 'learning_rate': 0.005629816028356134, 'weight_decay': 1.0217780559265896e-05, 'num_epochs': 17}. Best is trial 46 with value: 71.9397201538086.


Trial: 92 - Loss: 60.464046478271484 - Val Loss: 102.95225524902344


[I 2024-06-20 01:23:33,038] Trial 93 finished with value: 322.7083435058594 and parameters: {'num_heads': 8, 'model_dim': 248, 'num_layers': 6, 'dropout': 0.17323771799654922, 'learning_rate': 0.004060423825686573, 'weight_decay': 2.2522180595145435e-05, 'num_epochs': 12}. Best is trial 46 with value: 71.9397201538086.


Trial: 93 - Loss: 105.74701690673828 - Val Loss: 322.7083435058594


[I 2024-06-20 01:23:33,360] Trial 94 finished with value: 88.0292739868164 and parameters: {'num_heads': 8, 'model_dim': 264, 'num_layers': 6, 'dropout': 0.34096319365221184, 'learning_rate': 0.006722341734475021, 'weight_decay': 0.00017656865629601603, 'num_epochs': 15}. Best is trial 46 with value: 71.9397201538086.


Trial: 94 - Loss: 53.42952346801758 - Val Loss: 88.0292739868164


[I 2024-06-20 01:23:33,923] Trial 95 finished with value: 111.2375259399414 and parameters: {'num_heads': 8, 'model_dim': 392, 'num_layers': 6, 'dropout': 0.26557542850722315, 'learning_rate': 0.008892890059325925, 'weight_decay': 1.6375120623570707e-05, 'num_epochs': 23}. Best is trial 46 with value: 71.9397201538086.


Trial: 95 - Loss: 50.0759162902832 - Val Loss: 111.2375259399414


[I 2024-06-20 01:23:34,392] Trial 96 finished with value: 96.37464904785156 and parameters: {'num_heads': 7, 'model_dim': 427, 'num_layers': 6, 'dropout': 0.19762941029031783, 'learning_rate': 0.005252027583090839, 'weight_decay': 1.9016483709447068e-05, 'num_epochs': 18}. Best is trial 46 with value: 71.9397201538086.


Trial: 96 - Loss: 62.93903732299805 - Val Loss: 96.37464904785156


[I 2024-06-20 01:23:34,807] Trial 97 finished with value: 110.85248565673828 and parameters: {'num_heads': 7, 'model_dim': 434, 'num_layers': 6, 'dropout': 0.2506730895038982, 'learning_rate': 0.0036681148953918684, 'weight_decay': 0.001200152224829185, 'num_epochs': 15}. Best is trial 46 with value: 71.9397201538086.


Trial: 97 - Loss: 47.48685073852539 - Val Loss: 110.85248565673828


[I 2024-06-20 01:23:37,221] Trial 98 finished with value: 140.67703247070312 and parameters: {'num_heads': 8, 'model_dim': 472, 'num_layers': 6, 'dropout': 0.23103645907665055, 'learning_rate': 0.006853117131254777, 'weight_decay': 1.3932977677209717e-05, 'num_epochs': 100}. Best is trial 46 with value: 71.9397201538086.
[I 2024-06-20 01:23:37,364] Trial 99 finished with value: 327.22161865234375 and parameters: {'num_heads': 8, 'model_dim': 224, 'num_layers': 3, 'dropout': 0.27798832468564816, 'learning_rate': 0.004543168074140392, 'weight_decay': 0.00041156840195434233, 'num_epochs': 12}. Best is trial 46 with value: 71.9397201538086.


Trial: 98 - Loss: 46.90190124511719 - Val Loss: 140.67703247070312
Trial: 99 - Loss: 108.8806381225586 - Val Loss: 327.22161865234375


In [17]:
# Results
print(f'Número de pruebas: {len(study_g.trials)}')
trial = study_g.best_trial
print(f'Mejor prueba: {trial.number}')
print(f'Mejores parametros: {trial.params}')
print(f'Mejor valor de pérdida: {trial.value}')

Número de pruebas: 100
Mejor prueba: 46
Mejores parametros: {'num_heads': 8, 'model_dim': 224, 'num_layers': 6, 'dropout': 0.2320146403887001, 'learning_rate': 0.009731200416128362, 'weight_decay': 1.8538654992265227e-05, 'num_epochs': 14}
Mejor valor de pérdida: 71.9397201538086


## Single Thread

In [18]:
# configuration optuna
study_st = optuna.create_study(direction='minimize')
study_st.optimize(lambda trial: objective(trial, X_st_train, y_st_train, X_st_test, y_st_test, len(features), len(target)), n_trials=n_trials)

[I 2024-06-20 01:23:37,434] A new study created in memory with name: no-name-27f26d97-d698-4e8b-a710-0582086187aa
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
[I 2024-06-20 01:23:37,819] Trial 0 finished with value: 596.8732299804688 and parameters: {'num_heads': 5, 'model_dim': 45, 'num_layers': 5, 'dropout': 0.1339207703374725, 'learning_rate': 8.049873681567756e-05, 'weight_decay': 0.0007161807283236753, 'num_epochs': 48}. Best is trial 0 with value: 596.8732299804688.
[I 2024-06-20 01:23:37,882] Trial 1 finished with value: 555.1475219726562 and parameters: {'num_heads': 2, 'model_dim': 118, 'num_layers': 2, 'dropout': 0.395631613277572, 'learning_rate': 0.0002734949504491032, 'weight_decay': 8.413898593297126e-05, 'num_epochs': 15}. Best is trial 1 with value: 555.1475219726562.
[I 2024-06-20 01:23:38,000] Trial 2 finished with value: 570.3209228515625 and parameters: {'num_heads': 1, 'model_dim': 21, 'nu

Trial: 0 - Loss: 380.5248107910156 - Val Loss: 596.8732299804688
Trial: 1 - Loss: 350.29827880859375 - Val Loss: 555.1475219726562
Trial: 2 - Loss: 365.1371154785156 - Val Loss: 570.3209228515625


[I 2024-06-20 01:23:38,462] Trial 3 finished with value: 33.141597747802734 and parameters: {'num_heads': 7, 'model_dim': 224, 'num_layers': 4, 'dropout': 0.3384521539127864, 'learning_rate': 0.009425342371542283, 'weight_decay': 1.1441429955338632e-05, 'num_epochs': 54}. Best is trial 3 with value: 33.141597747802734.


Trial: 3 - Loss: 17.436294555664062 - Val Loss: 33.141597747802734


[I 2024-06-20 01:23:38,756] Trial 4 finished with value: 131.4488983154297 and parameters: {'num_heads': 4, 'model_dim': 184, 'num_layers': 2, 'dropout': 0.38049056849683727, 'learning_rate': 0.0018259939509228263, 'weight_decay': 6.522779228673137e-05, 'num_epochs': 78}. Best is trial 3 with value: 33.141597747802734.
[I 2024-06-20 01:23:38,866] Trial 5 finished with value: 598.3951416015625 and parameters: {'num_heads': 5, 'model_dim': 50, 'num_layers': 4, 'dropout': 0.1937440343446543, 'learning_rate': 0.00042346111989048353, 'weight_decay': 0.0012221083847293085, 'num_epochs': 16}. Best is trial 3 with value: 33.141597747802734.


Trial: 4 - Loss: 9.342180252075195 - Val Loss: 131.4488983154297
Trial: 5 - Loss: 382.4421081542969 - Val Loss: 598.3951416015625


[I 2024-06-20 01:23:39,007] Trial 6 finished with value: 49.03171920776367 and parameters: {'num_heads': 6, 'model_dim': 144, 'num_layers': 5, 'dropout': 0.1025004623021316, 'learning_rate': 0.008746585824356024, 'weight_decay': 0.004208879498879046, 'num_epochs': 15}. Best is trial 3 with value: 33.141597747802734.


Trial: 6 - Loss: 24.651851654052734 - Val Loss: 49.03171920776367


[I 2024-06-20 01:23:39,932] Trial 7 finished with value: 37.86460876464844 and parameters: {'num_heads': 6, 'model_dim': 336, 'num_layers': 6, 'dropout': 0.48763806535828935, 'learning_rate': 0.001573134737108138, 'weight_decay': 0.002323576557150756, 'num_epochs': 62}. Best is trial 3 with value: 33.141597747802734.


Trial: 7 - Loss: 17.439348220825195 - Val Loss: 37.86460876464844


[I 2024-06-20 01:23:40,939] Trial 8 finished with value: 34.560218811035156 and parameters: {'num_heads': 6, 'model_dim': 174, 'num_layers': 6, 'dropout': 0.13919946959289664, 'learning_rate': 0.006019057621013602, 'weight_decay': 2.285005484435488e-05, 'num_epochs': 91}. Best is trial 3 with value: 33.141597747802734.
[I 2024-06-20 01:23:41,099] Trial 9 finished with value: 642.1270751953125 and parameters: {'num_heads': 2, 'model_dim': 94, 'num_layers': 6, 'dropout': 0.251316923976954, 'learning_rate': 1.5952264132735326e-05, 'weight_decay': 0.003561388241829657, 'num_epochs': 16}. Best is trial 3 with value: 33.141597747802734.


Trial: 8 - Loss: 17.742103576660156 - Val Loss: 34.560218811035156
Trial: 9 - Loss: 405.3559265136719 - Val Loss: 642.1270751953125


[I 2024-06-20 01:23:41,287] Trial 10 finished with value: 543.2482299804688 and parameters: {'num_heads': 8, 'model_dim': 504, 'num_layers': 1, 'dropout': 0.362758166397813, 'learning_rate': 2.849850187768663e-05, 'weight_decay': 1.0476210649898452e-05, 'num_epochs': 41}. Best is trial 3 with value: 33.141597747802734.


Trial: 10 - Loss: 336.8794250488281 - Val Loss: 543.2482299804688


[I 2024-06-20 01:23:42,079] Trial 11 finished with value: 35.178260803222656 and parameters: {'num_heads': 8, 'model_dim': 288, 'num_layers': 3, 'dropout': 0.3175376045198284, 'learning_rate': 0.002827845691382471, 'weight_decay': 1.1382598723140346e-05, 'num_epochs': 99}. Best is trial 3 with value: 33.141597747802734.


Trial: 11 - Loss: 17.661880493164062 - Val Loss: 35.178260803222656


[I 2024-06-20 01:23:42,713] Trial 12 finished with value: 35.53691864013672 and parameters: {'num_heads': 7, 'model_dim': 224, 'num_layers': 4, 'dropout': 0.1875672533043362, 'learning_rate': 0.009112288398151579, 'weight_decay': 2.6237611645869207e-05, 'num_epochs': 72}. Best is trial 3 with value: 33.141597747802734.


Trial: 12 - Loss: 17.167837142944336 - Val Loss: 35.53691864013672


[I 2024-06-20 01:23:43,453] Trial 13 finished with value: 57.469173431396484 and parameters: {'num_heads': 7, 'model_dim': 287, 'num_layers': 3, 'dropout': 0.45986987622326053, 'learning_rate': 0.000716107655187609, 'weight_decay': 0.00024164721304363934, 'num_epochs': 94}. Best is trial 3 with value: 33.141597747802734.


Trial: 13 - Loss: 19.859394073486328 - Val Loss: 57.469173431396484


[I 2024-06-20 01:23:43,818] Trial 14 finished with value: 318.8387145996094 and parameters: {'num_heads': 4, 'model_dim': 184, 'num_layers': 6, 'dropout': 0.3048421019296915, 'learning_rate': 0.0009670730622119054, 'weight_decay': 3.1764422453707923e-05, 'num_epochs': 35}. Best is trial 3 with value: 33.141597747802734.


Trial: 14 - Loss: 177.09652709960938 - Val Loss: 318.8387145996094


[I 2024-06-20 01:23:44,542] Trial 15 finished with value: 38.3203125 and parameters: {'num_heads': 6, 'model_dim': 240, 'num_layers': 4, 'dropout': 0.2217050728505796, 'learning_rate': 0.004151956818722559, 'weight_decay': 0.00019455405853994784, 'num_epochs': 84}. Best is trial 3 with value: 33.141597747802734.


Trial: 15 - Loss: 17.528125762939453 - Val Loss: 38.3203125


[I 2024-06-20 01:23:45,556] Trial 16 finished with value: 354.2801818847656 and parameters: {'num_heads': 7, 'model_dim': 385, 'num_layers': 5, 'dropout': 0.4256227759657109, 'learning_rate': 0.0001639916455363254, 'weight_decay': 2.701704781856224e-05, 'num_epochs': 66}. Best is trial 3 with value: 33.141597747802734.
[I 2024-06-20 01:23:45,747] Trial 17 finished with value: 26.9622745513916 and parameters: {'num_heads': 3, 'model_dim': 135, 'num_layers': 3, 'dropout': 0.34286405533086467, 'learning_rate': 0.0049991678272620355, 'weight_decay': 0.008834810492776438, 'num_epochs': 32}. Best is trial 17 with value: 26.9622745513916.


Trial: 16 - Loss: 198.9624481201172 - Val Loss: 354.2801818847656
Trial: 17 - Loss: 19.56772232055664 - Val Loss: 26.9622745513916


[I 2024-06-20 01:23:45,888] Trial 18 finished with value: 348.3472595214844 and parameters: {'num_heads': 3, 'model_dim': 81, 'num_layers': 2, 'dropout': 0.3351276411348508, 'learning_rate': 0.0023564313773735347, 'weight_decay': 0.008388309636941002, 'num_epochs': 30}. Best is trial 17 with value: 26.9622745513916.


Trial: 18 - Loss: 200.6580047607422 - Val Loss: 348.3472595214844


[I 2024-06-20 01:23:46,197] Trial 19 finished with value: 312.775634765625 and parameters: {'num_heads': 3, 'model_dim': 120, 'num_layers': 3, 'dropout': 0.2806819771023733, 'learning_rate': 0.0009560109412065838, 'weight_decay': 0.0005337441854867694, 'num_epochs': 54}. Best is trial 17 with value: 26.9622745513916.
[I 2024-06-20 01:23:46,293] Trial 20 finished with value: 609.601806640625 and parameters: {'num_heads': 3, 'model_dim': 69, 'num_layers': 1, 'dropout': 0.4230086124239212, 'learning_rate': 7.803690984236956e-05, 'weight_decay': 0.00891394449513594, 'num_epochs': 29}. Best is trial 17 with value: 26.9622745513916.


Trial: 19 - Loss: 170.78929138183594 - Val Loss: 312.775634765625
Trial: 20 - Loss: 392.51055908203125 - Val Loss: 609.601806640625


[I 2024-06-20 01:23:46,659] Trial 21 finished with value: 37.91028594970703 and parameters: {'num_heads': 5, 'model_dim': 210, 'num_layers': 4, 'dropout': 0.36720828936454625, 'learning_rate': 0.006379050319673836, 'weight_decay': 1.603131989097744e-05, 'num_epochs': 45}. Best is trial 17 with value: 26.9622745513916.


Trial: 21 - Loss: 17.75816535949707 - Val Loss: 37.91028594970703


[I 2024-06-20 01:23:47,004] Trial 22 finished with value: 33.807945251464844 and parameters: {'num_heads': 6, 'model_dim': 162, 'num_layers': 3, 'dropout': 0.33693381502565123, 'learning_rate': 0.0038371971925468705, 'weight_decay': 4.6734111099081917e-05, 'num_epochs': 57}. Best is trial 17 with value: 26.9622745513916.


Trial: 22 - Loss: 17.326581954956055 - Val Loss: 33.807945251464844


[I 2024-06-20 01:23:47,444] Trial 23 finished with value: 37.50564956665039 and parameters: {'num_heads': 7, 'model_dim': 266, 'num_layers': 3, 'dropout': 0.33944115404212033, 'learning_rate': 0.00336006173444909, 'weight_decay': 5.6927668261581284e-05, 'num_epochs': 56}. Best is trial 17 with value: 26.9622745513916.


Trial: 23 - Loss: 17.559133529663086 - Val Loss: 37.50564956665039


[I 2024-06-20 01:23:47,670] Trial 24 finished with value: 37.68669509887695 and parameters: {'num_heads': 4, 'model_dim': 152, 'num_layers': 2, 'dropout': 0.29946859868290593, 'learning_rate': 0.009728268926955061, 'weight_decay': 0.0001387021833869794, 'num_epochs': 53}. Best is trial 17 with value: 26.9622745513916.


Trial: 24 - Loss: 17.363134384155273 - Val Loss: 37.68669509887695


[I 2024-06-20 01:23:47,932] Trial 25 finished with value: 118.37122344970703 and parameters: {'num_heads': 8, 'model_dim': 376, 'num_layers': 3, 'dropout': 0.4147175658015537, 'learning_rate': 0.001505789356534414, 'weight_decay': 0.00047540190901383125, 'num_epochs': 25}. Best is trial 17 with value: 26.9622745513916.


Trial: 25 - Loss: 51.43721008300781 - Val Loss: 118.37122344970703


[I 2024-06-20 01:23:48,213] Trial 26 finished with value: 548.1338500976562 and parameters: {'num_heads': 1, 'model_dim': 8, 'num_layers': 4, 'dropout': 0.34038068581752856, 'learning_rate': 0.004352922482283704, 'weight_decay': 4.863380696651085e-05, 'num_epochs': 39}. Best is trial 17 with value: 26.9622745513916.


Trial: 26 - Loss: 345.8795471191406 - Val Loss: 548.1338500976562


[I 2024-06-20 01:23:48,565] Trial 27 finished with value: 428.0522155761719 and parameters: {'num_heads': 2, 'model_dim': 100, 'num_layers': 3, 'dropout': 0.24903024099277873, 'learning_rate': 0.0005264704519174222, 'weight_decay': 0.00011837082942032705, 'num_epochs': 63}. Best is trial 17 with value: 26.9622745513916.


Trial: 27 - Loss: 253.21485900878906 - Val Loss: 428.0522155761719


[I 2024-06-20 01:23:48,865] Trial 28 finished with value: 36.033241271972656 and parameters: {'num_heads': 5, 'model_dim': 205, 'num_layers': 2, 'dropout': 0.4517730980052908, 'learning_rate': 0.0026525882880747437, 'weight_decay': 0.001041117171049156, 'num_epochs': 71}. Best is trial 17 with value: 26.9622745513916.


Trial: 28 - Loss: 17.492197036743164 - Val Loss: 36.033241271972656


[I 2024-06-20 01:23:49,238] Trial 29 finished with value: 537.1286010742188 and parameters: {'num_heads': 6, 'model_dim': 150, 'num_layers': 4, 'dropout': 0.3517251926135272, 'learning_rate': 7.069998487093574e-05, 'weight_decay': 4.159632394420382e-05, 'num_epochs': 48}. Best is trial 17 with value: 26.9622745513916.


Trial: 29 - Loss: 335.1739196777344 - Val Loss: 537.1286010742188


[I 2024-06-20 01:23:49,922] Trial 30 finished with value: 407.599609375 and parameters: {'num_heads': 7, 'model_dim': 238, 'num_layers': 5, 'dropout': 0.3957988130400166, 'learning_rate': 0.00023924188113952922, 'weight_decay': 1.4650535809703218e-05, 'num_epochs': 58}. Best is trial 17 with value: 26.9622745513916.


Trial: 30 - Loss: 238.018310546875 - Val Loss: 407.599609375


[I 2024-06-20 01:23:50,902] Trial 31 finished with value: 34.826629638671875 and parameters: {'num_heads': 6, 'model_dim': 168, 'num_layers': 6, 'dropout': 0.14916913726032874, 'learning_rate': 0.005801863656815434, 'weight_decay': 3.630818943211102e-05, 'num_epochs': 89}. Best is trial 17 with value: 26.9622745513916.


Trial: 31 - Loss: 17.571706771850586 - Val Loss: 34.826629638671875


[I 2024-06-20 01:23:51,578] Trial 32 finished with value: 37.59308624267578 and parameters: {'num_heads': 5, 'model_dim': 130, 'num_layers': 5, 'dropout': 0.3174816299239754, 'learning_rate': 0.006448412880575146, 'weight_decay': 1.9145770452638906e-05, 'num_epochs': 82}. Best is trial 17 with value: 26.9622745513916.


Trial: 32 - Loss: 17.451696395874023 - Val Loss: 37.59308624267578


[I 2024-06-20 01:23:51,829] Trial 33 finished with value: 33.020626068115234 and parameters: {'num_heads': 6, 'model_dim': 180, 'num_layers': 2, 'dropout': 0.28309759126453493, 'learning_rate': 0.005962311207583065, 'weight_decay': 1.8918585973594165e-05, 'num_epochs': 47}. Best is trial 17 with value: 26.9622745513916.


Trial: 33 - Loss: 17.197641372680664 - Val Loss: 33.020626068115234


[I 2024-06-20 01:23:52,068] Trial 34 finished with value: 40.57664108276367 and parameters: {'num_heads': 7, 'model_dim': 189, 'num_layers': 2, 'dropout': 0.28593759460080365, 'learning_rate': 0.002145139810728019, 'weight_decay': 9.570291547685859e-05, 'num_epochs': 49}. Best is trial 17 with value: 26.9622745513916.
[I 2024-06-20 01:23:52,157] Trial 35 finished with value: 123.21388244628906 and parameters: {'num_heads': 5, 'model_dim': 160, 'num_layers': 1, 'dropout': 0.27124242198581516, 'learning_rate': 0.003629651628185242, 'weight_decay': 1.0624170521902322e-05, 'num_epochs': 23}. Best is trial 17 with value: 26.9622745513916.


Trial: 34 - Loss: 17.578454971313477 - Val Loss: 40.57664108276367
Trial: 35 - Loss: 59.523624420166016 - Val Loss: 123.21388244628906


[I 2024-06-20 01:23:52,328] Trial 36 finished with value: 32.91725158691406 and parameters: {'num_heads': 4, 'model_dim': 128, 'num_layers': 2, 'dropout': 0.3832172551519771, 'learning_rate': 0.009489025479923712, 'weight_decay': 7.251129506660754e-05, 'num_epochs': 37}. Best is trial 17 with value: 26.9622745513916.
[I 2024-06-20 01:23:52,490] Trial 37 finished with value: 46.7617073059082 and parameters: {'num_heads': 4, 'model_dim': 112, 'num_layers': 2, 'dropout': 0.39031416246459016, 'learning_rate': 0.0079220790272979, 'weight_decay': 0.0017949326965484768, 'num_epochs': 35}. Best is trial 17 with value: 26.9622745513916.


Trial: 36 - Loss: 17.361330032348633 - Val Loss: 32.91725158691406
Trial: 37 - Loss: 19.99359893798828 - Val Loss: 46.7617073059082


[I 2024-06-20 01:23:52,563] Trial 38 finished with value: 133.68243408203125 and parameters: {'num_heads': 3, 'model_dim': 135, 'num_layers': 2, 'dropout': 0.22548320383355347, 'learning_rate': 0.009957698770413782, 'weight_decay': 7.523901636486044e-05, 'num_epochs': 10}. Best is trial 17 with value: 26.9622745513916.
[I 2024-06-20 01:23:52,690] Trial 39 finished with value: 33.91691589355469 and parameters: {'num_heads': 4, 'model_dim': 192, 'num_layers': 1, 'dropout': 0.3697064507568792, 'learning_rate': 0.005259133664059339, 'weight_decay': 0.00029705103157969014, 'num_epochs': 41}. Best is trial 17 with value: 26.9622745513916.


Trial: 38 - Loss: 79.0382080078125 - Val Loss: 133.68243408203125
Trial: 39 - Loss: 17.726970672607422 - Val Loss: 33.91691589355469


[I 2024-06-20 01:23:52,810] Trial 40 finished with value: 357.77801513671875 and parameters: {'num_heads': 3, 'model_dim': 141, 'num_layers': 2, 'dropout': 0.40177483402662484, 'learning_rate': 0.0015763150281614979, 'weight_decay': 1.7542373144705865e-05, 'num_epochs': 23}. Best is trial 17 with value: 26.9622745513916.


Trial: 40 - Loss: 209.76296997070312 - Val Loss: 357.77801513671875


[I 2024-06-20 01:23:53,075] Trial 41 finished with value: 142.15414428710938 and parameters: {'num_heads': 6, 'model_dim': 54, 'num_layers': 3, 'dropout': 0.32798802246256037, 'learning_rate': 0.00409554930596864, 'weight_decay': 0.00016063915717196085, 'num_epochs': 46}. Best is trial 17 with value: 26.9622745513916.


Trial: 41 - Loss: 61.27798080444336 - Val Loss: 142.15414428710938


[I 2024-06-20 01:23:53,288] Trial 42 finished with value: 49.52909469604492 and parameters: {'num_heads': 5, 'model_dim': 120, 'num_layers': 3, 'dropout': 0.2970448622803782, 'learning_rate': 0.006430753827469775, 'weight_decay': 5.3477849909842736e-05, 'num_epochs': 36}. Best is trial 17 with value: 26.9622745513916.


Trial: 42 - Loss: 18.450092315673828 - Val Loss: 49.52909469604492


[I 2024-06-20 01:23:53,703] Trial 43 finished with value: 39.30217742919922 and parameters: {'num_heads': 2, 'model_dim': 108, 'num_layers': 4, 'dropout': 0.3770001934910407, 'learning_rate': 0.0029335406007029575, 'weight_decay': 2.0681179373037444e-05, 'num_epochs': 59}. Best is trial 17 with value: 26.9622745513916.


Trial: 43 - Loss: 17.513660430908203 - Val Loss: 39.30217742919922


[I 2024-06-20 01:23:53,979] Trial 44 finished with value: 31.574825286865234 and parameters: {'num_heads': 4, 'model_dim': 164, 'num_layers': 3, 'dropout': 0.35022538941061343, 'learning_rate': 0.0077091730731396494, 'weight_decay': 0.004768593163996761, 'num_epochs': 44}. Best is trial 17 with value: 26.9622745513916.
[I 2024-06-20 01:23:54,111] Trial 45 finished with value: 115.05470275878906 and parameters: {'num_heads': 4, 'model_dim': 180, 'num_layers': 1, 'dropout': 0.35274147994830457, 'learning_rate': 0.007243945335342447, 'weight_decay': 0.006155633528528437, 'num_epochs': 43}. Best is trial 17 with value: 26.9622745513916.


Trial: 44 - Loss: 18.02191734313965 - Val Loss: 31.574825286865234
Trial: 45 - Loss: 7.269745349884033 - Val Loss: 115.05470275878906


[I 2024-06-20 01:23:54,336] Trial 46 finished with value: 32.60560607910156 and parameters: {'num_heads': 4, 'model_dim': 204, 'num_layers': 2, 'dropout': 0.44046600728905755, 'learning_rate': 0.009892302489095452, 'weight_decay': 0.004076141575019139, 'num_epochs': 51}. Best is trial 17 with value: 26.9622745513916.


Trial: 46 - Loss: 17.0913028717041 - Val Loss: 32.60560607910156


[I 2024-06-20 01:23:54,584] Trial 47 finished with value: 34.9834098815918 and parameters: {'num_heads': 4, 'model_dim': 200, 'num_layers': 2, 'dropout': 0.44649925470114227, 'learning_rate': 0.004948098253390204, 'weight_decay': 0.003985394617130977, 'num_epochs': 51}. Best is trial 17 with value: 26.9622745513916.
[I 2024-06-20 01:23:54,735] Trial 48 finished with value: 37.00632095336914 and parameters: {'num_heads': 4, 'model_dim': 176, 'num_layers': 2, 'dropout': 0.46844301839216707, 'learning_rate': 0.00778683593949153, 'weight_decay': 0.0025986614718808403, 'num_epochs': 30}. Best is trial 17 with value: 26.9622745513916.


Trial: 47 - Loss: 17.92201042175293 - Val Loss: 34.9834098815918
Trial: 48 - Loss: 18.679725646972656 - Val Loss: 37.00632095336914


[I 2024-06-20 01:23:54,913] Trial 49 finished with value: 644.8704223632812 and parameters: {'num_heads': 3, 'model_dim': 129, 'num_layers': 2, 'dropout': 0.4319346520134616, 'learning_rate': 1.0543978561097432e-05, 'weight_decay': 0.005597953625766651, 'num_epochs': 38}. Best is trial 17 with value: 26.9622745513916.
[I 2024-06-20 01:23:55,022] Trial 50 finished with value: 314.16131591796875 and parameters: {'num_heads': 3, 'model_dim': 156, 'num_layers': 1, 'dropout': 0.49921914560444264, 'learning_rate': 0.0012206042885699268, 'weight_decay': 0.00592327094512794, 'num_epochs': 33}. Best is trial 17 with value: 26.9622745513916.


Trial: 49 - Loss: 411.4051818847656 - Val Loss: 644.8704223632812
Trial: 50 - Loss: 180.12315368652344 - Val Loss: 314.16131591796875


[I 2024-06-20 01:23:55,281] Trial 51 finished with value: 40.91781234741211 and parameters: {'num_heads': 4, 'model_dim': 216, 'num_layers': 3, 'dropout': 0.35580339430608715, 'learning_rate': 0.009885492107135276, 'weight_decay': 0.0026816593778737786, 'num_epochs': 44}. Best is trial 17 with value: 26.9622745513916.


Trial: 51 - Loss: 17.927492141723633 - Val Loss: 40.91781234741211


[I 2024-06-20 01:23:55,584] Trial 52 finished with value: 36.85087966918945 and parameters: {'num_heads': 2, 'model_dim': 114, 'num_layers': 3, 'dropout': 0.3104274383556465, 'learning_rate': 0.005218085015648803, 'weight_decay': 0.0016347701505516083, 'num_epochs': 52}. Best is trial 17 with value: 26.9622745513916.


Trial: 52 - Loss: 17.419384002685547 - Val Loss: 36.85087966918945


[I 2024-06-20 01:23:55,996] Trial 53 finished with value: 37.73625183105469 and parameters: {'num_heads': 5, 'model_dim': 230, 'num_layers': 4, 'dropout': 0.4093617291163395, 'learning_rate': 0.0069488231993088834, 'weight_decay': 0.00958011963688302, 'num_epochs': 49}. Best is trial 17 with value: 26.9622745513916.


Trial: 53 - Loss: 17.98937225341797 - Val Loss: 37.73625183105469


[I 2024-06-20 01:23:56,369] Trial 54 finished with value: 113.4233627319336 and parameters: {'num_heads': 8, 'model_dim': 280, 'num_layers': 2, 'dropout': 0.25088946800494877, 'learning_rate': 0.004878822741985718, 'weight_decay': 0.003473477927754559, 'num_epochs': 66}. Best is trial 17 with value: 26.9622745513916.


Trial: 54 - Loss: 7.222509384155273 - Val Loss: 113.4233627319336


[I 2024-06-20 01:23:56,616] Trial 55 finished with value: 36.886802673339844 and parameters: {'num_heads': 3, 'model_dim': 168, 'num_layers': 3, 'dropout': 0.38226145902318404, 'learning_rate': 0.007861244606023242, 'weight_decay': 0.006862475839075461, 'num_epochs': 41}. Best is trial 17 with value: 26.9622745513916.
[I 2024-06-20 01:23:56,725] Trial 56 finished with value: 186.52452087402344 and parameters: {'num_heads': 4, 'model_dim': 196, 'num_layers': 2, 'dropout': 0.47664798971494415, 'learning_rate': 0.002984936660829528, 'weight_decay': 1.3042305333275134e-05, 'num_epochs': 19}. Best is trial 17 with value: 26.9622745513916.


Trial: 55 - Loss: 17.66189193725586 - Val Loss: 36.886802673339844
Trial: 56 - Loss: 96.99653625488281 - Val Loss: 186.52452087402344


[I 2024-06-20 01:23:57,187] Trial 57 finished with value: 41.93734359741211 and parameters: {'num_heads': 4, 'model_dim': 212, 'num_layers': 4, 'dropout': 0.43798556801283145, 'learning_rate': 0.0018847660175705407, 'weight_decay': 0.004702161842754734, 'num_epochs': 61}. Best is trial 17 with value: 26.9622745513916.


Trial: 57 - Loss: 17.48250389099121 - Val Loss: 41.93734359741211


[I 2024-06-20 01:23:57,421] Trial 58 finished with value: 547.0045166015625 and parameters: {'num_heads': 7, 'model_dim': 259, 'num_layers': 3, 'dropout': 0.31914511711420357, 'learning_rate': 3.558140437443361e-05, 'weight_decay': 0.0008091387928542497, 'num_epochs': 27}. Best is trial 17 with value: 26.9622745513916.
[I 2024-06-20 01:23:57,564] Trial 59 finished with value: 35.230552673339844 and parameters: {'num_heads': 5, 'model_dim': 245, 'num_layers': 1, 'dropout': 0.28350090946409345, 'learning_rate': 0.0034628313593574422, 'weight_decay': 2.6999889440997847e-05, 'num_epochs': 47}. Best is trial 17 with value: 26.9622745513916.


Trial: 58 - Loss: 342.7031555175781 - Val Loss: 547.0045166015625
Trial: 59 - Loss: 17.594829559326172 - Val Loss: 35.230552673339844


[I 2024-06-20 01:23:57,723] Trial 60 finished with value: 33.511009216308594 and parameters: {'num_heads': 3, 'model_dim': 183, 'num_layers': 2, 'dropout': 0.38487402065907134, 'learning_rate': 0.009799032299196144, 'weight_decay': 0.0004495349378648256, 'num_epochs': 33}. Best is trial 17 with value: 26.9622745513916.
[I 2024-06-20 01:23:57,885] Trial 61 finished with value: 34.2678337097168 and parameters: {'num_heads': 3, 'model_dim': 183, 'num_layers': 2, 'dropout': 0.3784605547869738, 'learning_rate': 0.007961452021451093, 'weight_decay': 0.00040852559201483784, 'num_epochs': 34}. Best is trial 17 with value: 26.9622745513916.


Trial: 60 - Loss: 18.05347442626953 - Val Loss: 33.511009216308594
Trial: 61 - Loss: 17.917816162109375 - Val Loss: 34.2678337097168


[I 2024-06-20 01:23:58,057] Trial 62 finished with value: 44.404197692871094 and parameters: {'num_heads': 2, 'model_dim': 124, 'num_layers': 2, 'dropout': 0.3436946377238897, 'learning_rate': 0.005935156085278333, 'weight_decay': 0.002948761152824831, 'num_epochs': 38}. Best is trial 17 with value: 26.9622745513916.


Trial: 62 - Loss: 17.46938705444336 - Val Loss: 44.404197692871094


[I 2024-06-20 01:23:58,314] Trial 63 finished with value: 35.842262268066406 and parameters: {'num_heads': 3, 'model_dim': 171, 'num_layers': 3, 'dropout': 0.3647072772018729, 'learning_rate': 0.009367964689378285, 'weight_decay': 0.000645813208524233, 'num_epochs': 43}. Best is trial 17 with value: 26.9622745513916.


Trial: 63 - Loss: 17.27305030822754 - Val Loss: 35.842262268066406


[I 2024-06-20 01:23:58,516] Trial 64 finished with value: 34.087486267089844 and parameters: {'num_heads': 4, 'model_dim': 224, 'num_layers': 3, 'dropout': 0.32442928463625276, 'learning_rate': 0.00440964051633284, 'weight_decay': 0.001179959501072311, 'num_epochs': 30}. Best is trial 17 with value: 26.9622745513916.


Trial: 64 - Loss: 17.289953231811523 - Val Loss: 34.087486267089844


[I 2024-06-20 01:23:58,928] Trial 65 finished with value: 33.47180938720703 and parameters: {'num_heads': 3, 'model_dim': 177, 'num_layers': 4, 'dropout': 0.41380225734429976, 'learning_rate': 0.0066686200431219445, 'weight_decay': 0.007246390557758605, 'num_epochs': 54}. Best is trial 17 with value: 26.9622745513916.


Trial: 65 - Loss: 17.911909103393555 - Val Loss: 33.47180938720703


[I 2024-06-20 01:23:59,316] Trial 66 finished with value: 40.32209014892578 and parameters: {'num_heads': 1, 'model_dim': 59, 'num_layers': 4, 'dropout': 0.39400417325804526, 'learning_rate': 0.005995153990796935, 'weight_decay': 0.007274182222819247, 'num_epochs': 54}. Best is trial 17 with value: 26.9622745513916.


Trial: 66 - Loss: 17.598081588745117 - Val Loss: 40.32209014892578


[I 2024-06-20 01:23:59,875] Trial 67 finished with value: 34.124961853027344 and parameters: {'num_heads': 8, 'model_dim': 296, 'num_layers': 4, 'dropout': 0.4193682716471358, 'learning_rate': 0.002597075032139666, 'weight_decay': 0.005092157134853544, 'num_epochs': 50}. Best is trial 17 with value: 26.9622745513916.


Trial: 67 - Loss: 17.639175415039062 - Val Loss: 34.124961853027344


[I 2024-06-20 01:24:00,461] Trial 68 finished with value: 430.521728515625 and parameters: {'num_heads': 5, 'model_dim': 140, 'num_layers': 5, 'dropout': 0.4025273142700788, 'learning_rate': 0.0003424149460466732, 'weight_decay': 0.008245429442607753, 'num_epochs': 66}. Best is trial 17 with value: 26.9622745513916.


Trial: 68 - Loss: 254.7159423828125 - Val Loss: 430.521728515625


[I 2024-06-20 01:24:00,854] Trial 69 finished with value: 34.63311767578125 and parameters: {'num_heads': 4, 'model_dim': 152, 'num_layers': 4, 'dropout': 0.2666871520887695, 'learning_rate': 0.003717965232543568, 'weight_decay': 0.002088410243565957, 'num_epochs': 55}. Best is trial 17 with value: 26.9622745513916.


Trial: 69 - Loss: 17.7927303314209 - Val Loss: 34.63311767578125


[I 2024-06-20 01:24:01,143] Trial 70 finished with value: 571.4094848632812 and parameters: {'num_heads': 2, 'model_dim': 84, 'num_layers': 4, 'dropout': 0.2326666184312008, 'learning_rate': 0.00013718706981468993, 'weight_decay': 0.0033111356882314365, 'num_epochs': 40}. Best is trial 17 with value: 26.9622745513916.
[I 2024-06-20 01:24:01,299] Trial 71 finished with value: 31.785560607910156 and parameters: {'num_heads': 3, 'model_dim': 177, 'num_layers': 2, 'dropout': 0.3879956049592471, 'learning_rate': 0.008229342391716816, 'weight_decay': 0.0002148964888841253, 'num_epochs': 32}. Best is trial 17 with value: 26.9622745513916.


Trial: 70 - Loss: 361.1565246582031 - Val Loss: 571.4094848632812
Trial: 71 - Loss: 17.77111053466797 - Val Loss: 31.785560607910156


[I 2024-06-20 01:24:01,509] Trial 72 finished with value: 37.261722564697266 and parameters: {'num_heads': 3, 'model_dim': 177, 'num_layers': 2, 'dropout': 0.3503265481999931, 'learning_rate': 0.00764414593068284, 'weight_decay': 0.00010337662033794042, 'num_epochs': 45}. Best is trial 17 with value: 26.9622745513916.


Trial: 72 - Loss: 17.34697151184082 - Val Loss: 37.261722564697266


[I 2024-06-20 01:24:01,741] Trial 73 finished with value: 41.970054626464844 and parameters: {'num_heads': 2, 'model_dim': 124, 'num_layers': 3, 'dropout': 0.4125850427642353, 'learning_rate': 0.0053507751724236, 'weight_decay': 6.739383425426064e-05, 'num_epochs': 37}. Best is trial 17 with value: 26.9622745513916.


Trial: 73 - Loss: 18.750425338745117 - Val Loss: 41.970054626464844


[I 2024-06-20 01:24:02,243] Trial 74 finished with value: 35.92641830444336 and parameters: {'num_heads': 3, 'model_dim': 165, 'num_layers': 5, 'dropout': 0.36798791827287197, 'learning_rate': 0.006325924649060671, 'weight_decay': 0.00022172974814002158, 'num_epochs': 59}. Best is trial 17 with value: 26.9622745513916.


Trial: 74 - Loss: 17.632102966308594 - Val Loss: 35.92641830444336


[I 2024-06-20 01:24:02,460] Trial 75 finished with value: 39.977333068847656 and parameters: {'num_heads': 4, 'model_dim': 204, 'num_layers': 2, 'dropout': 0.3297660747720398, 'learning_rate': 0.004393785893605957, 'weight_decay': 1.310958859495724e-05, 'num_epochs': 47}. Best is trial 17 with value: 26.9622745513916.


Trial: 75 - Loss: 17.589702606201172 - Val Loss: 39.977333068847656


[I 2024-06-20 01:24:02,829] Trial 76 finished with value: 35.28475570678711 and parameters: {'num_heads': 6, 'model_dim': 102, 'num_layers': 3, 'dropout': 0.2980231178621056, 'learning_rate': 0.008572829567697176, 'weight_decay': 0.0003584576447736495, 'num_epochs': 63}. Best is trial 17 with value: 26.9622745513916.


Trial: 76 - Loss: 17.4597225189209 - Val Loss: 35.28475570678711


[I 2024-06-20 01:24:03,069] Trial 77 finished with value: 26.779861450195312 and parameters: {'num_heads': 7, 'model_dim': 147, 'num_layers': 4, 'dropout': 0.4374999816464191, 'learning_rate': 0.007060883848806592, 'weight_decay': 0.004315825541610453, 'num_epochs': 27}. Best is trial 77 with value: 26.779861450195312.
[I 2024-06-20 01:24:03,182] Trial 78 finished with value: 48.90482711791992 and parameters: {'num_heads': 7, 'model_dim': 91, 'num_layers': 2, 'dropout': 0.45965345714117367, 'learning_rate': 0.008269124299655694, 'weight_decay': 0.004566159602089078, 'num_epochs': 22}. Best is trial 77 with value: 26.779861450195312.


Trial: 77 - Loss: 21.42996597290039 - Val Loss: 26.779861450195312
Trial: 78 - Loss: 22.647445678710938 - Val Loss: 48.90482711791992


[I 2024-06-20 01:24:03,274] Trial 79 finished with value: 147.4307861328125 and parameters: {'num_heads': 6, 'model_dim': 144, 'num_layers': 1, 'dropout': 0.44000111367273587, 'learning_rate': 0.003294050414655516, 'weight_decay': 3.246801873285251e-05, 'num_epochs': 25}. Best is trial 77 with value: 26.779861450195312.
[I 2024-06-20 01:24:03,457] Trial 80 finished with value: 28.827077865600586 and parameters: {'num_heads': 7, 'model_dim': 245, 'num_layers': 2, 'dropout': 0.34214758397777445, 'learning_rate': 0.004896087119179283, 'weight_decay': 0.00017435503971459655, 'num_epochs': 31}. Best is trial 77 with value: 26.779861450195312.


Trial: 79 - Loss: 73.92974853515625 - Val Loss: 147.4307861328125
Trial: 80 - Loss: 19.342815399169922 - Val Loss: 28.827077865600586


[I 2024-06-20 01:24:03,642] Trial 81 finished with value: 37.99057388305664 and parameters: {'num_heads': 7, 'model_dim': 252, 'num_layers': 2, 'dropout': 0.3123088122451396, 'learning_rate': 0.005255244178708251, 'weight_decay': 0.009927258032652257, 'num_epochs': 32}. Best is trial 77 with value: 26.779861450195312.
[I 2024-06-20 01:24:03,803] Trial 82 finished with value: 62.13291931152344 and parameters: {'num_heads': 7, 'model_dim': 231, 'num_layers': 2, 'dropout': 0.35902668183581277, 'learning_rate': 0.007304078158914516, 'weight_decay': 0.0002780180270135965, 'num_epochs': 27}. Best is trial 77 with value: 26.779861450195312.


Trial: 81 - Loss: 17.644569396972656 - Val Loss: 37.99057388305664
Trial: 82 - Loss: 21.85930633544922 - Val Loss: 62.13291931152344


[I 2024-06-20 01:24:03,992] Trial 83 finished with value: 40.327205657958984 and parameters: {'num_heads': 7, 'model_dim': 315, 'num_layers': 2, 'dropout': 0.34174119879697773, 'learning_rate': 0.004304016832170439, 'weight_decay': 0.00016693025469031834, 'num_epochs': 28}. Best is trial 77 with value: 26.779861450195312.


Trial: 83 - Loss: 18.5627384185791 - Val Loss: 40.327205657958984


[I 2024-06-20 01:24:04,234] Trial 84 finished with value: 104.948486328125 and parameters: {'num_heads': 8, 'model_dim': 208, 'num_layers': 3, 'dropout': 0.42948583366829535, 'learning_rate': 0.002262520002183224, 'weight_decay': 0.00013230662899240677, 'num_epochs': 31}. Best is trial 77 with value: 26.779861450195312.
[I 2024-06-20 01:24:04,419] Trial 85 finished with value: 26.260526657104492 and parameters: {'num_heads': 7, 'model_dim': 196, 'num_layers': 2, 'dropout': 0.3720771417710591, 'learning_rate': 0.008814942881441384, 'weight_decay': 0.00020189673516965147, 'num_epochs': 36}. Best is trial 85 with value: 26.260526657104492.


Trial: 84 - Loss: 43.12310028076172 - Val Loss: 104.948486328125
Trial: 85 - Loss: 19.623733520507812 - Val Loss: 26.260526657104492


[I 2024-06-20 01:24:04,502] Trial 86 finished with value: 468.6476135253906 and parameters: {'num_heads': 6, 'model_dim': 192, 'num_layers': 1, 'dropout': 0.37434212658742183, 'learning_rate': 0.0006340494722760154, 'weight_decay': 8.527358969901693e-05, 'num_epochs': 20}. Best is trial 85 with value: 26.260526657104492.
[I 2024-06-20 01:24:04,679] Trial 87 finished with value: 34.8640022277832 and parameters: {'num_heads': 7, 'model_dim': 154, 'num_layers': 2, 'dropout': 0.39130525523944026, 'learning_rate': 0.00564623555008096, 'weight_decay': 0.00023179649638362953, 'num_epochs': 36}. Best is trial 85 with value: 26.260526657104492.


Trial: 86 - Loss: 289.8584289550781 - Val Loss: 468.6476135253906
Trial: 87 - Loss: 17.94723892211914 - Val Loss: 34.8640022277832


[I 2024-06-20 01:24:04,866] Trial 88 finished with value: 35.814353942871094 and parameters: {'num_heads': 8, 'model_dim': 72, 'num_layers': 2, 'dropout': 0.4033779552313998, 'learning_rate': 0.009952020521072106, 'weight_decay': 0.00018370848937448417, 'num_epochs': 42}. Best is trial 85 with value: 26.260526657104492.


Trial: 88 - Loss: 17.82510757446289 - Val Loss: 35.814353942871094


[I 2024-06-20 01:24:05,075] Trial 89 finished with value: 40.713077545166016 and parameters: {'num_heads': 7, 'model_dim': 224, 'num_layers': 2, 'dropout': 0.3323090550851591, 'learning_rate': 0.00699805759589283, 'weight_decay': 0.00011634650741227917, 'num_epochs': 40}. Best is trial 85 with value: 26.260526657104492.
[I 2024-06-20 01:24:05,169] Trial 90 finished with value: 71.54434967041016 and parameters: {'num_heads': 5, 'model_dim': 135, 'num_layers': 1, 'dropout': 0.45864021726822174, 'learning_rate': 0.00468190240200496, 'weight_decay': 0.0002848201666517648, 'num_epochs': 25}. Best is trial 85 with value: 26.260526657104492.


Trial: 89 - Loss: 17.60531997680664 - Val Loss: 40.713077545166016
Trial: 90 - Loss: 31.40436363220215 - Val Loss: 71.54434967041016


[I 2024-06-20 01:24:05,371] Trial 91 finished with value: 28.837432861328125 and parameters: {'num_heads': 7, 'model_dim': 266, 'num_layers': 2, 'dropout': 0.34855974882187296, 'learning_rate': 0.008756341634663423, 'weight_decay': 4.076906270623024e-05, 'num_epochs': 34}. Best is trial 85 with value: 26.260526657104492.
[I 2024-06-20 01:24:05,566] Trial 92 finished with value: 44.94063186645508 and parameters: {'num_heads': 6, 'model_dim': 276, 'num_layers': 2, 'dropout': 0.3478752476793862, 'learning_rate': 0.008792156053215825, 'weight_decay': 4.5829436701567734e-05, 'num_epochs': 34}. Best is trial 85 with value: 26.260526657104492.


Trial: 91 - Loss: 18.772783279418945 - Val Loss: 28.837432861328125
Trial: 92 - Loss: 18.155426025390625 - Val Loss: 44.94063186645508


[I 2024-06-20 01:24:05,787] Trial 93 finished with value: 35.817054748535156 and parameters: {'num_heads': 7, 'model_dim': 266, 'num_layers': 2, 'dropout': 0.10655127037464945, 'learning_rate': 0.0063905696745366145, 'weight_decay': 6.393486904865064e-05, 'num_epochs': 38}. Best is trial 85 with value: 26.260526657104492.
[I 2024-06-20 01:24:05,980] Trial 94 finished with value: 41.93613815307617 and parameters: {'num_heads': 8, 'model_dim': 248, 'num_layers': 2, 'dropout': 0.35659486091959036, 'learning_rate': 0.008622721424316892, 'weight_decay': 2.3011686886808134e-05, 'num_epochs': 32}. Best is trial 85 with value: 26.260526657104492.


Trial: 93 - Loss: 17.802785873413086 - Val Loss: 35.817054748535156
Trial: 94 - Loss: 17.818452835083008 - Val Loss: 41.93613815307617


[I 2024-06-20 01:24:06,129] Trial 95 finished with value: 29.726823806762695 and parameters: {'num_heads': 7, 'model_dim': 161, 'num_layers': 2, 'dropout': 0.30468706883076246, 'learning_rate': 0.007316235115612163, 'weight_decay': 3.9344777158246185e-05, 'num_epochs': 28}. Best is trial 85 with value: 26.260526657104492.
[I 2024-06-20 01:24:06,279] Trial 96 finished with value: 50.719383239746094 and parameters: {'num_heads': 7, 'model_dim': 161, 'num_layers': 2, 'dropout': 0.3658284817628492, 'learning_rate': 0.007457385467974115, 'weight_decay': 3.460994923721535e-05, 'num_epochs': 29}. Best is trial 85 with value: 26.260526657104492.


Trial: 95 - Loss: 23.02613639831543 - Val Loss: 29.726823806762695
Trial: 96 - Loss: 19.819927215576172 - Val Loss: 50.719383239746094


[I 2024-06-20 01:24:06,494] Trial 97 finished with value: 18.258420944213867 and parameters: {'num_heads': 7, 'model_dim': 392, 'num_layers': 3, 'dropout': 0.3360505706259301, 'learning_rate': 0.003928416878757786, 'weight_decay': 3.886347853547162e-05, 'num_epochs': 19}. Best is trial 97 with value: 18.258420944213867.


Trial: 97 - Loss: 28.26228904724121 - Val Loss: 18.258420944213867


[I 2024-06-20 01:24:06,699] Trial 98 finished with value: 20.633153915405273 and parameters: {'num_heads': 7, 'model_dim': 448, 'num_layers': 3, 'dropout': 0.3076866724811859, 'learning_rate': 0.003714759508995781, 'weight_decay': 4.193803122630351e-05, 'num_epochs': 16}. Best is trial 97 with value: 18.258420944213867.


Trial: 98 - Loss: 20.096773147583008 - Val Loss: 20.633153915405273


[I 2024-06-20 01:24:06,914] Trial 99 finished with value: 18.869674682617188 and parameters: {'num_heads': 7, 'model_dim': 448, 'num_layers': 3, 'dropout': 0.2913466650891946, 'learning_rate': 0.00378219057930559, 'weight_decay': 3.93564908797926e-05, 'num_epochs': 17}. Best is trial 97 with value: 18.258420944213867.


Trial: 99 - Loss: 23.403974533081055 - Val Loss: 18.869674682617188


In [19]:
# Results
print(f'Número de pruebas: {len(study_st.trials)}')
trial = study_st.best_trial
print(f'Mejor prueba: {trial.number}')
print(f'Mejores parametros: {trial.params}')
print(f'Mejor valor de pérdida en validación: {trial.value}')

Número de pruebas: 100
Mejor prueba: 97
Mejores parametros: {'num_heads': 7, 'model_dim': 392, 'num_layers': 3, 'dropout': 0.3360505706259301, 'learning_rate': 0.003928416878757786, 'weight_decay': 3.886347853547162e-05, 'num_epochs': 19}
Mejor valor de pérdida en validación: 18.258420944213867


## Multi Thread

In [20]:
# configuration optuna
study_mm = optuna.create_study(direction='minimize')
study_mm.optimize(lambda trial: objective(trial, X_mm_train, y_mm_train, X_mm_test, y_mm_test, len(features), len(target)), n_trials=n_trials)

[I 2024-06-20 01:24:06,944] A new study created in memory with name: no-name-80584570-e114-4e5a-a0bb-315bf720a6b3
[I 2024-06-20 01:24:07,677] Trial 0 finished with value: 565.8142700195312 and parameters: {'num_heads': 7, 'model_dim': 140, 'num_layers': 4, 'dropout': 0.22565408612120363, 'learning_rate': 1.8041709209664783e-05, 'weight_decay': 0.00036186954849408426, 'num_epochs': 97}. Best is trial 0 with value: 565.8142700195312.


Trial: 0 - Loss: 356.2906494140625 - Val Loss: 565.8142700195312


[I 2024-06-20 01:24:08,142] Trial 1 finished with value: 596.915771484375 and parameters: {'num_heads': 3, 'model_dim': 111, 'num_layers': 3, 'dropout': 0.3008162295685637, 'learning_rate': 1.1775890125225865e-05, 'weight_decay': 0.0017547018318881682, 'num_epochs': 89}. Best is trial 0 with value: 565.8142700195312.


Trial: 1 - Loss: 380.14208984375 - Val Loss: 596.915771484375


[I 2024-06-20 01:24:08,547] Trial 2 finished with value: 548.234375 and parameters: {'num_heads': 2, 'model_dim': 28, 'num_layers': 6, 'dropout': 0.47996037032082217, 'learning_rate': 0.0012146015416587047, 'weight_decay': 0.0009632311858997156, 'num_epochs': 43}. Best is trial 2 with value: 548.234375.


Trial: 2 - Loss: 344.435302734375 - Val Loss: 548.234375


[I 2024-06-20 01:24:09,299] Trial 3 finished with value: 39.404212951660156 and parameters: {'num_heads': 7, 'model_dim': 28, 'num_layers': 5, 'dropout': 0.3009681638328212, 'learning_rate': 0.008092911845590195, 'weight_decay': 5.290417238828092e-05, 'num_epochs': 89}. Best is trial 3 with value: 39.404212951660156.


Trial: 3 - Loss: 17.45093536376953 - Val Loss: 39.404212951660156


[I 2024-06-20 01:24:09,884] Trial 4 finished with value: 407.7286071777344 and parameters: {'num_heads': 1, 'model_dim': 50, 'num_layers': 6, 'dropout': 0.21291659815159447, 'learning_rate': 0.0012801052399155708, 'weight_decay': 0.005976156898957671, 'num_epochs': 63}. Best is trial 3 with value: 39.404212951660156.


Trial: 4 - Loss: 237.80523681640625 - Val Loss: 407.7286071777344


[I 2024-06-20 01:24:10,761] Trial 5 finished with value: 569.6710205078125 and parameters: {'num_heads': 7, 'model_dim': 119, 'num_layers': 6, 'dropout': 0.26471219697866727, 'learning_rate': 1.5401405413733014e-05, 'weight_decay': 0.00099357681545313, 'num_epochs': 83}. Best is trial 3 with value: 39.404212951660156.


Trial: 5 - Loss: 360.5135498046875 - Val Loss: 569.6710205078125


[I 2024-06-20 01:24:11,283] Trial 6 finished with value: 271.59515380859375 and parameters: {'num_heads': 1, 'model_dim': 17, 'num_layers': 5, 'dropout': 0.25977701600221625, 'learning_rate': 0.004408959920005124, 'weight_decay': 7.191086593771676e-05, 'num_epochs': 67}. Best is trial 3 with value: 39.404212951660156.
[I 2024-06-20 01:24:11,430] Trial 7 finished with value: 581.34033203125 and parameters: {'num_heads': 7, 'model_dim': 91, 'num_layers': 5, 'dropout': 0.33662873884150374, 'learning_rate': 0.000256045037318686, 'weight_decay': 0.00014622565163553483, 'num_epochs': 16}. Best is trial 3 with value: 39.404212951660156.


Trial: 6 - Loss: 141.56314086914062 - Val Loss: 271.59515380859375
Trial: 7 - Loss: 369.32720947265625 - Val Loss: 581.34033203125


[I 2024-06-20 01:24:12,257] Trial 8 finished with value: 514.5130615234375 and parameters: {'num_heads': 8, 'model_dim': 360, 'num_layers': 5, 'dropout': 0.20652066307234054, 'learning_rate': 1.3123080648526783e-05, 'weight_decay': 0.001505118238781817, 'num_epochs': 58}. Best is trial 3 with value: 39.404212951660156.


Trial: 8 - Loss: 317.6351013183594 - Val Loss: 514.5130615234375


[I 2024-06-20 01:24:12,617] Trial 9 finished with value: 496.1044921875 and parameters: {'num_heads': 5, 'model_dim': 230, 'num_layers': 6, 'dropout': 0.12334423100417054, 'learning_rate': 0.0001699500494359308, 'weight_decay': 0.00024917189423057903, 'num_epochs': 30}. Best is trial 3 with value: 39.404212951660156.


Trial: 9 - Loss: 304.45452880859375 - Val Loss: 496.1044921875


[I 2024-06-20 01:24:12,820] Trial 10 finished with value: 119.39568328857422 and parameters: {'num_heads': 5, 'model_dim': 215, 'num_layers': 1, 'dropout': 0.4029320443875414, 'learning_rate': 0.0095503747782115, 'weight_decay': 1.1395257618550385e-05, 'num_epochs': 74}. Best is trial 3 with value: 39.404212951660156.


Trial: 10 - Loss: 4.710106372833252 - Val Loss: 119.39568328857422


[I 2024-06-20 01:24:13,037] Trial 11 finished with value: 98.44071960449219 and parameters: {'num_heads': 5, 'model_dim': 215, 'num_layers': 1, 'dropout': 0.4025509940075564, 'learning_rate': 0.007090585348724382, 'weight_decay': 1.0006030824897905e-05, 'num_epochs': 77}. Best is trial 3 with value: 39.404212951660156.


Trial: 11 - Loss: 5.826260089874268 - Val Loss: 98.44071960449219


[I 2024-06-20 01:24:13,257] Trial 12 finished with value: 102.05707550048828 and parameters: {'num_heads': 4, 'model_dim': 184, 'num_layers': 1, 'dropout': 0.3853393473358422, 'learning_rate': 0.002455750208212802, 'weight_decay': 1.33082300059078e-05, 'num_epochs': 81}. Best is trial 3 with value: 39.404212951660156.


Trial: 12 - Loss: 6.3668012619018555 - Val Loss: 102.05707550048828


[I 2024-06-20 01:24:13,741] Trial 13 finished with value: 61.431949615478516 and parameters: {'num_heads': 6, 'model_dim': 288, 'num_layers': 2, 'dropout': 0.487541017271486, 'learning_rate': 0.008227806458110064, 'weight_decay': 3.825159435122536e-05, 'num_epochs': 99}. Best is trial 3 with value: 39.404212951660156.


Trial: 13 - Loss: 9.594518661499023 - Val Loss: 61.431949615478516


[I 2024-06-20 01:24:14,489] Trial 14 finished with value: 42.39659118652344 and parameters: {'num_heads': 6, 'model_dim': 312, 'num_layers': 3, 'dropout': 0.4904119748309937, 'learning_rate': 0.0007652041285476416, 'weight_decay': 5.0181520868825765e-05, 'num_epochs': 100}. Best is trial 3 with value: 39.404212951660156.


Trial: 14 - Loss: 17.450353622436523 - Val Loss: 42.39659118652344


[I 2024-06-20 01:24:15,564] Trial 15 finished with value: 364.623779296875 and parameters: {'num_heads': 8, 'model_dim': 496, 'num_layers': 3, 'dropout': 0.44785461692452017, 'learning_rate': 6.243755286787291e-05, 'weight_decay': 3.8163102738589995e-05, 'num_epochs': 99}. Best is trial 3 with value: 39.404212951660156.


Trial: 15 - Loss: 206.60215759277344 - Val Loss: 364.623779296875


[I 2024-06-20 01:24:16,058] Trial 16 finished with value: 132.80291748046875 and parameters: {'num_heads': 6, 'model_dim': 312, 'num_layers': 4, 'dropout': 0.35172833583923613, 'learning_rate': 0.000851556934271011, 'weight_decay': 6.773156148234409e-05, 'num_epochs': 48}. Best is trial 3 with value: 39.404212951660156.


Trial: 16 - Loss: 55.10227966308594 - Val Loss: 132.80291748046875


[I 2024-06-20 01:24:16,570] Trial 17 finished with value: 58.90169906616211 and parameters: {'num_heads': 6, 'model_dim': 384, 'num_layers': 2, 'dropout': 0.13090487281209817, 'learning_rate': 0.0005582283633993322, 'weight_decay': 2.7049912376702175e-05, 'num_epochs': 90}. Best is trial 3 with value: 39.404212951660156.


Trial: 17 - Loss: 20.377485275268555 - Val Loss: 58.90169906616211


[I 2024-06-20 01:24:17,036] Trial 18 finished with value: 114.80341339111328 and parameters: {'num_heads': 4, 'model_dim': 64, 'num_layers': 4, 'dropout': 0.439657958324037, 'learning_rate': 0.002503543530610465, 'weight_decay': 0.00010894135519586701, 'num_epochs': 70}. Best is trial 3 with value: 39.404212951660156.


Trial: 18 - Loss: 43.64692306518555 - Val Loss: 114.80341339111328


[I 2024-06-20 01:24:17,933] Trial 19 finished with value: 351.645263671875 and parameters: {'num_heads': 8, 'model_dim': 440, 'num_layers': 3, 'dropout': 0.31492486252019963, 'learning_rate': 9.850117712207766e-05, 'weight_decay': 0.00023880739390357003, 'num_epochs': 88}. Best is trial 3 with value: 39.404212951660156.
[I 2024-06-20 01:24:18,130] Trial 20 finished with value: 382.2616882324219 and parameters: {'num_heads': 6, 'model_dim': 258, 'num_layers': 2, 'dropout': 0.16679380773936162, 'learning_rate': 0.00044168506353189566, 'weight_decay': 1.9775375209843995e-05, 'num_epochs': 36}. Best is trial 3 with value: 39.404212951660156.


Trial: 19 - Loss: 197.21408081054688 - Val Loss: 351.645263671875
Trial: 20 - Loss: 223.11685180664062 - Val Loss: 382.2616882324219


[I 2024-06-20 01:24:18,613] Trial 21 finished with value: 76.29076385498047 and parameters: {'num_heads': 6, 'model_dim': 336, 'num_layers': 2, 'dropout': 0.11451335518911457, 'learning_rate': 0.0007573339473522829, 'weight_decay': 2.783840966385989e-05, 'num_epochs': 90}. Best is trial 3 with value: 39.404212951660156.


Trial: 21 - Loss: 15.740286827087402 - Val Loss: 76.29076385498047


[I 2024-06-20 01:24:19,169] Trial 22 finished with value: 86.16818237304688 and parameters: {'num_heads': 7, 'model_dim': 371, 'num_layers': 2, 'dropout': 0.1628478135059609, 'learning_rate': 0.0004588038674350466, 'weight_decay': 5.6928998167169566e-05, 'num_epochs': 92}. Best is trial 3 with value: 39.404212951660156.


Trial: 22 - Loss: 30.45647621154785 - Val Loss: 86.16818237304688


[I 2024-06-20 01:24:19,771] Trial 23 finished with value: 35.89328384399414 and parameters: {'num_heads': 6, 'model_dim': 276, 'num_layers': 3, 'dropout': 0.1580702574133744, 'learning_rate': 0.002884877600816779, 'weight_decay': 2.2711317771154818e-05, 'num_epochs': 84}. Best is trial 23 with value: 35.89328384399414.


Trial: 23 - Loss: 17.474145889282227 - Val Loss: 35.89328384399414


[I 2024-06-20 01:24:20,241] Trial 24 finished with value: 36.91665267944336 and parameters: {'num_heads': 7, 'model_dim': 168, 'num_layers': 3, 'dropout': 0.2714880694501303, 'learning_rate': 0.0031020932965733225, 'weight_decay': 0.00010632247134671363, 'num_epochs': 77}. Best is trial 23 with value: 35.89328384399414.


Trial: 24 - Loss: 17.36962890625 - Val Loss: 36.91665267944336


[I 2024-06-20 01:24:20,896] Trial 25 finished with value: 36.5834846496582 and parameters: {'num_heads': 7, 'model_dim': 161, 'num_layers': 4, 'dropout': 0.26035193647872706, 'learning_rate': 0.0034890033231638616, 'weight_decay': 0.00012661088181390308, 'num_epochs': 81}. Best is trial 23 with value: 35.89328384399414.


Trial: 25 - Loss: 17.6411190032959 - Val Loss: 36.5834846496582


[I 2024-06-20 01:24:21,380] Trial 26 finished with value: 38.881534576416016 and parameters: {'num_heads': 8, 'model_dim': 160, 'num_layers': 4, 'dropout': 0.2515080545509098, 'learning_rate': 0.0029069268542637723, 'weight_decay': 0.00045408194756770546, 'num_epochs': 57}. Best is trial 23 with value: 35.89328384399414.


Trial: 26 - Loss: 17.561622619628906 - Val Loss: 38.881534576416016


[I 2024-06-20 01:24:21,883] Trial 27 finished with value: 38.44298553466797 and parameters: {'num_heads': 7, 'model_dim': 182, 'num_layers': 3, 'dropout': 0.18679566424781924, 'learning_rate': 0.0037867531496270823, 'weight_decay': 0.00012470217182508777, 'num_epochs': 79}. Best is trial 23 with value: 35.89328384399414.


Trial: 27 - Loss: 17.261423110961914 - Val Loss: 38.44298553466797


[I 2024-06-20 01:24:22,362] Trial 28 finished with value: 53.816062927246094 and parameters: {'num_heads': 5, 'model_dim': 150, 'num_layers': 4, 'dropout': 0.2486183473589793, 'learning_rate': 0.0017127896117146632, 'weight_decay': 0.00011530501295432054, 'num_epochs': 70}. Best is trial 23 with value: 35.89328384399414.


Trial: 28 - Loss: 19.03310203552246 - Val Loss: 53.816062927246094


[I 2024-06-20 01:24:22,827] Trial 29 finished with value: 520.1480102539062 and parameters: {'num_heads': 8, 'model_dim': 248, 'num_layers': 3, 'dropout': 0.2790275552134588, 'learning_rate': 3.070984694989339e-05, 'weight_decay': 0.0003537460463590618, 'num_epochs': 62}. Best is trial 23 with value: 35.89328384399414.


Trial: 29 - Loss: 322.6041259765625 - Val Loss: 520.1480102539062


[I 2024-06-20 01:24:23,535] Trial 30 finished with value: 35.719642639160156 and parameters: {'num_heads': 7, 'model_dim': 196, 'num_layers': 4, 'dropout': 0.22387117212188407, 'learning_rate': 0.0040168550584568224, 'weight_decay': 0.000551067620373332, 'num_epochs': 84}. Best is trial 30 with value: 35.719642639160156.


Trial: 30 - Loss: 17.52692985534668 - Val Loss: 35.719642639160156


[I 2024-06-20 01:24:24,229] Trial 31 finished with value: 35.48155212402344 and parameters: {'num_heads': 7, 'model_dim': 182, 'num_layers': 4, 'dropout': 0.2299561229541976, 'learning_rate': 0.005424123152846859, 'weight_decay': 0.0006196042143194827, 'num_epochs': 83}. Best is trial 31 with value: 35.48155212402344.


Trial: 31 - Loss: 18.0626277923584 - Val Loss: 35.48155212402344


[I 2024-06-20 01:24:24,968] Trial 32 finished with value: 33.942134857177734 and parameters: {'num_heads': 7, 'model_dim': 203, 'num_layers': 4, 'dropout': 0.15453319375833197, 'learning_rate': 0.005618459018670182, 'weight_decay': 0.0004987437788475981, 'num_epochs': 85}. Best is trial 32 with value: 33.942134857177734.


Trial: 32 - Loss: 17.52376365661621 - Val Loss: 33.942134857177734


[I 2024-06-20 01:24:25,818] Trial 33 finished with value: 34.85960006713867 and parameters: {'num_heads': 6, 'model_dim': 198, 'num_layers': 5, 'dropout': 0.16294196811568928, 'learning_rate': 0.005337730740571215, 'weight_decay': 0.0005953619857450018, 'num_epochs': 85}. Best is trial 32 with value: 33.942134857177734.


Trial: 33 - Loss: 17.68158721923828 - Val Loss: 34.85960006713867


[I 2024-06-20 01:24:26,836] Trial 34 finished with value: 39.044166564941406 and parameters: {'num_heads': 7, 'model_dim': 203, 'num_layers': 5, 'dropout': 0.18270211816156234, 'learning_rate': 0.0018700805745217035, 'weight_decay': 0.000627051932878354, 'num_epochs': 95}. Best is trial 32 with value: 33.942134857177734.


Trial: 34 - Loss: 17.38994026184082 - Val Loss: 39.044166564941406


[I 2024-06-20 01:24:27,507] Trial 35 finished with value: 35.12820816040039 and parameters: {'num_heads': 3, 'model_dim': 147, 'num_layers': 5, 'dropout': 0.22699320603230055, 'learning_rate': 0.005495100857640995, 'weight_decay': 0.004120205554945208, 'num_epochs': 72}. Best is trial 32 with value: 33.942134857177734.


Trial: 35 - Loss: 17.721202850341797 - Val Loss: 35.12820816040039


[I 2024-06-20 01:24:28,150] Trial 36 finished with value: 35.87468719482422 and parameters: {'num_heads': 2, 'model_dim': 108, 'num_layers': 5, 'dropout': 0.10013507178270717, 'learning_rate': 0.005990371734780722, 'weight_decay': 0.004627601973936042, 'num_epochs': 72}. Best is trial 32 with value: 33.942134857177734.


Trial: 36 - Loss: 17.313730239868164 - Val Loss: 35.87468719482422


[I 2024-06-20 01:24:28,702] Trial 37 finished with value: 39.836570739746094 and parameters: {'num_heads': 3, 'model_dim': 129, 'num_layers': 5, 'dropout': 0.14541378117898462, 'learning_rate': 0.005395621274977859, 'weight_decay': 0.003339637575265499, 'num_epochs': 65}. Best is trial 32 with value: 33.942134857177734.


Trial: 37 - Loss: 17.977489471435547 - Val Loss: 39.836570739746094


[I 2024-06-20 01:24:29,524] Trial 38 finished with value: 83.81692504882812 and parameters: {'num_heads': 3, 'model_dim': 141, 'num_layers': 6, 'dropout': 0.18593075582865404, 'learning_rate': 0.001262464715524677, 'weight_decay': 0.009014930037083594, 'num_epochs': 85}. Best is trial 32 with value: 33.942134857177734.


Trial: 38 - Loss: 28.664718627929688 - Val Loss: 83.81692504882812


[I 2024-06-20 01:24:30,308] Trial 39 finished with value: 37.044673919677734 and parameters: {'num_heads': 2, 'model_dim': 90, 'num_layers': 5, 'dropout': 0.23174905878764507, 'learning_rate': 0.0055093632627135234, 'weight_decay': 0.0019721138970123517, 'num_epochs': 93}. Best is trial 32 with value: 33.942134857177734.


Trial: 39 - Loss: 17.582685470581055 - Val Loss: 37.044673919677734


[I 2024-06-20 01:24:30,814] Trial 40 finished with value: 74.02933502197266 and parameters: {'num_heads': 4, 'model_dim': 176, 'num_layers': 6, 'dropout': 0.20382057653554642, 'learning_rate': 0.0018696311687082952, 'weight_decay': 0.0008830894323876153, 'num_epochs': 50}. Best is trial 32 with value: 33.942134857177734.


Trial: 40 - Loss: 26.379985809326172 - Val Loss: 74.02933502197266


[I 2024-06-20 01:24:31,656] Trial 41 finished with value: 33.78989028930664 and parameters: {'num_heads': 8, 'model_dim': 232, 'num_layers': 4, 'dropout': 0.2250748596606079, 'learning_rate': 0.009758954365879327, 'weight_decay': 0.0005228246353070906, 'num_epochs': 87}. Best is trial 41 with value: 33.78989028930664.


Trial: 41 - Loss: 17.236186981201172 - Val Loss: 33.78989028930664


[I 2024-06-20 01:24:32,377] Trial 42 finished with value: 34.21421813964844 and parameters: {'num_heads': 8, 'model_dim': 240, 'num_layers': 4, 'dropout': 0.23545644972095928, 'learning_rate': 0.009604761304662084, 'weight_decay': 0.0014758971933128862, 'num_epochs': 75}. Best is trial 41 with value: 33.78989028930664.


Trial: 42 - Loss: 18.057817459106445 - Val Loss: 34.21421813964844


[I 2024-06-20 01:24:33,264] Trial 43 finished with value: 33.69407272338867 and parameters: {'num_heads': 8, 'model_dim': 240, 'num_layers': 5, 'dropout': 0.2923868425569631, 'learning_rate': 0.009021730764649344, 'weight_decay': 0.001390468782252346, 'num_epochs': 75}. Best is trial 43 with value: 33.69407272338867.


Trial: 43 - Loss: 17.895845413208008 - Val Loss: 33.69407272338867


[I 2024-06-20 01:24:33,997] Trial 44 finished with value: 33.11648178100586 and parameters: {'num_heads': 8, 'model_dim': 248, 'num_layers': 4, 'dropout': 0.3139525991714957, 'learning_rate': 0.008037586034373616, 'weight_decay': 0.001429542570813773, 'num_epochs': 76}. Best is trial 44 with value: 33.11648178100586.


Trial: 44 - Loss: 17.659503936767578 - Val Loss: 33.11648178100586


[I 2024-06-20 01:24:34,722] Trial 45 finished with value: 35.134986877441406 and parameters: {'num_heads': 8, 'model_dim': 240, 'num_layers': 4, 'dropout': 0.3015629844826098, 'learning_rate': 0.009635689216359513, 'weight_decay': 0.0011927881920751847, 'num_epochs': 76}. Best is trial 44 with value: 33.11648178100586.


Trial: 45 - Loss: 17.872966766357422 - Val Loss: 35.134986877441406


[I 2024-06-20 01:24:35,386] Trial 46 finished with value: 36.25827407836914 and parameters: {'num_heads': 8, 'model_dim': 296, 'num_layers': 4, 'dropout': 0.324031593002347, 'learning_rate': 0.007965171675953646, 'weight_decay': 0.00238011357283215, 'num_epochs': 61}. Best is trial 44 with value: 33.11648178100586.


Trial: 46 - Loss: 17.607580184936523 - Val Loss: 36.25827407836914


[I 2024-06-20 01:24:36,009] Trial 47 finished with value: 33.33131408691406 and parameters: {'num_heads': 8, 'model_dim': 224, 'num_layers': 4, 'dropout': 0.2831381800640481, 'learning_rate': 0.009798530984781425, 'weight_decay': 0.001417160655898792, 'num_epochs': 66}. Best is trial 44 with value: 33.11648178100586.


Trial: 47 - Loss: 17.541793823242188 - Val Loss: 33.33131408691406


[I 2024-06-20 01:24:36,796] Trial 48 finished with value: 35.15205001831055 and parameters: {'num_heads': 8, 'model_dim': 224, 'num_layers': 5, 'dropout': 0.288717790188823, 'learning_rate': 0.0070022930914110226, 'weight_decay': 0.00022276859184231222, 'num_epochs': 68}. Best is trial 44 with value: 33.11648178100586.
[I 2024-06-20 01:24:36,950] Trial 49 finished with value: 18.234451293945312 and parameters: {'num_heads': 8, 'model_dim': 264, 'num_layers': 4, 'dropout': 0.35614664010047575, 'learning_rate': 0.009950757747095477, 'weight_decay': 0.0008579245645970467, 'num_epochs': 11}. Best is trial 49 with value: 18.234451293945312.


Trial: 48 - Loss: 17.875829696655273 - Val Loss: 35.15205001831055
Trial: 49 - Loss: 25.821487426757812 - Val Loss: 18.234451293945312


[I 2024-06-20 01:24:37,173] Trial 50 finished with value: 31.64242935180664 and parameters: {'num_heads': 8, 'model_dim': 264, 'num_layers': 4, 'dropout': 0.3770257713919657, 'learning_rate': 0.00957453206706965, 'weight_decay': 0.0009517694366643621, 'num_epochs': 18}. Best is trial 49 with value: 18.234451293945312.
[I 2024-06-20 01:24:37,326] Trial 51 finished with value: 38.87461471557617 and parameters: {'num_heads': 8, 'model_dim': 264, 'num_layers': 4, 'dropout': 0.3614261890208007, 'learning_rate': 0.007681448851548451, 'weight_decay': 0.00084778430210816, 'num_epochs': 11}. Best is trial 49 with value: 18.234451293945312.


Trial: 50 - Loss: 23.691936492919922 - Val Loss: 31.64242935180664
Trial: 51 - Loss: 22.36124610900879 - Val Loss: 38.87461471557617


[I 2024-06-20 01:24:37,566] Trial 52 finished with value: 69.69807434082031 and parameters: {'num_heads': 8, 'model_dim': 296, 'num_layers': 4, 'dropout': 0.37183168207540296, 'learning_rate': 0.009939488194538068, 'weight_decay': 0.002339390066158201, 'num_epochs': 19}. Best is trial 49 with value: 18.234451293945312.


Trial: 52 - Loss: 24.49581527709961 - Val Loss: 69.69807434082031


[I 2024-06-20 01:24:37,882] Trial 53 finished with value: 45.83162307739258 and parameters: {'num_heads': 8, 'model_dim': 336, 'num_layers': 4, 'dropout': 0.3478571678386115, 'learning_rate': 0.004317599394365499, 'weight_decay': 0.0011120204705735324, 'num_epochs': 25}. Best is trial 49 with value: 18.234451293945312.
[I 2024-06-20 01:24:37,998] Trial 54 finished with value: 69.61945343017578 and parameters: {'num_heads': 8, 'model_dim': 272, 'num_layers': 3, 'dropout': 0.324456945508209, 'learning_rate': 0.006883262261354665, 'weight_decay': 0.0015877901843761148, 'num_epochs': 10}. Best is trial 49 with value: 18.234451293945312.


Trial: 53 - Loss: 18.315959930419922 - Val Loss: 45.83162307739258
Trial: 54 - Loss: 38.72071075439453 - Val Loss: 69.61945343017578


[I 2024-06-20 01:24:38,251] Trial 55 finished with value: 19.058975219726562 and parameters: {'num_heads': 8, 'model_dim': 224, 'num_layers': 5, 'dropout': 0.40253227072592124, 'learning_rate': 0.0075746931653454435, 'weight_decay': 0.0030207547986582, 'num_epochs': 19}. Best is trial 49 with value: 18.234451293945312.


Trial: 55 - Loss: 38.49321746826172 - Val Loss: 19.058975219726562


[I 2024-06-20 01:24:38,537] Trial 56 finished with value: 50.267364501953125 and parameters: {'num_heads': 7, 'model_dim': 252, 'num_layers': 6, 'dropout': 0.4090918821399239, 'learning_rate': 0.004350165312185479, 'weight_decay': 0.002877101754129204, 'num_epochs': 18}. Best is trial 49 with value: 18.234451293945312.


Trial: 56 - Loss: 23.010969161987305 - Val Loss: 50.267364501953125


[I 2024-06-20 01:24:38,971] Trial 57 finished with value: 28.8251953125 and parameters: {'num_heads': 8, 'model_dim': 328, 'num_layers': 5, 'dropout': 0.4233667980673984, 'learning_rate': 0.007264562724306775, 'weight_decay': 0.0012951473599667148, 'num_epochs': 29}. Best is trial 49 with value: 18.234451293945312.


Trial: 57 - Loss: 18.811382293701172 - Val Loss: 28.8251953125


[I 2024-06-20 01:24:39,422] Trial 58 finished with value: 432.9831237792969 and parameters: {'num_heads': 8, 'model_dim': 400, 'num_layers': 5, 'dropout': 0.42874437560803214, 'learning_rate': 0.0002018260036303793, 'weight_decay': 0.0008347101494749646, 'num_epochs': 26}. Best is trial 49 with value: 18.234451293945312.


Trial: 58 - Loss: 258.1681213378906 - Val Loss: 432.9831237792969


[I 2024-06-20 01:24:40,039] Trial 59 finished with value: 38.494964599609375 and parameters: {'num_heads': 7, 'model_dim': 315, 'num_layers': 6, 'dropout': 0.3849686184119937, 'learning_rate': 0.0024058897061631007, 'weight_decay': 0.0019488192666691345, 'num_epochs': 37}. Best is trial 49 with value: 18.234451293945312.


Trial: 59 - Loss: 18.080503463745117 - Val Loss: 38.494964599609375


[I 2024-06-20 01:24:40,251] Trial 60 finished with value: 18.7248592376709 and parameters: {'num_heads': 7, 'model_dim': 280, 'num_layers': 5, 'dropout': 0.4742753048537519, 'learning_rate': 0.006869792459394316, 'weight_decay': 0.0060190713636594445, 'num_epochs': 14}. Best is trial 49 with value: 18.234451293945312.


Trial: 60 - Loss: 22.161907196044922 - Val Loss: 18.7248592376709


[I 2024-06-20 01:24:40,474] Trial 61 finished with value: 19.059186935424805 and parameters: {'num_heads': 8, 'model_dim': 280, 'num_layers': 5, 'dropout': 0.47616745625256257, 'learning_rate': 0.006957495699074528, 'weight_decay': 0.0052827677130104665, 'num_epochs': 14}. Best is trial 49 with value: 18.234451293945312.


Trial: 61 - Loss: 20.95928192138672 - Val Loss: 19.059186935424805


[I 2024-06-20 01:24:40,686] Trial 62 finished with value: 18.56583595275879 and parameters: {'num_heads': 7, 'model_dim': 280, 'num_layers': 5, 'dropout': 0.4431901773824785, 'learning_rate': 0.006943580451418799, 'weight_decay': 0.007735067822239839, 'num_epochs': 14}. Best is trial 49 with value: 18.234451293945312.


Trial: 62 - Loss: 22.48019790649414 - Val Loss: 18.56583595275879


[I 2024-06-20 01:24:40,921] Trial 63 finished with value: 20.311203002929688 and parameters: {'num_heads': 7, 'model_dim': 336, 'num_layers': 5, 'dropout': 0.47019219373773125, 'learning_rate': 0.007038365887943855, 'weight_decay': 0.009104023456998385, 'num_epochs': 14}. Best is trial 49 with value: 18.234451293945312.


Trial: 63 - Loss: 38.071502685546875 - Val Loss: 20.311203002929688


[I 2024-06-20 01:24:41,142] Trial 64 finished with value: 18.235780715942383 and parameters: {'num_heads': 7, 'model_dim': 329, 'num_layers': 5, 'dropout': 0.47164184661336606, 'learning_rate': 0.006685150856630296, 'weight_decay': 0.008089922882639362, 'num_epochs': 13}. Best is trial 49 with value: 18.234451293945312.


Trial: 64 - Loss: 25.755300521850586 - Val Loss: 18.235780715942383


[I 2024-06-20 01:24:41,383] Trial 65 finished with value: 88.5094223022461 and parameters: {'num_heads': 7, 'model_dim': 350, 'num_layers': 5, 'dropout': 0.46757220242734887, 'learning_rate': 0.0034491427529830866, 'weight_decay': 0.00919304058805251, 'num_epochs': 14}. Best is trial 49 with value: 18.234451293945312.


Trial: 65 - Loss: 43.89646530151367 - Val Loss: 88.5094223022461


[I 2024-06-20 01:24:41,691] Trial 66 finished with value: 18.666790008544922 and parameters: {'num_heads': 7, 'model_dim': 287, 'num_layers': 5, 'dropout': 0.4672082035584925, 'learning_rate': 0.004639325125708668, 'weight_decay': 0.0069907459887343605, 'num_epochs': 21}. Best is trial 49 with value: 18.234451293945312.


Trial: 66 - Loss: 25.89372444152832 - Val Loss: 18.666790008544922


[I 2024-06-20 01:24:41,977] Trial 67 finished with value: 18.435392379760742 and parameters: {'num_heads': 6, 'model_dim': 288, 'num_layers': 5, 'dropout': 0.45390292071035077, 'learning_rate': 0.004767904050284296, 'weight_decay': 0.006452634900123431, 'num_epochs': 21}. Best is trial 49 with value: 18.234451293945312.


Trial: 67 - Loss: 27.017684936523438 - Val Loss: 18.435392379760742


[I 2024-06-20 01:24:42,337] Trial 68 finished with value: 20.505985260009766 and parameters: {'num_heads': 6, 'model_dim': 294, 'num_layers': 6, 'dropout': 0.4558168928252718, 'learning_rate': 0.004200486580135735, 'weight_decay': 0.0073235576618758715, 'num_epochs': 22}. Best is trial 49 with value: 18.234451293945312.


Trial: 68 - Loss: 22.74203872680664 - Val Loss: 20.505985260009766


[I 2024-06-20 01:24:42,625] Trial 69 finished with value: 66.23165130615234 and parameters: {'num_heads': 5, 'model_dim': 285, 'num_layers': 5, 'dropout': 0.4900916103178891, 'learning_rate': 0.002846920194093988, 'weight_decay': 0.007006577446383879, 'num_epochs': 22}. Best is trial 49 with value: 18.234451293945312.


Trial: 69 - Loss: 26.634021759033203 - Val Loss: 66.23165130615234


[I 2024-06-20 01:24:43,145] Trial 70 finished with value: 231.04031372070312 and parameters: {'num_heads': 6, 'model_dim': 306, 'num_layers': 6, 'dropout': 0.49834176854727463, 'learning_rate': 0.0009150846362985559, 'weight_decay': 0.006387582793461892, 'num_epochs': 32}. Best is trial 49 with value: 18.234451293945312.


Trial: 70 - Loss: 117.9800033569336 - Val Loss: 231.04031372070312


[I 2024-06-20 01:24:43,364] Trial 71 finished with value: 70.05523681640625 and parameters: {'num_heads': 7, 'model_dim': 273, 'num_layers': 5, 'dropout': 0.45000391437201515, 'learning_rate': 0.004699163393745458, 'weight_decay': 0.004918089539417787, 'num_epochs': 14}. Best is trial 49 with value: 18.234451293945312.


Trial: 71 - Loss: 34.226226806640625 - Val Loss: 70.05523681640625


[I 2024-06-20 01:24:43,681] Trial 72 finished with value: 100.2027587890625 and parameters: {'num_heads': 7, 'model_dim': 322, 'num_layers': 5, 'dropout': 0.47569025230756723, 'learning_rate': 0.0022864792667528144, 'weight_decay': 0.005259559423582955, 'num_epochs': 21}. Best is trial 49 with value: 18.234451293945312.


Trial: 72 - Loss: 43.06985092163086 - Val Loss: 100.2027587890625


[I 2024-06-20 01:24:43,928] Trial 73 finished with value: 631.0264892578125 and parameters: {'num_heads': 7, 'model_dim': 287, 'num_layers': 5, 'dropout': 0.4587054617516423, 'learning_rate': 1.0229073898837155e-05, 'weight_decay': 0.004182271247073704, 'num_epochs': 16}. Best is trial 49 with value: 18.234451293945312.
[I 2024-06-20 01:24:44,126] Trial 74 finished with value: 32.5045166015625 and parameters: {'num_heads': 6, 'model_dim': 306, 'num_layers': 5, 'dropout': 0.43714804819061676, 'learning_rate': 0.006273320111721315, 'weight_decay': 0.0058329576684318695, 'num_epochs': 12}. Best is trial 49 with value: 18.234451293945312.


Trial: 73 - Loss: 406.9243469238281 - Val Loss: 631.0264892578125
Trial: 74 - Loss: 19.220808029174805 - Val Loss: 32.5045166015625


[I 2024-06-20 01:24:44,491] Trial 75 finished with value: 547.03759765625 and parameters: {'num_heads': 7, 'model_dim': 280, 'num_layers': 5, 'dropout': 0.40052823947338556, 'learning_rate': 2.4648441308980805e-05, 'weight_decay': 0.0037253102573893753, 'num_epochs': 26}. Best is trial 49 with value: 18.234451293945312.


Trial: 75 - Loss: 340.90301513671875 - Val Loss: 547.03759765625


[I 2024-06-20 01:24:44,761] Trial 76 finished with value: 136.123046875 and parameters: {'num_heads': 7, 'model_dim': 259, 'num_layers': 6, 'dropout': 0.41616527368617234, 'learning_rate': 0.0035638284901192527, 'weight_decay': 0.007623373391020281, 'num_epochs': 15}. Best is trial 49 with value: 18.234451293945312.


Trial: 76 - Loss: 70.45787811279297 - Val Loss: 136.123046875


[I 2024-06-20 01:24:45,056] Trial 77 finished with value: 500.0721435546875 and parameters: {'num_heads': 6, 'model_dim': 300, 'num_layers': 5, 'dropout': 0.48173929946975785, 'learning_rate': 9.964229971932119e-05, 'weight_decay': 0.0029826260892648862, 'num_epochs': 20}. Best is trial 49 with value: 18.234451293945312.


Trial: 77 - Loss: 307.6679992675781 - Val Loss: 500.0721435546875


[I 2024-06-20 01:24:45,426] Trial 78 finished with value: 48.94922637939453 and parameters: {'num_heads': 7, 'model_dim': 350, 'num_layers': 5, 'dropout': 0.4406719692646, 'learning_rate': 0.004849104110146227, 'weight_decay': 0.005952110282053236, 'num_epochs': 24}. Best is trial 49 with value: 18.234451293945312.


Trial: 78 - Loss: 17.823246002197266 - Val Loss: 48.94922637939453


[I 2024-06-20 01:24:45,666] Trial 79 finished with value: 292.0928649902344 and parameters: {'num_heads': 6, 'model_dim': 270, 'num_layers': 5, 'dropout': 0.466167616507916, 'learning_rate': 0.0015703992292180181, 'weight_decay': 0.005059745923080045, 'num_epochs': 17}. Best is trial 49 with value: 18.234451293945312.
[I 2024-06-20 01:24:45,843] Trial 80 finished with value: 83.50780487060547 and parameters: {'num_heads': 5, 'model_dim': 280, 'num_layers': 6, 'dropout': 0.49839688253737224, 'learning_rate': 0.006279085872053852, 'weight_decay': 0.007412759359883361, 'num_epochs': 10}. Best is trial 49 with value: 18.234451293945312.


Trial: 79 - Loss: 164.7393341064453 - Val Loss: 292.0928649902344
Trial: 80 - Loss: 47.062255859375 - Val Loss: 83.50780487060547


[I 2024-06-20 01:24:46,086] Trial 81 finished with value: 19.228322982788086 and parameters: {'num_heads': 7, 'model_dim': 371, 'num_layers': 5, 'dropout': 0.47370393913959463, 'learning_rate': 0.006657109713100051, 'weight_decay': 0.00956694778803139, 'num_epochs': 13}. Best is trial 49 with value: 18.234451293945312.


Trial: 81 - Loss: 33.37879180908203 - Val Loss: 19.228322982788086


[I 2024-06-20 01:24:46,310] Trial 82 finished with value: 49.425697326660156 and parameters: {'num_heads': 7, 'model_dim': 371, 'num_layers': 5, 'dropout': 0.48126029073423815, 'learning_rate': 0.004742373156848042, 'weight_decay': 0.009612134788444231, 'num_epochs': 12}. Best is trial 49 with value: 18.234451293945312.


Trial: 82 - Loss: 25.785865783691406 - Val Loss: 49.425697326660156


[I 2024-06-20 01:24:46,533] Trial 83 finished with value: 21.41472625732422 and parameters: {'num_heads': 7, 'model_dim': 315, 'num_layers': 5, 'dropout': 0.4599551051895096, 'learning_rate': 0.006284952401394373, 'weight_decay': 0.0036679296306886613, 'num_epochs': 13}. Best is trial 49 with value: 18.234451293945312.


Trial: 83 - Loss: 18.230709075927734 - Val Loss: 21.41472625732422


[I 2024-06-20 01:24:47,004] Trial 84 finished with value: 37.80650329589844 and parameters: {'num_heads': 7, 'model_dim': 392, 'num_layers': 5, 'dropout': 0.43451363910198665, 'learning_rate': 0.007962931939773535, 'weight_decay': 0.008304748825409766, 'num_epochs': 29}. Best is trial 49 with value: 18.234451293945312.


Trial: 84 - Loss: 17.371726989746094 - Val Loss: 37.80650329589844


[I 2024-06-20 01:24:47,206] Trial 85 finished with value: 183.69808959960938 and parameters: {'num_heads': 6, 'model_dim': 216, 'num_layers': 5, 'dropout': 0.44433660533427694, 'learning_rate': 0.003358532957614147, 'weight_decay': 0.006244591848948698, 'num_epochs': 16}. Best is trial 49 with value: 18.234451293945312.


Trial: 85 - Loss: 97.41349792480469 - Val Loss: 183.69808959960938


[I 2024-06-20 01:24:47,529] Trial 86 finished with value: 18.365934371948242 and parameters: {'num_heads': 7, 'model_dim': 357, 'num_layers': 5, 'dropout': 0.3948557676703237, 'learning_rate': 0.0051958932595973, 'weight_decay': 0.0054575633273853015, 'num_epochs': 20}. Best is trial 49 with value: 18.234451293945312.


Trial: 86 - Loss: 36.85951232910156 - Val Loss: 18.365934371948242


[I 2024-06-20 01:24:47,891] Trial 87 finished with value: 52.93950653076172 and parameters: {'num_heads': 8, 'model_dim': 344, 'num_layers': 5, 'dropout': 0.3943306563851989, 'learning_rate': 0.0053529487956447365, 'weight_decay': 0.00297415527450233, 'num_epochs': 23}. Best is trial 49 with value: 18.234451293945312.


Trial: 87 - Loss: 18.555871963500977 - Val Loss: 52.93950653076172


[I 2024-06-20 01:24:48,300] Trial 88 finished with value: 19.606618881225586 and parameters: {'num_heads': 8, 'model_dim': 472, 'num_layers': 5, 'dropout': 0.419637788536297, 'learning_rate': 0.004064434709930078, 'weight_decay': 0.004214842787038461, 'num_epochs': 20}. Best is trial 49 with value: 18.234451293945312.


Trial: 88 - Loss: 32.464683532714844 - Val Loss: 19.606618881225586


[I 2024-06-20 01:24:48,712] Trial 89 finished with value: 35.14390563964844 and parameters: {'num_heads': 7, 'model_dim': 252, 'num_layers': 5, 'dropout': 0.4099690535716762, 'learning_rate': 0.002659051380473618, 'weight_decay': 0.005297293611548941, 'num_epochs': 32}. Best is trial 49 with value: 18.234451293945312.


Trial: 89 - Loss: 17.58647346496582 - Val Loss: 35.14390563964844


[I 2024-06-20 01:24:49,027] Trial 90 finished with value: 18.256494522094727 and parameters: {'num_heads': 6, 'model_dim': 324, 'num_layers': 6, 'dropout': 0.4541270997993919, 'learning_rate': 0.005215664602064469, 'weight_decay': 0.0025194165454945567, 'num_epochs': 18}. Best is trial 49 with value: 18.234451293945312.


Trial: 90 - Loss: 29.882112503051758 - Val Loss: 18.256494522094727


[I 2024-06-20 01:24:49,311] Trial 91 finished with value: 18.43042755126953 and parameters: {'num_heads': 5, 'model_dim': 305, 'num_layers': 6, 'dropout': 0.4478514317175036, 'learning_rate': 0.00594602893923698, 'weight_decay': 0.00237526036171331, 'num_epochs': 17}. Best is trial 49 with value: 18.234451293945312.


Trial: 91 - Loss: 31.387451171875 - Val Loss: 18.43042755126953


[I 2024-06-20 01:24:49,612] Trial 92 finished with value: 20.860353469848633 and parameters: {'num_heads': 5, 'model_dim': 305, 'num_layers': 6, 'dropout': 0.4518896083500751, 'learning_rate': 0.004785640722368135, 'weight_decay': 0.002324261795405715, 'num_epochs': 18}. Best is trial 49 with value: 18.234451293945312.


Trial: 92 - Loss: 19.829248428344727 - Val Loss: 20.860353469848633


[I 2024-06-20 01:24:50,052] Trial 93 finished with value: 31.042266845703125 and parameters: {'num_heads': 6, 'model_dim': 324, 'num_layers': 6, 'dropout': 0.4327877912898545, 'learning_rate': 0.008482741819527087, 'weight_decay': 0.003459109598965954, 'num_epochs': 27}. Best is trial 49 with value: 18.234451293945312.


Trial: 93 - Loss: 18.66336441040039 - Val Loss: 31.042266845703125


[I 2024-06-20 01:24:50,314] Trial 94 finished with value: 193.2900848388672 and parameters: {'num_heads': 4, 'model_dim': 248, 'num_layers': 6, 'dropout': 0.44706985488407286, 'learning_rate': 0.0021818160334697552, 'weight_decay': 0.004298526770715896, 'num_epochs': 20}. Best is trial 49 with value: 18.234451293945312.


Trial: 94 - Loss: 98.62249755859375 - Val Loss: 193.2900848388672


[I 2024-06-20 01:24:50,588] Trial 95 finished with value: 18.90048599243164 and parameters: {'num_heads': 5, 'model_dim': 290, 'num_layers': 6, 'dropout': 0.46270124969030046, 'learning_rate': 0.005610150709556562, 'weight_decay': 0.0026020659821745316, 'num_epochs': 17}. Best is trial 49 with value: 18.234451293945312.
[I 2024-06-20 01:24:50,773] Trial 96 finished with value: 255.18922424316406 and parameters: {'num_heads': 5, 'model_dim': 290, 'num_layers': 6, 'dropout': 0.4628994459908637, 'learning_rate': 0.0031004034619933, 'weight_decay': 0.0017216764543656836, 'num_epochs': 10}. Best is trial 49 with value: 18.234451293945312.


Trial: 95 - Loss: 23.05292320251465 - Val Loss: 18.90048599243164
Trial: 96 - Loss: 151.61587524414062 - Val Loss: 255.18922424316406


[I 2024-06-20 01:24:51,064] Trial 97 finished with value: 60.415653228759766 and parameters: {'num_heads': 5, 'model_dim': 300, 'num_layers': 6, 'dropout': 0.4903231703209344, 'learning_rate': 0.003790967461906788, 'weight_decay': 0.002640642258023436, 'num_epochs': 17}. Best is trial 49 with value: 18.234451293945312.


Trial: 97 - Loss: 27.91978645324707 - Val Loss: 60.415653228759766


[I 2024-06-20 01:24:51,437] Trial 98 finished with value: 35.96021270751953 and parameters: {'num_heads': 5, 'model_dim': 310, 'num_layers': 6, 'dropout': 0.4253063268852321, 'learning_rate': 0.005612825073207685, 'weight_decay': 0.0007357129787785941, 'num_epochs': 23}. Best is trial 49 with value: 18.234451293945312.


Trial: 98 - Loss: 20.196374893188477 - Val Loss: 35.96021270751953


[I 2024-06-20 01:24:52,132] Trial 99 finished with value: 31.852649688720703 and parameters: {'num_heads': 6, 'model_dim': 330, 'num_layers': 6, 'dropout': 0.45456156631975075, 'learning_rate': 0.004867028636817728, 'weight_decay': 0.0019897507015170113, 'num_epochs': 43}. Best is trial 49 with value: 18.234451293945312.


Trial: 99 - Loss: 17.811973571777344 - Val Loss: 31.852649688720703


In [21]:
# Results
print(f'Trials quantity: {len(study_mm.trials)}')
trial = study_mm.best_trial
print(f'Mejor prueba: {trial.number}')
print(f'Mejores parametros: {trial.params}')
print(f'Mejor valor de pérdida en validación: {trial.value}')

Trials quantity: 100
Mejor prueba: 49
Mejores parametros: {'num_heads': 8, 'model_dim': 264, 'num_layers': 4, 'dropout': 0.35614664010047575, 'learning_rate': 0.009950757747095477, 'weight_decay': 0.0008579245645970467, 'num_epochs': 11}
Mejor valor de pérdida en validación: 18.234451293945312


# Training

In [22]:
input_dim = len(features)
output_dim = 1

## General

In [23]:
# hyperparameters
num_heads = study_g.best_trial.params['num_heads']
model_dim = study_g.best_trial.params['model_dim']
num_layers = study_g.best_trial.params['num_layers']
dropout = study_g.best_trial.params['dropout']
lr = study_g.best_trial.params['learning_rate']
wd = study_g.best_trial.params['weight_decay']
num_epochs = study_g.best_trial.params['num_epochs']

study_g.best_trial.params

{'num_heads': 8,
 'model_dim': 224,
 'num_layers': 6,
 'dropout': 0.2320146403887001,
 'learning_rate': 0.009731200416128362,
 'weight_decay': 1.8538654992265227e-05,
 'num_epochs': 14}

In [24]:
# general model initialization
model_g = TransformerModel(input_dim, model_dim, num_heads, num_layers, output_dim, dropout)
if DEVICE.type == 'cuda':
	model_g = model_g.to(DEVICE)
criterion_g = nn.MSELoss()
optimizer_g = optim.AdamW(model_g.parameters(), lr=lr, weight_decay=wd)

model_g.train()

for epoch in range(num_epochs):
	optimizer_g.zero_grad()
	output = model_g(X_g_train)
	loss = criterion_g(output, y_g_train)
	loss.backward()
	optimizer_g.step()
	# validation
	if (epoch+1) % 10 == 0 or epoch == num_epochs-1:
		model_g.eval()
		with torch.no_grad():
			val_predictions = model_g(X_g_test)
			val_loss = criterion_g(val_predictions, y_g_test)
		print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Val Loss: {val_loss.item()}')
		model_g.train()

Epoch 10/14, Loss: 55.85844802856445, Val Loss: 84.19432830810547
Epoch 14/14, Loss: 59.4248046875, Val Loss: 95.29679870605469


## Single Thread

In [25]:
# hyperparameters
num_heads = study_st.best_trial.params['num_heads']
model_dim = study_st.best_trial.params['model_dim']
num_layers = study_st.best_trial.params['num_layers']
dropout = study_st.best_trial.params['dropout']
lr = study_st.best_trial.params['learning_rate']
wd = study_st.best_trial.params['weight_decay']
num_epochs = study_st.best_trial.params['num_epochs']
study_st.best_trial.params

{'num_heads': 7,
 'model_dim': 392,
 'num_layers': 3,
 'dropout': 0.3360505706259301,
 'learning_rate': 0.003928416878757786,
 'weight_decay': 3.886347853547162e-05,
 'num_epochs': 19}

In [26]:
# single thread model initialization
model_st = TransformerModel(input_dim, model_dim, num_heads, num_layers, output_dim, dropout)
if DEVICE.type == 'cuda':
	model_st = model_st.to(DEVICE)
criterion_st = nn.MSELoss()
optimizer_st = optim.AdamW(model_st.parameters(), lr=lr, weight_decay=wd)

model_st.train()

for epoch in range(num_epochs):
	optimizer_st.zero_grad()
	output = model_st(X_st_train)
	loss = criterion_st(output, y_st_train)
	loss.backward()
	optimizer_st.step()
	# validation
	if (epoch+1) % 10 == 0 or epoch == num_epochs-1:
		model_st.eval()
		with torch.no_grad():
			val_predictions = model_st(X_st_test)
			val_loss = criterion_st(val_predictions, y_st_test)
		print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Val Loss: {val_loss.item()}')
		model_st.train()

Epoch 10/19, Loss: 17.42947769165039, Val Loss: 28.5018310546875
Epoch 19/19, Loss: 17.57338523864746, Val Loss: 40.58216857910156


## Multi Thread

In [27]:
# hyperparameters
num_heads = study_mm.best_trial.params['num_heads']
model_dim = study_mm.best_trial.params['model_dim']
num_layers = study_mm.best_trial.params['num_layers']
dropout = study_mm.best_trial.params['dropout']
lr = study_mm.best_trial.params['learning_rate']
wd = study_mm.best_trial.params['weight_decay']
num_epochs = study_mm.best_trial.params['num_epochs']

study_mm.best_trial.params

{'num_heads': 8,
 'model_dim': 264,
 'num_layers': 4,
 'dropout': 0.35614664010047575,
 'learning_rate': 0.009950757747095477,
 'weight_decay': 0.0008579245645970467,
 'num_epochs': 11}

In [28]:
# multi thread model initialization
model_mm = TransformerModel(input_dim, model_dim, num_heads, num_layers, output_dim, dropout)
if DEVICE.type == 'cuda':
	model_mm = model_mm.to(DEVICE)
criterion_mm = nn.MSELoss()
optimizer_mm = optim.AdamW(model_mm.parameters(), lr=lr, weight_decay=wd)

model_mm.train()

for epoch in range(num_epochs):
	optimizer_mm.zero_grad()
	output = model_mm(X_mm_train)
	loss = criterion_mm(output, y_mm_train)
	loss.backward()
	optimizer_mm.step()
	# validation
	if (epoch+1) % 10 == 0 or epoch == num_epochs-1:
		model_mm.eval()
		with torch.no_grad():
			val_predictions = model_mm(X_mm_test)
			val_loss = criterion_mm(val_predictions, y_mm_test)
		print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Val Loss: {val_loss.item()}')
		model_mm.train()

Epoch 10/11, Loss: 38.065513610839844, Val Loss: 19.448427200317383
Epoch 11/11, Loss: 41.414222717285156, Val Loss: 18.498815536499023


# Conclusion
Queda trabajo que hacer en la red, además de conseguir más datos para un entrenamiento más robusto. Queda por ahora descartado el uso de solo un modelo para multi-threading y single-threading, ya que el modelo tiene más del triple de *loss*.