In [22]:
import optuna
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import random

In [23]:
# CUDA
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE.type

'cuda'

In [24]:
# Fix random seed
seed = 42
torch.manual_seed(seed)
if DEVICE.type == 'cuda':
	torch.cuda.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Pre-processing input data

In [25]:
def bits_to_MiB(row):
	# verify if has string ' MiB'
	if 'MiB' in str(row):
		row = row.replace(' MiB', '')
		row = float(row)
	else:
		row = float(row) / np.power(2, 20)
	return row


def MHz_to_GHz(row):
	# verify if has string ' GHz'
	if 'GHz' in str(row):
		row = row.replace(' GHz', '')
		# convert to float
		row = float(row)
	else:
		row = row.replace(' MHz', '')
		row = float(row) / 1000
	return row

In [26]:
results_df = pd.read_csv('../results/execution_time.csv')
results_savio_df = pd.read_csv('../results_savio/execution_time.csv')
results_df = pd.concat([results_df, results_savio_df], ignore_index=True)
# preprocessing
results_df['total_cpu_usage'] = results_df['total_cpu_usage'].str.replace('%', '').astype(float) / 100
results_df['max_ram_usage'] = results_df['max_ram_usage'] / 1024
results_df['l2_cache_size'] = results_df['l2_cache_size'].apply(bits_to_MiB)
results_df['l3_cache_size'] = results_df['l3_cache_size'].apply(bits_to_MiB)
results_df['ghz_actual_friendly'] = results_df['hz_actual_friendly'].apply(MHz_to_GHz)
results_df['ghz_advertised_friendly'] = results_df['hz_advertised_friendly'].str.replace('GHz', '').astype(float)
results_df = results_df.drop(columns=['hz_actual_friendly', 'hz_advertised_friendly', 'arch', 'vendor_id_raw'])

In [27]:
# remove one computer for testing
g_train = results_df[results_df['brand_raw'] != '13th Gen Intel(R) Core(TM) i5-1335U'].drop(columns=['benchmark','brand_raw'])
g_test = results_df[results_df['brand_raw'] == '13th Gen Intel(R) Core(TM) i5-1335U'].drop(columns=['benchmark','brand_raw'])

In [28]:
mm_df = results_df[results_df['benchmark']=='MATRIX_MULT'].drop(columns=['benchmark'])
# remove one computer for testing
mm_train = mm_df[mm_df['brand_raw'] != '13th Gen Intel(R) Core(TM) i5-1335U'].drop(columns=['brand_raw'])
mm_test = mm_df[mm_df['brand_raw'] == '13th Gen Intel(R) Core(TM) i5-1335U'].drop(columns=['brand_raw'])

In [29]:
st_df = results_df[results_df['benchmark']!='MATRIX_MULT'].drop(columns=['benchmark','count'])
# remove one computer for testing
st_train = st_df[st_df['brand_raw'] != '13th Gen Intel(R) Core(TM) i5-1335U'].drop(columns=['brand_raw'])
st_test = st_df[st_df['brand_raw'] == '13th Gen Intel(R) Core(TM) i5-1335U'].drop(columns=['brand_raw'])

In [30]:
target = 'total_time'
features = mm_test.columns.copy().drop(target)
features_st = features.copy().drop(['count'])

In [31]:
# general data
## split data
X_g_train = g_train[features]
y_g_train = g_train[target]

X_g_test = g_test[features]
y_g_test = g_test[target]

## normalize data
scaler = StandardScaler()
X_g_train = scaler.fit_transform(X_g_train)
X_g_test = scaler.transform(X_g_test)

## convert to tensor
X_g_train = torch.tensor(X_g_train, dtype=torch.float32).unsqueeze(1)
X_g_test = torch.tensor(X_g_test, dtype=torch.float32).unsqueeze(1)
y_g_train = torch.tensor(y_g_train.values, dtype=torch.float32).view(-1, 1)
y_g_test = torch.tensor(y_g_test.values, dtype=torch.float32).view(-1, 1)

In [32]:
# single thread data
## split data
X_st_train = st_train[features_st]
y_st_train = st_train[target]

X_st_test = st_test[features_st]
y_st_test = st_test[target]

## normalize data
scaler = StandardScaler()
X_st_train = scaler.fit_transform(X_st_train)
X_st_test = scaler.transform(X_st_test)

## convert to tensor
X_st_train = torch.tensor(X_st_train, dtype=torch.float32).unsqueeze(1)
X_st_test = torch.tensor(X_st_test, dtype=torch.float32).unsqueeze(1)
y_st_train = torch.tensor(y_st_train.values, dtype=torch.float32).view(-1, 1)
y_st_test = torch.tensor(y_st_test.values, dtype=torch.float32).view(-1, 1)

In [33]:
# multi thread data
## split data
X_mm_train = mm_train[features]
y_mm_train = mm_train[target]

X_mm_test = mm_test[features]
y_mm_test = mm_test[target]

## normalize data
scaler = StandardScaler()
X_mm_train = scaler.fit_transform(X_mm_train)
X_mm_test = scaler.transform(X_mm_test)

## convert to tensor
X_mm_train = torch.tensor(X_mm_train, dtype=torch.float32).unsqueeze(1)
X_mm_test = torch.tensor(X_mm_test, dtype=torch.float32).unsqueeze(1)
y_mm_train = torch.tensor(y_mm_train.values, dtype=torch.float32).view(-1, 1)
y_mm_test = torch.tensor(y_mm_test.values, dtype=torch.float32).view(-1, 1)

In [34]:
if DEVICE.type == 'cuda':
	# move to DEVICE
	X_g_train = X_g_train.to(DEVICE)
	y_g_train = y_g_train.to(DEVICE)
	X_g_test = X_g_test.to(DEVICE)
	y_g_test = y_g_test.to(DEVICE)

	X_st_train = X_st_train.to(DEVICE)
	y_st_train = y_st_train.to(DEVICE)
	X_st_test = X_st_test.to(DEVICE)
	y_st_test = y_st_test.to(DEVICE)

	X_mm_train = X_mm_train.to(DEVICE)
	y_mm_train = y_mm_train.to(DEVICE)
	X_mm_test = X_mm_test.to(DEVICE)
	y_mm_test = y_mm_test.to(DEVICE)

# Model

In [35]:
class TransformerModel(nn.Module):
	def __init__(self, input_dim, model_dim, num_heads, num_layers, output_dim, dropout=0.1):
		super(TransformerModel, self).__init__()
		# layers
		self.embedding = nn.Linear(input_dim, model_dim)
		encoder_layer = nn.TransformerEncoderLayer(d_model=model_dim, nhead=num_heads, batch_first=True)
		self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
		self.fc = nn.Linear(model_dim, output_dim)
		self.dropout = nn.Dropout(dropout)
	
	def forward(self, x):
		x = self.embedding(x)
		x = self.dropout(x)
		x = self.transformer(x)
		x = self.fc(x.mean(dim=1))
		return x

In [36]:
def objective(trial: optuna.Trial, X_train, y_train, X_test, y_test, input_dim, output_dim):
	# Definimos los hiperparámetros a buscar
	num_heads = trial.suggest_int('num_heads', 1, 8)
	model_dim = trial.suggest_int('model_dim', num_heads * 4, num_heads * 64, step=num_heads)
	num_layers = trial.suggest_int('num_layers', 1, 6)
	dropout = trial.suggest_float('dropout', 0.1, 0.5)
	learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True)
	weight_decay = trial.suggest_float('weight_decay', 1e-5, 1e-2, log=True)
	num_epochs = trial.suggest_int('num_epochs', 10, 100)

	# model initialization 
	model = TransformerModel(input_dim, model_dim, num_heads, num_layers, output_dim, dropout)
	if DEVICE.type == 'cuda':
		model = model.to(DEVICE)
	criterion = nn.MSELoss()
	optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
	# training
	model.train()
	for epoch in range(num_epochs):
		optimizer.zero_grad()
		output = model(X_train)
		loss = criterion(output, y_train)
		loss.backward()
		optimizer.step()
	# evaluation
	model.eval()
	with torch.no_grad():
		predictions = model(X_test)
		val_loss = criterion(predictions, y_test)

		# trial.report(val_loss.item(), epoch+1)
		# if trial.should_prune():
		# 	raise optuna.TrialPruned()
	print(f"Trial: {trial.number} - Loss: {loss.item()} - Val Loss: {val_loss.item()}")
	return val_loss.item()

# Hyperparameters Optimization

In [37]:
n_trials = 100

## General

In [38]:
# configuration optuna
study_g = optuna.create_study(direction='minimize')
study_g.optimize(lambda trial: objective(trial, X_g_train, y_g_train, X_g_test, y_g_test, len(features), len(target)), n_trials=n_trials)

[I 2024-06-20 05:14:35,859] A new study created in memory with name: no-name-8e475bdb-b813-46eb-85f3-ee8d5b669858
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
[I 2024-06-20 05:14:36,320] Trial 0 finished with value: 870.7213745117188 and parameters: {'num_heads': 2, 'model_dim': 18, 'num_layers': 5, 'dropout': 0.20771216836837492, 'learning_rate': 5.638512125605611e-05, 'weight_decay': 0.000280267216959323, 'num_epochs': 44}. Best is trial 0 with value: 870.7213745117188.


Trial: 0 - Loss: 386.89605712890625 - Val Loss: 870.7213745117188


[I 2024-06-20 05:14:36,693] Trial 1 finished with value: 152.57757568359375 and parameters: {'num_heads': 4, 'model_dim': 112, 'num_layers': 4, 'dropout': 0.1124448114212703, 'learning_rate': 0.0036271528609092764, 'weight_decay': 7.074711477758549e-05, 'num_epochs': 52}. Best is trial 1 with value: 152.57757568359375.
[I 2024-06-20 05:14:36,859] Trial 2 finished with value: 358.3475341796875 and parameters: {'num_heads': 6, 'model_dim': 42, 'num_layers': 1, 'dropout': 0.13064577969827562, 'learning_rate': 0.008237486522403943, 'weight_decay': 0.0005536388059568095, 'num_epochs': 70}. Best is trial 1 with value: 152.57757568359375.


Trial: 1 - Loss: 47.18220520019531 - Val Loss: 152.57757568359375
Trial: 2 - Loss: 12.7821044921875 - Val Loss: 358.3475341796875


[I 2024-06-20 05:14:37,216] Trial 3 finished with value: 600.9075927734375 and parameters: {'num_heads': 6, 'model_dim': 54, 'num_layers': 3, 'dropout': 0.3503944347509199, 'learning_rate': 0.001086155949063208, 'weight_decay': 0.004744834918674038, 'num_epochs': 66}. Best is trial 1 with value: 152.57757568359375.


Trial: 3 - Loss: 225.43182373046875 - Val Loss: 600.9075927734375


[I 2024-06-20 05:14:37,737] Trial 4 finished with value: 144.86387634277344 and parameters: {'num_heads': 6, 'model_dim': 60, 'num_layers': 3, 'dropout': 0.2758938010824402, 'learning_rate': 0.008469001821045036, 'weight_decay': 2.877037484387232e-05, 'num_epochs': 96}. Best is trial 4 with value: 144.86387634277344.
[I 2024-06-20 05:14:37,841] Trial 5 finished with value: 784.7589721679688 and parameters: {'num_heads': 7, 'model_dim': 252, 'num_layers': 2, 'dropout': 0.4933999187722301, 'learning_rate': 7.519112950806595e-05, 'weight_decay': 0.00922448485757095, 'num_epochs': 17}. Best is trial 4 with value: 144.86387634277344.


Trial: 4 - Loss: 46.858482360839844 - Val Loss: 144.86387634277344
Trial: 5 - Loss: 329.7919616699219 - Val Loss: 784.7589721679688


[I 2024-06-20 05:14:38,245] Trial 6 finished with value: 853.3906860351562 and parameters: {'num_heads': 2, 'model_dim': 44, 'num_layers': 5, 'dropout': 0.23524824525860147, 'learning_rate': 1.789219615842148e-05, 'weight_decay': 0.001262812730216208, 'num_epochs': 49}. Best is trial 4 with value: 144.86387634277344.


Trial: 6 - Loss: 375.3460388183594 - Val Loss: 853.3906860351562


[I 2024-06-20 05:14:38,487] Trial 7 finished with value: 858.7440185546875 and parameters: {'num_heads': 2, 'model_dim': 24, 'num_layers': 2, 'dropout': 0.4237893404179245, 'learning_rate': 3.608936052041928e-05, 'weight_decay': 6.160444273170153e-05, 'num_epochs': 43}. Best is trial 4 with value: 144.86387634277344.


Trial: 7 - Loss: 383.33074951171875 - Val Loss: 858.7440185546875


[I 2024-06-20 05:14:38,906] Trial 8 finished with value: 761.5990600585938 and parameters: {'num_heads': 5, 'model_dim': 230, 'num_layers': 2, 'dropout': 0.3259042944730215, 'learning_rate': 2.5495239064655674e-05, 'weight_decay': 1.0006237746084801e-05, 'num_epochs': 67}. Best is trial 4 with value: 144.86387634277344.


Trial: 8 - Loss: 320.0384216308594 - Val Loss: 761.5990600585938


[I 2024-06-20 05:14:39,230] Trial 9 finished with value: 779.7514038085938 and parameters: {'num_heads': 2, 'model_dim': 102, 'num_layers': 4, 'dropout': 0.35838264431680167, 'learning_rate': 0.00015084272288662094, 'weight_decay': 0.0029529827107851857, 'num_epochs': 47}. Best is trial 4 with value: 144.86387634277344.


Trial: 9 - Loss: 330.7565612792969 - Val Loss: 779.7514038085938


[I 2024-06-20 05:14:41,599] Trial 10 finished with value: 149.7149200439453 and parameters: {'num_heads': 8, 'model_dim': 512, 'num_layers': 6, 'dropout': 0.25255833336872185, 'learning_rate': 0.000743834063497688, 'weight_decay': 1.1530369313847676e-05, 'num_epochs': 98}. Best is trial 4 with value: 144.86387634277344.


Trial: 10 - Loss: 46.73980712890625 - Val Loss: 149.7149200439453


[I 2024-06-20 05:14:43,889] Trial 11 finished with value: 151.0055694580078 and parameters: {'num_heads': 8, 'model_dim': 504, 'num_layers': 6, 'dropout': 0.24329619402372044, 'learning_rate': 0.000727700928387081, 'weight_decay': 1.13342145398265e-05, 'num_epochs': 98}. Best is trial 4 with value: 144.86387634277344.


Trial: 11 - Loss: 46.997440338134766 - Val Loss: 151.0055694580078


[I 2024-06-20 05:14:46,107] Trial 12 finished with value: 144.65296936035156 and parameters: {'num_heads': 8, 'model_dim': 472, 'num_layers': 6, 'dropout': 0.2685239754140771, 'learning_rate': 0.0016216329839815532, 'weight_decay': 4.051209289587754e-05, 'num_epochs': 99}. Best is trial 12 with value: 144.65296936035156.


Trial: 12 - Loss: 46.780181884765625 - Val Loss: 144.65296936035156


[I 2024-06-20 05:14:46,593] Trial 13 finished with value: 143.697509765625 and parameters: {'num_heads': 4, 'model_dim': 188, 'num_layers': 3, 'dropout': 0.1722111721082854, 'learning_rate': 0.0033222766219579794, 'weight_decay': 5.0800030177543205e-05, 'num_epochs': 87}. Best is trial 13 with value: 143.697509765625.


Trial: 13 - Loss: 46.988548278808594 - Val Loss: 143.697509765625


[I 2024-06-20 05:14:47,441] Trial 14 finished with value: 146.17782592773438 and parameters: {'num_heads': 4, 'model_dim': 244, 'num_layers': 5, 'dropout': 0.1670147261578162, 'learning_rate': 0.00226805240893809, 'weight_decay': 0.0001367430201443351, 'num_epochs': 83}. Best is trial 13 with value: 143.697509765625.


Trial: 14 - Loss: 46.903995513916016 - Val Loss: 146.17782592773438


[I 2024-06-20 05:14:48,076] Trial 15 finished with value: 542.5781860351562 and parameters: {'num_heads': 3, 'model_dim': 177, 'num_layers': 4, 'dropout': 0.1941740110595257, 'learning_rate': 0.0003322685728222711, 'weight_decay': 3.904913422097408e-05, 'num_epochs': 83}. Best is trial 13 with value: 143.697509765625.


Trial: 15 - Loss: 192.8468780517578 - Val Loss: 542.5781860351562


[I 2024-06-20 05:14:48,546] Trial 16 finished with value: 149.42391967773438 and parameters: {'num_heads': 4, 'model_dim': 184, 'num_layers': 3, 'dropout': 0.2943604683018601, 'learning_rate': 0.0024587745386158437, 'weight_decay': 0.00014596174642687122, 'num_epochs': 85}. Best is trial 13 with value: 143.697509765625.
[I 2024-06-20 05:14:48,660] Trial 17 finished with value: 615.8699951171875 and parameters: {'num_heads': 5, 'model_dim': 310, 'num_layers': 1, 'dropout': 0.15684799446168818, 'learning_rate': 0.0003314192490497604, 'weight_decay': 2.5044774018300574e-05, 'num_epochs': 29}. Best is trial 13 with value: 143.697509765625.


Trial: 16 - Loss: 46.55558776855469 - Val Loss: 149.42391967773438
Trial: 17 - Loss: 238.9776611328125 - Val Loss: 615.8699951171875


[I 2024-06-20 05:14:49,375] Trial 18 finished with value: 156.0446014404297 and parameters: {'num_heads': 1, 'model_dim': 60, 'num_layers': 6, 'dropout': 0.4086457149550389, 'learning_rate': 0.003911864454726001, 'weight_decay': 0.0001358400058490555, 'num_epochs': 76}. Best is trial 13 with value: 143.697509765625.


Trial: 18 - Loss: 46.82484817504883 - Val Loss: 156.0446014404297


[I 2024-06-20 05:14:50,835] Trial 19 finished with value: 149.10598754882812 and parameters: {'num_heads': 7, 'model_dim': 378, 'num_layers': 5, 'dropout': 0.19783272059369914, 'learning_rate': 0.0014157411597462265, 'weight_decay': 0.0004794153475909239, 'num_epochs': 88}. Best is trial 13 with value: 143.697509765625.


Trial: 19 - Loss: 46.43134689331055 - Val Loss: 149.10598754882812


[I 2024-06-20 05:14:51,349] Trial 20 finished with value: 149.04835510253906 and parameters: {'num_heads': 3, 'model_dim': 93, 'num_layers': 4, 'dropout': 0.1654072169725792, 'learning_rate': 0.0052503144423720095, 'weight_decay': 7.1003284118183e-05, 'num_epochs': 61}. Best is trial 13 with value: 143.697509765625.


Trial: 20 - Loss: 47.155269622802734 - Val Loss: 149.04835510253906


[I 2024-06-20 05:14:51,957] Trial 21 finished with value: 143.31039428710938 and parameters: {'num_heads': 6, 'model_dim': 144, 'num_layers': 3, 'dropout': 0.27504329921037546, 'learning_rate': 0.00944154032944348, 'weight_decay': 2.3692225994026454e-05, 'num_epochs': 100}. Best is trial 21 with value: 143.31039428710938.


Trial: 21 - Loss: 46.76848220825195 - Val Loss: 143.31039428710938


[I 2024-06-20 05:14:52,579] Trial 22 finished with value: 387.75927734375 and parameters: {'num_heads': 7, 'model_dim': 154, 'num_layers': 3, 'dropout': 0.2773899929676876, 'learning_rate': 0.0020526679998205746, 'weight_decay': 2.2084473342890726e-05, 'num_epochs': 90}. Best is trial 21 with value: 143.31039428710938.


Trial: 22 - Loss: 21.48850440979004 - Val Loss: 387.75927734375


[I 2024-06-20 05:14:53,253] Trial 23 finished with value: 327.6854248046875 and parameters: {'num_heads': 8, 'model_dim': 344, 'num_layers': 2, 'dropout': 0.30887271604531247, 'learning_rate': 0.00828424009337018, 'weight_decay': 4.750024556982539e-05, 'num_epochs': 100}. Best is trial 21 with value: 143.31039428710938.


Trial: 23 - Loss: 11.873092651367188 - Val Loss: 327.6854248046875


[I 2024-06-20 05:14:53,773] Trial 24 finished with value: 146.64144897460938 and parameters: {'num_heads': 5, 'model_dim': 145, 'num_layers': 3, 'dropout': 0.21804443275131857, 'learning_rate': 0.004734569586775232, 'weight_decay': 2.0740268461704034e-05, 'num_epochs': 77}. Best is trial 21 with value: 143.31039428710938.


Trial: 24 - Loss: 46.724029541015625 - Val Loss: 146.64144897460938


[I 2024-06-20 05:14:55,062] Trial 25 finished with value: 155.2125701904297 and parameters: {'num_heads': 7, 'model_dim': 406, 'num_layers': 4, 'dropout': 0.37996682016313266, 'learning_rate': 0.0006969494930247202, 'weight_decay': 0.00021981144053012875, 'num_epochs': 93}. Best is trial 21 with value: 143.31039428710938.


Trial: 25 - Loss: 46.753604888916016 - Val Loss: 155.2125701904297


[I 2024-06-20 05:14:55,456] Trial 26 finished with value: 379.39459228515625 and parameters: {'num_heads': 6, 'model_dim': 210, 'num_layers': 2, 'dropout': 0.2621059388613658, 'learning_rate': 0.0014665931996059275, 'weight_decay': 9.357809388376509e-05, 'num_epochs': 77}. Best is trial 21 with value: 143.31039428710938.


Trial: 26 - Loss: 18.293140411376953 - Val Loss: 379.39459228515625


[I 2024-06-20 05:14:55,932] Trial 27 finished with value: 334.5282287597656 and parameters: {'num_heads': 3, 'model_dim': 123, 'num_layers': 3, 'dropout': 0.3230532263979738, 'learning_rate': 0.0031980073251963587, 'weight_decay': 3.613639474892621e-05, 'num_epochs': 89}. Best is trial 21 with value: 143.31039428710938.


Trial: 27 - Loss: 23.492408752441406 - Val Loss: 334.5282287597656


[I 2024-06-20 05:14:56,754] Trial 28 finished with value: 141.153564453125 and parameters: {'num_heads': 5, 'model_dim': 205, 'num_layers': 4, 'dropout': 0.13698904627817288, 'learning_rate': 0.005564722765398976, 'weight_decay': 1.5093664155900806e-05, 'num_epochs': 93}. Best is trial 28 with value: 141.153564453125.


Trial: 28 - Loss: 47.1828498840332 - Val Loss: 141.153564453125


[I 2024-06-20 05:14:57,482] Trial 29 finished with value: 144.64801025390625 and parameters: {'num_heads': 5, 'model_dim': 200, 'num_layers': 4, 'dropout': 0.10434427518846202, 'learning_rate': 0.006457571095130437, 'weight_decay': 1.9065442771662665e-05, 'num_epochs': 81}. Best is trial 28 with value: 141.153564453125.


Trial: 29 - Loss: 46.5997314453125 - Val Loss: 144.64801025390625


[I 2024-06-20 05:14:57,862] Trial 30 finished with value: 734.3934326171875 and parameters: {'num_heads': 4, 'model_dim': 168, 'num_layers': 5, 'dropout': 0.14221154670648564, 'learning_rate': 0.00017473499061483605, 'weight_decay': 1.5716682053381492e-05, 'num_epochs': 39}. Best is trial 28 with value: 141.153564453125.


Trial: 30 - Loss: 303.50958251953125 - Val Loss: 734.3934326171875


[I 2024-06-20 05:14:58,552] Trial 31 finished with value: 142.9305877685547 and parameters: {'num_heads': 5, 'model_dim': 200, 'num_layers': 4, 'dropout': 0.10279921907372855, 'learning_rate': 0.005570559972566489, 'weight_decay': 1.761411479217737e-05, 'num_epochs': 80}. Best is trial 28 with value: 141.153564453125.


Trial: 31 - Loss: 47.355838775634766 - Val Loss: 142.9305877685547


[I 2024-06-20 05:14:59,079] Trial 32 finished with value: 147.62948608398438 and parameters: {'num_heads': 5, 'model_dim': 210, 'num_layers': 4, 'dropout': 0.12859443821574434, 'learning_rate': 0.009904356820476813, 'weight_decay': 1.6363369915185826e-05, 'num_epochs': 58}. Best is trial 28 with value: 141.153564453125.


Trial: 32 - Loss: 46.632118225097656 - Val Loss: 147.62948608398438


[I 2024-06-20 05:14:59,672] Trial 33 finished with value: 144.7353515625 and parameters: {'num_heads': 4, 'model_dim': 192, 'num_layers': 3, 'dropout': 0.10350978601049937, 'learning_rate': 0.0034152148301993283, 'weight_decay': 2.8867332468572306e-05, 'num_epochs': 92}. Best is trial 28 with value: 141.153564453125.


Trial: 33 - Loss: 46.86606979370117 - Val Loss: 144.7353515625


[I 2024-06-20 05:15:00,401] Trial 34 finished with value: 141.6764373779297 and parameters: {'num_heads': 6, 'model_dim': 252, 'num_layers': 4, 'dropout': 0.1839810717687009, 'learning_rate': 0.005442972331051704, 'weight_decay': 6.93593653022524e-05, 'num_epochs': 72}. Best is trial 28 with value: 141.153564453125.


Trial: 34 - Loss: 46.83710479736328 - Val Loss: 141.6764373779297


[I 2024-06-20 05:15:01,256] Trial 35 finished with value: 143.4123077392578 and parameters: {'num_heads': 6, 'model_dim': 276, 'num_layers': 4, 'dropout': 0.1301768613267583, 'learning_rate': 0.006769530192382132, 'weight_decay': 1.5170204167001812e-05, 'num_epochs': 79}. Best is trial 28 with value: 141.153564453125.


Trial: 35 - Loss: 46.78474807739258 - Val Loss: 143.4123077392578


[I 2024-06-20 05:15:02,184] Trial 36 finished with value: 140.43899536132812 and parameters: {'num_heads': 6, 'model_dim': 276, 'num_layers': 5, 'dropout': 0.185351838991001, 'learning_rate': 0.005507609910461661, 'weight_decay': 8.707697922376897e-05, 'num_epochs': 69}. Best is trial 36 with value: 140.43899536132812.


Trial: 36 - Loss: 46.6870231628418 - Val Loss: 140.43899536132812


[I 2024-06-20 05:15:03,078] Trial 37 finished with value: 140.6184539794922 and parameters: {'num_heads': 5, 'model_dim': 270, 'num_layers': 5, 'dropout': 0.18620601872581968, 'learning_rate': 0.005120722500805923, 'weight_decay': 0.00027223672042218683, 'num_epochs': 71}. Best is trial 36 with value: 140.43899536132812.


Trial: 37 - Loss: 47.219825744628906 - Val Loss: 140.6184539794922


[I 2024-06-20 05:15:04,032] Trial 38 finished with value: 163.50889587402344 and parameters: {'num_heads': 6, 'model_dim': 276, 'num_layers': 5, 'dropout': 0.18760698402775017, 'learning_rate': 0.0010221960936239959, 'weight_decay': 0.0002875230964468396, 'num_epochs': 71}. Best is trial 36 with value: 140.43899536132812.


Trial: 38 - Loss: 47.022621154785156 - Val Loss: 163.50889587402344


[I 2024-06-20 05:15:04,952] Trial 39 finished with value: 146.44293212890625 and parameters: {'num_heads': 7, 'model_dim': 280, 'num_layers': 5, 'dropout': 0.22160406658297796, 'learning_rate': 0.00275576649617534, 'weight_decay': 0.0008157481491745866, 'num_epochs': 64}. Best is trial 36 with value: 140.43899536132812.


Trial: 39 - Loss: 46.65630340576172 - Val Loss: 146.44293212890625


[I 2024-06-20 05:15:05,722] Trial 40 finished with value: 774.8640747070312 and parameters: {'num_heads': 6, 'model_dim': 300, 'num_layers': 5, 'dropout': 0.14824514286435014, 'learning_rate': 1.1073655752701203e-05, 'weight_decay': 0.0004156793203623469, 'num_epochs': 55}. Best is trial 36 with value: 140.43899536132812.


Trial: 40 - Loss: 325.39404296875 - Val Loss: 774.8640747070312


[I 2024-06-20 05:15:06,538] Trial 41 finished with value: 144.38308715820312 and parameters: {'num_heads': 5, 'model_dim': 230, 'num_layers': 5, 'dropout': 0.12009505825184211, 'learning_rate': 0.00515355786757072, 'weight_decay': 8.332237797820701e-05, 'num_epochs': 72}. Best is trial 36 with value: 140.43899536132812.


Trial: 41 - Loss: 47.01620864868164 - Val Loss: 144.38308715820312


[I 2024-06-20 05:15:07,217] Trial 42 finished with value: 139.5071258544922 and parameters: {'num_heads': 5, 'model_dim': 255, 'num_layers': 4, 'dropout': 0.18089868267396622, 'learning_rate': 0.006816148424612225, 'weight_decay': 0.00019927033120561795, 'num_epochs': 68}. Best is trial 42 with value: 139.5071258544922.


Trial: 42 - Loss: 47.171875 - Val Loss: 139.5071258544922


[I 2024-06-20 05:15:08,129] Trial 43 finished with value: 143.87411499023438 and parameters: {'num_heads': 6, 'model_dim': 258, 'num_layers': 5, 'dropout': 0.17676754930586236, 'learning_rate': 0.0040910220070506315, 'weight_decay': 0.00020619729199065835, 'num_epochs': 67}. Best is trial 42 with value: 139.5071258544922.


Trial: 43 - Loss: 46.622596740722656 - Val Loss: 143.87411499023438


[I 2024-06-20 05:15:08,723] Trial 44 finished with value: 140.6508331298828 and parameters: {'num_heads': 5, 'model_dim': 230, 'num_layers': 4, 'dropout': 0.21468054164880634, 'learning_rate': 0.007107532826970392, 'weight_decay': 0.0009904399898924877, 'num_epochs': 61}. Best is trial 42 with value: 139.5071258544922.


Trial: 44 - Loss: 47.36079025268555 - Val Loss: 140.6508331298828


[I 2024-06-20 05:15:09,454] Trial 45 finished with value: 143.18429565429688 and parameters: {'num_heads': 5, 'model_dim': 230, 'num_layers': 6, 'dropout': 0.2092574842660115, 'learning_rate': 0.007289453341548056, 'weight_decay': 0.001670769945393727, 'num_epochs': 52}. Best is trial 42 with value: 139.5071258544922.


Trial: 45 - Loss: 46.96637725830078 - Val Loss: 143.18429565429688


[I 2024-06-20 05:15:10,249] Trial 46 finished with value: 697.3739013671875 and parameters: {'num_heads': 5, 'model_dim': 265, 'num_layers': 5, 'dropout': 0.2256177189260266, 'learning_rate': 8.141715102029475e-05, 'weight_decay': 0.0007018753529233682, 'num_epochs': 62}. Best is trial 42 with value: 139.5071258544922.


Trial: 46 - Loss: 281.0628662109375 - Val Loss: 697.3739013671875


[I 2024-06-20 05:15:10,729] Trial 47 finished with value: 151.20399475097656 and parameters: {'num_heads': 4, 'model_dim': 216, 'num_layers': 4, 'dropout': 0.14859059494591167, 'learning_rate': 0.0019480795279511371, 'weight_decay': 0.0019714008976282516, 'num_epochs': 57}. Best is trial 42 with value: 139.5071258544922.


Trial: 47 - Loss: 47.012977600097656 - Val Loss: 151.20399475097656


[I 2024-06-20 05:15:11,152] Trial 48 finished with value: 545.337890625 and parameters: {'num_heads': 5, 'model_dim': 235, 'num_layers': 4, 'dropout': 0.24339435768768386, 'learning_rate': 0.0005241829540665369, 'weight_decay': 0.0008517831743928947, 'num_epochs': 41}. Best is trial 42 with value: 139.5071258544922.


Trial: 48 - Loss: 196.7195587158203 - Val Loss: 545.337890625


[I 2024-06-20 05:15:12,010] Trial 49 finished with value: 134.9644775390625 and parameters: {'num_heads': 6, 'model_dim': 312, 'num_layers': 6, 'dropout': 0.2061628663721058, 'learning_rate': 0.0076888577744823145, 'weight_decay': 0.0003524352976624008, 'num_epochs': 48}. Best is trial 49 with value: 134.9644775390625.


Trial: 49 - Loss: 47.164939880371094 - Val Loss: 134.9644775390625


[I 2024-06-20 05:15:12,970] Trial 50 finished with value: 133.2702178955078 and parameters: {'num_heads': 7, 'model_dim': 329, 'num_layers': 6, 'dropout': 0.49278164368266697, 'learning_rate': 0.007120355999882976, 'weight_decay': 0.00035740100342637315, 'num_epochs': 46}. Best is trial 50 with value: 133.2702178955078.


Trial: 50 - Loss: 47.00811767578125 - Val Loss: 133.2702178955078


[I 2024-06-20 05:15:13,641] Trial 51 finished with value: 142.919921875 and parameters: {'num_heads': 7, 'model_dim': 315, 'num_layers': 6, 'dropout': 0.20718552255065806, 'learning_rate': 0.007630445173078381, 'weight_decay': 0.00036215136288642323, 'num_epochs': 35}. Best is trial 50 with value: 133.2702178955078.


Trial: 51 - Loss: 46.99552917480469 - Val Loss: 142.919921875


[I 2024-06-20 05:15:14,556] Trial 52 finished with value: 140.40464782714844 and parameters: {'num_heads': 6, 'model_dim': 324, 'num_layers': 6, 'dropout': 0.4863423073505372, 'learning_rate': 0.009789443876501642, 'weight_decay': 0.00020648472605399375, 'num_epochs': 51}. Best is trial 50 with value: 133.2702178955078.


Trial: 52 - Loss: 47.216529846191406 - Val Loss: 140.40464782714844


[I 2024-06-20 05:15:15,499] Trial 53 finished with value: 138.2161865234375 and parameters: {'num_heads': 7, 'model_dim': 343, 'num_layers': 6, 'dropout': 0.4864696007395327, 'learning_rate': 0.009919490537374944, 'weight_decay': 0.00023748736541390355, 'num_epochs': 47}. Best is trial 50 with value: 133.2702178955078.


Trial: 53 - Loss: 47.13510513305664 - Val Loss: 138.2161865234375


[I 2024-06-20 05:15:16,520] Trial 54 finished with value: 137.96458435058594 and parameters: {'num_heads': 7, 'model_dim': 343, 'num_layers': 6, 'dropout': 0.4952943820284314, 'learning_rate': 0.009814476442358137, 'weight_decay': 0.0001724144336659496, 'num_epochs': 49}. Best is trial 50 with value: 133.2702178955078.


Trial: 54 - Loss: 47.04899597167969 - Val Loss: 137.96458435058594


[I 2024-06-20 05:15:17,493] Trial 55 finished with value: 143.9300994873047 and parameters: {'num_heads': 7, 'model_dim': 343, 'num_layers': 6, 'dropout': 0.49993536047286563, 'learning_rate': 0.009640946349645203, 'weight_decay': 0.0005699165863580295, 'num_epochs': 47}. Best is trial 50 with value: 133.2702178955078.


Trial: 55 - Loss: 46.61547088623047 - Val Loss: 143.9300994873047


[I 2024-06-20 05:15:18,659] Trial 56 finished with value: 138.80294799804688 and parameters: {'num_heads': 8, 'model_dim': 392, 'num_layers': 6, 'dropout': 0.47209825039614134, 'learning_rate': 0.003994925214686299, 'weight_decay': 0.00019025094070286284, 'num_epochs': 51}. Best is trial 50 with value: 133.2702178955078.


Trial: 56 - Loss: 46.99013900756836 - Val Loss: 138.80294799804688


[I 2024-06-20 05:15:19,513] Trial 57 finished with value: 151.6669158935547 and parameters: {'num_heads': 8, 'model_dim': 400, 'num_layers': 6, 'dropout': 0.4715572911769108, 'learning_rate': 0.004117757668717224, 'weight_decay': 0.00012536816616931875, 'num_epochs': 36}. Best is trial 50 with value: 133.2702178955078.


Trial: 57 - Loss: 47.49205780029297 - Val Loss: 151.6669158935547


[I 2024-06-20 05:15:20,541] Trial 58 finished with value: 140.7151336669922 and parameters: {'num_heads': 8, 'model_dim': 368, 'num_layers': 6, 'dropout': 0.4542138829911048, 'learning_rate': 0.002790824330359906, 'weight_decay': 0.00017598080290053036, 'num_epochs': 46}. Best is trial 50 with value: 133.2702178955078.


Trial: 58 - Loss: 47.19291305541992 - Val Loss: 140.7151336669922


[I 2024-06-20 05:15:21,171] Trial 59 finished with value: 158.35679626464844 and parameters: {'num_heads': 7, 'model_dim': 364, 'num_layers': 6, 'dropout': 0.450526267381318, 'learning_rate': 0.007364824241853909, 'weight_decay': 0.00011220765364096281, 'num_epochs': 29}. Best is trial 50 with value: 133.2702178955078.


Trial: 59 - Loss: 47.6458854675293 - Val Loss: 158.35679626464844


[I 2024-06-20 05:15:22,194] Trial 60 finished with value: 155.7694854736328 and parameters: {'num_heads': 8, 'model_dim': 416, 'num_layers': 6, 'dropout': 0.43368625127864224, 'learning_rate': 0.004189397476942862, 'weight_decay': 0.000313579888474693, 'num_epochs': 44}. Best is trial 50 with value: 133.2702178955078.


Trial: 60 - Loss: 47.200504302978516 - Val Loss: 155.7694854736328


[I 2024-06-20 05:15:23,213] Trial 61 finished with value: 132.93299865722656 and parameters: {'num_heads': 7, 'model_dim': 329, 'num_layers': 6, 'dropout': 0.48790748121090427, 'learning_rate': 0.00968772560227767, 'weight_decay': 0.000204263453602587, 'num_epochs': 52}. Best is trial 61 with value: 132.93299865722656.


Trial: 61 - Loss: 47.01339340209961 - Val Loss: 132.93299865722656


[I 2024-06-20 05:15:24,199] Trial 62 finished with value: 148.9320068359375 and parameters: {'num_heads': 7, 'model_dim': 343, 'num_layers': 6, 'dropout': 0.47645729588458025, 'learning_rate': 0.007927314883710946, 'weight_decay': 0.00016441558982102674, 'num_epochs': 49}. Best is trial 61 with value: 132.93299865722656.


Trial: 62 - Loss: 47.076595306396484 - Val Loss: 148.9320068359375


[I 2024-06-20 05:15:25,228] Trial 63 finished with value: 134.7506561279297 and parameters: {'num_heads': 7, 'model_dim': 301, 'num_layers': 6, 'dropout': 0.4613051969729375, 'learning_rate': 0.006502180152335845, 'weight_decay': 0.0002546686254201906, 'num_epochs': 55}. Best is trial 61 with value: 132.93299865722656.


Trial: 63 - Loss: 47.423004150390625 - Val Loss: 134.7506561279297


[I 2024-06-20 05:15:26,592] Trial 64 finished with value: 136.90476989746094 and parameters: {'num_heads': 8, 'model_dim': 440, 'num_layers': 6, 'dropout': 0.458173489014348, 'learning_rate': 0.009865179855754738, 'weight_decay': 0.00025369624828686246, 'num_epochs': 55}. Best is trial 61 with value: 132.93299865722656.


Trial: 64 - Loss: 47.24835968017578 - Val Loss: 136.90476989746094


[I 2024-06-20 05:15:27,586] Trial 65 finished with value: 138.27203369140625 and parameters: {'num_heads': 7, 'model_dim': 294, 'num_layers': 6, 'dropout': 0.40614079236973877, 'learning_rate': 0.008656520586165391, 'weight_decay': 0.0005562892608408677, 'num_epochs': 55}. Best is trial 61 with value: 132.93299865722656.


Trial: 65 - Loss: 46.831478118896484 - Val Loss: 138.27203369140625


[I 2024-06-20 05:15:28,436] Trial 66 finished with value: 153.37506103515625 and parameters: {'num_heads': 7, 'model_dim': 329, 'num_layers': 6, 'dropout': 0.45533493380974116, 'learning_rate': 0.0060719476379482214, 'weight_decay': 0.0002497623378065442, 'num_epochs': 43}. Best is trial 61 with value: 132.93299865722656.


Trial: 66 - Loss: 47.22349166870117 - Val Loss: 153.37506103515625


[I 2024-06-20 05:15:29,692] Trial 67 finished with value: 140.32313537597656 and parameters: {'num_heads': 7, 'model_dim': 427, 'num_layers': 6, 'dropout': 0.48849096996468294, 'learning_rate': 0.0031209416642505802, 'weight_decay': 0.000404602933247235, 'num_epochs': 55}. Best is trial 61 with value: 132.93299865722656.


Trial: 67 - Loss: 46.96745681762695 - Val Loss: 140.32313537597656


[I 2024-06-20 05:15:30,624] Trial 68 finished with value: 135.4474334716797 and parameters: {'num_heads': 8, 'model_dim': 440, 'num_layers': 6, 'dropout': 0.4304121033592402, 'learning_rate': 0.008840762485635166, 'weight_decay': 0.00010555084666580256, 'num_epochs': 38}. Best is trial 61 with value: 132.93299865722656.


Trial: 68 - Loss: 47.34021759033203 - Val Loss: 135.4474334716797


[I 2024-06-20 05:15:31,377] Trial 69 finished with value: 157.52053833007812 and parameters: {'num_heads': 8, 'model_dim': 472, 'num_layers': 6, 'dropout': 0.4320574328082475, 'learning_rate': 0.006260874963693631, 'weight_decay': 0.00010459196922467288, 'num_epochs': 29}. Best is trial 61 with value: 132.93299865722656.


Trial: 69 - Loss: 47.17530059814453 - Val Loss: 157.52053833007812


[I 2024-06-20 05:15:31,840] Trial 70 finished with value: 82.09996032714844 and parameters: {'num_heads': 8, 'model_dim': 448, 'num_layers': 6, 'dropout': 0.4038300440426975, 'learning_rate': 0.0045750695722829105, 'weight_decay': 0.0003426957899221643, 'num_epochs': 17}. Best is trial 70 with value: 82.09996032714844.


Trial: 70 - Loss: 66.40109252929688 - Val Loss: 82.09996032714844


[I 2024-06-20 05:15:32,165] Trial 71 finished with value: 126.81652069091797 and parameters: {'num_heads': 8, 'model_dim': 448, 'num_layers': 6, 'dropout': 0.41088973743315993, 'learning_rate': 0.0046137697630933075, 'weight_decay': 0.0003383109261596664, 'num_epochs': 11}. Best is trial 70 with value: 82.09996032714844.


Trial: 71 - Loss: 47.09978103637695 - Val Loss: 126.81652069091797


[I 2024-06-20 05:15:32,637] Trial 72 finished with value: 103.68849182128906 and parameters: {'num_heads': 8, 'model_dim': 440, 'num_layers': 6, 'dropout': 0.3911670610883755, 'learning_rate': 0.004829561428472482, 'weight_decay': 0.0003411465872287043, 'num_epochs': 17}. Best is trial 70 with value: 82.09996032714844.


Trial: 72 - Loss: 55.70536422729492 - Val Loss: 103.68849182128906


[I 2024-06-20 05:15:32,972] Trial 73 finished with value: 334.33056640625 and parameters: {'num_heads': 8, 'model_dim': 464, 'num_layers': 6, 'dropout': 0.3800492487455341, 'learning_rate': 0.0022638239228454844, 'weight_decay': 0.0003413751694733303, 'num_epochs': 11}. Best is trial 70 with value: 82.09996032714844.


Trial: 73 - Loss: 110.13878631591797 - Val Loss: 334.33056640625


[I 2024-06-20 05:15:33,410] Trial 74 finished with value: 78.46845245361328 and parameters: {'num_heads': 8, 'model_dim': 496, 'num_layers': 6, 'dropout': 0.4051282113428471, 'learning_rate': 0.004309566957857929, 'weight_decay': 0.0005126147306951872, 'num_epochs': 15}. Best is trial 74 with value: 78.46845245361328.


Trial: 74 - Loss: 63.155269622802734 - Val Loss: 78.46845245361328


[I 2024-06-20 05:15:33,954] Trial 75 finished with value: 161.3043975830078 and parameters: {'num_heads': 8, 'model_dim': 504, 'num_layers': 6, 'dropout': 0.3998488180431877, 'learning_rate': 0.0045583356306127506, 'weight_decay': 0.00048703035238302505, 'num_epochs': 19}. Best is trial 74 with value: 78.46845245361328.


Trial: 75 - Loss: 47.96115493774414 - Val Loss: 161.3043975830078


[I 2024-06-20 05:15:34,510] Trial 76 finished with value: 207.00413513183594 and parameters: {'num_heads': 8, 'model_dim': 480, 'num_layers': 6, 'dropout': 0.36470636327688, 'learning_rate': 0.0017581319926745164, 'weight_decay': 0.0006515039691591262, 'num_epochs': 20}. Best is trial 74 with value: 78.46845245361328.


Trial: 76 - Loss: 57.2056884765625 - Val Loss: 207.00413513183594


[I 2024-06-20 05:15:34,902] Trial 77 finished with value: 131.39181518554688 and parameters: {'num_heads': 8, 'model_dim': 496, 'num_layers': 6, 'dropout': 0.41791295460360717, 'learning_rate': 0.003442787920523253, 'weight_decay': 0.00043687977243062856, 'num_epochs': 13}. Best is trial 74 with value: 78.46845245361328.
[I 2024-06-20 05:15:34,993] Trial 78 finished with value: 205.72515869140625 and parameters: {'num_heads': 8, 'model_dim': 488, 'num_layers': 1, 'dropout': 0.41636665574938175, 'learning_rate': 0.0034658271690976837, 'weight_decay': 0.00047458160445116844, 'num_epochs': 10}. Best is trial 74 with value: 78.46845245361328.


Trial: 77 - Loss: 47.17639923095703 - Val Loss: 131.39181518554688
Trial: 78 - Loss: 64.99606323242188 - Val Loss: 205.72515869140625


[I 2024-06-20 05:15:35,376] Trial 79 finished with value: 223.08453369140625 and parameters: {'num_heads': 8, 'model_dim': 448, 'num_layers': 6, 'dropout': 0.3950632623797272, 'learning_rate': 0.002491815698716428, 'weight_decay': 0.0004401157942988386, 'num_epochs': 14}. Best is trial 74 with value: 78.46845245361328.


Trial: 79 - Loss: 66.5067138671875 - Val Loss: 223.08453369140625


[I 2024-06-20 05:15:35,930] Trial 80 finished with value: 220.64637756347656 and parameters: {'num_heads': 8, 'model_dim': 512, 'num_layers': 5, 'dropout': 0.3466467555128506, 'learning_rate': 0.0012526851359409635, 'weight_decay': 0.0010083144514891253, 'num_epochs': 24}. Best is trial 74 with value: 78.46845245361328.


Trial: 80 - Loss: 59.527671813964844 - Val Loss: 220.64637756347656


[I 2024-06-20 05:15:36,314] Trial 81 finished with value: 80.06111907958984 and parameters: {'num_heads': 7, 'model_dim': 420, 'num_layers': 6, 'dropout': 0.4444745470367433, 'learning_rate': 0.004845740621260392, 'weight_decay': 0.0003162621887042913, 'num_epochs': 15}. Best is trial 74 with value: 78.46845245361328.


Trial: 81 - Loss: 60.1749382019043 - Val Loss: 80.06111907958984


[I 2024-06-20 05:15:36,680] Trial 82 finished with value: 91.13455200195312 and parameters: {'num_heads': 7, 'model_dim': 420, 'num_layers': 6, 'dropout': 0.4443079968549086, 'learning_rate': 0.004629793449335245, 'weight_decay': 0.00029964633919657516, 'num_epochs': 14}. Best is trial 74 with value: 78.46845245361328.


Trial: 82 - Loss: 52.46673583984375 - Val Loss: 91.13455200195312


[I 2024-06-20 05:15:37,108] Trial 83 finished with value: 142.2643585205078 and parameters: {'num_heads': 8, 'model_dim': 456, 'num_layers': 6, 'dropout': 0.4418223626095024, 'learning_rate': 0.0031762056064396408, 'weight_decay': 0.0003007450600428219, 'num_epochs': 15}. Best is trial 74 with value: 78.46845245361328.


Trial: 83 - Loss: 47.92249298095703 - Val Loss: 142.2643585205078


[I 2024-06-20 05:15:37,663] Trial 84 finished with value: 159.22874450683594 and parameters: {'num_heads': 7, 'model_dim': 427, 'num_layers': 6, 'dropout': 0.4195306382347367, 'learning_rate': 0.004822376487367229, 'weight_decay': 0.0003788350874044209, 'num_epochs': 22}. Best is trial 74 with value: 78.46845245361328.


Trial: 84 - Loss: 48.417301177978516 - Val Loss: 159.22874450683594


[I 2024-06-20 05:15:37,998] Trial 85 finished with value: 121.76954650878906 and parameters: {'num_heads': 8, 'model_dim': 496, 'num_layers': 5, 'dropout': 0.390035746996021, 'learning_rate': 0.003645633904329324, 'weight_decay': 0.0005529120533624522, 'num_epochs': 13}. Best is trial 74 with value: 78.46845245361328.


Trial: 85 - Loss: 46.854671478271484 - Val Loss: 121.76954650878906


[I 2024-06-20 05:15:38,336] Trial 86 finished with value: 198.3337860107422 and parameters: {'num_heads': 8, 'model_dim': 496, 'num_layers': 5, 'dropout': 0.38671525917944993, 'learning_rate': 0.00272117206366496, 'weight_decay': 0.0006365623514608644, 'num_epochs': 13}. Best is trial 74 with value: 78.46845245361328.


Trial: 86 - Loss: 59.85538864135742 - Val Loss: 198.3337860107422


[I 2024-06-20 05:15:38,769] Trial 87 finished with value: 87.72489166259766 and parameters: {'num_heads': 8, 'model_dim': 480, 'num_layers': 5, 'dropout': 0.35538951889289594, 'learning_rate': 0.003671022035389791, 'weight_decay': 0.0011467318399304726, 'num_epochs': 18}. Best is trial 74 with value: 78.46845245361328.


Trial: 87 - Loss: 60.39889907836914 - Val Loss: 87.72489166259766


[I 2024-06-20 05:15:39,194] Trial 88 finished with value: 200.84210205078125 and parameters: {'num_heads': 8, 'model_dim': 480, 'num_layers': 5, 'dropout': 0.349749167714112, 'learning_rate': 0.002071606080597295, 'weight_decay': 0.0053006011619391785, 'num_epochs': 17}. Best is trial 74 with value: 78.46845245361328.


Trial: 88 - Loss: 57.47016525268555 - Val Loss: 200.84210205078125


[I 2024-06-20 05:15:39,775] Trial 89 finished with value: 128.04945373535156 and parameters: {'num_heads': 8, 'model_dim': 464, 'num_layers': 5, 'dropout': 0.3641305408674628, 'learning_rate': 0.0032214153630189136, 'weight_decay': 0.0007657183171531103, 'num_epochs': 26}. Best is trial 74 with value: 78.46845245361328.


Trial: 89 - Loss: 46.841705322265625 - Val Loss: 128.04945373535156


[I 2024-06-20 05:15:40,346] Trial 90 finished with value: 590.7612915039062 and parameters: {'num_heads': 8, 'model_dim': 464, 'num_layers': 5, 'dropout': 0.3302007464461444, 'learning_rate': 0.0002410850560236362, 'weight_decay': 0.0014483512594901527, 'num_epochs': 26}. Best is trial 74 with value: 78.46845245361328.


Trial: 90 - Loss: 221.85462951660156 - Val Loss: 590.7612915039062


[I 2024-06-20 05:15:40,751] Trial 91 finished with value: 83.51968383789062 and parameters: {'num_heads': 8, 'model_dim': 488, 'num_layers': 5, 'dropout': 0.3661903929840836, 'learning_rate': 0.003850747765385546, 'weight_decay': 0.0007835156612784238, 'num_epochs': 17}. Best is trial 74 with value: 78.46845245361328.


Trial: 91 - Loss: 61.11174774169922 - Val Loss: 83.51968383789062


[I 2024-06-20 05:15:41,167] Trial 92 finished with value: 102.05580139160156 and parameters: {'num_heads': 8, 'model_dim': 424, 'num_layers': 5, 'dropout': 0.3635971363556264, 'learning_rate': 0.003723405074742698, 'weight_decay': 0.0010982720480503902, 'num_epochs': 16}. Best is trial 74 with value: 78.46845245361328.
[I 2024-06-20 05:15:41,346] Trial 93 finished with value: 627.0355224609375 and parameters: {'num_heads': 1, 'model_dim': 50, 'num_layers': 5, 'dropout': 0.3746472078091089, 'learning_rate': 0.004800480598398222, 'weight_decay': 0.0010509695813158116, 'num_epochs': 17}. Best is trial 74 with value: 78.46845245361328.


Trial: 92 - Loss: 49.8747444152832 - Val Loss: 102.05580139160156
Trial: 93 - Loss: 250.24569702148438 - Val Loss: 627.0355224609375


[I 2024-06-20 05:15:41,759] Trial 94 finished with value: 94.85445404052734 and parameters: {'num_heads': 8, 'model_dim': 424, 'num_layers': 5, 'dropout': 0.3920063063628192, 'learning_rate': 0.004085776914908555, 'weight_decay': 0.002081350129554139, 'num_epochs': 19}. Best is trial 74 with value: 78.46845245361328.


Trial: 94 - Loss: 60.24797439575195 - Val Loss: 94.85445404052734


[I 2024-06-20 05:15:42,186] Trial 95 finished with value: 254.09286499023438 and parameters: {'num_heads': 8, 'model_dim': 416, 'num_layers': 5, 'dropout': 0.38750827962893764, 'learning_rate': 0.0016805166600149842, 'weight_decay': 0.0013565351436612015, 'num_epochs': 20}. Best is trial 74 with value: 78.46845245361328.


Trial: 95 - Loss: 70.77802276611328 - Val Loss: 254.09286499023438


[I 2024-06-20 05:15:42,521] Trial 96 finished with value: 112.5662841796875 and parameters: {'num_heads': 8, 'model_dim': 384, 'num_layers': 5, 'dropout': 0.37302570085795383, 'learning_rate': 0.003821283559520058, 'weight_decay': 0.0022171233219100794, 'num_epochs': 16}. Best is trial 74 with value: 78.46845245361328.


Trial: 96 - Loss: 47.917381286621094 - Val Loss: 112.5662841796875


[I 2024-06-20 05:15:42,859] Trial 97 finished with value: 501.974609375 and parameters: {'num_heads': 8, 'model_dim': 384, 'num_layers': 5, 'dropout': 0.3375915520771616, 'learning_rate': 0.0009380200576164935, 'weight_decay': 0.002554078093318126, 'num_epochs': 16}. Best is trial 74 with value: 78.46845245361328.


Trial: 97 - Loss: 180.19195556640625 - Val Loss: 501.974609375


[I 2024-06-20 05:15:43,371] Trial 98 finished with value: 125.54471588134766 and parameters: {'num_heads': 8, 'model_dim': 424, 'num_layers': 5, 'dropout': 0.3108038702202631, 'learning_rate': 0.0027864372008787244, 'weight_decay': 0.002041763338677023, 'num_epochs': 22}. Best is trial 74 with value: 78.46845245361328.


Trial: 98 - Loss: 50.62577819824219 - Val Loss: 125.54471588134766


[I 2024-06-20 05:15:43,758] Trial 99 finished with value: 136.9536590576172 and parameters: {'num_heads': 8, 'model_dim': 408, 'num_layers': 5, 'dropout': 0.3672526079848649, 'learning_rate': 0.005570518825588713, 'weight_decay': 0.003152519827176785, 'num_epochs': 18}. Best is trial 74 with value: 78.46845245361328.


Trial: 99 - Loss: 50.76043701171875 - Val Loss: 136.9536590576172


In [39]:
# Results
print(f'Número de pruebas: {len(study_g.trials)}')
trial = study_g.best_trial
print(f'Mejor prueba: {trial.number}')
print(f'Mejores parametros: {trial.params}')
print(f'Mejor valor de pérdida: {trial.value}')

Número de pruebas: 100
Mejor prueba: 74
Mejores parametros: {'num_heads': 8, 'model_dim': 496, 'num_layers': 6, 'dropout': 0.4051282113428471, 'learning_rate': 0.004309566957857929, 'weight_decay': 0.0005126147306951872, 'num_epochs': 15}
Mejor valor de pérdida: 78.46845245361328


## Single Thread

In [40]:
# configuration optuna
study_st = optuna.create_study(direction='minimize')
study_st.optimize(lambda trial: objective(trial, X_st_train, y_st_train, X_st_test, y_st_test, len(features_st), len(target)), n_trials=n_trials)

[I 2024-06-20 05:15:43,789] A new study created in memory with name: no-name-9dc7cf60-4bd9-4f75-a316-8a9fbde6de7c
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
[I 2024-06-20 05:15:44,019] Trial 0 finished with value: 553.1808471679688 and parameters: {'num_heads': 1, 'model_dim': 9, 'num_layers': 1, 'dropout': 0.11733177299352615, 'learning_rate': 0.0014942984696280986, 'weight_decay': 2.2933336863983602e-05, 'num_epochs': 87}. Best is trial 0 with value: 553.1808471679688.


Trial: 0 - Loss: 348.5758361816406 - Val Loss: 553.1808471679688


[I 2024-06-20 05:15:44,651] Trial 1 finished with value: 585.1207885742188 and parameters: {'num_heads': 3, 'model_dim': 66, 'num_layers': 5, 'dropout': 0.13740167606562004, 'learning_rate': 6.610409328626004e-05, 'weight_decay': 0.0004543254939001672, 'num_epochs': 84}. Best is trial 0 with value: 553.1808471679688.
[I 2024-06-20 05:15:44,717] Trial 2 finished with value: 87.58348083496094 and parameters: {'num_heads': 5, 'model_dim': 115, 'num_layers': 1, 'dropout': 0.4657496616949821, 'learning_rate': 0.005281045421205645, 'weight_decay': 0.00032554083501477204, 'num_epochs': 24}. Best is trial 2 with value: 87.58348083496094.


Trial: 1 - Loss: 371.4375305175781 - Val Loss: 585.1207885742188
Trial: 2 - Loss: 39.95188522338867 - Val Loss: 87.58348083496094


[I 2024-06-20 05:15:45,286] Trial 3 finished with value: 446.05084228515625 and parameters: {'num_heads': 2, 'model_dim': 46, 'num_layers': 6, 'dropout': 0.39156839668455323, 'learning_rate': 0.0011196332640909215, 'weight_decay': 0.0010555210109952533, 'num_epochs': 60}. Best is trial 2 with value: 87.58348083496094.
[I 2024-06-20 05:15:45,359] Trial 4 finished with value: 26.357051849365234 and parameters: {'num_heads': 7, 'model_dim': 133, 'num_layers': 2, 'dropout': 0.1780524768826179, 'learning_rate': 0.00968967984581368, 'weight_decay': 3.710073967520182e-05, 'num_epochs': 16}. Best is trial 4 with value: 26.357051849365234.


Trial: 3 - Loss: 266.30328369140625 - Val Loss: 446.05084228515625
Trial: 4 - Loss: 17.59310531616211 - Val Loss: 26.357051849365234


[I 2024-06-20 05:15:45,545] Trial 5 finished with value: 639.9856567382812 and parameters: {'num_heads': 2, 'model_dim': 124, 'num_layers': 3, 'dropout': 0.1406532035080014, 'learning_rate': 1.1326267834432216e-05, 'weight_decay': 7.777115198622085e-05, 'num_epochs': 35}. Best is trial 4 with value: 26.357051849365234.


Trial: 5 - Loss: 408.3043518066406 - Val Loss: 639.9856567382812


[I 2024-06-20 05:15:46,165] Trial 6 finished with value: 462.4554748535156 and parameters: {'num_heads': 6, 'model_dim': 144, 'num_layers': 4, 'dropout': 0.14863680061289963, 'learning_rate': 0.00017664482982389604, 'weight_decay': 8.636802427987267e-05, 'num_epochs': 95}. Best is trial 4 with value: 26.357051849365234.


Trial: 6 - Loss: 278.08587646484375 - Val Loss: 462.4554748535156


[I 2024-06-20 05:15:46,574] Trial 7 finished with value: 318.636474609375 and parameters: {'num_heads': 5, 'model_dim': 55, 'num_layers': 3, 'dropout': 0.4826113830452554, 'learning_rate': 0.0013502667613164586, 'weight_decay': 2.960458277504003e-05, 'num_epochs': 75}. Best is trial 4 with value: 26.357051849365234.


Trial: 7 - Loss: 173.10707092285156 - Val Loss: 318.636474609375


[I 2024-06-20 05:15:46,930] Trial 8 finished with value: 117.49163055419922 and parameters: {'num_heads': 4, 'model_dim': 120, 'num_layers': 2, 'dropout': 0.4870656693666249, 'learning_rate': 0.0029884235370029573, 'weight_decay': 0.00509685997592207, 'num_epochs': 96}. Best is trial 4 with value: 26.357051849365234.


Trial: 8 - Loss: 6.467168807983398 - Val Loss: 117.49163055419922


[I 2024-06-20 05:15:47,259] Trial 9 finished with value: 606.2106323242188 and parameters: {'num_heads': 1, 'model_dim': 38, 'num_layers': 4, 'dropout': 0.21433710664411365, 'learning_rate': 0.0001642913102113108, 'weight_decay': 1.5619389299977177e-05, 'num_epochs': 40}. Best is trial 4 with value: 26.357051849365234.
[I 2024-06-20 05:15:47,384] Trial 10 finished with value: 21.16092300415039 and parameters: {'num_heads': 8, 'model_dim': 408, 'num_layers': 2, 'dropout': 0.2716785782046021, 'learning_rate': 0.009778465900619053, 'weight_decay': 9.273832779832591e-05, 'num_epochs': 12}. Best is trial 10 with value: 21.16092300415039.


Trial: 9 - Loss: 387.7408447265625 - Val Loss: 606.2106323242188
Trial: 10 - Loss: 61.61594772338867 - Val Loss: 21.16092300415039


[I 2024-06-20 05:15:47,526] Trial 11 finished with value: 45.00516891479492 and parameters: {'num_heads': 8, 'model_dim': 432, 'num_layers': 2, 'dropout': 0.28296838234156807, 'learning_rate': 0.00827432210429101, 'weight_decay': 8.666519279043083e-05, 'num_epochs': 15}. Best is trial 10 with value: 21.16092300415039.
[I 2024-06-20 05:15:47,633] Trial 12 finished with value: 20.73723793029785 and parameters: {'num_heads': 8, 'model_dim': 376, 'num_layers': 2, 'dropout': 0.24322921420323695, 'learning_rate': 0.008887223549782842, 'weight_decay': 6.609541065252174e-05, 'num_epochs': 10}. Best is trial 12 with value: 20.73723793029785.


Trial: 11 - Loss: 18.972915649414062 - Val Loss: 45.00516891479492
Trial: 12 - Loss: 32.982872009277344 - Val Loss: 20.73723793029785


[I 2024-06-20 05:15:47,740] Trial 13 finished with value: 407.0429382324219 and parameters: {'num_heads': 8, 'model_dim': 408, 'num_layers': 2, 'dropout': 0.30951289698369366, 'learning_rate': 0.0007207164678726581, 'weight_decay': 0.00016347889955543174, 'num_epochs': 10}. Best is trial 12 with value: 20.73723793029785.
[I 2024-06-20 05:15:47,876] Trial 14 finished with value: 56.16273498535156 and parameters: {'num_heads': 7, 'model_dim': 329, 'num_layers': 1, 'dropout': 0.26390004692797703, 'learning_rate': 0.0033684375204214626, 'weight_decay': 1.0561071279573706e-05, 'num_epochs': 35}. Best is trial 12 with value: 20.73723793029785.


Trial: 13 - Loss: 246.802978515625 - Val Loss: 407.0429382324219
Trial: 14 - Loss: 19.814525604248047 - Val Loss: 56.16273498535156


[I 2024-06-20 05:15:48,335] Trial 15 finished with value: 208.57550048828125 and parameters: {'num_heads': 7, 'model_dim': 343, 'num_layers': 3, 'dropout': 0.34677570552132453, 'learning_rate': 0.0004896068755779234, 'weight_decay': 0.0010419273042945392, 'num_epochs': 54}. Best is trial 12 with value: 20.73723793029785.


Trial: 15 - Loss: 100.73916625976562 - Val Loss: 208.57550048828125


[I 2024-06-20 05:15:48,558] Trial 16 finished with value: 40.06589126586914 and parameters: {'num_heads': 8, 'model_dim': 512, 'num_layers': 2, 'dropout': 0.23274809561534443, 'learning_rate': 0.0026831785865799648, 'weight_decay': 0.00018989109013022178, 'num_epochs': 25}. Best is trial 12 with value: 20.73723793029785.


Trial: 16 - Loss: 17.608007431030273 - Val Loss: 40.06589126586914


[I 2024-06-20 05:15:48,954] Trial 17 finished with value: 560.8413696289062 and parameters: {'num_heads': 6, 'model_dim': 240, 'num_layers': 3, 'dropout': 0.35080423795131543, 'learning_rate': 1.67135769216648e-05, 'weight_decay': 0.009586962973060805, 'num_epochs': 47}. Best is trial 12 with value: 20.73723793029785.
[I 2024-06-20 05:15:49,045] Trial 18 finished with value: 612.873046875 and parameters: {'num_heads': 6, 'model_dim': 240, 'num_layers': 1, 'dropout': 0.20055380002590883, 'learning_rate': 4.992185739660803e-05, 'weight_decay': 5.0032640576311956e-05, 'num_epochs': 24}. Best is trial 12 with value: 20.73723793029785.


Trial: 17 - Loss: 357.1701354980469 - Val Loss: 560.8413696289062
Trial: 18 - Loss: 380.2933654785156 - Val Loss: 612.873046875


[I 2024-06-20 05:15:49,231] Trial 19 finished with value: 38.64103317260742 and parameters: {'num_heads': 8, 'model_dim': 448, 'num_layers': 4, 'dropout': 0.4295003545485234, 'learning_rate': 0.004965328621329504, 'weight_decay': 0.0007931567385524069, 'num_epochs': 10}. Best is trial 12 with value: 20.73723793029785.


Trial: 19 - Loss: 21.98247528076172 - Val Loss: 38.64103317260742


[I 2024-06-20 05:15:49,835] Trial 20 finished with value: 436.6977233886719 and parameters: {'num_heads': 4, 'model_dim': 180, 'num_layers': 5, 'dropout': 0.2604410414873085, 'learning_rate': 0.0002250216163554427, 'weight_decay': 0.00016743844399465512, 'num_epochs': 72}. Best is trial 12 with value: 20.73723793029785.
[I 2024-06-20 05:15:49,977] Trial 21 finished with value: 62.879844665527344 and parameters: {'num_heads': 7, 'model_dim': 315, 'num_layers': 2, 'dropout': 0.18166987179444496, 'learning_rate': 0.009027241317918959, 'weight_decay': 3.9176066475936964e-05, 'num_epochs': 19}. Best is trial 12 with value: 20.73723793029785.


Trial: 20 - Loss: 258.8982238769531 - Val Loss: 436.6977233886719
Trial: 21 - Loss: 20.139381408691406 - Val Loss: 62.879844665527344


[I 2024-06-20 05:15:50,151] Trial 22 finished with value: 29.2661075592041 and parameters: {'num_heads': 7, 'model_dim': 280, 'num_layers': 2, 'dropout': 0.18281848551160215, 'learning_rate': 0.006252543346859708, 'weight_decay': 5.924455820089751e-05, 'num_epochs': 30}. Best is trial 12 with value: 20.73723793029785.
[I 2024-06-20 05:15:50,244] Trial 23 finished with value: 43.786766052246094 and parameters: {'num_heads': 8, 'model_dim': 392, 'num_layers': 1, 'dropout': 0.31252940846321864, 'learning_rate': 0.009918205478513497, 'weight_decay': 1.9638315625760825e-05, 'num_epochs': 17}. Best is trial 12 with value: 20.73723793029785.


Trial: 22 - Loss: 17.779010772705078 - Val Loss: 29.2661075592041
Trial: 23 - Loss: 22.868526458740234 - Val Loss: 43.786766052246094


[I 2024-06-20 05:15:50,378] Trial 24 finished with value: 193.0010528564453 and parameters: {'num_heads': 7, 'model_dim': 371, 'num_layers': 3, 'dropout': 0.24124792816427995, 'learning_rate': 0.0027695218212852387, 'weight_decay': 0.0001251039002332816, 'num_epochs': 11}. Best is trial 12 with value: 20.73723793029785.


Trial: 24 - Loss: 109.92449188232422 - Val Loss: 193.0010528564453


[I 2024-06-20 05:15:50,586] Trial 25 finished with value: 19.274085998535156 and parameters: {'num_heads': 6, 'model_dim': 288, 'num_layers': 2, 'dropout': 0.1747904448232519, 'learning_rate': 0.004368027473204212, 'weight_decay': 4.4575618191414555e-05, 'num_epochs': 22}. Best is trial 25 with value: 19.274085998535156.


Trial: 25 - Loss: 24.770845413208008 - Val Loss: 19.274085998535156


[I 2024-06-20 05:15:50,844] Trial 26 finished with value: 37.94383239746094 and parameters: {'num_heads': 6, 'model_dim': 294, 'num_layers': 2, 'dropout': 0.33728113203903437, 'learning_rate': 0.004323873036611875, 'weight_decay': 0.00032867893516931856, 'num_epochs': 43}. Best is trial 25 with value: 19.274085998535156.


Trial: 26 - Loss: 17.373991012573242 - Val Loss: 37.94383239746094
Trial: 27 - Loss: 47.687599182128906 - Val Loss: 110.94593811035156


[I 2024-06-20 05:15:51,043] Trial 27 finished with value: 110.94593811035156 and parameters: {'num_heads': 5, 'model_dim': 250, 'num_layers': 3, 'dropout': 0.1004146936613236, 'learning_rate': 0.002296337938959732, 'weight_decay': 1.1483380927970389e-05, 'num_epochs': 25}. Best is trial 25 with value: 19.274085998535156.
[I 2024-06-20 05:15:51,214] Trial 28 finished with value: 21.96181297302246 and parameters: {'num_heads': 8, 'model_dim': 480, 'num_layers': 1, 'dropout': 0.27919545970327986, 'learning_rate': 0.0017248797015368626, 'weight_decay': 0.00010775924944067684, 'num_epochs': 32}. Best is trial 25 with value: 19.274085998535156.
[I 2024-06-20 05:15:51,310] Trial 29 finished with value: 317.69793701171875 and parameters: {'num_heads': 6, 'model_dim': 354, 'num_layers': 1, 'dropout': 0.22890200617382114, 'learning_rate': 0.0007996397310887314, 'weight_decay': 2.589659137826685e-05, 'num_epochs': 21}. Best is trial 25 with value: 19.274085998535156.


Trial: 28 - Loss: 20.6373348236084 - Val Loss: 21.96181297302246
Trial: 29 - Loss: 184.1362762451172 - Val Loss: 317.69793701171875


[I 2024-06-20 05:15:51,569] Trial 30 finished with value: 59.27729415893555 and parameters: {'num_heads': 7, 'model_dim': 196, 'num_layers': 2, 'dropout': 0.3794895035403352, 'learning_rate': 0.005999444437814983, 'weight_decay': 0.002569747430381604, 'num_epochs': 56}. Best is trial 25 with value: 19.274085998535156.
[I 2024-06-20 05:15:51,729] Trial 31 finished with value: 21.064754486083984 and parameters: {'num_heads': 8, 'model_dim': 488, 'num_layers': 1, 'dropout': 0.283039825634414, 'learning_rate': 0.002044710208593224, 'weight_decay': 0.00011176449034363022, 'num_epochs': 33}. Best is trial 25 with value: 19.274085998535156.


Trial: 30 - Loss: 13.035360336303711 - Val Loss: 59.27729415893555
Trial: 31 - Loss: 23.983489990234375 - Val Loss: 21.064754486083984


[I 2024-06-20 05:15:51,869] Trial 32 finished with value: 25.21645164489746 and parameters: {'num_heads': 8, 'model_dim': 456, 'num_layers': 1, 'dropout': 0.2542675472807794, 'learning_rate': 0.0018297640010255581, 'weight_decay': 6.160001531375616e-05, 'num_epochs': 29}. Best is trial 25 with value: 19.274085998535156.
[I 2024-06-20 05:15:52,019] Trial 33 finished with value: 19.148828506469727 and parameters: {'num_heads': 8, 'model_dim': 512, 'num_layers': 1, 'dropout': 0.290548361759721, 'learning_rate': 0.004528237928404283, 'weight_decay': 0.00021824162816915549, 'num_epochs': 16}. Best is trial 33 with value: 19.148828506469727.


Trial: 32 - Loss: 18.346412658691406 - Val Loss: 25.21645164489746
Trial: 33 - Loss: 41.18135070800781 - Val Loss: 19.148828506469727


[I 2024-06-20 05:15:52,177] Trial 34 finished with value: 18.249019622802734 and parameters: {'num_heads': 7, 'model_dim': 378, 'num_layers': 1, 'dropout': 0.30410134135080924, 'learning_rate': 0.004194119858034377, 'weight_decay': 0.0006282980804356609, 'num_epochs': 20}. Best is trial 34 with value: 18.249019622802734.
[I 2024-06-20 05:15:52,272] Trial 35 finished with value: 18.28067398071289 and parameters: {'num_heads': 7, 'model_dim': 378, 'num_layers': 1, 'dropout': 0.3231170898566288, 'learning_rate': 0.004244318344886109, 'weight_decay': 0.000689085862201119, 'num_epochs': 18}. Best is trial 34 with value: 18.249019622802734.
[I 2024-06-20 05:15:52,359] Trial 36 finished with value: 327.80816650390625 and parameters: {'num_heads': 5, 'model_dim': 270, 'num_layers': 1, 'dropout': 0.39051745871468074, 'learning_rate': 0.0009945180393196263, 'weight_decay': 0.0006606338818357519, 'num_epochs': 21}. Best is trial 34 with value: 18.249019622802734.


Trial: 34 - Loss: 30.975908279418945 - Val Loss: 18.249019622802734
Trial: 35 - Loss: 26.733556747436523 - Val Loss: 18.28067398071289
Trial: 36 - Loss: 191.44664001464844 - Val Loss: 327.80816650390625


[I 2024-06-20 05:15:52,506] Trial 37 finished with value: 30.262174606323242 and parameters: {'num_heads': 6, 'model_dim': 312, 'num_layers': 1, 'dropout': 0.4198319124671357, 'learning_rate': 0.003887942850435392, 'weight_decay': 0.0019689946337590547, 'num_epochs': 39}. Best is trial 34 with value: 18.249019622802734.
[I 2024-06-20 05:15:52,611] Trial 38 finished with value: 450.13885498046875 and parameters: {'num_heads': 7, 'model_dim': 210, 'num_layers': 1, 'dropout': 0.3257268494169371, 'learning_rate': 0.00044394816412781854, 'weight_decay': 0.00039574784040578013, 'num_epochs': 27}. Best is trial 34 with value: 18.249019622802734.


Trial: 37 - Loss: 17.139572143554688 - Val Loss: 30.262174606323242
Trial: 38 - Loss: 275.52716064453125 - Val Loss: 450.13885498046875


[I 2024-06-20 05:15:52,801] Trial 39 finished with value: 227.93724060058594 and parameters: {'num_heads': 4, 'model_dim': 100, 'num_layers': 6, 'dropout': 0.35207757223005326, 'learning_rate': 0.0060806444079088235, 'weight_decay': 0.0005412266175439702, 'num_epochs': 16}. Best is trial 34 with value: 18.249019622802734.
[I 2024-06-20 05:15:52,971] Trial 40 finished with value: 32.00625991821289 and parameters: {'num_heads': 6, 'model_dim': 342, 'num_layers': 1, 'dropout': 0.15676367675257818, 'learning_rate': 0.0014446811753804658, 'weight_decay': 0.00023996325507159795, 'num_epochs': 47}. Best is trial 34 with value: 18.249019622802734.


Trial: 39 - Loss: 130.1746063232422 - Val Loss: 227.93724060058594
Trial: 40 - Loss: 17.399442672729492 - Val Loss: 32.00625991821289


[I 2024-06-20 05:15:53,074] Trial 41 finished with value: 18.24604606628418 and parameters: {'num_heads': 7, 'model_dim': 378, 'num_layers': 1, 'dropout': 0.3721970849272195, 'learning_rate': 0.004148610460793607, 'weight_decay': 0.0012317598778010266, 'num_epochs': 19}. Best is trial 41 with value: 18.24604606628418.
[I 2024-06-20 05:15:53,188] Trial 42 finished with value: 18.296218872070312 and parameters: {'num_heads': 7, 'model_dim': 378, 'num_layers': 1, 'dropout': 0.3667242674650398, 'learning_rate': 0.0038889702833063045, 'weight_decay': 0.00134990714361698, 'num_epochs': 20}. Best is trial 41 with value: 18.24604606628418.


Trial: 41 - Loss: 28.117250442504883 - Val Loss: 18.24604606628418
Trial: 42 - Loss: 27.17732048034668 - Val Loss: 18.296218872070312


[I 2024-06-20 05:15:53,283] Trial 43 finished with value: 32.52521514892578 and parameters: {'num_heads': 7, 'model_dim': 399, 'num_layers': 1, 'dropout': 0.3684505095997122, 'learning_rate': 0.0034905428927607413, 'weight_decay': 0.0016404170349235496, 'num_epochs': 16}. Best is trial 41 with value: 18.24604606628418.
[I 2024-06-20 05:15:53,369] Trial 44 finished with value: 43.752899169921875 and parameters: {'num_heads': 3, 'model_dim': 153, 'num_layers': 1, 'dropout': 0.4305890110532449, 'learning_rate': 0.006307065992088338, 'weight_decay': 0.0012850197561523815, 'num_epochs': 19}. Best is trial 41 with value: 18.24604606628418.


Trial: 43 - Loss: 18.139047622680664 - Val Loss: 32.52521514892578
Trial: 44 - Loss: 21.36028289794922 - Val Loss: 43.752899169921875


[I 2024-06-20 05:15:53,628] Trial 45 finished with value: 43.89727783203125 and parameters: {'num_heads': 7, 'model_dim': 378, 'num_layers': 1, 'dropout': 0.29565978145588817, 'learning_rate': 0.0010404161340744507, 'weight_decay': 0.0004970772162940215, 'num_epochs': 69}. Best is trial 41 with value: 18.24604606628418.
[I 2024-06-20 05:15:53,728] Trial 46 finished with value: 134.00210571289062 and parameters: {'num_heads': 7, 'model_dim': 413, 'num_layers': 1, 'dropout': 0.32536316686419575, 'learning_rate': 0.002261244528864835, 'weight_decay': 0.003333613175776856, 'num_epochs': 14}. Best is trial 41 with value: 18.24604606628418.


Trial: 45 - Loss: 14.339395523071289 - Val Loss: 43.89727783203125
Trial: 46 - Loss: 70.12531280517578 - Val Loss: 134.00210571289062


[I 2024-06-20 05:15:53,982] Trial 47 finished with value: 421.9248046875 and parameters: {'num_heads': 5, 'model_dim': 305, 'num_layers': 1, 'dropout': 0.41545537381992154, 'learning_rate': 9.249533727717249e-05, 'weight_decay': 0.0008844680329218905, 'num_epochs': 79}. Best is trial 41 with value: 18.24604606628418.


Trial: 47 - Loss: 252.930419921875 - Val Loss: 421.9248046875


[I 2024-06-20 05:15:54,322] Trial 48 finished with value: 481.6212158203125 and parameters: {'num_heads': 1, 'model_dim': 20, 'num_layers': 5, 'dropout': 0.36486407913601293, 'learning_rate': 0.0032697064432526245, 'weight_decay': 0.0013591105613605112, 'num_epochs': 38}. Best is trial 41 with value: 18.24604606628418.
[I 2024-06-20 05:15:54,460] Trial 49 finished with value: 458.1384582519531 and parameters: {'num_heads': 2, 'model_dim': 84, 'num_layers': 2, 'dropout': 0.46301114533065324, 'learning_rate': 0.0013017749706581538, 'weight_decay': 0.00027767008306633413, 'num_epochs': 28}. Best is trial 41 with value: 18.24604606628418.


Trial: 48 - Loss: 295.5270690917969 - Val Loss: 481.6212158203125
Trial: 49 - Loss: 280.2736511230469 - Val Loss: 458.1384582519531


[I 2024-06-20 05:15:54,608] Trial 50 finished with value: 54.644779205322266 and parameters: {'num_heads': 7, 'model_dim': 420, 'num_layers': 1, 'dropout': 0.29759461017679023, 'learning_rate': 0.004913320238581873, 'weight_decay': 0.0038579535014383955, 'num_epochs': 35}. Best is trial 41 with value: 18.24604606628418.
[I 2024-06-20 05:15:54,761] Trial 51 finished with value: 18.246702194213867 and parameters: {'num_heads': 6, 'model_dim': 354, 'num_layers': 2, 'dropout': 0.3303711788332658, 'learning_rate': 0.004037687744859818, 'weight_decay': 0.000636960071059073, 'num_epochs': 22}. Best is trial 41 with value: 18.24604606628418.


Trial: 50 - Loss: 14.310009002685547 - Val Loss: 54.644779205322266
Trial: 51 - Loss: 31.82982635498047 - Val Loss: 18.246702194213867


[I 2024-06-20 05:15:54,842] Trial 52 finished with value: 18.79729461669922 and parameters: {'num_heads': 7, 'model_dim': 357, 'num_layers': 1, 'dropout': 0.32245715432318955, 'learning_rate': 0.007219454975145755, 'weight_decay': 0.0007352385760058681, 'num_epochs': 14}. Best is trial 41 with value: 18.24604606628418.


Trial: 52 - Loss: 40.614112854003906 - Val Loss: 18.79729461669922


[I 2024-06-20 05:15:55,445] Trial 53 finished with value: 34.060848236083984 and parameters: {'num_heads': 6, 'model_dim': 354, 'num_layers': 2, 'dropout': 0.3175931283118292, 'learning_rate': 0.007225232043020346, 'weight_decay': 0.000702264352940192, 'num_epochs': 100}. Best is trial 41 with value: 18.24604606628418.
[I 2024-06-20 05:15:55,557] Trial 54 finished with value: 24.161779403686523 and parameters: {'num_heads': 7, 'model_dim': 378, 'num_layers': 1, 'dropout': 0.3361201513092549, 'learning_rate': 0.0027689194980849686, 'weight_decay': 0.0011056954242325383, 'num_epochs': 23}. Best is trial 41 with value: 18.24604606628418.
[I 2024-06-20 05:15:55,633] Trial 55 finished with value: 22.57073402404785 and parameters: {'num_heads': 7, 'model_dim': 329, 'num_layers': 1, 'dropout': 0.39974631925489373, 'learning_rate': 0.008115404986132951, 'weight_decay': 0.002199893137787125, 'num_epochs': 14}. Best is trial 41 with value: 18.24604606628418.


Trial: 53 - Loss: 17.517724990844727 - Val Loss: 34.060848236083984
Trial: 54 - Loss: 18.450647354125977 - Val Loss: 24.161779403686523
Trial: 55 - Loss: 50.16481018066406 - Val Loss: 22.57073402404785


[I 2024-06-20 05:15:55,774] Trial 56 finished with value: 24.986221313476562 and parameters: {'num_heads': 6, 'model_dim': 360, 'num_layers': 2, 'dropout': 0.3624040998805469, 'learning_rate': 0.003476748354413935, 'weight_decay': 0.0006120545197344106, 'num_epochs': 19}. Best is trial 41 with value: 18.24604606628418.
[I 2024-06-20 05:15:55,958] Trial 57 finished with value: 24.05054473876953 and parameters: {'num_heads': 7, 'model_dim': 392, 'num_layers': 2, 'dropout': 0.306485153298543, 'learning_rate': 0.007185659026295518, 'weight_decay': 0.000419784902182068, 'num_epochs': 25}. Best is trial 41 with value: 18.24604606628418.


Trial: 56 - Loss: 18.075984954833984 - Val Loss: 24.986221313476562
Trial: 57 - Loss: 19.31900405883789 - Val Loss: 24.05054473876953


[I 2024-06-20 05:15:56,037] Trial 58 finished with value: 644.2015991210938 and parameters: {'num_heads': 6, 'model_dim': 336, 'num_layers': 1, 'dropout': 0.33754414048584996, 'learning_rate': 2.3950246867467697e-05, 'weight_decay': 0.0010112745098192063, 'num_epochs': 13}. Best is trial 41 with value: 18.24604606628418.


Trial: 58 - Loss: 418.5084228515625 - Val Loss: 644.2015991210938


[I 2024-06-20 05:15:56,426] Trial 59 finished with value: 34.379432678222656 and parameters: {'num_heads': 7, 'model_dim': 371, 'num_layers': 2, 'dropout': 0.3772089874136302, 'learning_rate': 0.00276132012607109, 'weight_decay': 0.001556131883083077, 'num_epochs': 66}. Best is trial 41 with value: 18.24604606628418.


Trial: 59 - Loss: 17.39006805419922 - Val Loss: 34.379432678222656


[I 2024-06-20 05:15:56,689] Trial 60 finished with value: 18.217418670654297 and parameters: {'num_heads': 5, 'model_dim': 305, 'num_layers': 4, 'dropout': 0.3944508870189298, 'learning_rate': 0.0052776377703212364, 'weight_decay': 0.0007789453603024966, 'num_epochs': 19}. Best is trial 60 with value: 18.217418670654297.


Trial: 60 - Loss: 30.09351348876953 - Val Loss: 18.217418670654297


[I 2024-06-20 05:15:56,907] Trial 61 finished with value: 18.2850284576416 and parameters: {'num_heads': 5, 'model_dim': 305, 'num_layers': 4, 'dropout': 0.4000908395347996, 'learning_rate': 0.005317228302739591, 'weight_decay': 0.0008321480348288606, 'num_epochs': 19}. Best is trial 60 with value: 18.217418670654297.


Trial: 61 - Loss: 32.76743698120117 - Val Loss: 18.2850284576416


[I 2024-06-20 05:15:57,139] Trial 62 finished with value: 18.316726684570312 and parameters: {'num_heads': 5, 'model_dim': 305, 'num_layers': 4, 'dropout': 0.3987044642338011, 'learning_rate': 0.005503563765229637, 'weight_decay': 0.0003402674324664869, 'num_epochs': 19}. Best is trial 60 with value: 18.217418670654297.


Trial: 62 - Loss: 33.91288375854492 - Val Loss: 18.316726684570312


[I 2024-06-20 05:15:57,341] Trial 63 finished with value: 33.24732208251953 and parameters: {'num_heads': 4, 'model_dim': 248, 'num_layers': 4, 'dropout': 0.45518142369630876, 'learning_rate': 0.004031948676151339, 'weight_decay': 0.00089479101105145, 'num_epochs': 22}. Best is trial 60 with value: 18.217418670654297.


Trial: 63 - Loss: 18.065216064453125 - Val Loss: 33.24732208251953


[I 2024-06-20 05:15:57,732] Trial 64 finished with value: 29.24492835998535 and parameters: {'num_heads': 5, 'model_dim': 305, 'num_layers': 5, 'dropout': 0.4385156781803253, 'learning_rate': 0.00527398696602703, 'weight_decay': 0.0005289136121311039, 'num_epochs': 31}. Best is trial 60 with value: 18.217418670654297.


Trial: 64 - Loss: 18.50860023498535 - Val Loss: 29.24492835998535


[I 2024-06-20 05:15:58,022] Trial 65 finished with value: 103.14952087402344 and parameters: {'num_heads': 4, 'model_dim': 248, 'num_layers': 4, 'dropout': 0.39673995124474354, 'learning_rate': 0.0021683598306853572, 'weight_decay': 0.0011605671564565861, 'num_epochs': 27}. Best is trial 60 with value: 18.217418670654297.


Trial: 65 - Loss: 42.900672912597656 - Val Loss: 103.14952087402344


[I 2024-06-20 05:15:58,278] Trial 66 finished with value: 65.86647033691406 and parameters: {'num_heads': 5, 'model_dim': 295, 'num_layers': 5, 'dropout': 0.38131034849346884, 'learning_rate': 0.009984301585099262, 'weight_decay': 0.0016937730433142675, 'num_epochs': 18}. Best is trial 60 with value: 18.217418670654297.
[I 2024-06-20 05:15:58,426] Trial 67 finished with value: 220.7423553466797 and parameters: {'num_heads': 6, 'model_dim': 324, 'num_layers': 4, 'dropout': 0.3529190162672448, 'learning_rate': 0.0031804077427192634, 'weight_decay': 0.0008294055319405426, 'num_epochs': 10}. Best is trial 60 with value: 18.217418670654297.


Trial: 66 - Loss: 21.62723731994629 - Val Loss: 65.86647033691406
Trial: 67 - Loss: 130.302001953125 - Val Loss: 220.7423553466797


[I 2024-06-20 05:15:58,684] Trial 68 finished with value: 34.04172134399414 and parameters: {'num_heads': 5, 'model_dim': 275, 'num_layers': 4, 'dropout': 0.40874924736931023, 'learning_rate': 0.004618732238980128, 'weight_decay': 0.0025060983758704307, 'num_epochs': 25}. Best is trial 60 with value: 18.217418670654297.


Trial: 68 - Loss: 18.166994094848633 - Val Loss: 34.04172134399414


[I 2024-06-20 05:15:58,899] Trial 69 finished with value: 139.5942840576172 and parameters: {'num_heads': 6, 'model_dim': 348, 'num_layers': 3, 'dropout': 0.44514990670928595, 'learning_rate': 0.0016863364143987045, 'weight_decay': 0.0004143473127322002, 'num_epochs': 22}. Best is trial 60 with value: 18.217418670654297.
[I 2024-06-20 05:15:59,012] Trial 70 finished with value: 378.4442138671875 and parameters: {'num_heads': 3, 'model_dim': 186, 'num_layers': 3, 'dropout': 0.3422197795902723, 'learning_rate': 0.0023342432361098315, 'weight_decay': 0.006945476198484388, 'num_epochs': 12}. Best is trial 60 with value: 18.217418670654297.


Trial: 69 - Loss: 64.8638687133789 - Val Loss: 139.5942840576172
Trial: 70 - Loss: 230.24423217773438 - Val Loss: 378.4442138671875


[I 2024-06-20 05:15:59,238] Trial 71 finished with value: 18.389122009277344 and parameters: {'num_heads': 5, 'model_dim': 310, 'num_layers': 4, 'dropout': 0.3853249835733543, 'learning_rate': 0.005453617226238782, 'weight_decay': 0.0003141159581969741, 'num_epochs': 19}. Best is trial 60 with value: 18.217418670654297.


Trial: 71 - Loss: 35.390045166015625 - Val Loss: 18.389122009277344


[I 2024-06-20 05:15:59,512] Trial 72 finished with value: 34.99715042114258 and parameters: {'num_heads': 5, 'model_dim': 300, 'num_layers': 4, 'dropout': 0.40732897915901267, 'learning_rate': 0.0036400402924580207, 'weight_decay': 0.00035811399122795065, 'num_epochs': 20}. Best is trial 60 with value: 18.217418670654297.
[I 2024-06-20 05:15:59,675] Trial 73 finished with value: 22.293155670166016 and parameters: {'num_heads': 4, 'model_dim': 252, 'num_layers': 4, 'dropout': 0.36935624721880683, 'learning_rate': 0.005767636768468456, 'weight_decay': 0.0005698434126460027, 'num_epochs': 17}. Best is trial 60 with value: 18.217418670654297.


Trial: 72 - Loss: 17.62860870361328 - Val Loss: 34.99715042114258
Trial: 73 - Loss: 18.575847625732422 - Val Loss: 22.293155670166016


[I 2024-06-20 05:16:00,015] Trial 74 finished with value: 29.112377166748047 and parameters: {'num_heads': 5, 'model_dim': 310, 'num_layers': 5, 'dropout': 0.35906798638545717, 'learning_rate': 0.007975808791516338, 'weight_decay': 0.000943484801118703, 'num_epochs': 26}. Best is trial 60 with value: 18.217418670654297.


Trial: 74 - Loss: 17.267532348632812 - Val Loss: 29.112377166748047


[I 2024-06-20 05:16:00,351] Trial 75 finished with value: 48.82176208496094 and parameters: {'num_heads': 6, 'model_dim': 366, 'num_layers': 4, 'dropout': 0.4933361697533121, 'learning_rate': 0.003958658388062055, 'weight_decay': 0.000469513318998921, 'num_epochs': 29}. Best is trial 60 with value: 18.217418670654297.


Trial: 75 - Loss: 20.99380874633789 - Val Loss: 48.82176208496094


[I 2024-06-20 05:16:00,582] Trial 76 finished with value: 32.97034454345703 and parameters: {'num_heads': 6, 'model_dim': 342, 'num_layers': 3, 'dropout': 0.4225335245682278, 'learning_rate': 0.002708938268667217, 'weight_decay': 0.0006844113989609239, 'num_epochs': 24}. Best is trial 60 with value: 18.217418670654297.


Trial: 76 - Loss: 17.50055503845215 - Val Loss: 32.97034454345703


[I 2024-06-20 05:16:00,833] Trial 77 finished with value: 21.217317581176758 and parameters: {'num_heads': 5, 'model_dim': 290, 'num_layers': 4, 'dropout': 0.39887715885176744, 'learning_rate': 0.006704350400458411, 'weight_decay': 0.0012649560646882227, 'num_epochs': 21}. Best is trial 60 with value: 18.217418670654297.


Trial: 77 - Loss: 29.57936668395996 - Val Loss: 21.217317581176758


[I 2024-06-20 05:16:01,416] Trial 78 finished with value: 25.869022369384766 and parameters: {'num_heads': 8, 'model_dim': 440, 'num_layers': 5, 'dropout': 0.30899177369738196, 'learning_rate': 0.004563205572467919, 'weight_decay': 0.0002874281468055579, 'num_epochs': 33}. Best is trial 60 with value: 18.217418670654297.
[I 2024-06-20 05:16:01,559] Trial 79 finished with value: 39.9833984375 and parameters: {'num_heads': 4, 'model_dim': 252, 'num_layers': 3, 'dropout': 0.3714756881725923, 'learning_rate': 0.005268395238234167, 'weight_decay': 0.00014401245834685773, 'num_epochs': 16}. Best is trial 60 with value: 18.217418670654297.


Trial: 78 - Loss: 18.656312942504883 - Val Loss: 25.869022369384766
Trial: 79 - Loss: 20.504526138305664 - Val Loss: 39.9833984375


[I 2024-06-20 05:16:01,733] Trial 80 finished with value: 480.73614501953125 and parameters: {'num_heads': 6, 'model_dim': 324, 'num_layers': 4, 'dropout': 0.3876470899942006, 'learning_rate': 0.0003521950965059001, 'weight_decay': 0.0008225931408335878, 'num_epochs': 12}. Best is trial 60 with value: 18.217418670654297.


Trial: 80 - Loss: 295.33831787109375 - Val Loss: 480.73614501953125


[I 2024-06-20 05:16:01,957] Trial 81 finished with value: 18.263124465942383 and parameters: {'num_heads': 5, 'model_dim': 310, 'num_layers': 4, 'dropout': 0.3881460829770379, 'learning_rate': 0.0053052202765253, 'weight_decay': 0.00033395475351884484, 'num_epochs': 19}. Best is trial 60 with value: 18.217418670654297.
[I 2024-06-20 05:16:02,153] Trial 82 finished with value: 21.54165267944336 and parameters: {'num_heads': 5, 'model_dim': 265, 'num_layers': 4, 'dropout': 0.33154049446696904, 'learning_rate': 0.008439143617723474, 'weight_decay': 0.00024869317772605265, 'num_epochs': 18}. Best is trial 60 with value: 18.217418670654297.


Trial: 81 - Loss: 32.678768157958984 - Val Loss: 18.263124465942383
Trial: 82 - Loss: 30.937227249145508 - Val Loss: 21.54165267944336


[I 2024-06-20 05:16:02,435] Trial 83 finished with value: 25.520978927612305 and parameters: {'num_heads': 5, 'model_dim': 315, 'num_layers': 4, 'dropout': 0.348412618241143, 'learning_rate': 0.00334341744137771, 'weight_decay': 0.00020501418563490895, 'num_epochs': 23}. Best is trial 60 with value: 18.217418670654297.


Trial: 83 - Loss: 19.01015853881836 - Val Loss: 25.520978927612305


[I 2024-06-20 05:16:03,210] Trial 84 finished with value: 34.83238983154297 and parameters: {'num_heads': 5, 'model_dim': 280, 'num_layers': 4, 'dropout': 0.3572372140717318, 'learning_rate': 0.004242581994222749, 'weight_decay': 0.00037018636264831323, 'num_epochs': 88}. Best is trial 60 with value: 18.217418670654297.


Trial: 84 - Loss: 17.87065315246582 - Val Loss: 34.83238983154297


[I 2024-06-20 05:16:03,519] Trial 85 finished with value: 38.54035949707031 and parameters: {'num_heads': 7, 'model_dim': 392, 'num_layers': 4, 'dropout': 0.40893293587629576, 'learning_rate': 0.00616687643300476, 'weight_decay': 0.0015064553547155409, 'num_epochs': 20}. Best is trial 60 with value: 18.217418670654297.
[I 2024-06-20 05:16:03,608] Trial 86 finished with value: 295.5793762207031 and parameters: {'num_heads': 4, 'model_dim': 236, 'num_layers': 1, 'dropout': 0.39194326798892376, 'learning_rate': 0.0019042022854177385, 'weight_decay': 0.000595781345485578, 'num_epochs': 16}. Best is trial 60 with value: 18.217418670654297.
[I 2024-06-20 05:16:03,691] Trial 87 finished with value: 23.976072311401367 and parameters: {'num_heads': 6, 'model_dim': 366, 'num_layers': 1, 'dropout': 0.26817076453124367, 'learning_rate': 0.005204925010305698, 'weight_decay': 0.0004835759220437396, 'num_epochs': 13}. Best is trial 60 with value: 18.217418670654297.


Trial: 85 - Loss: 19.390539169311523 - Val Loss: 38.54035949707031
Trial: 86 - Loss: 172.77532958984375 - Val Loss: 295.5793762207031
Trial: 87 - Loss: 17.645822525024414 - Val Loss: 23.976072311401367


[I 2024-06-20 05:16:03,943] Trial 88 finished with value: 30.09083366394043 and parameters: {'num_heads': 7, 'model_dim': 385, 'num_layers': 3, 'dropout': 0.43239117867445365, 'learning_rate': 0.0026206414841421825, 'weight_decay': 0.0018500430490414126, 'num_epochs': 23}. Best is trial 60 with value: 18.217418670654297.


Trial: 88 - Loss: 17.792564392089844 - Val Loss: 30.09083366394043


[I 2024-06-20 05:16:04,262] Trial 89 finished with value: 288.3927307128906 and parameters: {'num_heads': 6, 'model_dim': 348, 'num_layers': 4, 'dropout': 0.3768953907210748, 'learning_rate': 0.0006904943545132548, 'weight_decay': 0.001063231914708009, 'num_epochs': 28}. Best is trial 60 with value: 18.217418670654297.


Trial: 89 - Loss: 156.9557647705078 - Val Loss: 288.3927307128906


[I 2024-06-20 05:16:04,908] Trial 90 finished with value: 31.10697364807129 and parameters: {'num_heads': 8, 'model_dim': 416, 'num_layers': 5, 'dropout': 0.2858158585561312, 'learning_rate': 0.0037611480652855627, 'weight_decay': 0.0007239537761276657, 'num_epochs': 36}. Best is trial 60 with value: 18.217418670654297.


Trial: 90 - Loss: 17.54404067993164 - Val Loss: 31.10697364807129


[I 2024-06-20 05:16:05,134] Trial 91 finished with value: 18.34684944152832 and parameters: {'num_heads': 5, 'model_dim': 315, 'num_layers': 4, 'dropout': 0.3897050816933936, 'learning_rate': 0.005879906302828575, 'weight_decay': 0.0001671781958345356, 'num_epochs': 19}. Best is trial 60 with value: 18.217418670654297.
[I 2024-06-20 05:16:05,316] Trial 92 finished with value: 21.306119918823242 and parameters: {'num_heads': 5, 'model_dim': 315, 'num_layers': 4, 'dropout': 0.41545494897240764, 'learning_rate': 0.0073965539150217165, 'weight_decay': 0.00017267744703432702, 'num_epochs': 15}. Best is trial 60 with value: 18.217418670654297.


Trial: 91 - Loss: 36.67012023925781 - Val Loss: 18.34684944152832
Trial: 92 - Loss: 46.157310485839844 - Val Loss: 21.306119918823242


[I 2024-06-20 05:16:05,514] Trial 93 finished with value: 88.1248779296875 and parameters: {'num_heads': 5, 'model_dim': 300, 'num_layers': 4, 'dropout': 0.39983295354697046, 'learning_rate': 0.0030444534374155195, 'weight_decay': 0.000334220164743961, 'num_epochs': 18}. Best is trial 60 with value: 18.217418670654297.


Trial: 93 - Loss: 38.64114761352539 - Val Loss: 88.1248779296875


[I 2024-06-20 05:16:05,754] Trial 94 finished with value: 26.174816131591797 and parameters: {'num_heads': 5, 'model_dim': 315, 'num_layers': 4, 'dropout': 0.37358467585389366, 'learning_rate': 0.006369092410331719, 'weight_decay': 0.0006116801188505418, 'num_epochs': 21}. Best is trial 60 with value: 18.217418670654297.
[I 2024-06-20 05:16:05,827] Trial 95 finished with value: 18.310789108276367 and parameters: {'num_heads': 7, 'model_dim': 329, 'num_layers': 1, 'dropout': 0.3176075301995246, 'learning_rate': 0.008814461555143427, 'weight_decay': 0.0013713538121964412, 'num_epochs': 10}. Best is trial 60 with value: 18.217418670654297.
[I 2024-06-20 05:16:05,898] Trial 96 finished with value: 22.01729965209961 and parameters: {'num_heads': 7, 'model_dim': 329, 'num_layers': 1, 'dropout': 0.3201536706481669, 'learning_rate': 0.009230369614169764, 'weight_decay': 0.0012906275369375928, 'num_epochs': 11}. Best is trial 60 with value: 18.217418670654297.


Trial: 94 - Loss: 23.919204711914062 - Val Loss: 26.174816131591797
Trial: 95 - Loss: 20.95515251159668 - Val Loss: 18.310789108276367
Trial: 96 - Loss: 37.481632232666016 - Val Loss: 22.01729965209961


[I 2024-06-20 05:16:05,973] Trial 97 finished with value: 506.68499755859375 and parameters: {'num_heads': 7, 'model_dim': 406, 'num_layers': 1, 'dropout': 0.3044650466291039, 'learning_rate': 0.0002183287025599589, 'weight_decay': 0.0020186047922250965, 'num_epochs': 10}. Best is trial 60 with value: 18.217418670654297.
[I 2024-06-20 05:16:06,077] Trial 98 finished with value: 38.292816162109375 and parameters: {'num_heads': 7, 'model_dim': 336, 'num_layers': 1, 'dropout': 0.34463495945821787, 'learning_rate': 0.004672473212254047, 'weight_decay': 0.0009508262969350424, 'num_epochs': 14}. Best is trial 60 with value: 18.217418670654297.


Trial: 97 - Loss: 316.8341979980469 - Val Loss: 506.68499755859375
Trial: 98 - Loss: 20.514841079711914 - Val Loss: 38.292816162109375


[I 2024-06-20 05:16:06,215] Trial 99 finished with value: 28.21881866455078 and parameters: {'num_heads': 7, 'model_dim': 399, 'num_layers': 1, 'dropout': 0.3272770550863353, 'learning_rate': 0.008273060955363027, 'weight_decay': 0.0024668450979883016, 'num_epochs': 26}. Best is trial 60 with value: 18.217418670654297.


Trial: 99 - Loss: 17.451608657836914 - Val Loss: 28.21881866455078


In [41]:
# Results
print(f'Número de pruebas: {len(study_st.trials)}')
trial = study_st.best_trial
print(f'Mejor prueba: {trial.number}')
print(f'Mejores parametros: {trial.params}')
print(f'Mejor valor de pérdida en validación: {trial.value}')

Número de pruebas: 100
Mejor prueba: 60
Mejores parametros: {'num_heads': 5, 'model_dim': 305, 'num_layers': 4, 'dropout': 0.3944508870189298, 'learning_rate': 0.0052776377703212364, 'weight_decay': 0.0007789453603024966, 'num_epochs': 19}
Mejor valor de pérdida en validación: 18.217418670654297


## Multi Thread

In [51]:
# configuration optuna
study_mm = optuna.create_study(direction='minimize')
study_mm.optimize(lambda trial: objective(trial, X_mm_train, y_mm_train, X_mm_test, y_mm_test, len(features), len(target)), n_trials=n_trials)

[I 2024-06-20 05:18:59,957] A new study created in memory with name: no-name-7b56daef-3104-41c0-bbb0-11a969de7cbc
[I 2024-06-20 05:19:00,174] Trial 0 finished with value: 1367.906982421875 and parameters: {'num_heads': 6, 'model_dim': 342, 'num_layers': 3, 'dropout': 0.35729459721182055, 'learning_rate': 4.41927159482419e-05, 'weight_decay': 0.005492516187005042, 'num_epochs': 34}. Best is trial 0 with value: 1367.906982421875.
[I 2024-06-20 05:19:00,310] Trial 1 finished with value: 1391.062744140625 and parameters: {'num_heads': 2, 'model_dim': 64, 'num_layers': 2, 'dropout': 0.17007905297777648, 'learning_rate': 0.0008346083363926891, 'weight_decay': 0.0005018227075829743, 'num_epochs': 35}. Best is trial 0 with value: 1367.906982421875.


Trial: 0 - Loss: 247.82350158691406 - Val Loss: 1367.906982421875
Trial: 1 - Loss: 257.7347106933594 - Val Loss: 1391.062744140625


[I 2024-06-20 05:19:00,609] Trial 2 finished with value: 1587.4559326171875 and parameters: {'num_heads': 1, 'model_dim': 15, 'num_layers': 2, 'dropout': 0.35126842549103066, 'learning_rate': 5.2298128469062746e-05, 'weight_decay': 0.00024029470137962227, 'num_epochs': 80}. Best is trial 0 with value: 1367.906982421875.
[I 2024-06-20 05:19:00,780] Trial 3 finished with value: 1457.5709228515625 and parameters: {'num_heads': 6, 'model_dim': 48, 'num_layers': 1, 'dropout': 0.17460812334645084, 'learning_rate': 0.0003653766634842455, 'weight_decay': 0.0024674739592383654, 'num_epochs': 71}. Best is trial 0 with value: 1367.906982421875.


Trial: 2 - Loss: 321.6813049316406 - Val Loss: 1587.4559326171875
Trial: 3 - Loss: 278.43524169921875 - Val Loss: 1457.5709228515625


[I 2024-06-20 05:19:01,052] Trial 4 finished with value: 1438.416259765625 and parameters: {'num_heads': 3, 'model_dim': 129, 'num_layers': 5, 'dropout': 0.4425061033003763, 'learning_rate': 0.00011597522524470304, 'weight_decay': 0.0003389139749388, 'num_epochs': 26}. Best is trial 0 with value: 1367.906982421875.
[I 2024-06-20 05:19:01,238] Trial 5 finished with value: 1559.0189208984375 and parameters: {'num_heads': 1, 'model_dim': 55, 'num_layers': 1, 'dropout': 0.1647183033809928, 'learning_rate': 4.137807220691872e-05, 'weight_decay': 3.577739784888944e-05, 'num_epochs': 69}. Best is trial 0 with value: 1367.906982421875.


Trial: 4 - Loss: 271.7032470703125 - Val Loss: 1438.416259765625
Trial: 5 - Loss: 308.0973815917969 - Val Loss: 1559.0189208984375


[I 2024-06-20 05:19:01,529] Trial 6 finished with value: 591.8261108398438 and parameters: {'num_heads': 8, 'model_dim': 144, 'num_layers': 5, 'dropout': 0.2132241301237135, 'learning_rate': 0.007523538613818884, 'weight_decay': 0.00028590349043248553, 'num_epochs': 35}. Best is trial 6 with value: 591.8261108398438.
[I 2024-06-20 05:19:01,643] Trial 7 finished with value: 1247.741943359375 and parameters: {'num_heads': 2, 'model_dim': 128, 'num_layers': 2, 'dropout': 0.24436277485692626, 'learning_rate': 0.0010694781681941956, 'weight_decay': 0.0003359694867120269, 'num_epochs': 26}. Best is trial 6 with value: 591.8261108398438.


Trial: 6 - Loss: 114.13185119628906 - Val Loss: 591.8261108398438
Trial: 7 - Loss: 217.28677368164062 - Val Loss: 1247.741943359375


[I 2024-06-20 05:19:02,484] Trial 8 finished with value: 624.1157836914062 and parameters: {'num_heads': 5, 'model_dim': 140, 'num_layers': 6, 'dropout': 0.38435169599028696, 'learning_rate': 0.004475366847201709, 'weight_decay': 4.0043742748308376e-05, 'num_epochs': 94}. Best is trial 6 with value: 591.8261108398438.
[I 2024-06-20 05:19:02,555] Trial 9 finished with value: 1560.476806640625 and parameters: {'num_heads': 4, 'model_dim': 256, 'num_layers': 3, 'dropout': 0.10370368598645699, 'learning_rate': 3.288505966918519e-05, 'weight_decay': 7.273939847952647e-05, 'num_epochs': 10}. Best is trial 6 with value: 591.8261108398438.


Trial: 8 - Loss: 112.3540267944336 - Val Loss: 624.1157836914062
Trial: 9 - Loss: 307.63922119140625 - Val Loss: 1560.476806640625


[I 2024-06-20 05:19:03,182] Trial 10 finished with value: 597.4546508789062 and parameters: {'num_heads': 8, 'model_dim': 496, 'num_layers': 5, 'dropout': 0.272335909176389, 'learning_rate': 0.00973017670551833, 'weight_decay': 1.4641569186041513e-05, 'num_epochs': 53}. Best is trial 6 with value: 591.8261108398438.


Trial: 10 - Loss: 113.35771179199219 - Val Loss: 597.4546508789062


[I 2024-06-20 05:19:03,737] Trial 11 finished with value: 629.2282104492188 and parameters: {'num_heads': 8, 'model_dim': 472, 'num_layers': 5, 'dropout': 0.26650274658244877, 'learning_rate': 0.008171290227452126, 'weight_decay': 1.289421092024182e-05, 'num_epochs': 47}. Best is trial 6 with value: 591.8261108398438.


Trial: 11 - Loss: 113.86947631835938 - Val Loss: 629.2282104492188


[I 2024-06-20 05:19:04,305] Trial 12 finished with value: 600.0194702148438 and parameters: {'num_heads': 8, 'model_dim': 464, 'num_layers': 5, 'dropout': 0.2426606637774808, 'learning_rate': 0.0029896641127178233, 'weight_decay': 0.0016764119294765635, 'num_epochs': 48}. Best is trial 6 with value: 591.8261108398438.


Trial: 12 - Loss: 113.91366577148438 - Val Loss: 600.0194702148438


[I 2024-06-20 05:19:04,876] Trial 13 finished with value: 608.8510131835938 and parameters: {'num_heads': 7, 'model_dim': 245, 'num_layers': 6, 'dropout': 0.3027219290456525, 'learning_rate': 0.00981309986554564, 'weight_decay': 1.0908853376151148e-05, 'num_epochs': 58}. Best is trial 6 with value: 591.8261108398438.


Trial: 13 - Loss: 113.76994323730469 - Val Loss: 608.8510131835938


[I 2024-06-20 05:19:05,320] Trial 14 finished with value: 624.0811767578125 and parameters: {'num_heads': 7, 'model_dim': 357, 'num_layers': 4, 'dropout': 0.3096618809603233, 'learning_rate': 0.001931570243913095, 'weight_decay': 0.0001211190018261846, 'num_epochs': 57}. Best is trial 6 with value: 591.8261108398438.
[I 2024-06-20 05:19:05,444] Trial 15 finished with value: 1417.53857421875 and parameters: {'num_heads': 8, 'model_dim': 192, 'num_layers': 4, 'dropout': 0.21032545701100175, 'learning_rate': 0.0003043227931147273, 'weight_decay': 0.0010356480514016237, 'num_epochs': 13}. Best is trial 6 with value: 591.8261108398438.


Trial: 14 - Loss: 112.75798034667969 - Val Loss: 624.0811767578125
Trial: 15 - Loss: 265.7070007324219 - Val Loss: 1417.53857421875


[I 2024-06-20 05:19:05,851] Trial 16 finished with value: 596.4447021484375 and parameters: {'num_heads': 6, 'model_dim': 192, 'num_layers': 5, 'dropout': 0.12208816063418762, 'learning_rate': 0.006152817580805805, 'weight_decay': 0.006171196747697194, 'num_epochs': 42}. Best is trial 6 with value: 591.8261108398438.


Trial: 16 - Loss: 112.19577026367188 - Val Loss: 596.4447021484375


[I 2024-06-20 05:19:06,217] Trial 17 finished with value: 1516.3922119140625 and parameters: {'num_heads': 6, 'model_dim': 186, 'num_layers': 6, 'dropout': 0.10708481739143136, 'learning_rate': 1.083026725021445e-05, 'weight_decay': 0.008960904040456364, 'num_epochs': 34}. Best is trial 6 with value: 591.8261108398438.


Trial: 17 - Loss: 296.4713134765625 - Val Loss: 1516.3922119140625


[I 2024-06-20 05:19:06,612] Trial 18 finished with value: 602.9662475585938 and parameters: {'num_heads': 5, 'model_dim': 220, 'num_layers': 4, 'dropout': 0.13723533109001715, 'learning_rate': 0.003927634578826886, 'weight_decay': 0.0006887061525378328, 'num_epochs': 43}. Best is trial 6 with value: 591.8261108398438.


Trial: 18 - Loss: 112.04196166992188 - Val Loss: 602.9662475585938


[I 2024-06-20 05:19:06,840] Trial 19 finished with value: 911.4846801757812 and parameters: {'num_heads': 7, 'model_dim': 315, 'num_layers': 5, 'dropout': 0.2031285055476403, 'learning_rate': 0.001444528111545446, 'weight_decay': 0.0031073102080163696, 'num_epochs': 20}. Best is trial 6 with value: 591.8261108398438.


Trial: 19 - Loss: 142.0367889404297 - Val Loss: 911.4846801757812


[I 2024-06-20 05:19:07,275] Trial 20 finished with value: 1339.161865234375 and parameters: {'num_heads': 4, 'model_dim': 100, 'num_layers': 6, 'dropout': 0.2042362538286931, 'learning_rate': 0.0006062942452477576, 'weight_decay': 0.00013911104779447934, 'num_epochs': 40}. Best is trial 6 with value: 591.8261108398438.


Trial: 20 - Loss: 241.0247344970703 - Val Loss: 1339.161865234375


[I 2024-06-20 05:19:07,802] Trial 21 finished with value: 625.1806030273438 and parameters: {'num_heads': 7, 'model_dim': 406, 'num_layers': 5, 'dropout': 0.2651785459928561, 'learning_rate': 0.006231904920097786, 'weight_decay': 2.55376028196563e-05, 'num_epochs': 52}. Best is trial 6 with value: 591.8261108398438.


Trial: 21 - Loss: 113.52078247070312 - Val Loss: 625.1806030273438


[I 2024-06-20 05:19:08,326] Trial 22 finished with value: 621.3854370117188 and parameters: {'num_heads': 8, 'model_dim': 296, 'num_layers': 4, 'dropout': 0.4939945250075909, 'learning_rate': 0.0021083689040275683, 'weight_decay': 0.0011556672981715805, 'num_epochs': 61}. Best is trial 6 with value: 591.8261108398438.


Trial: 22 - Loss: 112.89698028564453 - Val Loss: 621.3854370117188


[I 2024-06-20 05:19:08,580] Trial 23 finished with value: 506.2815856933594 and parameters: {'num_heads': 6, 'model_dim': 162, 'num_layers': 5, 'dropout': 0.1299600456318201, 'learning_rate': 0.005499555061882289, 'weight_decay': 7.970048557227937e-05, 'num_epochs': 25}. Best is trial 23 with value: 506.2815856933594.


Trial: 23 - Loss: 118.32936096191406 - Val Loss: 506.2815856933594


[I 2024-06-20 05:19:08,788] Trial 24 finished with value: 708.312744140625 and parameters: {'num_heads': 5, 'model_dim': 165, 'num_layers': 4, 'dropout': 0.13666160072584765, 'learning_rate': 0.003197059835899757, 'weight_decay': 0.00015126916021228996, 'num_epochs': 25}. Best is trial 23 with value: 506.2815856933594.


Trial: 24 - Loss: 116.44338989257812 - Val Loss: 708.312744140625


[I 2024-06-20 05:19:09,051] Trial 25 finished with value: 587.91650390625 and parameters: {'num_heads': 6, 'model_dim': 156, 'num_layers': 6, 'dropout': 0.13278611383113495, 'learning_rate': 0.004883048043258813, 'weight_decay': 9.754174649451921e-05, 'num_epochs': 21}. Best is trial 23 with value: 506.2815856933594.


Trial: 25 - Loss: 113.21260070800781 - Val Loss: 587.91650390625


[I 2024-06-20 05:19:09,268] Trial 26 finished with value: 1255.7183837890625 and parameters: {'num_heads': 7, 'model_dim': 98, 'num_layers': 6, 'dropout': 0.15468488290655014, 'learning_rate': 0.0021555843561220345, 'weight_decay': 7.59742930736305e-05, 'num_epochs': 18}. Best is trial 23 with value: 506.2815856933594.


Trial: 26 - Loss: 221.3553924560547 - Val Loss: 1255.7183837890625


[I 2024-06-20 05:19:09,579] Trial 27 finished with value: 651.1996459960938 and parameters: {'num_heads': 6, 'model_dim': 168, 'num_layers': 6, 'dropout': 0.21752000287522202, 'learning_rate': 0.004735632810961515, 'weight_decay': 8.392487987237712e-05, 'num_epochs': 19}. Best is trial 23 with value: 506.2815856933594.


Trial: 27 - Loss: 114.81119537353516 - Val Loss: 651.1996459960938


[I 2024-06-20 05:19:09,915] Trial 28 finished with value: 1326.1519775390625 and parameters: {'num_heads': 4, 'model_dim': 152, 'num_layers': 6, 'dropout': 0.18473289640357748, 'learning_rate': 0.0005395821133749008, 'weight_decay': 0.00020666354427869175, 'num_epochs': 29}. Best is trial 23 with value: 506.2815856933594.


Trial: 28 - Loss: 237.7691650390625 - Val Loss: 1326.1519775390625


[I 2024-06-20 05:19:10,151] Trial 29 finished with value: 1368.7957763671875 and parameters: {'num_heads': 6, 'model_dim': 222, 'num_layers': 3, 'dropout': 0.13338358573814038, 'learning_rate': 0.00014841409396140098, 'weight_decay': 3.001774495669443e-05, 'num_epochs': 34}. Best is trial 23 with value: 506.2815856933594.
[I 2024-06-20 05:19:10,298] Trial 30 finished with value: 1360.1258544921875 and parameters: {'num_heads': 5, 'model_dim': 100, 'num_layers': 5, 'dropout': 0.15078130038042548, 'learning_rate': 0.0015212093987557228, 'weight_decay': 5.4210365755193254e-05, 'num_epochs': 14}. Best is trial 23 with value: 506.2815856933594.


Trial: 29 - Loss: 249.7531280517578 - Val Loss: 1368.7957763671875
Trial: 30 - Loss: 250.59417724609375 - Val Loss: 1360.1258544921875


[I 2024-06-20 05:19:10,668] Trial 31 finished with value: 638.8923950195312 and parameters: {'num_heads': 6, 'model_dim': 180, 'num_layers': 5, 'dropout': 0.11095278556528744, 'learning_rate': 0.005944136822380831, 'weight_decay': 0.005721519814034848, 'num_epochs': 41}. Best is trial 23 with value: 506.2815856933594.


Trial: 31 - Loss: 112.625732421875 - Val Loss: 638.8923950195312


[I 2024-06-20 05:19:11,051] Trial 32 finished with value: 607.2184448242188 and parameters: {'num_heads': 6, 'model_dim': 210, 'num_layers': 5, 'dropout': 0.12566751822269911, 'learning_rate': 0.005968650396013452, 'weight_decay': 0.0005262890828029113, 'num_epochs': 32}. Best is trial 23 with value: 506.2815856933594.


Trial: 32 - Loss: 112.7651596069336 - Val Loss: 607.2184448242188


[I 2024-06-20 05:19:11,496] Trial 33 finished with value: 617.7057495117188 and parameters: {'num_heads': 7, 'model_dim': 280, 'num_layers': 6, 'dropout': 0.1797317917800208, 'learning_rate': 0.0029820447419675804, 'weight_decay': 0.00022008001238905486, 'num_epochs': 38}. Best is trial 23 with value: 506.2815856933594.


Trial: 33 - Loss: 112.82007598876953 - Val Loss: 617.7057495117188


[I 2024-06-20 05:19:11,850] Trial 34 finished with value: 606.7855224609375 and parameters: {'num_heads': 5, 'model_dim': 115, 'num_layers': 4, 'dropout': 0.15787808765751993, 'learning_rate': 0.006719294983075209, 'weight_decay': 9.50241637348115e-05, 'num_epochs': 45}. Best is trial 23 with value: 506.2815856933594.


Trial: 34 - Loss: 114.21836853027344 - Val Loss: 606.7855224609375


[I 2024-06-20 05:19:12,096] Trial 35 finished with value: 1207.2919921875 and parameters: {'num_heads': 6, 'model_dim': 162, 'num_layers': 5, 'dropout': 0.18991008189874, 'learning_rate': 0.0011083082936466182, 'weight_decay': 0.0004516109792580361, 'num_epochs': 22}. Best is trial 23 with value: 506.2815856933594.


Trial: 35 - Loss: 205.83653259277344 - Val Loss: 1207.2919921875


[I 2024-06-20 05:19:12,450] Trial 36 finished with value: 624.7872314453125 and parameters: {'num_heads': 3, 'model_dim': 147, 'num_layers': 6, 'dropout': 0.23802768517370176, 'learning_rate': 0.0044352832374906686, 'weight_decay': 5.40274263118029e-05, 'num_epochs': 31}. Best is trial 23 with value: 506.2815856933594.


Trial: 36 - Loss: 114.0683364868164 - Val Loss: 624.7872314453125


[I 2024-06-20 05:19:12,817] Trial 37 finished with value: 1023.8430786132812 and parameters: {'num_heads': 3, 'model_dim': 84, 'num_layers': 5, 'dropout': 0.10437168805288503, 'learning_rate': 0.0027240215616731365, 'weight_decay': 0.0002544866605773804, 'num_epochs': 29}. Best is trial 23 with value: 506.2815856933594.


Trial: 37 - Loss: 162.05226135253906 - Val Loss: 1023.8430786132812


[I 2024-06-20 05:19:13,112] Trial 38 finished with value: 597.828857421875 and parameters: {'num_heads': 7, 'model_dim': 35, 'num_layers': 4, 'dropout': 0.3242467497299474, 'learning_rate': 0.009650680765609293, 'weight_decay': 0.00017877256120441186, 'num_epochs': 38}. Best is trial 23 with value: 506.2815856933594.


Trial: 38 - Loss: 113.72346496582031 - Val Loss: 597.828857421875


[I 2024-06-20 05:19:13,633] Trial 39 finished with value: 1115.3035888671875 and parameters: {'num_heads': 5, 'model_dim': 200, 'num_layers': 3, 'dropout': 0.40180729139788596, 'learning_rate': 0.0040177486459696266, 'weight_decay': 0.00032214502226510136, 'num_epochs': 87}. Best is trial 23 with value: 506.2815856933594.


Trial: 39 - Loss: 105.04696655273438 - Val Loss: 1115.3035888671875


[I 2024-06-20 05:19:13,922] Trial 40 finished with value: 1281.8721923828125 and parameters: {'num_heads': 6, 'model_dim': 240, 'num_layers': 2, 'dropout': 0.12487165753879526, 'learning_rate': 0.00013620442675594442, 'weight_decay': 0.003217386705259325, 'num_epochs': 63}. Best is trial 23 with value: 506.2815856933594.


Trial: 40 - Loss: 223.52545166015625 - Val Loss: 1281.8721923828125


[I 2024-06-20 05:19:14,538] Trial 41 finished with value: 604.92138671875 and parameters: {'num_heads': 8, 'model_dim': 512, 'num_layers': 5, 'dropout': 0.269332394138919, 'learning_rate': 0.007034434015984029, 'weight_decay': 4.862341766102301e-05, 'num_epochs': 50}. Best is trial 23 with value: 506.2815856933594.


Trial: 41 - Loss: 111.75505828857422 - Val Loss: 604.92138671875


[I 2024-06-20 05:19:15,285] Trial 42 finished with value: 613.4445190429688 and parameters: {'num_heads': 8, 'model_dim': 384, 'num_layers': 5, 'dropout': 0.337433990949366, 'learning_rate': 0.009345423119153007, 'weight_decay': 2.7022929151744314e-05, 'num_epochs': 72}. Best is trial 23 with value: 506.2815856933594.


Trial: 42 - Loss: 115.11595153808594 - Val Loss: 613.4445190429688


[I 2024-06-20 05:19:15,762] Trial 43 finished with value: 621.4237670898438 and parameters: {'num_heads': 8, 'model_dim': 72, 'num_layers': 5, 'dropout': 0.22627420803586476, 'learning_rate': 0.0055202916174869575, 'weight_decay': 1.926675213970645e-05, 'num_epochs': 53}. Best is trial 23 with value: 506.2815856933594.


Trial: 43 - Loss: 111.9942398071289 - Val Loss: 621.4237670898438


[I 2024-06-20 05:19:16,138] Trial 44 finished with value: 586.605224609375 and parameters: {'num_heads': 7, 'model_dim': 119, 'num_layers': 6, 'dropout': 0.170609959284043, 'learning_rate': 0.009761668108239854, 'weight_decay': 1.626640343273923e-05, 'num_epochs': 36}. Best is trial 23 with value: 506.2815856933594.


Trial: 44 - Loss: 111.16170501708984 - Val Loss: 586.605224609375


[I 2024-06-20 05:19:16,416] Trial 45 finished with value: 754.9677124023438 and parameters: {'num_heads': 7, 'model_dim': 126, 'num_layers': 6, 'dropout': 0.17144824319354263, 'learning_rate': 0.003695312942376488, 'weight_decay': 0.00010937108920864302, 'num_epochs': 25}. Best is trial 23 with value: 506.2815856933594.
[I 2024-06-20 05:19:16,536] Trial 46 finished with value: 1218.898193359375 and parameters: {'num_heads': 7, 'model_dim': 140, 'num_layers': 1, 'dropout': 0.1611728417913062, 'learning_rate': 0.007006742853676983, 'weight_decay': 0.0004504650446314789, 'num_epochs': 37}. Best is trial 23 with value: 506.2815856933594.


Trial: 45 - Loss: 120.57291412353516 - Val Loss: 754.9677124023438
Trial: 46 - Loss: 17.515546798706055 - Val Loss: 1218.898193359375


[I 2024-06-20 05:19:16,722] Trial 47 finished with value: 1209.06884765625 and parameters: {'num_heads': 6, 'model_dim': 120, 'num_layers': 6, 'dropout': 0.14443382463373292, 'learning_rate': 0.0023530867102312, 'weight_decay': 1.7356216625541074e-05, 'num_epochs': 15}. Best is trial 23 with value: 506.2815856933594.


Trial: 47 - Loss: 209.00198364257812 - Val Loss: 1209.06884765625


[I 2024-06-20 05:19:17,019] Trial 48 finished with value: 1399.2730712890625 and parameters: {'num_heads': 7, 'model_dim': 259, 'num_layers': 6, 'dropout': 0.12096446186423593, 'learning_rate': 8.610301203889103e-05, 'weight_decay': 0.0007148408441892931, 'num_epochs': 23}. Best is trial 23 with value: 506.2815856933594.


Trial: 48 - Loss: 259.2189636230469 - Val Loss: 1399.2730712890625


[I 2024-06-20 05:19:17,425] Trial 49 finished with value: 1390.379638671875 and parameters: {'num_heads': 6, 'model_dim': 180, 'num_layers': 6, 'dropout': 0.19263194516862497, 'learning_rate': 0.0002463555766907793, 'weight_decay': 0.0016727280695536433, 'num_epochs': 28}. Best is trial 23 with value: 506.2815856933594.


Trial: 49 - Loss: 256.6553039550781 - Val Loss: 1390.379638671875


[I 2024-06-20 05:19:17,789] Trial 50 finished with value: 1589.399658203125 and parameters: {'num_heads': 1, 'model_dim': 4, 'num_layers': 5, 'dropout': 0.17517419660462258, 'learning_rate': 0.0015492219469779008, 'weight_decay': 6.482590998311183e-05, 'num_epochs': 43}. Best is trial 23 with value: 506.2815856933594.


Trial: 50 - Loss: 322.9173583984375 - Val Loss: 1589.399658203125


[I 2024-06-20 05:19:18,298] Trial 51 finished with value: 635.92138671875 and parameters: {'num_heads': 8, 'model_dim': 432, 'num_layers': 5, 'dropout': 0.2715504490884452, 'learning_rate': 0.008254519359051533, 'weight_decay': 1.4232527496754993e-05, 'num_epochs': 46}. Best is trial 23 with value: 506.2815856933594.


Trial: 51 - Loss: 113.15917205810547 - Val Loss: 635.92138671875


[I 2024-06-20 05:19:18,597] Trial 52 finished with value: 782.407470703125 and parameters: {'num_heads': 8, 'model_dim': 56, 'num_layers': 5, 'dropout': 0.25588300533399816, 'learning_rate': 0.005135340122480441, 'weight_decay': 1.0145727656751447e-05, 'num_epochs': 34}. Best is trial 23 with value: 506.2815856933594.


Trial: 52 - Loss: 122.11323547363281 - Val Loss: 782.407470703125


[I 2024-06-20 05:19:19,021] Trial 53 finished with value: 606.4086303710938 and parameters: {'num_heads': 8, 'model_dim': 344, 'num_layers': 4, 'dropout': 0.28639399465643467, 'learning_rate': 0.009770735843819312, 'weight_decay': 3.685727183348321e-05, 'num_epochs': 56}. Best is trial 23 with value: 506.2815856933594.


Trial: 53 - Loss: 113.17111206054688 - Val Loss: 606.4086303710938


[I 2024-06-20 05:19:19,598] Trial 54 finished with value: 599.3325805664062 and parameters: {'num_heads': 7, 'model_dim': 140, 'num_layers': 6, 'dropout': 0.22788867308707303, 'learning_rate': 0.007824892651880237, 'weight_decay': 1.9902261077906842e-05, 'num_epochs': 50}. Best is trial 23 with value: 506.2815856933594.


Trial: 54 - Loss: 112.4666519165039 - Val Loss: 599.3325805664062


[I 2024-06-20 05:19:20,040] Trial 55 finished with value: 633.351806640625 and parameters: {'num_heads': 8, 'model_dim': 232, 'num_layers': 6, 'dropout': 0.1445805298732537, 'learning_rate': 0.004814406208132892, 'weight_decay': 1.3515266344571539e-05, 'num_epochs': 42}. Best is trial 23 with value: 506.2815856933594.
[I 2024-06-20 05:19:20,166] Trial 56 finished with value: 1132.2412109375 and parameters: {'num_heads': 7, 'model_dim': 168, 'num_layers': 4, 'dropout': 0.3634554362669693, 'learning_rate': 0.0031050152895805644, 'weight_decay': 3.964874677860076e-05, 'num_epochs': 11}. Best is trial 23 with value: 506.2815856933594.


Trial: 55 - Loss: 113.85545349121094 - Val Loss: 633.351806640625
Trial: 56 - Loss: 193.40284729003906 - Val Loss: 1132.2412109375


[I 2024-06-20 05:19:20,715] Trial 57 finished with value: 616.229736328125 and parameters: {'num_heads': 6, 'model_dim': 108, 'num_layers': 5, 'dropout': 0.1157627047790202, 'learning_rate': 0.0035399882471233595, 'weight_decay': 2.364828672984761e-05, 'num_epochs': 65}. Best is trial 23 with value: 506.2815856933594.


Trial: 57 - Loss: 113.26611328125 - Val Loss: 616.229736328125


[I 2024-06-20 05:19:20,991] Trial 58 finished with value: 1531.668701171875 and parameters: {'num_heads': 5, 'model_dim': 155, 'num_layers': 5, 'dropout': 0.10150692754228247, 'learning_rate': 1.5821344539459357e-05, 'weight_decay': 0.00892541193092013, 'num_epochs': 26}. Best is trial 23 with value: 506.2815856933594.


Trial: 58 - Loss: 301.5643005371094 - Val Loss: 1531.668701171875


[I 2024-06-20 05:19:21,202] Trial 59 finished with value: 470.4664306640625 and parameters: {'num_heads': 7, 'model_dim': 203, 'num_layers': 6, 'dropout': 0.2033652026507959, 'learning_rate': 0.007583715910293533, 'weight_decay': 0.0001450312408107907, 'num_epochs': 16}. Best is trial 59 with value: 470.4664306640625.


Trial: 59 - Loss: 123.50868225097656 - Val Loss: 470.4664306640625


[I 2024-06-20 05:19:21,431] Trial 60 finished with value: 1231.29541015625 and parameters: {'num_heads': 7, 'model_dim': 203, 'num_layers': 6, 'dropout': 0.1641673390148463, 'learning_rate': 0.0009739805638812107, 'weight_decay': 0.00016050773620940106, 'num_epochs': 18}. Best is trial 59 with value: 470.4664306640625.


Trial: 60 - Loss: 212.6016082763672 - Val Loss: 1231.29541015625


[I 2024-06-20 05:19:21,728] Trial 61 finished with value: 454.1611633300781 and parameters: {'num_heads': 8, 'model_dim': 184, 'num_layers': 6, 'dropout': 0.20662171803830123, 'learning_rate': 0.007682778993808566, 'weight_decay': 0.0001165744002935107, 'num_epochs': 16}. Best is trial 61 with value: 454.1611633300781.


Trial: 61 - Loss: 121.95498657226562 - Val Loss: 454.1611633300781


[I 2024-06-20 05:19:21,933] Trial 62 finished with value: 434.5456237792969 and parameters: {'num_heads': 7, 'model_dim': 217, 'num_layers': 6, 'dropout': 0.20413915514592532, 'learning_rate': 0.007439027281305262, 'weight_decay': 0.00026418681929584126, 'num_epochs': 16}. Best is trial 62 with value: 434.5456237792969.


Trial: 62 - Loss: 125.24581146240234 - Val Loss: 434.5456237792969


[I 2024-06-20 05:19:22,139] Trial 63 finished with value: 433.529296875 and parameters: {'num_heads': 7, 'model_dim': 217, 'num_layers': 6, 'dropout': 0.20616512688026367, 'learning_rate': 0.008171869544304555, 'weight_decay': 0.0002759062598257445, 'num_epochs': 16}. Best is trial 63 with value: 433.529296875.


Trial: 63 - Loss: 128.0752716064453 - Val Loss: 433.529296875


[I 2024-06-20 05:19:22,376] Trial 64 finished with value: 606.61279296875 and parameters: {'num_heads': 7, 'model_dim': 259, 'num_layers': 6, 'dropout': 0.19972055689482898, 'learning_rate': 0.007890363469237561, 'weight_decay': 0.00013446782616703866, 'num_epochs': 16}. Best is trial 63 with value: 433.529296875.
[I 2024-06-20 05:19:22,523] Trial 65 finished with value: 857.7728881835938 and parameters: {'num_heads': 7, 'model_dim': 224, 'num_layers': 6, 'dropout': 0.21695080367236255, 'learning_rate': 0.004845281838740822, 'weight_decay': 0.00010071091062008468, 'num_epochs': 10}. Best is trial 63 with value: 433.529296875.


Trial: 64 - Loss: 114.3221435546875 - Val Loss: 606.61279296875
Trial: 65 - Loss: 140.50233459472656 - Val Loss: 857.7728881835938


[I 2024-06-20 05:19:22,785] Trial 66 finished with value: 482.2769470214844 and parameters: {'num_heads': 7, 'model_dim': 189, 'num_layers': 6, 'dropout': 0.25247852734235054, 'learning_rate': 0.006069963035296275, 'weight_decay': 0.0002955285780540447, 'num_epochs': 21}. Best is trial 63 with value: 433.529296875.


Trial: 66 - Loss: 122.86529541015625 - Val Loss: 482.2769470214844


[I 2024-06-20 05:19:23,058] Trial 67 finished with value: 460.3652648925781 and parameters: {'num_heads': 7, 'model_dim': 210, 'num_layers': 6, 'dropout': 0.25103514281572614, 'learning_rate': 0.006420228753684948, 'weight_decay': 0.00025254105935970076, 'num_epochs': 17}. Best is trial 63 with value: 433.529296875.


Trial: 67 - Loss: 120.18595886230469 - Val Loss: 460.3652648925781


[I 2024-06-20 05:19:23,265] Trial 68 finished with value: 572.1307983398438 and parameters: {'num_heads': 8, 'model_dim': 248, 'num_layers': 6, 'dropout': 0.25337643365472046, 'learning_rate': 0.006108256221791702, 'weight_decay': 0.00035888078802564046, 'num_epochs': 12}. Best is trial 63 with value: 433.529296875.


Trial: 68 - Loss: 113.17401885986328 - Val Loss: 572.1307983398438


[I 2024-06-20 05:19:23,475] Trial 69 finished with value: 983.0707397460938 and parameters: {'num_heads': 7, 'model_dim': 189, 'num_layers': 6, 'dropout': 0.284449438666479, 'learning_rate': 0.00248830723571017, 'weight_decay': 0.0002558777923613955, 'num_epochs': 16}. Best is trial 63 with value: 433.529296875.


Trial: 69 - Loss: 157.1684112548828 - Val Loss: 983.0707397460938


[I 2024-06-20 05:19:23,736] Trial 70 finished with value: 862.63232421875 and parameters: {'num_heads': 8, 'model_dim': 272, 'num_layers': 6, 'dropout': 0.2520794127854338, 'learning_rate': 0.0018124544613062317, 'weight_decay': 0.00017761730193912933, 'num_epochs': 20}. Best is trial 63 with value: 433.529296875.


Trial: 70 - Loss: 134.4563751220703 - Val Loss: 862.63232421875


[I 2024-06-20 05:19:23,978] Trial 71 finished with value: 424.90234375 and parameters: {'num_heads': 8, 'model_dim': 296, 'num_layers': 6, 'dropout': 0.23509673268545378, 'learning_rate': 0.006819194301659112, 'weight_decay': 0.0003974356481285509, 'num_epochs': 13}. Best is trial 71 with value: 424.90234375.


Trial: 71 - Loss: 127.07906341552734 - Val Loss: 424.90234375


[I 2024-06-20 05:19:24,195] Trial 72 finished with value: 647.72607421875 and parameters: {'num_heads': 8, 'model_dim': 304, 'num_layers': 6, 'dropout': 0.22909034434730502, 'learning_rate': 0.003938947624518127, 'weight_decay': 0.0003580239767881893, 'num_epochs': 13}. Best is trial 71 with value: 424.90234375.


Trial: 72 - Loss: 115.54818725585938 - Val Loss: 647.72607421875


[I 2024-06-20 05:19:24,435] Trial 73 finished with value: 536.4699096679688 and parameters: {'num_heads': 7, 'model_dim': 217, 'num_layers': 6, 'dropout': 0.23891498018121662, 'learning_rate': 0.007593763722836003, 'weight_decay': 0.0005932799758231975, 'num_epochs': 18}. Best is trial 71 with value: 424.90234375.


Trial: 73 - Loss: 119.2605209350586 - Val Loss: 536.4699096679688


[I 2024-06-20 05:19:24,713] Trial 74 finished with value: 515.4288940429688 and parameters: {'num_heads': 7, 'model_dim': 210, 'num_layers': 6, 'dropout': 0.2091640933259086, 'learning_rate': 0.005464515620947891, 'weight_decay': 0.00021084607842108955, 'num_epochs': 23}. Best is trial 71 with value: 424.90234375.


Trial: 74 - Loss: 121.00850677490234 - Val Loss: 515.4288940429688


[I 2024-06-20 05:19:24,928] Trial 75 finished with value: 482.8599853515625 and parameters: {'num_heads': 8, 'model_dim': 288, 'num_layers': 6, 'dropout': 0.29044125127358345, 'learning_rate': 0.006507888131111498, 'weight_decay': 0.0003101106386527519, 'num_epochs': 15}. Best is trial 71 with value: 424.90234375.


Trial: 75 - Loss: 124.23760986328125 - Val Loss: 482.8599853515625


[I 2024-06-20 05:19:25,185] Trial 76 finished with value: 541.427490234375 and parameters: {'num_heads': 8, 'model_dim': 296, 'num_layers': 6, 'dropout': 0.2843539506758095, 'learning_rate': 0.006630229064876425, 'weight_decay': 0.0002813089271444367, 'num_epochs': 15}. Best is trial 71 with value: 424.90234375.
[I 2024-06-20 05:19:25,368] Trial 77 finished with value: 499.314453125 and parameters: {'num_heads': 8, 'model_dim': 328, 'num_layers': 6, 'dropout': 0.30756305089461944, 'learning_rate': 0.007885409103768483, 'weight_decay': 0.0009163822275907531, 'num_epochs': 10}. Best is trial 71 with value: 424.90234375.


Trial: 76 - Loss: 116.63352966308594 - Val Loss: 541.427490234375
Trial: 77 - Loss: 118.32161712646484 - Val Loss: 499.314453125


[I 2024-06-20 05:19:25,624] Trial 78 finished with value: 515.4891967773438 and parameters: {'num_heads': 8, 'model_dim': 272, 'num_layers': 6, 'dropout': 0.1958800921514727, 'learning_rate': 0.00406963785437491, 'weight_decay': 0.0003970930253596349, 'num_epochs': 18}. Best is trial 71 with value: 424.90234375.


Trial: 78 - Loss: 115.6045150756836 - Val Loss: 515.4891967773438


[I 2024-06-20 05:19:25,894] Trial 79 finished with value: 1084.9820556640625 and parameters: {'num_heads': 2, 'model_dim': 94, 'num_layers': 6, 'dropout': 0.22440680320351514, 'learning_rate': 0.003181113884718012, 'weight_decay': 0.00031104635925527813, 'num_epochs': 21}. Best is trial 71 with value: 424.90234375.
[I 2024-06-20 05:19:26,091] Trial 80 finished with value: 541.042236328125 and parameters: {'num_heads': 8, 'model_dim': 232, 'num_layers': 6, 'dropout': 0.2452799274187423, 'learning_rate': 0.006054787156710655, 'weight_decay': 0.0002228468036016768, 'num_epochs': 13}. Best is trial 71 with value: 424.90234375.


Trial: 79 - Loss: 177.6873016357422 - Val Loss: 1084.9820556640625
Trial: 80 - Loss: 113.86357879638672 - Val Loss: 541.042236328125


[I 2024-06-20 05:19:26,282] Trial 81 finished with value: 429.22210693359375 and parameters: {'num_heads': 8, 'model_dim': 336, 'num_layers': 6, 'dropout': 0.31314637616073393, 'learning_rate': 0.008358975130329212, 'weight_decay': 0.000994073621318789, 'num_epochs': 10}. Best is trial 71 with value: 424.90234375.


Trial: 81 - Loss: 129.13768005371094 - Val Loss: 429.22210693359375


[I 2024-06-20 05:19:26,553] Trial 82 finished with value: 751.3914794921875 and parameters: {'num_heads': 8, 'model_dim': 360, 'num_layers': 6, 'dropout': 0.33322782603843204, 'learning_rate': 0.008258624294005769, 'weight_decay': 0.0001251541663879448, 'num_epochs': 16}. Best is trial 71 with value: 424.90234375.
[I 2024-06-20 05:19:26,736] Trial 83 finished with value: 623.879638671875 and parameters: {'num_heads': 8, 'model_dim': 288, 'num_layers': 6, 'dropout': 0.31781153120857053, 'learning_rate': 0.004404547360581101, 'weight_decay': 0.00018650371375913795, 'num_epochs': 13}. Best is trial 71 with value: 424.90234375.


Trial: 82 - Loss: 118.35726928710938 - Val Loss: 751.3914794921875
Trial: 83 - Loss: 114.08616638183594 - Val Loss: 623.879638671875


[I 2024-06-20 05:19:26,893] Trial 84 finished with value: 477.0397033691406 and parameters: {'num_heads': 7, 'model_dim': 329, 'num_layers': 6, 'dropout': 0.2945789394328777, 'learning_rate': 0.006659736391767805, 'weight_decay': 0.0006891920531150959, 'num_epochs': 10}. Best is trial 71 with value: 424.90234375.
[I 2024-06-20 05:19:27,048] Trial 85 finished with value: 410.5378112792969 and parameters: {'num_heads': 7, 'model_dim': 329, 'num_layers': 6, 'dropout': 0.25966838018766203, 'learning_rate': 0.00961115924403532, 'weight_decay': 0.0012671115668168938, 'num_epochs': 10}. Best is trial 85 with value: 410.5378112792969.


Trial: 84 - Loss: 116.58094024658203 - Val Loss: 477.0397033691406
Trial: 85 - Loss: 136.1328887939453 - Val Loss: 410.5378112792969


[I 2024-06-20 05:19:27,209] Trial 86 finished with value: 462.09515380859375 and parameters: {'num_heads': 7, 'model_dim': 329, 'num_layers': 6, 'dropout': 0.26492612965813733, 'learning_rate': 0.008972260610296478, 'weight_decay': 0.0017452365170491377, 'num_epochs': 10}. Best is trial 85 with value: 410.5378112792969.
[I 2024-06-20 05:19:27,392] Trial 87 finished with value: 678.8718872070312 and parameters: {'num_heads': 7, 'model_dim': 364, 'num_layers': 6, 'dropout': 0.2617092651014712, 'learning_rate': 0.008892479389526028, 'weight_decay': 0.0014404245835044711, 'num_epochs': 12}. Best is trial 85 with value: 410.5378112792969.


Trial: 86 - Loss: 127.3433609008789 - Val Loss: 462.09515380859375
Trial: 87 - Loss: 112.57420349121094 - Val Loss: 678.8718872070312


[I 2024-06-20 05:19:27,605] Trial 88 finished with value: 628.522705078125 and parameters: {'num_heads': 7, 'model_dim': 315, 'num_layers': 6, 'dropout': 0.2755787908551521, 'learning_rate': 0.009463202511498348, 'weight_decay': 0.0025976509043097724, 'num_epochs': 17}. Best is trial 85 with value: 410.5378112792969.


Trial: 88 - Loss: 113.60918426513672 - Val Loss: 628.522705078125


[I 2024-06-20 05:19:27,972] Trial 89 finished with value: 583.5573120117188 and parameters: {'num_heads': 7, 'model_dim': 378, 'num_layers': 6, 'dropout': 0.2381939115623508, 'learning_rate': 0.005237784977615352, 'weight_decay': 0.0010748920214338632, 'num_epochs': 24}. Best is trial 85 with value: 410.5378112792969.


Trial: 89 - Loss: 114.3158950805664 - Val Loss: 583.5573120117188


[I 2024-06-20 05:19:28,200] Trial 90 finished with value: 576.7067260742188 and parameters: {'num_heads': 6, 'model_dim': 330, 'num_layers': 6, 'dropout': 0.21093357085245865, 'learning_rate': 0.0073304919201762145, 'weight_decay': 0.0020754068104990153, 'num_epochs': 19}. Best is trial 85 with value: 410.5378112792969.
[I 2024-06-20 05:19:28,289] Trial 91 finished with value: 474.7503356933594 and parameters: {'num_heads': 7, 'model_dim': 336, 'num_layers': 2, 'dropout': 0.29832810030972123, 'learning_rate': 0.007044173536065081, 'weight_decay': 0.0009107291610666412, 'num_epochs': 10}. Best is trial 85 with value: 410.5378112792969.
[I 2024-06-20 05:19:28,385] Trial 92 finished with value: 606.4412841796875 and parameters: {'num_heads': 7, 'model_dim': 343, 'num_layers': 2, 'dropout': 0.1833968464614883, 'learning_rate': 0.008521182385466251, 'weight_decay': 0.0013367966869799956, 'num_epochs': 13}. Best is trial 85 with value: 410.5378112792969.


Trial: 90 - Loss: 113.79305267333984 - Val Loss: 576.7067260742188
Trial: 91 - Loss: 118.8125228881836 - Val Loss: 474.7503356933594
Trial: 92 - Loss: 116.01144409179688 - Val Loss: 606.4412841796875


[I 2024-06-20 05:19:28,455] Trial 93 finished with value: 1325.3974609375 and parameters: {'num_heads': 7, 'model_dim': 308, 'num_layers': 1, 'dropout': 0.29840170283270817, 'learning_rate': 0.0006215887991531887, 'weight_decay': 0.0020039596974308935, 'num_epochs': 10}. Best is trial 85 with value: 410.5378112792969.
[I 2024-06-20 05:19:28,589] Trial 94 finished with value: 658.1279296875 and parameters: {'num_heads': 7, 'model_dim': 350, 'num_layers': 3, 'dropout': 0.22122235282261193, 'learning_rate': 0.009783054406190215, 'weight_decay': 0.0007611229493475789, 'num_epochs': 15}. Best is trial 85 with value: 410.5378112792969.


Trial: 93 - Loss: 239.1120147705078 - Val Loss: 1325.3974609375
Trial: 94 - Loss: 115.63230895996094 - Val Loss: 658.1279296875


[I 2024-06-20 05:19:28,693] Trial 95 finished with value: 458.3416442871094 and parameters: {'num_heads': 8, 'model_dim': 400, 'num_layers': 2, 'dropout': 0.355797658663223, 'learning_rate': 0.007489556330175298, 'weight_decay': 0.0005617348790446059, 'num_epochs': 12}. Best is trial 85 with value: 410.5378112792969.


Trial: 95 - Loss: 131.84934997558594 - Val Loss: 458.3416442871094


[I 2024-06-20 05:19:28,919] Trial 96 finished with value: 475.50726318359375 and parameters: {'num_heads': 8, 'model_dim': 408, 'num_layers': 6, 'dropout': 0.36106115990594284, 'learning_rate': 0.005406929390329647, 'weight_decay': 0.0004598422740521839, 'num_epochs': 13}. Best is trial 85 with value: 410.5378112792969.
[I 2024-06-20 05:19:29,067] Trial 97 finished with value: 910.853515625 and parameters: {'num_heads': 8, 'model_dim': 424, 'num_layers': 1, 'dropout': 0.37218091150546206, 'learning_rate': 0.0045275776855504424, 'weight_decay': 0.0008745602883528638, 'num_epochs': 20}. Best is trial 85 with value: 410.5378112792969.


Trial: 96 - Loss: 124.42733001708984 - Val Loss: 475.50726318359375
Trial: 97 - Loss: 21.338220596313477 - Val Loss: 910.853515625


[I 2024-06-20 05:19:30,256] Trial 98 finished with value: 612.6205444335938 and parameters: {'num_heads': 8, 'model_dim': 456, 'num_layers': 6, 'dropout': 0.40998239472432446, 'learning_rate': 0.0034752413096067873, 'weight_decay': 0.0005351798891154827, 'num_epochs': 94}. Best is trial 85 with value: 410.5378112792969.


Trial: 98 - Loss: 113.30770874023438 - Val Loss: 612.6205444335938


[I 2024-06-20 05:19:30,468] Trial 99 finished with value: 503.0196838378906 and parameters: {'num_heads': 8, 'model_dim': 200, 'num_layers': 6, 'dropout': 0.23239493318264295, 'learning_rate': 0.00858599968155528, 'weight_decay': 0.0041674387650437366, 'num_epochs': 17}. Best is trial 85 with value: 410.5378112792969.


Trial: 99 - Loss: 123.97649383544922 - Val Loss: 503.0196838378906


In [52]:
# Results
print(f'Trials quantity: {len(study_mm.trials)}')
trial = study_mm.best_trial
print(f'Mejor prueba: {trial.number}')
print(f'Mejores parametros: {trial.params}')
print(f'Mejor valor de pérdida en validación: {trial.value}')

Trials quantity: 100
Mejor prueba: 85
Mejores parametros: {'num_heads': 7, 'model_dim': 329, 'num_layers': 6, 'dropout': 0.25966838018766203, 'learning_rate': 0.00961115924403532, 'weight_decay': 0.0012671115668168938, 'num_epochs': 10}
Mejor valor de pérdida en validación: 410.5378112792969


# Training

In [44]:
output_dim = 1

## General

In [45]:
input_dim = len(features)
# hyperparameters
num_heads = study_g.best_trial.params['num_heads']
model_dim = study_g.best_trial.params['model_dim']
num_layers = study_g.best_trial.params['num_layers']
dropout = study_g.best_trial.params['dropout']
lr = study_g.best_trial.params['learning_rate']
wd = study_g.best_trial.params['weight_decay']
num_epochs = study_g.best_trial.params['num_epochs']

study_g.best_trial.params

{'num_heads': 8,
 'model_dim': 496,
 'num_layers': 6,
 'dropout': 0.4051282113428471,
 'learning_rate': 0.004309566957857929,
 'weight_decay': 0.0005126147306951872,
 'num_epochs': 15}

In [46]:
# general model initialization
model_g = TransformerModel(input_dim, model_dim, num_heads, num_layers, output_dim, dropout)
if DEVICE.type == 'cuda':
	model_g = model_g.to(DEVICE)
criterion_g = nn.MSELoss()
optimizer_g = optim.AdamW(model_g.parameters(), lr=lr, weight_decay=wd)

model_g.train()

for epoch in range(num_epochs):
	optimizer_g.zero_grad()
	output = model_g(X_g_train)
	loss = criterion_g(output, y_g_train)
	loss.backward()
	optimizer_g.step()
	# validation
	if (epoch+1) % 10 == 0 or epoch == num_epochs-1:
		model_g.eval()
		with torch.no_grad():
			val_predictions = model_g(X_g_test)
			val_loss = criterion_g(val_predictions, y_g_test)
		print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Val Loss: {val_loss.item()}')
		model_g.train()

Epoch 10/15, Loss: 52.74978256225586, Val Loss: 105.2210693359375
Epoch 15/15, Loss: 49.432525634765625, Val Loss: 166.17543029785156


## Single Thread

In [47]:
input_dim = len(features_st)
# hyperparameters
num_heads = study_st.best_trial.params['num_heads']
model_dim = study_st.best_trial.params['model_dim']
num_layers = study_st.best_trial.params['num_layers']
dropout = study_st.best_trial.params['dropout']
lr = study_st.best_trial.params['learning_rate']
wd = study_st.best_trial.params['weight_decay']
num_epochs = study_st.best_trial.params['num_epochs']
study_st.best_trial.params

{'num_heads': 5,
 'model_dim': 305,
 'num_layers': 4,
 'dropout': 0.3944508870189298,
 'learning_rate': 0.0052776377703212364,
 'weight_decay': 0.0007789453603024966,
 'num_epochs': 19}

In [48]:
# single thread model initialization
model_st = TransformerModel(input_dim, model_dim, num_heads, num_layers, output_dim, dropout)
if DEVICE.type == 'cuda':
	model_st = model_st.to(DEVICE)
criterion_st = nn.MSELoss()
optimizer_st = optim.AdamW(model_st.parameters(), lr=lr, weight_decay=wd)

model_st.train()

for epoch in range(num_epochs):
	optimizer_st.zero_grad()
	output = model_st(X_st_train)
	loss = criterion_st(output, y_st_train)
	loss.backward()
	optimizer_st.step()
	# validation
	if (epoch+1) % 10 == 0 or epoch == num_epochs-1:
		model_st.eval()
		with torch.no_grad():
			val_predictions = model_st(X_st_test)
			val_loss = criterion_st(val_predictions, y_st_test)
		print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Val Loss: {val_loss.item()}')
		model_st.train()

Epoch 10/19, Loss: 17.625991821289062, Val Loss: 30.301942825317383
Epoch 19/19, Loss: 18.42536163330078, Val Loss: 35.388484954833984


## Multi Thread

In [49]:
input_dim = len(features)
# hyperparameters
num_heads = study_mm.best_trial.params['num_heads']
model_dim = study_mm.best_trial.params['model_dim']
num_layers = study_mm.best_trial.params['num_layers']
dropout = study_mm.best_trial.params['dropout']
lr = study_mm.best_trial.params['learning_rate']
wd = study_mm.best_trial.params['weight_decay']
num_epochs = study_mm.best_trial.params['num_epochs']

study_mm.best_trial.params

{'num_heads': 8,
 'model_dim': 472,
 'num_layers': 3,
 'dropout': 0.3497167919307952,
 'learning_rate': 0.0027713172701841183,
 'weight_decay': 0.005883185183789469,
 'num_epochs': 41}

In [50]:
# multi thread model initialization
model_mm = TransformerModel(input_dim, model_dim, num_heads, num_layers, output_dim, dropout)
if DEVICE.type == 'cuda':
	model_mm = model_mm.to(DEVICE)
criterion_mm = nn.MSELoss()
optimizer_mm = optim.AdamW(model_mm.parameters(), lr=lr, weight_decay=wd)

model_mm.train()

for epoch in range(num_epochs):
	optimizer_mm.zero_grad()
	output = model_mm(X_mm_train)
	loss = criterion_mm(output, y_mm_train)
	loss.backward()
	optimizer_mm.step()
	# validation
	if (epoch+1) % 10 == 0 or epoch == num_epochs-1:
		model_mm.eval()
		with torch.no_grad():
			val_predictions = model_mm(X_mm_test)
			val_loss = criterion_mm(val_predictions, y_mm_test)
		print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Val Loss: {val_loss.item()}')
		model_mm.train()

Epoch 10/41, Loss: 53.83079528808594, Val Loss: 427.0614929199219
Epoch 20/41, Loss: 35.6347541809082, Val Loss: 769.7138671875
Epoch 30/41, Loss: 35.99632263183594, Val Loss: 926.7322387695312
Epoch 40/41, Loss: 34.37905502319336, Val Loss: 1228.2109375
Epoch 41/41, Loss: 34.84366226196289, Val Loss: 1210.6322021484375


# Conclusion
Queda trabajo que hacer en la red, además de conseguir más datos para un entrenamiento más robusto. Queda por ahora descartado el uso de solo un modelo para multi-threading y single-threading, ya que el modelo tiene más del triple de *loss*.