In [1]:
import optuna
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import random

In [2]:
# CUDA
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE.type

'cuda'

In [3]:
# Fix random seed
seed = 42
torch.manual_seed(seed)
if DEVICE.type == 'cuda':
	torch.cuda.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Pre-processing input data

In [4]:
def bits_to_MiB(row):
	# verify if has string ' MiB'
	if 'MiB' in str(row):
		row = row.replace(' MiB', '')
		row = float(row)
	else:
		row = float(row) / np.power(2, 20)
	return row


def MHz_to_GHz(row):
	# verify if has string ' GHz'
	if 'GHz' in str(row):
		row = row.replace(' GHz', '')
		# convert to float
		row = float(row)
	else:
		row = row.replace(' MHz', '')
		row = float(row) / 1000
	return row

In [5]:
results_df = pd.read_csv('../results/execution_time.csv')
results_savio_df = pd.read_csv('../results_savio/execution_time.csv')
results_df = pd.concat([results_df, results_savio_df], ignore_index=True)
# preprocessing
results_df['total_cpu_usage'] = results_df['total_cpu_usage'].str.replace('%', '').astype(float) / 100
results_df['max_ram_usage'] = results_df['max_ram_usage'] / 1024
results_df['l2_cache_size'] = results_df['l2_cache_size'].apply(bits_to_MiB)
results_df['l3_cache_size'] = results_df['l3_cache_size'].apply(bits_to_MiB)
results_df['ghz_actual_friendly'] = results_df['hz_actual_friendly'].apply(MHz_to_GHz)
results_df['ghz_advertised_friendly'] = results_df['hz_advertised_friendly'].str.replace('GHz', '').astype(float)
results_df = results_df.drop(columns=['hz_actual_friendly', 'hz_advertised_friendly', 'arch', 'vendor_id_raw'])

In [6]:
# Make the target dataset
target_df = results_df[['total_time', 'brand_raw', 'count', 'l2_cache_size', 'l3_cache_size', 'l2_cache_line_size', 'l2_cache_associativity', 'ghz_advertised_friendly', 'benchmark']].copy()
# Rename columns to *_target
target_df = target_df.rename(columns={
    'total_time': 'total_time_target',
    'brand_raw': 'brand_raw_target',
    'count': 'count_target',
    'l2_cache_size': 'l2_cache_size_target',
    'l3_cache_size': 'l3_cache_size_target',
    'l2_cache_line_size': 'l2_cache_line_size_target',
    'l2_cache_associativity': 'l2_cache_associativity_target',
    'ghz_advertised_friendly': 'ghz_advertised_friendly_target',
})

dataset_df = pd.merge(results_df, target_df, how='inner', on='benchmark')
dataset_df = dataset_df[dataset_df['brand_raw'] != dataset_df['brand_raw_target']]
dataset_df.head(2)

Unnamed: 0,total_time,total_cpu_usage,max_ram_usage,brand_raw,count,l2_cache_size,l3_cache_size,l2_cache_line_size,l2_cache_associativity,benchmark,ghz_actual_friendly,ghz_advertised_friendly,total_time_target,brand_raw_target,count_target,l2_cache_size_target,l3_cache_size_target,l2_cache_line_size_target,l2_cache_associativity_target,ghz_advertised_friendly_target
5,13.49,0.99,1435.851562,Intel(R) Core(TM) i5-10400 CPU @ 2.90GHz,12,1.5,12.0,256,6,KNP,4.0996,2.9,32.01,13th Gen Intel(R) Core(TM) i5-1335U,12,7.5,12.0,1280,7,2.496
6,13.49,0.99,1435.851562,Intel(R) Core(TM) i5-10400 CPU @ 2.90GHz,12,1.5,12.0,256,6,KNP,4.0996,2.9,19.09,13th Gen Intel(R) Core(TM) i5-1335U,12,7.5,12.0,1280,7,2.496


In [7]:
# remove one computer for testing
g_train = dataset_df[(dataset_df['brand_raw'] != '13th Gen Intel(R) Core(TM) i5-1335U') & (dataset_df['brand_raw_target'] != '13th Gen Intel(R) Core(TM) i5-1335U')].drop(columns=['benchmark','brand_raw', 'brand_raw_target'])
g_test = dataset_df[dataset_df['brand_raw_target'] == '13th Gen Intel(R) Core(TM) i5-1335U'].drop(columns=['benchmark','brand_raw', 'brand_raw_target'])

In [8]:
mm_df = dataset_df[dataset_df['benchmark']=='MATRIX_MULT'].drop(columns=['benchmark'])
# remove one computer for testing
mm_train = mm_df[(mm_df['brand_raw'] != '13th Gen Intel(R) Core(TM) i5-1335U') & (mm_df['brand_raw_target'] != '13th Gen Intel(R) Core(TM) i5-1335U')].drop(columns=['brand_raw', 'brand_raw_target'])
mm_test = mm_df[mm_df['brand_raw_target'] == '13th Gen Intel(R) Core(TM) i5-1335U'].drop(columns=['brand_raw', 'brand_raw_target'])

In [9]:
st_df = dataset_df[dataset_df['benchmark']!='MATRIX_MULT'].drop(columns=['benchmark','count'])
# remove one computer for testing
st_train = st_df[(st_df['brand_raw'] != '13th Gen Intel(R) Core(TM) i5-1335U') & (st_df['brand_raw_target'] != '13th Gen Intel(R) Core(TM) i5-1335U')].drop(columns=['brand_raw', 'brand_raw_target'])
st_test = st_df[st_df['brand_raw_target'] == '13th Gen Intel(R) Core(TM) i5-1335U'].drop(columns=['brand_raw', 'brand_raw_target'])

In [10]:
target = 'total_time_target'
features = mm_test.columns.copy().drop(target)
features_st = features.copy().drop(['count', 'count_target'])

In [11]:
# general data
## split data
X_g_train = g_train[features]
y_g_train = g_train[target]

X_g_test = g_test[features]
y_g_test = g_test[target]

## normalize data
scaler = StandardScaler()
X_g_train = scaler.fit_transform(X_g_train)
X_g_test = scaler.transform(X_g_test)

## convert to tensor
X_g_train = torch.tensor(X_g_train, dtype=torch.float32).unsqueeze(1)
X_g_test = torch.tensor(X_g_test, dtype=torch.float32).unsqueeze(1)
y_g_train = torch.tensor(y_g_train.values, dtype=torch.float32).view(-1, 1)
y_g_test = torch.tensor(y_g_test.values, dtype=torch.float32).view(-1, 1)

In [12]:
# single thread data
## split data
X_st_train = st_train[features_st]
y_st_train = st_train[target]

X_st_test = st_test[features_st]
y_st_test = st_test[target]

## normalize data
scaler = StandardScaler()
X_st_train = scaler.fit_transform(X_st_train)
X_st_test = scaler.transform(X_st_test)

## convert to tensor
X_st_train = torch.tensor(X_st_train, dtype=torch.float32).unsqueeze(1)
X_st_test = torch.tensor(X_st_test, dtype=torch.float32).unsqueeze(1)
y_st_train = torch.tensor(y_st_train.values, dtype=torch.float32).view(-1, 1)
y_st_test = torch.tensor(y_st_test.values, dtype=torch.float32).view(-1, 1)

In [13]:
# multi thread data
## split data
X_mm_train = mm_train[features]
y_mm_train = mm_train[target]

X_mm_test = mm_test[features]
y_mm_test = mm_test[target]

## normalize data
scaler = StandardScaler()
X_mm_train = scaler.fit_transform(X_mm_train)
X_mm_test = scaler.transform(X_mm_test)

## convert to tensor
X_mm_train = torch.tensor(X_mm_train, dtype=torch.float32).unsqueeze(1)
X_mm_test = torch.tensor(X_mm_test, dtype=torch.float32).unsqueeze(1)
y_mm_train = torch.tensor(y_mm_train.values, dtype=torch.float32).view(-1, 1)
y_mm_test = torch.tensor(y_mm_test.values, dtype=torch.float32).view(-1, 1)

In [14]:
if DEVICE.type == 'cuda':
	# move to DEVICE
	X_g_train = X_g_train.to(DEVICE)
	y_g_train = y_g_train.to(DEVICE)
	X_g_test = X_g_test.to(DEVICE)
	y_g_test = y_g_test.to(DEVICE)

	X_st_train = X_st_train.to(DEVICE)
	y_st_train = y_st_train.to(DEVICE)
	X_st_test = X_st_test.to(DEVICE)
	y_st_test = y_st_test.to(DEVICE)

	X_mm_train = X_mm_train.to(DEVICE)
	y_mm_train = y_mm_train.to(DEVICE)
	X_mm_test = X_mm_test.to(DEVICE)
	y_mm_test = y_mm_test.to(DEVICE)

# Model

In [15]:
class TransformerModel(nn.Module):
	def __init__(self, input_dim, model_dim, num_heads, num_layers, output_dim, dropout=0.1):
		super(TransformerModel, self).__init__()
		# layers
		self.embedding = nn.Linear(input_dim, model_dim)
		encoder_layer = nn.TransformerEncoderLayer(d_model=model_dim, nhead=num_heads, batch_first=True)
		self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
		self.fc = nn.Linear(model_dim, output_dim)
		self.dropout = nn.Dropout(dropout)
	
	def forward(self, x):
		x = self.embedding(x)
		x = self.dropout(x)
		x = self.transformer(x)
		x = self.fc(x.mean(dim=1))
		return x

In [16]:
def objective(trial: optuna.Trial, X_train, y_train, X_test, y_test, input_dim, output_dim):
	# Definimos los hiperparámetros a buscar
	num_heads = trial.suggest_int('num_heads', 1, 8)
	model_dim = trial.suggest_int('model_dim', num_heads * 4, num_heads * 64, step=num_heads)
	num_layers = trial.suggest_int('num_layers', 1, 6)
	dropout = trial.suggest_float('dropout', 0.1, 0.5)
	learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True)
	weight_decay = trial.suggest_float('weight_decay', 1e-5, 1e-2, log=True)
	num_epochs = trial.suggest_int('num_epochs', 10, 100)

	# model initialization 
	model = TransformerModel(input_dim, model_dim, num_heads, num_layers, output_dim, dropout)
	if DEVICE.type == 'cuda':
		model = model.to(DEVICE)
	criterion = nn.MSELoss()
	optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
	# training
	model.train()
	for epoch in range(num_epochs):
		optimizer.zero_grad()
		output = model(X_train)
		loss = criterion(output, y_train)
		loss.backward()
		optimizer.step()
	# evaluation
	model.eval()
	with torch.no_grad():
		predictions = model(X_test)
		val_loss = criterion(predictions, y_test)

		# trial.report(val_loss.item(), epoch+1)
		# if trial.should_prune():
		# 	raise optuna.TrialPruned()
	print(f"Trial: {trial.number} - Loss: {loss.item()} - Val Loss: {val_loss.item()}")
	return val_loss.item()

# Hyperparameters Optimization

In [17]:
n_trials = 25

## General

In [18]:
# configuration optuna
study_g = optuna.create_study(direction='minimize')
study_g.optimize(lambda trial: objective(trial, X_g_train, y_g_train, X_g_test, y_g_test, len(features), len(target)), n_trials=n_trials)

[I 2024-06-20 04:41:47,084] A new study created in memory with name: no-name-814156af-dfe2-4fc4-b4f4-f928c46c4894
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
[I 2024-06-20 04:42:05,670] Trial 0 finished with value: 630.4725341796875 and parameters: {'num_heads': 5, 'model_dim': 100, 'num_layers': 5, 'dropout': 0.2195942850340166, 'learning_rate': 0.0004855896077100036, 'weight_decay': 0.0005573194205974267, 'num_epochs': 76}. Best is trial 0 with value: 630.4725341796875.


Trial: 0 - Loss: 241.7632293701172 - Val Loss: 630.4725341796875


[I 2024-06-20 04:42:28,502] Trial 1 finished with value: 698.8194580078125 and parameters: {'num_heads': 4, 'model_dim': 120, 'num_layers': 6, 'dropout': 0.19228931201660393, 'learning_rate': 0.00022197998807286582, 'weight_decay': 0.0004353313038170108, 'num_epochs': 95}. Best is trial 0 with value: 630.4725341796875.


Trial: 1 - Loss: 281.3769836425781 - Val Loss: 698.8194580078125


[I 2024-06-20 04:42:30,786] Trial 2 finished with value: 800.7874145507812 and parameters: {'num_heads': 3, 'model_dim': 135, 'num_layers': 2, 'dropout': 0.256129713607558, 'learning_rate': 0.00012425991386710083, 'weight_decay': 1.643331370174554e-05, 'num_epochs': 32}. Best is trial 0 with value: 630.4725341796875.


Trial: 2 - Loss: 344.2342529296875 - Val Loss: 800.7874145507812


[I 2024-06-20 04:42:36,024] Trial 3 finished with value: 795.2274169921875 and parameters: {'num_heads': 5, 'model_dim': 175, 'num_layers': 5, 'dropout': 0.24879919912102097, 'learning_rate': 9.296648339423297e-05, 'weight_decay': 2.1309049134727586e-05, 'num_epochs': 20}. Best is trial 0 with value: 630.4725341796875.


Trial: 3 - Loss: 340.73077392578125 - Val Loss: 795.2274169921875


[I 2024-06-20 04:42:36,705] Trial 4 finished with value: 726.2723999023438 and parameters: {'num_heads': 4, 'model_dim': 84, 'num_layers': 1, 'dropout': 0.12080365559247946, 'learning_rate': 0.0015790152574653282, 'weight_decay': 0.005883364006771132, 'num_epochs': 18}. Best is trial 0 with value: 630.4725341796875.


Trial: 4 - Loss: 304.67974853515625 - Val Loss: 726.2723999023438


[I 2024-06-20 04:42:39,303] Trial 5 finished with value: 504.0435791015625 and parameters: {'num_heads': 7, 'model_dim': 315, 'num_layers': 3, 'dropout': 0.17088538443889562, 'learning_rate': 0.002115714865548784, 'weight_decay': 0.0009036479063680725, 'num_epochs': 11}. Best is trial 5 with value: 504.0435791015625.


Trial: 5 - Loss: 189.87525939941406 - Val Loss: 504.0435791015625


[I 2024-06-20 04:42:41,897] Trial 6 finished with value: 851.5448608398438 and parameters: {'num_heads': 1, 'model_dim': 35, 'num_layers': 3, 'dropout': 0.15969830470820046, 'learning_rate': 9.952428772692241e-05, 'weight_decay': 0.00016760662157228813, 'num_epochs': 59}. Best is trial 5 with value: 504.0435791015625.


Trial: 6 - Loss: 374.98388671875 - Val Loss: 851.5448608398438


[I 2024-06-20 04:42:43,919] Trial 7 finished with value: 585.325439453125 and parameters: {'num_heads': 1, 'model_dim': 29, 'num_layers': 2, 'dropout': 0.13677706718728389, 'learning_rate': 0.0020183389017154663, 'weight_decay': 4.330541863386072e-05, 'num_epochs': 70}. Best is trial 5 with value: 504.0435791015625.


Trial: 7 - Loss: 216.39988708496094 - Val Loss: 585.325439453125


[I 2024-06-20 04:42:55,770] Trial 8 finished with value: 844.6906127929688 and parameters: {'num_heads': 8, 'model_dim': 56, 'num_layers': 2, 'dropout': 0.30347689271685063, 'learning_rate': 4.5347671361034614e-05, 'weight_decay': 0.00011925181373478402, 'num_epochs': 93}. Best is trial 5 with value: 504.0435791015625.


Trial: 8 - Loss: 371.4059143066406 - Val Loss: 844.6906127929688


[I 2024-06-20 04:43:33,109] Trial 9 finished with value: 632.068115234375 and parameters: {'num_heads': 7, 'model_dim': 434, 'num_layers': 4, 'dropout': 0.11314765706536445, 'learning_rate': 5.887073401154036e-05, 'weight_decay': 0.008900693133812684, 'num_epochs': 100}. Best is trial 5 with value: 504.0435791015625.


Trial: 9 - Loss: 242.7649688720703 - Val Loss: 632.068115234375


[I 2024-06-20 04:43:46,524] Trial 10 finished with value: 816.4310913085938 and parameters: {'num_heads': 7, 'model_dim': 336, 'num_layers': 4, 'dropout': 0.40217347607335346, 'learning_rate': 1.2894057062875401e-05, 'weight_decay': 0.001638209847009987, 'num_epochs': 42}. Best is trial 5 with value: 504.0435791015625.


Trial: 10 - Loss: 352.47259521484375 - Val Loss: 816.4310913085938


[I 2024-06-20 04:43:48,088] Trial 11 finished with value: 300.966064453125 and parameters: {'num_heads': 1, 'model_dim': 11, 'num_layers': 2, 'dropout': 0.3513485300482806, 'learning_rate': 0.009115495258674128, 'weight_decay': 5.929519446499742e-05, 'num_epochs': 59}. Best is trial 11 with value: 300.966064453125.


Trial: 11 - Loss: 82.35442352294922 - Val Loss: 300.966064453125


[I 2024-06-20 04:43:50,915] Trial 12 finished with value: 530.24609375 and parameters: {'num_heads': 2, 'model_dim': 8, 'num_layers': 3, 'dropout': 0.35010203456976324, 'learning_rate': 0.009395291181561257, 'weight_decay': 0.0019029367213164775, 'num_epochs': 47}. Best is trial 11 with value: 300.966064453125.


Trial: 12 - Loss: 192.50917053222656 - Val Loss: 530.24609375


[I 2024-06-20 04:43:55,102] Trial 13 finished with value: 346.1713562011719 and parameters: {'num_heads': 6, 'model_dim': 252, 'num_layers': 1, 'dropout': 0.4998113690131488, 'learning_rate': 0.006201038885761247, 'weight_decay': 8.50728974304019e-05, 'num_epochs': 63}. Best is trial 11 with value: 300.966064453125.


Trial: 13 - Loss: 15.797163009643555 - Val Loss: 346.1713562011719


[I 2024-06-20 04:43:59,286] Trial 14 finished with value: 371.0318298339844 and parameters: {'num_heads': 6, 'model_dim': 234, 'num_layers': 1, 'dropout': 0.48070867615005747, 'learning_rate': 0.009685014315091098, 'weight_decay': 7.232121812135712e-05, 'num_epochs': 64}. Best is trial 11 with value: 300.966064453125.


Trial: 14 - Loss: 13.250377655029297 - Val Loss: 371.0318298339844


[I 2024-06-20 04:44:01,818] Trial 15 finished with value: 336.2541809082031 and parameters: {'num_heads': 3, 'model_dim': 69, 'num_layers': 1, 'dropout': 0.47370531900823276, 'learning_rate': 0.00451933294804767, 'weight_decay': 4.2605381492632355e-05, 'num_epochs': 80}. Best is trial 11 with value: 300.966064453125.


Trial: 15 - Loss: 14.740933418273926 - Val Loss: 336.2541809082031


[I 2024-06-20 04:44:05,581] Trial 16 finished with value: 328.3572692871094 and parameters: {'num_heads': 2, 'model_dim': 56, 'num_layers': 2, 'dropout': 0.4258453846282542, 'learning_rate': 0.003976241408019705, 'weight_decay': 1.0308844049617297e-05, 'num_epochs': 83}. Best is trial 11 with value: 300.966064453125.


Trial: 16 - Loss: 20.73383903503418 - Val Loss: 328.3572692871094


[I 2024-06-20 04:44:09,062] Trial 17 finished with value: 811.5722045898438 and parameters: {'num_heads': 2, 'model_dim': 12, 'num_layers': 2, 'dropout': 0.4081816252728404, 'learning_rate': 0.000864388340580338, 'weight_decay': 1.1883514310171142e-05, 'num_epochs': 88}. Best is trial 11 with value: 300.966064453125.


Trial: 17 - Loss: 349.6107482910156 - Val Loss: 811.5722045898438


[I 2024-06-20 04:44:11,082] Trial 18 finished with value: 438.390380859375 and parameters: {'num_heads': 2, 'model_dim': 36, 'num_layers': 2, 'dropout': 0.40831682381134393, 'learning_rate': 0.0038818523942623757, 'weight_decay': 3.498113872480083e-05, 'num_epochs': 47}. Best is trial 11 with value: 300.966064453125.


Trial: 18 - Loss: 142.5735626220703 - Val Loss: 438.390380859375


[I 2024-06-20 04:44:14,395] Trial 19 finished with value: 758.754638671875 and parameters: {'num_heads': 1, 'model_dim': 24, 'num_layers': 3, 'dropout': 0.3603466616049841, 'learning_rate': 0.0008460814776744224, 'weight_decay': 1.0662977091536462e-05, 'num_epochs': 83}. Best is trial 11 with value: 300.966064453125.


Trial: 19 - Loss: 316.87481689453125 - Val Loss: 758.754638671875


[I 2024-06-20 04:44:22,934] Trial 20 finished with value: 228.31983947753906 and parameters: {'num_heads': 3, 'model_dim': 54, 'num_layers': 4, 'dropout': 0.3228494095923632, 'learning_rate': 0.003047258572434952, 'weight_decay': 0.00020641330002257253, 'num_epochs': 71}. Best is trial 20 with value: 228.31983947753906.


Trial: 20 - Loss: 57.407440185546875 - Val Loss: 228.31983947753906


[I 2024-06-20 04:44:31,550] Trial 21 finished with value: 254.90924072265625 and parameters: {'num_heads': 3, 'model_dim': 51, 'num_layers': 4, 'dropout': 0.31843559298557017, 'learning_rate': 0.003006277708194012, 'weight_decay': 0.00022384310488735704, 'num_epochs': 70}. Best is trial 20 with value: 228.31983947753906.


Trial: 21 - Loss: 64.98091888427734 - Val Loss: 254.90924072265625


[I 2024-06-20 04:44:40,106] Trial 22 finished with value: 365.9410095214844 and parameters: {'num_heads': 3, 'model_dim': 51, 'num_layers': 4, 'dropout': 0.31508719977765265, 'learning_rate': 0.0022999211629547416, 'weight_decay': 0.00022095028308781543, 'num_epochs': 70}. Best is trial 20 with value: 228.31983947753906.


Trial: 22 - Loss: 105.97310638427734 - Val Loss: 365.9410095214844


[I 2024-06-20 04:44:48,918] Trial 23 finished with value: 582.1925659179688 and parameters: {'num_heads': 3, 'model_dim': 75, 'num_layers': 5, 'dropout': 0.3318732618952987, 'learning_rate': 0.0011693304118209505, 'weight_decay': 0.00029637646728211984, 'num_epochs': 54}. Best is trial 20 with value: 228.31983947753906.


Trial: 23 - Loss: 215.77381896972656 - Val Loss: 582.1925659179688


[I 2024-06-20 04:45:00,433] Trial 24 finished with value: 687.0759887695312 and parameters: {'num_heads': 4, 'model_dim': 92, 'num_layers': 4, 'dropout': 0.27085389774497903, 'learning_rate': 0.00041080980023786126, 'weight_decay': 0.00011371790686247332, 'num_epochs': 72}. Best is trial 20 with value: 228.31983947753906.


Trial: 24 - Loss: 274.8428649902344 - Val Loss: 687.0759887695312


In [19]:
# Results
print(f'Número de pruebas: {len(study_g.trials)}')
trial = study_g.best_trial
print(f'Mejor prueba: {trial.number}')
print(f'Mejores parametros: {trial.params}')
print(f'Mejor valor de pérdida: {trial.value}')

Número de pruebas: 25
Mejor prueba: 20
Mejores parametros: {'num_heads': 3, 'model_dim': 54, 'num_layers': 4, 'dropout': 0.3228494095923632, 'learning_rate': 0.003047258572434952, 'weight_decay': 0.00020641330002257253, 'num_epochs': 71}
Mejor valor de pérdida: 228.31983947753906


## Single Thread

In [20]:
# configuration optuna
study_st = optuna.create_study(direction='minimize')
study_st.optimize(lambda trial: objective(trial, X_st_train, y_st_train, X_st_test, y_st_test, len(features_st), len(target)), n_trials=n_trials)

[I 2024-06-20 04:45:00,450] A new study created in memory with name: no-name-4c79815d-a5ee-415b-96d3-4285fe6ef8a8
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
[I 2024-06-20 04:45:01,574] Trial 0 finished with value: 608.517578125 and parameters: {'num_heads': 1, 'model_dim': 64, 'num_layers': 4, 'dropout': 0.2933557518942793, 'learning_rate': 7.86120524107566e-05, 'weight_decay': 0.0009388583262466356, 'num_epochs': 23}. Best is trial 0 with value: 608.517578125.


Trial: 0 - Loss: 389.4541320800781 - Val Loss: 608.517578125


[I 2024-06-20 04:45:21,035] Trial 1 finished with value: 39.55455017089844 and parameters: {'num_heads': 7, 'model_dim': 140, 'num_layers': 4, 'dropout': 0.20452105085437045, 'learning_rate': 0.002707224949204147, 'weight_decay': 0.00018284144712738177, 'num_epochs': 96}. Best is trial 1 with value: 39.55455017089844.


Trial: 1 - Loss: 17.454801559448242 - Val Loss: 39.55455017089844


[I 2024-06-20 04:45:25,330] Trial 2 finished with value: 18.659408569335938 and parameters: {'num_heads': 5, 'model_dim': 245, 'num_layers': 4, 'dropout': 0.33568732964885606, 'learning_rate': 0.006292101996192198, 'weight_decay': 0.000120584928971844, 'num_epochs': 23}. Best is trial 2 with value: 18.659408569335938.


Trial: 2 - Loss: 33.558502197265625 - Val Loss: 18.659408569335938


[I 2024-06-20 04:45:32,974] Trial 3 finished with value: 621.1950073242188 and parameters: {'num_heads': 2, 'model_dim': 46, 'num_layers': 5, 'dropout': 0.4682323474652742, 'learning_rate': 1.626469433303478e-05, 'weight_decay': 0.0018620843146916926, 'num_epochs': 88}. Best is trial 2 with value: 18.659408569335938.


Trial: 3 - Loss: 399.81939697265625 - Val Loss: 621.1950073242188


[I 2024-06-20 04:45:34,301] Trial 4 finished with value: 619.0711059570312 and parameters: {'num_heads': 4, 'model_dim': 156, 'num_layers': 3, 'dropout': 0.10504813099048947, 'learning_rate': 3.42165282568478e-05, 'weight_decay': 2.59409820741719e-05, 'num_epochs': 13}. Best is trial 2 with value: 18.659408569335938.


Trial: 4 - Loss: 403.2465515136719 - Val Loss: 619.0711059570312


[I 2024-06-20 04:45:54,391] Trial 5 finished with value: 413.0791320800781 and parameters: {'num_heads': 6, 'model_dim': 96, 'num_layers': 6, 'dropout': 0.460308812685446, 'learning_rate': 0.0005174489631930271, 'weight_decay': 9.310560000294868e-05, 'num_epochs': 82}. Best is trial 2 with value: 18.659408569335938.


Trial: 5 - Loss: 240.99447631835938 - Val Loss: 413.0791320800781


[I 2024-06-20 04:45:55,431] Trial 6 finished with value: 631.3442993164062 and parameters: {'num_heads': 1, 'model_dim': 11, 'num_layers': 4, 'dropout': 0.14581893048562955, 'learning_rate': 0.0011773067978302053, 'weight_decay': 0.00048556634447020127, 'num_epochs': 25}. Best is trial 2 with value: 18.659408569335938.


Trial: 6 - Loss: 407.8698425292969 - Val Loss: 631.3442993164062


[I 2024-06-20 04:45:59,875] Trial 7 finished with value: 142.05450439453125 and parameters: {'num_heads': 5, 'model_dim': 230, 'num_layers': 2, 'dropout': 0.19910475561598365, 'learning_rate': 0.0011074014591464993, 'weight_decay': 0.0051116235056069045, 'num_epochs': 49}. Best is trial 2 with value: 18.659408569335938.


Trial: 7 - Loss: 61.07684326171875 - Val Loss: 142.05450439453125


[I 2024-06-20 04:46:07,993] Trial 8 finished with value: 263.0382385253906 and parameters: {'num_heads': 6, 'model_dim': 246, 'num_layers': 2, 'dropout': 0.44044705646546645, 'learning_rate': 0.0004027495548875436, 'weight_decay': 2.5120409061224292e-05, 'num_epochs': 78}. Best is trial 2 with value: 18.659408569335938.


Trial: 8 - Loss: 136.21200561523438 - Val Loss: 263.0382385253906


[I 2024-06-20 04:46:09,267] Trial 9 finished with value: 606.954345703125 and parameters: {'num_heads': 3, 'model_dim': 99, 'num_layers': 3, 'dropout': 0.474244929190065, 'learning_rate': 7.490898617241934e-05, 'weight_decay': 0.001297078443490064, 'num_epochs': 16}. Best is trial 2 with value: 18.659408569335938.


Trial: 9 - Loss: 390.3738708496094 - Val Loss: 606.954345703125


[I 2024-06-20 04:46:12,657] Trial 10 finished with value: 42.939849853515625 and parameters: {'num_heads': 8, 'model_dim': 424, 'num_layers': 1, 'dropout': 0.3485267367019958, 'learning_rate': 0.007429507608324253, 'weight_decay': 0.00011750911678509627, 'num_epochs': 44}. Best is trial 2 with value: 18.659408569335938.


Trial: 10 - Loss: 17.74909210205078 - Val Loss: 42.939849853515625


[I 2024-06-20 04:46:47,122] Trial 11 finished with value: 34.231449127197266 and parameters: {'num_heads': 8, 'model_dim': 352, 'num_layers': 5, 'dropout': 0.2569168303546118, 'learning_rate': 0.006200191763417153, 'weight_decay': 0.00018236249082717232, 'num_epochs': 100}. Best is trial 2 with value: 18.659408569335938.


Trial: 11 - Loss: 17.531951904296875 - Val Loss: 34.231449127197266


[I 2024-06-20 04:47:17,175] Trial 12 finished with value: 34.21455764770508 and parameters: {'num_heads': 8, 'model_dim': 392, 'num_layers': 6, 'dropout': 0.31817382376513564, 'learning_rate': 0.005748042620873319, 'weight_decay': 5.175563196759787e-05, 'num_epochs': 69}. Best is trial 2 with value: 18.659408569335938.


Trial: 12 - Loss: 17.477529525756836 - Val Loss: 34.21455764770508


[I 2024-06-20 04:47:30,804] Trial 13 finished with value: 35.45481491088867 and parameters: {'num_heads': 4, 'model_dim': 192, 'num_layers': 6, 'dropout': 0.3658978311268311, 'learning_rate': 0.003514792446328253, 'weight_decay': 4.994134718528283e-05, 'num_epochs': 65}. Best is trial 2 with value: 18.659408569335938.


Trial: 13 - Loss: 17.551250457763672 - Val Loss: 35.45481491088867


[I 2024-06-20 04:47:40,915] Trial 14 finished with value: 38.944374084472656 and parameters: {'num_heads': 6, 'model_dim': 294, 'num_layers': 5, 'dropout': 0.3774192420819211, 'learning_rate': 0.009953069617971707, 'weight_decay': 1.1077598482423151e-05, 'num_epochs': 36}. Best is trial 2 with value: 18.659408569335938.


Trial: 14 - Loss: 17.644519805908203 - Val Loss: 38.944374084472656


[I 2024-06-20 04:48:08,021] Trial 15 finished with value: 38.55120849609375 and parameters: {'num_heads': 7, 'model_dim': 441, 'num_layers': 6, 'dropout': 0.31390960870946033, 'learning_rate': 0.0017576653730120926, 'weight_decay': 0.00039661115370031826, 'num_epochs': 63}. Best is trial 2 with value: 18.659408569335938.


Trial: 15 - Loss: 17.46663475036621 - Val Loss: 38.55120849609375


[I 2024-06-20 04:48:24,039] Trial 16 finished with value: 418.0400390625 and parameters: {'num_heads': 5, 'model_dim': 275, 'num_layers': 5, 'dropout': 0.3927826545392735, 'learning_rate': 0.0001972620050624684, 'weight_decay': 5.140354404055404e-05, 'num_epochs': 66}. Best is trial 2 with value: 18.659408569335938.


Trial: 16 - Loss: 245.39646911621094 - Val Loss: 418.0400390625


[I 2024-06-20 04:48:27,002] Trial 17 finished with value: 49.122005462646484 and parameters: {'num_heads': 3, 'model_dim': 126, 'num_layers': 3, 'dropout': 0.27456127246675976, 'learning_rate': 0.003883991188851359, 'weight_decay': 1.1078361121460985e-05, 'num_epochs': 36}. Best is trial 2 with value: 18.659408569335938.


Trial: 17 - Loss: 19.22475242614746 - Val Loss: 49.122005462646484


[I 2024-06-20 04:48:30,592] Trial 18 finished with value: 113.87181854248047 and parameters: {'num_heads': 7, 'model_dim': 364, 'num_layers': 1, 'dropout': 0.23741358515515082, 'learning_rate': 0.0007214493562478741, 'weight_decay': 4.8586900785702396e-05, 'num_epochs': 54}. Best is trial 2 with value: 18.659408569335938.


Trial: 18 - Loss: 48.39320373535156 - Val Loss: 113.87181854248047


[I 2024-06-20 04:49:07,872] Trial 19 finished with value: 259.0928955078125 and parameters: {'num_heads': 8, 'model_dim': 488, 'num_layers': 6, 'dropout': 0.4163140940204095, 'learning_rate': 0.00020703163439388363, 'weight_decay': 0.00024857557068670906, 'num_epochs': 77}. Best is trial 2 with value: 18.659408569335938.


Trial: 19 - Loss: 132.1822509765625 - Val Loss: 259.0928955078125


[I 2024-06-20 04:49:09,327] Trial 20 finished with value: 395.5486755371094 and parameters: {'num_heads': 3, 'model_dim': 72, 'num_layers': 2, 'dropout': 0.32679186980647634, 'learning_rate': 0.0021601812582834803, 'weight_decay': 0.008449206708457358, 'num_epochs': 30}. Best is trial 2 with value: 18.659408569335938.


Trial: 20 - Loss: 233.73207092285156 - Val Loss: 395.5486755371094


[I 2024-06-20 04:49:43,456] Trial 21 finished with value: 34.26642608642578 and parameters: {'num_heads': 8, 'model_dim': 328, 'num_layers': 5, 'dropout': 0.25091125857516705, 'learning_rate': 0.005662106559255401, 'weight_decay': 0.00011183326366271925, 'num_epochs': 99}. Best is trial 2 with value: 18.659408569335938.


Trial: 21 - Loss: 17.505468368530273 - Val Loss: 34.26642608642578


[I 2024-06-20 04:50:07,222] Trial 22 finished with value: 33.20869827270508 and parameters: {'num_heads': 7, 'model_dim': 364, 'num_layers': 5, 'dropout': 0.27460440960698446, 'learning_rate': 0.006190145480484268, 'weight_decay': 0.0001898122425482862, 'num_epochs': 73}. Best is trial 2 with value: 18.659408569335938.


Trial: 22 - Loss: 17.552021026611328 - Val Loss: 33.20869827270508


[I 2024-06-20 04:50:26,542] Trial 23 finished with value: 34.671043395996094 and parameters: {'num_heads': 7, 'model_dim': 392, 'num_layers': 4, 'dropout': 0.3331321123864456, 'learning_rate': 0.003970774995179344, 'weight_decay': 0.0007031447975448214, 'num_epochs': 70}. Best is trial 2 with value: 18.659408569335938.


Trial: 23 - Loss: 17.545364379882812 - Val Loss: 34.671043395996094


[I 2024-06-20 04:50:43,253] Trial 24 finished with value: 36.572547912597656 and parameters: {'num_heads': 6, 'model_dim': 306, 'num_layers': 5, 'dropout': 0.2935946358334059, 'learning_rate': 0.0013868340757457137, 'weight_decay': 7.154081423987044e-05, 'num_epochs': 57}. Best is trial 2 with value: 18.659408569335938.


Trial: 24 - Loss: 17.559967041015625 - Val Loss: 36.572547912597656


In [21]:
# Results
print(f'Número de pruebas: {len(study_st.trials)}')
trial = study_st.best_trial
print(f'Mejor prueba: {trial.number}')
print(f'Mejores parametros: {trial.params}')
print(f'Mejor valor de pérdida en validación: {trial.value}')

Número de pruebas: 25
Mejor prueba: 2
Mejores parametros: {'num_heads': 5, 'model_dim': 245, 'num_layers': 4, 'dropout': 0.33568732964885606, 'learning_rate': 0.006292101996192198, 'weight_decay': 0.000120584928971844, 'num_epochs': 23}
Mejor valor de pérdida en validación: 18.659408569335938


## Multi Thread

In [22]:
# configuration optuna
study_mm = optuna.create_study(direction='minimize')
study_mm.optimize(lambda trial: objective(trial, X_mm_train, y_mm_train, X_mm_test, y_mm_test, len(features), len(target)), n_trials=n_trials)

[I 2024-06-20 04:50:43,268] A new study created in memory with name: no-name-61681b69-8d1c-449c-b50b-a0465cec8b60
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
[I 2024-06-20 04:50:45,874] Trial 0 finished with value: 638.0918579101562 and parameters: {'num_heads': 5, 'model_dim': 190, 'num_layers': 4, 'dropout': 0.13864499293250457, 'learning_rate': 0.003500746199850497, 'weight_decay': 0.005625796757427672, 'num_epochs': 44}. Best is trial 0 with value: 638.0918579101562.


Trial: 0 - Loss: 113.08677673339844 - Val Loss: 638.0918579101562


[I 2024-06-20 04:50:46,739] Trial 1 finished with value: 928.3875122070312 and parameters: {'num_heads': 3, 'model_dim': 156, 'num_layers': 1, 'dropout': 0.42332718762588695, 'learning_rate': 0.0006234509490345264, 'weight_decay': 5.2549738725678564e-05, 'num_epochs': 83}. Best is trial 0 with value: 638.0918579101562.


Trial: 1 - Loss: 133.14291381835938 - Val Loss: 928.3875122070312


[I 2024-06-20 04:50:50,793] Trial 2 finished with value: 748.2271728515625 and parameters: {'num_heads': 7, 'model_dim': 329, 'num_layers': 4, 'dropout': 0.3930102912349217, 'learning_rate': 0.0008304519402535401, 'weight_decay': 4.497279652035178e-05, 'num_epochs': 46}. Best is trial 0 with value: 638.0918579101562.


Trial: 2 - Loss: 118.16265869140625 - Val Loss: 748.2271728515625


[I 2024-06-20 04:50:51,432] Trial 3 finished with value: 1549.0987548828125 and parameters: {'num_heads': 5, 'model_dim': 205, 'num_layers': 1, 'dropout': 0.36124547775864013, 'learning_rate': 3.349898754824614e-05, 'weight_decay': 0.005336632598845758, 'num_epochs': 41}. Best is trial 0 with value: 638.0918579101562.


Trial: 3 - Loss: 304.6725769042969 - Val Loss: 1549.0987548828125


[I 2024-06-20 04:50:52,945] Trial 4 finished with value: 1009.1809692382812 and parameters: {'num_heads': 8, 'model_dim': 240, 'num_layers': 2, 'dropout': 0.22001832514644704, 'learning_rate': 0.0008351030591335075, 'weight_decay': 0.00019016461914097887, 'num_epochs': 36}. Best is trial 0 with value: 638.0918579101562.


Trial: 4 - Loss: 157.91567993164062 - Val Loss: 1009.1809692382812


[I 2024-06-20 04:50:58,631] Trial 5 finished with value: 1407.7197265625 and parameters: {'num_heads': 4, 'model_dim': 208, 'num_layers': 5, 'dropout': 0.4525993456752493, 'learning_rate': 6.55223377688456e-05, 'weight_decay': 2.5826696297999764e-05, 'num_epochs': 88}. Best is trial 0 with value: 638.0918579101562.


Trial: 5 - Loss: 261.4542236328125 - Val Loss: 1407.7197265625


[I 2024-06-20 04:51:03,985] Trial 6 finished with value: 1454.2967529296875 and parameters: {'num_heads': 4, 'model_dim': 232, 'num_layers': 6, 'dropout': 0.1265987992897385, 'learning_rate': 1.7199836200223135e-05, 'weight_decay': 0.002519401795074349, 'num_epochs': 66}. Best is trial 0 with value: 638.0918579101562.


Trial: 6 - Loss: 276.430908203125 - Val Loss: 1454.2967529296875


[I 2024-06-20 04:51:04,748] Trial 7 finished with value: 1403.527587890625 and parameters: {'num_heads': 7, 'model_dim': 406, 'num_layers': 3, 'dropout': 0.3474997397628343, 'learning_rate': 0.00016659873728528388, 'weight_decay': 0.006773688208448841, 'num_epochs': 10}. Best is trial 0 with value: 638.0918579101562.


Trial: 7 - Loss: 262.3399353027344 - Val Loss: 1403.527587890625


[I 2024-06-20 04:51:06,980] Trial 8 finished with value: 1531.0653076171875 and parameters: {'num_heads': 2, 'model_dim': 52, 'num_layers': 4, 'dropout': 0.31343363586498457, 'learning_rate': 4.4034043478003545e-05, 'weight_decay': 4.2818150326115666e-05, 'num_epochs': 92}. Best is trial 0 with value: 638.0918579101562.


Trial: 8 - Loss: 302.3449401855469 - Val Loss: 1531.0653076171875


[I 2024-06-20 04:51:15,604] Trial 9 finished with value: 1410.624755859375 and parameters: {'num_heads': 7, 'model_dim': 196, 'num_layers': 5, 'dropout': 0.27823455144670584, 'learning_rate': 5.5510409464804553e-05, 'weight_decay': 0.005186273034694741, 'num_epochs': 95}. Best is trial 0 with value: 638.0918579101562.


Trial: 9 - Loss: 262.32928466796875 - Val Loss: 1410.624755859375


[I 2024-06-20 04:51:15,824] Trial 10 finished with value: 1547.143798828125 and parameters: {'num_heads': 1, 'model_dim': 4, 'num_layers': 3, 'dropout': 0.10332173797572328, 'learning_rate': 0.008607713294475641, 'weight_decay': 0.000840315572416948, 'num_epochs': 18}. Best is trial 0 with value: 638.0918579101562.


Trial: 10 - Loss: 309.6009826660156 - Val Loss: 1547.143798828125


[I 2024-06-20 04:51:19,996] Trial 11 finished with value: 619.1051025390625 and parameters: {'num_heads': 6, 'model_dim': 324, 'num_layers': 4, 'dropout': 0.21312737393068878, 'learning_rate': 0.0029055367715234283, 'weight_decay': 0.00020628979964609553, 'num_epochs': 53}. Best is trial 11 with value: 619.1051025390625.


Trial: 11 - Loss: 112.99082946777344 - Val Loss: 619.1051025390625


[I 2024-06-20 04:51:23,823] Trial 12 finished with value: 627.4741821289062 and parameters: {'num_heads': 5, 'model_dim': 110, 'num_layers': 5, 'dropout': 0.195655987964487, 'learning_rate': 0.005640510047340704, 'weight_decay': 0.00030250851477102274, 'num_epochs': 60}. Best is trial 11 with value: 619.1051025390625.


Trial: 12 - Loss: 113.03582000732422 - Val Loss: 627.4741821289062


[I 2024-06-20 04:51:29,501] Trial 13 finished with value: 621.4251098632812 and parameters: {'num_heads': 6, 'model_dim': 120, 'num_layers': 6, 'dropout': 0.2058716845127128, 'learning_rate': 0.0031649926905662814, 'weight_decay': 0.00019997284534482477, 'num_epochs': 65}. Best is trial 11 with value: 619.1051025390625.


Trial: 13 - Loss: 113.03530883789062 - Val Loss: 621.4251098632812


[I 2024-06-20 04:51:38,420] Trial 14 finished with value: 619.0458984375 and parameters: {'num_heads': 6, 'model_dim': 300, 'num_layers': 6, 'dropout': 0.22690077471334974, 'learning_rate': 0.0021704500771642494, 'weight_decay': 0.0001150914048443582, 'num_epochs': 77}. Best is trial 14 with value: 619.0458984375.


Trial: 14 - Loss: 112.88542175292969 - Val Loss: 619.0458984375


[I 2024-06-20 04:51:47,226] Trial 15 finished with value: 625.4734497070312 and parameters: {'num_heads': 6, 'model_dim': 300, 'num_layers': 6, 'dropout': 0.2616202940683793, 'learning_rate': 0.001818679324859779, 'weight_decay': 0.0008377745537859545, 'num_epochs': 76}. Best is trial 14 with value: 619.0458984375.


Trial: 15 - Loss: 112.69718933105469 - Val Loss: 625.4734497070312


[I 2024-06-20 04:51:51,138] Trial 16 finished with value: 1022.1359252929688 and parameters: {'num_heads': 8, 'model_dim': 392, 'num_layers': 2, 'dropout': 0.17256971984492706, 'learning_rate': 0.00021477096744578373, 'weight_decay': 1.0220754764012422e-05, 'num_epochs': 75}. Best is trial 14 with value: 619.0458984375.


Trial: 16 - Loss: 155.2539520263672 - Val Loss: 1022.1359252929688


[I 2024-06-20 04:51:56,356] Trial 17 finished with value: 625.884033203125 and parameters: {'num_heads': 6, 'model_dim': 312, 'num_layers': 5, 'dropout': 0.2352263387293546, 'learning_rate': 0.0016332884354667244, 'weight_decay': 0.00010791947995618602, 'num_epochs': 54}. Best is trial 14 with value: 619.0458984375.


Trial: 17 - Loss: 112.91793823242188 - Val Loss: 625.884033203125


[I 2024-06-20 04:51:57,030] Trial 18 finished with value: 1484.7467041015625 and parameters: {'num_heads': 3, 'model_dim': 78, 'num_layers': 3, 'dropout': 0.31192068447848387, 'learning_rate': 0.0004200692831219727, 'weight_decay': 0.0006723131618781763, 'num_epochs': 25}. Best is trial 14 with value: 619.0458984375.


Trial: 18 - Loss: 287.6164855957031 - Val Loss: 1484.7467041015625


[I 2024-06-20 04:52:05,515] Trial 19 finished with value: 617.5748901367188 and parameters: {'num_heads': 6, 'model_dim': 270, 'num_layers': 6, 'dropout': 0.15722370276707742, 'learning_rate': 0.0020518283656019336, 'weight_decay': 0.000102427676997386, 'num_epochs': 75}. Best is trial 19 with value: 617.5748901367188.


Trial: 19 - Loss: 112.92919921875 - Val Loss: 617.5748901367188


[I 2024-06-20 04:52:17,977] Trial 20 finished with value: 629.0552978515625 and parameters: {'num_heads': 7, 'model_dim': 273, 'num_layers': 6, 'dropout': 0.1607417132232227, 'learning_rate': 0.0013719634491665671, 'weight_decay': 9.164713169376434e-05, 'num_epochs': 100}. Best is trial 19 with value: 617.5748901367188.


Trial: 20 - Loss: 112.99951934814453 - Val Loss: 629.0552978515625


[I 2024-06-20 04:52:25,331] Trial 21 finished with value: 607.7455444335938 and parameters: {'num_heads': 6, 'model_dim': 336, 'num_layers': 5, 'dropout': 0.25241691549181033, 'learning_rate': 0.00987077757680359, 'weight_decay': 0.00011251132452506049, 'num_epochs': 74}. Best is trial 21 with value: 607.7455444335938.


Trial: 21 - Loss: 112.89176177978516 - Val Loss: 607.7455444335938


[I 2024-06-20 04:52:34,918] Trial 22 finished with value: 612.9329223632812 and parameters: {'num_heads': 6, 'model_dim': 354, 'num_layers': 6, 'dropout': 0.25939352884707323, 'learning_rate': 0.009746355433058315, 'weight_decay': 8.323531777315702e-05, 'num_epochs': 76}. Best is trial 21 with value: 607.7455444335938.


Trial: 22 - Loss: 112.95305633544922 - Val Loss: 612.9329223632812


[I 2024-06-20 04:52:40,912] Trial 23 finished with value: 618.1771240234375 and parameters: {'num_heads': 5, 'model_dim': 275, 'num_layers': 5, 'dropout': 0.2706695732594715, 'learning_rate': 0.009971955672038523, 'weight_decay': 1.8403966093973846e-05, 'num_epochs': 70}. Best is trial 21 with value: 607.7455444335938.


Trial: 23 - Loss: 112.99073028564453 - Val Loss: 618.1771240234375


[I 2024-06-20 04:52:47,722] Trial 24 finished with value: 613.8899536132812 and parameters: {'num_heads': 4, 'model_dim': 240, 'num_layers': 6, 'dropout': 0.16909945254996686, 'learning_rate': 0.0052434734982383345, 'weight_decay': 0.00040634519986640236, 'num_epochs': 83}. Best is trial 21 with value: 607.7455444335938.


Trial: 24 - Loss: 112.92240142822266 - Val Loss: 613.8899536132812


In [23]:
# Results
print(f'Trials quantity: {len(study_mm.trials)}')
trial = study_mm.best_trial
print(f'Mejor prueba: {trial.number}')
print(f'Mejores parametros: {trial.params}')
print(f'Mejor valor de pérdida en validación: {trial.value}')

Trials quantity: 25
Mejor prueba: 21
Mejores parametros: {'num_heads': 6, 'model_dim': 336, 'num_layers': 5, 'dropout': 0.25241691549181033, 'learning_rate': 0.00987077757680359, 'weight_decay': 0.00011251132452506049, 'num_epochs': 74}
Mejor valor de pérdida en validación: 607.7455444335938


# Training

In [24]:
output_dim = 1

## General

In [25]:
input_dim = len(features)
# hyperparameters
num_heads = study_g.best_trial.params['num_heads']
model_dim = study_g.best_trial.params['model_dim']
num_layers = study_g.best_trial.params['num_layers']
dropout = study_g.best_trial.params['dropout']
lr = study_g.best_trial.params['learning_rate']
wd = study_g.best_trial.params['weight_decay']
num_epochs = study_g.best_trial.params['num_epochs']

study_g.best_trial.params

{'num_heads': 3,
 'model_dim': 54,
 'num_layers': 4,
 'dropout': 0.3228494095923632,
 'learning_rate': 0.003047258572434952,
 'weight_decay': 0.00020641330002257253,
 'num_epochs': 71}

In [26]:
# general model initialization
model_g = TransformerModel(input_dim, model_dim, num_heads, num_layers, output_dim, dropout)
if DEVICE.type == 'cuda':
	model_g = model_g.to(DEVICE)
criterion_g = nn.MSELoss()
optimizer_g = optim.AdamW(model_g.parameters(), lr=lr, weight_decay=wd)

model_g.train()

for epoch in range(num_epochs):
	optimizer_g.zero_grad()
	output = model_g(X_g_train)
	loss = criterion_g(output, y_g_train)
	loss.backward()
	optimizer_g.step()
	# validation
	if (epoch+1) % 10 == 0 or epoch == num_epochs-1:
		model_g.eval()
		with torch.no_grad():
			val_predictions = model_g(X_g_test)
			val_loss = criterion_g(val_predictions, y_g_test)
		print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Val Loss: {val_loss.item()}')
		model_g.train()

Epoch 10/71, Loss: 239.3172149658203, Val Loss: 613.6309814453125
Epoch 20/71, Loss: 190.65248107910156, Val Loss: 524.8025512695312
Epoch 30/71, Loss: 142.29408264160156, Val Loss: 434.852294921875
Epoch 40/71, Loss: 101.49819946289062, Val Loss: 348.8166198730469
Epoch 50/71, Loss: 72.3261489868164, Val Loss: 274.54254150390625
Epoch 60/71, Loss: 55.270992279052734, Val Loss: 216.94171142578125
Epoch 70/71, Loss: 48.282691955566406, Val Loss: 178.2835235595703
Epoch 71/71, Loss: 47.95528030395508, Val Loss: 175.4420928955078


## Single Thread

In [27]:
input_dim = len(features_st)
# hyperparameters
num_heads = study_st.best_trial.params['num_heads']
model_dim = study_st.best_trial.params['model_dim']
num_layers = study_st.best_trial.params['num_layers']
dropout = study_st.best_trial.params['dropout']
lr = study_st.best_trial.params['learning_rate']
wd = study_st.best_trial.params['weight_decay']
num_epochs = study_st.best_trial.params['num_epochs']
study_st.best_trial.params

{'num_heads': 5,
 'model_dim': 245,
 'num_layers': 4,
 'dropout': 0.33568732964885606,
 'learning_rate': 0.006292101996192198,
 'weight_decay': 0.000120584928971844,
 'num_epochs': 23}

In [28]:
# single thread model initialization
model_st = TransformerModel(input_dim, model_dim, num_heads, num_layers, output_dim, dropout)
if DEVICE.type == 'cuda':
	model_st = model_st.to(DEVICE)
criterion_st = nn.MSELoss()
optimizer_st = optim.AdamW(model_st.parameters(), lr=lr, weight_decay=wd)

model_st.train()

for epoch in range(num_epochs):
	optimizer_st.zero_grad()
	output = model_st(X_st_train)
	loss = criterion_st(output, y_st_train)
	loss.backward()
	optimizer_st.step()
	# validation
	if (epoch+1) % 10 == 0 or epoch == num_epochs-1:
		model_st.eval()
		with torch.no_grad():
			val_predictions = model_st(X_st_test)
			val_loss = criterion_st(val_predictions, y_st_test)
		print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Val Loss: {val_loss.item()}')
		model_st.train()

Epoch 10/23, Loss: 19.4830379486084, Val Loss: 36.49971008300781
Epoch 20/23, Loss: 22.07948875427246, Val Loss: 28.931926727294922
Epoch 23/23, Loss: 20.814666748046875, Val Loss: 56.268009185791016


## Multi Thread

In [29]:
input_dim = len(features)
# hyperparameters
num_heads = study_mm.best_trial.params['num_heads']
model_dim = study_mm.best_trial.params['model_dim']
num_layers = study_mm.best_trial.params['num_layers']
dropout = study_mm.best_trial.params['dropout']
lr = study_mm.best_trial.params['learning_rate']
wd = study_mm.best_trial.params['weight_decay']
num_epochs = study_mm.best_trial.params['num_epochs']

study_mm.best_trial.params

{'num_heads': 6,
 'model_dim': 336,
 'num_layers': 5,
 'dropout': 0.25241691549181033,
 'learning_rate': 0.00987077757680359,
 'weight_decay': 0.00011251132452506049,
 'num_epochs': 74}

In [30]:
# multi thread model initialization
model_mm = TransformerModel(input_dim, model_dim, num_heads, num_layers, output_dim, dropout)
if DEVICE.type == 'cuda':
	model_mm = model_mm.to(DEVICE)
criterion_mm = nn.MSELoss()
optimizer_mm = optim.AdamW(model_mm.parameters(), lr=lr, weight_decay=wd)

model_mm.train()

for epoch in range(num_epochs):
	optimizer_mm.zero_grad()
	output = model_mm(X_mm_train)
	loss = criterion_mm(output, y_mm_train)
	loss.backward()
	optimizer_mm.step()
	# validation
	if (epoch+1) % 10 == 0 or epoch == num_epochs-1:
		model_mm.eval()
		with torch.no_grad():
			val_predictions = model_mm(X_mm_test)
			val_loss = criterion_mm(val_predictions, y_mm_test)
		print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Val Loss: {val_loss.item()}')
		model_mm.train()

Epoch 10/74, Loss: 113.2838363647461, Val Loss: 637.9877319335938
Epoch 20/74, Loss: 113.93785095214844, Val Loss: 561.0780029296875
Epoch 30/74, Loss: 112.93811798095703, Val Loss: 608.9549560546875
Epoch 40/74, Loss: 113.05597686767578, Val Loss: 631.4415283203125
Epoch 50/74, Loss: 112.85626983642578, Val Loss: 618.8742065429688
Epoch 60/74, Loss: 113.09892272949219, Val Loss: 609.8304443359375
Epoch 70/74, Loss: 112.87665557861328, Val Loss: 609.0673828125
Epoch 74/74, Loss: 113.03215789794922, Val Loss: 615.2300415039062


# Conclusion
Queda trabajo que hacer en la red, además de conseguir más datos para un entrenamiento más robusto. Queda por ahora descartado el uso de solo un modelo para multi-threading y single-threading, ya que el modelo tiene más del triple de *loss*.