In [20]:
import optuna
from optuna.samplers import TPESampler
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import random
from sklearn.metrics import mean_squared_error, mean_absolute_error
#from joblib import dump

In [21]:
# CUDA
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE.type

'cuda'

In [22]:
# Fix random seed
seed = 42
torch.manual_seed(seed)
if DEVICE.type == 'cuda':
	torch.cuda.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
sampler = TPESampler(seed=seed)

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Pre-processing input data

In [23]:
def bits_to_MiB(row):
	# verify if has string ' MiB'
	if 'MiB' in str(row):
		row = row.replace(' MiB', '')
		row = float(row)
	else:
		row = float(row) / np.power(2, 20)
	return row


def MHz_to_GHz(row):
	# verify if has string ' GHz'
	if 'GHz' in str(row):
		row = row.replace(' GHz', '')
		# convert to float
		row = float(row)
	else:
		row = row.replace(' MHz', '')
		row = float(row) / 1000
	return row

In [24]:
results_df = pd.read_csv('../results_new/execution_time.csv')
results_savio_df = pd.read_csv('../results_savio_new/execution_time.csv')
results_df = pd.concat([results_df, results_savio_df], ignore_index=True)
# preprocessing
results_df['total_cpu_usage'] = results_df['total_cpu_usage'].str.replace('%', '').astype(float) / 100
results_df['max_ram_usage'] = results_df['max_ram_usage'] / 1024
results_df['l2_cache_size'] = results_df['l2_cache_size'].apply(bits_to_MiB)
results_df['l3_cache_size'] = results_df['l3_cache_size'].apply(bits_to_MiB)
results_df['ghz_actual_friendly'] = results_df['hz_actual_friendly'].apply(MHz_to_GHz)
results_df['ghz_advertised_friendly'] = results_df['hz_advertised_friendly'].str.replace('GHz', '').astype(float)
results_df = results_df.drop(columns=['hz_actual_friendly', 'hz_advertised_friendly', 'arch', 'vendor_id_raw'])

In [25]:
# Make the target dataset
target_df = results_df[['total_time', 'brand_raw', 'count', 'l2_cache_size', 'l3_cache_size', 'l2_cache_line_size', 'l2_cache_associativity', 'ghz_advertised_friendly', 'benchmark']].copy()
# Rename columns to *_target
target_df = target_df.rename(columns={
    'total_time': 'total_time_target',
    'brand_raw': 'brand_raw_target',
    'count': 'count_target',
    'l2_cache_size': 'l2_cache_size_target',
    'l3_cache_size': 'l3_cache_size_target',
    'l2_cache_line_size': 'l2_cache_line_size_target',
    'l2_cache_associativity': 'l2_cache_associativity_target',
    'ghz_advertised_friendly': 'ghz_advertised_friendly_target',
})

dataset_df = pd.merge(results_df, target_df, how='inner', on='benchmark')
dataset_df = dataset_df[dataset_df['brand_raw'] != dataset_df['brand_raw_target']]
dataset_df.head(2)

Unnamed: 0,total_time,total_cpu_usage,max_ram_usage,brand_raw,count,l2_cache_size,l3_cache_size,l2_cache_line_size,l2_cache_associativity,benchmark,ghz_actual_friendly,ghz_advertised_friendly,total_time_target,brand_raw_target,count_target,l2_cache_size_target,l3_cache_size_target,l2_cache_line_size_target,l2_cache_associativity_target,ghz_advertised_friendly_target
5,13.47,0.99,1436.714844,Intel(R) Core(TM) i5-10400 CPU @ 2.90GHz,12,1.5,12.0,256,6,KNP,4.1729,2.9,45.91,13th Gen Intel(R) Core(TM) i5-1335U,12,7.5,12.0,1280,7,2.496
6,13.47,0.99,1436.714844,Intel(R) Core(TM) i5-10400 CPU @ 2.90GHz,12,1.5,12.0,256,6,KNP,4.1729,2.9,25.77,13th Gen Intel(R) Core(TM) i5-1335U,12,7.5,12.0,1280,7,2.496


In [26]:
# remove one computer for testing
g_train = dataset_df[(dataset_df['brand_raw'] != '13th Gen Intel(R) Core(TM) i5-1335U') & (dataset_df['brand_raw_target'] != '13th Gen Intel(R) Core(TM) i5-1335U')]
g_test = dataset_df[dataset_df['brand_raw_target'] == '13th Gen Intel(R) Core(TM) i5-1335U']

In [27]:
mm_df = dataset_df[dataset_df['benchmark'].isin(['MATRIX_MULT', 'MATRIX_MULT2', 'MATRIX_MULT3'])]
# remove one computer for testing
mm_train = mm_df[(mm_df['brand_raw'] != '13th Gen Intel(R) Core(TM) i5-1335U') & (mm_df['brand_raw_target'] != '13th Gen Intel(R) Core(TM) i5-1335U')]
mm_test = mm_df[mm_df['brand_raw_target'] == '13th Gen Intel(R) Core(TM) i5-1335U']

In [28]:
st_df = dataset_df[~dataset_df['benchmark'].isin(['MATRIX_MULT', 'MATRIX_MULT2', 'MATRIX_MULT3'])]
# remove one computer for testing
st_train = st_df[(st_df['brand_raw'] != '13th Gen Intel(R) Core(TM) i5-1335U') & (st_df['brand_raw_target'] != '13th Gen Intel(R) Core(TM) i5-1335U')]
st_test = st_df[st_df['brand_raw_target'] == '13th Gen Intel(R) Core(TM) i5-1335U']

In [29]:
# load test dataset
g_test = pd.read_csv('csv/g_test.csv')
st_test = pd.read_csv('csv/st_test.csv')
mm_test = pd.read_csv('csv/mm_test.csv')

In [30]:
target = 'total_time_target'
features = mm_test.columns.copy().drop(target).drop(['benchmark','brand_raw', 'brand_raw_target'])
features_st = features.copy().drop(['count', 'count_target'])

In [31]:
# general data
## split data
X_g_train = g_train[features]
y_g_train = g_train[target]

X_g_test = g_test[features]
y_g_test = g_test[target]

## normalize data
scaler_g = StandardScaler()
X_g_train = scaler_g.fit_transform(X_g_train)
X_g_test = scaler_g.transform(X_g_test)

## convert to tensor
X_g_train = torch.tensor(X_g_train, dtype=torch.float32).unsqueeze(1)
X_g_test = torch.tensor(X_g_test, dtype=torch.float32).unsqueeze(1)
y_g_train = torch.tensor(y_g_train.values, dtype=torch.float32).view(-1, 1)
y_g_test = torch.tensor(y_g_test.values, dtype=torch.float32).view(-1, 1)

In [32]:
# single thread data
## split data
X_st_train = st_train[features_st]
y_st_train = st_train[target]

X_st_test = st_test[features_st]
y_st_test = st_test[target]

## normalize data
scaler_st = StandardScaler()
X_st_train = scaler_st.fit_transform(X_st_train)
X_st_test = scaler_st.transform(X_st_test)

## convert to tensor
X_st_train = torch.tensor(X_st_train, dtype=torch.float32).unsqueeze(1)
X_st_test = torch.tensor(X_st_test, dtype=torch.float32).unsqueeze(1)
y_st_train = torch.tensor(y_st_train.values, dtype=torch.float32).view(-1, 1)
y_st_test = torch.tensor(y_st_test.values, dtype=torch.float32).view(-1, 1)

In [33]:
# multi thread data
## split data
X_mm_train = mm_train[features]
y_mm_train = mm_train[target]

X_mm_test = mm_test[features]
y_mm_test = mm_test[target]

## normalize data
scaler_mm = StandardScaler()
X_mm_train = scaler_mm.fit_transform(X_mm_train)
X_mm_test = scaler_mm.transform(X_mm_test)

## convert to tensor
X_mm_train = torch.tensor(X_mm_train, dtype=torch.float32).unsqueeze(1)
X_mm_test = torch.tensor(X_mm_test, dtype=torch.float32).unsqueeze(1)
y_mm_train = torch.tensor(y_mm_train.values, dtype=torch.float32).view(-1, 1)
y_mm_test = torch.tensor(y_mm_test.values, dtype=torch.float32).view(-1, 1)

In [34]:
if DEVICE.type == 'cuda':
	# move to DEVICE
	X_g_train = X_g_train.to(DEVICE)
	y_g_train = y_g_train.to(DEVICE)
	X_g_test = X_g_test.to(DEVICE)
	y_g_test = y_g_test.to(DEVICE)

	X_st_train = X_st_train.to(DEVICE)
	y_st_train = y_st_train.to(DEVICE)
	X_st_test = X_st_test.to(DEVICE)
	y_st_test = y_st_test.to(DEVICE)

	X_mm_train = X_mm_train.to(DEVICE)
	y_mm_train = y_mm_train.to(DEVICE)
	X_mm_test = X_mm_test.to(DEVICE)
	y_mm_test = y_mm_test.to(DEVICE)

# Model

In [35]:
class TransformerModel(nn.Module):
	def __init__(self, input_dim_numeric, embedding_dim, seq_length, num_heads, num_layers, output_dim):
		super(TransformerModel, self).__init__()
		# layers
		self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=embedding_dim, nhead=num_heads),
            num_layers=num_layers
        )
		self.fc_numeric = nn.Linear(input_dim_numeric, embedding_dim)
		self.fc_final = nn.Linear(embedding_dim * (seq_length + 1), output_dim)
	
	def forward(self, numeric_inputs, seq_inputs):
		numeric_embeddings = self.fc_numeric(numeric_inputs)
		seq_embeddings = self.transformer_encoder(seq_inputs)
		combined = torch.cat((seq_embeddings.flatten(1), numeric_embeddings), dim=1)
		output = self.fc_final(combined)
		return output

In [36]:
def objective(trial: optuna.Trial, X_train, X_emb_train, y_train, X_test, y_test, input_num_dim, input_emb_dim, output_dim):
	# Definimos los hiperparámetros a buscar
	num_heads = trial.suggest_int('num_heads', 1, 8)
	num_layers = trial.suggest_int('num_layers', 1, 6)
	learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True)
	weight_decay = trial.suggest_float('weight_decay', 1e-5, 1e-2, log=True)
	num_epochs = trial.suggest_int('num_epochs', 10, 100)

	# model initialization 
	model = TransformerModel(input_num_dim, input_emb_dim, 5000, num_heads, num_layers, output_dim)
	if DEVICE.type == 'cuda':
		model = model.to(DEVICE)
	criterion = nn.MSELoss()
	optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
	# training
	model.train()
	for epoch in range(num_epochs):
		optimizer.zero_grad()
		output = model(X_train)
		loss = criterion(output, y_train)
		loss.backward()
		optimizer.step()
	# evaluation
	model.eval()
	with torch.no_grad():
		predictions = model(X_test)
		val_loss = criterion(predictions, y_test)

		# trial.report(val_loss.item(), epoch+1)
		# if trial.should_prune():
		# 	raise optuna.TrialPruned()
	print(f"Trial: {trial.number} - Loss: {loss.item()} - Val Loss: {val_loss.item()}")
	return val_loss.item()

# Hyperparameters Optimization

In [37]:
n_trials = 25
study_g = None
study_st = None
study_mm = None

## General

In [38]:
# configuration optuna
study_g = optuna.create_study(direction='minimize', sampler=sampler)
study_g.optimize(lambda trial: objective(trial, X_g_train, y_g_train, X_g_test, y_g_test, len(features), 1), n_trials=n_trials)

[I 2024-07-03 07:45:51,581] A new study created in memory with name: no-name-504ff626-0f26-4741-9f15-d1fb61d4cf84
[I 2024-07-03 07:45:58,988] Trial 0 finished with value: 587.005615234375 and parameters: {'num_heads': 3, 'model_dim': 183, 'num_layers': 5, 'dropout': 0.3394633936788146, 'learning_rate': 2.9380279387035334e-05, 'weight_decay': 2.9375384576328295e-05, 'num_epochs': 15}. Best is trial 0 with value: 587.005615234375.


Trial: 0 - Loss: 573.3038940429688 - Val Loss: 587.005615234375


[I 2024-07-03 07:46:26,691] Trial 1 finished with value: 65.44300842285156 and parameters: {'num_heads': 7, 'model_dim': 280, 'num_layers': 5, 'dropout': 0.10823379771832098, 'learning_rate': 0.008123245085588688, 'weight_decay': 0.00314288089084011, 'num_epochs': 29}. Best is trial 1 with value: 65.44300842285156.


Trial: 1 - Loss: 267.2928161621094 - Val Loss: 65.44300842285156


[I 2024-07-03 07:46:33,623] Trial 2 finished with value: 616.79443359375 and parameters: {'num_heads': 2, 'model_dim': 30, 'num_layers': 2, 'dropout': 0.3099025726528951, 'learning_rate': 0.00019762189340280086, 'weight_decay': 7.476312062252303e-05, 'num_epochs': 65}. Best is trial 1 with value: 65.44300842285156.


Trial: 2 - Loss: 600.5411376953125 - Val Loss: 616.79443359375


[I 2024-07-03 07:46:42,984] Trial 3 finished with value: 332.29901123046875 and parameters: {'num_heads': 2, 'model_dim': 42, 'num_layers': 3, 'dropout': 0.28242799368681437, 'learning_rate': 0.0022673986523780395, 'weight_decay': 3.972110727381908e-05, 'num_epochs': 56}. Best is trial 1 with value: 65.44300842285156.


Trial: 3 - Loss: 396.53668212890625 - Val Loss: 332.29901123046875


[I 2024-07-03 07:47:24,540] Trial 4 finished with value: 668.1171875 and parameters: {'num_heads': 5, 'model_dim': 30, 'num_layers': 4, 'dropout': 0.16820964947491662, 'learning_rate': 1.5673095467235405e-05, 'weight_decay': 0.007025166339242158, 'num_epochs': 97}. Best is trial 1 with value: 65.44300842285156.


Trial: 4 - Loss: 637.9264526367188 - Val Loss: 668.1171875


[I 2024-07-03 07:47:33,715] Trial 5 finished with value: 400.95147705078125 and parameters: {'num_heads': 7, 'model_dim': 154, 'num_layers': 1, 'dropout': 0.3736932106048628, 'learning_rate': 0.00020914981329035596, 'weight_decay': 2.32335035153901e-05, 'num_epochs': 55}. Best is trial 1 with value: 65.44300842285156.


Trial: 5 - Loss: 444.4972839355469 - Val Loss: 400.95147705078125


[I 2024-07-03 07:47:38,489] Trial 6 finished with value: 581.8510131835938 and parameters: {'num_heads': 1, 'model_dim': 59, 'num_layers': 2, 'dropout': 0.36500891374159283, 'learning_rate': 8.612579192594876e-05, 'weight_decay': 0.00036324869566766035, 'num_epochs': 59}. Best is trial 1 with value: 65.44300842285156.


Trial: 6 - Loss: 575.3663330078125 - Val Loss: 581.8510131835938


[I 2024-07-03 07:48:10,450] Trial 7 finished with value: 80.60675811767578 and parameters: {'num_heads': 2, 'model_dim': 126, 'num_layers': 5, 'dropout': 0.4757995766256756, 'learning_rate': 0.004835952776465951, 'weight_decay': 0.0006218704727769079, 'num_epochs': 93}. Best is trial 1 with value: 65.44300842285156.


Trial: 7 - Loss: 266.5770568847656 - Val Loss: 80.60675811767578


[I 2024-07-03 07:48:13,299] Trial 8 finished with value: 668.2510986328125 and parameters: {'num_heads': 1, 'model_dim': 15, 'num_layers': 1, 'dropout': 0.23013213230530574, 'learning_rate': 0.00014656553886225324, 'weight_decay': 6.516990611177177e-05, 'num_epochs': 85}. Best is trial 1 with value: 65.44300842285156.


Trial: 8 - Loss: 640.484375 - Val Loss: 668.2510986328125


[I 2024-07-03 07:48:43,890] Trial 9 finished with value: 84.94974517822266 and parameters: {'num_heads': 3, 'model_dim': 63, 'num_layers': 4, 'dropout': 0.15636968998990508, 'learning_rate': 0.002550298070162891, 'weight_decay': 1.6736010167825783e-05, 'num_epochs': 99}. Best is trial 1 with value: 65.44300842285156.


Trial: 9 - Loss: 266.5422058105469 - Val Loss: 84.94974517822266


[I 2024-07-03 07:49:10,088] Trial 10 finished with value: 178.59274291992188 and parameters: {'num_heads': 8, 'model_dim': 424, 'num_layers': 6, 'dropout': 0.10718475024592788, 'learning_rate': 0.0009028168422346188, 'weight_decay': 0.005284384205738849, 'num_epochs': 17}. Best is trial 1 with value: 65.44300842285156.


Trial: 10 - Loss: 307.724609375 - Val Loss: 178.59274291992188


[I 2024-07-03 07:49:49,292] Trial 11 finished with value: 80.0769271850586 and parameters: {'num_heads': 6, 'model_dim': 294, 'num_layers': 6, 'dropout': 0.4969555436367596, 'learning_rate': 0.006877477209765817, 'weight_decay': 0.001134736909845589, 'num_epochs': 35}. Best is trial 1 with value: 65.44300842285156.


Trial: 11 - Loss: 267.1382751464844 - Val Loss: 80.0769271850586


[I 2024-07-03 07:50:27,889] Trial 12 finished with value: 78.46221923828125 and parameters: {'num_heads': 6, 'model_dim': 294, 'num_layers': 6, 'dropout': 0.49925267200585316, 'learning_rate': 0.007097392833743549, 'weight_decay': 0.0017343770121671484, 'num_epochs': 35}. Best is trial 1 with value: 65.44300842285156.


Trial: 12 - Loss: 266.8121337890625 - Val Loss: 78.46221923828125


[I 2024-07-03 07:50:54,009] Trial 13 finished with value: 195.78785705566406 and parameters: {'num_heads': 6, 'model_dim': 270, 'num_layers': 5, 'dropout': 0.423556575950062, 'learning_rate': 0.00090313699739029, 'weight_decay': 0.002069134639321997, 'num_epochs': 30}. Best is trial 1 with value: 65.44300842285156.


Trial: 13 - Loss: 313.4983825683594 - Val Loss: 195.78785705566406


[I 2024-07-03 07:51:46,169] Trial 14 finished with value: 119.49128723144531 and parameters: {'num_heads': 8, 'model_dim': 352, 'num_layers': 6, 'dropout': 0.21199639010812193, 'learning_rate': 0.0008129948297847969, 'weight_decay': 0.002229023133098905, 'num_epochs': 38}. Best is trial 1 with value: 65.44300842285156.


Trial: 14 - Loss: 275.8993835449219 - Val Loss: 119.49128723144531


[I 2024-07-03 07:52:18,164] Trial 15 finished with value: 70.45816802978516 and parameters: {'num_heads': 5, 'model_dim': 225, 'num_layers': 5, 'dropout': 0.4232264137788658, 'learning_rate': 0.009728613904848124, 'weight_decay': 0.00017387706430988122, 'num_epochs': 44}. Best is trial 1 with value: 65.44300842285156.


Trial: 15 - Loss: 267.1008605957031 - Val Loss: 70.45816802978516


[I 2024-07-03 07:52:40,068] Trial 16 finished with value: 75.3851547241211 and parameters: {'num_heads': 4, 'model_dim': 208, 'num_layers': 4, 'dropout': 0.4172068328297226, 'learning_rate': 0.009813558212616596, 'weight_decay': 0.00017607569497085006, 'num_epochs': 45}. Best is trial 1 with value: 65.44300842285156.


Trial: 16 - Loss: 266.7921142578125 - Val Loss: 75.3851547241211


[I 2024-07-03 07:52:56,874] Trial 17 finished with value: 100.46436309814453 and parameters: {'num_heads': 5, 'model_dim': 235, 'num_layers': 5, 'dropout': 0.27109579386411675, 'learning_rate': 0.002648001847191296, 'weight_decay': 0.00021351181070935403, 'num_epochs': 23}. Best is trial 1 with value: 65.44300842285156.


Trial: 17 - Loss: 271.3751525878906 - Val Loss: 100.46436309814453


[I 2024-07-03 07:53:41,604] Trial 18 finished with value: 94.75528717041016 and parameters: {'num_heads': 7, 'model_dim': 350, 'num_layers': 3, 'dropout': 0.1092953650710502, 'learning_rate': 0.0005743793329759141, 'weight_decay': 0.00011599023206535064, 'num_epochs': 70}. Best is trial 1 with value: 65.44300842285156.


Trial: 18 - Loss: 268.06646728515625 - Val Loss: 94.75528717041016


[I 2024-07-03 07:54:28,179] Trial 19 finished with value: 78.8536148071289 and parameters: {'num_heads': 7, 'model_dim': 336, 'num_layers': 5, 'dropout': 0.4240924757804225, 'learning_rate': 0.0018786921580503184, 'weight_decay': 0.0004997101917350775, 'num_epochs': 46}. Best is trial 1 with value: 65.44300842285156.


Trial: 19 - Loss: 266.6694030761719 - Val Loss: 78.8536148071289


[I 2024-07-03 07:54:39,860] Trial 20 finished with value: 69.58648681640625 and parameters: {'num_heads': 4, 'model_dim': 156, 'num_layers': 4, 'dropout': 0.23297963437340713, 'learning_rate': 0.004475232967246422, 'weight_decay': 1.0592310093219841e-05, 'num_epochs': 26}. Best is trial 1 with value: 65.44300842285156.


Trial: 20 - Loss: 267.0910339355469 - Val Loss: 69.58648681640625


[I 2024-07-03 07:54:49,298] Trial 21 finished with value: 162.96531677246094 and parameters: {'num_heads': 4, 'model_dim': 108, 'num_layers': 4, 'dropout': 0.2221985744009721, 'learning_rate': 0.004305334729458304, 'weight_decay': 1.0883479244730683e-05, 'num_epochs': 23}. Best is trial 1 with value: 65.44300842285156.


Trial: 21 - Loss: 299.6440124511719 - Val Loss: 162.96531677246094


[I 2024-07-03 07:55:08,853] Trial 22 finished with value: 73.73278045654297 and parameters: {'num_heads': 5, 'model_dim': 235, 'num_layers': 3, 'dropout': 0.15142331831653255, 'learning_rate': 0.00996048262953731, 'weight_decay': 0.004024213491547553, 'num_epochs': 45}. Best is trial 1 with value: 65.44300842285156.


Trial: 22 - Loss: 266.6722106933594 - Val Loss: 73.73278045654297


[I 2024-07-03 07:55:15,160] Trial 23 finished with value: 203.9698028564453 and parameters: {'num_heads': 4, 'model_dim': 172, 'num_layers': 5, 'dropout': 0.26166750169446507, 'learning_rate': 0.004567652233675725, 'weight_decay': 0.0007663713378426931, 'num_epochs': 11}. Best is trial 1 with value: 65.44300842285156.


Trial: 23 - Loss: 331.7026062011719 - Val Loss: 203.9698028564453


[I 2024-07-03 07:55:24,779] Trial 24 finished with value: 308.0043029785156 and parameters: {'num_heads': 3, 'model_dim': 135, 'num_layers': 4, 'dropout': 0.1818506671679665, 'learning_rate': 0.0016176302658366711, 'weight_decay': 1.0086280568678738e-05, 'num_epochs': 26}. Best is trial 1 with value: 65.44300842285156.


Trial: 24 - Loss: 383.8891906738281 - Val Loss: 308.0043029785156


In [39]:
# Results
print(f'Número de pruebas: {len(study_g.trials)}')
trial = study_g.best_trial
print(f'Mejor prueba: {trial.number}')
print(f'Mejores parametros: {trial.params}')
print(f'Mejor valor de pérdida de validación: {trial.value}')

Número de pruebas: 25
Mejor prueba: 1
Mejores parametros: {'num_heads': 7, 'model_dim': 280, 'num_layers': 5, 'dropout': 0.10823379771832098, 'learning_rate': 0.008123245085588688, 'weight_decay': 0.00314288089084011, 'num_epochs': 29}
Mejor valor de pérdida de validación: 65.44300842285156


## Single Thread

In [40]:
# configuration optuna
study_st = optuna.create_study(direction='minimize', sampler=sampler)
study_st.optimize(lambda trial: objective(trial, X_st_train, y_st_train, X_st_test, y_st_test, len(features_st), 1), n_trials=n_trials)

[I 2024-07-03 07:55:24,799] A new study created in memory with name: no-name-dc29c454-4add-4623-b461-8651c59556d0
[I 2024-07-03 07:55:34,940] Trial 0 finished with value: 62.06943893432617 and parameters: {'num_heads': 7, 'model_dim': 112, 'num_layers': 1, 'dropout': 0.42618457138193366, 'learning_rate': 0.001319994226153501, 'weight_decay': 0.0015382308040279, 'num_epochs': 80}. Best is trial 0 with value: 62.06943893432617.


Trial: 0 - Loss: 307.4320068359375 - Val Loss: 62.06943893432617


[I 2024-07-03 07:55:35,386] Trial 1 finished with value: 507.5620422363281 and parameters: {'num_heads': 1, 'model_dim': 25, 'num_layers': 1, 'dropout': 0.4452413703502375, 'learning_rate': 0.0007411299781083245, 'weight_decay': 9.833181933644887e-05, 'num_epochs': 15}. Best is trial 0 with value: 62.06943893432617.


Trial: 1 - Loss: 744.2530517578125 - Val Loss: 507.5620422363281


[I 2024-07-03 07:55:41,791] Trial 2 finished with value: 220.41946411132812 and parameters: {'num_heads': 3, 'model_dim': 69, 'num_layers': 5, 'dropout': 0.35502298854208525, 'learning_rate': 0.0045881565491609705, 'weight_decay': 0.00026100256506134784, 'num_epochs': 20}. Best is trial 0 with value: 62.06943893432617.


Trial: 2 - Loss: 473.9880676269531 - Val Loss: 220.41946411132812


[I 2024-07-03 07:56:09,826] Trial 3 finished with value: 145.89273071289062 and parameters: {'num_heads': 6, 'model_dim': 300, 'num_layers': 4, 'dropout': 0.40838687198182444, 'learning_rate': 0.00030296104428212476, 'weight_decay': 0.0003699972431463808, 'num_epochs': 48}. Best is trial 0 with value: 62.06943893432617.


Trial: 3 - Loss: 389.7847900390625 - Val Loss: 145.89273071289062


[I 2024-07-03 07:56:12,314] Trial 4 finished with value: 542.7485961914062 and parameters: {'num_heads': 1, 'model_dim': 10, 'num_layers': 1, 'dropout': 0.3545641645055122, 'learning_rate': 8.771380343280557e-05, 'weight_decay': 0.0003355151022721483, 'num_epochs': 92}. Best is trial 0 with value: 62.06943893432617.


Trial: 4 - Loss: 775.09228515625 - Val Loss: 542.7485961914062


[I 2024-07-03 07:56:17,938] Trial 5 finished with value: 556.658447265625 and parameters: {'num_heads': 2, 'model_dim': 58, 'num_layers': 5, 'dropout': 0.191519266196649, 'learning_rate': 1.7019223026554023e-05, 'weight_decay': 7.40038575908737e-05, 'num_epochs': 24}. Best is trial 0 with value: 62.06943893432617.


Trial: 5 - Loss: 775.6277465820312 - Val Loss: 556.658447265625


[I 2024-07-03 07:57:30,330] Trial 6 finished with value: 37.884891510009766 and parameters: {'num_heads': 8, 'model_dim': 424, 'num_layers': 4, 'dropout': 0.44858423607508713, 'learning_rate': 0.0025764174425233167, 'weight_decay': 3.6283583803549155e-05, 'num_epochs': 91}. Best is trial 6 with value: 37.884891510009766.


Trial: 6 - Loss: 281.2131042480469 - Val Loss: 37.884891510009766


[I 2024-07-03 07:58:06,843] Trial 7 finished with value: 283.6199035644531 and parameters: {'num_heads': 5, 'model_dim': 265, 'num_layers': 6, 'dropout': 0.22720138998874556, 'learning_rate': 2.1387290754148914e-05, 'weight_decay': 4.8284249748183215e-05, 'num_epochs': 48}. Best is trial 6 with value: 37.884891510009766.


Trial: 7 - Loss: 524.4100952148438 - Val Loss: 283.6199035644531


[I 2024-07-03 07:58:10,406] Trial 8 finished with value: 213.32315063476562 and parameters: {'num_heads': 7, 'model_dim': 392, 'num_layers': 1, 'dropout': 0.3042989210310263, 'learning_rate': 0.000178744632562384, 'weight_decay': 4.6379219034580266e-05, 'num_epochs': 20}. Best is trial 6 with value: 37.884891510009766.


Trial: 8 - Loss: 443.738037109375 - Val Loss: 213.32315063476562


[I 2024-07-03 07:58:25,810] Trial 9 finished with value: 135.2371826171875 and parameters: {'num_heads': 3, 'model_dim': 183, 'num_layers': 2, 'dropout': 0.30751624869734645, 'learning_rate': 0.0012854549964879019, 'weight_decay': 0.00012327891605450807, 'num_epochs': 98}. Best is trial 6 with value: 37.884891510009766.


Trial: 9 - Loss: 214.21823120117188 - Val Loss: 135.2371826171875


[I 2024-07-03 07:59:10,231] Trial 10 finished with value: 38.05074691772461 and parameters: {'num_heads': 8, 'model_dim': 496, 'num_layers': 3, 'dropout': 0.1246026171282115, 'learning_rate': 0.008106149171392763, 'weight_decay': 1.5025399484753894e-05, 'num_epochs': 69}. Best is trial 6 with value: 37.884891510009766.


Trial: 10 - Loss: 281.2942810058594 - Val Loss: 38.05074691772461


[I 2024-07-03 07:59:55,966] Trial 11 finished with value: 37.85926055908203 and parameters: {'num_heads': 8, 'model_dim': 512, 'num_layers': 3, 'dropout': 0.10589171034877662, 'learning_rate': 0.008149658576652717, 'weight_decay': 1.219583825009273e-05, 'num_epochs': 71}. Best is trial 11 with value: 37.85926055908203.


Trial: 11 - Loss: 281.20562744140625 - Val Loss: 37.85926055908203


[I 2024-07-03 08:00:39,940] Trial 12 finished with value: 37.77417755126953 and parameters: {'num_heads': 8, 'model_dim': 512, 'num_layers': 3, 'dropout': 0.49215709557858045, 'learning_rate': 0.0028372961333157348, 'weight_decay': 1.0411236616031968e-05, 'num_epochs': 68}. Best is trial 12 with value: 37.77417755126953.


Trial: 12 - Loss: 281.2080078125 - Val Loss: 37.77417755126953


[I 2024-07-03 08:01:11,177] Trial 13 finished with value: 38.269779205322266 and parameters: {'num_heads': 6, 'model_dim': 384, 'num_layers': 3, 'dropout': 0.49664022433141086, 'learning_rate': 0.009186367110957461, 'weight_decay': 1.0863509237016974e-05, 'num_epochs': 66}. Best is trial 12 with value: 37.77417755126953.


Trial: 13 - Loss: 281.20819091796875 - Val Loss: 38.269779205322266


[I 2024-07-03 08:01:49,971] Trial 14 finished with value: 37.89213180541992 and parameters: {'num_heads': 8, 'model_dim': 512, 'num_layers': 3, 'dropout': 0.10368763693190514, 'learning_rate': 0.0031940106049038897, 'weight_decay': 0.004586798164817242, 'num_epochs': 61}. Best is trial 12 with value: 37.77417755126953.


Trial: 14 - Loss: 281.1505126953125 - Val Loss: 37.89213180541992


[I 2024-07-03 08:02:06,212] Trial 15 finished with value: 82.97280883789062 and parameters: {'num_heads': 5, 'model_dim': 190, 'num_layers': 2, 'dropout': 0.22973592976265728, 'learning_rate': 0.000676201440378111, 'weight_decay': 1.979748529126409e-05, 'num_epochs': 75}. Best is trial 12 with value: 37.77417755126953.


Trial: 15 - Loss: 327.6230163574219 - Val Loss: 82.97280883789062


[I 2024-07-03 08:02:18,656] Trial 16 finished with value: 38.07381057739258 and parameters: {'num_heads': 7, 'model_dim': 336, 'num_layers': 2, 'dropout': 0.16322619687285683, 'learning_rate': 0.004210179973374917, 'weight_decay': 2.432317943590038e-05, 'num_epochs': 38}. Best is trial 12 with value: 37.77417755126953.


Trial: 16 - Loss: 281.3700866699219 - Val Loss: 38.07381057739258


[I 2024-07-03 08:03:01,583] Trial 17 finished with value: 37.23849105834961 and parameters: {'num_heads': 6, 'model_dim': 252, 'num_layers': 4, 'dropout': 0.2504274959984424, 'learning_rate': 0.0016940390315482881, 'weight_decay': 0.001067356851619411, 'num_epochs': 81}. Best is trial 17 with value: 37.23849105834961.


Trial: 17 - Loss: 281.1636047363281 - Val Loss: 37.23849105834961


[I 2024-07-03 08:03:56,976] Trial 18 finished with value: 37.26271438598633 and parameters: {'num_heads': 6, 'model_dim': 240, 'num_layers': 5, 'dropout': 0.2657724769815869, 'learning_rate': 0.0017536399739157401, 'weight_decay': 0.0011021199598813852, 'num_epochs': 87}. Best is trial 17 with value: 37.23849105834961.


Trial: 18 - Loss: 281.17974853515625 - Val Loss: 37.26271438598633


[I 2024-07-03 08:04:42,014] Trial 19 finished with value: 284.8900146484375 and parameters: {'num_heads': 4, 'model_dim': 184, 'num_layers': 6, 'dropout': 0.266075759280839, 'learning_rate': 5.985229427655942e-05, 'weight_decay': 0.0014194080627116961, 'num_epochs': 83}. Best is trial 17 with value: 37.23849105834961.


Trial: 19 - Loss: 524.3458862304688 - Val Loss: 284.8900146484375


[I 2024-07-03 08:05:37,810] Trial 20 finished with value: 64.01228332519531 and parameters: {'num_heads': 6, 'model_dim': 240, 'num_layers': 5, 'dropout': 0.25891169456684265, 'learning_rate': 0.0005319533783152769, 'weight_decay': 0.0011210735795237944, 'num_epochs': 88}. Best is trial 17 with value: 37.23849105834961.


Trial: 20 - Loss: 307.80596923828125 - Val Loss: 64.01228332519531


[I 2024-07-03 08:06:09,473] Trial 21 finished with value: 38.89128494262695 and parameters: {'num_heads': 5, 'model_dim': 135, 'num_layers': 4, 'dropout': 0.3426511953937996, 'learning_rate': 0.0016169762892262577, 'weight_decay': 0.00392312460813385, 'num_epochs': 78}. Best is trial 17 with value: 37.23849105834961.


Trial: 21 - Loss: 282.9061279296875 - Val Loss: 38.89128494262695


[I 2024-07-03 08:07:12,749] Trial 22 finished with value: 37.341793060302734 and parameters: {'num_heads': 6, 'model_dim': 228, 'num_layers': 5, 'dropout': 0.2653976842275822, 'learning_rate': 0.0020999084932467383, 'weight_decay': 0.00988213445390245, 'num_epochs': 100}. Best is trial 17 with value: 37.23849105834961.


Trial: 22 - Loss: 281.1385192871094 - Val Loss: 37.341793060302734


[I 2024-07-03 08:08:15,549] Trial 23 finished with value: 105.7842025756836 and parameters: {'num_heads': 6, 'model_dim': 234, 'num_layers': 5, 'dropout': 0.2691750909442026, 'learning_rate': 0.0003436782144940018, 'weight_decay': 0.007187818462626211, 'num_epochs': 98}. Best is trial 17 with value: 37.23849105834961.


Trial: 23 - Loss: 348.4277648925781 - Val Loss: 105.7842025756836


[I 2024-07-03 08:09:12,858] Trial 24 finished with value: 37.23106002807617 and parameters: {'num_heads': 4, 'model_dim': 212, 'num_layers': 6, 'dropout': 0.19657967657361075, 'learning_rate': 0.0015831434455046685, 'weight_decay': 0.0007568934817355099, 'num_epochs': 100}. Best is trial 24 with value: 37.23106002807617.


Trial: 24 - Loss: 281.19830322265625 - Val Loss: 37.23106002807617


In [41]:
# Results
print(f'Número de pruebas: {len(study_st.trials)}')
trial = study_st.best_trial
print(f'Mejor prueba: {trial.number}')
print(f'Mejores parametros: {trial.params}')
print(f'Mejor valor de pérdida en validación: {trial.value}')

Número de pruebas: 25
Mejor prueba: 24
Mejores parametros: {'num_heads': 4, 'model_dim': 212, 'num_layers': 6, 'dropout': 0.19657967657361075, 'learning_rate': 0.0015831434455046685, 'weight_decay': 0.0007568934817355099, 'num_epochs': 100}
Mejor valor de pérdida en validación: 37.23106002807617


## Multi Thread

In [42]:
# configuration optuna
study_mm = optuna.create_study(direction='minimize', sampler=sampler)
study_mm.optimize(lambda trial: objective(trial, X_mm_train, y_mm_train, X_mm_test, y_mm_test, len(features), 1), n_trials=n_trials)

[I 2024-07-03 08:09:12,875] A new study created in memory with name: no-name-6519a2ac-7050-4a61-bc66-b9c1f9dc1c17
[I 2024-07-03 08:09:20,085] Trial 0 finished with value: 828.8079223632812 and parameters: {'num_heads': 8, 'model_dim': 152, 'num_layers': 3, 'dropout': 0.22035132392670786, 'learning_rate': 7.153547794693153e-05, 'weight_decay': 1.2902113024567147e-05, 'num_epochs': 65}. Best is trial 0 with value: 828.8079223632812.


Trial: 0 - Loss: 133.3551025390625 - Val Loss: 828.8079223632812


[I 2024-07-03 08:09:22,447] Trial 1 finished with value: 1094.959716796875 and parameters: {'num_heads': 5, 'model_dim': 35, 'num_layers': 2, 'dropout': 0.46330635438666146, 'learning_rate': 5.232216089948759e-05, 'weight_decay': 2.7207248059486674e-05, 'num_epochs': 54}. Best is trial 0 with value: 828.8079223632812.


Trial: 1 - Loss: 200.1089630126953 - Val Loss: 1094.959716796875


[I 2024-07-03 08:09:30,323] Trial 2 finished with value: 882.26953125 and parameters: {'num_heads': 8, 'model_dim': 144, 'num_layers': 5, 'dropout': 0.4046478461314871, 'learning_rate': 5.163124910488014e-05, 'weight_decay': 0.0015298506868937445, 'num_epochs': 43}. Best is trial 0 with value: 828.8079223632812.


Trial: 2 - Loss: 147.49929809570312 - Val Loss: 882.26953125


[I 2024-07-03 08:09:33,821] Trial 3 finished with value: 507.0150146484375 and parameters: {'num_heads': 6, 'model_dim': 252, 'num_layers': 4, 'dropout': 0.13611590802176332, 'learning_rate': 0.0032055863990707507, 'weight_decay': 9.169384722192322e-05, 'num_epochs': 26}. Best is trial 3 with value: 507.0150146484375.


Trial: 3 - Loss: 107.1495361328125 - Val Loss: 507.0150146484375


[I 2024-07-03 08:09:36,356] Trial 4 finished with value: 1034.8868408203125 and parameters: {'num_heads': 1, 'model_dim': 40, 'num_layers': 5, 'dropout': 0.10663513157114246, 'learning_rate': 0.0003437788661779579, 'weight_decay': 4.7806541413289224e-05, 'num_epochs': 68}. Best is trial 3 with value: 507.0150146484375.


Trial: 4 - Loss: 184.2609100341797 - Val Loss: 1034.8868408203125


[I 2024-07-03 08:09:37,135] Trial 5 finished with value: 1130.845458984375 and parameters: {'num_heads': 2, 'model_dim': 92, 'num_layers': 3, 'dropout': 0.4746919954946939, 'learning_rate': 2.5856088907313374e-05, 'weight_decay': 0.0001054870271491805, 'num_epochs': 20}. Best is trial 3 with value: 507.0150146484375.


Trial: 5 - Loss: 222.30142211914062 - Val Loss: 1130.845458984375


[I 2024-07-03 08:09:43,092] Trial 6 finished with value: 845.4793090820312 and parameters: {'num_heads': 8, 'model_dim': 456, 'num_layers': 2, 'dropout': 0.3639936184136716, 'learning_rate': 0.0028292192255361887, 'weight_decay': 0.00046302286171220994, 'num_epochs': 58}. Best is trial 3 with value: 507.0150146484375.


Trial: 6 - Loss: 38.88782501220703 - Val Loss: 845.4793090820312


[I 2024-07-03 08:09:45,732] Trial 7 finished with value: 1148.4400634765625 and parameters: {'num_heads': 2, 'model_dim': 18, 'num_layers': 6, 'dropout': 0.4601672228653322, 'learning_rate': 0.0007930569433855138, 'weight_decay': 0.00010401341922663452, 'num_epochs': 41}. Best is trial 3 with value: 507.0150146484375.


Trial: 7 - Loss: 216.91741943359375 - Val Loss: 1148.4400634765625


[I 2024-07-03 08:09:51,297] Trial 8 finished with value: 437.9061584472656 and parameters: {'num_heads': 6, 'model_dim': 348, 'num_layers': 6, 'dropout': 0.41195021834304957, 'learning_rate': 0.0008435191341743053, 'weight_decay': 1.7882156647879485e-05, 'num_epochs': 24}. Best is trial 8 with value: 437.9061584472656.


Trial: 8 - Loss: 63.53544616699219 - Val Loss: 437.9061584472656


[I 2024-07-03 08:09:52,363] Trial 9 finished with value: 389.2778625488281 and parameters: {'num_heads': 8, 'model_dim': 320, 'num_layers': 1, 'dropout': 0.14058861714641285, 'learning_rate': 0.0009783749110062348, 'weight_decay': 1.0355826161899173e-05, 'num_epochs': 24}. Best is trial 9 with value: 389.2778625488281.


Trial: 9 - Loss: 59.555973052978516 - Val Loss: 389.2778625488281


[I 2024-07-03 08:09:54,819] Trial 10 finished with value: 646.1348876953125 and parameters: {'num_heads': 4, 'model_dim': 208, 'num_layers': 1, 'dropout': 0.22168965070955987, 'learning_rate': 0.009837604519689649, 'weight_decay': 0.004983077649837294, 'num_epochs': 98}. Best is trial 9 with value: 389.2778625488281.


Trial: 10 - Loss: 8.70503044128418 - Val Loss: 646.1348876953125


[I 2024-07-03 08:09:57,194] Trial 11 finished with value: 652.6309204101562 and parameters: {'num_heads': 6, 'model_dim': 342, 'num_layers': 6, 'dropout': 0.28998570239311117, 'learning_rate': 0.0006548520554345827, 'weight_decay': 1.169646556655389e-05, 'num_epochs': 10}. Best is trial 9 with value: 389.2778625488281.


Trial: 11 - Loss: 95.51595306396484 - Val Loss: 652.6309204101562


[I 2024-07-03 08:09:58,401] Trial 12 finished with value: 851.2083740234375 and parameters: {'num_heads': 6, 'model_dim': 318, 'num_layers': 1, 'dropout': 0.33741357173803993, 'learning_rate': 0.001516980761077545, 'weight_decay': 1.0186004972082117e-05, 'num_epochs': 31}. Best is trial 9 with value: 389.2778625488281.


Trial: 12 - Loss: 39.17889404296875 - Val Loss: 851.2083740234375


[I 2024-07-03 08:10:01,122] Trial 13 finished with value: 582.8424682617188 and parameters: {'num_heads': 7, 'model_dim': 392, 'num_layers': 4, 'dropout': 0.2530798290377343, 'learning_rate': 0.00023456623532978524, 'weight_decay': 0.00037585640087546524, 'num_epochs': 15}. Best is trial 9 with value: 389.2778625488281.


Trial: 13 - Loss: 83.96907806396484 - Val Loss: 582.8424682617188


[I 2024-07-03 08:10:02,727] Trial 14 finished with value: 801.2111206054688 and parameters: {'num_heads': 4, 'model_dim': 192, 'num_layers': 2, 'dropout': 0.16993001335286187, 'learning_rate': 0.00017900604001545273, 'weight_decay': 3.207704775634026e-05, 'num_epochs': 33}. Best is trial 9 with value: 389.2778625488281.


Trial: 14 - Loss: 128.22059631347656 - Val Loss: 801.2111206054688


[I 2024-07-03 08:10:11,995] Trial 15 finished with value: 920.3092651367188 and parameters: {'num_heads': 7, 'model_dim': 322, 'num_layers': 5, 'dropout': 0.39273421642847345, 'learning_rate': 1.02850805484477e-05, 'weight_decay': 2.8257776305044408e-05, 'num_epochs': 44}. Best is trial 9 with value: 389.2778625488281.


Trial: 15 - Loss: 151.9146270751953 - Val Loss: 920.3092651367188


[I 2024-07-03 08:10:14,631] Trial 16 finished with value: 836.8941650390625 and parameters: {'num_heads': 5, 'model_dim': 250, 'num_layers': 1, 'dropout': 0.30560991258328973, 'learning_rate': 0.00799960477003386, 'weight_decay': 0.0001939932442761239, 'num_epochs': 84}. Best is trial 9 with value: 389.2778625488281.


Trial: 16 - Loss: 6.752071380615234 - Val Loss: 836.8941650390625


[I 2024-07-03 08:10:18,813] Trial 17 finished with value: 1070.9525146484375 and parameters: {'num_heads': 7, 'model_dim': 392, 'num_layers': 4, 'dropout': 0.42513148116607025, 'learning_rate': 0.0008872822595648719, 'weight_decay': 0.0009291037977312205, 'num_epochs': 23}. Best is trial 9 with value: 389.2778625488281.


Trial: 17 - Loss: 48.058048248291016 - Val Loss: 1070.9525146484375


[I 2024-07-03 08:10:27,217] Trial 18 finished with value: 521.8192749023438 and parameters: {'num_heads': 7, 'model_dim': 294, 'num_layers': 6, 'dropout': 0.17461935994011935, 'learning_rate': 0.00365961960817426, 'weight_decay': 0.00949143808478466, 'num_epochs': 34}. Best is trial 9 with value: 389.2778625488281.


Trial: 18 - Loss: 107.15029907226562 - Val Loss: 521.8192749023438


[I 2024-07-03 08:10:29,771] Trial 19 finished with value: 884.5906982421875 and parameters: {'num_heads': 3, 'model_dim': 78, 'num_layers': 3, 'dropout': 0.3384610916919939, 'learning_rate': 0.0003997371275727628, 'weight_decay': 6.0409881842626066e-05, 'num_epochs': 51}. Best is trial 9 with value: 389.2778625488281.


Trial: 19 - Loss: 148.8589324951172 - Val Loss: 884.5906982421875


[I 2024-07-03 08:10:30,607] Trial 20 finished with value: 520.04052734375 and parameters: {'num_heads': 6, 'model_dim': 270, 'num_layers': 2, 'dropout': 0.4994938611591352, 'learning_rate': 0.0019713042142075524, 'weight_decay': 2.0297148749567726e-05, 'num_epochs': 11}. Best is trial 9 with value: 389.2778625488281.


Trial: 20 - Loss: 77.83181762695312 - Val Loss: 520.04052734375


[I 2024-07-03 08:10:33,982] Trial 21 finished with value: 503.9742126464844 and parameters: {'num_heads': 6, 'model_dim': 240, 'num_layers': 4, 'dropout': 0.1113603668275614, 'learning_rate': 0.004492101668109098, 'weight_decay': 7.091977889357501e-05, 'num_epochs': 25}. Best is trial 9 with value: 389.2778625488281.


Trial: 21 - Loss: 107.44219970703125 - Val Loss: 503.9742126464844


[I 2024-07-03 08:10:37,338] Trial 22 finished with value: 506.086669921875 and parameters: {'num_heads': 5, 'model_dim': 220, 'num_layers': 5, 'dropout': 0.10108751672506605, 'learning_rate': 0.005395169057039046, 'weight_decay': 1.901568377658239e-05, 'num_epochs': 23}. Best is trial 9 with value: 389.2778625488281.


Trial: 22 - Loss: 107.12117004394531 - Val Loss: 506.086669921875


[I 2024-07-03 08:10:40,470] Trial 23 finished with value: 616.2301635742188 and parameters: {'num_heads': 7, 'model_dim': 357, 'num_layers': 4, 'dropout': 0.1506422952327002, 'learning_rate': 0.0016771596843081652, 'weight_decay': 5.3371988517039574e-05, 'num_epochs': 18}. Best is trial 9 with value: 389.2778625488281.


Trial: 23 - Loss: 59.77368927001953 - Val Loss: 616.2301635742188


[I 2024-07-03 08:10:51,772] Trial 24 finished with value: 528.7885131835938 and parameters: {'num_heads': 8, 'model_dim': 472, 'num_layers': 6, 'dropout': 0.19178868835085539, 'learning_rate': 0.0010973253638146487, 'weight_decay': 0.00018781161475919233, 'num_epochs': 36}. Best is trial 9 with value: 389.2778625488281.


Trial: 24 - Loss: 107.22895050048828 - Val Loss: 528.7885131835938


In [43]:
# Results
print(f'Trials quantity: {len(study_mm.trials)}')
trial = study_mm.best_trial
print(f'Mejor prueba: {trial.number}')
print(f'Mejores parametros: {trial.params}')
print(f'Mejor valor de pérdida en validación: {trial.value}')

Trials quantity: 25
Mejor prueba: 9
Mejores parametros: {'num_heads': 8, 'model_dim': 320, 'num_layers': 1, 'dropout': 0.14058861714641285, 'learning_rate': 0.0009783749110062348, 'weight_decay': 1.0355826161899173e-05, 'num_epochs': 24}
Mejor valor de pérdida en validación: 389.2778625488281


# Training

In [44]:
models_folder = '../models/transformer'
output_dim = 1

In [45]:
#dump(scaler_g, f'{models_folder}/scaler_g.joblib')
#dump(scaler_st, f'{models_folder}/scaler_st.joblib')
#dump(scaler_mm, f'{models_folder}/scaler_mm.joblib')

## General

In [46]:
input_dim = len(features)
# hyperparameters
if study_g is not None:
	num_heads = study_g.best_trial.params['num_heads']
	model_dim = study_g.best_trial.params['model_dim']
	num_layers = study_g.best_trial.params['num_layers']
	dropout = study_g.best_trial.params['dropout']
	lr = study_g.best_trial.params['learning_rate']
	wd = study_g.best_trial.params['weight_decay']
	num_epochs = study_g.best_trial.params['num_epochs']
else:
	num_heads = 6
	model_dim = 294
	num_layers = 6
	dropout = 0.49925267200585316
	lr = 0.007097392833743549
	wd = 0.0017343770121671484
	num_epochs = 35

In [47]:
# general model initialization
model_g = TransformerModel(input_dim, model_dim, num_heads, num_layers, output_dim, dropout)
if DEVICE.type == 'cuda':
	model_g = model_g.to(DEVICE)
criterion_g = nn.MSELoss()
optimizer_g = optim.AdamW(model_g.parameters(), lr=lr, weight_decay=wd)

model_g.train()

for epoch in range(num_epochs):
	optimizer_g.zero_grad()
	output = model_g(X_g_train)
	loss = criterion_g(output, y_g_train)
	loss.backward()
	optimizer_g.step()
	# validation
	if (epoch+1) % 10 == 0 or epoch == num_epochs-1:
		model_g.eval()
		with torch.no_grad():
			val_predictions = model_g(X_g_test)
			val_loss = criterion_g(val_predictions, y_g_test)
		print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Val Loss: {val_loss.item()}')
		model_g.train()

Epoch 10/29, Loss: 270.0231628417969, Val Loss: 56.22865676879883
Epoch 20/29, Loss: 267.6117858886719, Val Loss: 93.45111846923828
Epoch 29/29, Loss: 267.2730407714844, Val Loss: 66.2825698852539


In [49]:
model_g.eval()
with torch.no_grad():
	preds = model_g(X_g_test).cpu().numpy().flatten()
mse = mean_squared_error(y_g_test.cpu().numpy().flatten(), preds)
print(f"MSE: {mse} - RMSE: {np.sqrt(mse)} - MAE: {mean_absolute_error(y_g_test.cpu().numpy().flatten(), preds)}")

MSE: 66.28256225585938 - RMSE: 8.141409873962402 - MAE: 5.510138511657715


In [50]:
# save model
torch.save(model_g, f'{models_folder}/general.pt')

## Single Thread

In [51]:
input_dim = len(features_st)
# hyperparameters
if study_st is not None:
	num_heads = study_st.best_trial.params['num_heads']
	model_dim = study_st.best_trial.params['model_dim']
	num_layers = study_st.best_trial.params['num_layers']
	dropout = study_st.best_trial.params['dropout']
	lr = study_st.best_trial.params['learning_rate']
	wd = study_st.best_trial.params['weight_decay']
	num_epochs = study_st.best_trial.params['num_epochs']
else:
	num_heads = 6
	model_dim = 192
	num_layers = 4
	dropout = 0.3731512093597947
	lr = 0.0027591245533166004
	wd = 0.0014100590768903643
	num_epochs = 78

In [52]:
# single thread model initialization
model_st = TransformerModel(input_dim, model_dim, num_heads, num_layers, output_dim, dropout)
if DEVICE.type == 'cuda':
	model_st = model_st.to(DEVICE)
criterion_st = nn.MSELoss()
optimizer_st = optim.AdamW(model_st.parameters(), lr=lr, weight_decay=wd)

model_st.train()

for epoch in range(num_epochs):
	optimizer_st.zero_grad()
	output = model_st(X_st_train)
	loss = criterion_st(output, y_st_train)
	loss.backward()
	optimizer_st.step()
	# validation
	if (epoch+1) % 10 == 0 or epoch == num_epochs-1:
		model_st.eval()
		with torch.no_grad():
			val_predictions = model_st(X_st_test)
			val_loss = criterion_st(val_predictions, y_st_test)
		print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Val Loss: {val_loss.item()}')
		model_st.train()

Epoch 10/100, Loss: 468.1701965332031, Val Loss: 215.08013916015625
Epoch 20/100, Loss: 387.3863220214844, Val Loss: 137.333984375
Epoch 30/100, Loss: 329.1274719238281, Val Loss: 81.00090026855469
Epoch 40/100, Loss: 295.8929138183594, Val Loss: 49.859031677246094
Epoch 50/100, Loss: 283.1757507324219, Val Loss: 38.61391067504883
Epoch 60/100, Loss: 281.1542053222656, Val Loss: 37.20664596557617
Epoch 70/100, Loss: 281.47821044921875, Val Loss: 37.50769805908203
Epoch 80/100, Loss: 281.2610778808594, Val Loss: 37.17526626586914
Epoch 90/100, Loss: 281.125244140625, Val Loss: 37.208595275878906
Epoch 100/100, Loss: 281.1525573730469, Val Loss: 37.19477844238281


In [53]:
model_st.eval()
with torch.no_grad():
	preds = model_st(X_st_test).cpu().numpy().flatten()
mse = mean_squared_error(y_st_test.cpu().numpy().flatten(), preds)
print(f"MSE: {mse} - RMSE: {np.sqrt(mse)} - MAE: {mean_absolute_error(y_st_test.cpu().numpy().flatten(), preds)}")

MSE: 37.19477844238281 - RMSE: 6.098752021789551 - MAE: 3.26633358001709


In [54]:
# save model
torch.save(model_st, f'{models_folder}/single_thread.pt')

## Multi Thread

In [55]:
input_dim = len(features)
# hyperparameters
if study_mm is not None:
	num_heads = study_mm.best_trial.params['num_heads']
	model_dim = study_mm.best_trial.params['model_dim']
	num_layers = study_mm.best_trial.params['num_layers']
	dropout = study_mm.best_trial.params['dropout']
	lr = study_mm.best_trial.params['learning_rate']
	wd = study_mm.best_trial.params['weight_decay']
	num_epochs = study_mm.best_trial.params['num_epochs']
else:
	num_heads = 7
	model_dim = 413
	num_layers = 1
	dropout = 0.2897232837483553
	lr = 0.0016919131094273875
	wd = 0.0008901136789000729
	num_epochs = 49

In [56]:
# multi thread model initialization
model_mm = TransformerModel(input_dim, model_dim, num_heads, num_layers, output_dim, dropout)
if DEVICE.type == 'cuda':
	model_mm = model_mm.to(DEVICE)
criterion_mm = nn.MSELoss()
optimizer_mm = optim.AdamW(model_mm.parameters(), lr=lr, weight_decay=wd)

model_mm.train()

for epoch in range(num_epochs):
	optimizer_mm.zero_grad()
	output = model_mm(X_mm_train)
	loss = criterion_mm(output, y_mm_train)
	loss.backward()
	optimizer_mm.step()
	# validation
	if (epoch+1) % 10 == 0 or epoch == num_epochs-1:
		model_mm.eval()
		with torch.no_grad():
			val_predictions = model_mm(X_mm_test)
			val_loss = criterion_mm(val_predictions, y_mm_test)
		print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Val Loss: {val_loss.item()}')
		model_mm.train()

Epoch 10/24, Loss: 87.99156188964844, Val Loss: 586.1416625976562
Epoch 20/24, Loss: 66.74063873291016, Val Loss: 452.10601806640625
Epoch 24/24, Loss: 61.3031120300293, Val Loss: 407.0425109863281


In [57]:
model_mm.eval()
with torch.no_grad():
	preds = model_mm(X_mm_test).cpu().numpy().flatten()
mse = mean_squared_error(y_mm_test.cpu().numpy().flatten(), preds)
print(f"MSE: {mse} - RMSE: {np.sqrt(mse)} - MAE: {mean_absolute_error(y_mm_test.cpu().numpy().flatten(), preds)}")

MSE: 407.0425109863281 - RMSE: 20.175294876098633 - MAE: 20.15708351135254


In [58]:
# save model
torch.save(model_mm, f'{models_folder}/multi_thread.pt')

# Load models

In [59]:
model_g = torch.load(f'{models_folder}/general.pt').to(DEVICE)
model_st = torch.load(f'{models_folder}/single_thread.pt').to(DEVICE)
model_mm = torch.load(f'{models_folder}/multi_thread.pt').to(DEVICE)

In [60]:
# 1 instance prediction
def predict(model, X):
	model.eval()
	with torch.no_grad():
		prediction = model(X)
	return prediction

def describe_val(model, X, y):
	min_instance = {"prediction": float('inf'), "actual": 0, "index": 0}
	max_instance = {"prediction": 0, "actual": 0, "index": 0}
	
	predictions = predict(model, X).cpu().numpy().flatten()
	index_min = np.argmin(np.abs(predictions - y.cpu().numpy().flatten()))
	min_instance["prediction"] = predictions[index_min]
	min_instance["actual"] = y.cpu().numpy().flatten()[index_min]
	min_instance["index"] = index_min
	index_max = np.argmax(np.abs(predictions - y.cpu().numpy().flatten()))
	max_instance["prediction"] = predictions[index_max]
	max_instance["actual"] = y.cpu().numpy().flatten()[index_max]
	max_instance["index"] = index_max

	return min_instance, max_instance, predictions

In [68]:
# general model
print("Validation set general model")
min_instance, max_instance, predictions = describe_val(model_g, X_g_test, y_g_test)
errors = np.abs(predictions - y_g_test.cpu().numpy().flatten())
mean_error = np.mean(errors)
std_error = np.std(errors)

print(f"Mean prediction: {np.mean(predictions)} | Std actual: {np.std(predictions)}")
print(f"Mean actual: {np.mean(y_g_test.cpu().numpy().flatten())} | Std actual: {np.std(y_g_test.cpu().numpy().flatten())}")
print(f"Mean Error: {mean_error} | Std Error: {std_error}")
print("---")
print("Min instance")
print(g_test.iloc[min_instance["index"]])
print(f"Min Prediction: {min_instance['prediction']} | Actual: {min_instance['actual']} | Error: {abs(min_instance['prediction'] - min_instance['actual'])}")
print("---")
print("Max instance")
print(g_test.iloc[max_instance["index"]])
print(f"Max Prediction: {max_instance['prediction']} | Actual: {max_instance['actual']} | Error: {abs(max_instance['prediction'] - max_instance['actual'])}")

Validation set general model
Mean prediction: 24.186809539794922 | Std actual: 2.045403562078718e-06
Mean actual: 27.556499481201172 | Std actual: 7.411326885223389
Mean Error: 5.510138511657715 | Std Error: 5.993408679962158
---
Min instance
total_time                                                       17.87
total_cpu_usage                                                   0.99
max_ram_usage                                                15.527344
brand_raw                         12th Gen Intel(R) Core(TM) i5-12400F
count                                                               12
l2_cache_size                                                      7.5
l3_cache_size                                                     18.0
l2_cache_line_size                                                1280
l2_cache_associativity                                               7
benchmark                                                     N_Queens
ghz_actual_friendly                            

In [69]:
# single thread model
print("Validation set single thread model")
min_instance, max_instance, predictions = describe_val(model_st, X_st_test, y_st_test)
errors = np.abs(predictions - y_st_test.cpu().numpy().flatten())
mean_error = np.mean(errors)
std_error = np.std(errors)

print(f"Mean prediction: {np.mean(predictions)} | Std actual: {np.std(predictions)}")
print(f"Mean actual: {np.mean(y_st_test.cpu().numpy().flatten())} | Std actual: {np.std(y_st_test.cpu().numpy().flatten())}")
print(f"Mean Error: {mean_error} | Std Error: {std_error}")
print("Min instance")
print(st_test.iloc[min_instance["index"]])
print(f"Min Prediction: {min_instance['prediction']} | Actual: {min_instance['actual']} | Error: {abs(min_instance['prediction'] - min_instance['actual'])}")
print("---")
print("Max instance")
print(st_test.iloc[max_instance["index"]])
print(f"Max Prediction: {max_instance['prediction']} | Actual: {max_instance['actual']} | Error: {abs(max_instance['prediction'] - max_instance['actual'])}")

Validation set single thread model
Mean prediction: 24.714994430541992 | Std actual: 4.625657311407849e-06
Mean actual: 24.564001083374023 | Std actual: 6.096883296966553
Mean Error: 3.26633358001709 | Std Error: 5.150325298309326
Min instance
total_time                                                             24.7
total_cpu_usage                                                        0.99
max_ram_usage                                                     10.207031
brand_raw                         Intel(R) Xeon(R) CPU E5-2623 v3 @ 3.00GHz
count                                                                     8
l2_cache_size                                                           2.0
l3_cache_size                                                          10.0
l2_cache_line_size                                                      256
l2_cache_associativity                                                    2
benchmark                                                          N_Que

In [70]:
# multi thread model
print("Validation set multi thread model")
min_instance, max_instance, predictions = describe_val(model_mm, X_mm_test, y_mm_test)
errors = np.abs(predictions - y_mm_test.cpu().numpy().flatten())
mean_error = np.mean(errors)
std_error = np.std(errors)

print(f"Mean prediction: {np.mean(predictions)} | Std actual: {np.std(predictions)}")
print(f"Mean actual: {np.mean(y_mm_test.cpu().numpy().flatten())} | Std actual: {np.std(y_mm_test.cpu().numpy().flatten())}")
print(f"Mean Error: {mean_error} | Std Error: {std_error}")
print("Min instance")
print(mm_test.iloc[min_instance["index"]])
print(f"Min Prediction: {min_instance['prediction']} | Actual: {min_instance['actual']} | Error: {abs(min_instance['prediction'] - min_instance['actual'])}")
print("---")
print("Max instance")
print(mm_test.iloc[max_instance["index"]])
print(f"Max Prediction: {max_instance['prediction']} | Actual: {max_instance['actual']} | Error: {abs(max_instance['prediction'] - max_instance['actual'])}")

Validation set multi thread model
Mean prediction: 16.376916885375977 | Std actual: 0.00497777434065938
Mean actual: 36.534000396728516 | Std actual: 0.8569851517677307
Mean Error: 20.15708351135254 | Std Error: 0.8569996356964111
Min instance
total_time                                                            37.51
total_cpu_usage                                                        6.15
max_ram_usage                                                   2350.140625
brand_raw                         Intel(R) Core(TM) i5-10300H CPU @ 2.50GHz
count                                                                     8
l2_cache_size                                                           1.0
l3_cache_size                                                           8.0
l2_cache_line_size                                                      256
l2_cache_associativity                                                    6
benchmark                                                       MATRIX_M