In [28]:
import os
import random

import numpy as np
import optuna
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from optuna.samplers import TPESampler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
from torch.nn.utils.rnn import pad_sequence
#from joblib import dump

In [2]:
# CUDA: run on the GPU when one is visible to torch, otherwise on the CPU
_backend = 'cuda' if torch.cuda.is_available() else 'cpu'
DEVICE = torch.device(_backend)
DEVICE.type

'cuda'

In [3]:
# Fix random seed
seed = 42
# Seed every random number generator used below with the same value
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if DEVICE.type == 'cuda':
	torch.cuda.manual_seed(seed)
# Seeded Optuna sampler, passed to create_study later on
sampler = TPESampler(seed=seed)

# Ask cuDNN for deterministic kernels and disable its auto-tuner
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [4]:
# Set Optuna's logger to WARNING level.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Pre-processing input data

In [5]:
def bits_to_MiB(row):
	"""Convert a cache-size value to a float number of MiB.

	Accepts either a string ending in a binary unit ('KiB', 'MiB' or 'GiB',
	with or without a space before the unit) or a bare number / numeric
	string, which is divided by 2**20.

	NOTE(review): despite the name, a bare number is simply divided by 2**20
	(which is a bytes -> MiB conversion) — confirm the unit stored in the CSV.

	Raises:
		ValueError: if the value is not numeric once the unit is removed.
	"""
	# multiplier that turns a value expressed in each unit into MiB
	unit_to_mib = {'KiB': 1 / 1024, 'MiB': 1.0, 'GiB': 1024.0}
	if isinstance(row, str):
		text = row.strip()
		for unit, factor in unit_to_mib.items():
			if text.endswith(unit):
				# float() tolerates the whitespace left before the unit
				return float(text[:-len(unit)]) * factor
		return float(text) / 2 ** 20
	return float(row) / 2 ** 20


def MHz_to_GHz(row):
	"""Convert a CPU frequency value to a float number of GHz.

	A value ending in 'GHz' is returned as-is (unit stripped); anything else
	is treated as MHz — with or without an explicit 'MHz' suffix — and
	divided by 1000. The space before the unit is optional, and plain numbers
	are accepted as well as strings.

	Raises:
		ValueError: if the value is not numeric once the unit is removed.
	"""
	# str() lets numeric inputs share the same path as strings
	text = str(row).strip()
	if text.endswith('GHz'):
		return float(text[:-3])
	if text.endswith('MHz'):
		text = text[:-3]
	return float(text) / 1000

In [6]:
# Read both result files and stack them into a single frame
results_df = pd.read_csv('../results_new/execution_time.csv')
results_savio_df = pd.read_csv('../results_savio_new/execution_time.csv')
results_df = pd.concat([results_df, results_savio_df], ignore_index=True)
# preprocessing: turn the textual columns into floats and derive the GHz
# columns, then drop the columns that are no longer needed
results_df = results_df.assign(
	total_cpu_usage=lambda df: df['total_cpu_usage'].str.replace('%', '').astype(float) / 100,
	max_ram_usage=lambda df: df['max_ram_usage'] / 1024,
	l2_cache_size=lambda df: df['l2_cache_size'].apply(bits_to_MiB),
	l3_cache_size=lambda df: df['l3_cache_size'].apply(bits_to_MiB),
	ghz_actual_friendly=lambda df: df['hz_actual_friendly'].apply(MHz_to_GHz),
	ghz_advertised_friendly=lambda df: df['hz_advertised_friendly'].str.replace('GHz', '').astype(float),
).drop(columns=['hz_actual_friendly', 'hz_advertised_friendly', 'arch', 'vendor_id_raw'])

In [7]:
# Make the target dataset
# Columns that get a *_target twin; 'benchmark' is kept unrenamed as the join key
target_columns = ['total_time', 'brand_raw', 'count', 'l2_cache_size', 'l3_cache_size', 'l2_cache_line_size', 'l2_cache_associativity', 'ghz_advertised_friendly']
target_df = (
	results_df[target_columns + ['benchmark']]
	.copy()
	.rename(columns={name: f'{name}_target' for name in target_columns})
)

# Pair every row with every row of the same benchmark, then discard the
# pairs whose two sides come from the same CPU brand string
dataset_df = results_df.merge(target_df, how='inner', on='benchmark')
same_cpu = dataset_df['brand_raw'] == dataset_df['brand_raw_target']
dataset_df = dataset_df[~same_cpu]
dataset_df.head(2)

Unnamed: 0,total_time,total_cpu_usage,max_ram_usage,brand_raw,count,l2_cache_size,l3_cache_size,l2_cache_line_size,l2_cache_associativity,benchmark,ghz_actual_friendly,ghz_advertised_friendly,total_time_target,brand_raw_target,count_target,l2_cache_size_target,l3_cache_size_target,l2_cache_line_size_target,l2_cache_associativity_target,ghz_advertised_friendly_target
5,13.47,0.99,1436.714844,Intel(R) Core(TM) i5-10400 CPU @ 2.90GHz,12,1.5,12.0,256,6,KNP,4.1729,2.9,45.91,13th Gen Intel(R) Core(TM) i5-1335U,12,7.5,12.0,1280,7,2.496
6,13.47,0.99,1436.714844,Intel(R) Core(TM) i5-10400 CPU @ 2.90GHz,12,1.5,12.0,256,6,KNP,4.1729,2.9,25.77,13th Gen Intel(R) Core(TM) i5-1335U,12,7.5,12.0,1280,7,2.496


In [9]:
matrix_benchmarks = ['MATRIX_MULT', 'MATRIX_MULT2', 'MATRIX_MULT3']
test_cpu = '13th Gen Intel(R) Core(TM) i5-1335U'
st_df = dataset_df[~dataset_df['benchmark'].isin(matrix_benchmarks)]
# remove one computer for testing: training rows must not mention it on
# either side, test rows have it as the target machine
touches_test_cpu = (st_df['brand_raw'] == test_cpu) | (st_df['brand_raw_target'] == test_cpu)
st_train = st_df[~touches_test_cpu]
st_test = st_df[st_df['brand_raw_target'] == test_cpu]

In [29]:
# load test dataset
# g_test = pd.read_csv('csv/g_test.csv')
# st_test = pd.read_csv('csv/st_test.csv')
# mm_test = pd.read_csv('csv/mm_test.csv')

In [18]:
target = 'total_time_target'
embeddings = ['benchmark']
# Everything except the target and the text columns is a numeric feature
text_columns = ['benchmark', 'brand_raw', 'brand_raw_target']
features = st_test.columns.drop([target] + text_columns)
# The single-thread feature set additionally leaves out the core counts
features_st = features.drop(['count', 'count_target'])

In [20]:
# single thread data
## split data
X_st_train, y_st_train, emb_st_train = st_train[features_st], st_train[target], st_train[embeddings]
X_st_test, y_st_test, emb_st_test = st_test[features_st], st_test[target], st_test[embeddings]

## normalize data (both scalers are fitted on the training split only)
x_st_scaler = MinMaxScaler(feature_range=(0, 1))
y_st_scaler = MinMaxScaler(feature_range=(0, 1))
X_st_train = x_st_scaler.fit_transform(X_st_train)
X_st_test = x_st_scaler.transform(X_st_test)
y_st_train = y_st_scaler.fit_transform(y_st_train.to_numpy().reshape(-1, 1))
y_st_test = y_st_scaler.transform(y_st_test.to_numpy().reshape(-1, 1))

## convert to tensor
X_st_train, X_st_test, y_st_train, y_st_test = (
	torch.tensor(array, dtype=torch.float32)
	for array in (X_st_train, X_st_test, y_st_train, y_st_test)
)

In [31]:
# process embeddings
# One strace dump per benchmark present in the training split
benchmarks = emb_st_train['benchmark'].unique().copy()
filenames = [f'strace_{x}.txt' for x in benchmarks]

file_contents = []
for filename in filenames:
	with open(os.path.join('..', 'results_new', 'X86_64', filename), 'r') as f:
		file_contents.append([line.strip() for line in f])

# process only syscalls: keep the text before the first '(' of each line and
# stop at the marker strace prints when the traced process exits
list_of_sequences = []
for file in file_contents:
	calls = []
	for line in file:
		if "+++ exited with 0 +++" in line:
			break
		calls.append(line.split('(')[0])
	list_of_sequences.append(calls)

all_calls = [call for seq in list_of_sequences for call in seq]

# mapping from calls to tokens
# Id 0 is reserved for padding so that no real call shares its id, and the
# vocabulary is sorted so the ids are the same on every run (iterating a set
# of strings has no stable order across interpreter sessions).
PAD_TOKEN = 0
call_to_token = {call: idx for idx, call in enumerate(sorted(set(all_calls)), start=PAD_TOKEN + 1)}
token_to_call = {idx: call for call, idx in call_to_token.items()}

# tokenize the sequences
tokenized_sequences = [[call_to_token[call] for call in seq] for seq in list_of_sequences]

# turn the tokenized sequences into tensors with padding
# (explicit integer dtype: an empty trace would otherwise become a float tensor)
sequences_tensors = [torch.tensor(seq, dtype=torch.long) for seq in tokenized_sequences]
padded_sequences = pad_sequence(sequences_tensors, batch_first=True, padding_value=PAD_TOKEN)

# dict of benchmark to padded sequence
benchmark_to_sequence = dict(zip(benchmarks, padded_sequences))

In [34]:
if DEVICE.type == 'cuda':
	# move to DEVICE
	X_st_train, y_st_train, X_st_test, y_st_test = (
		tensor.to(DEVICE) for tensor in (X_st_train, y_st_train, X_st_test, y_st_test)
	)

# Model

In [None]:
def inv_scaling(y, y_scaler, d=False):
    """Undo a scaler's transform on a tensor of values.

    Args:
        y: tensor of scaled values; it is moved to the CPU and reshaped to a
            single column before being handed to the scaler.
        y_scaler: object exposing `inverse_transform`.
        d: when True the tensor is detached from the autograd graph first
            (needed for tensors that require grad).

    Returns:
        Whatever `y_scaler.inverse_transform` returns for the (n, 1) array.
    """
    tensor = y.detach() if d else y
    column = tensor.cpu().numpy().reshape(-1, 1)
    return y_scaler.inverse_transform(column)

In [35]:
class TransformerModel(nn.Module):
	"""Regressor combining a numeric feature vector with an encoded sequence.

	The sequence goes through a Transformer encoder, the numeric features
	through a linear projection of the same width; both are flattened,
	concatenated per sample and mapped to the output by a final linear layer.

	Args:
		input_dim_numeric: number of numeric features per sample.
		embedding_dim: width of each sequence element (the encoder's d_model);
			must be divisible by `num_heads`.
		seq_length: number of elements in every sequence; fixes the input
			size of the final layer.
		num_heads: attention heads per encoder layer.
		num_layers: number of stacked encoder layers.
		output_dim: number of outputs per sample.
	"""

	def __init__(self, input_dim_numeric, embedding_dim, seq_length, num_heads, num_layers, output_dim):
		super().__init__()
		# layers
		# batch_first=True: forward() treats dim 0 as the batch (flatten(1) and
		# cat on dim 1). With the default layout the encoder would take dim 0
		# as the sequence axis and attend across different samples.
		encoder_layer = nn.TransformerEncoderLayer(d_model=embedding_dim, nhead=num_heads, batch_first=True)
		self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
		self.fc_numeric = nn.Linear(input_dim_numeric, embedding_dim)
		# +1: the projected numeric features are appended as one extra element
		self.fc_final = nn.Linear(embedding_dim * (seq_length + 1), output_dim)

	def forward(self, numeric_inputs, seq_inputs):
		"""Return predictions of shape (batch, output_dim).

		Args:
			numeric_inputs: float tensor (batch, input_dim_numeric).
			seq_inputs: float tensor (batch, seq_length, embedding_dim); the
				elements must already be embedded, this module has no token
				embedding layer.
		"""
		numeric_embeddings = self.fc_numeric(numeric_inputs)
		seq_embeddings = self.transformer_encoder(seq_inputs)
		combined = torch.cat((seq_embeddings.flatten(1), numeric_embeddings), dim=1)
		output = self.fc_final(combined)
		return output

In [36]:
def objective(trial: optuna.Trial, X_train, X_emb_train, y_train, X_test, y_test, input_num_dim, input_emb_dim, output_dim, X_emb_test=None):
	"""Optuna objective: train a TransformerModel and return its validation MSE.

	Args:
		trial: Optuna trial used to sample the hyperparameters.
		X_train, X_test: numeric feature tensors, (batch, input_num_dim).
		X_emb_train, X_emb_test: sequence tensors fed to the encoder, expected
			as (batch, seq_length, input_emb_dim); the sequence length is read
			from `X_emb_train`.
		y_train, y_test: target tensors matching the model output shape.
		input_num_dim: number of numeric features.
		input_emb_dim: width of each sequence element.
		output_dim: number of model outputs.

	Returns:
		MSE on the validation inputs after the last epoch, as a float.

	Raises:
		ValueError: if `X_emb_test` is missing or no head count in 1..8
			divides `input_emb_dim`.
	"""
	if X_emb_test is None:
		# The model's forward pass needs sequence inputs for validation too;
		# the parameter only has a default to keep the signature compatible.
		raise ValueError("X_emb_test is required to evaluate the model on the validation split")

	# Hyperparameters to search
	# Attention needs the embedding width to be divisible by the head count,
	# so only valid head counts are offered (the list is constant per study).
	valid_heads = [heads for heads in range(1, 9) if input_emb_dim % heads == 0]
	if not valid_heads:
		raise ValueError(f"input_emb_dim must be a positive integer, got {input_emb_dim}")
	num_heads = trial.suggest_categorical('num_heads', valid_heads)
	num_layers = trial.suggest_int('num_layers', 1, 6)
	learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True)
	weight_decay = trial.suggest_float('weight_decay', 1e-5, 1e-2, log=True)
	num_epochs = trial.suggest_int('num_epochs', 10, 100)

	# model initialization; the sequence length comes from the data instead of
	# a hard-coded value so the final layer always matches the inputs
	seq_length = X_emb_train.shape[1]
	model = TransformerModel(input_num_dim, input_emb_dim, seq_length, num_heads, num_layers, output_dim)
	if DEVICE.type == 'cuda':
		model = model.to(DEVICE)
	criterion = nn.MSELoss()
	optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
	# training (full batch, one optimizer step per epoch)
	model.train()
	for epoch in range(num_epochs):
		optimizer.zero_grad()
		output = model(X_train, X_emb_train)
		loss = criterion(output, y_train)
		loss.backward()
		optimizer.step()
	# evaluation
	model.eval()
	with torch.no_grad():
		predictions = model(X_test, X_emb_test)
		val_loss = criterion(predictions, y_test)

	print(f"Trial: {trial.number} - Loss: {loss.item()} - Val Loss: {val_loss.item()}")
	return val_loss.item()

# Hyperparameter Optimization

In [37]:
# Number of trials passed to study.optimize below.
n_trials = 25
# Placeholder so the training section can test `study_st is not None` and fall
# back to its hard-coded hyperparameters when no study has been run.
study_st = None

## Single Thread

In [40]:
# configuration optuna
# NOTE(review): the lambda passes 6 arguments after `trial`, while `objective`
# as defined in this notebook requires 8 (it also takes X_emb_train,
# input_emb_dim and output_dim) — this call looks stale and raises TypeError
# as written; supply the sequence inputs and dimensions before re-running.
# NOTE(review): the split passed as validation data here (X_st_test/y_st_test)
# is the same one used for the final metrics later, so those metrics are
# tuned-on — consider a separate validation split.
study_st = optuna.create_study(direction='minimize', sampler=sampler)
study_st.optimize(lambda trial: objective(trial, X_st_train, y_st_train, X_st_test, y_st_test, len(features_st), 1), n_trials=n_trials)

[I 2024-07-03 07:55:24,799] A new study created in memory with name: no-name-dc29c454-4add-4623-b461-8651c59556d0
[I 2024-07-03 07:55:34,940] Trial 0 finished with value: 62.06943893432617 and parameters: {'num_heads': 7, 'model_dim': 112, 'num_layers': 1, 'dropout': 0.42618457138193366, 'learning_rate': 0.001319994226153501, 'weight_decay': 0.0015382308040279, 'num_epochs': 80}. Best is trial 0 with value: 62.06943893432617.


Trial: 0 - Loss: 307.4320068359375 - Val Loss: 62.06943893432617


[I 2024-07-03 07:55:35,386] Trial 1 finished with value: 507.5620422363281 and parameters: {'num_heads': 1, 'model_dim': 25, 'num_layers': 1, 'dropout': 0.4452413703502375, 'learning_rate': 0.0007411299781083245, 'weight_decay': 9.833181933644887e-05, 'num_epochs': 15}. Best is trial 0 with value: 62.06943893432617.


Trial: 1 - Loss: 744.2530517578125 - Val Loss: 507.5620422363281


[I 2024-07-03 07:55:41,791] Trial 2 finished with value: 220.41946411132812 and parameters: {'num_heads': 3, 'model_dim': 69, 'num_layers': 5, 'dropout': 0.35502298854208525, 'learning_rate': 0.0045881565491609705, 'weight_decay': 0.00026100256506134784, 'num_epochs': 20}. Best is trial 0 with value: 62.06943893432617.


Trial: 2 - Loss: 473.9880676269531 - Val Loss: 220.41946411132812


[I 2024-07-03 07:56:09,826] Trial 3 finished with value: 145.89273071289062 and parameters: {'num_heads': 6, 'model_dim': 300, 'num_layers': 4, 'dropout': 0.40838687198182444, 'learning_rate': 0.00030296104428212476, 'weight_decay': 0.0003699972431463808, 'num_epochs': 48}. Best is trial 0 with value: 62.06943893432617.


Trial: 3 - Loss: 389.7847900390625 - Val Loss: 145.89273071289062


[I 2024-07-03 07:56:12,314] Trial 4 finished with value: 542.7485961914062 and parameters: {'num_heads': 1, 'model_dim': 10, 'num_layers': 1, 'dropout': 0.3545641645055122, 'learning_rate': 8.771380343280557e-05, 'weight_decay': 0.0003355151022721483, 'num_epochs': 92}. Best is trial 0 with value: 62.06943893432617.


Trial: 4 - Loss: 775.09228515625 - Val Loss: 542.7485961914062


[I 2024-07-03 07:56:17,938] Trial 5 finished with value: 556.658447265625 and parameters: {'num_heads': 2, 'model_dim': 58, 'num_layers': 5, 'dropout': 0.191519266196649, 'learning_rate': 1.7019223026554023e-05, 'weight_decay': 7.40038575908737e-05, 'num_epochs': 24}. Best is trial 0 with value: 62.06943893432617.


Trial: 5 - Loss: 775.6277465820312 - Val Loss: 556.658447265625


[I 2024-07-03 07:57:30,330] Trial 6 finished with value: 37.884891510009766 and parameters: {'num_heads': 8, 'model_dim': 424, 'num_layers': 4, 'dropout': 0.44858423607508713, 'learning_rate': 0.0025764174425233167, 'weight_decay': 3.6283583803549155e-05, 'num_epochs': 91}. Best is trial 6 with value: 37.884891510009766.


Trial: 6 - Loss: 281.2131042480469 - Val Loss: 37.884891510009766


[I 2024-07-03 07:58:06,843] Trial 7 finished with value: 283.6199035644531 and parameters: {'num_heads': 5, 'model_dim': 265, 'num_layers': 6, 'dropout': 0.22720138998874556, 'learning_rate': 2.1387290754148914e-05, 'weight_decay': 4.8284249748183215e-05, 'num_epochs': 48}. Best is trial 6 with value: 37.884891510009766.


Trial: 7 - Loss: 524.4100952148438 - Val Loss: 283.6199035644531


[I 2024-07-03 07:58:10,406] Trial 8 finished with value: 213.32315063476562 and parameters: {'num_heads': 7, 'model_dim': 392, 'num_layers': 1, 'dropout': 0.3042989210310263, 'learning_rate': 0.000178744632562384, 'weight_decay': 4.6379219034580266e-05, 'num_epochs': 20}. Best is trial 6 with value: 37.884891510009766.


Trial: 8 - Loss: 443.738037109375 - Val Loss: 213.32315063476562


[I 2024-07-03 07:58:25,810] Trial 9 finished with value: 135.2371826171875 and parameters: {'num_heads': 3, 'model_dim': 183, 'num_layers': 2, 'dropout': 0.30751624869734645, 'learning_rate': 0.0012854549964879019, 'weight_decay': 0.00012327891605450807, 'num_epochs': 98}. Best is trial 6 with value: 37.884891510009766.


Trial: 9 - Loss: 214.21823120117188 - Val Loss: 135.2371826171875


[I 2024-07-03 07:59:10,231] Trial 10 finished with value: 38.05074691772461 and parameters: {'num_heads': 8, 'model_dim': 496, 'num_layers': 3, 'dropout': 0.1246026171282115, 'learning_rate': 0.008106149171392763, 'weight_decay': 1.5025399484753894e-05, 'num_epochs': 69}. Best is trial 6 with value: 37.884891510009766.


Trial: 10 - Loss: 281.2942810058594 - Val Loss: 38.05074691772461


[I 2024-07-03 07:59:55,966] Trial 11 finished with value: 37.85926055908203 and parameters: {'num_heads': 8, 'model_dim': 512, 'num_layers': 3, 'dropout': 0.10589171034877662, 'learning_rate': 0.008149658576652717, 'weight_decay': 1.219583825009273e-05, 'num_epochs': 71}. Best is trial 11 with value: 37.85926055908203.


Trial: 11 - Loss: 281.20562744140625 - Val Loss: 37.85926055908203


[I 2024-07-03 08:00:39,940] Trial 12 finished with value: 37.77417755126953 and parameters: {'num_heads': 8, 'model_dim': 512, 'num_layers': 3, 'dropout': 0.49215709557858045, 'learning_rate': 0.0028372961333157348, 'weight_decay': 1.0411236616031968e-05, 'num_epochs': 68}. Best is trial 12 with value: 37.77417755126953.


Trial: 12 - Loss: 281.2080078125 - Val Loss: 37.77417755126953


[I 2024-07-03 08:01:11,177] Trial 13 finished with value: 38.269779205322266 and parameters: {'num_heads': 6, 'model_dim': 384, 'num_layers': 3, 'dropout': 0.49664022433141086, 'learning_rate': 0.009186367110957461, 'weight_decay': 1.0863509237016974e-05, 'num_epochs': 66}. Best is trial 12 with value: 37.77417755126953.


Trial: 13 - Loss: 281.20819091796875 - Val Loss: 38.269779205322266


[I 2024-07-03 08:01:49,971] Trial 14 finished with value: 37.89213180541992 and parameters: {'num_heads': 8, 'model_dim': 512, 'num_layers': 3, 'dropout': 0.10368763693190514, 'learning_rate': 0.0031940106049038897, 'weight_decay': 0.004586798164817242, 'num_epochs': 61}. Best is trial 12 with value: 37.77417755126953.


Trial: 14 - Loss: 281.1505126953125 - Val Loss: 37.89213180541992


[I 2024-07-03 08:02:06,212] Trial 15 finished with value: 82.97280883789062 and parameters: {'num_heads': 5, 'model_dim': 190, 'num_layers': 2, 'dropout': 0.22973592976265728, 'learning_rate': 0.000676201440378111, 'weight_decay': 1.979748529126409e-05, 'num_epochs': 75}. Best is trial 12 with value: 37.77417755126953.


Trial: 15 - Loss: 327.6230163574219 - Val Loss: 82.97280883789062


[I 2024-07-03 08:02:18,656] Trial 16 finished with value: 38.07381057739258 and parameters: {'num_heads': 7, 'model_dim': 336, 'num_layers': 2, 'dropout': 0.16322619687285683, 'learning_rate': 0.004210179973374917, 'weight_decay': 2.432317943590038e-05, 'num_epochs': 38}. Best is trial 12 with value: 37.77417755126953.


Trial: 16 - Loss: 281.3700866699219 - Val Loss: 38.07381057739258


[I 2024-07-03 08:03:01,583] Trial 17 finished with value: 37.23849105834961 and parameters: {'num_heads': 6, 'model_dim': 252, 'num_layers': 4, 'dropout': 0.2504274959984424, 'learning_rate': 0.0016940390315482881, 'weight_decay': 0.001067356851619411, 'num_epochs': 81}. Best is trial 17 with value: 37.23849105834961.


Trial: 17 - Loss: 281.1636047363281 - Val Loss: 37.23849105834961


[I 2024-07-03 08:03:56,976] Trial 18 finished with value: 37.26271438598633 and parameters: {'num_heads': 6, 'model_dim': 240, 'num_layers': 5, 'dropout': 0.2657724769815869, 'learning_rate': 0.0017536399739157401, 'weight_decay': 0.0011021199598813852, 'num_epochs': 87}. Best is trial 17 with value: 37.23849105834961.


Trial: 18 - Loss: 281.17974853515625 - Val Loss: 37.26271438598633


[I 2024-07-03 08:04:42,014] Trial 19 finished with value: 284.8900146484375 and parameters: {'num_heads': 4, 'model_dim': 184, 'num_layers': 6, 'dropout': 0.266075759280839, 'learning_rate': 5.985229427655942e-05, 'weight_decay': 0.0014194080627116961, 'num_epochs': 83}. Best is trial 17 with value: 37.23849105834961.


Trial: 19 - Loss: 524.3458862304688 - Val Loss: 284.8900146484375


[I 2024-07-03 08:05:37,810] Trial 20 finished with value: 64.01228332519531 and parameters: {'num_heads': 6, 'model_dim': 240, 'num_layers': 5, 'dropout': 0.25891169456684265, 'learning_rate': 0.0005319533783152769, 'weight_decay': 0.0011210735795237944, 'num_epochs': 88}. Best is trial 17 with value: 37.23849105834961.


Trial: 20 - Loss: 307.80596923828125 - Val Loss: 64.01228332519531


[I 2024-07-03 08:06:09,473] Trial 21 finished with value: 38.89128494262695 and parameters: {'num_heads': 5, 'model_dim': 135, 'num_layers': 4, 'dropout': 0.3426511953937996, 'learning_rate': 0.0016169762892262577, 'weight_decay': 0.00392312460813385, 'num_epochs': 78}. Best is trial 17 with value: 37.23849105834961.


Trial: 21 - Loss: 282.9061279296875 - Val Loss: 38.89128494262695


[I 2024-07-03 08:07:12,749] Trial 22 finished with value: 37.341793060302734 and parameters: {'num_heads': 6, 'model_dim': 228, 'num_layers': 5, 'dropout': 0.2653976842275822, 'learning_rate': 0.0020999084932467383, 'weight_decay': 0.00988213445390245, 'num_epochs': 100}. Best is trial 17 with value: 37.23849105834961.


Trial: 22 - Loss: 281.1385192871094 - Val Loss: 37.341793060302734


[I 2024-07-03 08:08:15,549] Trial 23 finished with value: 105.7842025756836 and parameters: {'num_heads': 6, 'model_dim': 234, 'num_layers': 5, 'dropout': 0.2691750909442026, 'learning_rate': 0.0003436782144940018, 'weight_decay': 0.007187818462626211, 'num_epochs': 98}. Best is trial 17 with value: 37.23849105834961.


Trial: 23 - Loss: 348.4277648925781 - Val Loss: 105.7842025756836


[I 2024-07-03 08:09:12,858] Trial 24 finished with value: 37.23106002807617 and parameters: {'num_heads': 4, 'model_dim': 212, 'num_layers': 6, 'dropout': 0.19657967657361075, 'learning_rate': 0.0015831434455046685, 'weight_decay': 0.0007568934817355099, 'num_epochs': 100}. Best is trial 24 with value: 37.23106002807617.


Trial: 24 - Loss: 281.19830322265625 - Val Loss: 37.23106002807617


In [41]:
# Results: summary of the study and of its best trial
trial = study_st.best_trial
print(
	f'Número de pruebas: {len(study_st.trials)}',
	f'Mejor prueba: {trial.number}',
	f'Mejores parametros: {trial.params}',
	f'Mejor valor de pérdida en validación: {trial.value}',
	sep='\n',
)

Número de pruebas: 25
Mejor prueba: 24
Mejores parametros: {'num_heads': 4, 'model_dim': 212, 'num_layers': 6, 'dropout': 0.19657967657361075, 'learning_rate': 0.0015831434455046685, 'weight_decay': 0.0007568934817355099, 'num_epochs': 100}
Mejor valor de pérdida en validación: 37.23106002807617


# Training

In [44]:
# Folder the trained model file is written to and read back from.
models_folder = '../models/transformer'
# Number of values the model outputs per sample (passed to the model constructor).
output_dim = 1

In [45]:
#dump(scaler_g, f'{models_folder}/scaler_g.joblib')
#dump(scaler_st, f'{models_folder}/scaler_st.joblib')
#dump(scaler_mm, f'{models_folder}/scaler_mm.joblib')

## Single Thread

In [51]:
input_dim = len(features_st)
# hyperparameters
# Fallback values: used as a whole when no study was run, and per key when the
# study did not tune that key (so a missing key no longer raises KeyError).
default_params = {
	'num_heads': 6,
	'model_dim': 192,
	'num_layers': 4,
	'dropout': 0.3731512093597947,
	'learning_rate': 0.0027591245533166004,
	'weight_decay': 0.0014100590768903643,
	'num_epochs': 78,
}
best_params = study_st.best_trial.params if study_st is not None else {}
# values found by the study take precedence over the fallbacks
hyperparams = {**default_params, **best_params}

num_heads = hyperparams['num_heads']
model_dim = hyperparams['model_dim']
num_layers = hyperparams['num_layers']
dropout = hyperparams['dropout']
lr = hyperparams['learning_rate']
wd = hyperparams['weight_decay']
num_epochs = hyperparams['num_epochs']

In [52]:
# single thread model initialization
# NOTE(review): TransformerModel in this notebook is declared as
# (input_dim_numeric, embedding_dim, seq_length, num_heads, num_layers,
# output_dim). With the positional arguments below, num_heads lands in
# seq_length, num_layers in num_heads, output_dim in num_layers and dropout in
# output_dim — the call looks written for an older signature; confirm and update.
model_st = TransformerModel(input_dim, model_dim, num_heads, num_layers, output_dim, dropout)
if DEVICE.type == 'cuda':
	model_st = model_st.to(DEVICE)
criterion_st = nn.MSELoss()
optimizer_st = optim.AdamW(model_st.parameters(), lr=lr, weight_decay=wd)

model_st.train()

# Full-batch training: one optimizer step per epoch on the whole training set.
for epoch in range(num_epochs):
	optimizer_st.zero_grad()
	# NOTE(review): TransformerModel.forward takes (numeric_inputs, seq_inputs);
	# only the numeric tensor is passed here and in the validation call below.
	output = model_st(X_st_train)
	loss = criterion_st(output, y_st_train)
	loss.backward()
	optimizer_st.step()
	# validation: every 10th epoch and on the last one
	if (epoch+1) % 10 == 0 or epoch == num_epochs-1:
		model_st.eval()
		with torch.no_grad():
			val_predictions = model_st(X_st_test)
			val_loss = criterion_st(val_predictions, y_st_test)
		print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Val Loss: {val_loss.item()}')
		# back to training mode for the next epoch
		model_st.train()

Epoch 10/100, Loss: 468.1701965332031, Val Loss: 215.08013916015625
Epoch 20/100, Loss: 387.3863220214844, Val Loss: 137.333984375
Epoch 30/100, Loss: 329.1274719238281, Val Loss: 81.00090026855469
Epoch 40/100, Loss: 295.8929138183594, Val Loss: 49.859031677246094
Epoch 50/100, Loss: 283.1757507324219, Val Loss: 38.61391067504883
Epoch 60/100, Loss: 281.1542053222656, Val Loss: 37.20664596557617
Epoch 70/100, Loss: 281.47821044921875, Val Loss: 37.50769805908203
Epoch 80/100, Loss: 281.2610778808594, Val Loss: 37.17526626586914
Epoch 90/100, Loss: 281.125244140625, Val Loss: 37.208595275878906
Epoch 100/100, Loss: 281.1525573730469, Val Loss: 37.19477844238281


In [53]:
# Error metrics of the trained model on the held-out split.
# NOTE(review): y_st_test was transformed by y_st_scaler earlier, so these
# numbers are in the scaler's normalized units, not in the original time units.
model_st.eval()
with torch.no_grad():
	preds = model_st(X_st_test).cpu().numpy().flatten()
y_true = y_st_test.cpu().numpy().flatten()
mse = mean_squared_error(y_true, preds)
mae = mean_absolute_error(y_true, preds)
print(f"MSE: {mse} - RMSE: {np.sqrt(mse)} - MAE: {mae}")

MSE: 37.19477844238281 - RMSE: 6.098752021789551 - MAE: 3.26633358001709


In [54]:
# save model
# torch.save does not create missing parent folders, so make sure it exists
os.makedirs(models_folder, exist_ok=True)
# The whole module object is pickled (not just its state_dict), so loading it
# back requires the TransformerModel class to be defined.
torch.save(model_st, f'{models_folder}/single_thread.pt')

# Load models

In [59]:
# map_location lets a file saved from a GPU session be opened on a CPU-only
# machine. weights_only=False is required because the file holds a whole
# pickled module rather than a state_dict — only load files you trust, since
# unpickling can execute arbitrary code.
model_st = torch.load(f'{models_folder}/single_thread.pt', map_location=DEVICE, weights_only=False).to(DEVICE)

In [60]:
# 1 instance prediction
def predict(model, X):
	"""Run `model` on `X` in eval mode with gradient tracking disabled.

	The model is left in eval mode afterwards. Returns the model's output.
	"""
	model.eval()
	with torch.no_grad():
		return model(X)

def describe_val(model, X, y):
	"""Find the best- and worst-predicted samples of a dataset.

	Args:
		model: model handed to `predict`.
		X: model input.
		y: tensor of actual values, compared element-wise with the flattened
			predictions.

	Returns:
		(min_instance, max_instance, predictions): two dicts with keys
		"prediction", "actual" and "index" for the samples with the smallest
		and the largest absolute error, plus the flattened prediction array.
	"""
	predictions = predict(model, X).cpu().numpy().flatten()
	actual = y.cpu().numpy().flatten()
	abs_errors = np.abs(predictions - actual)

	def _instance(position):
		# summary of the sample at `position` in the flattened arrays
		return {"prediction": predictions[position], "actual": actual[position], "index": position}

	min_instance = _instance(np.argmin(abs_errors))
	max_instance = _instance(np.argmax(abs_errors))

	return min_instance, max_instance, predictions

In [69]:
# single thread model
# Report on the held-out split: global error statistics plus the rows with the
# smallest and the largest absolute error.
print("Validation set single thread model")
min_instance, max_instance, predictions = describe_val(model_st, X_st_test, y_st_test)
actual = y_st_test.cpu().numpy().flatten()
errors = np.abs(predictions - actual)
mean_error = np.mean(errors)
std_error = np.std(errors)

# the second value on this line is the std of the predictions (it was labeled "Std actual")
print(f"Mean prediction: {np.mean(predictions)} | Std prediction: {np.std(predictions)}")
print(f"Mean actual: {np.mean(actual)} | Std actual: {np.std(actual)}")
print(f"Mean Error: {mean_error} | Std Error: {std_error}")
print("Min instance")
# positions returned by describe_val index the tensors, which were built from
# st_test in row order, hence iloc
print(st_test.iloc[min_instance["index"]])
print(f"Min Prediction: {min_instance['prediction']} | Actual: {min_instance['actual']} | Error: {abs(min_instance['prediction'] - min_instance['actual'])}")
print("---")
print("Max instance")
print(st_test.iloc[max_instance["index"]])
print(f"Max Prediction: {max_instance['prediction']} | Actual: {max_instance['actual']} | Error: {abs(max_instance['prediction'] - max_instance['actual'])}")

Validation set single thread model
Mean prediction: 24.714994430541992 | Std actual: 4.625657311407849e-06
Mean actual: 24.564001083374023 | Std actual: 6.096883296966553
Mean Error: 3.26633358001709 | Std Error: 5.150325298309326
Min instance
total_time                                                             24.7
total_cpu_usage                                                        0.99
max_ram_usage                                                     10.207031
brand_raw                         Intel(R) Xeon(R) CPU E5-2623 v3 @ 3.00GHz
count                                                                     8
l2_cache_size                                                           2.0
l3_cache_size                                                          10.0
l2_cache_line_size                                                      256
l2_cache_associativity                                                    2
benchmark                                                          N_Que