In [1]:
import optuna
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from icecream import ic

  from .autonotebook import tqdm as notebook_tqdm


# Pre-processing input data

In [2]:
def bits_to_MiB(row):
	# verify if has string ' MiB'
	if 'MiB' in str(row):
		row = row.replace(' MiB', '')
		row = float(row)
	else:
		row = float(row) / np.power(2, 20)
	return row


def MHz_to_GHz(row):
	# verify if has string ' GHz'
	if 'GHz' in str(row):
		row = row.replace(' GHz', '')
		# convert to float
		row = float(row)
	else:
		row = row.replace(' MHz', '')
		row = float(row) / 1000
	return row

In [3]:
results_df = pd.read_csv('../results/execution_time.csv')
results_savio_df = pd.read_csv('../results_savio/execution_time.csv')
results_df = pd.concat([results_df, results_savio_df], ignore_index=True)
# preprocessing
results_df['total_cpu_usage'] = results_df['total_cpu_usage'].str.replace('%', '').astype(float) / 100
results_df['max_ram_usage'] = results_df['max_ram_usage'] / 1024
results_df['l2_cache_size'] = results_df['l2_cache_size'].apply(bits_to_MiB)
results_df['ghz_actual_friendly'] = results_df['hz_actual_friendly'].apply(MHz_to_GHz)
results_df['ghz_advertised_friendly'] = results_df['hz_advertised_friendly'].str.replace('GHz', '').astype(float)
results_df = results_df.drop(columns=['hz_actual_friendly', 'hz_advertised_friendly', 'arch', 'vendor_id_raw'])

In [4]:
# remove one computer for testing
g_train = results_df[results_df['brand_raw'] != '13th Gen Intel(R) Core(TM) i5-1335U'].drop(columns=['benchmark','brand_raw'])
g_test = results_df[results_df['brand_raw'] == '13th Gen Intel(R) Core(TM) i5-1335U'].drop(columns=['benchmark','brand_raw'])

In [5]:
mm_df = results_df[results_df['benchmark']=='MATRIX_MULT'].drop(columns=['benchmark'])
# remove one computer for testing
mm_train = mm_df[mm_df['brand_raw'] != '13th Gen Intel(R) Core(TM) i5-1335U'].drop(columns=['brand_raw'])
mm_test = mm_df[mm_df['brand_raw'] == '13th Gen Intel(R) Core(TM) i5-1335U'].drop(columns=['brand_raw'])

In [6]:
st_df = results_df[results_df['benchmark']!='MATRIX_MULT'].drop(columns=['benchmark'])
# remove one computer for testing
st_train = st_df[st_df['brand_raw'] != '13th Gen Intel(R) Core(TM) i5-1335U'].drop(columns=['brand_raw'])
st_test = st_df[st_df['brand_raw'] == '13th Gen Intel(R) Core(TM) i5-1335U'].drop(columns=['brand_raw'])

In [7]:
features = mm_test.columns[1:]
target = mm_test.columns[0]

In [8]:
# general data
## split data
X_g_train = g_train[features]
y_g_train = g_train[target]

X_g_test = g_test[features]
y_g_test = g_test[target]

## normalize data
scaler = StandardScaler()
X_g_train = scaler.fit_transform(X_g_train)
X_g_test = scaler.transform(X_g_test)

## convert to tensor
X_g_train = torch.tensor(X_g_train, dtype=torch.float32).unsqueeze(1)
X_g_test = torch.tensor(X_g_test, dtype=torch.float32).unsqueeze(1)
y_g_train = torch.tensor(y_g_train.values, dtype=torch.float32).view(-1, 1)
y_g_test = torch.tensor(y_g_test.values, dtype=torch.float32).view(-1, 1)

In [9]:
# single thread data
## split data
X_st_train = st_train[features]
y_st_train = st_train[target]

X_st_test = st_test[features]
y_st_test = st_test[target]

## normalize data
scaler = StandardScaler()
X_st_train = scaler.fit_transform(X_st_train)
X_st_test = scaler.transform(X_st_test)

## convert to tensor
X_st_train = torch.tensor(X_st_train, dtype=torch.float32).unsqueeze(1)
X_st_test = torch.tensor(X_st_test, dtype=torch.float32).unsqueeze(1)
y_st_train = torch.tensor(y_st_train.values, dtype=torch.float32).view(-1, 1)
y_st_test = torch.tensor(y_st_test.values, dtype=torch.float32).view(-1, 1)

In [10]:
# multi thread data
## split data
X_mm_train = st_train[features]
y_mm_train = st_train[target]

X_mm_test = st_test[features]
y_mm_test = st_test[target]

## normalize data
scaler = StandardScaler()
X_mm_train = scaler.fit_transform(X_mm_train)
X_mm_test = scaler.transform(X_mm_test)

## convert to tensor
X_mm_train = torch.tensor(X_mm_train, dtype=torch.float32).unsqueeze(1)
X_mm_test = torch.tensor(X_mm_test, dtype=torch.float32).unsqueeze(1)
y_mm_train = torch.tensor(y_mm_train.values, dtype=torch.float32).view(-1, 1)
y_mm_test = torch.tensor(y_mm_test.values, dtype=torch.float32).view(-1, 1)

# Model

In [11]:
class TransformerModel(nn.Module):
	def __init__(self, input_dim, model_dim, num_heads, num_layers, output_dim, dropout=0.1):
		super(TransformerModel, self).__init__()
		# layers
		self.embedding = nn.Linear(input_dim, model_dim)
		encoder_layer = nn.TransformerEncoderLayer(d_model=model_dim, nhead=num_heads, batch_first=True)
		self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
		self.fc = nn.Linear(model_dim, output_dim)
		self.dropout = nn.Dropout(dropout)
	
	def forward(self, x):
		x = self.embedding(x)
		x = self.dropout(x)
		x = self.transformer(x)
		x = self.fc(x.mean(dim=1))
		return x


In [17]:
def objective(trial, X_train, y_train, X_test, y_test, input_dim, output_dim):
	# Definimos los hiperparámetros a buscar
	num_heads = trial.suggest_int('num_heads', 1, 8)
	model_dim = trial.suggest_int('model_dim', num_heads * 4, num_heads * 64, step=num_heads)
	num_layers = trial.suggest_int('num_layers', 1, 6)
	dropout = trial.suggest_float('dropout', 0.1, 0.5)
	learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True)
	weight_decay = trial.suggest_float('weight_decay', 1e-5, 1e-2, log=True)
	num_epochs = trial.suggest_int('num_epochs', 10, 100)

	# model initialization 
	model = TransformerModel(input_dim, model_dim, num_heads, num_layers, output_dim, dropout)
	criterion = nn.MSELoss()
	optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

	# training
	model.train()
	for epoch in range(num_epochs):
		optimizer.zero_grad()
		output = model(X_train)
		loss = criterion(output, y_train)
		loss.backward()
		optimizer.step()

	# evaluation
	model.eval()
	with torch.no_grad():
		predictions = model(X_test)
		val_loss = criterion(predictions, y_test)

	return val_loss.item()

# Hyperparameters Optimization

## General

In [14]:
# configuration optuna
study_g = optuna.create_study(direction='minimize')
study_g.optimize(lambda trial: objective(trial, X_g_train, y_g_train, X_g_test, y_g_test, len(features), len(target)), n_trials=100)

# Results
ic(f'Número de pruebas: {len(study_g.trials)}')
ic(f'Mejor prueba: {study_g.best_trial.params}')
ic(f'Mejor valor de pérdida: {study_g.best_trial.value}')

[I 2024-06-12 04:49:16,481] A new study created in memory with name: no-name-d359ab1f-37d2-47ac-b0e6-f6b8c0b8ca07
[I 2024-06-12 04:49:18,077] Trial 0 finished with value: 298.1510009765625 and parameters: {'num_heads': 5, 'model_dim': 135, 'num_layers': 3, 'dropout': 0.14020102965756767, 'learning_rate': 0.0016063201474818608, 'weight_decay': 0.0003404098153945933, 'num_epochs': 37}. Best is trial 0 with value: 298.1510009765625.
[I 2024-06-12 04:49:19,414] Trial 1 finished with value: 821.4876098632812 and parameters: {'num_heads': 1, 'model_dim': 22, 'num_layers': 6, 'dropout': 0.4845791742902722, 'learning_rate': 1.5352616675667106e-05, 'weight_decay': 0.0008114013185230386, 'num_epochs': 30}. Best is trial 0 with value: 298.1510009765625.
[I 2024-06-12 04:49:22,270] Trial 2 finished with value: 416.85418701171875 and parameters: {'num_heads': 8, 'model_dim': 280, 'num_layers': 6, 'dropout': 0.390689487127974, 'learning_rate': 9.735509230031567e-05, 'weight_decay': 0.000414213963519

Número de pruebas: 100
Mejor prueba: {'num_heads': 8, 'model_dim': 440, 'num_layers': 4, 'dropout': 0.17754549994274815, 'learning_rate': 0.004020903372061854, 'weight_decay': 0.0001529765304527148, 'num_epochs': 10}
Mejor valor de pérdida: 110.2566909790039


## Single Thread

In [15]:
# configuration optuna
study_st = optuna.create_study(direction='minimize')
study_st.optimize(lambda trial: objective(trial, X_st_train, y_st_train, X_st_test, y_st_test, len(features), len(target)), n_trials=100)

# Results
ic(f'Número de pruebas: {len(study_st.trials)}')
ic(f'Mejor prueba: {study_st.best_trial.params}')
ic(f'Mejor valor de pérdida: {study_st.best_trial.value}')

[I 2024-06-12 05:01:09,137] A new study created in memory with name: no-name-1ca2f765-e410-498a-bd5a-25f442fcbfcc
[I 2024-06-12 05:01:10,184] Trial 0 finished with value: 480.5138244628906 and parameters: {'num_heads': 4, 'model_dim': 60, 'num_layers': 6, 'dropout': 0.42991711468106575, 'learning_rate': 0.00016839817138468574, 'weight_decay': 0.00015088227698209177, 'num_epochs': 14}. Best is trial 0 with value: 480.5138244628906.
[I 2024-06-12 05:01:11,699] Trial 1 finished with value: 330.24249267578125 and parameters: {'num_heads': 2, 'model_dim': 124, 'num_layers': 3, 'dropout': 0.3438325769521081, 'learning_rate': 0.000236733362545748, 'weight_decay': 0.00038251023377453193, 'num_epochs': 41}. Best is trial 1 with value: 330.24249267578125.
[I 2024-06-12 05:01:15,206] Trial 2 finished with value: 351.36474609375 and parameters: {'num_heads': 7, 'model_dim': 112, 'num_layers': 3, 'dropout': 0.38515227867810153, 'learning_rate': 8.091130284678718e-05, 'weight_decay': 0.0039202861497

Número de pruebas: 100
Mejor prueba: {'num_heads': 7, 'model_dim': 434, 'num_layers': 3, 'dropout': 0.4401663857563715, 'learning_rate': 0.004094956461358572, 'weight_decay': 2.779003561692465e-05, 'num_epochs': 64}
Mejor valor de pérdida: 33.48606491088867


## Multi Thread

In [18]:
# configuration optuna
study_mm = optuna.create_study(direction='minimize')
study_mm.optimize(lambda trial: objective(trial, X_mm_train, y_mm_train, X_mm_test, y_mm_test, len(features), len(target)), n_trials=100)

# Results
ic(f'Trials quantity: {len(study_mm.trials)}')
ic(f'Best trial: {study_mm.best_trial.params}')
ic(f'Mejor valor de pérdida: {study_mm.best_trial.value}')

[I 2024-06-12 05:12:15,393] A new study created in memory with name: no-name-b42da388-ff14-4989-8004-8b8b1d5d6f6c
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
[I 2024-06-12 05:12:16,352] Trial 0 finished with value: 577.6166381835938 and parameters: {'num_heads': 5, 'model_dim': 55, 'num_layers': 2, 'dropout': 0.39379409591033876, 'learning_rate': 0.00028780351583125146, 'weight_decay': 0.00979019221645173, 'num_epochs': 47}. Best is trial 0 with value: 577.6166381835938.
[I 2024-06-12 05:12:17,505] Trial 1 finished with value: 448.9656677246094 and parameters: {'num_heads': 3, 'model_dim': 183, 'num_layers': 2, 'dropout': 0.45455717814332286, 'learning_rate': 0.00042176177834808666, 'weight_decay': 1.2625036145872518e-05, 'num_epochs': 33}. Best is trial 1 with value: 448.9656677246094.
[I 2024-06-12 05:12:20,836] Trial 2 finished with value: 45.338829040527344 and parameters: {'num_heads': 8, 'model_dim': 29

'Mejor valor de pérdida: 33.852020263671875'

# Training

In [19]:
input_dim = len(features)
output_dim = 1

## General

In [20]:
# hyperparameters
num_heads = study_g.best_trial.params['num_heads']
model_dim = study_g.best_trial.params['model_dim']
num_layers = study_g.best_trial.params['num_layers']
dropout = study_g.best_trial.params['dropout']
lr = study_g.best_trial.params['learning_rate']
wd = study_g.best_trial.params['weight_decay']
num_epochs = study_g.best_trial.params['num_epochs']
ic(study_g.best_trial.params)

ic| study_g.best_trial.params: {'dropout': 0.17754549994274815,
                                'learning_rate': 0.004020903372061854,
                                'model_dim': 440,
                                'num_epochs': 10,
                                'num_heads': 8,
                                'num_layers': 4,
                                'weight_decay': 0.0001529765304527148}


{'num_heads': 8,
 'model_dim': 440,
 'num_layers': 4,
 'dropout': 0.17754549994274815,
 'learning_rate': 0.004020903372061854,
 'weight_decay': 0.0001529765304527148,
 'num_epochs': 10}

In [21]:
# general model initialization
model_g = TransformerModel(input_dim, model_dim, num_heads, num_layers, output_dim, dropout)
criterion_g = nn.MSELoss()
optimizer_g = optim.AdamW(model_g.parameters(), lr=lr, weight_decay=wd)

model_g.train()

for epoch in range(num_epochs):
	optimizer_g.zero_grad()
	output = model_g(X_g_train)
	loss = criterion_g(output, y_g_train)
	loss.backward()
	optimizer_g.step()
	# validation
	if (epoch+1) % 10 == 0:
		model_g.eval()
		with torch.no_grad():
			val_predictions = model_g(X_g_test)
			val_loss = criterion_g(val_predictions, y_g_test)
		ic(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Val Loss: {val_loss.item()}')
		model_g.train()

ic| f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Val Loss: {val_loss.item()}': 'Epoch 10/10, Loss: 50.93722152709961, Val Loss: 111.02555084228516'


## Single Thread

In [22]:
# hyperparameters
num_heads = study_st.best_trial.params['num_heads']
model_dim = study_st.best_trial.params['model_dim']
num_layers = study_st.best_trial.params['num_layers']
dropout = study_st.best_trial.params['dropout']
lr = study_st.best_trial.params['learning_rate']
wd = study_st.best_trial.params['weight_decay']
num_epochs = study_st.best_trial.params['num_epochs']
ic(study_st.best_trial.params)

ic| study_st.best_trial.params: {'dropout': 0.4401663857563715,
                                 'learning_rate': 0.004094956461358572,
                                 'model_dim': 434,
                                 'num_epochs': 64,
                                 'num_heads': 7,
                                 'num_layers': 3,
                                 'weight_decay': 2.779003561692465e-05}


{'num_heads': 7,
 'model_dim': 434,
 'num_layers': 3,
 'dropout': 0.4401663857563715,
 'learning_rate': 0.004094956461358572,
 'weight_decay': 2.779003561692465e-05,
 'num_epochs': 64}

In [23]:
# single thread model initialization
model_st = TransformerModel(input_dim, model_dim, num_heads, num_layers, output_dim, dropout)
criterion_st = nn.MSELoss()
optimizer_st = optim.AdamW(model_st.parameters(), lr=lr, weight_decay=wd)

model_st.train()

for epoch in range(num_epochs):
	optimizer_st.zero_grad()
	output = model_st(X_st_train)
	loss = criterion_st(output, y_st_train)
	loss.backward()
	optimizer_st.step()
	# validation
	if (epoch+1) % 10 == 0:
		model_st.eval()
		with torch.no_grad():
			val_predictions = model_st(X_st_test)
			val_loss = criterion_st(val_predictions, y_st_test)
		ic(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Val Loss: {val_loss.item()}')
		model_st.train()

ic| f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Val Loss: {val_loss.item()}': 'Epoch 10/64, Loss: 17.28394317626953, Val Loss: 21.656965255737305'
ic| f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Val Loss: {val_loss.item()}': 'Epoch 20/64, Loss: 17.372127532958984, Val Loss: 53.971885681152344'
ic| f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Val Loss: {val_loss.item()}': 'Epoch 30/64, Loss: 16.680809020996094, Val Loss: 28.57021141052246'
ic| f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Val Loss: {val_loss.item()}': 'Epoch 40/64, Loss: 15.886628150939941, Val Loss: 38.20534896850586'
ic| f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Val Loss: {val_loss.item()}': 'Epoch 50/64, Loss: 15.993600845336914, Val Loss: 33.39986038208008'
ic| f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Val Loss: {val_loss.item()}': 'Epoch 60/64, Loss: 15.981679916381836, Val Loss: 34.56472396850586'


## Multi Thread

In [27]:
# hyperparameters
num_heads = study_mm.best_trial.params['num_heads']
model_dim = study_mm.best_trial.params['model_dim']
num_layers = study_mm.best_trial.params['num_layers']
dropout = study_mm.best_trial.params['dropout']
lr = study_mm.best_trial.params['learning_rate']
wd = study_mm.best_trial.params['weight_decay']
num_epochs = study_mm.best_trial.params['num_epochs']
ic(study_mm.best_trial.params)

ic| study_mm.best_trial.params: {'dropout': 0.14548354765365423,
                                 'learning_rate': 0.0035210360246936668,
                                 'model_dim': 448,
                                 'num_epochs': 89,
                                 'num_heads': 7,
                                 'num_layers': 4,
                                 'weight_decay': 0.0002816526079779778}


{'num_heads': 7,
 'model_dim': 448,
 'num_layers': 4,
 'dropout': 0.14548354765365423,
 'learning_rate': 0.0035210360246936668,
 'weight_decay': 0.0002816526079779778,
 'num_epochs': 89}

In [25]:
# multi thread model initialization
model_mm = TransformerModel(input_dim, model_dim, num_heads, num_layers, output_dim, dropout)
criterion_mm = nn.MSELoss()
optimizer_mm = optim.AdamW(model_mm.parameters(), lr=lr, weight_decay=wd)

model_mm.train()
num_epochs = 500

for epoch in range(num_epochs):
	optimizer_mm.zero_grad()
	output = model_mm(X_mm_train)
	loss = criterion_mm(output, y_mm_train)
	loss.backward()
	optimizer_mm.step()
	# validation
	if (epoch+1) % 10 == 0:
		model_mm.eval()
		with torch.no_grad():
			val_predictions = model_mm(X_mm_test)
			val_loss = criterion_mm(val_predictions, y_mm_test)
		ic(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Val Loss: {val_loss.item()}')
		model_mm.train()

ic| f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Val Loss: {val_loss.item()}': 'Epoch 10/500, Loss: 15.367077827453613, Val Loss: 27.776830673217773'
ic| f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Val Loss: {val_loss.item()}': 'Epoch 20/500, Loss: 16.665973663330078, Val Loss: 47.85505294799805'
ic| f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Val Loss: {val_loss.item()}': 'Epoch 30/500, Loss: 16.205753326416016, Val Loss: 31.511938095092773'
ic| f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Val Loss: {val_loss.item()}': 'Epoch 40/500, Loss: 15.595016479492188, Val Loss: 33.759613037109375'
ic| f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Val Loss: {val_loss.item()}': 'Epoch 50/500, Loss: 15.786975860595703, Val Loss: 37.12171936035156'
ic| f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Val Loss: {val_loss.item()}': 'Epoch 60/500, Loss: 15.929427146911621, Val Loss: 33.58354949951172'
ic| f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}

# Conclusion
Queda trabajo que hacer en la red, además de conseguir más datos para un entrenamiento más robusto. Queda por ahora descartado el uso de solo un modelo para multi-threading y single-threading, ya que el modelo tiene más del triple de *loss*.