In [12]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from pytorch_tabnet.tab_model import TabNetRegressor
import random
import os
from joblib import dump

# Compute device: the GPU when CUDA is available, otherwise the CPU
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(DEVICE.type)

# Seed every random number generator used in this notebook
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if DEVICE.type == 'cuda':
	torch.cuda.manual_seed(seed)

# cuDNN flags: benchmark autotuning off, deterministic mode on
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

cuda


# Pre-processing input data

In [13]:
# train_df and test_df are created in eda notebook
_required_csvs = ('csv/train_df.csv', 'csv/test_df.csv')
_missing_csvs = [csv_path for csv_path in _required_csvs if not os.path.exists(csv_path)]
if _missing_csvs:
	# Raise instead of calling exit(): an exception stops this cell (and a
	# "Run All") right here and tells the reader which file is missing.
	raise FileNotFoundError(
		f"Please run the eda notebook first (missing: {', '.join(_missing_csvs)})"
	)
train_df = pd.read_csv('csv/train_df.csv')
test_df = pd.read_csv('csv/test_df.csv')

# Column to predict, and the model inputs: every remaining column except the
# target itself and the listed descriptive columns.
target = 'total_time_target'
features = test_df.columns.drop(
	[target, 'benchmark', 'brand_raw', 'brand_raw_target', 'vendor_id_raw', 'arch']
)

# log for total_times: copies of both frames where the two time columns are
# replaced by log1p of their values (the originals stay untouched)
train_df_log = train_df.copy()
test_df_log = test_df.copy()
train_df_log[[target, 'total_time']] = np.log1p(train_df[[target, 'total_time']])
test_df_log[[target, 'total_time']] = np.log1p(test_df[[target, 'total_time']])

# Split data
## XGB & MCD: inputs and target are taken from the untransformed frames
x_scaler = MinMaxScaler(feature_range=(0, 1))
y_scaler = MinMaxScaler(feature_range=(0, 1))

# Normalize data: both scalers are fitted on the training split and then
# applied unchanged to the test split
X_train = x_scaler.fit_transform(train_df[features])
X_test = x_scaler.transform(test_df[features])
y_train = y_scaler.fit_transform(train_df[target].values.reshape(-1, 1))
y_test = y_scaler.transform(test_df[target].values.reshape(-1, 1))

## float32 tensor versions of the four scaled arrays
X_train_t, X_test_t, y_train_t, y_test_t = (
	torch.tensor(array, dtype=torch.float32)
	for array in (X_train, X_test, y_train, y_test)
)

# Move the tensors to the GPU when one is in use
if DEVICE.type == 'cuda':
	X_train_t, y_train_t, X_test_t, y_test_t = (
		tensor.to(DEVICE)
		for tensor in (X_train_t, y_train_t, X_test_t, y_test_t)
	)

## TabNet & FNN: inputs and target are taken from the log1p-transformed frames
x_scaler_log = MinMaxScaler(feature_range=(0, 1))
y_scaler_log = MinMaxScaler(feature_range=(0, 1))

# Normalize data: both scalers are fitted on the training split and then
# applied unchanged to the test split
X_train_log = x_scaler_log.fit_transform(train_df_log[features])
X_test_log = x_scaler_log.transform(test_df_log[features])
y_train_log = y_scaler_log.fit_transform(train_df_log[target].values.reshape(-1, 1))
y_test_log = y_scaler_log.transform(test_df_log[target].values.reshape(-1, 1))

## float32 tensor versions of the four scaled arrays
X_train_t_log, X_test_t_log, y_train_t_log, y_test_t_log = (
	torch.tensor(array, dtype=torch.float32)
	for array in (X_train_log, X_test_log, y_train_log, y_test_log)
)

# Move the tensors to the GPU when one is in use
if DEVICE.type == 'cuda':
	X_train_t_log, y_train_t_log, X_test_t_log, y_test_t_log = (
		tensor.to(DEVICE)
		for tensor in (X_train_t_log, y_train_t_log, X_test_t_log, y_test_t_log)
	)


In [14]:
# Persist the raw-scale scalers; both files are (re)written as soon as at
# least one of them is missing, and left alone when both already exist.
_scaler_files = {
	'../models/x_scaler.joblib': x_scaler,
	'../models/y_scaler.joblib': y_scaler,
}
if not all(os.path.exists(_scaler_path) for _scaler_path in _scaler_files):
	for _scaler_path, _scaler_obj in _scaler_files.items():
		dump(_scaler_obj, _scaler_path)

# Model

In [15]:
class FeedforwardModel(nn.Module):
	"""Fully connected regressor with layer widths input_dim -> 64 -> 32 -> 1.

	Each of the two hidden Linear layers is followed by ReLU and Dropout; the
	final Linear layer has a single output and no activation.

	Args:
		input_dim: number of input features per sample.
		dropout: probability passed to both Dropout layers.
	"""

	def __init__(self, input_dim, dropout=0.1):
		super().__init__()
		# layers
		self.model = nn.Sequential(
			nn.Linear(input_dim, 64),
			nn.ReLU(),
			nn.Dropout(p=dropout),
			nn.Linear(64, 32),
			nn.ReLU(),
			nn.Dropout(p=dropout),
			nn.Linear(32, 1),
		)

	def forward(self, x):
		"""Run ``x`` through the layer stack and return its output."""
		return self.model(x)

	def predict(self, X):
		"""Return the network output for ``X`` in inference mode.

		Switches the module to eval mode (it is not switched back afterwards)
		and runs the forward pass with gradient tracking disabled.
		"""
		self.eval()
		with torch.no_grad():
			prediction = self(X)
		return prediction
	
class MCDropoutModel(nn.Module):
	"""Dropout MLP whose forward pass yields a Normal distribution.

	Hidden widths are 200, 500, then (n_hidden_layers - 2) further layers of
	500, then 200; every hidden Linear layer is followed by ReLU and Dropout.
	The final Linear layer has two outputs, read as location and raw scale.

	Args:
		input_dim: number of input features per sample.
		n_hidden_layers: must be at least 2, otherwise ValueError is raised.
		dropout: probability passed to every Dropout layer.
	"""

	def __init__(self, input_dim, n_hidden_layers, dropout=0.1):
		super(MCDropoutModel, self).__init__()
		if not n_hidden_layers >= 2:
			raise ValueError("n_hidden_layers must be greater than 1")

		# Sizes of consecutive activations, from the input up to the last hidden block
		widths = [input_dim, 200, 500]
		widths += [500] * (n_hidden_layers - 2)
		widths.append(200)

		stack = []
		for fan_in, fan_out in zip(widths[:-1], widths[1:]):
			stack += [nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(p=dropout)]
		# Output layer: two values per sample
		stack.append(nn.Linear(200, 2))
		self.model = nn.Sequential(*stack)

	def forward(self, x):
		"""Return Normal(loc, scale) built from the two network outputs.

		Column 0 is the location; column 1, multiplied by 0.33 and passed
		through softplus with 1e-6 added, is the scale.
		"""
		raw = self.model(x)
		mean = raw[:, :1]
		spread = torch.nn.functional.softplus(raw[:, 1:2] * 0.33) + 1e-6
		return torch.distributions.Normal(mean, spread)

	def predict(self, X):
		"""Switch to eval mode and draw one sample from the predicted distribution."""
		self.eval()
		with torch.no_grad():
			return self(X).sample()

In [16]:
def inv_scaling_pyt(y, y_scaler, d=False):
    """Undo target scaling for a torch tensor and return the scaler's output.

    The tensor is detached, copied to the CPU, converted to NumPy, reshaped to
    a single column and passed to ``y_scaler.inverse_transform``.

    Args:
        y: torch tensor of scaled values (any shape; it is flattened to one column).
        y_scaler: object exposing ``inverse_transform``.
        d: kept for backward compatibility. The tensor is now always detached,
            since detaching is harmless for tensors that do not track gradients
            and required for those that do.

    Returns:
        Whatever ``y_scaler.inverse_transform`` returns for the column array.
    """
    column = y.detach().cpu().numpy().reshape(-1, 1)
    return y_scaler.inverse_transform(column)

def inv_scaling(y, y_scaler):
    """Reshape ``y`` to a single column and return ``y_scaler.inverse_transform`` of it."""
    as_column = y.reshape(-1, 1)
    return y_scaler.inverse_transform(as_column)

In [17]:
# Folder holding the saved models, and the file of each ensemble member
models_folder = '../models/'
models_path = {
	name: f"{models_folder}{filename}"
	for name, filename in (
		('tabnet', 'tabnet_model.zip'),
		('mc_dropout', 'mc_dropout_model.pt'),
		('feedforward', 'feedforward_model.pt'),
		('xgboost', 'xgboost_model.json'),
	)
}

In [18]:
def ensemble_predict(models, X, X_t, X_log, X_t_log, y_scaler, y_scaler_log):
	"""Average the four members' predictions after undoing their target scaling.

	``models`` is indexed by position, and each position gets a fixed input
	and inverse transform:

	- ``models[0]``: ``X_t_log``, inverse ``y_scaler_log`` then ``expm1``
	- ``models[1]``: ``X_t``, inverse ``y_scaler``
	- ``models[2]``: ``X_log``, inverse ``y_scaler_log`` then ``expm1``
	- ``models[3]``: ``X``, inverse ``y_scaler``

	The first two outputs go through ``inv_scaling_pyt`` (torch tensors), the
	last two through ``inv_scaling`` (arrays).

	Returns:
		Element-wise mean of the four unscaled predictions.
	"""
	member_outputs = []

	# Members fed with tensors
	log_tensor_out = models[0].predict(X_t_log)
	member_outputs.append(np.expm1(inv_scaling_pyt(log_tensor_out, y_scaler_log)))
	raw_tensor_out = models[1].predict(X_t)
	member_outputs.append(inv_scaling_pyt(raw_tensor_out, y_scaler))

	# Members fed with arrays
	log_array_out = models[2].predict(X_log)
	member_outputs.append(np.expm1(inv_scaling(log_array_out, y_scaler_log)))
	raw_array_out = models[3].predict(X)
	member_outputs.append(inv_scaling(raw_array_out, y_scaler))

	return np.mean(member_outputs, axis=0)

# Loading trained models and evaluation

In [19]:
# general models
## TabNet
tabnet = TabNetRegressor()
tabnet.load_model(models_path['tabnet'])
## XGBoost
xgboost = xgb.XGBRegressor()
xgboost.load_model(models_path['xgboost'])

## PyTorch networks
# The loaded objects are used directly as models (moved with .to, later called
# through .predict), so these files are unpickled as whole objects: only load
# files you trust. map_location remaps the stored tensors onto the current
# device, so a checkpoint saved on a GPU also loads on a CPU-only machine.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
# which rejects fully pickled modules - pass weights_only=False if the installed
# version requires it (confirm against the environment).
# Order matters: ensemble_predict addresses the members by position.
models = [
	torch.load(models_path["feedforward"], map_location=DEVICE).to(DEVICE),
	torch.load(models_path["mc_dropout"], map_location=DEVICE).to(DEVICE),
	tabnet,
	xgboost
]



In [20]:
# Ensemble RMSE on the test split, measured on the unscaled target
y_scaled = inv_scaling(y_test, y_scaler)
preds = ensemble_predict(
	models,
	X_test, X_test_t,
	X_test_log, X_test_t_log,
	y_scaler, y_scaler_log,
)
mse = mean_squared_error(y_scaled, preds)
rmse = np.sqrt(mse)
print(f"RMSE Val loss: {rmse}")

RMSE Val loss: 3.6627801875161023


# Describe Validation

In [21]:
def describe_val(model, X, X_t, X_log, X_t_log, y, y_scaler, y_scaler_log):
	"""Locate the samples with the smallest and largest absolute ensemble error.

	``model`` is forwarded unchanged as the first argument of
	``ensemble_predict``; ``y`` is unscaled with ``inv_scaling`` and ``y_scaler``
	before being compared with the predictions.

	Returns:
		``(min_instance, max_instance, predictions)``. Each instance is a dict
		with keys ``"prediction"``, ``"actual"`` and ``"index"`` (the position
		returned by argmin / argmax of the absolute error); ``predictions`` is
		the full ensemble output.
	"""
	predictions = ensemble_predict(model, X, X_t, X_log, X_t_log, y_scaler, y_scaler_log)
	y_scaled = inv_scaling(y, y_scaler)
	abs_error = np.abs(predictions - y_scaled)

	def _instance_at(position):
		# Snapshot of one sample: predicted value, true value, and its position
		return {
			"prediction": predictions[position].item(),
			"actual": y_scaled[position].item(),
			"index": position,
		}

	min_instance = _instance_at(np.argmin(abs_error))
	max_instance = _instance_at(np.argmax(abs_error))
	return min_instance, max_instance, predictions

In [22]:
# single thread model
print("Validation set single thread model")
min_instance, max_instance, predictions = describe_val(models, X_test, X_test_t, X_test_log, X_test_t_log, y_test, y_scaler, y_scaler_log)
y_scaled = inv_scaling(y_test, y_scaler)
# Absolute error per test sample, on the unscaled target
errors = np.abs(predictions - y_scaled)
mean_error = np.mean(errors)
std_error = np.std(errors)

# This line summarises the predictions, so its spread is labelled
# "Std prediction" (it previously read "Std actual").
print(f"Mean prediction: {np.mean(predictions)} | Std prediction: {np.std(predictions)}")
print(f"Mean actual: {np.mean(y_scaled)} | Std actual: {np.std(y_scaled)}")
print(f"Mean Error: {mean_error} | Std Error: {std_error}")
print("---")
# Test-set row with the smallest absolute error
print("Min instance")
print(test_df.iloc[min_instance["index"]])
print(f"Min Prediction: {min_instance['prediction']} | Actual: {min_instance['actual']} | Error: {abs(min_instance['prediction'] - min_instance['actual'])}")
print("---")
# Test-set row with the largest absolute error
print("Max instance")
print(test_df.iloc[max_instance["index"]])
print(f"Max Prediction: {max_instance['prediction']} | Actual: {max_instance['actual']} | Error: {abs(max_instance['prediction'] - max_instance['actual'])}")

Validation set single thread model
Mean prediction: 23.23479461669922 | Std actual: 42.17317581176758
Mean actual: 23.631032844733987 | Std actual: 41.60096829835196
Mean Error: 1.576265782239755 | Std Error: 3.223901116720842
---
Min instance
total_time                                                            11.73
total_cpu_usage                                                        0.99
max_ram_usage                                                     45.234375
brand_raw                          Intel(R) Xeon(R) Gold 6130 CPU @ 2.10GHz
vendor_id_raw                                                  GenuineIntel
arch                                                                 X86_64
count                                                                    32
l2_cache_size                                                          32.0
l3_cache_size                                                          22.0
l2_cache_line_size                                                      