In [7]:
import numpy as np
import pandas as pd
import random
import os

from joblib import dump, load
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

# Fix the random seed: the same value seeds NumPy's global RNG and Python's
# built-in `random` module.
seed = 42
np.random.seed(seed)
random.seed(seed)

# Pre-processing input data

In [8]:
# train_df and test_df are created in the eda notebook.
required_csvs = ('csv/train_df.csv', 'csv/test_df.csv')
missing_csvs = [path for path in required_csvs if not os.path.exists(path)]
if missing_csvs:
    # Raise instead of calling exit(): exit() is an interactive-shell helper,
    # not a clean way to abort a notebook cell, and it gives no hint about
    # which file is missing.
    raise FileNotFoundError(
        f"Please run the eda notebook first (missing: {', '.join(missing_csvs)})"
    )
train_df = pd.read_csv('csv/train_df.csv')
test_df = pd.read_csv('csv/test_df.csv')
target = 'total_time_target'
# Every column except the target and the columns excluded below is a feature.
# Index.drop returns a new Index, so no explicit copy is needed.
features = test_df.columns.drop(
    [target, 'benchmark', 'brand_raw', 'brand_raw_target', 'vendor_id_raw', 'arch']
)

# Alternative feature set that was tried: union of the Spearman and Kendall
# correlation selections (correlation > 0.4).
# features = ["total_time","max_ram_usage","brk","munmap","rss","vms","shared","data"]
# Result with that set -> mean error: 3.709703484366483 | std error: 8.718651668556726

# Apply log1p to the two total-time columns (the target and the 'total_time'
# feature) in both frames. Later cells undo this with np.expm1.
log_columns = [target, 'total_time']
train_df[log_columns] = np.log1p(train_df[log_columns])
test_df[log_columns] = np.log1p(test_df[log_columns])

# Split data
X_train = train_df[features]
y_train = train_df[target]

X_test = test_df[features]
y_test = test_df[target]

# Normalize data: both scalers are fitted on the training split only and then
# applied to the test split.
x_scaler = MinMaxScaler(feature_range=(0, 1))
X_train = x_scaler.fit_transform(X_train)
X_test = x_scaler.transform(X_test)
# The target scaler needs a 2-D input, hence the reshape to a single column.
y_scaler = MinMaxScaler(feature_range=(0, 1))
y_train = y_scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test = y_scaler.transform(y_test.values.reshape(-1, 1))

In [9]:
# Persist the fitted scalers on every run. The model file is overwritten
# unconditionally when it is saved, so writing the scalers only when they are
# missing could leave stale scalers on disk that no longer match the model.
os.makedirs('../models', exist_ok=True)
for fitted_scaler, scaler_path in (
    (x_scaler, '../models/x_scaler.joblib'),
    (y_scaler, '../models/y_scaler.joblib'),
):
    dump(fitted_scaler, scaler_path)

# Training

In [10]:
def inv_scaling(y, y_scaler):
    """Map scaled target values back through ``y_scaler``.

    Args:
        y: Scaled values as any array-like (ndarray, list, pandas Series) of
            any shape; it is flattened into a single column before the
            inverse transform.
        y_scaler: Object exposing ``inverse_transform`` (e.g. the fitted
            target scaler).

    Returns:
        Whatever ``y_scaler.inverse_transform`` returns for the ``(n, 1)``
        column built from ``y``.
    """
    # np.asarray leaves ndarrays untouched and lets inputs without a
    # ``.reshape`` method (lists, pandas Series) through as well.
    column = np.asarray(y).reshape(-1, 1)
    return y_scaler.inverse_transform(column)

In [11]:
# Fit the linear model on the min-max scaled features and target.
model = LinearRegression().fit(X_train, y_train)

# Train: undo the min-max scaling on targets and predictions, then take the
# RMSE. The values are still on the log1p scale applied during pre-processing.
y_train_scaled = inv_scaling(y_train, y_scaler)
predictions_train = inv_scaling(model.predict(X_train), y_scaler)
loss = np.sqrt(mean_squared_error(y_train_scaled, predictions_train))

# Test: same computation on the held-out split.
y_scaled = inv_scaling(y_test, y_scaler)
predictions = inv_scaling(model.predict(X_test), y_scaler)
val_loss = np.sqrt(mean_squared_error(y_scaled, predictions))

print(f"RMSE Loss : {loss} - RMSE Val Loss: {val_loss}")

RMSE Loss : 0.11876714653360007 - RMSE Val Loss: 0.13633199268136872


In [12]:
# Save the model. The directory is created first so the dump cannot fail on a
# fresh checkout; the path is a plain string (it has no placeholders, so no
# f-string is needed).
os.makedirs('../models', exist_ok=True)
dump(model, '../models/linear_model.joblib')

['../models/linear_model.joblib']

# Load models

In [13]:
# Reload the model from disk, replacing the in-memory `model`, so the cells
# that follow evaluate the persisted file.
# NOTE(review): joblib files are pickle-based; only load files you trust.
model = load('../models/linear_model.joblib')

In [14]:
def describe_val(model, X, y, y_scaler):
    """Locate the best- and worst-predicted rows of a dataset.

    Predictions and targets are passed back through ``y_scaler`` and then
    through ``np.expm1`` before their absolute errors are compared.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Feature matrix handed to ``model.predict``.
        y: Scaled targets matching ``X`` row for row.
        y_scaler: Scaler used by ``inv_scaling`` to undo the target scaling.

    Returns:
        ``(min_instance, max_instance, predictions)``: two dicts with keys
        ``"prediction"``, ``"actual"`` and ``"index"`` describing the rows
        with the smallest and the largest absolute error, plus the
        back-transformed predictions array.
    """
    predictions = np.expm1(inv_scaling(model.predict(X), y_scaler))
    y_scaled = np.expm1(inv_scaling(y, y_scaler))
    abs_errors = np.abs(predictions - y_scaled)

    def _instance_at(position):
        # Summarise a single row by its position in the error array.
        return {
            "prediction": predictions[position].item(),
            "actual": y_scaled[position].item(),
            "index": position,
        }

    min_instance = _instance_at(np.argmin(abs_errors))
    max_instance = _instance_at(np.argmax(abs_errors))
    return min_instance, max_instance, predictions

In [15]:
# Re-read the test CSV to get the rows as stored on disk: `test_df` had log1p
# applied to its total-time columns during pre-processing.
test_df_og = pd.read_csv('csv/test_df.csv')

In [16]:
# Back-transform predictions and targets (undo scaling, then expm1) and store
# them side by side in a CSV.
_, _, predictions = describe_val(model, X_test, y_test, y_scaler)
y_scaled = np.expm1(inv_scaling(y_test, y_scaler))
df = pd.DataFrame({
    'predictions': predictions.ravel(),
    'actual': y_scaled.ravel(),
})
df.to_csv('csv/predictions_lr.csv', index=False)

In [17]:
print("Validation set single thread model")
min_instance, max_instance, predictions = describe_val(model, X_test, y_test, y_scaler)
# Targets with the min-max scaling and the log1p transform undone, matching
# the scale of `predictions`.
y_scaled = np.expm1(inv_scaling(y_test, y_scaler))
errors = np.abs(predictions - y_scaled)
mean_error = np.mean(errors)
std_error = np.std(errors)

# The second value on this line is the spread of the predictions, so it is
# labelled "Std prediction" (it used to be mislabelled "Std actual").
print(f"Mean prediction: {np.mean(predictions)} | Std prediction: {np.std(predictions)}")
print(f"Mean actual: {np.mean(y_scaled)} | Std actual: {np.std(y_scaled)}")
print(f"Mean Error: {mean_error} | Std Error: {std_error}")
# Show the untransformed test rows with the smallest and largest absolute
# error; the stored index is positional, hence .iloc.
print("Min instance")
print(test_df_og.iloc[min_instance["index"]])
print(f"Min Prediction: {min_instance['prediction']} | Actual: {min_instance['actual']} | Error: {abs(min_instance['prediction'] - min_instance['actual'])}")
print("---")
print("Max instance")
print(test_df_og.iloc[max_instance["index"]])
print(f"Max Prediction: {max_instance['prediction']} | Actual: {max_instance['actual']} | Error: {abs(max_instance['prediction'] - max_instance['actual'])}")

Validation set single thread model
Mean prediction: 24.08753283482682 | Std actual: 47.83066962490797
Mean actual: 23.631032844733987 | Std actual: 41.600968298351965
Mean Error: 3.14659337866484 | Std Error: 9.707796819085699
Min instance
total_time                                                             0.88
total_cpu_usage                                                        0.99
max_ram_usage                                                     24.078125
brand_raw                         Intel(R) Xeon(R) CPU E5-2670 v3 @ 2.30GHz
vendor_id_raw                                                  GenuineIntel
arch                                                                 X86_64
count                                                                    24
l2_cache_size                                                           6.0
l3_cache_size                                                          30.0
l2_cache_line_size                                                      256


In [18]:
# Show the fitted weights and the intercept.
# NOTE(review): the recorded output has several coefficients around 1e11
# (including a +2.17686195e+11 / -2.17686195e+11 pair) and an intercept near
# -3e10, which presumably points to collinear or redundant feature columns —
# confirm, and consider dropping them or using a regularised model.
print("coef:",model.coef_, "| intercept:",model.intercept_.item())

coef: [[ 9.73881506e-01 -5.38925948e-03  5.74822778e-03 -6.79802720e-03
   1.10709571e-02 -2.59094967e-02  2.82046697e-03 -4.84245982e-02
   1.55289358e-02 -2.02772496e-02  6.36884751e-02 -1.70373099e+11
  -9.36989400e+09  3.04964138e+10  2.46141618e+09 -2.86392176e+10
   2.17686195e+11 -1.30886114e+10 -1.06245319e+10  3.16407183e+11
   1.17709266e-02 -2.79885229e+10 -1.63594853e+10 -1.19288240e+10
  -9.58543929e-03 -2.17686195e+11 -1.51995769e-02  3.90525513e-02
  -1.12015457e-02  5.40859374e-02  1.12640937e-02 -4.43953429e-02
   1.14050118e+00  6.39784547e-02 -2.48586948e-02  1.78028750e-02
  -1.16562237e+00]] | intercept: -30496413758.375675
