In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import random
import os
from joblib import dump, load

# Fix random seed
seed = 42
np.random.seed(seed)
random.seed(seed)

# Pre-processing input data

In [2]:
# train_df and test_df are created in eda notebook
if not os.path.exists('csv/train_df.csv') or not os.path.exists('csv/test_df.csv'):
	print('Please run the eda notebook first')
	exit()
train_df = pd.read_csv('csv/train_df.csv')
test_df = pd.read_csv('csv/test_df.csv')
target = 'total_time_target'
features = test_df.columns.copy().drop(target).drop(['benchmark','brand_raw', 'brand_raw_target','vendor_id_raw', 'arch'])

# Spearman U Kendall correlation feature selection > 0.4
# features = ["total_time","max_ram_usage","brk","munmap","rss","vms","shared","data"]
# mean error: 3.709703484366483 | std error: 8.718651668556726

# log for total_times
train_df[[target, 'total_time']] = np.log1p(train_df[[target, 'total_time']])
test_df[[target, 'total_time']] = np.log1p(test_df[[target, 'total_time']])

# Split data
X_train = train_df[features]
y_train = train_df[target]

X_test = test_df[features]
y_test = test_df[target]

# Normalize data
x_scaler = MinMaxScaler(feature_range=(0, 1))
X_train = x_scaler.fit_transform(X_train)
X_test = x_scaler.transform(X_test)
y_scaler = MinMaxScaler(feature_range=(0, 1))
y_train = y_scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test = y_scaler.transform(y_test.values.reshape(-1, 1))

In [3]:
if not os.path.exists('../models/x_scaler.joblib') or not os.path.exists('../models/y_scaler.joblib'):
	dump(x_scaler, '../models/x_scaler.joblib')
	dump(y_scaler, '../models/y_scaler.joblib')

# Training

In [4]:
def inv_scaling(y, y_scaler):
    return y_scaler.inverse_transform(y.reshape(-1, 1))

In [5]:
model = LinearRegression()
model.fit(X_train, y_train)

# Train
predictions_train = model.predict(X_train)
predictions_train = inv_scaling(predictions_train, y_scaler)
y_train_scaled = inv_scaling(y_train, y_scaler)
loss = np.sqrt(mean_squared_error(y_train_scaled, predictions_train))

# Test
predictions = model.predict(X_test)
predictions = inv_scaling(predictions, y_scaler)
y_scaled = inv_scaling(y_test, y_scaler)
val_loss = np.sqrt(mean_squared_error(y_scaled, predictions))
print(f"RMSE Loss : {loss} - RMSE Val Loss: {val_loss}")

RMSE Loss : 0.11729050296859356 - RMSE Val Loss: 0.13450032219524993


In [6]:
# save model
dump(model, f'../models/linear_model.joblib')

['../models/linear_model.joblib']

# Load models

In [7]:
model = load('../models/linear_model.joblib')

In [8]:
def describe_val(model, X, y, y_scaler):
	min_instance = {"prediction": float('inf'), "actual": 0, "index": 0}
	max_instance = {"prediction": 0, "actual": 0, "index": 0}

	predictions = model.predict(X)
	predictions = np.expm1(inv_scaling(predictions, y_scaler))
	y_scaled = np.expm1(inv_scaling(y, y_scaler))
	index_min = np.argmin(np.abs(predictions - y_scaled))
	min_instance["prediction"] = predictions[index_min].item()
	min_instance["actual"] = y_scaled[index_min].item()
	min_instance["index"] = index_min
	index_max = np.argmax(np.abs(predictions - y_scaled))
	max_instance["prediction"] = predictions[index_max].item()
	max_instance["actual"] = y_scaled[index_max].item()
	max_instance["index"] = index_max

	return min_instance, max_instance, predictions

In [9]:
test_df_og = pd.read_csv('csv/test_df.csv')

In [10]:
print("Validation set single thread model")
min_instance, max_instance, predictions = describe_val(model, X_test, y_test, y_scaler)
y_scaled = np.expm1(inv_scaling(y_test, y_scaler))
errors = np.abs(predictions - y_scaled)
mean_error = np.mean(errors)
std_error = np.std(errors)

print(f"Mean prediction: {np.mean(predictions)} | Std actual: {np.std(predictions)}")
print(f"Mean actual: {np.mean(y_scaled)} | Std actual: {np.std(y_scaled)}")
print(f"Mean Error: {mean_error} | Std Error: {std_error}")
print("Min instance")
print(test_df_og.iloc[min_instance["index"]])
print(f"Min Prediction: {min_instance['prediction']} | Actual: {min_instance['actual']} | Error: {abs(min_instance['prediction'] - min_instance['actual'])}")
print("---")
print("Max instance")
print(test_df_og.iloc[max_instance["index"]])
print(f"Max Prediction: {max_instance['prediction']} | Actual: {max_instance['actual']} | Error: {abs(max_instance['prediction'] - max_instance['actual'])}")

Validation set single thread model
Mean prediction: 22.77146059422678 | Std actual: 44.98680039102537
Mean actual: 23.401548499932737 | Std actual: 44.0307253488359
Mean Error: 2.6752771049312525 | Std Error: 7.7224077213252835
Min instance
total_time                                                            30.09
total_cpu_usage                                                        0.99
max_ram_usage                                                     62.207031
brand_raw                         Intel(R) Xeon(R) CPU E5-2643 v3 @ 3.40GHz
vendor_id_raw                                                  GenuineIntel
arch                                                                 X86_64
count                                                                    12
l2_cache_size                                                           3.0
l3_cache_size                                                          20.0
l2_cache_line_size                                                      256

In [11]:
# get index of total_time_target < 2.00
index = test_df_og[test_df_og['total_time_target'] < 2.0].index
X_test_low = X_test[index]
y_test_low = y_test[index]
test_df_og_low = test_df_og.iloc[index].reset_index(drop=True)
min_instance, max_instance, predictions = describe_val(model, X_test_low, y_test_low, y_scaler)
y_scaled = inv_scaling(y_test_low, y_scaler)
errors = np.abs(predictions - y_scaled)
mean_error = np.mean(errors)
std_error = np.std(errors)

print("Validation set single thread model with total_time_target < 2.0")
print(f"Mean prediction: {np.mean(predictions)} | Std actual: {np.std(predictions)}")
print(f"Mean actual: {np.mean(y_scaled)} | Std actual: {np.std(y_scaled)}")
print(f"Mean Error: {mean_error} | Std Error: {std_error}")
print("Min instance")
print(test_df_og_low.iloc[min_instance["index"]])
print(f"Min Prediction: {min_instance['prediction']} | Actual: {min_instance['actual']} | Error: {abs(min_instance['prediction'] - min_instance['actual'])}")
print("---")
print("Max instance")
print(test_df_og_low.iloc[max_instance["index"]])
print(f"Max Prediction: {max_instance['prediction']} | Actual: {max_instance['actual']} | Error: {abs(max_instance['prediction'] - max_instance['actual'])}")

Validation set single thread model with total_time_target < 2.0
Mean prediction: 0.7819675342794404 | Std actual: 0.5620442147314038
Mean actual: 0.500334335537253 | Std actual: 0.2995686663396718
Mean Error: 0.2926018329482411 | Std Error: 0.27236656410655335
Min instance
total_time                                                             1.59
total_cpu_usage                                                        0.99
max_ram_usage                                                     13.355469
brand_raw                         Intel(R) Xeon(R) CPU E5-2643 v3 @ 3.40GHz
vendor_id_raw                                                  GenuineIntel
arch                                                                 X86_64
count                                                                    12
l2_cache_size                                                           3.0
l3_cache_size                                                          20.0
l2_cache_line_size                        

In [12]:
# get index of 2.00 > total_time_target > 150.0
index = test_df_og[(test_df_og['total_time_target'] <= 150.0) & (test_df_og['total_time_target'] >= 2.0)].index
X_test_med = X_test[index]
y_test_med = y_test[index]
test_df_og_med = test_df_og.iloc[index].reset_index(drop=True)
min_instance, max_instance, predictions = describe_val(model, X_test_med, y_test_med, y_scaler)
y_scaled = inv_scaling(y_test_med, y_scaler)
errors = np.abs(predictions - y_scaled)
mean_error = np.mean(errors)
std_error = np.std(errors)

print("Validation set single thread model with total_time_target in [2.0, 150.0]")
print(f"Mean prediction: {np.mean(predictions)} | Std actual: {np.std(predictions)}")
print(f"Mean actual: {np.mean(y_scaled)} | Std actual: {np.std(y_scaled)}")
print(f"Mean Error: {mean_error} | Std Error: {std_error}")
print("Min instance")
print(test_df_og_med.iloc[min_instance["index"]])
print(f"Min Prediction: {min_instance['prediction']} | Actual: {min_instance['actual']} | Error: {abs(min_instance['prediction'] - min_instance['actual'])}")
print("---")
print("Max instance")
print(test_df_og_med.iloc[max_instance["index"]])
print(f"Max Prediction: {max_instance['prediction']} | Actual: {max_instance['actual']} | Error: {abs(max_instance['prediction'] - max_instance['actual'])}")

Validation set single thread model with total_time_target in [2.0, 150.0]
Mean prediction: 25.398474253867388 | Std actual: 23.278467740416215
Mean actual: 3.0454776818737144 | Std actual: 0.7425768880201676
Mean Error: 22.352996571993675 | Std Error: 22.667219662030526
Min instance
total_time                                                            30.09
total_cpu_usage                                                        0.99
max_ram_usage                                                     62.207031
brand_raw                         Intel(R) Xeon(R) CPU E5-2643 v3 @ 3.40GHz
vendor_id_raw                                                  GenuineIntel
arch                                                                 X86_64
count                                                                    12
l2_cache_size                                                           3.0
l3_cache_size                                                          20.0
l2_cache_line_size              

In [13]:
# get index of total_time_target > 150.0
index = test_df_og[test_df_og['total_time_target'] > 150.0].index
X_test_high = X_test[index]
y_test_high = y_test[index]
test_df_og_high = test_df_og.iloc[index].reset_index(drop=True)
min_instance, max_instance, predictions = describe_val(model, X_test_high, y_test_high, y_scaler)
y_scaled = inv_scaling(y_test_high, y_scaler)
errors = np.abs(predictions - y_scaled)
mean_error = np.mean(errors)
std_error = np.std(errors)

print("Validation set single thread model with total_time_target > 150.00")
print(f"Mean prediction: {np.mean(predictions)} | Std actual: {np.std(predictions)}")
print(f"Mean actual: {np.mean(y_scaled)} | Std actual: {np.std(y_scaled)}")
print(f"Mean Error: {mean_error} | Std Error: {std_error}")
print("Min instance")
print(test_df_og_high.iloc[min_instance["index"]])
print(f"Min Prediction: {min_instance['prediction']} | Actual: {min_instance['actual']} | Error: {abs(min_instance['prediction'] - min_instance['actual'])}")
print("---")
print("Max instance")
print(test_df_og_high.iloc[max_instance["index"]])
print(f"Max Prediction: {max_instance['prediction']} | Actual: {max_instance['actual']} | Error: {abs(max_instance['prediction'] - max_instance['actual'])}")

Validation set single thread model with total_time_target > 150.00
Mean prediction: 265.0003490592625 | Std actual: 57.988730725035104
Mean actual: 5.587667460601355 | Std actual: 0.04794298966420108
Mean Error: 259.41268159866104 | Std Error: 57.94981849233364
Min instance
total_time                                                           180.64
total_cpu_usage                                                        0.99
max_ram_usage                                                     61.023438
brand_raw                          Intel(R) Xeon(R) Gold 6130 CPU @ 2.10GHz
vendor_id_raw                                                  GenuineIntel
arch                                                                 X86_64
count                                                                    64
l2_cache_size                                                          32.0
l3_cache_size                                                          22.0
l2_cache_line_size                       

In [14]:
print("coef:",model.coef_, "| intercept:",model.intercept_.item())

coef: [[ 9.75098437e-01 -5.68713297e-03  6.66116461e-03  9.16932761e-02
  -3.26922821e-02 -4.78445737e-02  9.98038083e-03 -3.69154526e-02
   1.55888539e-02 -4.39803006e-03  5.97708731e-02  3.62969515e+10
  -7.74771447e+09 -5.16152820e+09  1.16694751e+09  1.64954757e+10
  -6.23289111e+09  5.66181982e+09 -1.16694751e+09 -4.62563320e+09
  -4.58892590e+09 -5.01065747e+09 -8.88217922e+09  1.20155142e-02
  -1.04004844e-02 -4.88107783e+09 -1.31026203e-02  3.67365374e-02
  -1.05680164e-02  5.40880622e-02  1.23006059e-02 -4.61068429e-02
   1.01700788e+00  5.29796536e-02 -2.19776227e-02  1.83246572e-02
  -1.03679409e+00]] | intercept: -5661819818.458075
