In [13]:
from os.path import dirname, abspath, join, exists
import os
import numpy as np
from random import seed
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import HuberRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import tensorflow as tf
from tensorflow.keras.losses import Huber
import pandas as pd
import logging
from importlib import reload
import pickle
import time
# Directory prefix (including the trailing slash) that is prepended to the
# train.csv / test.csv filenames when the datasets are loaded below.
path="data/"

# C-TFIDF

In [27]:
# Name of the target column to model. It is also embedded in the pickle
# filenames under models/ and in the log messages.
modeltype="resultsize" # one of "runtime" or "resultsize"

In [None]:
# Re-import the logging module before configuring it, presumably so that
# basicConfig() takes effect even when an earlier cell already configured logging.
reload(logging)
# basicConfig() fails if the log directory is missing, so create it up front.
os.makedirs("logs", exist_ok=True)
logging.basicConfig(filename='logs/tfidf.log', level=logging.DEBUG, format="%(asctime)s    %(message)s",
                              datefmt="%H:%M")

# Load the training data.
# NOTE(review): the dtype key "mm:" (with a trailing colon) looks like a typo for
# "mm"; if the column is really called "mm" this entry presumably has no effect.
# Left unchanged because the CSV header is not visible here - confirm and fix.
train_dataset = pd.read_csv(f"{path}train.csv", on_bad_lines="skip", encoding="latin-1", lineterminator="\n",
                        dtype={"statement": str, "runtime": float, "resultsize": int, "yy": int,
                               "mm:": int, "dd": int}, memory_map=True)

# Character-level TF-IDF over 1- to 5-grams, limited to 500000 features.
tfidf = TfidfVectorizer(analyzer="char", ngram_range=(1,5), max_features=500000)
x_transformed = tfidf.fit_transform(train_dataset["statement"])

# Persist the fitted vectorizer. The context manager guarantees the file is
# flushed and closed (the bare open() previously leaked the handle).
os.makedirs("models", exist_ok=True)
with open(f"models/ctfidf_{modeltype}.pkl", "wb") as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)

In [None]:
# Linear regressor trained with the Huber loss on the TF-IDF features.
model = HuberRegressor(max_iter=10000)

# Shift the target so that its minimum maps to 1, then take the log
# (the smallest training value therefore becomes log(1) == 0).
# Series.min() is vectorised and skips NaN, unlike the builtin min(), which
# iterates element by element and is order-dependent when NaN is present.
# NOTE(review): this training-set offset is not persisted with the model, so the
# evaluation cells cannot reuse it after a kernel restart.
min_modeltype = train_dataset[modeltype].min()
pred_modeltype = np.log(train_dataset[modeltype] + 1 - min_modeltype)
model.fit(x_transformed, pred_modeltype)

logging.info(f"Fit done for {modeltype}. Starting now with evaluation")

# Persist the fitted model; the context manager closes the file handle that
# the bare open() call used to leak.
os.makedirs("models", exist_ok=True)
with open(f"models/{modeltype}.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

In [28]:
# Restore the fitted regressor and vectorizer for the selected target.
# Files are opened via context managers so the handles are closed promptly.
# SECURITY: pickle.load executes arbitrary code from the file - only load
# pickles that were produced by this notebook / a trusted source.
with open(f"models/{modeltype}.pkl", "rb") as model_file:
    model = pickle.load(model_file)
with open(f"models/ctfidf_{modeltype}.pkl", "rb") as vectorizer_file:
    tfidf = pickle.load(vectorizer_file)

In [30]:
# Re-import and reconfigure logging so this cell writes to logs/tfidf.log
# (presumably needed because another cell may have configured a different file).
reload(logging)
logging.basicConfig(filename='logs/tfidf.log', level=logging.DEBUG, format="%(asctime)s    %(message)s",
                              datefmt="%H:%M")

print("Starting evaluation")
# NOTE(review): the dtype key "mm:" looks like a typo for "mm" - confirm against the CSV header.
test_dataset = pd.read_csv(f"{path}test.csv", on_bad_lines="skip", encoding="latin-1", lineterminator="\n",
                        dtype={"statement": str, "runtime": float, "resultsize": int, "yy": int,
                               "mm:": int, "dd": int}, memory_map=True)

# Vectorise the test statements with the already-fitted TF-IDF and predict
# in log space (the space the model was trained in).
x_test_transformed = tfidf.transform(test_dataset["statement"])
pred = model.predict(x_test_transformed)

# Apply the same shift-and-log transform to the test target.
# Series.min() replaces the builtin min(): vectorised and NaN-safe.
# NOTE(review): the offset is the *test* minimum, whereas the model was fitted
# with the *training* minimum; if the two differ, targets and predictions live
# on slightly different scales - confirm which offset is intended.
min_modeltype = test_dataset[modeltype].min()
target_data = np.log(test_dataset[modeltype] + 1 - min_modeltype)

# Error metrics in log space.
mse = mean_squared_error(target_data, pred)
mae = mean_absolute_error(target_data, pred)

# Keras Huber loss with its default settings, evaluated eagerly.
h = Huber()
test_loss = h(target_data,pred).numpy()

logging.info(f"TFIDF. {modeltype}. Test loss: {test_loss}, MSE {modeltype}: {mse}, MAE {modeltype}: {mae}")

Starting evaluation


In [None]:
# Re-import and reconfigure logging so the q-error report goes to logs/tfidf.log.
reload(logging)
logging.basicConfig(filename='logs/tfidf.log', level=logging.DEBUG, format="%(asctime)s    %(message)s",
                              datefmt="%H:%M")

# Undo the shift-and-log transform to get predictions on the original scale.
# The result is stored under a new name instead of overwriting `pred`, so that
# re-running this cell does not exponentiate the predictions a second time.
# NOTE(review): min_modeltype is whatever the evaluation cell last assigned (the
# test-set minimum); the model was fitted with the training minimum - confirm.
pred_original = np.exp(np.asarray(pred)) - 1 + min_modeltype

# Compare against the raw target column. `target_data` is log-transformed, so
# dividing back-transformed predictions by it (as before) mixed two scales and
# produced meaningless q-errors. The median baseline already uses the raw column.
true_original = test_dataset[modeltype].to_numpy()

if modeltype=="resultsize":
    # Clamp both sides to at least 1 so empty results cannot cause a division by zero.
    numer = np.maximum(pred_original, 1)
    denom = np.maximum(true_original, 1)
else:
    numer = pred_original
    denom = true_original

# q-error = max(pred / true, true / pred), computed element-wise.
qerror = np.maximum(numer / denom, denom / numer)

logging.info("")
logging.info(f"Qerror for {modeltype}")
logging.info("Median: {}".format(np.median(qerror)))
logging.info("Mean: {}".format(np.mean(qerror)))
logging.info("Max: {}".format(np.max(qerror)))
# Same percentile lines as before, emitted in the same order.
for q in (10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 98):
    logging.info("{}th percentile: {}".format(q, np.percentile(qerror, q)))

# Median

In [None]:
# Re-import and reconfigure logging so the median baseline writes to logs/median.log.
reload(logging)
logging.basicConfig(filename='logs/median.log', level=logging.DEBUG, format="%(asctime)s    %(message)s",
                              datefmt="%H:%M")

# Calculate Medians
# NOTE(review): the dtype key "mm:" looks like a typo for "mm" - confirm against the CSV header.
train_dataset = pd.read_csv(f"{path}train.csv", on_bad_lines="skip", encoding="latin-1", lineterminator="\n",
                        dtype={"statement": str, "runtime": float, "resultsize": int, "yy": int,
                               "mm:": int, "dd": int}, memory_map=True)

# Offsets for the shift-and-log transform. Series.min() is vectorised and
# NaN-safe, unlike the builtin min(), which iterates the Series in Python.
min_runtime = train_dataset["runtime"].min()
min_resultsize = train_dataset["resultsize"].min()

# The baseline predicts the training median for every query ...
pred_time = np.median(train_dataset["runtime"])
pred_size = np.median(train_dataset["resultsize"])

# ... expressed in the same log space the evaluation cell uses for its targets.
pred_time = np.log(pred_time + 1 - min_runtime)
pred_size = np.log(pred_size + 1 - min_resultsize)

print(f"Log Median Time: {pred_time}")
print(f"Log Median Size: {pred_size}")

In [26]:
# Re-import and reconfigure logging so the results go to logs/median.log.
reload(logging)
logging.basicConfig(filename='logs/median.log', level=logging.DEBUG, format="%(asctime)s    %(message)s",
                              datefmt="%H:%M")

# Commented-out literal values for the two log-space predictions; presumably
# copied from an earlier run so this cell can be executed without the training cell.
# pred_time = 0.009949572809527275
# pred_size = 2.9444389791664403
logging.info("Starting the test-process for Median.")

# NOTE(review): the dtype key "mm:" looks like a typo for "mm" - confirm against the CSV header.
test_dataset = pd.read_csv(f"{path}test.csv", on_bad_lines="skip", encoding="latin-1", lineterminator="\n",
                        dtype={"statement": str, "runtime": float, "resultsize": int, "yy": int,
                               "mm:": int, "dd": int}, memory_map=True)

# Shift-and-log transform of the test targets. Series.min() replaces the
# builtin min(): vectorised and NaN-safe.
# NOTE(review): these are *test* minima, while pred_time / pred_size were
# computed with the *training* minima - confirm the offsets are meant to differ.
min_runtime = test_dataset["runtime"].min()
min_resultsize = test_dataset["resultsize"].min()

target_time = np.log(test_dataset["runtime"] + 1 - min_runtime)
target_size = np.log(test_dataset["resultsize"] + 1 - min_resultsize)

count = len(test_dataset)

# The baseline predicts one constant per target; build each constant
# prediction list once instead of re-creating it for every metric.
const_time = [pred_time]*count
const_size = [pred_size]*count

mse_runtime = mean_squared_error(target_time, const_time)
mse_resultsize = mean_squared_error(target_size, const_size)
mae_runtime = mean_absolute_error(target_time, const_time)
mae_resultsize = mean_absolute_error(target_size, const_size)

# Keras Huber loss with its default settings, evaluated eagerly.
h = Huber()
test_loss_runtime = h(target_time, const_time).numpy()
test_loss_resultsize = h(target_size, const_size).numpy()

logging.info(f"Median. Test loss runtime: {test_loss_runtime}, Test loss resultsize: {test_loss_resultsize}, overall loss: {test_loss_runtime+test_loss_resultsize}, "
             f"MSE Runtime: {mse_runtime}, MSE Resultsize: {mse_resultsize}, "
             f"MAE Runtime: {mae_runtime}, MAE Resultsize: {mae_resultsize}")

In [None]:
# actual predictions (original scale, not log space)
# The constants are kept under their own names instead of overwriting the scalar
# pred_time / pred_size: the previous cell multiplies those scalars into lists,
# so overwriting them with lists here broke any re-run of that cell.
# NOTE(review): the baseline constants are medians of the *test* set, i.e. the
# prediction is derived from the data it is scored on - confirm this is intended
# rather than the training medians.
median_time = np.median(test_dataset["runtime"])
median_size = np.median(test_dataset["resultsize"])

# Work on plain arrays: avoids per-element, label-based Series lookups.
true_time = test_dataset["runtime"].to_numpy()
# Result sizes are clamped to at least 1 on both sides to avoid dividing by zero.
true_size = np.maximum(test_dataset["resultsize"].to_numpy(), 1)
clamped_size = max(median_size, 1)

# q-error = max(pred / true, true / pred), computed element-wise.
qerror_time = np.maximum(median_time / true_time, true_time / median_time)
qerror_size = np.maximum(clamped_size / true_size, true_size / clamped_size)

for (qerror,name) in [(qerror_time,"runtime"), (qerror_size,"resultsize")]:
    logging.info("")
    logging.info(f"Qerror for {name}")
    logging.info("Median: {}".format(np.median(qerror)))
    logging.info("Mean: {}".format(np.mean(qerror)))
    logging.info("Max: {}".format(np.max(qerror)))
    # Same percentile lines as before, emitted in the same order.
    for q in (10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 98):
        logging.info("{}th percentile: {}".format(q, np.percentile(qerror, q)))