In [None]:
import matplotlib
import os
from datetime import timedelta

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt, dates

# Seaborn theme: notebook context with enlarged Verdana text.
sns.set_theme(context="notebook", font="Verdana", font_scale=1.5)

# Default figure geometry and line weight for every plot in this notebook.
matplotlib.rcParams.update({
    'figure.figsize': (16, 9),
    'figure.dpi': 200,
    'lines.linewidth': 2.5,
})

# Directory the CSV inputs are read from; swap in one of the alternatives
# below to analyse a different data set.
BASE_DIR = "old_synth_data/2023-11-15"
# BASE_DIR = "2023-11-16_13-27-31"
# BASE_DIR = "2023-11-16_14-17-41"

In [None]:
# Per-file metadata, indexed by (file name, pickle file name).
# "file_size" is renamed so it cannot clash with the "file_size" column of
# data.csv when the two frames are merged later on.
files = pd.read_csv(
    os.path.join(BASE_DIR, "files.csv"),
    index_col=["file_name", "pickle_files"],
).rename(columns={"file_size": "pickle_file_size"})
files.head()

In [None]:
# Load the per-test results; incomplete rows and exact duplicates are discarded.
df = pd.read_csv(os.path.join(BASE_DIR, "data.csv"))
df.dropna(how="any", inplace=True)
df.drop_duplicates(inplace=True)

# Index by the bare file name so the rows can be matched against `files`.
df["file_name"] = df["file_path"].str.split("/").str[-1]
df.set_index("file_name", inplace=True)

# Set increasing number for every row (one row per test at this point)
df["Test number"] = pd.Series(range(1, len(df.index) + 1), index=df.index)
df["Duration"] = pd.to_timedelta(df["duration"])
df["Duration (s)"] = df["Duration"].dt.total_seconds()

print("#indices in df:", len(df.index))
print("#indices in files:", len(files.index))
# Attach the pickle-file metadata. From here on a test spans one row per pickle
# file of its input file, and later cells read the pickle name from index level 1.
df = pd.merge(df, files, left_index=True, right_on=files.index.get_level_values(0), how="left")

df.rename(columns={
    "start_time": "Start time",
    "end_time": "End time",
    "expected_duration_at_schedule_time": "Expected duration at schedule time",
    "max_concurrency_at_execution_time": "Max concurrency at execution time",
    "difference_with_deadline": "Difference with deadline",
    "file_path": "File path",
    "file_name": "File name",
    "total_kwh_used": "Total kWh used",
    "file_size": "File size (KiB)",
    "median_processing_speed_before_test": "Median processing speed before test (KiB/s)",
    "processing_speed": "Task processing speed (KiB/s)",
    "percentage_error": "Percentage error of predicted duration vs actual duration",
    "num_pickles": "Number of pickle files",
    "pickle_file_size": "Pickle file size (KiB)",
}, inplace=True)

df.sort_values("Start time", inplace=True)
# df.reset_index(inplace=True, drop=True)
# df.index.names = ["Test number"]

# Convert bytes to KiB.
# The pickle file size is converted as well so that its "(KiB)" label is true;
# this assumes files.csv reports sizes in bytes like data.csv does - TODO confirm.
for size_column in (
    "File size (KiB)",
    "Task processing speed (KiB/s)",
    "Median processing speed before test (KiB/s)",
    "Pickle file size (KiB)",
):
    df[size_column] = df[size_column] / 1024

# Drop the first test (it had no previous data to use, so used the fallback speed).
# Filter on the test number rather than dropping the first index label: after the
# merge that label covers only one (file, pickle) row of the test, and the same
# label may also belong to a later test.
if not df.empty:
    first_test_number = df["Test number"].iloc[0]
    df = df[df["Test number"] != first_test_number].copy()

df.head()

In [None]:
# Prediction error of every test, clipped to +/-100 % on the y axis.
fig, ax = plt.subplots()
sns.scatterplot(
    data=df,
    x="Test number",
    y="Percentage error of predicted duration vs actual duration",
    ax=ax,
)
ax.set(ylim=(-100, 100), ylabel="Running time prediction error (%)")

# Thousands separators and no decimals on both axes.
for axis in (ax.xaxis, ax.yaxis):
    axis.set_major_formatter('{x:,.0f}')


In [None]:
# Distribution of input file sizes, one histogram per number of pickle files.
fig, ax = plt.subplots()
grouped_by_num_pickles = df.groupby("Number of pickle files")
for name, group in grouped_by_num_pickles:
    # Label each histogram from its own group key so the legend stays correct
    # when a pickle count is missing from the data or more than three exist.
    label = f"{int(name)} random pickle files"
    sns.histplot(group["File size (KiB)"], label=label, ax=ax)

ax.xaxis.set_major_formatter('{x:,.0f}')
ax.legend()

In [None]:
# Compare how often each pickle file occurs in all tests versus in the tests
# whose absolute prediction error exceeds 50 %.
is_high_error = np.abs(df["Percentage error of predicted duration vs actual duration"]) > 50
df_high_error = df[is_high_error]

# Share (in %) of rows that each pickle file (index level 1) accounts for.
pickle_file_appearances = df.index.get_level_values(1).value_counts(normalize=True) * 100
# A pickle file that never occurs in a high-error test has a share of 0 %, not a
# missing value; without the reindex it would turn into NaN and lose its bars.
pickle_file_appearances_with_high_error = (
    df_high_error.index.get_level_values(1)
    .value_counts(normalize=True)
    .reindex(pickle_file_appearances.index, fill_value=0)
    * 100
)
relative_increase = (pickle_file_appearances_with_high_error - pickle_file_appearances) / pickle_file_appearances * 100

df_error = pd.DataFrame(index=pickle_file_appearances.index)
df_error["File name"] = df_error.index
df_error["Proportion in all tests"] = pickle_file_appearances
df_error["Proportion in tests with high error"] = pickle_file_appearances_with_high_error
df_error["Relative increase (%)"] = relative_increase

df_error.sort_values("Relative increase (%)", inplace=True, ascending=False)
# Long format so both proportions can be drawn side by side with `hue`.
df_error_melted = df_error.melt(id_vars=["File name"], value_vars=["Proportion in all tests", "Proportion in tests with high error"], var_name="Type", value_name="Proportion (%)")

fig, axs = plt.subplots(nrows=2, sharex=True)
sns.barplot(data=df_error_melted, x="File name", y="Proportion (%)", hue="Type", ax=axs[0])
sns.barplot(data=df_error, x="File name", y="Relative increase (%)", ax=axs[1])

axs[1].set_ylim(-150, 150)

plt.xticks(rotation=45, ha="right")

for ax in axs:
    ax.set_xlabel("Pickle file name")

In [None]:
# Split the range of test numbers into 10 equal-width bins; the bin of each
# row is stored as a categorical "Test bin" column.
test_numbers = df["Test number"]
df["Test bin"] = pd.cut(test_numbers, bins=10, precision=0)

In [None]:
# Drop values with a percentage error of more than 100% or less than -100%
#df = df[(df["Percentage error of predicted duration vs actual duration"] <= 100) & (df["Percentage error of predicted duration vs actual duration"] >= -100)]

In [None]:
# Drop values with a median processing speed outside of 2 standard deviations
#df = df[np.abs(df["Median processing speed before test (KiB/s)"] - df["Median processing speed before test (KiB/s)"].mean()) <= (2 * df["Median processing speed before test (KiB/s)"].std())]

In [None]:
# Drop values with a percentage error outside of 2 standard deviations
#df = df[np.abs(df["Percentage error of predicted duration vs actual duration"] - df["Percentage error of predicted duration vs actual duration"].mean()) <= (2 * df["Percentage error of predicted duration vs actual duration"].std())]

In [None]:
# Drop values with a processing speed outside of 2 standard deviations
#df = df[np.abs(df["Task processing speed (KiB/s)"] - df["Task processing speed (KiB/s)"].mean()) <= (2 * df["Task processing speed (KiB/s)"].std())]

In [None]:
# Drop tests that have kinggothalion.pkl or bmkibler.pkl as their second-level index (currently disabled)

# len_before = len(df.index.get_level_values(0).unique())
# indices_with_kinggothalion = df.index[df.index.get_level_values(1).isin(["kinggothalion.pkl", "bmkibler.pkl"])]
# df.drop(indices_with_kinggothalion.get_level_values(0), inplace=True)
# len_after = len(df.index.get_level_values(0).unique())
# 
# print("Dropped", len_before - len_after, "tests")

In [None]:
# Preview the first rows of the frame that the remaining cells plot.
df.head()

In [None]:
# Add a 10-row rolling mean next to each metric, stored in a column named
# "<metric> (moving average)".
for metric in (
    "Median processing speed before test (KiB/s)",
    "Percentage error of predicted duration vs actual duration",
    "Task processing speed (KiB/s)",
):
    df[f"{metric} (moving average)"] = df[metric].rolling(10).mean()

In [None]:
# Two stacked panels sharing the x axis: the median speed known before each
# test (top) and the resulting prediction error (bottom).
fig, axs = plt.subplots(nrows=2, sharex=True)
panels = [
    (axs[0], "Median processing speed before test (KiB/s)"),
    (axs[1], "Percentage error of predicted duration vs actual duration"),
]
for panel_ax, column in panels:
    sns.lineplot(
        data=df,
        x="Test number",
        y=column,
        label=column,
        ax=panel_ax,
        errorbar=None,
    )

axs[0].set_ylabel("KiB/s")
axs[1].set_ylim(-100, 100)
axs[1].set_ylabel("Prediction error (%)")

# Thousands separators and no decimals on every axis.
for ax in axs:
    for axis in (ax.xaxis, ax.yaxis):
        axis.set_major_formatter('{x:,.0f}')

In [None]:
# Measured task speed against the median speed known before the test.
fig, ax = plt.subplots(figsize=(16, 5))
speed_columns = (
    "Task processing speed (KiB/s)",
    "Median processing speed before test (KiB/s)",
)
for column in speed_columns:
    sns.lineplot(
        data=df,
        x="Test number",
        y=column,
        label=column,
        ax=ax,
        errorbar=None,
    )
ax.set_ylabel("KiB/s")
for axis in (ax.xaxis, ax.yaxis):
    axis.set_major_formatter('{x:,.0f}')

In [None]:
# Task processing speed together with its moving average.
fig, ax = plt.subplots()
speed_columns = (
    "Task processing speed (KiB/s)",
    "Task processing speed (KiB/s) (moving average)",
)
for column in speed_columns:
    sns.lineplot(
        data=df,
        x="Test number",
        y=column,
        label=column,
        ax=ax,
        errorbar=None,
    )
ax.set_ylabel("KiB/s")

In [None]:
# Distribution of the task processing speed within each test bin.
fig, ax = plt.subplots()
sns.boxplot(data=df, x="Test bin", y="Task processing speed (KiB/s)", ax=ax)

# Take the bins from the categorical's categories: that is the set and order of
# boxes seaborn draws. `.unique()` would follow the row order instead and would
# leave out bins without observations, shifting the labels.
xticks = df["Test bin"].cat.categories
xtick_labels = [f"{int(x.left) + 1}-{int(x.right)}" for x in xticks]
ax.set_xticklabels(xtick_labels, rotation=45)

In [None]:
# Prediction error together with its moving average, clipped to +/-100 %.
fig, ax = plt.subplots()
error_columns = (
    "Percentage error of predicted duration vs actual duration",
    "Percentage error of predicted duration vs actual duration (moving average)",
)
for column in error_columns:
    sns.lineplot(
        data=df,
        x="Test number",
        y=column,
        label=column,
        ax=ax,
        errorbar=None,
    )

ax.set_ylim(-100, 100)
ax.set_ylabel("Percentage error")

In [None]:
# Distribution of the prediction error within each test bin, clipped to +/-100 %.
fig, ax = plt.subplots()
sns.boxplot(data=df, x="Test bin", y="Percentage error of predicted duration vs actual duration", ax=ax)
ax.set_ylim(-100, 100)
ax.set_ylabel("Task duration prediction error (%)")

# Take the bins from the categorical's categories: that is the set and order of
# boxes seaborn draws. `.unique()` would follow the row order instead and would
# leave out bins without observations, shifting the labels.
xticks = df["Test bin"].cat.categories
xtick_labels = [f"{int(x.left) + 1}-{int(x.right)}" for x in xticks]
ax.set_xticklabels(xtick_labels, rotation=45)

In [None]:
# One panel per number of pickle files, showing the prediction error over time.
g = sns.FacetGrid(data=df, col="Number of pickle files", height=6, aspect=1)
g.map_dataframe(
    sns.lineplot,
    x="Test number",
    y="Percentage error of predicted duration vs actual duration",
    errorbar=None,
)
g.set(ylim=(-100, 100))

# Dashed reference line marking a perfect prediction (0 % error).
for facet_ax in g.axes.flat:
    facet_ax.axhline(y=0, linestyle='--', color='black')

In [None]:
# One panel per number of pickle files, showing the prediction error per test bin.
g = sns.FacetGrid(df, col="Number of pickle files", height=6, aspect=1)
g.map_dataframe(
    sns.boxplot,
    x="Test bin",
    y="Percentage error of predicted duration vs actual duration"
)
g.set(ylim=(-100, 100))

# Take the bins from the categorical's categories: that is the set and order of
# boxes seaborn draws in every facet. `.unique()` would follow the row order
# instead and would leave out bins without observations, shifting the labels.
xticks = df["Test bin"].cat.categories
xtick_labels = [f"{int(x.left) + 1}-{int(x.right)}" for x in xticks]

for ax in g.axes.flat:
    # Reference line marking a perfect prediction (0 % error).
    ax.axhline(0, ls='--', color='red', alpha=0.5)
    ax.set_xticklabels(xtick_labels, rotation=90)
    ax.set_ylabel("Prediction error (%)")

plt.tight_layout()