In [None]:
import matplotlib
import os
from datetime import timedelta

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt, dates

# Seaborn theme: "notebook" context, Verdana font, fonts scaled 1.5x.
sns.set_theme(context="notebook", font="Verdana", font_scale=1.5)

# Matplotlib defaults, applied after the theme so the theme does not override them.
matplotlib.rcParams.update(
    {
        "figure.figsize": (16, 9),
        "figure.dpi": 200,
        "lines.linewidth": 2.5,
    }
)

# Directory that holds the `data.csv` file read by the loading cell.
BASE_DIR = "2023-11-30"

In [None]:
# Load the results, order them by start time, and use the resulting 0-based
# row position as the "Test number" index.
csv_path = os.path.join(BASE_DIR, "data.csv")
df = (
    pd.read_csv(csv_path, parse_dates=["start_time", "end_time"])
    .sort_values("start_time")
    .reset_index(drop=True)
    .rename_axis(index="Test number")
)

# Scale file sizes down by 1024 (later plots label this column as KiB;
# presumably the CSV stores bytes -- confirm against the data producer).
df["file_size"] = df["file_size"].div(1024)
df["File name"] = df["file_path"].map(os.path.basename)

# Split the test numbers into 10 equal-width bins for the box plot.
df["Test bin"] = pd.cut(df.index, bins=10, precision=0)

# Cumulative statistics: row i holds the mean / median of tests 0..i.
running_speed = df["processing_speed"].expanding()
df["mean_processing_speed"] = running_speed.mean()
df["median_processing_speed"] = running_speed.median()

# Parse durations into timedeltas and expose them as plain seconds as well.
df["duration"] = pd.to_timedelta(df["duration"])
df["Test duration (s)"] = df["duration"].map(lambda elapsed: elapsed.total_seconds())

In [None]:
# Running (expanding-window) mean and median of processing speed per test number.
fig, ax = plt.subplots()
for speed_column, legend_label in (
    ("mean_processing_speed", "Mean"),
    ("median_processing_speed", "Median"),
):
    sns.lineplot(data=df, x=df.index, y=speed_column, label=legend_label, ax=ax)
ax.set_ylabel("Processing speed (B/s)")

In [None]:
# Percentage error of every test, with the y-axis fixed to the +/-100 % window.
fig, ax = plt.subplots()
sns.lineplot(x=df.index, y="percentage_error", data=df, ax=ax)
ax.set_ylabel("Error (%)")
ax.set_ylim(bottom=-100, top=100)

In [None]:
# Raw processing speed of every test, in test order.
fig, ax = plt.subplots()
sns.lineplot(x=df.index, y="processing_speed", data=df, ax=ax)
ax.set_ylabel("Processing speed (B/s)")

In [None]:
# Top panel: distribution of the percentage error per test bin.
# Bottom panel: percentage error of every individual test.
fig, axs = plt.subplots(nrows=2, figsize=(16, 12))
box_ax, line_ax = axs
sns.boxplot(data=df, x="Test bin", y="percentage_error", ax=box_ax)
sns.lineplot(data=df, x=df.index, y="percentage_error", ax=line_ax)

# Build one label per *category* (not per observed value): the box plot
# reserves a slot for every category, so `unique()` would yield too few
# labels whenever a bin is empty.
test_bins = df["Test bin"].cat.categories
bin_labels = [f"{int(interval.left) + 1}-{int(interval.right)}" for interval in test_bins]

# Pin the tick positions before relabelling so each label is tied to its box.
box_ax.set_xticks(range(len(bin_labels)))
box_ax.set_xticklabels(bin_labels)

for ax in axs:
    ax.set_ylim(-100, 100)
    ax.set_ylabel("Error (%)")

In [None]:
# Tests numbered 0-11. Take an explicit copy so that adding a column below
# writes to an independent frame instead of a slice of `df`
# (avoids pandas' SettingWithCopyWarning).
first_twelve_df = df[df.index < 12].copy()
first_twelve_df["Test number"] = first_twelve_df.index

first_twelve_df

In [None]:
# Processing speed against file size; `logx=True` asks seaborn to fit the
# regression on log(x) while drawing the data in the original x units.
fig, ax = plt.subplots()
sns.regplot(x="file_size", y="processing_speed", data=df, logx=True, ax=ax)
ax.set_xlabel("File size (KiB)")
ax.set_ylabel("Processing speed (Bytes/s)")

In [None]:
# Histogram of the file sizes across all tests.
fig, ax = plt.subplots()
sns.histplot(df, x="file_size", ax=ax)

In [None]:
# Bucket every test into one of three ordered file-size categories.
# Intervals are right-closed: (0, 1024], (1024, 10240], (10240, inf).
# A size of exactly 0 falls outside every bin and becomes NaN.
# NOTE(review): `file_size` was divided by 1024 when loaded and is plotted as
# KiB elsewhere, so these edges look like 1 MiB / 10 MiB while the labels say
# KiB -- confirm which unit is intended.
df["File size bin"] = pd.cut(
    df["file_size"],
    bins=[0, 1024, 10240, np.inf],  # `np.infty` alias was removed in NumPy 2.0
    precision=0,
    labels=["<1KiB", "1KiB-10KiB", ">10KiB"],
)

In [None]:
# One regression panel per file-size bin, each with independent axis ranges.
g = sns.FacetGrid(df, col="File size bin", col_wrap=2, height=6, aspect=1.2, sharex=False, sharey=False)
g.map_dataframe(sns.regplot, x="file_size", y="processing_speed", logx=False)

# Take each panel's median from the same "File size bin" column that defines
# the facets, so the line always describes exactly the points drawn in that
# panel (hand-written thresholds disagree with the bins at the bin edges).
median_speed_by_bin = df.groupby("File size bin", observed=False)["processing_speed"].median()

# `axes_dict` maps each bin label to its axes, so no panel order is assumed.
for bin_label, ax in g.axes_dict.items():
    median_speed = median_speed_by_bin.get(bin_label)
    if pd.notna(median_speed):  # an empty bin has no median to draw
        ax.axhline(median_speed, color="red", linestyle="--", label=f"Median speed: {median_speed:.0f} B/s")
        # Per-panel legend: the grid-level legend only knows artists created
        # by `map_dataframe`, so it would not list the median line.
        ax.legend()

    ax.set_xlabel("File size (KiB)")
    ax.set_ylabel("Processing speed (B/s)")
    ax.xaxis.set_major_formatter('{x:,.0f}')
    ax.yaxis.set_major_formatter('{x:,.0f}')

g.fig.tight_layout()

In [None]:
# Per-bin regressions of processing speed on file size, laid out three
# panels per row, each panel with its own x and y ranges.
g = sns.FacetGrid(
    data=df,
    col="File size bin",
    col_wrap=3,
    height=6,
    aspect=1,
    sharex=False,
    sharey=False,
)
g.map_dataframe(sns.regplot, x="file_size", y="processing_speed", logx=False)

# Whole numbers with thousands separators on both axes.
thousands_format = '{x:,.0f}'
for ax in g.axes:
    ax.set_xlabel("File size (KiB)")
    ax.set_ylabel("Processing speed (B/s)")
    for axis in (ax.xaxis, ax.yaxis):
        axis.set_major_formatter(thousands_format)