### Imports

In [1]:
import sys

from matplotlib.pyplot import tick_params

sys.path.append(r"C:\Users\Sensei\DataGripProjects\DatabaseComparisons\mdbs_visualization")
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio

import seaborn as sns

# Plotly Express defaults applied to every px figure below: the "seaborn" template,
# with seaborn's "deep" palette (converted to hex strings) as the discrete color cycle.
px.defaults.template = "seaborn"

pal = sns.color_palette("deep").as_hex()
px.defaults.color_discrete_sequence = pal

In [2]:
from visu_script import parse_files as pf

### Parameters

In [3]:
# Directory handed to pf.get_usecase_instances() for the main plots of this notebook
# (the bar chart further down titles these runs "Usecase (1-6)").
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
FILES_PATH = r"C:\Users\Sensei\DataGripProjects\DatabaseComparisons\mdbs_visualization\results_no_bulk_importing_with_container_restart"
# Separate results directory that is parsed only for the use-case 7 ("Bulk Import") line plot.
FILES_PATH_UC7 = r"C:\Users\Sensei\Downloads\MDS\results"

In [4]:
# Prefix that every fig.write_image() call below prepends to its output file name.
# A raw string literal cannot end in a single backslash, hence the doubled one: the
# resulting value ends in two backslash characters.
# NOTE(review): absolute, machine-specific path; presumably the directory must already exist - confirm.
OUTPUT_PATH = r"C:\Users\Sensei\DataGripProjects\DatabaseComparisons\mdbs_visualization\visu_script\svgs\\"

### Code

In [5]:
# Parse the result files under FILES_PATH into run objects; the next cell reads
# attributes such as .database, .timestamp and .execution_time from each of them.
case_db_run_instances = pf.get_usecase_instances(FILES_PATH)

In [6]:
# Flatten every parsed run (case_db_run_instances comes from the cell above) into one
# row; each column mirrors one attribute of the run object.
records = [
    {
        "database": run.database,
        "timestamp": run.timestamp,
        "usecase": run.usecase,
        "usecase_full_name": run.usecase_full_name,
        "execution_time_s": run.execution_time,
        "execution_time_ms": run.execution_time_ms,
        "avg_cpu_percent": run.avg_cpu_percent,
        "peak_cpu_percent": run.peak_cpu_percent,
        "avg_mem_mb": run.avg_mem_mb,
        "peak_mem_mb": run.peak_mem_mb,
    }
    for run in case_db_run_instances
]

# Rows ordered by timestamp, with a fresh 0..n-1 index.
df = pd.DataFrame(records).sort_values("timestamp").reset_index(drop=True)

### Plot1: Box‐Plot: Distribution of Execution Time by Database

In [7]:
# Box plot of the raw execution times per database with all use cases pooled;
# points="all" additionally draws every individual run as a point.
fig = px.box(
    df,
    x="database",
    y="execution_time_s",
    color="database",
    points="all",
    title="Execution‐Time Distribution by Database across all usecases",
)

# Center the title (its text stays the one set above) and label both axes.
fig.update_layout(
    title=dict(x=0.5, xanchor="center"),
    xaxis_title="Database System",
    yaxis_title="Execution Time (seconds)",
)

fig.write_image(OUTPUT_PATH + "exec-time-by-database-avg.svg")

### Plot2: Line‐Plot Over Time: How Execution Time Evolves

In [8]:
# One line chart per use case listed in `desc`: mean execution time per calendar day,
# one line per database. Each chart is written to "<use-case label>.png" and shown.

desc = {
    1: "Usecase 1 Filtering Query",
    2: "Usecase 2 Bulk Update",
    3: "Usecase 3 Schema Evolution",
    4: "Usecase 4 Analytical Query",
    5: "Usecase 5 Aggregation",
    6: "Usecase 6 Filtering Query",
}

for uc, ucname in desc.items():
    plot_title = f"Average Execution Time for {ucname} Over Time"

    # Runs of this use case only, tagged with the calendar day they were recorded on.
    df_uc = df.loc[df["usecase"] == uc].copy()
    df_uc["date_only"] = df_uc["timestamp"].dt.date

    # Daily mean execution time for each database.
    daily_mean = (
        df_uc
        .groupby(["date_only", "database"])["execution_time_s"]
        .mean()
        .reset_index(name="mean_exec_time_s")
    )

    fig = px.line(
        daily_mean,
        x="date_only",
        y="mean_exec_time_s",
        color="database",
        markers=True,
        title=plot_title,
    )

    # Centered title, readable axis labels, and a y-axis that always includes zero.
    fig.update_layout(
        title=dict(text=plot_title, x=0.5, xanchor="center"),
        xaxis=dict(title=dict(text="Date")),
        yaxis=dict(
            title=dict(text="Average Execution Time (s)"),
            rangemode="tozero",
        ),
    )

    fig.write_image(f"{OUTPUT_PATH}{ucname}.png")
    fig.show()

### Plot3: Line‐Plot Over Time: How Execution Time Evolves for Usecase 7

In [9]:
# Daily-mean execution-time line chart for use case 7 (bulk import), whose result
# files live in their own directory (FILES_PATH_UC7).
#
# The UC7 data is deliberately kept in dedicated variables (uc7_run_instances,
# records_uc7, df_uc7) instead of rebinding the notebook-wide `case_db_run_instances`
# and `df`: overwriting those made the earlier plot cells silently operate on
# UC7-only data whenever they were re-run after this cell.

desc = {
    7: "Usecase 7 Bulk Import",
}

uc7_run_instances = pf.get_usecase_instances(FILES_PATH_UC7)

# One row per parsed run; each column mirrors one attribute of the run object.
records_uc7 = [
    {
        "database": run.database,
        "timestamp": run.timestamp,
        "usecase": run.usecase,
        "usecase_full_name": run.usecase_full_name,
        "execution_time_s": run.execution_time,
        "execution_time_ms": run.execution_time_ms,
        "avg_cpu_percent": run.avg_cpu_percent,
        "peak_cpu_percent": run.peak_cpu_percent,
        "avg_mem_mb": run.avg_mem_mb,
        "peak_mem_mb": run.peak_mem_mb,
    }
    for run in uc7_run_instances
]

# Rows ordered by timestamp, with a fresh 0..n-1 index.
df_uc7 = pd.DataFrame(records_uc7).sort_values("timestamp").reset_index(drop=True)

for uc, ucname in desc.items():
    plot_title = f"Average Execution Time for {ucname} Over Time"

    # Runs of this use case only, tagged with the calendar day they were recorded on.
    df_uc = df_uc7[df_uc7["usecase"] == uc].copy()
    df_uc["date_only"] = df_uc["timestamp"].dt.date

    # Daily mean execution time for each database.
    grouped = (
        df_uc
        .groupby(["date_only", "database"])
        .execution_time_s
        .mean()
        .reset_index(name="mean_exec_time_s")
    )

    fig = px.line(
        grouped,
        x="date_only",
        y="mean_exec_time_s",
        color="database",
        markers=True,
        title=plot_title,
    )

    # Centered title, readable axis labels, and a y-axis that always includes zero.
    fig.update_layout(
        title={
            'text': plot_title,
            'x': 0.5,
            'xanchor': 'center'
        },
        xaxis_title="Date",
        yaxis_title="Average Execution Time (s)",
    )
    fig.update_layout(yaxis=dict(rangemode='tozero'))

    fig.write_image(f"{OUTPUT_PATH}{ucname}.png")
    fig.show()

In [10]:
import pandas as pd
import plotly.express as px
from pandas.api.types import CategoricalDtype

# Grouped bar chart: mean execution time of each use case (1-6) per database, drawn
# on a logarithmic y-axis and saved as a PNG.
case_db_run_instances = pf.get_usecase_instances(FILES_PATH)

# One row per parsed run; only the columns this chart needs.
records = [
    {
        "database": run.database,
        "timestamp": run.timestamp,
        "usecase_full_name": run.usecase_full_name,
        "execution_time_s": run.execution_time,
    }
    for run in case_db_run_instances
]
df = pd.DataFrame(records).sort_values("timestamp").reset_index(drop=True)

# Mean execution time per (use case, database).
agg_exec = (
    df
    .groupby(["usecase_full_name", "database"])
    .execution_time_s
    .mean()
    .reset_index(name="mean_execution_time_s")
)

# Raw use-case name -> multi-line tick label ("<br>" is a line break in Plotly text).
renames = {
    "add solar panels":             "uc 3:<br>Schema Evolution",
    "average price per city":       "uc 5:<br>Aggregation",
    "filter by bedrooms and size":  "uc 1:<br>Filtering Query<br> Bedrooms and Size",
    "filter properties":            "uc 6:<br>Filtering Query Properties",
    "price analysis":               "uc 4:<br>Analytical Query",
    "update prices":                "uc 2:<br>Bulk Update",
}
agg_exec["usecase_full_name"] = agg_exec["usecase_full_name"].map(renames)

# Series.map() turns every name that is missing from `renames` into NaN. Drop those
# rows explicitly so that only the six labelled use cases reach the chart, instead of
# passing NaN categories on to Plotly.
agg_exec = agg_exec.dropna(subset=["usecase_full_name"]).reset_index(drop=True)

# Left-to-right order of the use cases on the x-axis.
order = [
    "uc 1:<br>Filtering Query<br> Bedrooms and Size",
    "uc 2:<br>Bulk Update",
    "uc 3:<br>Schema Evolution",
    "uc 4:<br>Analytical Query",
    "uc 5:<br>Aggregation",
    "uc 6:<br>Filtering Query Properties",
]

# Ordered categorical, so the column itself also carries the display order.
cat_type = CategoricalDtype(categories=order, ordered=True)
agg_exec["usecase_full_name"] = agg_exec["usecase_full_name"].astype(cat_type)

# No title here: the styled title is set once in update_layout() below.
fig = px.bar(
    agg_exec,
    x="usecase_full_name",
    y="mean_execution_time_s",
    color="database",
    barmode="group",
    category_orders={"usecase_full_name": order},
)

# Pin the category order on the axis itself and keep the multi-line labels horizontal.
fig.update_xaxes(
    categoryorder="array",
    categoryarray=order,
    tickangle=0,
    tickfont=dict(size=16),
    title_text="Usecases",
    title_font=dict(size=24),
)

# Log-scaled y-axis with explicit decade ticks. `rangemode` and `dtick` are left out on
# purpose: rangemode applies to linear axes only, and explicit tickvals put the axis
# into tickmode="array", which makes dtick irrelevant.
fig.update_yaxes(
    type="log",
    title_text="Avg. Execution Time (s) (log scale)",
    tickvals=[0.01, 0.1, 1, 10, 100],
    ticktext=["0.01", "0.1", "1", "10", "100"],
    tickfont=dict(size=16),
    title_font=dict(size=24),
    exponentformat="power",
    showexponent="all",
)

fig.update_layout(
    title={"text": "Average Execution Time per Usecase (1-6) by Database", "x":0.45, "xanchor":"center", "font" : {"size": 32}},
    margin=dict(t=60, b=100),  # larger bottom margin to fit the multi-line tick labels
    legend=dict(
        traceorder="grouped",
        title=dict(
            text="Legend",
            font=dict(size=24)
        ),
        grouptitlefont=dict(
            size=16
        ),
        font=dict(
            size=20
        )
    ),
)

fig.write_image(f"{OUTPUT_PATH}Average-Execution-Time-Per-Usecase.png", width=1350, height=750, scale=2)
fig.show()


### Plot5: Box‐Plot of Peak CPU Usage by Database

In [11]:
from pandas.api.types import CategoricalDtype

# Rebuild the full per-run table from case_db_run_instances (loaded in the previous
# cell) and replace the raw use-case names with display labels in a fixed order.
records = [
    {
        "database": run.database,
        "timestamp": run.timestamp,
        "usecase": run.usecase,
        "usecase_full_name": run.usecase_full_name,
        "execution_time_s": run.execution_time,
        "execution_time_ms": run.execution_time_ms,
        "avg_cpu_percent": run.avg_cpu_percent,
        "peak_cpu_percent": run.peak_cpu_percent,
        "avg_mem_mb": run.avg_mem_mb,
        "peak_mem_mb": run.peak_mem_mb,
    }
    for run in case_db_run_instances
]

# Rows ordered by timestamp, with a fresh 0..n-1 index.
df = pd.DataFrame(records).sort_values("timestamp").reset_index(drop=True)

# Raw use-case name -> display label (names missing from this dict map to NaN).
renames = {
    "add solar panels":             "Schema Evolution",
    "average price per city":       "Aggregation",
    "filter by bedrooms and size":  "Filtering Query Bedrooms and Size",
    "filter properties":            "Filtering Query Properties",
    "price analysis":               "Analytical Query",
    "update prices":                "Bulk Update",
}

# Category order of the display labels.
order = [
    "Filtering Query Bedrooms and Size",
    "Bulk Update",
    "Schema Evolution",
    "Analytical Query",
    "Aggregation",
    "Filtering Query Properties",
]

# Relabel, then store the labels as an ordered categorical following `order`.
cat_type = CategoricalDtype(categories=order, ordered=True)
df["usecase_full_name"] = df["usecase_full_name"].map(renames).astype(cat_type)





In [12]:



# === Peak CPU usage: one box per database plus one strip of points per use case ===
#
# Each database occupies the x-slot centered on its integer position i. Inside a slot
# the box covers the left part, and the remaining width is split evenly into one
# vertical strip of markers per use case.

# --- Data preparation ---
# Unique (usecase, label) pairs sorted by use-case number: drives strip order and names.
uc_meta = df[['usecase', 'usecase_full_name']].drop_duplicates().sort_values('usecase').reset_index(drop=True)

# Legend label for each use case, e.g. "uc1: <label>".
uc_meta['short_name'] = "uc" + uc_meta['usecase'].astype(str) + ": " + uc_meta['usecase_full_name'].astype(str)

# Ordered labels for the plotting loop, plus a label -> legend-name lookup.
# Both are reused by the average-memory plot in the next cell.
ordered_uc_full_names = uc_meta['usecase_full_name'].tolist()
short_name_map = pd.Series(uc_meta.short_name.values, index=uc_meta.usecase_full_name).to_dict()

fig = go.Figure()
db_names = df['database'].unique()
db_colors = px.colors.qualitative.Vivid
uc_colors = px.colors.qualitative.D3
# Colors are assigned by position and wrap around if there are more names than colors.
db_color_map = {name: db_colors[i % len(db_colors)] for i, name in enumerate(db_names)}
uc_color_map = {name: uc_colors[i % len(uc_colors)] for i, name in enumerate(ordered_uc_full_names)}

# --- Spacing definition (offsets relative to a database's integer x position) ---
BOX_WIDTH = 0.3
BOX_START_X_REL = -0.45
STRIPS_START_X_REL = BOX_START_X_REL + BOX_WIDTH + 0.05  # small gap between box and strips
STRIPS_END_X_REL = 0.45
STRIPS_TOTAL_WIDTH = STRIPS_END_X_REL - STRIPS_START_X_REL
STRIP_SPACING = STRIPS_TOTAL_WIDTH / len(ordered_uc_full_names) if ordered_uc_full_names else 0

# --- Add traces ---
# Use cases that already own a legend entry. The entry is attached to the first strip
# that is actually added for a use case; tying it to the first database (i == 0)
# dropped the entry whenever that database had no runs for the use case.
uc_in_legend = set()

for i, db_name in enumerate(db_names):
    df_db = df[df['database'] == db_name]

    # Box over all runs of this database, centered inside the box region of its slot.
    fig.add_trace(go.Box(
        y=df_db['peak_cpu_percent'], x0=i + BOX_START_X_REL + (BOX_WIDTH / 2), name=db_name,
        marker_color=db_color_map[db_name], width=BOX_WIDTH, boxpoints=False, showlegend=True,
        legendgroup="1-databases", legendgrouptitle_text="Databases"
    ))

    for j, uc_full_name in enumerate(ordered_uc_full_names):
        df_uc = df_db[df_db['usecase_full_name'] == uc_full_name]
        if df_uc.empty:
            continue

        # Horizontal center of strip j inside this database's slot.
        strip_x_pos = i + STRIPS_START_X_REL + (j * STRIP_SPACING) + (STRIP_SPACING / 2)

        is_first_strip_for_uc = uc_full_name not in uc_in_legend
        uc_in_legend.add(uc_full_name)

        fig.add_trace(go.Scatter(
            x=[strip_x_pos] * len(df_uc), y=df_uc['peak_cpu_percent'], mode='markers',
            name=short_name_map[uc_full_name],
            marker_color=uc_color_map[uc_full_name], showlegend=is_first_strip_for_uc,
            legendgroup="2-usecases", legendgrouptitle_text="Usecases"
        ))

# --- Finalize layout with larger fonts ---
fig.update_layout(
    title=dict(
        text="Peak CPU Usage: Boxplot and Usecase Strips by Database",
        x=0.45,
        font=dict(size=32)
    ),
    xaxis=dict(
        title=dict(text="Database", font=dict(size=24)),
        # One tick per database slot, labelled with the database name.
        tickmode='array',
        tickvals=list(range(len(db_names))),
        ticktext=db_names,
        tickfont=dict(size=20),
    ),
    yaxis=dict(
        title=dict(text="Peak CPU Percent (%)", font=dict(size=24)),
        tickfont=dict(size=20),
    ),
    legend=dict(
        traceorder="grouped",
        title=dict(
            text="Legend",
            font=dict(size=24)
        ),
        grouptitlefont=dict(
            size=16
        ),
        font=dict(
            size=16
        )
    ),
    boxmode='overlay',
    template='seaborn',
)

fig.write_image(f"{OUTPUT_PATH}Average-Peak-CPU-Usage-Per-Usecase.png", width=1350, height=750, scale=2)
fig.show()

### Plot6: Box‐Plot of Average Memory (MB) by Database

In [13]:
# === Average memory usage: one box per database plus one strip of points per use case ===
#
# Same layout as the peak-CPU figure: each database occupies the x-slot centered on its
# integer position i, with the box on the left and one marker strip per use case on
# the right. Relies on `df`, `ordered_uc_full_names` and `short_name_map` from the
# cells above.
fig = go.Figure()
db_names = df['database'].unique()
db_colors = px.colors.qualitative.Vivid
uc_colors = px.colors.qualitative.D3
# Colors are assigned by position and wrap around if there are more names than colors.
db_color_map = {name: db_colors[i % len(db_colors)] for i, name in enumerate(db_names)}
uc_color_map = {name: uc_colors[i % len(uc_colors)] for i, name in enumerate(ordered_uc_full_names)}

# --- Spacing definition (offsets relative to a database's integer x position) ---
BOX_WIDTH = 0.3
BOX_START_X_REL = -0.45
STRIPS_START_X_REL = BOX_START_X_REL + BOX_WIDTH + 0.05  # small gap between box and strips
STRIPS_END_X_REL = 0.45
STRIPS_TOTAL_WIDTH = STRIPS_END_X_REL - STRIPS_START_X_REL
STRIP_SPACING = STRIPS_TOTAL_WIDTH / len(ordered_uc_full_names) if ordered_uc_full_names else 0

# --- Add traces ---
# Use cases that already own a legend entry. The entry is attached to the first strip
# that is actually added for a use case; tying it to the first database (i == 0)
# dropped the entry whenever that database had no runs for the use case.
uc_in_legend = set()

for i, db_name in enumerate(db_names):
    df_db = df[df['database'] == db_name]

    # Box over all runs of this database, centered inside the box region of its slot.
    fig.add_trace(go.Box(
        y=df_db['avg_mem_mb'], x0=i + BOX_START_X_REL + (BOX_WIDTH / 2), name=db_name,
        marker_color=db_color_map[db_name], width=BOX_WIDTH, boxpoints=False, showlegend=True,
        legendgroup="1-databases", legendgrouptitle_text="Databases"
    ))

    for j, uc_full_name in enumerate(ordered_uc_full_names):
        df_uc = df_db[df_db['usecase_full_name'] == uc_full_name]
        if df_uc.empty:
            continue

        # Horizontal center of strip j inside this database's slot.
        strip_x_pos = i + STRIPS_START_X_REL + (j * STRIP_SPACING) + (STRIP_SPACING / 2)

        is_first_strip_for_uc = uc_full_name not in uc_in_legend
        uc_in_legend.add(uc_full_name)

        fig.add_trace(go.Scatter(
            x=[strip_x_pos] * len(df_uc), y=df_uc['avg_mem_mb'], mode='markers',
            name=short_name_map[uc_full_name],
            marker_color=uc_color_map[uc_full_name], showlegend=is_first_strip_for_uc,
            legendgroup="2-usecases", legendgrouptitle_text="Usecases"
        ))

# --- Finalize layout with larger fonts ---
fig.update_layout(
    title=dict(
        text="Average Memory Usage: Boxplot and Usecase Strips by Database",
        x=0.45,
        font=dict(size=32)
    ),
    xaxis=dict(
        title=dict(text="Database", font=dict(size=24)),
        # One tick per database slot, labelled with the database name.
        tickmode='array',
        tickvals=list(range(len(db_names))),
        ticktext=db_names,
        tickfont=dict(size=20),
    ),
    yaxis=dict(
        title=dict(text="Average Memory (MB)", font=dict(size=24)),
        tickfont=dict(size=20),
    ),
    legend=dict(
        traceorder="grouped",
        title=dict(
            text="Legend",
            font=dict(size=24)
        ),
        grouptitlefont=dict(
            size=16
        ),
        font=dict(
            size=16
        )
    ),
    boxmode='overlay',
    template='seaborn',
)
fig.show()

fig.write_image(f"{OUTPUT_PATH}Average-Memory-Usage-Per-Usecase.png", width=1350, height=750, scale=2)