In [123]:
import re
import matplotlib.pyplot as plt
import numpy as np


def load_dmr_data(filepath):
    """Read the (timestamp, node count) series recorded in a DMR log file.

    Every line that starts with ``DMR INIT`` (after stripping surrounding
    whitespace) contributes one point. Of the lines that start with
    ``DMR FINI`` only the last parsable one is kept, and it is appended after
    all INIT points. Lines lacking a ``Timestamp:`` or a ``Nodes:`` field are
    skipped.

    Args:
        filepath: Path of the log file to read.

    Returns:
        A list of ``(timestamp, nodes)`` tuples (``float``, ``int``): the INIT
        records in file order, followed by the last FINI record if there is
        one. The list is empty when the file holds no usable record.
    """
    # The fractional part is optional so that records carrying a whole-second
    # timestamp are parsed instead of being silently dropped.
    timestamp_re = re.compile(r"Timestamp:\s*(\d+(?:\.\d+)?)")
    nodes_re = re.compile(r"Nodes:\s*(\d+)")

    def parse_record(text):
        """Return ``(timestamp, nodes)`` for one record line, or ``None``."""
        timestamp_match = timestamp_re.search(text)
        nodes_match = nodes_re.search(text)
        if timestamp_match is None or nodes_match is None:
            return None
        return float(timestamp_match.group(1)), int(nodes_match.group(1))

    init_data = []
    last_fini = None

    with open(filepath, "r") as file:
        for line in file:
            stripped = line.strip()
            is_init = stripped.startswith("DMR INIT")
            is_fini = stripped.startswith("DMR FINI")
            if not (is_init or is_fini):
                continue

            record = parse_record(stripped)
            if record is None:
                continue

            if is_init:
                init_data.append(record)
            else:
                # Later FINI lines overwrite earlier ones: only the last counts.
                last_fini = record

    if last_fini is not None:
        init_data.append(last_fini)

    return init_data


# Load datasets
dmr_log1 = "two-dmr-exec-newest/output_668.txt"
dmr_log2 = "two-dmr-exec-newest/output_670.txt"
data1 = load_dmr_data(dmr_log1)
data2 = load_dmr_data(dmr_log2)

# Each series is unpacked and indexed individually further down, so both must
# hold at least one record. Checking only their concatenation would let a
# single empty file through and fail later with an unrelated-looking error.
if not data1 and not data2:
    raise ValueError("No data found in either file.")
if not data1:
    raise ValueError(f"No DMR INIT/FINI records found in {dmr_log1}.")
if not data2:
    raise ValueError(f"No DMR INIT/FINI records found in {dmr_log2}.")

# Flatten to find the earliest start time
all_timestamps = [t for t, _ in data1 + data2]
global_start_time = min(all_timestamps)


# Normalize to global start time
def normalize_by_global_start(data, global_start):
    """Split a series into times relative to ``global_start`` and node counts.

    Args:
        data: Iterable of ``(timestamp, nodes)`` pairs.
        global_start: Value subtracted from every timestamp.

    Returns:
        A ``(times, nodes)`` pair of tuples of equal length. Both are empty
        when ``data`` is empty, so the result can always be unpacked into two
        names.
    """
    shifted = [(t - global_start, n) for t, n in data]
    if not shifted:
        # zip(*[]) would yield nothing, which callers cannot unpack in two.
        return (), ()
    times, nodes = zip(*shifted)
    return times, nodes


# Express both malleability series relative to the shared time origin.
times1, nodes1 = normalize_by_global_start(data1, global_start_time)
times2, nodes2 = normalize_by_global_start(data2, global_start_time)

# Bracket each series with a zero-node point at its first and at its last
# timestamp, so the node count starts from and returns to zero.
mall_times1 = [times1[0]] + list(times1) + [times1[-1]]
mall_nodes1 = [0] + list(nodes1) + [0]
mall_times2 = [times2[0]] + list(times2) + [times2[-1]]
mall_nodes2 = [0] + list(nodes2) + [0]

In [124]:
from datetime import datetime
import polars as pl
from pathlib import Path
import re


def get_slurm_df(file: Path) -> pl.DataFrame:
    """Load the completed jobs from a ``key=value`` formatted job log.

    Every line of ``file`` becomes one record built from its
    whitespace-delimited ``key=value`` tokens. The resulting frame is reduced
    to the ``JobId``, ``JobState``, ``StartTime`` and ``EndTime`` columns and
    to the rows whose ``JobState`` equals ``"COMPLETED"``.

    Args:
        file: Path of the log file to parse.

    Returns:
        A polars DataFrame with the four columns listed above, one row per
        completed job.
    """
    key_value_re = re.compile(r"(\S+?)=(\S+)")

    with open(file, "r") as log:
        records = [dict(key_value_re.findall(entry)) for entry in log]

    wanted_columns = [
        pl.col(name) for name in ("JobId", "JobState", "StartTime", "EndTime")
    ]
    is_completed = pl.col("JobState") == "COMPLETED"

    return pl.DataFrame(records).select(wanted_columns).filter(is_completed)


def find_jobcomp(jobid: str, jc_df: pl.DataFrame) -> tuple[str, str, str, str]:
    """Return the row of ``jc_df`` whose ``JobId`` column equals ``jobid``.

    Args:
        jobid: Job identifier, compared against the ``JobId`` column as a string.
        jc_df: Frame to search; as produced by ``get_slurm_df`` it holds the
            columns ``JobId``, ``JobState``, ``StartTime`` and ``EndTime``.

    Returns:
        The matching row as a tuple of column values.

    Note:
        The lookup is delegated to ``DataFrame.row(by_predicate=...)``, which
        is expected to raise unless exactly one row matches.
    """
    matches_job = pl.col("JobId") == jobid
    return jc_df.row(by_predicate=matches_job)


def get_timestamps(row: tuple[str, str, str, str]) -> tuple[float, float]:
    """Convert the start and end fields of a job row to POSIX timestamps.

    Args:
        row: Four-item row whose third and fourth items are ISO-8601 strings
            holding the start and end time; the first two items are ignored.

    Returns:
        ``(start, end)`` as floating-point seconds since the epoch. Strings
        without a UTC offset are interpreted in the local timezone by
        ``datetime.timestamp()``.
    """
    _job_id, _job_state, start_iso, end_iso = row
    start_ts, end_ts = (
        datetime.fromisoformat(moment).timestamp() for moment in (start_iso, end_iso)
    )
    return start_ts, end_ts


# Job log looked up in the current working directory.
slurm_jobcomp_path = Path().resolve() / "slurm_jobcomp.log"

# Identifiers of the two baseline jobs to compare.
jobid1, jobid2 = "976", "977"

jc_df = get_slurm_df(file=slurm_jobcomp_path)

workload1 = find_jobcomp(jobid=jobid1, jc_df=jc_df)
workload2 = find_jobcomp(jobid=jobid2, jc_df=jc_df)

# Absolute (start, end) timestamps of each job.
real_ts1 = get_timestamps(workload1)
real_ts2 = get_timestamps(workload2)

# Shift both jobs so that the earlier start lands on t = 0.
global_start_time = min(real_ts1[0], real_ts2[0])
normalized_ts1 = [ts - global_start_time for ts in real_ts1]
normalized_ts2 = [ts - global_start_time for ts in real_ts2]

# Repeat the start time so the three x values line up with `baseline_nodes`.
baseline_ts1 = [normalized_ts1[0], *normalized_ts1]
baseline_ts2 = [normalized_ts2[0], *normalized_ts2]

# Node count at each of those three points: zero, three while running, zero.
baseline_nodes = [0, 3, 0]

In [125]:
def walk_workflow_log(file: Path) -> list[tuple[str, str]]:
    """Collect the loop steps scheduled according to a workflow log.

    The whole file is scanned for messages of the form
    ``Scheduled job /loop/<step>/<n>.<m> with job id <id>``.

    Args:
        file: Path of the workflow log to read.

    Returns:
        One ``(step name, job id)`` pair of strings per matching message, in
        the order the messages appear in the file.
    """
    scheduled_re = re.compile(
        r"Scheduled job /loop/([^/]+)/\d+\.\d+ with job id (\d+)"
    )

    with open(file, "r") as log:
        contents = log.read()

    return [match.groups() for match in scheduled_re.finditer(contents)]


def get_timed_node_usage_from_jobcomp(
    file: Path, jc_df: pl.DataFrame
) -> tuple[list[float], list[int]]:
    """Build the node-usage series of one workflow run.

    The steps listed in the workflow log ``file`` are looked up in ``jc_df``
    to obtain their start and end times. Each step adds two points to the
    series: its per-step value at the start time and ``0`` at the end time.

    Args:
        file: Path of the workflow log, parsed with ``walk_workflow_log``.
        jc_df: Job frame as returned by ``get_slurm_df``.

    Returns:
        ``(times, nodes)``: absolute POSIX timestamps and the value recorded
        at each of them, in the order the steps appear in the log.

    Raises:
        KeyError: If the log names a step missing from the per-step table.
    """
    # Value recorded while a step runs — presumably the number of compute
    # nodes that step occupies; confirm against the workflow definition.
    exe_to_coeff = {"clustering": 3, "silhouette": 1, "annealing": 0}

    times = []
    nodes = []

    for name, jobid in walk_workflow_log(file):
        try:
            coeff = exe_to_coeff[name]
        except KeyError:
            raise KeyError(
                f"Unknown workflow step {name!r} (job id {jobid}); "
                f"expected one of {sorted(exe_to_coeff)}"
            ) from None

        # Reuse the shared helpers rather than repeating the row lookup and
        # ISO-timestamp conversion here.
        start_ts, end_ts = get_timestamps(find_jobcomp(jobid=jobid, jc_df=jc_df))

        times.extend([start_ts, end_ts])
        nodes.extend([coeff, 0])

    return times, nodes


# Folder next to the working directory's parent holding one sub-folder per run.
perf_folder = Path().resolve().parent / "perf"

# Node-usage series of both workflow runs, still in absolute timestamps.
wf_times1_raw, wf_nodes1 = get_timed_node_usage_from_jobcomp(
    file=perf_folder / "workflow_sleep_duo1_1" / "log.txt",
    jc_df=jc_df,
)
wf_times2_raw, wf_nodes2 = get_timed_node_usage_from_jobcomp(
    file=perf_folder / "workflow_sleep_duo1_2" / "log.txt",
    jc_df=jc_df,
)

# Shift both runs so that the earliest recorded event lands on t = 0.
wf_global_start = min([*wf_times1_raw, *wf_times2_raw])
wf_times1 = [ts - wf_global_start for ts in wf_times1_raw]
wf_times2 = [ts - wf_global_start for ts in wf_times2_raw]

In [None]:
def step_and_area(ax, x, y, label: str):
    """Draw a step curve with a translucent filled area on ``ax``.

    The series is bracketed with a zero at its first and at its last x value
    before drawing, and both the fill and the line use ``"post"`` stepping.

    Args:
        ax: Object providing ``fill_between`` and ``step`` (a matplotlib Axes).
        x: Non-empty sequence of x values.
        y: Sequence of y values, one per x value.
        label: Legend label given to the step line.
    """
    padded_x = [x[0], *x, x[-1]]
    padded_y = [0, *y, 0]

    ax.fill_between(padded_x, padded_y, step="post", alpha=0.4)
    ax.step(
        padded_x,
        padded_y,
        label=label,
        linewidth=2,
        linestyle="solid",
        where="post",
    )

# Plot
scaling = 0.85
fig, axs = plt.subplots(3, sharex=True, figsize=(6.4 * scaling, 4.8 * scaling))

linestyle = "solid"

# One entry per subplot, top to bottom: title, then the (x, y) series of
# job 1 and of job 2.
panels = (
    (
        "Baseline compute nodes usage",
        (baseline_ts1, baseline_nodes),
        (baseline_ts2, baseline_nodes),
    ),
    (
        "Workflow compute nodes usage",
        (wf_times1, wf_nodes1),
        (wf_times2, wf_nodes2),
    ),
    (
        "Malleability compute nodes usage",
        (mall_times1, mall_nodes1),
        (mall_times2, mall_nodes2),
    ),
)

for ax, (title, job1_series, job2_series) in zip(axs, panels):
    ax.title.set_text(title)
    step_and_area(ax, *job1_series, label="Job 1")
    step_and_area(ax, *job2_series, label="Job 2")
    # Same integer ticks (0 to 3) on every panel.
    ax.set_yticks(np.arange(0, 3 + 1, 1))
    # Optional: call ax.grid() here to overlay a grid.

fig.supxlabel("Time since earliest job start (s)")
fig.supylabel("Number of allocated compute nodes")

fig.tight_layout()
fig.savefig("comparison_plot.pdf", format="pdf")