In [None]:
import polars as pl
import altair as alt
import duckdb 

def parse_duration(col: str) -> pl.Expr:
    """
    Build an expression turning 'XXd XXh XXmin XXs' strings into a Polars Duration.

    Args:
        col: Name of the string column that holds the textual durations.

    Returns:
        Polars expression evaluating to a Duration column assembled from the
        day, hour, minute and second components captured by the pattern.
    """
    units = ["days", "hours", "minutes", "seconds"]

    # One capture group per unit, in the same order as `units`; the resulting
    # struct fields are renamed so they can be looked up by unit name below.
    parts = (
        pl.col(col)
        .str.extract_groups(r"(\d+)d\s*(\d+)h\s*(\d+)min\s*(\d+)s")
        .struct.rename_fields(units)
    )

    # The captured groups are strings; each one is cast (non-strictly) to
    # Int64 and passed to pl.duration under the keyword of the same name.
    components = {
        unit: parts.struct.field(unit).cast(pl.Int64, strict=False)
        for unit in units
    }
    return pl.duration(**components)

In [5]:
# Load the job export and replace its textual timestamp and duration columns
# with proper Polars datetime / duration columns of the same name.
pika_df = pl.read_csv("data/jobs.csv").with_columns(
    *[
        pl.col(name).str.to_datetime(format="%d/%m/%Y %H:%M:%S")
        for name in ("Start", "End")
    ],
    *[
        parse_duration(name).alias(name)
        for name in ("Walltime", "Pending", "Duration")
    ],
)

# Display the jobs ordered by their pending time.
pika_df.sort("Pending")

Job ID,Project,Start,End,State,#Nodes,#Cores,#GPUs,Exclusive,Walltime,Pending,Duration,Core Hours,Used Walltime,Partition
i64,str,datetime[μs],datetime[μs],str,i64,i64,i64,i64,duration[μs],duration[μs],duration[μs],f64,f64,str
17341221,"""p_lv_internet""",2025-06-02 08:30:07,2025-06-02 09:00:16,"""timeout""",3,6,0,0,30m,0µs,30m 9s,3.02,100.5,"""barnard"""
17336715,"""p_lv_internet""",2025-05-30 22:20:13,2025-05-30 22:50:41,"""timeout""",3,6,0,0,30m,0µs,30m 28s,3.05,101.56,"""barnard"""
17324136,"""p_lv_internet""",2025-05-29 13:10:15,2025-05-29 13:40:37,"""timeout""",3,6,0,0,30m,0µs,30m 22s,3.04,101.22,"""barnard"""
17323529,"""p_lv_internet""",2025-05-28 19:15:47,2025-05-28 19:35:04,"""cancelled""",3,6,0,0,30m,0µs,19m 17s,1.93,64.28,"""barnard"""
17299108,"""p_lv_internet""",2025-05-26 08:55:07,2025-05-26 09:20:06,"""cancelled""",3,6,0,0,30m,0µs,24m 59s,2.5,83.28,"""barnard"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
17157301,"""p_lv_internet""",2025-05-19 16:20:00,2025-05-19 16:50:08,"""timeout""",3,6,0,0,30m,14m 53s,30m 8s,3.01,100.44,"""barnard"""
17181314,"""p_lv_internet""",2025-05-20 11:33:33,2025-05-20 11:56:49,"""cancelled""",3,6,0,0,30m,18m 17s,23m 16s,2.33,77.56,"""barnard"""
17100910,"""p_lv_internet""",2025-05-15 16:43:08,2025-05-15 17:13:13,"""timeout""",3,6,0,0,30m,23m 5s,30m 5s,3.01,100.28,"""barnard"""
17252348,"""p_lv_internet""",2025-05-21 17:19:46,2025-05-21 17:50:01,"""timeout""",3,6,0,0,30m,34m 33s,30m 15s,3.02,100.83,"""barnard"""


In [None]:
# Total pending time per start day.
# NOTE: despite the variable name, the "Pending" column ends up in hours:
# the per-day sum of seconds is divided by 3600 in the last step.
pending_seconds_per_day = (
    pika_df
    .select(
        pl.col("Start").dt.truncate("1d"),
        pl.col("Pending").dt.total_seconds(),
    )
    .group_by("Start")
    .agg(pl.col("Pending").sum())
    .with_columns(pl.col("Pending") / 3600)
)

# Bar chart: one bar per day, bar height = summed pending time in hours.
(
    alt.Chart(pending_seconds_per_day, title="total pending time per day", width=600)
    .mark_bar()
    .encode(
        x="Start",
        y=alt.Y("Pending", title="pending time in h"),
    )
)

In [66]:
# Per (start day, job state): summed core hours and number of jobs.
core_hours_per_day = (
    pika_df
    .select(
        pl.col("Start").dt.truncate("1d"),
        pl.col("Core Hours"),
        pl.col("State"),
    )
    .group_by(pl.col("Start"), pl.col("State"))
    .agg(
        pl.col("Core Hours").sum(),
        pl.len().alias("Job Count"),
    )
)

# Line chart of the running total of core hours. The per-state rows are first
# collapsed to one row per day and sorted by day, because the cumulative sum
# depends on row order.
core_hours_per_day_acc = (
    alt.Chart(
        core_hours_per_day
        .group_by("Start")
        .agg(pl.col("Core Hours").sum())
        .sort("Start")
        .select(
            pl.col("Start"),
            pl.col("Core Hours").cum_sum().alias("Core Hours Accumulated"),
        ),
        width=600,
    )
    .mark_line()
    .encode(
        x="Start",
        y=alt.Y("Core Hours Accumulated", title="total used CPU time in h"),
    )
)

# Shared base for the two bar charts below: one bar per day, coloured by job state.
core_hours_base_chart = (
    alt.Chart(core_hours_per_day, width=600)
    .mark_bar()
    .encode(
        x="Start",
        color=alt.Color("State", title="Job State"),
    )
)

# Core hours consumed per day.
core_hours_per_day_chart = core_hours_base_chart.encode(
    y=alt.Y("Core Hours", title="Used CPU time in h"),
).properties(title="Used CPU Resources in h")

# Number of jobs per day.
jobs_per_day_chars = core_hours_base_chart.encode(
    y="Job Count",
).properties(title="Number of Scheduled Jobs per day")

# Top: daily bars layered with the accumulated line, each on its own y scale.
# Bottom: job counts per day, stacked vertically below the layered chart.
(
    (core_hours_per_day_chart + core_hours_per_day_acc).resolve_scale(y="independent")
    & jobs_per_day_chars
)
