In [2]:
# 01_build_vm_static.ipynb

from pathlib import Path

import polars as pl
from tqdm.auto import tqdm

# ---- copy from 00 (keep in sync) ----
RAW_DATA_DIR = Path("data_raw")  # TODO: set same as 00
INTERMEDIATE_DIR = Path("data_intermediate")
INTERMEDIATE_DIR.mkdir(parents=True, exist_ok=True)

SECONDS_PER_HOUR = 60 * 60
SECONDS_PER_DAY = 24 * SECONDS_PER_HOUR
WINDOW_END_SECONDS = 30 * SECONDS_PER_DAY

print("RAW_DATA_DIR     =", RAW_DATA_DIR.resolve())
print("INTERMEDIATE_DIR =", INTERMEDIATE_DIR.resolve())


RAW_DATA_DIR     = C:\Users\Layton\Desktop\cis-520\preprocess\data_raw
INTERMEDIATE_DIR = C:\Users\Layton\Desktop\cis-520\preprocess\data_intermediate


  from .autonotebook import tqdm as notebook_tqdm


In [7]:
def sec_to_day_expr(col: str | pl.Expr) -> pl.Expr:
    col_expr = pl.col(col) if isinstance(col, str) else col
    return (col_expr // SECONDS_PER_DAY).cast(pl.Int32)


def sec_to_hour_expr(col: str | pl.Expr) -> pl.Expr:
    col_expr = pl.col(col) if isinstance(col, str) else col
    return ((col_expr % SECONDS_PER_DAY) // SECONDS_PER_HOUR).cast(pl.Int32)


def add_lifetime(
    df: pl.DataFrame,
    created_col: str,
    deleted_col: str,
    window_end: int = WINDOW_END_SECONDS,
) -> pl.DataFrame:
    """
    Add lifetime_sec and lifetime_hours.

    If deleted is NULL, treat as active until window_end.
    """
    # If you know there are no NULLs, this when/otherwise is harmless.
    df = df.with_columns(
        pl.when(pl.col(deleted_col).is_null())
        .then(window_end)
        .otherwise(pl.col(deleted_col))
        .alias(deleted_col)
    )

    df = df.with_columns(
        (
            (pl.col(deleted_col) - pl.col(created_col))
            .clip(lower_bound=0)          # <-- instead of .clip_min(0)
        ).alias("lifetime_sec")
    )

    df = df.with_columns(
        (pl.col("lifetime_sec") / float(SECONDS_PER_HOUR)).alias("lifetime_hours")
    )

    return df

In [4]:
# File paths (under RAW_DATA_DIR)
SUBS_PATH = RAW_DATA_DIR / "subscriptions.csv"
DEP_PATH  = RAW_DATA_DIR / "deployment.csv"
VM_PATH   = RAW_DATA_DIR / "vmtable.csv"

print("Subscriptions file:", SUBS_PATH)
print("Deployment file   :", DEP_PATH)
print("VM table file     :", VM_PATH)

# Explicit schemas (tweak if your columns are named differently)
subs_schema = {
    "subscription_id": pl.Utf8,
    "ts_first_vm_created": pl.Int64,
    "count_vms_created": pl.Int64,
}

dep_schema = {
    "deployment_id": pl.Utf8,
    "deployment_size": pl.Int64,
}

vm_schema = {
    "vm_id": pl.Utf8,
    "subscription_id": pl.Utf8,
    "deployment_id": pl.Utf8,
    "ts_vm_created": pl.Int64,
    "ts_vm_deleted": pl.Int64,
    "max_cpu": pl.Float64,
    "avg_cpu": pl.Float64,
    "p95_max_cpu": pl.Float64,
    "vm_category": pl.Utf8,
    "vm_virtual_core_count": pl.Int64,
    "vm_memory_gb": pl.Float64,
}


Subscriptions file: data_raw\subscriptions.csv
Deployment file   : data_raw\deployment.csv
VM table file     : data_raw\vmtable.csv


In [8]:
steps = [
    "Read subscriptions",
    "Read deployments",
    "Read vmtable",
    "Join & feature engineering",
    "Save vm_static.parquet",
]

pbar = tqdm(total=len(steps), desc="vm_static pipeline")

# 1) Read subscriptions
print("\n[1] Reading subscriptions...")
subs_df = pl.read_csv(
    SUBS_PATH,
    has_header=True,
    schema=subs_schema,
)
subs_df = subs_df.with_columns(
    [
        (pl.col("ts_first_vm_created") // SECONDS_PER_DAY)
        .cast(pl.Int32)
        .alias("sub_first_day"),
        ((pl.col("ts_first_vm_created") % SECONDS_PER_DAY) // SECONDS_PER_HOUR)
        .cast(pl.Int32)
        .alias("sub_first_hour"),
    ]
)
print("subscriptions rows:", subs_df.height)
pbar.update(1)

# 2) Read deployments
print("\n[2] Reading deployments...")
dep_df = pl.read_csv(
    DEP_PATH,
    has_header=True,
    schema=dep_schema,
)
dep_df = dep_df.with_columns(
    (pl.col("deployment_size") + 1)
    .cast(pl.Float64)
    .log()
    .alias("log_deployment_size")
)
print("deployments rows:", dep_df.height)
pbar.update(1)

# 3) Read vmtable
print("\n[3] Reading vmtable (this is the ~500MB one)...")
vm_df = pl.read_csv(
    VM_PATH,
    has_header=True,
    schema=vm_schema,
)
print("vmtable rows:", vm_df.height)

# Add time features & lifetime
vm_df = vm_df.with_columns(
    [
        sec_to_day_expr("ts_vm_created").alias("day_idx"),
        sec_to_hour_expr("ts_vm_created").alias("hour_of_day"),
    ]
)
vm_df = add_lifetime(
    vm_df,
    created_col="ts_vm_created",
    deleted_col="ts_vm_deleted",
    window_end=WINDOW_END_SECONDS,
)

# Memory / core feature
vm_df = vm_df.with_columns(
    (
        pl.col("vm_memory_gb")
        / pl.col("vm_virtual_core_count")
        .cast(pl.Float64)
        .clip(lower_bound=1.0)   # avoid divide by zero
    ).alias("vm_mem_per_core")
)

print("vmtable with derived cols:")
print(vm_df.head())
pbar.update(1)

# 4) Join everything
print("\n[4] Joining vmtable + deployments + subscriptions...")

vm_static = (
    vm_df.join(dep_df, on="deployment_id", how="left")
    .join(subs_df, on="subscription_id", how="left")
)

print("vm_static rows:", vm_static.height)
print("vm_static columns:", vm_static.columns[:20], "...")
print(vm_static.head())
pbar.update(1)

# 5) Save
out_path = INTERMEDIATE_DIR / "vm_static.parquet"
print(f"\n[5] Writing {out_path} ...")
vm_static.write_parquet(out_path)
pbar.update(1)
pbar.close()

print("\nAll done. vm_static saved.")


vm_static pipeline:  40%|████      | 2/5 [01:43<02:35, 51.95s/it]



[1] Reading subscriptions...
subscriptions rows: 5957

[2] Reading deployments...
deployments rows: 35940

[3] Reading vmtable (this is the ~500MB one)...




vmtable rows: 2013766
vmtable with derived cols:
shape: (5, 16)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ vm_id     ┆ subscript ┆ deploymen ┆ ts_vm_cre ┆ … ┆ hour_of_d ┆ lifetime_ ┆ lifetime_ ┆ vm_mem_p │
│ ---       ┆ ion_id    ┆ t_id      ┆ ated      ┆   ┆ ay        ┆ sec       ┆ hours     ┆ er_core  │
│ str       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---      │
│           ┆ str       ┆ str       ┆ i64       ┆   ┆ i32       ┆ i64       ┆ f64       ┆ f64      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ H5CxmMoVc ┆ BSXOcywx8 ┆ 3J17LcV4g ┆ 0         ┆ … ┆ 0         ┆ 1539300   ┆ 427.58333 ┆ 0.75     │
│ ZSpjgGboh ┆ pUU0DueDo ┆ XjFat62qh ┆           ┆   ┆           ┆           ┆ 3         ┆          │
│ nVA3R+7uC ┆ 6UMol1YzR ┆ VFRfoiWAr ┆           ┆   ┆           ┆           ┆           ┆          │
│ Te/…      ┆ 6tn…      ┆ H

vm_static pipeline: 100%|██████████| 5/5 [00:01<00:00,  3.18it/s]


All done. vm_static saved.



