In [1]:
from pathlib import Path
import polars as pl

# Pipeline directories, given relative to the notebook's working directory.
FINAL_DIR = Path("data_final")
INTERMEDIATE_DIR = Path("data_intermediate")

# Make sure the output directory exists before any table is written into it.
FINAL_DIR.mkdir(parents=True, exist_ok=True)

# Input table for this notebook.
vm_full_labeled_path = INTERMEDIATE_DIR.joinpath("vm_full_labeled.parquet")
print("vm_full_labeled_path:", vm_full_labeled_path.resolve())


vm_full_labeled_path: /home/laytc/GitHubRepo/Penn/CIS-5200-Final-Project/preprocess/data_intermediate/vm_full_labeled.parquet


In [2]:
print("Loading vm_full_labeled...")
df = pl.read_parquet(vm_full_labeled_path)
print("Rows:", df.height, "Cols:", len(df.columns))

# Columns the history-feature cells of this notebook read; fail fast with a
# clear message if the input table is missing any of them.
needed_cols = [
    "subscription_id",
    "ts_vm_created",
    "lifetime_hours",
    "cpu_mean",
    "p95_max_cpu",
    "cpu_frac_gt_60",
    "day_night_ratio",
    "critical",
]
missing = [c for c in needed_cols if c not in df.columns]
if missing:
    raise ValueError(f"Missing columns in vm_full_labeled: {missing}")

# Sort by tenant and creation time. Many VMs of one subscription share the
# same ts_vm_created, and the default sort makes no promise about the relative
# order of such ties. maintain_order=True keeps tied rows in input-file order,
# so the cumulative "history" features built on this ordering are reproducible
# from run to run.
df = df.sort(["subscription_id", "ts_vm_created"], maintain_order=True)
print("Sorted rows:", df.height)


Loading vm_full_labeled...
Rows: 909281 Cols: 62
Sorted rows: 909281


In [3]:
# Per-row helper columns: a constant 1 (for counting), the label as Int64
# (so it can be summed), and the square of every metric.
squared_helpers = {
    "lifetime_hours": "lifetime_sq",
    "cpu_mean": "cpu_mean_sq",
    "p95_max_cpu": "p95_sq",
    "cpu_frac_gt_60": "frac_gt60_sq",
    "day_night_ratio": "dn_sq",
}
helper_exprs = [
    pl.lit(1).alias("one"),
    pl.col("critical").cast(pl.Int64).alias("critical_i"),
]
helper_exprs += [
    (pl.col(metric) ** 2).alias(squared_name)
    for metric, squared_name in squared_helpers.items()
]
df = df.with_columns(helper_exprs)

# Running totals within each subscription, following the current row order.
# Mapping: source column -> name of its cumulative column.
running_totals = {
    # cumulative counts
    "one": "cum_n_vms",
    "critical_i": "cum_n_critical",
    # cumulative sums and sums of squares
    "lifetime_hours": "cum_sum_lifetime",
    "lifetime_sq": "cum_sum_lifetime_sq",
    "cpu_mean": "cum_sum_cpu_mean",
    "cpu_mean_sq": "cum_sum_cpu_mean_sq",
    "p95_max_cpu": "cum_sum_p95",
    "p95_sq": "cum_sum_p95_sq",
    "cpu_frac_gt_60": "cum_sum_frac_gt60",
    "frac_gt60_sq": "cum_sum_frac_gt60_sq",
    "day_night_ratio": "cum_sum_dn",
    "dn_sq": "cum_sum_dn_sq",
}
df = df.with_columns(
    [
        pl.col(source).cum_sum().over("subscription_id").alias(total_name)
        for source, total_name in running_totals.items()
    ]
)

print("Cumulative columns added.")


Cumulative columns added.


In [4]:
# History excludes the current row: take each running total and subtract the
# row's own contribution from it.
# Mapping: history column -> (running-total column, this row's contribution).
own_contribution = {
    "hist_n_critical": ("cum_n_critical", "critical_i"),
    "hist_sum_lifetime": ("cum_sum_lifetime", "lifetime_hours"),
    "hist_sum_lifetime_sq": ("cum_sum_lifetime_sq", "lifetime_sq"),
    "hist_sum_cpu_mean": ("cum_sum_cpu_mean", "cpu_mean"),
    "hist_sum_cpu_mean_sq": ("cum_sum_cpu_mean_sq", "cpu_mean_sq"),
    "hist_sum_p95": ("cum_sum_p95", "p95_max_cpu"),
    "hist_sum_p95_sq": ("cum_sum_p95_sq", "p95_sq"),
    "hist_sum_frac_gt60": ("cum_sum_frac_gt60", "cpu_frac_gt_60"),
    "hist_sum_frac_gt60_sq": ("cum_sum_frac_gt60_sq", "frac_gt60_sq"),
    "hist_sum_dn": ("cum_sum_dn", "day_night_ratio"),
    "hist_sum_dn_sq": ("cum_sum_dn_sq", "dn_sq"),
}

# The VM count drops exactly one (the current row itself).
prior_exprs = [(pl.col("cum_n_vms") - 1).alias("hist_n_vms")]
prior_exprs.extend(
    (pl.col(total) - pl.col(own)).alias(hist_name)
    for hist_name, (total, own) in own_contribution.items()
)
df = df.with_columns(prior_exprs)

# 1 when the row has at least one earlier row in its subscription, else 0.
has_past_flag = (pl.col("hist_n_vms") > 0).cast(pl.Int8)
df = df.with_columns(has_past_flag.alias("hist_has_past"))


In [5]:
# Every history statistic is only taken from the division when hist_n_vms > 0;
# rows without any history get 0.0 instead, so a divide-by-zero result never
# reaches the output columns.
n_past = pl.col("hist_n_vms")
has_history = n_past > 0

lifetime_mean = pl.col("hist_sum_lifetime") / n_past
# Variance as E[x^2] - E[x]^2; it is clipped at zero before the square root
# (guards against a slightly negative value from floating-point rounding).
lifetime_var = pl.col("hist_sum_lifetime_sq") / n_past - lifetime_mean ** 2

# Output column -> unguarded statistic.
history_stats = {
    "hist_critical_frac": pl.col("hist_n_critical") / n_past,
    "hist_lifetime_mean": lifetime_mean,
    "hist_lifetime_std": lifetime_var.clip(lower_bound=0.0).sqrt(),
    "hist_cpu_mean_mean": pl.col("hist_sum_cpu_mean") / n_past,
    "hist_p95_mean": pl.col("hist_sum_p95") / n_past,
    "hist_frac_gt60_mean": pl.col("hist_sum_frac_gt60") / n_past,
    "hist_day_night_ratio_mean": pl.col("hist_sum_dn") / n_past,
}
df = df.with_columns(
    [
        pl.when(has_history).then(stat).otherwise(0.0).alias(feature_name)
        for feature_name, stat in history_stats.items()
    ]
)

# Quick look at a few of the new features.
preview_cols = [
    "subscription_id",
    "ts_vm_created",
    "critical",
    "hist_has_past",
    "hist_n_vms",
    "hist_critical_frac",
    "hist_lifetime_mean",
    "hist_cpu_mean_mean",
]
print(df.select(preview_cols).head(10))


shape: (10, 8)
┌────────────┬────────────┬──────────┬────────────┬────────────┬───────────┬───────────┬───────────┐
│ subscripti ┆ ts_vm_crea ┆ critical ┆ hist_has_p ┆ hist_n_vms ┆ hist_crit ┆ hist_life ┆ hist_cpu_ │
│ on_id      ┆ ted        ┆ ---      ┆ ast        ┆ ---        ┆ ical_frac ┆ time_mean ┆ mean_mean │
│ ---        ┆ ---        ┆ i8       ┆ ---        ┆ i32        ┆ ---       ┆ ---       ┆ ---       │
│ str        ┆ i64        ┆          ┆ i8         ┆            ┆ f64       ┆ f64       ┆ f64       │
╞════════════╪════════════╪══════════╪════════════╪════════════╪═══════════╪═══════════╪═══════════╡
│ ++OvONy4Fe ┆ 0          ┆ 0        ┆ 0          ┆ 0          ┆ 0.0       ┆ 0.0       ┆ 0.0       │
│ 3c5KwLPfuO ┆            ┆          ┆            ┆            ┆           ┆           ┆           │
│ Z9o0oUwYS9 ┆            ┆          ┆            ┆            ┆           ┆           ┆           │
│ …          ┆            ┆          ┆            ┆            ┆           ┆

In [6]:
# Sanity checks on the history counters.
print("Max hist_n_vms:", df.select(pl.col("hist_n_vms").max()).item())
print(
    "Rows with hist_n_vms >= 1:",
    df.filter(pl.col("hist_n_vms") >= 1).height,
)

# Spot-check a single tenant: use the subscription of the first row.
# The id is read straight from the column; no aggregate over the whole frame
# is needed just to pick one value.
example_sub = df.get_column("subscription_id")[0]

sub_hist_df = (
    df.filter(pl.col("subscription_id") == example_sub)
    .select(
        [
            "subscription_id",
            "ts_vm_created",
            "critical",
            "hist_n_vms",
            "hist_has_past",
        ]
    )
    # hist_n_vms grows by one per row within a subscription, so using it as a
    # tiebreaker shows VMs that share a creation timestamp in the same order
    # in which their history was accumulated.
    .sort(["ts_vm_created", "hist_n_vms"])
)
print(sub_hist_df.head(20))


Max hist_n_vms: 56315
Rows with hist_n_vms >= 1: 903392
shape: (11, 5)
┌─────────────────────────────────┬───────────────┬──────────┬────────────┬───────────────┐
│ subscription_id                 ┆ ts_vm_created ┆ critical ┆ hist_n_vms ┆ hist_has_past │
│ ---                             ┆ ---           ┆ ---      ┆ ---        ┆ ---           │
│ str                             ┆ i64           ┆ i8       ┆ i32        ┆ i8            │
╞═════════════════════════════════╪═══════════════╪══════════╪════════════╪═══════════════╡
│ ++OvONy4Fe3c5KwLPfuOZ9o0oUwYS9… ┆ 0             ┆ 0        ┆ 0          ┆ 0             │
│ ++OvONy4Fe3c5KwLPfuOZ9o0oUwYS9… ┆ 0             ┆ 1        ┆ 1          ┆ 1             │
│ ++OvONy4Fe3c5KwLPfuOZ9o0oUwYS9… ┆ 0             ┆ 1        ┆ 2          ┆ 1             │
│ ++OvONy4Fe3c5KwLPfuOZ9o0oUwYS9… ┆ 0             ┆ 1        ┆ 3          ┆ 1             │
│ ++OvONy4Fe3c5KwLPfuOZ9o0oUwYS9… ┆ 0             ┆ 1        ┆ 4          ┆ 1             │
│ …      

In [7]:
K_MIN_HISTORY = 3  # minimum hist_n_vms a row needs to enter the filtered table

df_all = df
enough_history = pl.col("hist_n_vms") >= K_MIN_HISTORY
df_filtered = df_all.filter(enough_history)

print("Total rows with history features :", df_all.height)
print(f"Rows with hist_n_vms >= {K_MIN_HISTORY}:", df_filtered.height)

# Intermediate columns that only served to derive the hist_* features and are
# removed from both tables before saving, grouped by role.
scratch_cols = [
    "one",
    "critical_i",
    "lifetime_sq",
    "cpu_mean_sq",
    "p95_sq",
    "frac_gt60_sq",
    "dn_sq",
]
running_total_cols = [
    "cum_n_vms",
    "cum_n_critical",
    "cum_sum_lifetime",
    "cum_sum_lifetime_sq",
    "cum_sum_cpu_mean",
    "cum_sum_cpu_mean_sq",
    "cum_sum_p95",
    "cum_sum_p95_sq",
    "cum_sum_frac_gt60",
    "cum_sum_frac_gt60_sq",
    "cum_sum_dn",
    "cum_sum_dn_sq",
]
hist_sum_cols = [
    "hist_sum_lifetime",
    "hist_sum_lifetime_sq",
    "hist_sum_cpu_mean",
    "hist_sum_cpu_mean_sq",
    "hist_sum_p95",
    "hist_sum_p95_sq",
    "hist_sum_frac_gt60",
    "hist_sum_frac_gt60_sq",
    "hist_sum_dn",
    "hist_sum_dn_sq",
]
drop_cols = scratch_cols + running_total_cols + hist_sum_cols

df_all_clean, df_filt_clean = (
    frame.drop(drop_cols) for frame in (df_all, df_filtered)
)

out_all = FINAL_DIR / "vm_request_table_all.parquet"
out_filtered = FINAL_DIR / "vm_request_table.parquet"

# Write the unfiltered table first, then the filtered one.
for frame, target in ((df_all_clean, out_all), (df_filt_clean, out_filtered)):
    frame.write_parquet(target)

print("Saved:")
print("  all rows  ->", out_all.resolve())
print("  filtered  ->", out_filtered.resolve())


Total rows with history features : 909281
Rows with hist_n_vms >= 3: 894280
Saved:
  all rows  -> /home/laytc/GitHubRepo/Penn/CIS-5200-Final-Project/preprocess/data_final/vm_request_table_all.parquet
  filtered  -> /home/laytc/GitHubRepo/Penn/CIS-5200-Final-Project/preprocess/data_final/vm_request_table.parquet
