In [1]:
from pathlib import Path

import polars as pl
import numpy as np


# --- paths (same as before) ---
# Directory holding intermediate artifacts; created if it does not exist yet.
INTERMEDIATE_DIR = Path("data_intermediate")
INTERMEDIATE_DIR.mkdir(parents=True, exist_ok=True)

# Parquet files located inside the intermediate directory.
vm_static_path = INTERMEDIATE_DIR.joinpath("vm_static.parquet")
vm_usage_path = INTERMEDIATE_DIR.joinpath("vm_usage_agg.parquet")

# Echo the absolute locations so the run log shows exactly which files are used.
for path_label, parquet_path in (
    ("vm_static_path:", vm_static_path),
    ("vm_usage_path :", vm_usage_path),
):
    print(path_label, parquet_path.resolve())


vm_static_path: /home/laytc/GitHubRepo/Penn/CIS-5200-Final-Project/preprocess/data_intermediate/vm_static.parquet
vm_usage_path : /home/laytc/GitHubRepo/Penn/CIS-5200-Final-Project/preprocess/data_intermediate/vm_usage_agg.parquet


In [2]:
# Time & label thresholds (keep consistent with 00)

SECONDS_PER_HOUR = 3600  # 60 minutes * 60 seconds

# --- "critical" label components ---
LONG_LIVED_HOURS = 18.0    # lifetime (hours) at or above which a VM counts as long-lived
THRESH_FRAC_GT_60 = 0.25   # fraction of samples
THRESH_P95 = 0.70          # p95 of CPU (0–1), we'll divide percent by 100

# --- legacy diurnal thresholds (kept for the strong_diurnal column) ---
THRESH_DAY_NIGHT_RATIO = 1.00
THRESH_DAY_MEAN = 0.30     # mean day CPU (0–1), we'll divide percent by 100

# --- periodicity thresholds (using hourly means) ---
PERIODICITY_MIN_ENERGY = 0.10   # minimum fraction of energy at 24h frequency
PERIODICITY_RATIO_8_12 = 1.20   # 24h energy must beat 8h/12h by this ratio

# --- minimum readings per VM to keep (can tweak) ---
MIN_READINGS = 12

# Echo the settings that drive filtering and labeling so they appear in the run log.
for banner, setting in (
    ("LONG_LIVED_HOURS       =", LONG_LIVED_HOURS),
    ("MIN_READINGS           =", MIN_READINGS),
    ("PERIODICITY_MIN_ENERGY =", PERIODICITY_MIN_ENERGY),
    ("PERIODICITY_RATIO_8_12 =", PERIODICITY_RATIO_8_12),
):
    print(banner, setting)


LONG_LIVED_HOURS       = 18.0
MIN_READINGS           = 12
PERIODICITY_MIN_ENERGY = 0.1
PERIODICITY_RATIO_8_12 = 1.2


In [3]:
# Read the static per-VM table and report its dimensions.
print("Loading vm_static...")
vm_static = pl.read_parquet(vm_static_path)
print("vm_static rows:", vm_static.shape[0])
print("vm_static cols:", vm_static.width)

# Read the aggregated usage table, report its dimensions and preview a few rows.
print("\nLoading vm_usage_agg...")
vm_usage = pl.read_parquet(vm_usage_path)
print("vm_usage rows:", vm_usage.shape[0])
print("vm_usage cols:", vm_usage.width)
print("vm_usage sample:")
print(vm_usage.head(5))


Loading vm_static...
vm_static rows: 2013766
vm_static cols: 22

Loading vm_usage_agg...
vm_usage rows: 2013767
vm_usage cols: 90
vm_usage sample:
shape: (5, 90)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ vm_id     ┆ n_reading ┆ sum_avg   ┆ sum_avg_s ┆ … ┆ cpu_hour_ ┆ cpu_hour_ ┆ cpu_hour_ ┆ cpu_hour │
│ ---       ┆ s         ┆ ---       ┆ q         ┆   ┆ 20_mean   ┆ 21_mean   ┆ 22_mean   ┆ _23_mean │
│ str       ┆ ---       ┆ f64       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---      │
│           ┆ u32       ┆           ┆ f64       ┆   ┆ f64       ┆ f64       ┆ f64       ┆ f64      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ iDRcL2N6S ┆ 1         ┆ 0.154114  ┆ 0.023751  ┆ … ┆ 0.0       ┆ 0.0       ┆ 0.0       ┆ 0.154114 │
│ V3Hte50J0 ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆          │
│ wA9PsBXw2 ┆           ┆     

In [4]:
# Columns we definitely want from usage: the key, the summary statistics ...
keep_usage_cols = [
    "vm_id",
    "n_readings",
    "max_cpu",
    "cpu_mean",
    "cpu_std",
    "cpu_frac_gt_60",
    "cpu_frac_gt_80",
    "day_cpu_mean",
    "night_cpu_mean",
    "day_night_ratio",
]
# ... plus every per-hour mean column, in the order the usage table stores them.
keep_usage_cols.extend(
    name
    for name in vm_usage.columns
    if name.startswith("cpu_hour_") and name.endswith("_mean")
)

vm_usage_small = vm_usage.select(keep_usage_cols)
print("Reduced usage cols:", vm_usage_small.columns)


Reduced usage cols: ['vm_id', 'n_readings', 'max_cpu', 'cpu_mean', 'cpu_std', 'cpu_frac_gt_60', 'cpu_frac_gt_80', 'day_cpu_mean', 'night_cpu_mean', 'day_night_ratio', 'cpu_hour_0_mean', 'cpu_hour_1_mean', 'cpu_hour_2_mean', 'cpu_hour_3_mean', 'cpu_hour_4_mean', 'cpu_hour_5_mean', 'cpu_hour_6_mean', 'cpu_hour_7_mean', 'cpu_hour_8_mean', 'cpu_hour_9_mean', 'cpu_hour_10_mean', 'cpu_hour_11_mean', 'cpu_hour_12_mean', 'cpu_hour_13_mean', 'cpu_hour_14_mean', 'cpu_hour_15_mean', 'cpu_hour_16_mean', 'cpu_hour_17_mean', 'cpu_hour_18_mean', 'cpu_hour_19_mean', 'cpu_hour_20_mean', 'cpu_hour_21_mean', 'cpu_hour_22_mean', 'cpu_hour_23_mean']


In [5]:
print("Joining vm_static and vm_usage_small on vm_id (inner join)...")

# Inner join: only vm_id values present in both tables survive.
vm_full = vm_static.join(vm_usage_small, on="vm_id", how="inner")

# Report the joined shape and preview the first rows.
print("vm_full rows:", vm_full.shape[0])
print("vm_full cols:", vm_full.width)
print(vm_full.head(5))


Joining vm_static and vm_usage_small on vm_id (inner join)...
vm_full rows: 2013766
vm_full cols: 55
shape: (5, 55)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ vm_id     ┆ subscript ┆ deploymen ┆ ts_vm_cre ┆ … ┆ cpu_hour_ ┆ cpu_hour_ ┆ cpu_hour_ ┆ cpu_hour │
│ ---       ┆ ion_id    ┆ t_id      ┆ ated      ┆   ┆ 20_mean   ┆ 21_mean   ┆ 22_mean   ┆ _23_mean │
│ str       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---      │
│           ┆ str       ┆ str       ┆ i64       ┆   ┆ f64       ┆ f64       ┆ f64       ┆ f64      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ iDRcL2N6S ┆ 96NakTym2 ┆ JZGr1w0Kg ┆ 947700    ┆ … ┆ 0.0       ┆ 0.0       ┆ 0.0       ┆ 0.154114 │
│ V3Hte50J0 ┆ g1dhZhYQ0 ┆ Quj2iDQY8 ┆           ┆   ┆           ┆           ┆           ┆          │
│ wA9PsBXw2 ┆ XbTw0xcYv ┆ bCorCp+P2 ┆           ┆   ┆           ┆           

In [6]:
# Drop VMs with fewer than MIN_READINGS usage samples and report the row counts.
before = vm_full.shape[0]
vm_full = vm_full.filter(pl.col("n_readings").ge(MIN_READINGS))
after = vm_full.shape[0]

print(f"Filtered by n_readings >= {MIN_READINGS}: {before} → {after}")


Filtered by n_readings >= 12: 2013766 → 909281


In [7]:
# Periodicity from hourly means (24h vs 12h/8h)
# The FFT applied to these columns reads bin 1 as the 24h cycle, which only
# holds when the vector has exactly 24 entries in chronological order
# (hour 0 ... hour 23). Build the names explicitly instead of relying on the
# incidental column order of the parquet, and fail loudly if any are missing:
# periodicity_tuple returns all zeros for a vector whose length is not 24, so
# a schema drift would otherwise silently mark every VM as non-periodic.
hour_cols = [f"cpu_hour_{hour}_mean" for hour in range(24)]
missing_hour_cols = [name for name in hour_cols if name not in vm_full.columns]
if missing_hour_cols:
    raise ValueError(f"vm_full is missing hourly mean columns: {missing_hour_cols}")
print("Computing periodicity features from", len(hour_cols), "hourly mean columns")

def periodicity_tuple(hourly_values, min_energy=None, ratio_threshold=None):
    """Score how strongly a 24-value hourly profile follows a 24h cycle.

    The profile is mean-centred and its power spectrum is computed with a
    real FFT. For 24 samples, bin 1 is one cycle per 24 samples (24h), bin 2
    is two cycles (12h) and bin 3 is three cycles (8h).

    Args:
        hourly_values: sequence of 24 numeric values, expected in hour order
            (0..23). NaN entries are replaced by 0 before the transform.
        min_energy: minimum share of spectral energy that bin 1 must hold.
            When None, the notebook-level PERIODICITY_MIN_ENERGY is used.
        ratio_threshold: minimum ratio of bin-1 energy to the larger of the
            bin-2 / bin-3 energies. When None, the notebook-level
            PERIODICITY_RATIO_8_12 is used.

    Returns:
        A list of three floats:
        [energy_frac_24, ratio_24_vs_8_12, periodic_enough], where
        energy_frac_24 is bin-1 energy divided by the sum over all rfft bins
        and periodic_enough is 1.0 when both thresholds are met, else 0.0.
        Returns [0.0, 0.0, 0.0] when the input does not have exactly 24
        entries or the centred profile has zero spectral energy.
    """
    # Resolve thresholds at call time so the one-argument form keeps reading
    # the current notebook constants, exactly as before.
    if min_energy is None:
        min_energy = PERIODICITY_MIN_ENERGY
    if ratio_threshold is None:
        ratio_threshold = PERIODICITY_RATIO_8_12

    arr = np.array(hourly_values, dtype=float)
    if arr.size != 24:
        return [0.0, 0.0, 0.0]
    arr = np.nan_to_num(arr, nan=0.0)
    # Remove the mean so the DC bin does not dominate the energy total.
    arr = arr - arr.mean()
    spec = np.abs(np.fft.rfft(arr)) ** 2
    total = float(spec.sum())
    if total <= 0:
        # Flat profile: there is no periodic component to measure.
        return [0.0, 0.0, 0.0]

    # rfft of 24 samples always yields 13 bins, so bins 1-3 are present.
    f1 = float(spec[1])  # 24h
    f2 = float(spec[2])  # 12h
    f3 = float(spec[3])  # 8h
    energy_frac_24 = f1 / total
    # The 1e-6 floor keeps the ratio finite when both harmonics are absent.
    ratio_24_vs_8_12 = f1 / max(f2, f3, 1e-6)
    periodic_enough = float(
        (energy_frac_24 >= min_energy)
        and (ratio_24_vs_8_12 >= ratio_threshold)
    )
    return [energy_frac_24, ratio_24_vs_8_12, periodic_enough]

# Pack the hourly columns into one list column, run the per-row spectral
# helper on it, unpack the three results into named columns, and finally
# discard the two temporary list columns.
vm_full = (
    vm_full
    .with_columns(hourly_vec=pl.concat_list(hour_cols))
    .with_columns(
        periodicity_feats=pl.col("hourly_vec").map_elements(
            periodicity_tuple,
            return_dtype=pl.List(pl.Float64),
        )
    )
    .with_columns(
        periodicity_energy_24=pl.col("periodicity_feats").list.get(0),
        periodicity_ratio_24_vs_8_12=pl.col("periodicity_feats").list.get(1),
        # The helper returns the flag as a float; store it as a small integer.
        periodic_enough=pl.col("periodicity_feats").list.get(2).cast(pl.Int8),
    )
    .drop(["hourly_vec", "periodicity_feats"])
)

# Preview the newly added feature columns.
print(
    vm_full.select(
        "periodicity_energy_24",
        "periodicity_ratio_24_vs_8_12",
        "periodic_enough",
    ).head()
)


Computing periodicity features from 24 hourly mean columns
shape: (5, 3)
┌───────────────────────┬──────────────────────────────┬─────────────────┐
│ periodicity_energy_24 ┆ periodicity_ratio_24_vs_8_12 ┆ periodic_enough │
│ ---                   ┆ ---                          ┆ ---             │
│ f64                   ┆ f64                          ┆ i8              │
╞═══════════════════════╪══════════════════════════════╪═════════════════╡
│ 0.101272              ┆ 1.017154                     ┆ 0               │
│ 0.064193              ┆ 0.358735                     ┆ 0               │
│ 0.151667              ┆ 1.04482                      ┆ 0               │
│ 0.31772               ┆ 3.001851                     ┆ 1               │
│ 0.139144              ┆ 0.71656                      ┆ 0               │
└───────────────────────┴──────────────────────────────┴─────────────────┘


In [8]:
# Compute boolean components, then the final label.
#   long_lived     : lifetime in hours reaches LONG_LIVED_HOURS
#   sustained_high : high fraction >60% OR high p95 (p95 is divided by 100 first)
#   strong_diurnal : legacy day/night ratio and absolute day mean (kept for analysis)
#   critical       : periodic AND (long_lived OR sustained_high), stored as Int8
vm_full = vm_full.with_columns(
    long_lived=pl.col("lifetime_hours").ge(LONG_LIVED_HOURS),
    sustained_high=(
        pl.col("cpu_frac_gt_60").ge(THRESH_FRAC_GT_60)
        | (pl.col("p95_max_cpu") / 100.0).ge(THRESH_P95)
    ),
    strong_diurnal=(
        pl.col("day_night_ratio").ge(THRESH_DAY_NIGHT_RATIO)
        & (pl.col("day_cpu_mean") / 100.0).ge(THRESH_DAY_MEAN)
    ),
).with_columns(
    # Second pass: the label reads the boolean columns created just above.
    critical=(
        pl.col("periodic_enough").eq(1)
        & (pl.col("long_lived") | pl.col("sustained_high"))
    ).cast(pl.Int8),
)

# Preview the inputs, the intermediate flags and the resulting label.
print(
    vm_full.select(
        [
            "vm_id",
            "lifetime_hours",
            "cpu_frac_gt_60",
            "p95_max_cpu",
            "day_cpu_mean",
            "day_night_ratio",
            "periodicity_energy_24",
            "periodicity_ratio_24_vs_8_12",
            "periodic_enough",
            "long_lived",
            "sustained_high",
            "strong_diurnal",
            "critical",
        ]
    ).head()
)


shape: (5, 13)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ vm_id     ┆ lifetime_ ┆ cpu_frac_ ┆ p95_max_c ┆ … ┆ long_live ┆ sustained ┆ strong_di ┆ critical │
│ ---       ┆ hours     ┆ gt_60     ┆ pu        ┆   ┆ d         ┆ _high     ┆ urnal     ┆ ---      │
│ str       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ i8       │
│           ┆ f64       ┆ f64       ┆ f64       ┆   ┆ bool      ┆ bool      ┆ bool      ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ d8YTWmfMd ┆ 1.083333  ┆ 0.0       ┆ 71.388793 ┆ … ┆ false     ┆ true      ┆ false     ┆ 0        │
│ Xa1Sy8m5m ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆          │
│ kFt4r1/3I ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆          │
│ MIh…      ┆           ┆           ┆           ┆   ┆           ┆           

In [9]:
print("Critical label distribution:")
# GroupBy.count() is deprecated in polars; aggregate with pl.len() instead and
# keep the output column named "count". group_by does not guarantee row order,
# so sort by the label to make the printed table stable across runs.
print(
    vm_full
    .group_by("critical")
    .agg(pl.len().alias("count"))
    .sort("critical")
)
n_total = vm_full.height
n_crit  = vm_full.filter(pl.col("critical") == 1).height
# Guard the ratio so an empty frame prints "nan" instead of raising ZeroDivisionError.
crit_frac = n_crit / n_total if n_total else float("nan")
print(f"Critical VMs: {n_crit} / {n_total} = {crit_frac:.3f}")


Critical label distribution:
shape: (2, 2)
┌──────────┬────────┐
│ critical ┆ count  │
│ ---      ┆ ---    │
│ i8       ┆ u32    │
╞══════════╪════════╡
│ 0        ┆ 594946 │
│ 1        ┆ 314335 │
└──────────┴────────┘
Critical VMs: 314335 / 909281 = 0.346


  .count()


In [10]:
# Persist the labeled table inside the intermediate directory and report where it went.
out_path = INTERMEDIATE_DIR.joinpath("vm_full_labeled.parquet")
vm_full.write_parquet(out_path)
print("Saved vm_full_labeled to:", out_path.resolve())


Saved vm_full_labeled to: /home/laytc/GitHubRepo/Penn/CIS-5200-Final-Project/preprocess/data_intermediate/vm_full_labeled.parquet
