In [1]:
from pathlib import Path
import polars as pl

# Directory for the final preprocessed artifacts; created if it does not exist yet.
FINAL_DIR = Path("data_final")
FINAL_DIR.mkdir(exist_ok=True, parents=True)

# Parquet file with the VM request table, located inside FINAL_DIR.
request_table_path = FINAL_DIR.joinpath("vm_request_table.parquet")
print("vm_request_table path:", request_table_path.resolve())


vm_request_table path: C:\Users\Layton\Desktop\cis-520\preprocess\data_final\vm_request_table.parquet


In [2]:
print("Loading vm_request_table...")
df = pl.read_parquet(request_table_path)
n_rows, n_cols = df.shape
print("Rows:", n_rows, "Cols:", n_cols)

# day_idx is derived from ts_vm_created, so fail early if that column is absent.
if "ts_vm_created" not in df.columns:
    raise ValueError("ts_vm_created column missing from vm_request_table.")

# day_idx = floor(ts_vm_created / SECONDS_PER_DAY), stored as Int32.
# NOTE(review): this presumes ts_vm_created is expressed in seconds — confirm.
SECONDS_PER_DAY = 24 * 60 * 60
day_idx_expr = (pl.col("ts_vm_created") // SECONDS_PER_DAY).cast(pl.Int32)
df = df.with_columns(day_idx=day_idx_expr)

# Smallest and largest day index present in the table.
day_idx_range = df.select(
    min_day_idx=pl.col("day_idx").min(),
    max_day_idx=pl.col("day_idx").max(),
)
print("day_idx range:", day_idx_range)



Loading vm_request_table...
Rows: 894280 Cols: 69
day_idx range: shape: (1, 2)
┌─────────────┬─────────────┐
│ min_day_idx ┆ max_day_idx │
│ ---         ┆ ---         │
│ i32         ┆ i32         │
╞═════════════╪═════════════╡
│ 0           ┆ 29          │
└─────────────┴─────────────┘


In [3]:
# Chronological split boundaries, expressed as the last day_idx (inclusive)
# that belongs to each split. Everything after VAL_LAST_DAY is "test".
TRAIN_LAST_DAY = 19  # train: day_idx <= 19
VAL_LAST_DAY = 24    # val:   19 < day_idx <= 24

# Label every row with its split based on day_idx.
# NOTE(review): rows with a null day_idx would presumably fall through to
# "test", since a null comparison matches neither branch — confirm that
# ts_vm_created has no nulls.
df = df.with_columns(
    pl.when(pl.col("day_idx") <= TRAIN_LAST_DAY)
    .then(pl.lit("train"))
    .when(pl.col("day_idx") <= VAL_LAST_DAY)
    .then(pl.lit("val"))
    .otherwise(pl.lit("test"))
    .alias("split")
)

# Row count per split. `group_by(...).count()` is deprecated in polars, so the
# count is aggregated with pl.len() and aliased to keep the "count" column name.
print(
    df
    .group_by("split")
    .agg(pl.len().alias("count"))
    .sort("split")
)


shape: (3, 2)
┌───────┬────────┐
│ split ┆ count  │
│ ---   ┆ ---    │
│ str   ┆ u32    │
╞═══════╪════════╡
│ test  ┆ 131849 │
│ train ┆ 632426 │
│ val   ┆ 130005 │
└───────┴────────┘


  .count()


In [4]:
# 1) The whole table, including the split column, goes into one file.
combined_path = FINAL_DIR / "vm_request_table_with_split.parquet"
df.write_parquet(combined_path)
print("Wrote combined table with split ->", combined_path.resolve())

# 2) One frame and one output file per split value.
train_df, val_df, test_df = (
    df.filter(pl.col("split") == split_name)
    for split_name in ("train", "val", "test")
)

train_path, val_path, test_path = (
    FINAL_DIR / file_name
    for file_name in ("vm_train.parquet", "vm_val.parquet", "vm_test.parquet")
)

# Write in train, val, test order.
for split_frame, split_path in (
    (train_df, train_path),
    (val_df, val_path),
    (test_df, test_path),
):
    split_frame.write_parquet(split_path)

print("Saved:")
for label, split_path in (
    ("  train ->", train_path),
    ("  val   ->", val_path),
    ("  test  ->", test_path),
):
    print(label, split_path.resolve())


Wrote combined table with split -> C:\Users\Layton\Desktop\cis-520\preprocess\data_final\vm_request_table_with_split.parquet
Saved:
  train -> C:\Users\Layton\Desktop\cis-520\preprocess\data_final\vm_train.parquet
  val   -> C:\Users\Layton\Desktop\cis-520\preprocess\data_final\vm_val.parquet
  test  -> C:\Users\Layton\Desktop\cis-520\preprocess\data_final\vm_test.parquet


In [5]:
def critical_stats(df_split: pl.DataFrame, name: str):
    """Print the number and share of rows whose ``critical`` column equals 1.

    Args:
        df_split: Frame for one split; its ``height`` and ``critical`` column
            are read.
        name: Label printed in front of the numbers.

    Prints ``"<name>: 0 rows"`` for an empty frame. Otherwise prints the
    critical row count, the total row count and their ratio rounded to three
    decimals. Nothing is returned.
    """
    n_total = df_split.height
    if n_total > 0:
        critical_rows = df_split.filter(pl.col("critical") == 1)
        n_crit = critical_rows.height
        print(f"{name}: {n_crit} / {n_total} critical = {n_crit / n_total:.3f}")
    else:
        # Avoid dividing by zero when the split has no rows.
        print(f"{name}: 0 rows")

print("Critical label stats by split:")
# Report the critical-label share for each split, in train/val/test order.
for split_frame, split_label in (
    (train_df, "train"),
    (val_df, "val"),
    (test_df, "test"),
):
    critical_stats(split_frame, split_label)


Critical label stats by split:
train: 16541 / 632426 critical = 0.026
val: 3949 / 130005 critical = 0.030
test: 2872 / 131849 critical = 0.022


In [6]:
print("\nUnique tenants (subscriptions) per split:")
# Build one single-row frame per split holding its distinct subscription_id
# count, then place the three frames side by side.
subs_per_split = [
    split_frame.select(pl.col("subscription_id").n_unique().alias(column_name))
    for split_frame, column_name in (
        (train_df, "train_subs"),
        (val_df, "val_subs"),
        (test_df, "test_subs"),
    )
]
print(pl.concat(subs_per_split, how="horizontal"))



Unique tenants (subscriptions) per split:
shape: (1, 3)
┌────────────┬──────────┬───────────┐
│ train_subs ┆ val_subs ┆ test_subs │
│ ---        ┆ ---      ┆ ---       │
│ u32        ┆ u32      ┆ u32       │
╞════════════╪══════════╪═══════════╡
│ 3778       ┆ 1899     ┆ 1795      │
└────────────┴──────────┴───────────┘


In [7]:
print("\nvm_category distribution in train:")
# Rows per vm_category, most frequent first. `group_by(...).count()` is
# deprecated in polars, so the count is aggregated with pl.len() and aliased
# to "count" so the sort key and the printed column name stay the same.
vm_category_counts = (
    train_df
    .group_by("vm_category")
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
)
print(vm_category_counts)

print("\nvm_virtual_core_count stats in train:")
# Minimum, maximum and mean of vm_virtual_core_count over the train split.
print(
    train_df.select(
        pl.col("vm_virtual_core_count").min().alias("min_cores"),
        pl.col("vm_virtual_core_count").max().alias("max_cores"),
        pl.col("vm_virtual_core_count").mean().alias("mean_cores"),
    )
)



vm_category distribution in train:
shape: (3, 2)
┌───────────────────┬────────┐
│ vm_category       ┆ count  │
│ ---               ┆ ---    │
│ str               ┆ u32    │
╞═══════════════════╪════════╡
│ Unkown            ┆ 370164 │
│ Delay-insensitive ┆ 208331 │
│ Interactive       ┆ 53931  │
└───────────────────┴────────┘

vm_virtual_core_count stats in train:
shape: (1, 3)
┌───────────┬───────────┬────────────┐
│ min_cores ┆ max_cores ┆ mean_cores │
│ ---       ┆ ---       ┆ ---        │
│ i64       ┆ i64       ┆ f64        │
╞═══════════╪═══════════╪════════════╡
│ 1         ┆ 16        ┆ 3.034866   │
└───────────┴───────────┴────────────┘


  print(train_df.group_by("vm_category").count().sort("count", descending=True))
