# CatBoost model for VM criticality

This notebook trains a CatBoost classifier to predict VM criticality, using only arrival-time features and the time-based train/val/test splits.


In [1]:
import polars as pl
import numpy as np
import pandas as pd

from pathlib import Path
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import (
    classification_report,
    roc_auc_score,
    average_precision_score,
    precision_recall_curve,
    confusion_matrix,
)

# --- Configuration ---
# Directory holding the pre-split parquet files (relative to this notebook).
DATA_DIR = Path("../data_final")

# One parquet file per split.
TRAIN_PATH = DATA_DIR.joinpath("vm_train.parquet")
VAL_PATH = DATA_DIR.joinpath("vm_val.parquet")
TEST_PATH = DATA_DIR.joinpath("vm_test.parquet")

# Label column that the classifier predicts.
TARGET_COL = "critical"


In [2]:
# 1) Load Data Splits
# -----------------------------------------------------------------------------
print("Loading split files...")

# Read the three splits in train -> val -> test order.
df_train, df_val, df_test = [
    pl.read_parquet(split_path)
    for split_path in (TRAIN_PATH, VAL_PATH, TEST_PATH)
]

# Report the row counts; labels are left-padded so the numbers line up.
for row_label, split_frame in (
    ("Train rows:", df_train),
    ("Val rows:", df_val),
    ("Test rows:", df_test),
):
    print(f"{row_label:<11} {split_frame.height}")


Loading split files...
Train rows: 632426
Val rows:   130005
Test rows:  131849


In [3]:
# 2) Strict Feature Selection (23 arrival-time features)
# -----------------------------------------------------------------------------
# Timing group.
feat_timing = ["day_idx", "hour_of_day", "ts_vm_created", "ts_first_vm_created"]

# Static group.
feat_static = [
    "vm_virtual_core_count", "vm_memory_gb", "vm_mem_per_core",
    "deployment_size", "log_deployment_size", "count_vms_created",
    "sub_first_day", "sub_first_hour",
]

# History group (all columns carry the `hist_` prefix).
feat_history = [
    "hist_n_vms", "hist_n_critical", "hist_has_past", "hist_critical_frac",
    "hist_lifetime_mean", "hist_lifetime_std",
    "hist_cpu_mean_mean", "hist_p95_mean",
    "hist_frac_gt60_mean", "hist_day_night_ratio_mean",
]

# Categorical group; kept last so its position in ALL_FEATURES is stable.
feat_categorical = ["vm_category"]

# Model input columns, in the order the groups are listed above.
ALL_FEATURES = [*feat_timing, *feat_static, *feat_history, *feat_categorical]

print("\n--- Feature Check ---")
for group_label, group_cols in (
    ("Timing", feat_timing),
    ("Static", feat_static),
    ("History", feat_history),
    ("Categorical", feat_categorical),
):
    print(f"{group_label}: {len(group_cols)}")
print(f"Total: {len(ALL_FEATURES)}")



--- Feature Check ---
Timing: 4
Static: 8
History: 10
Categorical: 1
Total: 23


In [4]:
# 3) Helpers to prepare CatBoost inputs
# -----------------------------------------------------------------------------
def prepare_df(df_polars):
    """Turn one polars split into a pandas feature frame and a label array.

    Selects ``ALL_FEATURES`` plus ``TARGET_COL`` from ``df_polars``, converts
    the selection to pandas, and casts every column named in
    ``feat_categorical`` to the pandas ``category`` dtype.

    Returns
    -------
    (X, y)
        ``X`` is a new pandas DataFrame with the columns of ``ALL_FEATURES``
        (in that order); ``y`` is the flattened values of ``TARGET_COL``.
    """
    wanted_cols = ALL_FEATURES + [TARGET_COL]
    frame = df_polars.select(wanted_cols).to_pandas()

    # One astype call with a column -> dtype mapping; returns a fresh frame,
    # so the intermediate pandas frame is never mutated.
    category_dtypes = {col: "category" for col in feat_categorical}
    features = frame[ALL_FEATURES].astype(category_dtypes)

    labels = frame[TARGET_COL].values.ravel()
    return features, labels


X_train, y_train = prepare_df(df_train)
X_val, y_val     = prepare_df(df_val)
X_test, y_test   = prepare_df(df_test)

# Indices for CatBoost categorical features
cat_indices = [ALL_FEATURES.index(c) for c in feat_categorical]

# Class weights (neg/pos): the positive class is up-weighted by the ratio of
# negative to positive training rows. Counts are cast to plain Python ints so
# the resulting weight is a plain float rather than a numpy scalar.
neg_count = int(np.sum(y_train == 0))
pos_count = int(np.sum(y_train == 1))
if pos_count == 0:
    # Without this guard the ratio would be undefined (numpy silently yields inf).
    raise ValueError(
        f"No positive rows ({TARGET_COL} == 1) in the training split; "
        "cannot derive class weights."
    )
class_weights = [1.0, neg_count / pos_count]

print(f"Train positive rate: {y_train.mean():.2%} (count={pos_count})")
print(f"Class weights: {class_weights}")


Train positive rate: 35.93% (count=227239)
Class weights: [1.0, np.float64(1.7830874101716694)]


In [5]:
# 4) Train CatBoost with early stopping (optimize PR-friendly behaviour)
# -----------------------------------------------------------------------------
# Wrap each split in a Pool so the categorical column indices travel with the data.
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_indices)
val_pool = Pool(data=X_val, label=y_val, cat_features=cat_indices)

params = {
    # Objective vs. monitored metric: the model is fit with Logloss, while
    # PRAUC on the eval set drives early stopping / best-iteration selection.
    "loss_function": "Logloss",
    "eval_metric": "PRAUC",   # focus on PR AUC
    # Tree / boosting shape.
    "learning_rate": 0.05,
    "depth": 8,
    "l2_leaf_reg": 5,
    "iterations": 1500,
    "random_seed": 42,
    # Overfitting detector: stop after 80 iterations without improvement and
    # keep only the trees up to the best iteration.
    "od_type": "Iter",
    "od_wait": 80,
    "use_best_model": True,
    # Row / feature subsampling.
    "subsample": 0.9,
    "colsample_bylevel": 0.8,
    # Up-weight the positive class by the neg/pos ratio computed earlier.
    "class_weights": class_weights,
    # Log every 200 iterations.
    "verbose": 200,
}

clf = CatBoostClassifier(**params)
clf.fit(train_pool, eval_set=val_pool)


0:	learn: 0.8285065	test: 0.8165690	best: 0.8165690 (0)	total: 188ms	remaining: 4m 42s
200:	learn: 0.8671036	test: 0.8358504	best: 0.8363609 (166)	total: 28.2s	remaining: 3m 2s
Stopped by overfitting detector  (80 iterations wait)

bestTest = 0.8363608646
bestIteration = 166

Shrink model to first 167 iterations.


<catboost.core.CatBoostClassifier at 0x7fd66d6ec590>

In [6]:
# 5) Threshold search (maximize F1 on val) and metric helper
# -----------------------------------------------------------------------------
from dataclasses import dataclass


@dataclass
class EvalResult:
    """Evaluation summary for one data split at a fixed decision threshold.

    Instances are built by ``evaluate_split``.
    """

    split: str  # split label passed by the caller (e.g. "val" or "test")
    roc_auc: float  # ROC AUC of the predicted probabilities
    pr_auc: float  # average precision of the predicted probabilities
    f1: float  # weighted-average F1 of the thresholded predictions
    threshold: float  # probability cut-off used to produce the hard predictions
    report: str  # text classification report for the thresholded predictions
    confusion: np.ndarray  # confusion matrix for the thresholded predictions


def pick_threshold(y_true, y_prob):
    """Return the probability cut-off that maximises F1, together with that F1.

    Parameters
    ----------
    y_true : true labels.
    y_prob : predicted scores / probabilities for the positive class.

    Returns
    -------
    (threshold, f1)
        The entry of the precision-recall curve's thresholds with the highest
        F1, and the F1 value at that point.
    """
    prec, rec, cutoffs = precision_recall_curve(y_true, y_prob)

    # The curve has one more (precision, recall) point than thresholds;
    # drop the final point so the arrays align with `cutoffs`.
    prec, rec = prec[:-1], rec[:-1]

    # Small epsilon keeps the division defined when precision + recall == 0.
    f1_curve = 2 * prec * rec / (prec + rec + 1e-9)

    winner = np.argmax(f1_curve)
    return cutoffs[winner], f1_curve[winner]


def evaluate_split(y_true, y_prob, split_name, threshold):
    """Score one split and bundle the metrics into an ``EvalResult``.

    Probabilities at or above ``threshold`` are labelled 1, everything else 0.
    ROC AUC and average precision are computed from the raw probabilities;
    the weighted F1, the text report and the confusion matrix are computed
    from the thresholded labels.
    """
    hard_labels = (y_prob >= threshold).astype(int)

    # Threshold-free ranking metrics.
    auc_roc = roc_auc_score(y_true, y_prob)
    auc_pr = average_precision_score(y_true, y_prob)

    # Threshold-dependent metrics.
    report_dict = classification_report(y_true, hard_labels, output_dict=True)
    weighted_f1 = report_dict["weighted avg"]["f1-score"]
    report_text = classification_report(y_true, hard_labels)
    matrix = confusion_matrix(y_true, hard_labels)

    return EvalResult(
        split=split_name,
        roc_auc=auc_roc,
        pr_auc=auc_pr,
        f1=weighted_f1,
        threshold=threshold,
        report=report_text,
        confusion=matrix,
    )


# Choose the decision threshold on the validation split only; the same
# threshold is later reused unchanged for the test split.
val_proba_matrix = clf.predict_proba(X_val)
val_prob = val_proba_matrix[:, 1]  # second column: probability of class 1
best_thr, best_f1 = pick_threshold(y_true=y_val, y_prob=val_prob)
print(f"Best val threshold (by F1): {best_thr:.4f} | F1={best_f1:.4f}")


Best val threshold (by F1): 0.5733 | F1=0.6713


In [7]:
# 6) Full evaluation on val and test
# -----------------------------------------------------------------------------
# Both splits are scored with the threshold chosen on validation.
val_metrics = evaluate_split(
    y_true=y_val, y_prob=val_prob, split_name="val", threshold=best_thr
)
test_prob = clf.predict_proba(X_test)[:, 1]
test_metrics = evaluate_split(
    y_true=y_test, y_prob=test_prob, split_name="test", threshold=best_thr
)

for m in (val_metrics, test_metrics):
    print(f"\n--- {m.split.upper()} ---")
    print(m.report)
    # Scalar metrics share one format, so print them from a small table.
    for metric_label, metric_value in (
        ("ROC AUC", m.roc_auc),
        ("PR  AUC", m.pr_auc),
        ("F1 (weighted)", m.f1),
    ):
        print(f"{metric_label}: {metric_value:.4f}")
    print("Confusion matrix:\n", m.confusion)



--- VAL ---
              precision    recall  f1-score   support

           0       0.85      0.87      0.86     90628
           1       0.68      0.66      0.67     39377

    accuracy                           0.80    130005
   macro avg       0.77      0.76      0.77    130005
weighted avg       0.80      0.80      0.80    130005

ROC AUC: 0.8477
PR  AUC: 0.7632
F1 (weighted): 0.8031
Confusion matrix:
 [[78506 12122]
 [13360 26017]]

--- TEST ---
              precision    recall  f1-score   support

           0       0.83      0.84      0.83     89173
           1       0.65      0.63      0.64     42676

    accuracy                           0.77    131849
   macro avg       0.74      0.73      0.74    131849
weighted avg       0.77      0.77      0.77    131849

ROC AUC: 0.8203
PR  AUC: 0.7363
F1 (weighted): 0.7710
Confusion matrix:
 [[75072 14101]
 [15920 26756]]


In [8]:
# 7) Save artifacts
# -----------------------------------------------------------------------------
import json

# Output locations; the directory is created on demand.
ARTIFACT_DIR = Path("artifacts")
ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)
model_path = ARTIFACT_DIR / "catboost_model.cbm"
metrics_path = ARTIFACT_DIR / "catboost_metrics.json"

# Persist the trained model first, then the metrics summary.
clf.save_model(model_path)

# Same four scalar fields for each split, keyed by split name.
metrics_payload = {
    split_key: {
        "roc_auc": split_result.roc_auc,
        "pr_auc": split_result.pr_auc,
        "f1_weighted": split_result.f1,
        "threshold": split_result.threshold,
    }
    for split_key, split_result in (("val", val_metrics), ("test", test_metrics))
}

with open(metrics_path, "w") as f:
    json.dump(metrics_payload, f, indent=2)

print(f"Saved model to {model_path}")
print(f"Saved metrics to {metrics_path}")


Saved model to artifacts/catboost_model.cbm
Saved metrics to artifacts/catboost_metrics.json
