In [1]:
# %% [markdown]
# # MLP model for VM criticality (arrival-time features only)
# 
# - Uses the same 23 arrival-time features as the cascade model
# - Preprocessing: impute + one-hot + scale
# - Model: MLPClassifier with class reweighting via sample_weight
# - Threshold tuning for better recall/F1 on criticals

# %%
import os
import numpy as np
import pandas as pd
import polars as pl

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
)

RANDOM_STATE = 520
np.random.seed(RANDOM_STATE)

# Try paths relative to repo root and to models/
CANDIDATE_PATHS = [
    "../data_final/vm_request_table_with_split.parquet",
]

DATA_PATH = None
for path in CANDIDATE_PATHS:
    if os.path.exists(path):
        DATA_PATH = path
        break

if DATA_PATH is None:
    raise FileNotFoundError(
        f"Could not find vm_request_table_with_split.parquet. "
        f"Tried: {CANDIDATE_PATHS}. CWD={os.getcwd()}"
    )

print("Using DATA_PATH:", DATA_PATH)


Using DATA_PATH: ../data_final/vm_request_table_with_split.parquet


In [2]:
# %% [markdown]
# ## Load dataset and restrict to arrival-time features

# %%
df_pl = pl.read_parquet(DATA_PATH)

print("Columns:", len(df_pl.columns))
print("Rows:", df_pl.height)
print("Split counts:")
print(df_pl["split"].value_counts())
print("Critical label counts:")
print(df_pl["critical"].value_counts())

# %%
arrival_feature_cols = [
    "ts_vm_created",
    "day_idx",
    "hour_of_day",
    "vm_category",
    "vm_virtual_core_count",
    "vm_memory_gb",
    "vm_mem_per_core",
    "deployment_size",
    "log_deployment_size",
    "ts_first_vm_created",
    "count_vms_created",
    "sub_first_day",
    "sub_first_hour",
    "hist_n_vms",
    "hist_n_critical",
    "hist_has_past",
    "hist_critical_frac",
    "hist_lifetime_mean",
    "hist_lifetime_std",
    "hist_cpu_mean_mean",
    "hist_p95_mean",
    "hist_frac_gt60_mean",
    "hist_day_night_ratio_mean",
]

missing = [c for c in arrival_feature_cols if c not in df_pl.columns]
assert not missing, f"Missing expected arrival-time cols: {missing}"

label_col = "critical"
split_col = "split"

train_pl = df_pl.filter(pl.col(split_col) == "train")
val_pl   = df_pl.filter(pl.col(split_col) == "val")
test_pl  = df_pl.filter(pl.col(split_col) == "test")

train = train_pl.select(arrival_feature_cols + [label_col]).to_pandas()
val   = val_pl.select(arrival_feature_cols + [label_col]).to_pandas()
test  = test_pl.select(arrival_feature_cols + [label_col]).to_pandas()

print("Train shape:", train.shape)
print("Val shape:  ", val.shape)
print("Test shape: ", test.shape)

# %%
X_train = train[arrival_feature_cols]
y_train = train[label_col].astype(int).values

X_val = val[arrival_feature_cols]
y_val = val[label_col].astype(int).values

X_test = test[arrival_feature_cols]
y_test = test[label_col].astype(int).values

print("Positive rate (critical=1):")
for name, y in [("train", y_train), ("val", y_val), ("test", y_test)]:
    print(f"{name:5s}: {y.mean():.4f}")


Columns: 73
Rows: 894280
Split counts:
shape: (3, 2)
┌───────┬────────┐
│ split ┆ count  │
│ ---   ┆ ---    │
│ str   ┆ u32    │
╞═══════╪════════╡
│ test  ┆ 131849 │
│ val   ┆ 130005 │
│ train ┆ 632426 │
└───────┴────────┘
Critical label counts:
shape: (2, 2)
┌──────────┬────────┐
│ critical ┆ count  │
│ ---      ┆ ---    │
│ i8       ┆ u32    │
╞══════════╪════════╡
│ 0        ┆ 584988 │
│ 1        ┆ 309292 │
└──────────┴────────┘
Train shape: (632426, 24)
Val shape:   (130005, 24)
Test shape:  (131849, 24)
Positive rate (critical=1):
train: 0.3593
val  : 0.3029
test : 0.3237


In [3]:
# %% [markdown]
# ## Preprocessing and threshold utilities
# 
# - Categorical: impute + one-hot
# - Numeric: median impute + scale
# - Helpers: precision/recall/F1 and threshold search

# %%
categorical_features = ["vm_category"]
numeric_features = [c for c in arrival_feature_cols if c not in categorical_features]

# Categorical pipeline: impute then one-hot encode
cat_imputer = SimpleImputer(strategy="most_frequent")

try:
    ohe = OneHotEncoder(
        handle_unknown="ignore",
        sparse_output=False,  # sklearn >= 1.2
    )
except TypeError:
    ohe = OneHotEncoder(
        handle_unknown="ignore",
        sparse=False,         # older sklearn versions
    )

categorical_transformer = Pipeline(
    steps=[
        ("imputer", cat_imputer),
        ("ohe", ohe),
    ]
)

# Numeric pipeline: median impute + scale
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

preprocess = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer, categorical_features),
        ("num", numeric_transformer, numeric_features),
    ]
)

# %%
def precision_recall_f1(y_true, y_pred):
    """Return (precision, recall, f1) for positive class 1."""
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    tp = np.sum((y_true == 1) & (y_pred == 1))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = (
        2 * precision * recall / (precision + recall)
        if (precision + recall) > 0
        else 0.0
    )
    return precision, recall, f1


def choose_best_threshold_with_min_recall(
    y_true,
    p_hat,
    num_grid=200,
    lo=0.01,
    hi=0.99,
    min_recall=0.6,
):
    """
    Sweep thresholds and pick the one that maximizes F1 for class 1
    subject to recall >= min_recall. If no threshold satisfies that,
    pick the one with best recall (ties broken by F1).
    """
    y_true = np.asarray(y_true)
    p_hat = np.asarray(p_hat)

    thresholds = np.linspace(lo, hi, num_grid)
    best_t = 0.5
    best_prec = 0.0
    best_rec = 0.0
    best_f1 = 0.0
    best_meets = False

    for t in thresholds:
        y_pred = (p_hat >= t).astype(int)
        prec, rec, f1 = precision_recall_f1(y_true, y_pred)

        if rec >= min_recall:
            if (not best_meets) or (f1 > best_f1):
                best_meets = True
                best_t, best_prec, best_rec, best_f1 = t, prec, rec, f1
        else:
            if (not best_meets) and (rec > best_rec or (rec == best_rec and f1 > best_f1)):
                best_t, best_prec, best_rec, best_f1 = t, prec, rec, f1

    return best_t, best_prec, best_rec, best_f1


In [4]:
# %% [markdown]
# ## Compute sample weights for imbalanced labels
# 
# We upweight critical (1) examples by N_neg / N_pos.

# %%
n_pos = np.sum(y_train == 1)
n_neg = np.sum(y_train == 0)
pos_weight = n_neg / n_pos
print(f"Train pos={n_pos}, neg={n_neg}, pos_weight={pos_weight:.2f}")

sample_weight_train = np.where(y_train == 1, pos_weight, 1.0)


Train pos=227239, neg=405187, pos_weight=1.78


In [5]:
# %% [markdown]
# ## Oversample positive (critical) class for MLP
# 
# MLP in this sklearn version does not support sample_weight, so we
# handle class imbalance by oversampling critical=1 examples to
# roughly match the number of non-critical=0 examples.

# %%
from sklearn.utils import resample

# X_train is a DataFrame, y_train is a 1D numpy array (ints)
mask_pos = (y_train == 1)
mask_neg = (y_train == 0)

X_pos = X_train[mask_pos]
y_pos = y_train[mask_pos]

X_neg = X_train[mask_neg]
y_neg = y_train[mask_neg]

print("Original counts:")
print("  neg:", len(y_neg), " pos:", len(y_pos))

# Oversample positives up to the same size as negatives
X_pos_up, y_pos_up = resample(
    X_pos,
    y_pos,
    replace=True,
    n_samples=len(y_neg),
    random_state=RANDOM_STATE,
)

# Combine and shuffle
X_train_os = pd.concat([X_neg, X_pos_up], axis=0)
y_train_os = np.concatenate([y_neg, y_pos_up])

perm = np.random.permutation(len(y_train_os))
X_train_os = X_train_os.iloc[perm]
y_train_os = y_train_os[perm]

print("Oversampled counts:")
print("  neg:", (y_train_os == 0).sum(), " pos:", (y_train_os == 1).sum())


Original counts:
  neg: 405187  pos: 227239
Oversampled counts:
  neg: 405187  pos: 405187


In [6]:
# %% [markdown]
# ## MLP model
# 
# - 2 hidden layers with ReLU
# - Early stopping
# - Trained on oversampled data (no sample_weight needed)

# %%
mlp = MLPClassifier(
    hidden_layer_sizes=(128, 64),
    activation="relu",
    solver="adam",
    alpha=1e-4,            # L2 regularization
    batch_size=256,
    learning_rate_init=1e-3,
    max_iter=50,           # bump to 100 if it underfits
    early_stopping=True,
    n_iter_no_change=5,
    random_state=RANDOM_STATE,
    verbose=True,
)

mlp_pipeline = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("mlp", mlp),
    ]
)

# %%
# Fit on the oversampled training data
mlp_pipeline.fit(
    X_train_os,
    y_train_os,
)


Iteration 1, loss = 0.49415038
Validation score: 0.753387
Iteration 2, loss = 0.47935724
Validation score: 0.755757
Iteration 3, loss = 0.47316614
Validation score: 0.760668
Iteration 4, loss = 0.46901257
Validation score: 0.763259
Iteration 5, loss = 0.46583451
Validation score: 0.764543
Iteration 6, loss = 0.46365086
Validation score: 0.765875
Iteration 7, loss = 0.46156895
Validation score: 0.767850
Iteration 8, loss = 0.45978307
Validation score: 0.766048
Iteration 9, loss = 0.45811569
Validation score: 0.769306
Iteration 10, loss = 0.45666127
Validation score: 0.771996
Iteration 11, loss = 0.45520925
Validation score: 0.772058
Iteration 12, loss = 0.45410303
Validation score: 0.771515
Iteration 13, loss = 0.45293315
Validation score: 0.769910
Iteration 14, loss = 0.45206412
Validation score: 0.771354
Iteration 15, loss = 0.45112702
Validation score: 0.773575
Iteration 16, loss = 0.45010383
Validation score: 0.772132
Iteration 17, loss = 0.44935810
Validation score: 0.772526
Iterat



In [7]:
# %% [markdown]
# ## Baseline performance (threshold = 0.5)

# %%
p_train = mlp_pipeline.predict_proba(X_train)[:, 1]
p_val   = mlp_pipeline.predict_proba(X_val)[:, 1]
p_test  = mlp_pipeline.predict_proba(X_test)[:, 1]

# %%
y_val_05 = (p_val >= 0.5).astype(int)
print("=== MLP on VAL (threshold=0.5) ===")
print(confusion_matrix(y_val, y_val_05))
print(classification_report(y_val, y_val_05, digits=4))
prec_val_05, rec_val_05, f1_val_05 = precision_recall_f1(y_val, y_val_05)
print(f"VAL (critical=1): prec={prec_val_05:.4f}, rec={rec_val_05:.4f}, f1={f1_val_05:.4f}")

# %%
y_test_05 = (p_test >= 0.5).astype(int)
print("=== MLP on TEST (threshold=0.5) ===")
print(confusion_matrix(y_test, y_test_05))
print(classification_report(y_test, y_test_05, digits=4))
prec_test_05, rec_test_05, f1_test_05 = precision_recall_f1(y_test, y_test_05)
print(f"TEST (critical=1): prec={prec_test_05:.4f}, rec={rec_test_05:.4f}, f1={f1_test_05:.4f}")


=== MLP on VAL (threshold=0.5) ===
[[66880 23748]
 [10583 28794]]
              precision    recall  f1-score   support

           0     0.8634    0.7380    0.7958     90628
           1     0.5480    0.7312    0.6265     39377

    accuracy                         0.7359    130005
   macro avg     0.7057    0.7346    0.7111    130005
weighted avg     0.7679    0.7359    0.7445    130005

VAL (critical=1): prec=0.5480, rec=0.7312, f1=0.6265
=== MLP on TEST (threshold=0.5) ===
[[57793 31380]
 [11897 30779]]
              precision    recall  f1-score   support

           0     0.8293    0.6481    0.7276     89173
           1     0.4952    0.7212    0.5872     42676

    accuracy                         0.6718    131849
   macro avg     0.6622    0.6847    0.6574    131849
weighted avg     0.7211    0.6718    0.6821    131849

TEST (critical=1): prec=0.4952, rec=0.7212, f1=0.5872


In [8]:
# %% [markdown]
# ## Threshold tuning on VAL for critical=1
# 
# Goal: better tradeoff between recall and precision than 0.5.

# %%
MIN_RECALL = 0.65  # adjust up/down depending on how aggressive you want recall

t_opt, prec_opt, rec_opt, f1_opt = choose_best_threshold_with_min_recall(
    y_true=y_val,
    p_hat=p_val,
    num_grid=200,
    lo=0.01,
    hi=0.99,
    min_recall=MIN_RECALL,
)

print("Chosen threshold on VAL:")
print(f"t_opt={t_opt:.3f}, prec={prec_opt:.4f}, rec={rec_opt:.4f}, f1={f1_opt:.4f}")


Chosen threshold on VAL:
t_opt=0.650, prec=0.6261, rec=0.6552, f1=0.6403


In [9]:
# %% [markdown]
# ## Final tuned MLP performance

# %%
# VAL
y_val_tuned = (p_val >= t_opt).astype(int)
print("=== Tuned MLP on VAL ===")
print(confusion_matrix(y_val, y_val_tuned))
print(classification_report(y_val, y_val_tuned, digits=4))
prec_val_tuned, rec_val_tuned, f1_val_tuned = precision_recall_f1(y_val, y_val_tuned)
print(f"VAL (critical=1): prec={prec_val_tuned:.4f}, rec={rec_val_tuned:.4f}, f1={f1_val_tuned:.4f}")

# %%
# TEST
y_test_tuned = (p_test >= t_opt).astype(int)
print("=== Tuned MLP on TEST ===")
print(confusion_matrix(y_test, y_test_tuned))
print(classification_report(y_test, y_test_tuned, digits=4))
prec_test_tuned, rec_test_tuned, f1_test_tuned = precision_recall_f1(y_test, y_test_tuned)
print(f"TEST (critical=1): prec={prec_test_tuned:.4f}, rec={rec_test_tuned:.4f}, f1={f1_test_tuned:.4f}")


=== Tuned MLP on VAL ===
[[75224 15404]
 [13578 25799]]
              precision    recall  f1-score   support

           0     0.8471    0.8300    0.8385     90628
           1     0.6261    0.6552    0.6403     39377

    accuracy                         0.7771    130005
   macro avg     0.7366    0.7426    0.7394    130005
weighted avg     0.7802    0.7771    0.7785    130005

VAL (critical=1): prec=0.6261, rec=0.6552, f1=0.6403
=== Tuned MLP on TEST ===
[[64431 24742]
 [14153 28523]]
              precision    recall  f1-score   support

           0     0.8199    0.7225    0.7681     89173
           1     0.5355    0.6684    0.5946     42676

    accuracy                         0.7050    131849
   macro avg     0.6777    0.6955    0.6814    131849
weighted avg     0.7278    0.7050    0.7120    131849

TEST (critical=1): prec=0.5355, rec=0.6684, f1=0.5946
