In [27]:
import polars as pl
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

# --- Configuration ---
# Folder (relative path) that holds the three pre-split parquet files.
DATA_DIR = Path("../data_final")

# One parquet file per split, all living directly under DATA_DIR.
TRAIN_PATH = DATA_DIR.joinpath("vm_train.parquet")
VAL_PATH = DATA_DIR.joinpath("vm_val.parquet")
TEST_PATH = DATA_DIR.joinpath("vm_test.parquet")

# Column extracted from every split as the label vector y.
TARGET_COL = "critical"

In [28]:
# 1. Load Data Splits
# ------------------------------------------------------------------------------
# Read the train / validation / test parquet files (in that order) into polars
# DataFrames, then report the size of the training split as a sanity check.
print("Loading split files...")
df_train, df_val, df_test = (
    pl.read_parquet(split_path)
    for split_path in (TRAIN_PATH, VAL_PATH, TEST_PATH)
)

print(f"Train rows: {df_train.height}")

Loading split files...
Train rows: 632426


In [31]:
# 2. Strict Feature Selection (Added Timestamps)
# ------------------------------------------------------------------------------
# Every model input is listed explicitly (or matched by the "hist_" prefix), so
# nothing outside these groups can reach the model.

# A. Identifiers & Timing (includes the raw timestamp columns)
feat_timing = ["day_idx", "hour_of_day"]
feat_timing += [
    "ts_vm_created",        # added per request
    "ts_first_vm_created",  # added per request (tenant join time)
]

# B. Static VM Config & Deployment Metadata
feat_static = [
    "vm_virtual_core_count", "vm_memory_gb", "vm_mem_per_core",
    "deployment_size", "log_deployment_size", "count_vms_created",
    "sub_first_day", "sub_first_hour",
]

# C. Tenant History Features (Safe)
# Picked up dynamically: every training column whose name starts with "hist_".
feat_history = [col for col in df_train.columns if col.startswith("hist_")]

# D. Categorical Features
feat_categorical = ["vm_category"]

# Combine: numeric groups first, the categorical column last.
SAFE_NUMERIC_COLS = [*feat_timing, *feat_static, *feat_history]
SAFE_CATEGORICAL_COLS = feat_categorical
ALL_SAFE_FEATURES = [*SAFE_NUMERIC_COLS, *SAFE_CATEGORICAL_COLS]

print(f"\nTotal Features: {len(ALL_SAFE_FEATURES)}")


Total Features: 23


In [32]:
# 3. Prepare X and y
# ------------------------------------------------------------------------------
def get_X_y(df_polars):
    """Split one data frame into a pandas feature matrix and a flat label array.

    Args:
        df_polars: Frame exposing ``select(...).to_pandas()`` (a polars
            DataFrame in this notebook) that contains every column named in
            ALL_SAFE_FEATURES as well as the TARGET_COL column.

    Returns:
        tuple: ``(X, y)`` where ``X`` is a pandas DataFrame holding the
        ALL_SAFE_FEATURES columns in that order, and ``y`` is the TARGET_COL
        column flattened to a 1-D numpy array.
    """
    target_frame = df_polars.select(TARGET_COL).to_pandas()
    feature_frame = df_polars.select(ALL_SAFE_FEATURES).to_pandas()
    # .values on the single-column frame is 2-D (n, 1); ravel() flattens it to (n,)
    return feature_frame, target_frame.values.ravel()

# Build the (features, labels) pair for each split, in train / val / test order.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    get_X_y(split_frame) for split_frame in (df_train, df_val, df_test)
)

In [33]:
# 4. Build Robust Pipeline
# ------------------------------------------------------------------------------
# Numeric branch: fill missing values with the column median, then standardise.
# Standardising matters because the features live on very different scales
# (ts_vm_created is ~2,000,000 while vm_virtual_core_count is ~4).
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill missing values with a constant label, then one-hot
# encode; categories not seen while fitting are ignored rather than raising.
# NOTE(review): the coefficient table printed later in this notebook lists a
# category spelled 'Unkown', whereas the fill value here is 'Unknown' -- imputed
# rows would therefore not land in that existing category. Confirm intent.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(fill_value="Unknown", strategy="constant")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group to its branch; 'num' is declared before 'cat'.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, SAFE_NUMERIC_COLS),
    ("cat", categorical_transformer, SAFE_CATEGORICAL_COLS),
])

# Full model: preprocessing followed by a class-weighted logistic regression.
model = Pipeline([
    ("preprocessor", preprocessor),
    (
        "classifier",
        LogisticRegression(
            solver="lbfgs",
            max_iter=1000,
            class_weight="balanced",
            random_state=42,
        ),
    ),
])

In [35]:
# 5. Train & Evaluate
# ------------------------------------------------------------------------------
def _evaluate_split(fitted_model, X, y, split_name):
    """Score one data split and print its classification report and ROC AUC.

    Args:
        fitted_model: Already-fitted estimator exposing ``predict`` and
            ``predict_proba``.
        X: Feature matrix for the split.
        y: True labels for the split.
        split_name: Label used in the printed headers (e.g. "Validation").

    Returns:
        tuple: ``(y_pred, y_prob)`` -- the hard class predictions and the
        second column of ``predict_proba`` (used as the score for ROC AUC).
    """
    y_pred = fitted_model.predict(X)
    y_prob = fitted_model.predict_proba(X)[:, 1]

    print(f"\n--- {split_name} Report ---")
    print(classification_report(y, y_pred))
    print(f"{split_name} ROC AUC: {roc_auc_score(y, y_prob):.4f}")
    return y_pred, y_prob


print("\nTraining Logistic Regression...")
model.fit(X_train, y_train)
print("Training Complete.")

# Validation first, then test; both go through the same helper so the two
# reports cannot drift apart.
y_val_pred, y_val_prob = _evaluate_split(model, X_val, y_val, "Validation")
y_test_pred, y_test_prob = _evaluate_split(model, X_test, y_test, "Test")


Training Logistic Regression...
Training Complete.

--- Validation Report ---
              precision    recall  f1-score   support

           0       1.00      0.81      0.90    126056
           1       0.13      0.90      0.23      3949

    accuracy                           0.82    130005
   macro avg       0.56      0.86      0.56    130005
weighted avg       0.97      0.82      0.87    130005

Validation ROC AUC: 0.8981

--- Test Report ---
              precision    recall  f1-score   support

           0       1.00      0.81      0.89    128977
           1       0.09      0.85      0.16      2872

    accuracy                           0.81    131849
   macro avg       0.54      0.83      0.53    131849
weighted avg       0.98      0.81      0.88    131849

Test ROC AUC: 0.8760


In [37]:
# 6. Feature Importance (Check if timestamps matter)
# ------------------------------------------------------------------------------
# Recover the one-hot column names from the fitted encoder in the 'cat' branch.
cat_names = (
    model.named_steps['preprocessor']
    .named_transformers_['cat']
    .named_steps['encoder']
    .get_feature_names_out(SAFE_CATEGORICAL_COLS)
)
# Assumes the preprocessor emits the 'num' columns before the 'cat' columns,
# matching the order in which the two branches were declared.
final_feature_names = [*SAFE_NUMERIC_COLS, *cat_names]
coeffs = model.named_steps['classifier'].coef_[0]

# Rank features by coefficient magnitude, largest first.
coef_df = (
    pd.DataFrame({'feature': final_feature_names, 'coefficient': coeffs})
    .assign(abs_coeff=lambda frame: frame['coefficient'].abs())
    .sort_values('abs_coeff', ascending=False)
)

print("\nTop 10 Overall Features:")
print(coef_df.head(10))


Top 10 Overall Features:
                          feature  coefficient  abs_coeff
5                    vm_memory_gb    -1.390185   1.390185
18             hist_cpu_mean_mean     1.365360   1.365360
6                 vm_mem_per_core     0.802334   0.802334
15             hist_critical_frac     0.703456   0.703456
24             vm_category_Unkown    -0.657247   0.657247
20            hist_frac_gt60_mean    -0.472322   0.472322
21      hist_day_night_ratio_mean    -0.433790   0.433790
22  vm_category_Delay-insensitive    -0.395550   0.395550
17              hist_lifetime_std     0.356563   0.356563
19                  hist_p95_mean     0.335068   0.335068
