# Train XGBoost (GPU Accelerated)

GPU-accelerated XGBoost training using `tree_method="gpu_hist"` with MLflow tracking.

**Requirements:**
- Databricks ML Runtime 17.3 LTS GPU (`17.3.x-gpu-ml-scala2.13`)
- GPU-enabled cluster (NC-series Azure VMs)
- Single GPU: NC4as_T4_v3 (1x T4, 16 GB)
- Multi-GPU: NC16as_T4_v3 (4x T4, 64 GB total)

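**Parameters:**
- `data_size`: Dataset size preset (`tiny`, `small`, `medium`, `medium_large`, `large`, `xlarge`)
- `node_type`: GPU VM type label used in the run name and logged to MLflow (e.g. `NC4asT4v3`, `NC16asT4v3`)
- `gpu_id`: GPU device ID (0 for single GPU)
- `run_mode`: `full` or `smoke` (in this notebook, `smoke` only changes the MLflow run name)
- `catalog`, `schema`: Catalog and schema that contain the input table
- `table_name`: Optional table name; when set, it overrides the preset-derived `imbalanced_<suffix>` table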

## Setup Widgets

In [None]:
# Declare the notebook's input widgets from a single table, in display order.
# Each entry pairs a widget factory with the positional arguments it receives.
_widgets = dbutils.widgets
for _create, _spec in (
    (_widgets.dropdown, ("data_size", "tiny", ["tiny", "small", "medium", "medium_large", "large", "xlarge"], "Data Size")),
    (_widgets.text, ("node_type", "NC4asT4v3", "Node Type")),
    (_widgets.text, ("gpu_id", "0", "GPU ID")),
    (_widgets.dropdown, ("run_mode", "full", ["full", "smoke"], "Run Mode")),
    (_widgets.text, ("catalog", "brian_gen_ai", "Catalog")),
    (_widgets.text, ("schema", "xgb_scaling", "Schema")),
    # Optional: override auto table name
    (_widgets.text, ("table_name", "", "Table Name (override)")),
):
    _create(*_spec)

In [None]:
# --- Global Error Tracking ---
# Collects errors during execution so the exit cell can report them
# even if training fails partway through.
_notebook_errors = []

def log_error(error_msg, exc=None):
    """Record an error for later reporting. Does not raise.

    Appends an entry to the module-level ``_notebook_errors`` list and prints
    a one-line notice.

    Args:
        error_msg: Description of the problem; stored as ``str(error_msg)``.
        exc: Optional exception instance. When given, its formatted traceback
            is stored under the ``"traceback"`` key. Any other non-None value
            is treated as a request to record the exception currently being
            handled (the previous behaviour).
    """
    import traceback
    entry = {"error": str(error_msg)}
    if exc is not None:
        if isinstance(exc, BaseException):
            # Format the exception that was passed in. traceback.format_exc()
            # only reflects the exception currently being handled, which is
            # "NoneType: None" when called outside an except block.
            entry["traceback"] = "".join(
                traceback.format_exception(type(exc), exc, exc.__traceback__)
            )
        else:
            entry["traceback"] = traceback.format_exc()
    _notebook_errors.append(entry)
    print(f"ERROR LOGGED: {error_msg}")

# Read every widget once; the table-name override is trimmed so that
# whitespace-only input counts as "no override".
_get_widget = dbutils.widgets.get
data_size = _get_widget("data_size")
node_type = _get_widget("node_type")
gpu_id = _get_widget("gpu_id")
run_mode = _get_widget("run_mode")
catalog = _get_widget("catalog")
schema = _get_widget("schema")
table_name_override = _get_widget("table_name").strip()

# Make the repo importable: drop the last two components of this notebook's
# workspace path and put the result (under /Workspace) first on sys.path.
import os
import sys
notebook_path = dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()
repo_root = "/".join(notebook_path.split("/")[:-2])
sys.path.insert(0, "/Workspace" + repo_root)

# Size presets come from src/config.py (single source of truth)
from src.config import get_preset, PRESETS

# Resolve the input table and the label used in the run name
if not table_name_override:
    # Preset path: the table name is derived from the preset's suffix
    preset = get_preset(data_size)
    table_suffix = preset.table_suffix
    input_table = f"{catalog}.{schema}.imbalanced_{table_suffix}"
    data_size_label = data_size
else:
    # Override path: explicit table, no preset dimensions available
    preset = None
    input_table = f"{catalog}.{schema}.{table_name_override}"
    data_size_label = table_name_override.replace("imbalanced_", "")

# Smoke runs share one fixed name per node type; full runs include the size label
if run_mode == "smoke":
    run_name = f"smoke_gpu_{node_type}"
else:
    run_name = f"{data_size_label}_gpu_{node_type}"

# Echo the resolved configuration, one setting per line
print(
    f"Data size: {data_size}",
    f"Node type: {node_type}",
    f"GPU ID: {gpu_id}",
    f"Run mode: {run_mode}",
    f"Input table: {input_table}",
    f"Run name: {run_name}",
    sep="\n",
)
if preset:
    print(f"Preset: {preset.name} ({preset.rows:,} rows, {preset.total_features} features)")

## Environment Validation

In [None]:
from src.validate_env import validate_environment
validate_environment(
    track="gpu-scaling",
    expected_workers=0,  # Single node with GPU
    require_gpu=True,
)

# Also check GPU availability and capture GPU info for later logging.
# Sets the globals gpu_name (str) and gpu_mem_total (float, MiB).
import subprocess

_NO_GPU_MSG = "No GPU detected! Use GPU ML Runtime (17.3.x-gpu-ml-scala2.13)"
try:
    result = subprocess.run(
        ["nvidia-smi", "--query-gpu=name,memory.total,memory.free", "--format=csv,noheader"],
        capture_output=True, text=True, timeout=10
    )
except (FileNotFoundError, subprocess.TimeoutExpired) as exc:
    # A missing nvidia-smi binary or a hung driver should surface with the
    # same actionable message as a non-zero exit code.
    raise RuntimeError(_NO_GPU_MSG) from exc

# With --format=csv,noheader nvidia-smi prints one CSV row per GPU
_gpu_rows = [row for row in result.stdout.splitlines() if row.strip()]
if result.returncode != 0 or not _gpu_rows:
    raise RuntimeError(_NO_GPU_MSG)

print(f"GPU(s) detected:\n{result.stdout}")

# Report the GPU selected by the gpu_id widget rather than always the first row.
# NOTE(review): assumes nvidia-smi row order matches the device ordinal XGBoost
# uses — confirm on hosts with mixed GPU models.
_gpu_index = int(gpu_id)
if not 0 <= _gpu_index < len(_gpu_rows):
    print(
        f"WARNING: gpu_id={_gpu_index} is not among the {len(_gpu_rows)} GPU(s) "
        "reported by nvidia-smi; reporting GPU 0 instead"
    )
    _gpu_index = 0

_gpu_fields = [field.strip() for field in _gpu_rows[_gpu_index].split(",")]
gpu_name = _gpu_fields[0]
gpu_mem_total = float(_gpu_fields[1].split()[0])  # MiB
print(f"GPU name: {gpu_name}")
print(f"GPU memory: {gpu_mem_total/1024:.1f} GB")

## MLflow Setup

In [None]:
# Configure MLflow tracking for this notebook: switch on system-metrics
# logging, then select a per-user experiment.
import os

# Enable system metrics logging BEFORE importing mlflow
os.environ["MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING"] = "true"

import mlflow

# Also call the enable function after import
mlflow.enable_system_metrics_logging()

# Get current user for experiment path
# (collect() returns the result rows; [0][0] takes the first column of the first row)
user_email = spark.sql("SELECT current_user()").collect()[0][0]
experiment_path = f"/Users/{user_email}/xgb_scaling_benchmark"

# Set experiment
mlflow.set_experiment(experiment_path)
print(f"MLflow experiment: {experiment_path}")
# Echo the environment flag so the cell output shows what was actually set
print(f"System metrics logging enabled: {os.environ.get('MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING')}")

## Load Data

In [None]:
# --- Memory Check ---
# Compare a rough size estimate for the dataset with the RAM currently
# available. This only warns (and records the warning); it never aborts.
import psutil

if not preset:
    # Without a preset there are no row/feature counts to estimate from
    print("Memory check skipped: using table_name override (no preset dimensions available)")
else:
    estimated_gb = (preset.rows * preset.total_features * 8) / 1e9  # 8 bytes per float64
    available_gb = psutil.virtual_memory().available / 1e9
    print(f"Estimated data size: {estimated_gb:.1f} GB")
    print(f"Available RAM: {available_gb:.1f} GB")
    if not (estimated_gb > available_gb * 0.8):
        print(f"Memory check OK: {estimated_gb:.1f} GB < {available_gb * 0.8:.1f} GB (80% threshold)")
    else:
        # Over the 80% threshold: print the warning and record it for the exit cell
        msg = f"WARNING: Estimated data size {estimated_gb:.1f} GB exceeds 80% of available RAM {available_gb:.1f} GB"
        print(msg)
        log_error(msg)

In [None]:
import time

# Load the input Delta table into pandas, time the load, sanity-check the
# expected GPU memory footprint, and register the frame as an MLflow dataset.
print(f"Loading data from: {input_table}")
load_start = time.time()

# Read from Delta table and convert to pandas
df_spark = spark.table(input_table)
df = df_spark.toPandas()

load_time = time.time() - load_start

# memory_usage(deep=True) scans the whole frame, so measure once and reuse
# the value for both the report and the GPU estimate.
raw_data_gb = df.memory_usage(deep=True).sum() / 1e9
print(f"Loaded {len(df):,} rows x {len(df.columns)} columns in {load_time:.1f}s")
print(f"Memory usage: {raw_data_gb:.2f} GB")

# Estimate GPU memory needed for DMatrix
# XGBoost GPU hist needs roughly 4-8x the raw data size for histograms
estimated_gpu_gb = raw_data_gb * 6  # Conservative estimate
# gpu_mem_total is in MiB, so /1024 yields GiB.
# NOTE(review): raw_data_gb uses 1e9 bytes per GB, so the comparison below
# mixes GB and GiB (about 7% apart) — acceptable for a rough warning; confirm.
gpu_mem_gb = gpu_mem_total / 1024
print(f"Raw data: {raw_data_gb:.2f} GB")
print(f"Estimated GPU memory needed: {estimated_gpu_gb:.2f} GB")
if estimated_gpu_gb > gpu_mem_gb:
    print(f"WARNING: Estimated GPU memory {estimated_gpu_gb:.1f} GB may exceed GPU memory {gpu_mem_gb:.1f} GB")
    print("Consider using external_memory mode or a larger GPU")
else:
    print(f"GPU memory check OK: {estimated_gpu_gb:.1f} GB < {gpu_mem_gb:.1f} GB")

# Create MLflow dataset from pandas (avoids Spark credential scope issues)
mlflow_dataset = mlflow.data.from_pandas(
    df,
    source=input_table,  # Reference source table
    name=data_size_label,
    targets="label",
)
print(f"MLflow dataset created: {mlflow_dataset.name} (source: {input_table})")

## Prepare Features and Labels

In [None]:
from sklearn.model_selection import train_test_split

# Split the loaded frame into features/label, report the class balance,
# derive scale_pos_weight, and make a stratified 80/20 train/test split.

# Separate features and label
X = df.drop(columns=["label"])
y = df["label"]

# Class distribution
class_counts = y.value_counts().sort_index()

# Fail early with a readable message when either class is absent or empty;
# otherwise the lookups below raise a bare KeyError or divide by zero.
_missing = [c for c in (0, 1) if c not in class_counts.index or class_counts[c] == 0]
if _missing:
    raise ValueError(
        f"Column 'label' must contain both classes 0 and 1; "
        f"missing or empty: {_missing} (observed counts: {class_counts.to_dict()})"
    )

minority_ratio = class_counts[1] / len(y)
print(f"Class distribution:")
print(f"  Class 0 (majority): {class_counts[0]:,} ({class_counts[0]/len(y)*100:.2f}%)")
print(f"  Class 1 (minority): {class_counts[1]:,} ({class_counts[1]/len(y)*100:.2f}%)")

# Calculate scale_pos_weight for imbalance (negatives per positive)
scale_pos_weight = class_counts[0] / class_counts[1]
print(f"\nscale_pos_weight: {scale_pos_weight:.2f}")

# Stratified train/test split keeps the class ratio the same in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print(f"\nTrain set: {len(X_train):,} rows")
print(f"Test set: {len(X_test):,} rows")
print(f"Train minority: {y_train.sum():,} ({y_train.mean()*100:.2f}%)")
print(f"Test minority: {y_test.sum():,} ({y_test.mean()*100:.2f}%)")

## XGBoost GPU Training

In [None]:
import xgboost as xgb
import os

# GPU selection depends on the installed XGBoost version:
#   >= 2.0: tree_method="hist" plus device="cuda:<ordinal>"
#           ("gpu_hist" and "gpu_id" are deprecated there)
#   <  2.0: only the legacy tree_method="gpu_hist" / gpu_id pair exists
# Both forms request histogram-based training on the GPU given by gpu_id.
_xgb_major = int(xgb.__version__.split(".")[0])
if _xgb_major >= 2:
    _gpu_params = {"tree_method": "hist", "device": f"cuda:{int(gpu_id)}"}
else:
    _gpu_params = {"tree_method": "gpu_hist", "gpu_id": int(gpu_id)}

# XGBoost hyperparameters - GPU accelerated with histogram method
xgb_params = {
    "objective": "binary:logistic",
    **_gpu_params,
    "n_estimators": 100,
    "max_depth": 6,
    "learning_rate": 0.1,
    "scale_pos_weight": scale_pos_weight,  # negatives/positives ratio from the data-prep cell
    "random_state": 42,
    "verbosity": 1,
}

print("XGBoost GPU parameters:")
for k, v in xgb_params.items():
    print(f"  {k}: {v}")

In [None]:
# Start MLflow run and train
# Everything below runs inside one MLflow run: log inputs and parameters,
# fit the classifier, predict on the held-out split, compute and log metrics.
# run_id, train_time, total_time and auc_pr are read again by the exit cell.
with mlflow.start_run(run_name=run_name, log_system_metrics=True) as run:
    run_id = run.info.run_id
    print(f"MLflow run ID: {run_id}")
    print(f"MLflow run name: {run_name}")

    # Log input dataset
    mlflow.log_input(mlflow_dataset, context="training")
    print(f"Logged input dataset: {input_table}")

    # Log parameters
    # data_size is the raw widget value, even when a table_name override is in use
    mlflow.log_param("data_size", data_size)
    mlflow.log_param("node_type", node_type)
    mlflow.log_param("run_mode", run_mode)
    mlflow.log_param("input_table", input_table)
    mlflow.log_param("n_rows", len(df))
    mlflow.log_param("n_features", X.shape[1])
    mlflow.log_param("minority_ratio", round(minority_ratio, 4))
    mlflow.log_param("train_size", len(X_train))
    mlflow.log_param("test_size", len(X_test))

    # Log GPU-specific parameters
    mlflow.log_param("training_mode", "gpu_single")
    mlflow.log_param("gpu_type", gpu_name)
    # gpu_mem_total is in MiB; /1024 converts it for the logged value
    mlflow.log_param("gpu_memory_gb", round(gpu_mem_total / 1024, 1))
    mlflow.log_param("tree_method", "gpu_hist")
    mlflow.log_param("gpu_id", int(gpu_id))

    # Log XGBoost params
    # The xgb_ prefix keeps these keys distinct from the parameters logged above
    for k, v in xgb_params.items():
        mlflow.log_param(f"xgb_{k}", v)

    # Log data load time
    mlflow.log_metric("data_load_time_sec", load_time)

    # Train model
    print("\nTraining XGBoost with GPU (tree_method=gpu_hist)...")
    train_start = time.time()

    model = xgb.XGBClassifier(**xgb_params)
    model.fit(X_train, y_train)

    # Wall-clock time for constructing and fitting the classifier
    train_time = time.time() - train_start
    print(f"Training completed in {train_time:.1f}s")

    mlflow.log_metric("train_time_sec", train_time)

    # Predictions
    print("\nGenerating predictions...")
    pred_start = time.time()

    y_pred = model.predict(X_test)
    # Column index 1 of predict_proba — presumably P(label == 1); confirm class order
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # Covers both the predict and the predict_proba call
    pred_time = time.time() - pred_start
    mlflow.log_metric("predict_time_sec", pred_time)

    # Evaluation
    print("\nEvaluating...")
    from sklearn.metrics import (
        average_precision_score,
        roc_auc_score,
        f1_score,
        precision_score,
        recall_score,
        classification_report,
        confusion_matrix,
    )

    # Metrics
    # Score-based metrics use the probabilities; F1/precision/recall use the hard labels
    auc_pr = average_precision_score(y_test, y_pred_proba)
    auc_roc = roc_auc_score(y_test, y_pred_proba)
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    print(f"\nResults:")
    print(f"  AUC-PR (primary): {auc_pr:.4f}")
    print(f"  AUC-ROC: {auc_roc:.4f}")
    print(f"  F1: {f1:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall: {recall:.4f}")

    # Log metrics
    mlflow.log_metric("auc_pr", auc_pr)
    mlflow.log_metric("auc_roc", auc_roc)
    mlflow.log_metric("f1", f1)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)

    # Confusion matrix
    # Read below as [0,0]=TN, [0,1]=FP, [1,0]=FN, [1,1]=TP
    cm = confusion_matrix(y_test, y_pred)
    print(f"\nConfusion Matrix:")
    print(f"  TN: {cm[0,0]:,}  FP: {cm[0,1]:,}")
    print(f"  FN: {cm[1,0]:,}  TP: {cm[1,1]:,}")

    mlflow.log_metric("true_negatives", cm[0, 0])
    mlflow.log_metric("false_positives", cm[0, 1])
    mlflow.log_metric("false_negatives", cm[1, 0])
    mlflow.log_metric("true_positives", cm[1, 1])

    # Classification report
    print(f"\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Log GPU memory estimate
    # Both values were computed in the data-loading cell
    mlflow.log_metric("raw_data_gb", round(raw_data_gb, 2))
    mlflow.log_metric("estimated_gpu_mem_gb", round(estimated_gpu_gb, 2))

    # Total time
    # Sum of load + train + predict only; metric computation is not included
    total_time = load_time + train_time + pred_time
    mlflow.log_metric("total_time_sec", total_time)

    print(f"\n" + "=" * 50)
    print(f"Run complete: {run_name}")
    print(f"Total time: {total_time:.1f}s")
    print(f"MLflow run ID: {run_id}")
    print(f"=" * 50)

## Shutdown

In [None]:
import json

# Build the summary handed back to the caller. The status is "errors" when
# anything was recorded through log_error, and "ok" otherwise.
result = dict(
    status="errors" if _notebook_errors else "ok",
    run_name=run_name,
    run_id=run_id,
    data_size=data_size,
    node_type=node_type,
    gpu_type=gpu_name,
    gpu_id=int(gpu_id),
    tree_method="gpu_hist",
    n_rows=len(df),
    auc_pr=round(auc_pr, 4),
    train_time_sec=round(train_time, 1),
    total_time_sec=round(total_time, 1),
)

# Attach the recorded error entries only when there are any
if _notebook_errors:
    result.update(errors=_notebook_errors)

result_json = json.dumps(result)
print(f"\nNotebook result: {result_json}")

# End the notebook, passing the JSON summary as its exit value
dbutils.notebook.exit(result_json)