
# Memory Spike Predictor (PDF Processing)

Predict **peak memory usage (MB)** for PDF processing jobs *before* they run, using a scikit-learn Pipeline.
This notebook is structured so you can start with **synthetic data** and later **swap in real features**.

**What you'll do:**
1. Generate or load data (features → `peak_mem_mb` target)
2. Train a `Pipeline(OneHotEncoder + GradientBoostingRegressor)`
3. Evaluate with MAE, RMSE, R², and MAPE
4. Visualize residuals and predicted vs actual
5. Save the trained `pipeline.pkl`
6. Use `predict(features)` for inference
7. (Optional) Export to ONNX for JVM-native scoring


In [None]:

import platform
import sys

# Record the interpreter and OS so results can be tied to an environment.
print(f"Python: {sys.version}")
print(f"Platform: {platform.platform()}")


In [None]:

import os, json, math, pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [None]:

# Root directory for every artifact this notebook writes.
BASE_DIR = '/mnt/data/memory_spike_nb'
# Derive each artifact path from BASE_DIR (the same way the ONNX export path is
# built) so that changing BASE_DIR relocates all artifacts together.
PIPELINE_PATH = f"{BASE_DIR}/pipeline.pkl"
METRICS_PATH = f"{BASE_DIR}/metrics.json"
DATA_PATH = f"{BASE_DIR}/sample_data.csv"
print("Artifacts will be saved under:", BASE_DIR)



## 1) Data: synthesize or load

Start with realistic synthetic data. Later, replace `synthesize_data` or add a `load_data(path)`
function to bring in your real, sanitized dataset with the same schema.


In [None]:

def synthesize_data(n=7000, seed=7):
    """Generate a synthetic table of PDF job features with a peak-memory target.

    Args:
        n: Number of rows (jobs) to generate.
        seed: Seed for the NumPy random generator; the same ``(n, seed)``
            pair reproduces the same frame.

    Returns:
        A ``pandas.DataFrame`` with the feature columns ``size_mb``,
        ``pages``, ``image_page_ratio``, ``dpi_estimate``,
        ``avg_image_size_kb``, ``fonts_embedded_pct``, ``xref_error_count``,
        ``ocr_required`` and ``producer``, plus the target ``peak_mem_mb``
        (clipped to the range 150..12000).
    """
    gen = np.random.default_rng(seed)

    # Feature draws. Their order determines the random stream, so it must
    # stay fixed for a given seed to keep producing the same data.
    file_size = gen.lognormal(mean=1.3, sigma=0.7, size=n)
    page_count = gen.normal(30, 30, size=n).round().astype(int).clip(1, 1500)
    img_ratio = gen.beta(2.2, 3.5, size=n).clip(0, 1)
    dpi = gen.normal(220, 80, size=n).round().astype(int).clip(72, 600)
    img_kb = gen.lognormal(mean=4.8, sigma=0.6, size=n).clip(20, 5000)
    font_frac = gen.normal(0.75, 0.18, size=n).clip(0, 1)
    xref_errors = gen.poisson(0.06, size=n)
    needs_ocr = gen.integers(0, 2, size=n)

    producer_names = ["Adobe", "iText", "PDFBox", "Ghostscript", "Unknown", "Scanner"]
    producer_probs = [0.35, 0.12, 0.12, 0.08, 0.18, 0.15]
    tool = gen.choice(producer_names, size=n, p=producer_probs)

    # Noise-free memory signal: a constant baseline of 60 plus one additive
    # contribution per feature, accumulated in a fixed order.
    dpi_excess = np.maximum(dpi - 150, 0)  # only DPI above 150 contributes
    contributions = (
        22 * np.log1p(file_size),
        0.08 * page_count,
        900 * (img_ratio ** 1.7),
        0.006 * dpi_excess * page_count ** 0.3,
        0.03 * img_kb,
        240 * (1 - font_frac),
        65 * np.tanh(xref_errors),
        120 * needs_ocr * (img_ratio > 0.5),  # OCR adds cost only when image ratio > 0.5
    )
    latent = 60
    for term in contributions:
        latent = latent + term

    # Per-producer additive offset.
    bias_by_producer = {
        "Adobe": -40.0,
        "iText": -10.0,
        "PDFBox": 0.0,
        "Ghostscript": 20.0,
        "Unknown": 25.0,
        "Scanner": 70.0,
    }
    latent = latent + np.vectorize(bias_by_producer.get)(tool)

    # Heteroscedastic noise: the standard deviation grows with the signal.
    noise_scale = 60 + 0.6 * np.sqrt(np.maximum(latent, 1))
    noise = gen.normal(0, noise_scale, size=n)
    target = np.clip(latent + noise, 150, 12000)

    return pd.DataFrame({
        "size_mb": np.round(file_size, 3),
        "pages": page_count,
        "image_page_ratio": np.round(img_ratio, 3),
        "dpi_estimate": dpi,
        "avg_image_size_kb": np.round(img_kb, 1),
        "fonts_embedded_pct": np.round(font_frac, 3),
        "xref_error_count": xref_errors,
        "ocr_required": needs_ocr.astype(int),
        "producer": tool,
        "peak_mem_mb": np.round(target, 1),
    })

# Build the 7,000-row synthetic dataset with a fixed seed and preview the
# first five rows.
df = synthesize_data(7000, seed=7)
df.head(5)


In [None]:

# Write a reproducible 500-row sample of the dataset to DATA_PATH.
# to_csv does not create missing parent directories, so make sure the target
# directory exists first (skipped when DATA_PATH has no directory component).
_sample_dir = os.path.dirname(DATA_PATH)
if _sample_dir:
    os.makedirs(_sample_dir, exist_ok=True)
df.sample(500, random_state=1).to_csv(DATA_PATH, index=False)
DATA_PATH



## 2) Train a scikit-learn Pipeline

We use a `ColumnTransformer` to one-hot encode the categorical `producer` and keep numeric features as-is.
Then we train a `GradientBoostingRegressor`.


In [None]:

# Separate the target vector from the feature matrix.
y = df["peak_mem_mb"].to_numpy()
X = df.drop(columns=["peak_mem_mb"])

# Column groups: the single categorical column is one-hot encoded, while the
# numeric columns are passed through unchanged.
cat_features = ["producer"]
num_features = [
    "size_mb",
    "pages",
    "image_page_ratio",
    "dpi_estimate",
    "avg_image_size_kb",
    "fonts_embedded_pct",
    "xref_error_count",
    "ocr_required",
]

pre = ColumnTransformer(
    transformers=[
        # handle_unknown="ignore": categories unseen during fit do not raise at predict time.
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features),
        ("num", "passthrough", num_features),
    ]
)

pipe = Pipeline(
    steps=[
        ("pre", pre),
        ("gbr", GradientBoostingRegressor(random_state=0)),
    ]
)

# Hold out 20% of the rows for evaluation; fixed seeds keep the run repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
pipe.fit(X_train, y_train)

pipe



## 3) Evaluate
We report:
- **MAE** (mean absolute error)
- **RMSE** (root mean squared error)
- **R²** (coefficient of determination)
- **MAPE** (mean absolute percentage error, reported in %)


In [None]:

# Score the fitted pipeline on the held-out test split.
y_pred = pipe.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
# RMSE is computed as sqrt(MSE). The `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so taking the square root explicitly works on old and new versions alike.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
# MAPE in percent; the denominator is floored at 1e-6 to avoid division by zero.
mape = float(np.mean(np.abs((y_test - y_pred) / np.maximum(y_test, 1e-6))) * 100.0)
mae, rmse, r2, mape



## 4) Visualizations
Residuals vs predicted, and predicted vs actual (sorted).


In [None]:

# Residuals
residuals = y_test - y_pred
plt.figure()
plt.scatter(y_pred, residuals, s=6)
plt.axhline(0, linestyle="--")
plt.xlabel("Predicted peak MB"); plt.ylabel("Residual (actual - pred)")
plt.title("Residuals vs Prediction")
plt.show()


In [None]:

# Predicted vs Actual
idx = np.argsort(y_test)
plt.figure()
plt.plot(y_test[idx], label="Actual")
plt.plot(y_pred[idx], label="Predicted")
plt.xlabel("Sample (sorted by actual)"); plt.ylabel("Peak Memory (MB)")
plt.title("Predicted vs Actual (test set)")
plt.legend()
plt.show()



## 5) Save artifacts
Export the trained `pipeline.pkl` and a `metrics.json` next to it.


In [None]:

# open() does not create missing parent directories, so make sure each
# artifact's directory exists before writing (skipped for bare file names).
for _artifact_path in (PIPELINE_PATH, METRICS_PATH):
    _artifact_dir = os.path.dirname(_artifact_path)
    if _artifact_dir:
        os.makedirs(_artifact_dir, exist_ok=True)

# Serialize the fitted pipeline (preprocessing + model) as a single pickle.
with open(PIPELINE_PATH, "wb") as f:
    pickle.dump(pipe, f)

# Store the evaluation metrics next to the model; float() keeps the JSON
# payload to built-in Python floats.
metrics = {"mae": float(mae), "rmse": float(rmse), "r2": float(r2), "mape_pct": float(mape)}
with open(METRICS_PATH, "w", encoding="utf-8") as f:
    json.dump(metrics, f, indent=2)

PIPELINE_PATH, METRICS_PATH



## 6) Quick inference helper
`predict(features: dict)` → returns predicted peak MB.


In [None]:

def predict(features: dict) -> float:
    """Predict peak memory (MB) for a single job from a feature mapping.

    The pipeline is loaded from ``PIPELINE_PATH`` on every call.

    Args:
        features: Mapping of feature name to value for one job. It is turned
            into a one-row DataFrame; presumably it must carry the same
            columns the pipeline was fitted on.

    Returns:
        The pipeline's prediction for that single row, as a built-in float.

    Raises:
        FileNotFoundError: If no file exists at ``PIPELINE_PATH``.
    """
    import pickle

    import pandas as pd

    # SECURITY: unpickling executes arbitrary code. Only load a pipeline file
    # that this notebook (or another trusted process) produced.
    # The context manager closes the file handle even if unpickling fails.
    with open(PIPELINE_PATH, "rb") as fh:
        pipe_ = pickle.load(fh)
    return float(pipe_.predict(pd.DataFrame([features]))[0])

# Example: score one job described by a feature mapping.
example_features = {
    "size_mb": 18.0,
    "pages": 420,
    "image_page_ratio": 0.92,
    "dpi_estimate": 300,
    "avg_image_size_kb": 850.0,
    "fonts_embedded_pct": 0.35,
    "xref_error_count": 2,
    "ocr_required": 1,
    "producer": "Unknown",
}
predict(example_features)



## 7) (Optional) Export to ONNX
If `skl2onnx` is available, export the pipeline for JVM-native inference with ONNX Runtime.


In [None]:

# Best-effort ONNX export: any failure (including skl2onnx not being
# installed) is reported and the notebook carries on.
try:
    from skl2onnx import to_onnx
    from skl2onnx.common.data_types import FloatTensorType, StringTensorType, Int64TensorType

    # One graph input per column: (input name, tensor type), each declared
    # with shape [None, 1].
    column_types = [
        ("producer", StringTensorType),
        ("size_mb", FloatTensorType),
        ("pages", Int64TensorType),
        ("image_page_ratio", FloatTensorType),
        ("dpi_estimate", Int64TensorType),
        ("avg_image_size_kb", FloatTensorType),
        ("fonts_embedded_pct", FloatTensorType),
        ("xref_error_count", Int64TensorType),
        ("ocr_required", Int64TensorType),
    ]
    initial_types = [
        (column, tensor_type([None, 1])) for column, tensor_type in column_types
    ]

    onnx_model = to_onnx(pipe, initial_types=initial_types, target_opset=17)
    ONNX_PATH = f"{BASE_DIR}/pipeline.onnx"
    with open(ONNX_PATH, "wb") as f:
        f.write(onnx_model.SerializeToString())
    print("Exported:", ONNX_PATH)
except Exception as e:
    print("ONNX export skipped or failed:", e)



## 8) Deployment notes
- Load `pipeline.pkl` in a Python FastAPI sidecar and expose `/predict`
- Or export to ONNX for in-process Java inference with ONNX Runtime
- Keep a `thresholdMb` (e.g., 3500) in your Spring config to decide routing
