[Reference](https://medium.com/@Rohan_Dutt/10-stages-of-machine-learning-that-actually-move-the-accuracy-needle-not-the-toy-stuff-aac2c0e078c8)

# STAGE 1: Precision Data Sanitization (Treat Your Dataset Like Production Code)

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

# Load data
df = pd.read_csv("data.csv")

# 1. Outlier Removal (IQR)
# Quantiles and the fence comparison only make sense for numeric columns.
# Restricting to them keeps string/object columns from raising a TypeError
# (recent pandas releases no longer drop non-numeric columns silently).
numeric = df.select_dtypes(include="number")
Q1 = numeric.quantile(0.25)
Q3 = numeric.quantile(0.75)
IQR = Q3 - Q1
outlier_mask = (numeric < (Q1 - 1.5 * IQR)) | (numeric > (Q3 + 1.5 * IQR))
# Drop every row that has at least one value outside the 1.5*IQR fences.
# NaN compares False on both sides, so rows with missing values are kept
# here and handled by the imputation step below.
df = df[~outlier_mask.any(axis=1)]

# 2. Predictive Imputation
# Each column with gaps is predicted from all the other feature columns.
target = "target_column"
features = df.drop(columns=[target])
missing_cols = features.columns[features.isnull().any()]
for col in missing_cols:
    known = features[col].notnull()
    predictors = features.drop(columns=[col])
    # Other predictor columns may still contain gaps, and not every
    # scikit-learn release accepts NaN inputs for random forests. Median-fill
    # a temporary copy used only for fitting/predicting this column.
    predictors = predictors.fillna(predictors.median(numeric_only=True))
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(predictors[known], features.loc[known, col])
    # Boolean-mask assignment stays correct even if the index has duplicates.
    features.loc[~known, col] = model.predict(predictors[~known])

# 3. Feature Scaling
# NOTE(review): the scaler is fitted on the full feature table; if a
# train/validation split follows, fit it on the training part only.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# STAGE 2: Intelligence-Grade Feature Engineering

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from category_encoders import TargetEncoder

df = pd.read_csv("housing.csv")

# 1. Domain Transform
# log1p(price) = log(1 + price); stays finite when price is 0.
df["log_price"] = np.log1p(df["price"])

# 2. Interaction Feature
# A zero sqft_living would turn the ratio into +/-inf, which downstream
# scalers and most estimators reject. Map zeros to NaN so the row is marked
# as missing instead.
# NOTE(review): this feature is derived from price; if price is also the
# prediction target (it is used as the encoding target below), it leaks the
# label. Confirm the intended use.
df["price_per_sqft"] = df["price"] / df["sqft_living"].replace(0, np.nan)

# 3. Target Encoding (High Cardinality)
# Replaces each zipcode with a smoothed statistic of price for that zipcode.
# NOTE(review): fitted on the full table; fit on the training fold only to
# avoid target leakage.
encoder = TargetEncoder(cols=["zipcode"], smoothing=10)
df["zipcode_encoded"] = encoder.fit_transform(df["zipcode"], df["price"])

# 4. Polynomial Interactions (Controlled)
# interaction_only=True with degree=2 yields the two inputs plus their
# product (no squared terms); include_bias=False omits the constant column.
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
interaction_features = poly.fit_transform(df[["sqft_living", "bedrooms"]])

# STAGE 3: Strategic Data Splitting

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

df = pd.read_csv("healthcare.csv")
# Separate the label from the predictors; patient_id is dropped along with
# the label so it is not used as a feature.
y = df["target"]
X = df.drop(columns=["target", "patient_id"])

# 1. Stratified Split (Classification)
# 80/20 split; stratify=y keeps the class proportions the same in both parts.
split_options = {"test_size": 0.2, "stratify": y, "random_state": 42}
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

# 2. Time-Based Split (Forecasting)
# Splitter object only; it is not applied to any data in this cell.
tscv = TimeSeriesSplit(n_splits=5)

# 3. Leakage-Safe Pipeline
# The scaler is a pipeline step, so it is fitted only on the data passed to
# pipeline.fit (the training part) and merely applied at prediction time.
pipeline_steps = [
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(max_iter=1000)),
]
pipeline = Pipeline(pipeline_steps)
pipeline.fit(X_train, y_train)

# STAGE 4: Capital-Allocation Model Selection (Invest in Algorithms That Pay Off)

In [4]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

df = pd.read_csv("data.csv")
X = df.drop(columns=["target"])
y = df["target"]

# Candidate model families, from simplest to most flexible.
# The stochastic learners are seeded (random_state=42, as elsewhere in this
# file) so that re-running the comparison gives the same ranking.
models = {
    "Linear": LogisticRegression(max_iter=2000),
    "RandomForest": RandomForestClassifier(n_estimators=300, random_state=42),
    "XGBoost": XGBClassifier(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        eval_metric="logloss",
        random_state=42,
    ),
}

# Score every candidate with the same 5-fold cross-validation and report the
# mean ROC AUC across folds.
for name, model in models.items():
    score = cross_val_score(model, X, y, cv=5, scoring="roc_auc").mean()
    print(f"{name}: AUC={score:.3f}")

## AutoML Shortcut: Brute-Force the Search Space

In [5]:
from tpot import TPOTClassifier

# Search settings: 5 generations with a population of 40 candidate
# pipelines, scored by ROC AUC, with progress output and a fixed seed.
tpot_settings = {
    "generations": 5,
    "population_size": 40,
    "scoring": "roc_auc",
    "verbosity": 2,
    "random_state": 42,
}
tpot = TPOTClassifier(**tpot_settings)
# X and y come from the previous cell.
tpot.fit(X, y)

# STAGE 5: High-Velocity Hyperparameter Optimization

In [6]:
import optuna
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier

def objective(trial):
    """Score one sampled XGBoost configuration for the Optuna study.

    Draws five hyperparameters from ``trial``, builds an ``XGBClassifier``
    with them, and returns its mean ROC AUC over 3-fold cross-validation on
    the module-level ``X`` and ``y``.
    """
    n_estimators = trial.suggest_int("n_estimators", 100, 500)
    max_depth = trial.suggest_int("max_depth", 3, 10)
    # log=True samples the learning rate uniformly on a logarithmic scale.
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
    subsample = trial.suggest_float("subsample", 0.6, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.6, 1.0)

    candidate = XGBClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        eval_metric="logloss",
        random_state=42,
    )
    fold_scores = cross_val_score(candidate, X, y, cv=3, scoring="roc_auc")
    return fold_scores.mean()
# Seed the sampler so the sequence of suggested hyperparameters (and hence
# the reported best trial) is reproducible, matching the random_state=42
# used for the models themselves.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
# Run 50 trials; each one calls objective() and returns a mean ROC AUC.
study.optimize(objective, n_trials=50)
print("Best AUC:", study.best_value)
print("Best Params:", study.best_params)

# STAGE 6: Discipline-Driven Regularization

In [7]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# 1. L1 vs L2 (Linear Models)
# The L1 model uses the saga solver; the L2 model keeps the default solver.
# Smaller C means stronger regularization.
l1_model = LogisticRegression(C=0.5, penalty="l1", solver="saga")
l2_model = LogisticRegression(C=1.0, penalty="l2")

# 2. Tree Regularization
# Shallow trees plus explicit penalties on the leaf weights.
xgb_regularization = {
    "max_depth": 4,
    "reg_alpha": 0.1,   # L1
    "reg_lambda": 1.5,  # L2
}
xgb_model = XGBClassifier(**xgb_regularization)

# 3. Dropout + Early Stopping
# Two hidden layers, each followed by dropout, and a sigmoid output unit.
nn = Sequential()
nn.add(Dense(128, activation="relu"))
nn.add(Dropout(0.5))
nn.add(Dense(64, activation="relu"))
nn.add(Dropout(0.3))
nn.add(Dense(1, activation="sigmoid"))
nn.compile(loss="binary_crossentropy", optimizer="adam")

# Callback only; it takes effect when passed to nn.fit together with
# validation data, which this cell does not do.
early_stop = EarlyStopping(
    restore_best_weights=True,
    patience=5,
    monitor="val_loss",
)

# STAGE 7: Portfolio-Grade Ensembling (Diversify Models, Control Risk)

In [8]:
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# Level-0 learners: one linear model and two tree ensembles.
# The stochastic learners are seeded (random_state=42, as elsewhere in this
# file) so the fitted stack is reproducible between runs.
base_models = [
    ("linear", LogisticRegression(max_iter=2000)),
    ("rf", RandomForestClassifier(
        n_estimators=300,
        max_depth=8,
        random_state=42,
    )),
    ("xgb", XGBClassifier(
        n_estimators=300,
        max_depth=5,
        learning_rate=0.05,
        subsample=0.8,
        eval_metric="logloss",
        random_state=42,
    )),
]

# The logistic-regression meta-learner is trained on out-of-fold predictions
# of the base models (cv=5), so it never sees predictions made on rows a
# base model was fitted on.
stack = StackingClassifier(
    estimators=base_models,
    final_estimator=LogisticRegression(),
    cv=5
)
# X_train / y_train come from the Stage 3 split.
stack.fit(X_train, y_train)

# STAGE 8: Forensic Model Interpretability (Trust, but Verify)

In [9]:
import shap
import lime
import lime.lime_tabular
from sklearn.inspection import PartialDependenceDisplay


# 1. SHAP (Tree-Based Models)
# xgb_model is only constructed in Stage 6 and never trained; both the tree
# explainer and the partial-dependence plot below need a fitted model.
xgb_model.fit(X_train, y_train)
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer.shap_values(X_val)
shap.summary_plot(shap_values, X_val)

# 2. LIME (Local Explanation)
lime_explainer = lime.lime_tabular.LimeTabularExplainer(
    training_data=X_train.values,
    feature_names=list(X_train.columns),
    class_names=["No", "Yes"],
    mode="classification"
)


def _stack_predict_proba(rows):
    """Score LIME's perturbed ndarray rows with the stacked model.

    The stack was fitted on a DataFrame, so the column names are restored
    before predicting to keep the feature-name checks of the underlying
    estimators satisfied.
    """
    return stack.predict_proba(pd.DataFrame(rows, columns=X_train.columns))


# Explain the first validation row.
exp = lime_explainer.explain_instance(
    X_val.iloc[0].values,
    _stack_predict_proba
)

# 3. Partial Dependence Plot
# NOTE(review): assumes the feature table has "age" and "income" columns.
PartialDependenceDisplay.from_estimator(
    xgb_model,
    X_val,
    features=["age", "income"]
)

# STAGE 9: Model Monitoring (Assume Failure, Detect Early)

In [10]:
import numpy as np
from scipy.stats import ks_2samp
from sklearn.metrics import roc_auc_score

# 1. Data Drift (KS Test)
def ks_drift(train_col, prod_col, alpha=0.05):
    """Report whether two samples of one feature differ in distribution.

    Runs a two-sample Kolmogorov-Smirnov test on the non-missing values of
    both inputs.

    Args:
        train_col: Numeric reference sample (array-like).
        prod_col: Numeric sample to compare against the reference.
        alpha: Significance level; drift is reported when the p-value is
            below it.

    Returns:
        True when drift is detected, otherwise False.

    Raises:
        ValueError: If either sample has no non-missing values.
    """
    train = np.asarray(train_col, dtype=float)
    prod = np.asarray(prod_col, dtype=float)
    # Missing values would make the p-value NaN, and ``NaN < alpha`` is
    # False, so drift would silently go unreported. Test observed values only.
    train = train[~np.isnan(train)]
    prod = prod[~np.isnan(prod)]
    if train.size == 0 or prod.size == 0:
        raise ValueError("ks_drift needs at least one non-missing value in each sample")
    _, p_value = ks_2samp(train, prod)
    return bool(p_value < alpha)  # True = drift detected
# NOTE(review): X_prod, y_prod, baseline_auc, old_model and new_model are
# not defined anywhere in this file; they are presumably supplied by the
# serving environment. Confirm before running.
drift_detected = ks_drift(
    X_train["age"],
    X_prod["age"]
)

# 2. Concept Drift (Metric Tracking)
# NOTE(review): `model` is whichever estimator was last bound to that name
# in an earlier cell. Confirm it is the fitted production classifier.
prod_auc = roc_auc_score(y_prod, model.predict_proba(X_prod)[:, 1])
# Compare vs. historical baseline
# `alert` is always bound (None when the metric is within tolerance) so that
# later code can read it without risking a NameError.
alert = None
if prod_auc < baseline_auc - 0.05:
    alert = "Concept drift suspected"

# 3. Shadow Deployment Check
# Mean absolute difference between the two models' positive-class
# probabilities on the same production rows.
old_preds = old_model.predict_proba(X_prod)[:, 1]
new_preds = new_model.predict_proba(X_prod)[:, 1]
delta = np.mean(np.abs(new_preds - old_preds))

# STAGE 10: Deployment (Ship Models That Survive Contact)

In [11]:
# 1. Convert Model to ONNX
import skl2onnx
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

# Declare a single float input named "input" with a free batch dimension and
# one column per feature of X.
n_features = X.shape[1]
input_spec = FloatTensorType([None, n_features])
initial_type = [("input", input_spec)]

# NOTE(review): `model` is whichever estimator was last bound to that name
# in an earlier cell. Confirm it is the fitted model meant for export.
onnx_model = convert_sklearn(model, initial_types=initial_type)

# Serialize the converted graph to disk for the inference server.
serialized_model = onnx_model.SerializeToString()
with open("model.onnx", "wb") as onnx_file:
    onnx_file.write(serialized_model)

# 2. FastAPI Inference Server
from fastapi import FastAPI
import onnxruntime as ort
import numpy as np

app = FastAPI()
# Load the exported model once at import time and reuse it for all requests.
session = ort.InferenceSession("model.onnx")


@app.post("/predict")
def predict(features: list[float]):
    """Score one feature vector with the ONNX model.

    The request body is a JSON array of numbers. Typing it as ``list[float]``
    lets FastAPI reject non-numeric payloads with a validation error instead
    of failing inside the float32 conversion below.

    Returns:
        A dict with the first element of the model's first output under the
        key ``"prediction"``.
    """
    # Wrap in a list to form a batch of one row; "input" is the tensor name
    # declared when the model was converted.
    inputs = np.array([features], dtype=np.float32)
    preds = session.run(None, {"input": inputs})
    return {"prediction": float(preds[0][0])}

# 3. Dockerfile
FROM python:3.10-slim
# Copy the build context (server code and model.onnx) into the image.
COPY . /app
WORKDIR /app
# --no-cache-dir keeps pip's download cache out of the image layer.
# numpy is listed explicitly because the server imports it directly.
RUN pip install --no-cache-dir fastapi uvicorn onnxruntime numpy
# Serve the `app` object from module `app` on all interfaces, port 8080.
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8080"]