# AutoML: Classification (XGBoost)

This template trains a binary classifier using XGBoost.

- Parameters are injected via Papermill (top cell tagged `parameters`).
- Data is loaded via `amprenta_rag.notebook.automl_helpers.load_dataset_as_dataframe()`.
- Model artifacts are registered via `amprenta_rag.ml.registry.MLModelRegistry`.



In [None]:
# Parameters (Papermill)
# NOTE: This cell should be tagged as "parameters" by Papermill.
# The values below are defaults; Papermill injects overrides for them by name,
# so the variable names must stay stable.

dataset_id = ""  # UUID string; passed to load_dataset_as_dataframe() and to the registry entry
target_column = "target"  # label column; must be present in the loaded DataFrame
feature_columns = []  # list[str]; empty means infer numeric columns
test_size = 0.2  # hold-out fraction; cast with float() and given to train_test_split
random_state = 42  # cast with int(); seeds both the train/test split and the classifier
model_name = "automl_classification_xgboost"  # name under which the trained model is registered



In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score

from amprenta_rag.notebook.automl_helpers import (
    generate_classification_report,
    load_dataset_as_dataframe,
    plot_confusion_matrix,
    register_trained_model,
)

# xgboost is a hard requirement for this template. Any failure raised while
# importing it (not only a missing package) is re-raised as ImportError with a
# short message, chained to the original exception so the root cause is kept.
try:
    import xgboost as xgb
except Exception as e:
    raise ImportError("xgboost is required for this template") from e



In [None]:
# Load the dataset, build the feature matrix and target vector, train the
# XGBoost classifier and compute hold-out metrics.
#
# Notebook globals defined here and used by later cells:
#   X, y, X_train, X_test, y_train, y_test, clf, proba, pred, metrics
df = load_dataset_as_dataframe(dataset_id)

if not target_column or target_column not in df.columns:
    raise ValueError(f"target_column '{target_column}' missing")

if feature_columns:
    # Report unknown columns with a readable message instead of a bare
    # pandas KeyError.
    missing_features = [c for c in feature_columns if c not in df.columns]
    if missing_features:
        raise ValueError(f"feature_columns not found in dataset: {missing_features}")

    # Never train on the label itself: the inferred branch below already
    # excludes the target, so the explicit branch does the same.
    selected_features = [c for c in feature_columns if c != target_column]
    if len(selected_features) != len(feature_columns):
        print(f"Ignoring target_column '{target_column}' listed in feature_columns")
    X = df[selected_features]
else:
    # Infer numeric features except target
    X = df.select_dtypes(include=["number"]).drop(columns=[target_column], errors="ignore")

if X.shape[1] == 0:
    raise ValueError("no feature columns available for training")

y = df[target_column]

# This template is binary-only: it reads proba[:, 1] and thresholds it at 0.5.
# Fail early, before spending time on training, when the target is not binary.
n_classes = y.nunique()
if n_classes != 2:
    raise ValueError(
        f"target_column '{target_column}' must have exactly 2 classes, found {n_classes}"
    )

# Stratified split keeps the class balance identical in train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=float(test_size),
    random_state=int(random_state),
    stratify=y,
)

clf = xgb.XGBClassifier(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    eval_metric="logloss",
    random_state=int(random_state),
)

clf.fit(X_train, y_train)

# Probability of the second class, thresholded at 0.5 to get hard labels.
# NOTE(review): this assumes the target is encoded as 0/1 so that `pred` is
# comparable with `y_test` -- confirm against the datasets used.
proba = clf.predict_proba(X_test)[:, 1]
pred = (proba >= 0.5).astype(int)

# Start from the helper's report and add AUC / accuracy as plain floats.
metrics = generate_classification_report(y_test, pred, y_proba=proba)
metrics["auc"] = float(roc_auc_score(y_test, proba))
metrics["accuracy"] = float(accuracy_score(y_test, pred))
metrics



In [None]:
# Confusion matrix
# Compares the hold-out labels (y_test) with the thresholded predictions (pred)
# computed in the training cell. The helper's return value is kept in `fig`
# (presumably a figure object -- confirm against automl_helpers) and left as
# the last expression so the notebook renders it.
fig = plot_confusion_matrix(y_test, pred)
fig



In [None]:
# Register model in MLModelRegistry (requires DB + migrations for ml_models)
# Registration is best-effort: any failure is reported and the notebook keeps
# going, so the template still runs where no registry is available.
# `reg_entry` is pre-set to None so it is always defined for later cells, even
# when registration fails.
reg_entry = None
try:
    reg_entry = register_trained_model(
        clf,
        name=model_name,
        metrics={k: float(v) for k, v in metrics.items()},
        dataset_id=dataset_id,
        model_type="automl_classification",
        framework="xgboost",
        features=list(X.columns),
        # Fall back to an empty dict if the estimator exposes no get_params().
        hyperparameters=getattr(clf, "get_params", lambda: {})(),
        description="AutoML classification template (XGBoost)",
    )
except Exception as e:
    print(f"Model registry not available in this environment: {e}")

# A notebook only echoes the last top-level expression of a cell; an expression
# nested inside `try` is never displayed, so the entry is shown from here.
# (None, the failure case, renders nothing.)
reg_entry

