# Entrenamiento de modelos: Industrial Fault Detection

Este notebook documenta el proceso de entrenamiento y optimización de los modelos predeterminados que serán utilizados en la API de inferencia desplegada con BentoML.

In [37]:
import pandas as pd
from pathlib import Path
import joblib
import json

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier

from sklearn.metrics import accuracy_score
try:
    from xgboost import XGBClassifier
except ImportError:
    XGBClassifier = None


## Carga dataset

In [34]:
# Display settings: never truncate columns and render floats with 4 decimals.
for option_name, option_value in (
    ("display.max_columns", None),
    ("display.precision", 4),
):
    pd.set_option(option_name, option_value)

# Load the raw dataset and report its dimensions before previewing it.
DATA_PATH = "data/Industrial_fault_detection.csv"
df = pd.read_csv(DATA_PATH)

n_rows, n_cols = df.shape
print(f"Filas: {n_rows:,}  |  Columnas: {n_cols}")
df.head()

Filas: 1,000  |  Columnas: 37


Unnamed: 0,Temperature,Vibration,Pressure,Flow_Rate,Current,Voltage,FFT_Temp_0,FFT_Vib_0,FFT_Pres_0,FFT_Temp_1,FFT_Vib_1,FFT_Pres_1,FFT_Temp_2,FFT_Vib_2,FFT_Pres_2,FFT_Temp_3,FFT_Vib_3,FFT_Pres_3,FFT_Temp_4,FFT_Vib_4,FFT_Pres_4,FFT_Temp_5,FFT_Vib_5,FFT_Pres_5,FFT_Temp_6,FFT_Vib_6,FFT_Pres_6,FFT_Temp_7,FFT_Vib_7,FFT_Pres_7,FFT_Temp_8,FFT_Vib_8,FFT_Pres_8,FFT_Temp_9,FFT_Vib_9,FFT_Pres_9,Fault_Type
0,46.0061,2.0384,56.7758,6.1844,12.4095,215.7624,772.4031,32.4365,971.8053,3.7606,0.734,30.8746,8.7876,1.1573,4.3689,22.0885,1.29,33.9704,2.5275,0.4367,23.4286,2.2032,1.6654,25.8206,2.5275,0.4367,23.4286,22.0885,1.29,33.9704,8.7876,1.1573,4.3689,3.7606,0.734,30.8746,0
1,62.5292,2.5737,76.1598,8.2792,14.9064,215.4659,767.6024,32.3956,962.4815,8.3651,0.7246,27.2683,12.5535,1.1308,8.1044,19.4903,1.2831,41.3641,6.9389,0.4133,25.0722,7.0038,1.6244,35.1444,6.9389,0.4133,25.0722,19.4903,1.2831,41.3641,12.5535,1.1308,8.1044,8.3651,0.7246,27.2683,0
2,77.295,3.2435,92.3726,9.1728,15.0541,202.0436,765.9651,32.032,956.2995,9.5598,0.9344,30.1371,12.2918,0.8034,13.2624,21.109,0.9723,43.4171,5.4635,0.5342,20.1791,5.3665,1.988,28.9623,5.4635,0.5342,20.1791,21.109,0.9723,43.4171,12.2918,0.8034,13.2624,9.5598,0.9344,30.1371,0
3,76.5642,3.1429,94.1496,13.7754,16.4179,216.6991,763.9364,33.0399,956.5322,10.2519,0.1692,29.9199,10.3211,1.1399,13.2934,20.8637,0.6063,43.6489,6.5774,1.1686,19.948,7.3951,2.9958,28.7295,6.5774,1.1686,19.948,20.8637,0.6063,43.6489,10.3211,1.1399,13.2934,10.2519,0.1692,29.9199,0
4,78.2816,3.14,94.441,11.1131,10.8994,227.3283,746.7549,33.0187,950.2128,18.3131,0.1727,36.0651,14.4416,1.1491,7.1494,8.1633,0.6273,45.4618,17.8478,1.1783,14.7549,9.7864,3.0169,22.4101,17.8478,1.1783,14.7549,8.1633,0.6273,45.4618,14.4416,1.1491,7.1494,18.3131,0.1727,36.0651,3


In [24]:
# Name of the label column; every other column is used as a model feature.
TARGET_COL = "Fault_Type"

y = df[TARGET_COL]
X = df.drop(columns=[TARGET_COL])

print(f"X: {X.shape} | y: {y.shape}")


X: (1000, 36) | y: (1000,)


## Model training

In [None]:
RANDOM_STATE = 10
TEST_SIZE = 0.2

# Hold out TEST_SIZE of the rows, stratifying on the label so the train and
# test partitions keep the class proportions of `y`.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=RANDOM_STATE,
    test_size=TEST_SIZE,
)

print(f"Train: {X_train.shape} Test: {X_test.shape}")


Train: (800, 36) Test: (200, 36)


In [None]:
# Search configuration shared by every candidate model.
RANDOM_STATE = 10  # seed passed to every estimator that accepts one
CV = 5             # number of cross-validation folds used by the grid search
N_JOBS = -1        # let the grid search use all available CPU cores

# Maps a short model name to an (estimator, param_grid) pair.
# Scale-sensitive models are wrapped in a Pipeline with a StandardScaler, so
# their grid keys carry the "clf__" prefix; tree ensembles are used directly.
models_and_grids = {
    "logreg": (
        Pipeline([
            ("scale", StandardScaler()),
            ("clf", LogisticRegression(max_iter=5000, random_state=RANDOM_STATE))
        ]),
        {
            "clf__C": [0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0],
            "clf__solver": ["lbfgs", "saga"],
        }
    ),

    "svm_rbf": (
        Pipeline([
            ("scale", StandardScaler()),
            ("clf", SVC())
        ]),
        {
            "clf__C": [0.1, 0.5, 1.0, 2.0, 5.0, 10.0],
            "clf__gamma": ["scale", "auto", 0.01, 0.05, 0.1, 0.2],
        }
    ),

    "knn": (
        Pipeline([
            ("scale", StandardScaler()),
            ("clf", KNeighborsClassifier())
        ]),
        {
            "clf__n_neighbors": list(range(3, 26, 2)),  # odd k from 3 to 25
            "clf__weights": ["uniform", "distance"],
            "clf__p": [1, 2],
        }
    ),

    "mlp": (
        Pipeline([
            ("scale", StandardScaler()),
            ("clf", MLPClassifier(max_iter=600, random_state=RANDOM_STATE))
        ]),
        {
            "clf__hidden_layer_sizes": [(50,), (100,), (100, 50), (200, 100)],
            "clf__activation": ["relu", "tanh"],
            "clf__alpha": [0.0001, 0.0005, 0.001, 0.005, 0.01],
        }
    ),

    "rf": (
        RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1),
        {
            "n_estimators": [200, 400, 800],
            "max_depth": [None, 10, 20],
            "max_features": ["sqrt", "log2"],
        }
    ),

    "gb": (
        GradientBoostingClassifier(random_state=RANDOM_STATE),
        {
            "n_estimators": [100, 200, 300],
            "learning_rate": [0.05, 0.1, 0.2],
            "max_depth": [2, 3, 4],
        }
    ),

    "hgb": (
        HistGradientBoostingClassifier(random_state=RANDOM_STATE),
        {
            "learning_rate": [0.05, 0.1, 0.2],
            "max_depth": [None, 3, 5],
            "max_leaf_nodes": [31, 63],
        }
    ),
}

# xgboost is an optional dependency: the import cell binds XGBClassifier to
# None when the package is missing. Calling it unconditionally would raise
# "TypeError: 'NoneType' object is not callable" and abort the whole cell, so
# the "xgb" candidate is only registered when the class is really available.
if XGBClassifier is not None:
    models_and_grids["xgb"] = (
        XGBClassifier(
            random_state=RANDOM_STATE,
            eval_metric="mlogloss",
            tree_method="hist",
        ),
        {
            "n_estimators": [200, 400],
            "learning_rate": [0.05, 0.1, 0.2],
            "max_depth": [3, 5, 7],
            "subsample": [0.8, 1.0],
            "colsample_bytree": [0.8, 1.0],
        }
    )
else:
    print("xgboost no está instalado: se omite el modelo 'xgb'.")

In [28]:
# Directory that receives one serialized best estimator per model.
out_dir = Path("artifacts/models_default")
out_dir.mkdir(parents=True, exist_ok=True)

results = []      # one summary record (dict) per model
best_models = {}  # model name -> best estimator found by its grid search

# Grid-search settings that are identical for every candidate.
search_options = dict(scoring="accuracy", cv=CV, n_jobs=N_JOBS, verbose=0)

for name, spec in models_and_grids.items():
    estimator, param_grid = spec
    print(f"\n=== GridSearch: {name} ===")

    # Exhaustive search over the model's grid using the training split only.
    grid = GridSearchCV(estimator=estimator, param_grid=param_grid, **search_options)
    grid.fit(X_train, y_train)

    best = grid.best_estimator_
    best_models[name] = best

    # Score the selected estimator once on the held-out test split.
    y_pred = best.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred)

    record = {
        "model": name,
        "cv_best_acc": float(grid.best_score_),
        "test_acc": float(test_acc),
        "best_params": grid.best_params_,
    }
    results.append(record)

    model_path = out_dir / f"{name}_best.joblib"
    joblib.dump(best, model_path)
    print(f"✅ {name}: cv={grid.best_score_:.4f} | test={test_acc:.4f}")



=== GridSearch: logreg ===
✅ logreg: cv=0.7250 | test=0.7250

=== GridSearch: svm_rbf ===
✅ svm_rbf: cv=0.7250 | test=0.7250

=== GridSearch: knn ===
✅ knn: cv=0.7250 | test=0.7250

=== GridSearch: mlp ===




✅ mlp: cv=0.6287 | test=0.5750

=== GridSearch: rf ===
✅ rf: cv=0.7250 | test=0.7250

=== GridSearch: gb ===
✅ gb: cv=0.7137 | test=0.7150

=== GridSearch: hgb ===
✅ hgb: cv=0.7163 | test=0.7200

=== GridSearch: xgb ===
✅ xgb: cv=0.7200 | test=0.7150


Los modelos se almacenan en formato `.joblib`, ya que es el método recomendado por scikit-learn para serializar pipelines completos de forma eficiente, reproducible y compatible con el despliegue posterior mediante BentoML.

In [30]:
# Tabulate the per-model records, highest held-out accuracy first.
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="test_acc", ascending=False)
results_df

Unnamed: 0,model,cv_best_acc,test_acc,best_params
0,logreg,0.725,0.725,"{'clf__C': 0.01, 'clf__solver': 'lbfgs'}"
1,svm_rbf,0.725,0.725,"{'clf__C': 0.1, 'clf__gamma': 'scale'}"
2,knn,0.725,0.725,"{'clf__n_neighbors': 15, 'clf__p': 1, 'clf__we..."
4,rf,0.725,0.725,"{'max_depth': 10, 'max_features': 'log2', 'n_e..."
6,hgb,0.7163,0.72,"{'learning_rate': 0.05, 'max_depth': 3, 'max_l..."
5,gb,0.7137,0.715,"{'learning_rate': 0.05, 'max_depth': 2, 'n_est..."
7,xgb,0.72,0.715,"{'colsample_bytree': 1.0, 'learning_rate': 0.0..."
3,mlp,0.6287,0.575,"{'clf__activation': 'tanh', 'clf__alpha': 0.00..."


Probamos modelos de distinta complejidad y observamos que casi todos (salvo el MLP) convergen a una accuracy similar, lo que indica que el límite lo marca el dataset.

In [None]:
# Persist the feature column names (in training order) for use by the API.
out_dir = Path("artifacts/models_default")
out_dir.mkdir(parents=True, exist_ok=True)

feature_names = list(X.columns)
features_path = out_dir / "feature_names.json"
features_json = json.dumps(feature_names, indent=2)
features_path.write_text(features_json, encoding="utf-8")

print(f"Guardado feature_names.json con {len(feature_names)} features")


Guardado feature_names.json con 36 features
