In [None]:
from google.colab import files

In [None]:
import pandas as pd
import numpy as np
import joblib

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    classification_report,
    precision_recall_curve,
)

# Raw input columns the notebook expected BEFORE feature engineering.
RAW_FEATURES = [
    "id",          # may or may not be present
    "age",         # in days
    "gender",
    "height",
    "weight",
    "ap_hi",
    "ap_lo",
    "cholesterol",
    "gluc",
    "smoke",
    "alco",
    "active",
]


class BasicValidator(BaseEstimator, TransformerMixin):
    """Validate the raw input frame before any cleaning or feature engineering.

    What it does:
    - Checks that the required columns are present.
    - Drops the ``id`` column when it is present and ``drop_id`` is true.
    - Raises if any null value is found: the pipeline assumes complete data,
      so an incomplete payload should fail loudly.
    - Reports duplicated rows without removing them.

    Parameters
    ----------
    required_cols : sequence of str or None
        Column names that must be present. ``None`` disables the check.
    drop_id : bool
        When true, ``id`` is removed from the output if present. Because the
        column is discarded anyway, it is then treated as optional even if it
        is listed in ``required_cols``.
    """

    def __init__(self, required_cols=None, drop_id=True):
        self.required_cols = required_cols
        self.drop_id = drop_id

    def fit(self, X, y=None):
        """Check the columns of ``X``; nothing is learned. Returns ``self``."""
        # Validation only reads column names, so no copy of the data is needed.
        self._check_columns(X)
        return self

    def transform(self, X):
        """Return a validated copy of ``X`` (without ``id`` when dropped).

        Raises
        ------
        ValueError
            If a required column is missing or any value is null.
        """
        self._check_columns(X)

        # ``drop`` already returns a new frame; copy explicitly otherwise so
        # the caller's frame is never handed back as-is.
        if self.drop_id and "id" in X.columns:
            X = X.drop(columns=["id"])
        else:
            X = X.copy()

        # Null check: fail instead of imputing silently.
        null_mask = X.isnull().any()
        if null_mask.any():
            null_cols = X.columns[null_mask].tolist()
            raise ValueError(
                f"Se encontraron valores nulos en columnas: {null_cols}. "
                "El pipeline asume datos completos como en el entrenamiento."
            )

        # Duplicate check: rows are kept, we only report them.
        dups = X.duplicated().sum()
        if dups > 0:
            # In production this print can be swapped for a logger call.
            print(f"[AVISO] Se encontraron {dups} filas duplicadas en la entrada.")

        return X

    def _check_columns(self, X):
        """Raise ``ValueError`` listing the required columns absent from ``X``."""
        if self.required_cols is None:
            return

        required = list(self.required_cols)
        if self.drop_id:
            # ``id`` is dropped in transform, so its absence must not be fatal.
            required = [c for c in required if c != "id"]

        present = set(X.columns)
        missing = [c for c in required if c not in present]
        if missing:
            raise ValueError(
                f"Faltan columnas requeridas para el modelo: {missing}. "
                f"Se recibieron: {list(X.columns)}"
            )


class ClinicalNormalizer(BaseEstimator, TransformerMixin):
    """Clip clinical measurements to plausible ranges instead of dropping rows.

    Default ranges (the ones used in the original notebook):
    - height in [120, 220]
    - weight in [40, 200]
    - ap_hi in [80, 250]
    - ap_lo in [40, 140]
    - ap_hi >= ap_lo (values are swapped when they arrive inverted)

    The notebook removed out-of-range rows; here they are clipped so the model
    can still produce a prediction for every input row.

    Parameters
    ----------
    height_min, height_max : number
        Bounds applied to the ``height`` column.
    weight_min, weight_max : number
        Bounds applied to the ``weight`` column.
    ap_hi_min, ap_hi_max : number
        Bounds applied to the ``ap_hi`` column.
    ap_lo_min, ap_lo_max : number
        Bounds applied to the ``ap_lo`` column.
    """

    def __init__(
        self,
        height_min=120,
        height_max=220,
        weight_min=40,
        weight_max=200,
        ap_hi_min=80,
        ap_hi_max=250,
        ap_lo_min=40,
        ap_lo_max=140,
    ):
        # Each bound is stored under the same name as its parameter so that
        # get_params / set_params / clone can see and reproduce it.
        self.height_min = height_min
        self.height_max = height_max
        self.weight_min = weight_min
        self.weight_max = weight_max
        self.ap_hi_min = ap_hi_min
        self.ap_hi_max = ap_hi_max
        self.ap_lo_min = ap_lo_min
        self.ap_lo_max = ap_lo_max

    def fit(self, X, y=None):
        """Nothing is learned from the data. Returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the four measurements clipped.

        After clipping, rows where ``ap_hi < ap_lo`` get the two values
        swapped.
        """
        X = X.copy()

        # Height
        X["height"] = X["height"].clip(self.height_min, self.height_max)

        # Weight
        X["weight"] = X["weight"].clip(self.weight_min, self.weight_max)

        # Blood pressures
        X["ap_hi"] = X["ap_hi"].clip(self.ap_hi_min, self.ap_hi_max)
        X["ap_lo"] = X["ap_lo"].clip(self.ap_lo_min, self.ap_lo_max)

        # Ensure ap_hi >= ap_lo: inverted readings are swapped.
        mask = X["ap_hi"] < X["ap_lo"]
        if mask.any():
            # Keep the old ap_hi values before they are overwritten.
            tmp = X.loc[mask, "ap_hi"].copy()
            X.loc[mask, "ap_hi"] = X.loc[mask, "ap_lo"]
            X.loc[mask, "ap_lo"] = tmp

        return X


class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Derive the engineered columns used by the model.

    - ``age_years``: ``age`` divided by 365, rounded to one decimal.
    - ``age`` itself is removed.
    - ``bmi``: ``weight`` divided by the square of ``height / 100``.
    - ``ap_ratio``: ``ap_hi`` divided by ``ap_lo``.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data. Returns ``self``."""
        return self

    def transform(self, X):
        """Return a new frame with ``age`` replaced by the derived columns."""
        # Build the derived series first, reading only from the input frame.
        age_in_years = X["age"].div(365).round(1)
        body_mass_index = X["weight"].div(X["height"].div(100).pow(2))
        pressure_ratio = X["ap_hi"].div(X["ap_lo"])

        # drop() and assign() both return new frames, so the caller's
        # DataFrame is left untouched.
        without_age = X.drop(columns=["age"])
        return without_age.assign(
            age_years=age_in_years,
            bmi=body_mass_index,
            ap_ratio=pressure_ratio,
        )


class OutlierCapper(BaseEstimator, TransformerMixin):
    """Cap outliers with fixed limits.

    - ``bmi``: clipped to [12, 60]
    - ``ap_hi``: capped at 240
    - ``ap_lo``: capped at 130
    - ``ap_ratio``: clipped to [1.0, 3.5]
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data. Returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the four columns clipped."""
        # (lower, upper) per column; None means that side is left unbounded.
        limits = {
            "bmi": (12, 60),
            "ap_hi": (None, 240),
            "ap_lo": (None, 130),
            "ap_ratio": (1.0, 3.5),
        }

        capped = X.copy()
        for column, (low, high) in limits.items():
            capped[column] = capped[column].clip(lower=low, upper=high)
        return capped


def main(
    csv_path="/content/drive/MyDrive/Machine Learning/Proyecto Machine Learning/cardio_train.csv",
    output_path="/content/drive/MyDrive/Machine Learning/Proyecto Machine Learning/pipeline.pkl",
):
    """Train the full pipeline, choose an F1-optimal threshold and save both.

    Steps: load the ';'-separated CSV, split features/target (``cardio``),
    build the validation -> cleaning -> feature engineering -> capping ->
    scaling -> logistic regression pipeline, fit it on a stratified 80/20
    split, search the decision threshold that maximises F1 on the test split,
    print metrics, and dump ``{"pipeline", "threshold", ...}`` with joblib.

    Parameters
    ----------
    csv_path : str
        Location of the training CSV. It must contain the raw feature columns
        plus the ``cardio`` target.
        NOTE(review): the default assumes the cardio training file lives next
        to the saved artifact on Drive - confirm for your environment.
    output_path : str
        Where the artifact is written. The default is the path the example
        prediction code in this file loads from.

    Raises
    ------
    KeyError
        If the loaded file has no ``cardio`` column (typically a wrong file or
        a wrong separator).
    """
    # --------------------------------------------------------
    # 3.1 Load data
    # --------------------------------------------------------
    df = pd.read_csv(csv_path, sep=";")

    print("Shape original:", df.shape)
    print("\n=== INFO DEL DATASET ===")
    # DataFrame.info() prints by itself and returns None, so it is not wrapped
    # in print().
    df.info()
    print("\n=== NULOS ===")
    print(df.isnull().sum().sort_values(ascending=False))
    print("\n=== DUPLICADOS ===")
    print(df.duplicated().sum())

    # Fail early with an actionable message when the wrong file/separator was
    # used, instead of a bare KeyError further down.
    if "cardio" not in df.columns:
        raise KeyError(
            f"No se encontró la columna objetivo 'cardio' en {csv_path}. "
            f"Columnas leídas: {list(df.columns)}"
        )

    # --------------------------------------------------------
    # 3.2 Split X, y
    # --------------------------------------------------------
    y = df["cardio"].copy()
    X = df.drop(columns=["cardio"]).copy()

    # --------------------------------------------------------
    # 3.3 Numeric columns to scale
    # --------------------------------------------------------
    numeric_features = [
        "age_years",
        "height",
        "weight",
        "bmi",
        "ap_hi",
        "ap_lo",
        "ap_ratio",
    ]

    # Columns forwarded to the model unchanged.
    passthrough_features = [
        "gender",
        "cholesterol",
        "gluc",
        "smoke",
        "alco",
        "active",
    ]

    # --------------------------------------------------------
    # 3.4 Column preprocessor (scaling + passthrough)
    # --------------------------------------------------------
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", StandardScaler(), numeric_features),
            ("cat", "passthrough", passthrough_features),
        ],
        remainder="drop",
    )

    # --------------------------------------------------------
    # 3.5 Model
    # --------------------------------------------------------
    log_model = LogisticRegression(
        solver="liblinear",
        class_weight="balanced",
        max_iter=1000,
        random_state=42,
    )

    # --------------------------------------------------------
    # 3.6 Full pipeline
    # --------------------------------------------------------
    pipeline = Pipeline(
        steps=[
            ("validator", BasicValidator(required_cols=RAW_FEATURES)),
            ("clinical", ClinicalNormalizer()),
            ("features", FeatureEngineer()),
            ("outliers", OutlierCapper()),
            ("preprocess", preprocessor),
            ("model", log_model),
        ]
    )

    # --------------------------------------------------------
    # 3.7 Train / test split (80/20, stratified)
    # --------------------------------------------------------
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
        stratify=y,
    )

    # --------------------------------------------------------
    # 3.8 Fit
    # --------------------------------------------------------
    pipeline.fit(X_train, y_train)

    # --------------------------------------------------------
    # 3.9 Evaluate and find the F1-optimal threshold
    # --------------------------------------------------------
    y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

    precisions, recalls, thresholds = precision_recall_curve(y_test, y_pred_proba)
    # precision/recall have one more entry than thresholds: the final
    # (precision=1, recall=0) point has no threshold. Drop it so every index
    # is valid for `thresholds`.
    precisions = precisions[:-1]
    recalls = recalls[:-1]
    denom = precisions + recalls
    # Where precision + recall == 0, F1 is defined as 0 (avoids 0/0 -> nan).
    f1_scores = np.divide(
        2 * precisions * recalls,
        denom,
        out=np.zeros_like(denom, dtype=float),
        where=denom > 0,
    )
    best_idx = int(np.argmax(f1_scores))
    best_threshold = thresholds[best_idx]

    # Apply the threshold.
    y_pred_opt = (y_pred_proba >= best_threshold).astype(int)

    acc = accuracy_score(y_test, y_pred_opt)
    prec = precision_score(y_test, y_pred_opt)
    rec = recall_score(y_test, y_pred_opt)
    f1 = f1_score(y_test, y_pred_opt)
    auc = roc_auc_score(y_test, y_pred_proba)

    print("\n=== UMBRAL ÓPTIMO ENCONTRADO ===")
    print(f"Umbral óptimo       : {best_threshold:.3f}")
    print(f"Precision en umbral : {precisions[best_idx]:.3f}")
    print(f"Recall en umbral    : {recalls[best_idx]:.3f}")
    print(f"F1-score en umbral  : {f1_scores[best_idx]:.3f}")

    print("\n=== MÉTRICAS CON UMBRAL ÓPTIMO ===")
    print(f"Accuracy : {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall   : {rec:.4f}")
    print(f"F1-score : {f1:.4f}")
    print(f"ROC-AUC  : {auc:.4f}")

    print("\n=== REPORTE DE CLASIFICACIÓN ===")
    print(classification_report(y_test, y_pred_opt, digits=4))

    # --------------------------------------------------------
    # 3.10 Save to .pkl (pipeline + threshold)
    # --------------------------------------------------------
    artefacto = {
        "pipeline": pipeline,
        "threshold": float(best_threshold),
        "feature_order_info": {
            "numeric_features": numeric_features,
            "passthrough_features": passthrough_features,
        },
        "version": "1.0.0",
    }

    # Persist the artifact at the requested location.
    joblib.dump(artefacto, output_path)

    print(f"\n✅ Modelo guardado correctamente en: {output_path}")

In [None]:

# ============================================================
# 4. FUNCIÓN DE PREDICCIÓN (USO EN PRODUCCIÓN)
# ============================================================

def predict_from_raw_dict(list_of_patients, model_path="cardio_pipeline.pkl"):
    """Score raw patient records with the saved pipeline and threshold.

    Parameters
    ----------
    list_of_patients : list of dict
        Raw records exactly as the source system provides them, e.g.::

            [
                {
                    "id": 1,
                    "age": 18393,
                    "gender": 2,
                    "height": 168,
                    "weight": 62,
                    "ap_hi": 110,
                    "ap_lo": 80,
                    "cholesterol": 1,
                    "gluc": 1,
                    "smoke": 0,
                    "alco": 0,
                    "active": 1,
                },
                ...
            ]
    model_path : str
        Path of the joblib artifact: a dict holding at least the keys
        ``"pipeline"`` and ``"threshold"``.

    Returns
    -------
    list of dict
        One entry per input record, in order, with keys ``input``,
        ``proba_cardio``, ``pred_cardio`` and ``threshold_used``. An empty
        input yields an empty list.
    """
    # Nothing to score: skip loading the model and return an empty batch
    # rather than failing column validation on a column-less frame.
    if len(list_of_patients) == 0:
        return []

    # SECURITY NOTE: joblib.load unpickles the file and can execute arbitrary
    # code; only load artifacts from a trusted source.
    artefacto = joblib.load(model_path)
    pipeline = artefacto["pipeline"]
    threshold = artefacto["threshold"]

    # Build a DataFrame so the pipeline receives named columns.
    df_new = pd.DataFrame(list_of_patients)

    # Run the full pipeline and apply the stored decision threshold.
    proba = pipeline.predict_proba(df_new)[:, 1]
    preds = (proba >= threshold).astype(int)

    # Return plain Python types so the result is JSON-friendly.
    return [
        {
            "input": row,
            "proba_cardio": float(p),
            "pred_cardio": int(label),
            "threshold_used": float(threshold),
        }
        for row, p, label in zip(list_of_patients, proba, preds)
    ]


# ============================================================
# 5. ENTRY POINT
# ============================================================
if __name__ == "__main__":
    # Step 1: train the pipeline and persist the artifact.
    main()

    # Step 2: immediately score two raw (unpreprocessed) example patients.
    # "age" is expressed in days.
    ejemplo_pacientes = [
        dict(
            id=999, age=18393, gender=1, height=168, weight=62,
            ap_hi=110, ap_lo=80, cholesterol=1, gluc=1,
            smoke=0, alco=0, active=1,
        ),
        dict(
            id=1000, age=23200, gender=2, height=160, weight=90,
            ap_hi=150, ap_lo=100, cholesterol=3, gluc=1,
            smoke=0, alco=0, active=1,
        ),
    ]

    resultados = predict_from_raw_dict(
        ejemplo_pacientes,
        "/content/drive/MyDrive/Machine Learning/Proyecto Machine Learning/pipeline.pkl",
    )

    # Show one prediction record per line.
    print("\n=== PREDICCIONES DE PRUEBA ===")
    for r in resultados:
        print(r)

Shape original: (438557, 1)

=== INFO DEL DATASET ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 438557 entries, 0 to 438556
Data columns (total 1 columns):
 #   Column                                                                                                                                                                                                                                                         Non-Null Count   Dtype 
---  ------                                                                                                                                                                                                                                                         --------------   ----- 
 0   ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS  438557 n

KeyError: 'cardio'

In [None]:
import joblib
import pandas as pd

# 1) Load the saved artifact and unpack the fitted pipeline and its threshold.
artefacto = joblib.load("/content/drive/MyDrive/Machine Learning/Proyecto Machine Learning/pipeline.pkl")
pipeline, threshold = artefacto["pipeline"], artefacto["threshold"]

# 2) Example records exactly as the source system sends them ("age" in days).
nuevos = [
    dict(
        id=1, age=18393, gender=1, height=150, weight=62,
        ap_hi=110, ap_lo=80, cholesterol=1, gluc=1,
        smoke=0, alco=0, active=1,
    ),
    dict(
        id=2, age=19600, gender=2, height=160, weight=90,
        ap_hi=150, ap_lo=100, cholesterol=3, gluc=1,
        smoke=0, alco=0, active=1,
    ),
]

df_new = pd.DataFrame(nuevos)

# 3) Predict: positive-class probability, then apply the stored threshold.
proba = pipeline.predict_proba(df_new)[:, 1]
preds = (proba >= threshold).astype(int)

# One line per patient: id, rounded probability, predicted label.
for row, p, y in zip(nuevos, proba, preds):
    print(f'{row["id"]} → prob: {round(float(p), 3)} → pred: {int(y)}')


1 → prob: 0.221 → pred: 0
2 → prob: 0.927 → pred: 1


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
