In [11]:
import json
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

Cargar Data Funcion

In [12]:
RANDOM_STATE = 42
DATA_PATH = "./data/animal_disease_prediction_cleaned.csv"
OUT_DIR = Path("./outputs"); OUT_DIR.mkdir(parents=True, exist_ok=True)

In [13]:
def load_data():
    df = pd.read_csv(DATA_PATH)
    y = df["Disease_Prediction"].astype(str)
    X = df.drop(columns=["Disease_Prediction"])
    cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
    num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    return X, y, cat_cols, num_cols


Pipeline para trees Funcion

In [14]:
def preprocessor_for_trees(cat_cols, num_cols):
    # Árbol/Forest: Ordinal para categóricas (rápido y sin explosionar dimensiones); numéricas pasan tal cual
    return ColumnTransformer(
        transformers=[
            ("cat", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), cat_cols),
            ("num", "passthrough", num_cols),
        ],
        remainder="drop"
    )


Pipeline para SVM

In [15]:
def preprocessor_for_svm(cat_cols, num_cols):
    # SVM: One-Hot para categóricas y escalado en numéricas; salida es esparsa
    return ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore", sparse=True), cat_cols),
            ("num", StandardScaler(with_mean=False), num_cols),
        ],
        remainder="drop",
    )


Evaluacion y reporte Funcion 

In [16]:
def eval_and_report(name, y_true_val, y_pred_val, y_true_test, y_pred_test):
    rows = []
    for split, yt, yp in [("val", y_true_val, y_pred_val), ("test", y_true_test, y_pred_test)]:
        acc = accuracy_score(yt, yp)
        f1m = f1_score(yt, yp, average="macro")
        print(f"\n[{name}] {split.upper()} - accuracy={acc:.4f} macroF1={f1m:.4f}")
        print(classification_report(yt, yp, digits=4))
        print("Matriz de confusión:")
        print(confusion_matrix(yt, yp))
        rows.append({"model": name, "split": split, "accuracy": acc, "macro_f1": f1m})
    return rows

In [17]:
X, y, cat_cols, num_cols = load_data()

# Split 60/20/20
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.20, random_state=RANDOM_STATE, stratify=y
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=RANDOM_STATE, stratify=y_temp
)

models = {
    "DecisionTree": (
        preprocessor_for_trees(cat_cols, num_cols),
        DecisionTreeClassifier(max_depth=8, min_samples_leaf=10, class_weight="balanced", random_state=RANDOM_STATE)
    ),
    "RandomForest": (
        preprocessor_for_trees(cat_cols, num_cols),
        RandomForestClassifier(n_estimators=300, max_depth=None, n_jobs=-1, class_weight="balanced", random_state=RANDOM_STATE)
    ),
    "LinearSVM": (
        preprocessor_for_svm(cat_cols, num_cols),
        LinearSVC(C=1.0, class_weight="balanced", random_state=RANDOM_STATE)
    ),
}

all_rows = []
for name, (pre, clf) in models.items():
    pipe = Pipeline(steps=[("pre", pre), ("clf", clf)])
    pipe.fit(X_train, y_train)

    y_pred_val = pipe.predict(X_val)
    y_pred_test = pipe.predict(X_test)

    rows = eval_and_report(name, y_val, y_pred_val, y_test, y_pred_test)
    all_rows.extend(rows)

df_metrics = pd.DataFrame(all_rows).sort_values(["split", "macro_f1"], ascending=[True, False])
out_csv = OUT_DIR / "tabular_metrics.csv"
df_metrics.to_csv(out_csv, index=False)
print(f"\nMétricas guardadas en: {out_csv.resolve()}")

TypeError: OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'