In [40]:
import json
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

import matplotlib.pyplot as plt



Cargar Data Funcion

In [41]:
RANDOM_STATE = 42
DATA_PATH = "./data/animal_disease_prediction_cleaned.csv"
OUT_DIR = Path("./outputs"); OUT_DIR.mkdir(parents=True, exist_ok=True)

In [42]:
def load_data():
    df = pd.read_csv(DATA_PATH)
    y = df["Disease_Prediction"].astype(str)
    X = df.drop(columns=["Disease_Prediction"])
    cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
    num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    return X, y, cat_cols, num_cols


Pipeline para trees Funcion

In [43]:
def preprocessor_for_trees(cat_cols, num_cols):
    # Árbol/Forest: Ordinal para categóricas (rápido y sin explosionar dimensiones); numéricas pasan tal cual
    return ColumnTransformer(
        transformers=[
            ("cat", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), cat_cols),
            ("num", "passthrough", num_cols),
        ],
        remainder="drop"
    )


Pipeline para SVM

In [44]:
def preprocessor_for_svm(cat_cols, num_cols):
    # One-Hot denso + escalado estándar en numéricas
    from sklearn.preprocessing import OneHotEncoder, StandardScaler
    from sklearn.compose import ColumnTransformer
    return ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols),
            ("num", StandardScaler(), num_cols),
        ],
        remainder="drop",
    )

Evaluacion y reporte Funcion 

In [45]:
def eval_and_report(name, y_true_val, y_pred_val, y_true_test, y_pred_test):
    rows = []
    for split, yt, yp in [("val", y_true_val, y_pred_val), ("test", y_true_test, y_pred_test)]:
        acc = accuracy_score(yt, yp)
        f1m = f1_score(yt, yp, average="macro")
        print(f"\n[{name}] {split.upper()} - accuracy={acc:.4f} macroF1={f1m:.4f}")
        print(classification_report(yt, yp, digits=4))
        print("Matriz de confusión:")
        print(confusion_matrix(yt, yp))
        rows.append({"model": name, "split": split, "accuracy": acc, "macro_f1": f1m})
    return rows

In [46]:
X, y, cat_cols, num_cols = load_data()

# Split 60/20/20
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.20, random_state=RANDOM_STATE, stratify=y
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=RANDOM_STATE, stratify=y_temp
)

In [47]:
models = {
    "DecisionTree": (
        preprocessor_for_trees(cat_cols, num_cols),
        DecisionTreeClassifier(max_depth=8, min_samples_leaf=10, class_weight="balanced", random_state=RANDOM_STATE)
    ),
    "RandomForest": (
        preprocessor_for_trees(cat_cols, num_cols),
        RandomForestClassifier(n_estimators=300, max_depth=None, n_jobs=-1, class_weight="balanced", random_state=RANDOM_STATE)
    ),
    "LinearSVM": (
        preprocessor_for_svm(cat_cols, num_cols),
        LinearSVC(C=1.0, class_weight="balanced", random_state=RANDOM_STATE)
    ),
}

In [48]:
all_rows = []
for name, (pre, clf) in models.items():
    pipe = Pipeline(steps=[("pre", pre), ("clf", clf)])
    pipe.fit(X_train, y_train)

    y_pred_val = pipe.predict(X_val)
    y_pred_test = pipe.predict(X_test)

    rows = eval_and_report(name, y_val, y_pred_val, y_test, y_pred_test)
    all_rows.extend(rows)

df_metrics = pd.DataFrame(all_rows).sort_values(["split", "macro_f1"], ascending=[True, False])
out_csv = OUT_DIR / "tabular_metrics.csv"
df_metrics.to_csv(out_csv, index=False)
print(f"\nMétricas guardadas en: {out_csv.resolve()}")


[DecisionTree] VAL - accuracy=0.2800 macroF1=0.4981
                       precision    recall  f1-score   support

    Allergic Rhinitis     1.0000    1.0000    1.0000        60
            Arthritis     1.0000    1.0000    1.0000        60
               Asthma     0.0000    0.0000    0.0000       180
           Bronchitis     1.0000    1.0000    1.0000        60
            Chlamydia     0.0000    0.0000    0.0000       120
       Conjunctivitis     1.0000    1.0000    1.0000        60
          Coronavirus     0.0259    1.0000    0.0505        60
                Cough     0.9055    0.7985    0.8486       660
            Distemper     1.0000    0.0986    0.1795       720
                  FIV     0.0503    1.0000    0.0957        60
                  Flu     0.7299    0.1512    0.2505       840
     Fungal Infection     1.0000    1.0000    1.0000       120
      Gastroenteritis     1.0000    0.2019    0.3359       540
            Heartworm     1.0000    0.5500    0.7097       120
 


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.




[RandomForest] VAL - accuracy=1.0000 macroF1=1.0000
                       precision    recall  f1-score   support

    Allergic Rhinitis     1.0000    1.0000    1.0000        60
            Arthritis     1.0000    1.0000    1.0000        60
               Asthma     1.0000    1.0000    1.0000       180
           Bronchitis     1.0000    1.0000    1.0000        60
            Chlamydia     1.0000    1.0000    1.0000       120
       Conjunctivitis     1.0000    1.0000    1.0000        60
          Coronavirus     1.0000    1.0000    1.0000        60
                Cough     1.0000    1.0000    1.0000       660
            Distemper     1.0000    1.0000    1.0000       720
                  FIV     1.0000    1.0000    1.0000        60
                  Flu     1.0000    1.0000    1.0000       840
     Fungal Infection     1.0000    1.0000    1.0000       120
      Gastroenteritis     1.0000    1.0000    1.0000       540
            Heartworm     1.0000    1.0000    1.0000       120
 