In [1]:
import json
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

import matplotlib.pyplot as plt


import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Subset
from torchvision import datasets, transforms


Cargar Data Funcion

In [2]:
RANDOM_STATE = 42
DATA_PATH = "C:/Users/USER/Downloads/data/data/animal_disease_prediction_cleaned.csv"
OUT_DIR = Path("./outputs"); OUT_DIR.mkdir(parents=True, exist_ok=True)

In [3]:
def load_data():
    df = pd.read_csv(DATA_PATH)
    y = df["Disease_Prediction"].astype(str)
    X = df.drop(columns=["Disease_Prediction"])
    cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
    num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    return X, y, cat_cols, num_cols


Pipeline para trees Funcion

In [4]:
def preprocessor_for_trees(cat_cols, num_cols):
    # Árbol/Forest: Ordinal para categóricas (rápido y sin explosionar dimensiones); numéricas pasan tal cual
    return ColumnTransformer(
        transformers=[
            ("cat", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), cat_cols),
            ("num", "passthrough", num_cols),
        ],
        remainder="drop"
    )


Pipeline para SVM

In [5]:
def preprocessor_for_svm(cat_cols, num_cols):
    # One-Hot denso + escalado estándar en numéricas
    from sklearn.preprocessing import OneHotEncoder, StandardScaler
    from sklearn.compose import ColumnTransformer
    return ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols),
            ("num", StandardScaler(), num_cols),
        ],
        remainder="drop",
    )

Evaluacion y reporte Funcion 

In [6]:
def eval_and_report(name, y_true_val, y_pred_val, y_true_test, y_pred_test):
    rows = []
    for split, yt, yp in [("val", y_true_val, y_pred_val), ("test", y_true_test, y_pred_test)]:
        acc = accuracy_score(yt, yp)
        f1m = f1_score(yt, yp, average="macro")
        print(f"\n[{name}] {split.upper()} - accuracy={acc:.4f} macroF1={f1m:.4f}")
        print(classification_report(yt, yp, digits=4))
        print("Matriz de confusión:")
        print(confusion_matrix(yt, yp))
        rows.append({"model": name, "split": split, "accuracy": acc, "macro_f1": f1m})
    return rows

In [7]:
X, y, cat_cols, num_cols = load_data()

# Split 60/20/20
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.20, random_state=RANDOM_STATE, stratify=y
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=RANDOM_STATE, stratify=y_temp
)

In [8]:
models = {
    "DecisionTree": (
        preprocessor_for_trees(cat_cols, num_cols),
        DecisionTreeClassifier(max_depth=8, min_samples_leaf=10, class_weight="balanced", random_state=RANDOM_STATE)
    ),
    "RandomForest": (
        preprocessor_for_trees(cat_cols, num_cols),
        RandomForestClassifier(n_estimators=300, max_depth=None, n_jobs=-1, class_weight="balanced", random_state=RANDOM_STATE)
    ),
    "LinearSVM": (
        preprocessor_for_svm(cat_cols, num_cols),
        LinearSVC(C=1.0, class_weight="balanced", random_state=RANDOM_STATE)
    ),
}

In [9]:
all_rows = []
for name, (pre, clf) in models.items():
    pipe = Pipeline(steps=[("pre", pre), ("clf", clf)])
    pipe.fit(X_train, y_train)

    y_pred_val = pipe.predict(X_val)
    y_pred_test = pipe.predict(X_test)

    rows = eval_and_report(name, y_val, y_pred_val, y_test, y_pred_test)
    all_rows.extend(rows)

df_metrics = pd.DataFrame(all_rows).sort_values(["split", "macro_f1"], ascending=[True, False])
out_csv = OUT_DIR / "tabular_metrics.csv"
df_metrics.to_csv(out_csv, index=False)
print(f"\nMétricas guardadas en: {out_csv.resolve()}")


[DecisionTree] VAL - accuracy=0.1544 macroF1=0.1118
                                      precision    recall  f1-score   support

      Actinobacillus Pleuropneumonia     0.0000    0.0000    0.0000         4
                 Actinobacillus Suis     0.2000    1.0000    0.3333         1
                 African Swine Fever     0.5000    1.0000    0.6667         1
                   Allergic Rhinitis     0.0000    0.0000    0.0000         0
                           Arthritis     0.2500    1.0000    0.4000         1
                         Blue Tongue     0.1250    1.0000    0.2222         1
                 Blue Tongue Disease     0.0000    0.0000    0.0000         1
                   Blue Tongue Virus     0.2000    1.0000    0.3333         1
                          Bluetongue     0.0000    0.0000    0.0000         3
                    Bluetongue Virus     0.0000    0.0000    0.0000         1
                Bordetella Infection     0.0000    0.0000    0.0000         1
          

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



[RandomForest] VAL - accuracy=0.9266 macroF1=0.9376
                                      precision    recall  f1-score   support

      Actinobacillus Pleuropneumonia     1.0000    1.0000    1.0000         4
                 Actinobacillus Suis     1.0000    1.0000    1.0000         1
                 African Swine Fever     1.0000    1.0000    1.0000         1
                           Arthritis     1.0000    1.0000    1.0000         1
                         Blue Tongue     1.0000    1.0000    1.0000         1
                 Blue Tongue Disease     1.0000    1.0000    1.0000         1
                   Blue Tongue Virus     1.0000    1.0000    1.0000         1
                          Bluetongue     1.0000    1.0000    1.0000         3
                    Bluetongue Virus     1.0000    1.0000    1.0000         1
                Bordetella Infection     1.0000    1.0000    1.0000         1
              Bovine Johne's Disease     1.0000    1.0000    1.0000         3
          

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



[LinearSVM] VAL - accuracy=0.8842 macroF1=0.9278
                                      precision    recall  f1-score   support

      Actinobacillus Pleuropneumonia     1.0000    1.0000    1.0000         4
                 Actinobacillus Suis     1.0000    1.0000    1.0000         1
                 African Swine Fever     1.0000    1.0000    1.0000         1
                           Arthritis     1.0000    1.0000    1.0000         1
                         Blue Tongue     1.0000    1.0000    1.0000         1
                 Blue Tongue Disease     1.0000    1.0000    1.0000         1
                   Blue Tongue Virus     1.0000    1.0000    1.0000         1
                          Bluetongue     1.0000    1.0000    1.0000         3
                    Bluetongue Virus     1.0000    1.0000    1.0000         1
                Bordetella Infection     1.0000    1.0000    1.0000         1
              Bovine Johne's Disease     1.0000    1.0000    1.0000         3
             

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


CNN basica para Imagenes 


In [10]:
SEED = 42
DATA_DIR = r"C:/Users/USER/Downloads/data/data/pet_disease_images_augmented"  # <- ruta real
BATCH_SIZE = 32
IMG_SIZE = 224
EPOCHS = 20
PATIENCE = 5
LR = 1e-3
WEIGHT_DECAY = 1e-4

OUT_DIR = Path("./outputs"); OUT_DIR.mkdir(parents=True, exist_ok=True)
MODEL_DIR = Path("./models"); MODEL_DIR.mkdir(parents=True, exist_ok=True)


In [11]:
def set_seed(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

In [12]:

class SimpleCNN(nn.Module):
    def __init__(self, num_classes: int):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 32, 3, padding=1), nn.ReLU(inplace=True),
            nn.MaxPool2d(2),
            nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(inplace=True),
            nn.MaxPool2d(2),
            nn.Conv2d(64, 128, 3, padding=1), nn.ReLU(inplace=True),
            nn.MaxPool2d(2),
            nn.Dropout(0.3),
            nn.AdaptiveAvgPool2d((1, 1)),
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(128, 128), nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(128, num_classes),
        )

    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)
        return x

In [13]:
def get_dataloaders(data_dir, img_size, batch_size, seed):
    base_ds = datasets.ImageFolder(root=data_dir)  # solo para etiquetas
    targets = np.array([y for _, y in base_ds.samples])
    class_names = base_ds.classes
    num_classes = len(class_names)

    splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=seed)
    train_idx, val_idx = next(splitter.split(np.zeros(len(targets)), targets))

    train_tfms = transforms.Compose([
        transforms.RandomResizedCrop(img_size, scale=(0.8, 1.0)),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(15),
        transforms.ToTensor(),
    ])
    val_tfms = transforms.Compose([
        transforms.Resize(int(img_size * 1.15)),
        transforms.CenterCrop(img_size),
        transforms.ToTensor(),
    ])

    train_ds_full = datasets.ImageFolder(root=data_dir, transform=train_tfms)
    val_ds_full = datasets.ImageFolder(root=data_dir, transform=val_tfms)

    train_ds = Subset(train_ds_full, train_idx)
    val_ds = Subset(val_ds_full, val_idx)

    # DataLoaders (num_workers=0 en Windows)
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=True)
    val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=True)

    # Pesos de clase (balanceo)
    train_targets = targets[train_idx]
    counts = np.bincount(train_targets, minlength=num_classes).astype(float)
    class_weights = counts.sum() / (counts + 1e-6)
    class_weights = class_weights / class_weights.mean()

    return train_loader, val_loader, class_names, torch.tensor(class_weights, dtype=torch.float32)

In [14]:
@torch.no_grad()
def evaluate(model, loader, device, criterion):
    model.eval()
    losses, y_true, y_pred = [], [], []
    for images, labels in loader:
        images = images.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)
        logits = model(images)
        loss = criterion(logits, labels)
        losses.append(loss.item())
        preds = torch.argmax(logits, dim=1)
        y_true.append(labels.cpu().numpy())
        y_pred.append(preds.cpu().numpy())
    y_true = np.concatenate(y_true)
    y_pred = np.concatenate(y_pred)
    acc = accuracy_score(y_true, y_pred)
    f1m = f1_score(y_true, y_pred, average="macro")
    return float(np.mean(losses)), acc, f1m, y_true, y_pred

In [15]:
def plot_curves(history, out_path):
    epochs = [h["epoch"] for h in history]
    train_loss = [h["train_loss"] for h in history]
    val_loss = [h["val_loss"] for h in history]
    val_acc = [h["val_acc"] for h in history]
    val_f1 = [h["val_macro_f1"] for h in history]

    plt.figure(figsize=(12,5))
    plt.subplot(1,2,1)
    plt.plot(epochs, train_loss, label="train_loss")
    plt.plot(epochs, val_loss, label="val_loss")
    plt.xlabel("Epoch"); plt.ylabel("Loss"); plt.title("Pérdida"); plt.legend()

    plt.subplot(1,2,2)
    plt.plot(epochs, val_acc, label="val_acc")
    plt.plot(epochs, val_f1, label="val_macro_f1")
    plt.xlabel("Epoch"); plt.ylabel("Score"); plt.title("Métricas Val"); plt.legend()

    plt.tight_layout()
    plt.savefig(out_path, dpi=150)
    plt.close()


In [16]:
def save_confusion_matrix(cm, class_names, out_path):
    plt.figure(figsize=(8,6))
    plt.imshow(cm, interpolation="nearest", cmap=plt.cm.Blues)
    plt.title("Matriz de confusión")
    plt.colorbar()
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names, rotation=45, ha="right")
    plt.yticks(tick_marks, class_names)
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            plt.text(j, i, format(cm[i, j], "d"),
                     ha="center", va="center",
                     color="white" if cm[i, j] > thresh else "black", fontsize=8)
    plt.ylabel("Verdadero"); plt.xlabel("Predicho")
    plt.tight_layout()
    plt.savefig(out_path, dpi=150)
    plt.close()

In [None]:
import random
set_seed(SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Dispositivo: {device}")

train_loader, val_loader, class_names, class_weights = get_dataloaders(
    DATA_DIR, IMG_SIZE, BATCH_SIZE, SEED
)
num_classes = len(class_names)

model = SimpleCNN(num_classes=num_classes).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights.to(device))
optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

best_f1 = -1.0
best_epoch = -1
epochs_no_improve = 0
history = []

for epoch in range(1, EPOCHS + 1):
    model.train()
    train_losses = []
    for images, labels in train_loader:
        images = images.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)

        optimizer.zero_grad()
        logits = model(images)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        train_losses.append(loss.item())

    scheduler.step()

    val_loss, val_acc, val_f1, y_true, y_pred = evaluate(model, val_loader, device, criterion)
    avg_train_loss = float(np.mean(train_losses))
    history.append({
        "epoch": epoch,
        "train_loss": avg_train_loss,
        "val_loss": val_loss,
        "val_acc": val_acc,
        "val_macro_f1": val_f1,
        "lr": scheduler.get_last_lr()[0],
    })

    print(f"Epoch {epoch:02d} | train_loss={avg_train_loss:.4f} "
            f"val_loss={val_loss:.4f} val_acc={val_acc:.4f} val_macroF1={val_f1:.4f}")

    if val_f1 > best_f1:
        best_f1 = val_f1
        best_epoch = epoch
        epochs_no_improve = 0
        torch.save({
            "model_state": model.state_dict(),
            "class_names": class_names,
            "config": {"img_size": IMG_SIZE, "num_classes": num_classes, "lr": LR, "weight_decay": WEIGHT_DECAY}
        }, MODEL_DIR / "cnn_basic_pytorch.pt")
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= PATIENCE:
            print(f"Early stopping en epoch {epoch} (mejor F1 en epoch {best_epoch})")
            break

# Cargar mejor y evaluar final
ckpt = torch.load(MODEL_DIR / "cnn_basic_pytorch.pt", map_location=device)
model.load_state_dict(ckpt["model_state"])
val_loss, val_acc, val_f1, y_true, y_pred = evaluate(model, val_loader, device, criterion)

print("\nReporte de validación:")
print(classification_report(y_true, y_pred, target_names=class_names, digits=4))
cm = confusion_matrix(y_true, y_pred)
print("Matriz de confusión:\n", cm)

# Guardar artefactos
with open(OUT_DIR / "cnn_metrics.json", "w", encoding="utf-8") as f:
    json.dump({
        "best_epoch": best_epoch,
        "val_loss": float(val_loss),
        "val_accuracy": float(val_acc),
        "val_macro_f1": float(val_f1),
        "class_names": class_names,
        "history": history
    }, f, ensure_ascii=False, indent=2)

np.savetxt(OUT_DIR / "cnn_confusion_matrix.csv", cm, fmt="%d", delimiter=",")
plot_curves(history, OUT_DIR / "cnn_training_curves.png")
save_confusion_matrix(cm, class_names, OUT_DIR / "cnn_confusion_matrix.png")

print(f"\nModelo: {(MODEL_DIR / 'cnn_basic_pytorch.pt').resolve()}")
print(f"Métricas: {(OUT_DIR / 'cnn_metrics.json').resolve()}")
print(f"Curvas: {(OUT_DIR / 'cnn_training_curves.png').resolve()}")
print(f"Confusion matrix: {(OUT_DIR / 'cnn_confusion_matrix.png').resolve()}")

Dispositivo: cuda
Epoch 01 | train_loss=3.0937 val_loss=3.0899 val_acc=0.0516 val_macroF1=0.0090
Epoch 02 | train_loss=3.0761 val_loss=3.0575 val_acc=0.0540 val_macroF1=0.0132
Epoch 03 | train_loss=3.0626 val_loss=3.0686 val_acc=0.0598 val_macroF1=0.0283
Epoch 04 | train_loss=3.0587 val_loss=3.0464 val_acc=0.0628 val_macroF1=0.0270
Epoch 05 | train_loss=3.0464 val_loss=3.0418 val_acc=0.0545 val_macroF1=0.0232
Epoch 06 | train_loss=3.0374 val_loss=3.0367 val_acc=0.0686 val_macroF1=0.0308
Epoch 07 | train_loss=3.0316 val_loss=3.0315 val_acc=0.0733 val_macroF1=0.0299
Epoch 08 | train_loss=3.0242 val_loss=3.0176 val_acc=0.0897 val_macroF1=0.0381
Epoch 09 | train_loss=3.0167 val_loss=3.0110 val_acc=0.0915 val_macroF1=0.0492
Epoch 10 | train_loss=3.0057 val_loss=2.9991 val_acc=0.1032 val_macroF1=0.0599
Epoch 11 | train_loss=2.9955 val_loss=2.9927 val_acc=0.1032 val_macroF1=0.0634
Epoch 12 | train_loss=2.9963 val_loss=2.9878 val_acc=0.1038 val_macroF1=0.0643
Epoch 13 | train_loss=2.9849 val_l

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



Modelo: C:\Users\USER\Downloads\data\data\models\cnn_basic_pytorch.pt
Métricas: C:\Users\USER\Downloads\data\data\outputs\cnn_metrics.json
Curvas: C:\Users\USER\Downloads\data\data\outputs\cnn_training_curves.png
Confusion matrix: C:\Users\USER\Downloads\data\data\outputs\cnn_confusion_matrix.png
