In [1]:
# ---- Keep Lightning single-process and CPU-only (Windows/CPU) ----
import os

# All of these must be set before Lightning / PyTorch are imported.
# NOTE(review): confirm that the installed Lightning version actually reads
# LIGHTNING_LAUNCHER and PYTORCH_LIGHTNING_DISABLE_PROGRESS_BAR.
os.environ.update(
    {
        # Intended to skip MPI/SLURM/etc. launchers.
        "LIGHTNING_LAUNCHER": "none",
        # Single process, no distributed training.
        "WORLD_SIZE": "1",
        # Do not pick a distributed backend.
        "PL_TORCH_DISTRIBUTED_BACKEND": "",
        # Hide every GPU so training stays on the CPU.
        "CUDA_VISIBLE_DEVICES": "",
        # Intended to switch off the progress bar.
        "PYTORCH_LIGHTNING_DISABLE_PROGRESS_BAR": "1",
        # (optional) single OpenMP thread to avoid threading overhead.
        "OMP_NUM_THREADS": "1",
    }
)

In [2]:
import os, time
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from pytorch_tabular import TabularModel
from pytorch_tabular.config import DataConfig, TrainerConfig, OptimizerConfig
from pytorch_tabular.models.tab_transformer import TabTransformerConfig

# Modelo TabTransformer

In [3]:
# The dataset is resolved relative to the working directory:
#   <parent of cwd>/dataset/PuntosMuestra_CR_2023.csv
# so the kernel must be started from a direct sub-folder of the repository root.
nombre_csv = "PuntosMuestra_CR_2023.csv"

ruta_carpeta_actual = os.getcwd()
ruta_carpeta_raiz = os.path.split(ruta_carpeta_actual)[0]
csv_path = os.path.join(ruta_carpeta_raiz, "dataset", nombre_csv)

df = pd.read_csv(csv_path)

## Limpieza

In [4]:
# Drop every sample labelled with category 2.
# NOTE: this cell reassigns `df`, so it is not idempotent -- re-running it on
# the already remapped frame would drop whichever class now carries label 2.
sin_categoria_2 = df["CATEGORIA"].ne(2)
df = df.loc[sin_categoria_2].copy()

# Re-encode the remaining labels as consecutive integers 0..K-1,
# preserving their sorted order.
clases_unicas = sorted(df["CATEGORIA"].unique())
mapa = dict(zip(clases_unicas, range(len(clases_unicas))))
df["CATEGORIA"] = df["CATEGORIA"].map(mapa)

## Procesamiento de datos

In [5]:
# Target vector and feature matrix; coordinates and year are not used as features.
columnas_excluidas = ["CATEGORIA", "lon", "lat", "year"]
y = df["CATEGORIA"].values
X = df.drop(columns=columnas_excluidas)

# Standardise every feature column.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split, so test-set statistics leak into the training features.
scaler = StandardScaler()
valores_escalados = scaler.fit_transform(X)
X_scaled = pd.DataFrame(valores_escalados, columns=X.columns)

# Final frame: scaled features followed by the target column.
# X_scaled has a fresh 0..n-1 index, so the labels are attached by position.
df_final = X_scaled.assign(CATEGORIA=y)

## División de datos

In [6]:
# Stratified 70/30 hold-out split with a fixed seed.
etiquetas = df_final["CATEGORIA"]
train, test = train_test_split(
    df_final,
    test_size=0.3,
    random_state=42,
    stratify=etiquetas,
)

## Definir modelo

In [7]:
# Data: a single classification target; every feature column is declared
# continuous and no categorical columns are used.
data_config = DataConfig(
    target=["CATEGORIA"],
    continuous_cols=X.columns.tolist(),
    categorical_cols=[],
)

# Trainer: one CPU device, fixed learning rate (no LR finder).
# progress_bar="none" disables the progress bar through pytorch_tabular's own
# option. The run recorded in this notebook aborted with
# "IndexError: pop from empty list"; the default Rich progress bar is the
# likely trigger -- NOTE(review): confirm against the full traceback.
trainer_config = TrainerConfig(
    auto_lr_find=False,
    batch_size=256,
    max_epochs=15,
    accelerator="cpu",
    devices=1,
    progress_bar="none",
)

# Library defaults for optimizer and LR scheduler.
optimizer_config = OptimizerConfig()

# Only accuracy is tracked during training; macro-F1 is computed afterwards
# with scikit-learn in the metrics cell.
model_config = TabTransformerConfig(
    task="classification",
    metrics=["accuracy"],
)

# --- Build the model ---
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)

## Entrenar modelo

In [8]:
# --- Train ---
# Carve the validation set out of the training split. Passing `test` as
# validation data would let the held-out set influence training-time
# decisions such as early stopping, so the reported test metrics would be
# optimistically biased.
train_fit, valid = train_test_split(
    train,
    test_size=0.2,
    random_state=42,
    stratify=train["CATEGORIA"],
)

t0 = time.perf_counter()
tabular_model.fit(train=train_fit, validation=valid)
fit_s = time.perf_counter() - t0

# --- Evaluate on the untouched test split ---
resultados = tabular_model.evaluate(test)

# --- Predict (this is the call timed as inference) ---
t0 = time.perf_counter()
preds = tabular_model.predict(test)
pred_s = time.perf_counter() - t0

# The prediction column is looked up under both names it may carry:
# "prediction" or "<target>_prediction".
# NOTE(review): confirm which one the installed pytorch_tabular emits.
col_pred = "prediction" if "prediction" in preds.columns else "CATEGORIA_prediction"
y_true = test["CATEGORIA"].tolist()
y_pred = preds[col_pred].tolist()

print("Resultados en test:", resultados)
print(f"⏱ Tiempo entrenamiento: {fit_s:.2f}s | inferencia: {pred_s:.2f}s")

Seed set to 42


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


IndexError: pop from empty list

## Métricas

In [None]:
# Overall accuracy and macro-averaged F1 on the test predictions.
oa = accuracy_score(y_true, y_pred)
f1m = f1_score(y_true, y_pred, average="macro")

# Per-class report and confusion matrix.
reporte = classification_report(y_true, y_pred)
matriz = confusion_matrix(y_true, y_pred)

resumen = (
    ("OA:", oa),
    ("F1-macro:", f1m),
    ("Reporte clasificación:\n", reporte),
    ("Matriz de confusión:\n", matriz),
)
for etiqueta, valor in resumen:
    print(etiqueta, valor)

## Registrar información

In [None]:
# Make the project-level helper module importable from the parent folder and
# reload it so edits to utils_log.py are picked up without a kernel restart.
import sys
sys.path.append("..")
import importlib, utils_log
importlib.reload(utils_log)
from utils_log import log_row

# os.path.basename works with the separator of the current platform,
# unlike splitting on a hard-coded backslash.
carpeta_actual = os.path.basename(ruta_carpeta_actual)
dataset_utilizado = os.path.basename(csv_path)

# This call used to reference names that this notebook never defines
# (y_train, y_test, train_time_s, pred_time_s) and called .unique() on a
# NumPy array; it now uses the values this notebook actually produces.
log_row(
  # NOTE(review): this file name looks copied from the RandomForest
  # notebook -- TODO replace it with this notebook's own file name.
  script="20250901_PruebasEntrenamientoRF.ipynb",
  algoritmo="TabTransformer",
  dataset=dataset_utilizado,
  clases_removidas=[2],
  seed=42,
  n_train=len(train), n_test=len(test),
  n_features=X.shape[1], num_classes=len(np.unique(y)),
  fit_seconds=fit_s,                          # duration of tabular_model.fit(...)
  pred_seconds=pred_s,                        # inference time measured in the training cell
  ms_per_sample=(pred_s/len(test))*1000,
  OA=oa, F1_macro=f1m,
  carpeta=carpeta_actual
)