# Paquetes a utilizar

In [2]:
import pandas as pd
import numpy as np
import sklearn as skl
from sklearn import preprocessing, model_selection, metrics
import xgboost as xgb
from boruta import BorutaPy

# Fix de alias deprecados en NumPy (requerido por Boruta < 0.4.4)
np.int   = np.int32
np.float = np.float64
np.bool  = np.bool_

# Carga de la base

In [3]:
df = pd.read_csv("data/db.csv")

# Renombrar la variable objetivo
df = df.rename(columns={"pobreza": "Label"})
print(df["Label"].value_counts())

Label
No pobre          5639
Pobre no extr.    3989
Pobre ext.        1908
Name: count, dtype: int64


## Exploración rápida

In [4]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
NUMHOG,11536.0,5768.5,3330.300687,1.0,2884.75,5768.5,8652.25,11536.0
P01F03,11536.0,205.346047,1671.703363,0.0,0.0,0.0,0.0,64800.0
P15B02,11536.0,10.852462,104.289978,0.0,0.0,0.0,0.0,4000.0
P15B04,11536.0,75.097868,790.772822,0.0,0.0,0.0,0.0,36700.0
P15B06A,11536.0,0.061026,0.283185,0.0,0.0,0.0,0.0,8.0
P13A02A,11536.0,0.400659,0.669359,0.0,0.0,0.0,1.0,5.0
P09F03B,11536.0,1.030687,1.781403,0.0,0.0,0.0,1.0,30.0
P09F03C,11536.0,15.830531,26.062729,0.0,0.0,0.0,30.0,340.0
P09F04B,11536.0,7.043083,7.907098,0.0,2.0,5.0,10.0,77.0
P09F04C,11536.0,16.137916,30.785303,0.0,0.0,0.0,30.0,370.0


In [5]:
df.dtypes

NUMHOG     float64
Label       object
P01F03     float64
P15B02     float64
P15B04     float64
P15B06A    float64
P13A02A    float64
P09F03B    float64
P09F03C    float64
P09F04B    float64
P09F04C    float64
P09F05B    float64
P09F05C    float64
P09F06B    float64
P09F06C    float64
P09F07B    float64
P09F07C    float64
P09F08B    float64
P09F08C    float64
P09F09B    float64
P09F09C    float64
P10B20B    float64
P11A05B    float64
P11A06B    float64
P06B10A    float64
P06B10B    float64
P09A03B    float64
P09A03C    float64
P09B02B    float64
P09B02C    float64
P06B01       int64
P10B01     float64
P10B08     float64
dtype: object

# Preparar variable dependiente y codificar etiquetas

In [6]:
y_raw = df["Label"].values
label_enc = preprocessing.LabelEncoder()
Y = label_enc.fit_transform(y_raw)
print(np.unique(Y, return_counts=True))

(array([0, 1, 2]), array([5639, 1908, 3989]))


# Escalado de X

In [7]:
X = df.drop(columns=["Label"])
feature_names = X.columns.to_numpy()

scaler = preprocessing.StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train / test split

In [8]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X_scaled, Y, test_size=0.25, random_state=42, stratify=Y
)

# Modelo XGBoost para Boruta

In [9]:
xgb_base = xgb.XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="multi:softprob",
    eval_metric="mlogloss",
    random_state=42,
)

boruta_sel = BorutaPy(
    estimator=xgb_base,
    n_estimators="auto",
    verbose=2,
    random_state=42,
)

boruta_sel.fit(X_train, y_train)
X_train_sel = boruta_sel.transform(X_train)
X_test_sel  = boruta_sel.transform(X_test)


Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	32
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	32
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	32
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	32
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	32
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	32
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	32
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	13
Tentative: 	11
Rejected: 	8
Iteration: 	9 / 100
Confirmed: 	13
Tentative: 	11
Rejected: 	8
Iteration: 	10 / 100
Confirmed: 	13
Tentative: 	11
Rejected: 	8
Iteration: 	11 / 100
Confirmed: 	13
Tentative: 	11
Rejected: 	8
Iteration: 	12 / 100
Confirmed: 	13
Tentative: 	11
Rejected: 	8
Iteration: 	13 / 100
Confirmed: 	13
Tentative: 	11
Rejected: 	8
Iteration: 	14 / 100
Confirmed: 	13
Tentative: 	11
Rejected: 	8
Iteration: 	15 / 100
Confirmed: 	13
Tentative: 	11
Rejected: 	8
Iteration: 	16 / 100
Confirmed: 	14
Tentative: 	8
Reject

# Ranking de características

In [10]:
feature_ranks = list(
    zip(feature_names, boruta_sel.ranking_, boruta_sel.support_)
)

for feat, rank, keep in feature_ranks:
    print(f"Feature: {feat:<30}  Rank: {rank:<2}  Keep: {keep}")


Feature: NUMHOG                          Rank: 1   Keep: True
Feature: P01F03                          Rank: 3   Keep: False
Feature: P15B02                          Rank: 9   Keep: False
Feature: P15B04                          Rank: 1   Keep: True
Feature: P15B06A                         Rank: 5   Keep: False
Feature: P13A02A                         Rank: 1   Keep: True
Feature: P09F03B                         Rank: 1   Keep: True
Feature: P09F03C                         Rank: 2   Keep: False
Feature: P09F04B                         Rank: 1   Keep: True
Feature: P09F04C                         Rank: 1   Keep: True
Feature: P09F05B                         Rank: 1   Keep: True
Feature: P09F05C                         Rank: 12  Keep: False
Feature: P09F06B                         Rank: 13  Keep: False
Feature: P09F06C                         Rank: 11  Keep: False
Feature: P09F07B                         Rank: 9   Keep: False
Feature: P09F07C                         Rank: 10  Keep: False

# Entrenamiento final y predicciones

In [11]:
xgb_final = xgb.XGBClassifier(
    n_estimators=400,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.9,
    objective="multi:softprob",
    eval_metric="mlogloss",
    random_state=42,
)

xgb_final.fit(X_train_sel, y_train)
y_pred     = xgb_final.predict(X_test_sel)
y_proba    = xgb_final.predict_proba(X_test_sel)

# Métricas de desempeño

In [12]:
acc = metrics.accuracy_score(y_test, y_pred)
f1_macro = metrics.f1_score(y_test, y_pred, average="macro")
auc_macro = metrics.roc_auc_score(
    y_test, y_proba, multi_class="ovr", average="macro"
)

print(f"Accuracy   : {acc:.4f}")
print(f"F1 macro   : {f1_macro:.4f}")
print(f"AUC‑ROC macro : {auc_macro:.4f}\\n")
print(metrics.classification_report(y_test, y_pred))


Accuracy   : 0.6078
F1 macro   : 0.5492
AUC‑ROC macro : 0.7716\n
              precision    recall  f1-score   support

           0       0.70      0.78      0.74      1410
           1       0.56      0.34      0.42       477
           2       0.48      0.49      0.49       997

    accuracy                           0.61      2884
   macro avg       0.58      0.54      0.55      2884
weighted avg       0.60      0.61      0.60      2884

