# Paquetes a utilizar

In [1]:
import sklearn as skl
import pandas as pd
import numpy as np
import xgboost as xgb
from boruta import BorutaPy

# Base de datos

In [2]:
df = pd.read_csv("data/db.csv")
#df = df.dropna()
df = df.rename(columns={'pobreza':'Label'})
df['Label'].value_counts()

Label
No pobre          5639
Pobre no extr.    3989
Pobre ext.        1908
Name: count, dtype: int64

In [3]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Unnamed: 0,11536.0,5767.5,3330.300687,0.0,2883.75,5767.5,8651.25,11535.0
P01F03,11536.0,205.346047,1671.703363,0.0,0.0,0.0,0.0,64800.0
P15B02,11536.0,10.852462,104.289978,0.0,0.0,0.0,0.0,4000.0
P15B04,11536.0,75.097868,790.772822,0.0,0.0,0.0,0.0,36700.0
P15B06A,11536.0,0.061026,0.283185,0.0,0.0,0.0,0.0,8.0
P13A02A,11536.0,0.400659,0.669359,0.0,0.0,0.0,1.0,5.0
P06B01,11536.0,3.218187,1.945638,0.0,2.0,3.0,4.0,20.0
P06B10A,11536.0,0.028528,0.122511,0.0,0.0,0.0,0.0,3.0
P06B10B,11536.0,3.328799,4.496532,0.0,0.0,1.5,5.0,30.0
P09A03B,11536.0,0.271926,0.816723,0.0,0.0,0.0,0.0,8.571429


In [4]:
df.dtypes

Unnamed: 0      int64
Label          object
P01F03        float64
P15B02        float64
P15B04        float64
P15B06A       float64
P13A02A       float64
P06B01          int64
P06B10A       float64
P06B10B       float64
P09A03B       float64
P09A03C       float64
P09B02B       float64
P09B02C       float64
P09F03B       float64
P09F03C       float64
P09F04B       float64
P09F04C       float64
P09F05B       float64
P09F05C       float64
P09F06B       float64
P09F06C       float64
P09F07B       float64
P09F07C       float64
P09F08B       float64
P09F08C       float64
P09F09B       float64
P09F09C       float64
P10B01        float64
P10B08        float64
P10B20B       float64
P11A05B       float64
P11A06B       float64
dtype: object

# Variable dependiente que debe predecirse y codificación de datos categóricos

In [5]:
y = df["Label"].values
Y = skl.preprocessing.LabelEncoder().fit_transform(y)
Y

array([0, 0, 2, ..., 2, 0, 1])

# Definir x, normalizar valores y definir variables independientes

In [6]:
X = df.drop(labels = ["Label"], axis=1) 
nombres_de_funciones = np.array(X.columns)
scaler = skl.preprocessing.StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

# Train and test para verificar la precisión después de ajustar el modelo

In [7]:
X_train, X_test, y_train, y_test = skl.model_selection.train_test_split(X, Y, test_size=0.25, random_state=42)

# XGBOOST para ser utilizado por Boruta

In [8]:
modelo = xgb.XGBClassifier()

- Crear funciones de sombra: funciones aleatorias y valores aleatorios en columnas
- Entrenar Random Forest / XGBoost y calcular la importancia de la característica a través de la disminución media de la impureza
- Comprobar si las características reales tienen mayor importancia en comparación con las características de sombra
- Repetir esto para cada iteración
- Si la función original funcionó mejor, marcarla como importante.

In [10]:
np.int = np.int32
np.float = np.float64
np.bool = np.bool_

In [11]:
# Especificar cómo se determinan las características relevantes utilizando el algoritmo Boruta
selector = BorutaPy(modelo, n_estimators='auto', verbose=2, random_state=1)
# Hallar todas las características importantes
selector.fit(X_train, y_train)
# Aplicar el método transform() en el conjunto de datos X para limitarlo a solo las características seleccionadas
X_filtered = selector.transform(X_train)  # Utilizar técnicas de selección de características y obtener el conjunto de datos modificado

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	32
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	32
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	32
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	32
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	32
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	32
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	32
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	13
Tentative: 	9
Rejected: 	10
Iteration: 	9 / 100
Confirmed: 	13
Tentative: 	9
Rejected: 	10
Iteration: 	10 / 100
Confirmed: 	13
Tentative: 	9
Rejected: 	10
Iteration: 	11 / 100
Confirmed: 	13
Tentative: 	9
Rejected: 	10
Iteration: 	12 / 100
Confirmed: 	14
Tentative: 	6
Rejected: 	12
Iteration: 	13 / 100
Confirmed: 	14
Tentative: 	6
Rejected: 	12
Iteration: 	14 / 100
Confirmed: 	14
Tentative: 	6
Rejected: 	12
Iteration: 	15 / 100
Confirmed: 	14
Tentative: 	6
Rejected: 	12
Iteration: 	16 / 100
Confirmed: 	14
Tentative: 	5
Reject

# zip nombres de características, rangos y decisiones

In [12]:
feature_ranks = list(zip(nombres_de_funciones, 
                         selector.ranking_, 
                         selector.support_))


# Resultados

In [13]:
for feat in feature_ranks:
    print('Feature: {:<30} Rank: {},  Keep: {}'.format(feat[0], feat[1], feat[2]))

Feature: Unnamed: 0                     Rank: 1,  Keep: True
Feature: P01F03                         Rank: 12,  Keep: False
Feature: P15B02                         Rank: 9,  Keep: False
Feature: P15B04                         Rank: 2,  Keep: False
Feature: P15B06A                        Rank: 4,  Keep: False
Feature: P13A02A                        Rank: 1,  Keep: True
Feature: P06B01                         Rank: 1,  Keep: True
Feature: P06B10A                        Rank: 1,  Keep: True
Feature: P06B10B                        Rank: 1,  Keep: True
Feature: P09A03B                        Rank: 3,  Keep: False
Feature: P09A03C                        Rank: 15,  Keep: False
Feature: P09B02B                        Rank: 6,  Keep: False
Feature: P09B02C                        Rank: 8,  Keep: False
Feature: P09F03B                        Rank: 1,  Keep: True
Feature: P09F03C                        Rank: 5,  Keep: False
Feature: P09F04B                        Rank: 11,  Keep: False
Feature: P0

In [14]:
# Utilizando un conjunto específico de características, ajustar un modelo XGBoost en los datos de entrenamiento
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_filtered, y_train)
# Emplear el modelo previamente entrenado para hacer predicciones con el conjunto de datos de prueba
# Antes de hacer las predicciones, aplicar las mismas técnicas de selección de características en los datos de prueba para asegurarse de que solo se estén utilizando las características relevantes
X_test_filtered = selector.transform(X_test)
prediction_xgb = xgb_model.predict(X_test_filtered)

## Precisión

In [15]:
skl.metrics.accuracy_score(y_test, prediction_xgb)

0.6567267683772539