In [206]:
# Modificación del ejemplo de KNNImputer aplicando OHE antes de la estandarización.


In [207]:
# import libraries
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

## Load the housing dataset from the CSV file in the local ./data folder.
housing = pd.read_csv("./data/housing.csv")

In [208]:
## Separate the predictors from the target column.
X = housing.drop("median_house_value", axis=1)
y = housing["median_house_value"]

### Stratified sampling: bucket `median_income` into five strata so that the
### split takes the requested fraction of rows from every stratum, rather
### than a single fraction of the dataset as a whole.
income_bin_edges = [0.0, 1.5, 3.0, 4.5, 6.0, np.inf]
stratify = pd.cut(
    housing["median_income"],
    bins=income_bin_edges,  # sequence of bin limits
    labels=list(range(1, len(income_bin_edges))),  # strata labelled 1..5
)

# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
## Hold out 20% of the rows as the test set, stratified by income stratum.
X_train, X_test, y_train, y_test = train_test_split(
    X,  # features
    y,  # target
    test_size=0.2,  # 20% of the rows go to the test set
    stratify=stratify,  # keep the stratum proportions in both subsets
    random_state=42,  # fixed seed for the shuffle applied before splitting
)
X_train

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
12655,-121.46,38.52,29.0,3873.0,797.0,2237.0,706.0,2.1736,INLAND
15502,-117.23,33.09,7.0,5320.0,855.0,2015.0,768.0,6.3373,NEAR OCEAN
2908,-119.04,35.37,44.0,1618.0,310.0,667.0,300.0,2.8750,INLAND
14053,-117.13,32.75,24.0,1877.0,519.0,898.0,483.0,2.2264,NEAR OCEAN
20496,-118.70,34.28,27.0,3536.0,646.0,1837.0,580.0,4.4964,<1H OCEAN
...,...,...,...,...,...,...,...,...,...
15174,-117.07,33.03,14.0,6665.0,1231.0,2026.0,1001.0,5.0900,<1H OCEAN
12661,-121.42,38.51,15.0,7901.0,1422.0,4769.0,1418.0,2.8139,INLAND
19263,-122.72,38.44,48.0,707.0,166.0,458.0,172.0,3.1797,<1H OCEAN
19140,-122.70,38.31,14.0,3155.0,580.0,1208.0,501.0,4.1964,<1H OCEAN


In [209]:
## Boolean mask over X_train's rows: True where a row has at least one
## missing value. Kept so the same rows can be inspected after imputation.
null_rows_idx = X_train.isna().any(axis="columns")
null_rows_idx

12655    False
15502    False
2908     False
14053    False
20496    False
         ...  
15174    False
12661    False
19263    False
19140    False
19773    False
Length: 16512, dtype: bool

In [210]:
# Choose k as the integer part of the square root of the number of training rows.
n_train_rows = len(X_train)
k_value = np.sqrt(n_train_rows).astype(int)
k_value

128

In [211]:
## One-hot encoding (OHE)
### Encode the categorical `ocean_proximity` column as indicator columns.
categorical_columns = ["ocean_proximity"]
cat_encoder = OneHotEncoder(sparse_output=False)
cat_encoder.set_output(transform="pandas")  # make the output a DataFrame
X_train_ocean_proximity_ohe = cat_encoder.fit_transform(X_train[categorical_columns])
X_train_ocean_proximity_ohe

Unnamed: 0,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
12655,0.0,1.0,0.0,0.0,0.0
15502,0.0,0.0,0.0,0.0,1.0
2908,0.0,1.0,0.0,0.0,0.0
14053,0.0,0.0,0.0,0.0,1.0
20496,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...
15174,1.0,0.0,0.0,0.0,0.0
12661,0.0,1.0,0.0,0.0,0.0
19263,1.0,0.0,0.0,0.0,0.0
19140,1.0,0.0,0.0,0.0,0.0


In [212]:
### Rebuild the full feature table: the remaining columns of X_train placed
### side by side with the one-hot columns produced by the encoder.
X_train_rest = X_train.drop("ocean_proximity", axis=1)
housing_cat_ohe = pd.concat(
    [X_train_rest, X_train_ocean_proximity_ohe],
    axis="columns",
)
housing_cat_ohe

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
12655,-121.46,38.52,29.0,3873.0,797.0,2237.0,706.0,2.1736,0.0,1.0,0.0,0.0,0.0
15502,-117.23,33.09,7.0,5320.0,855.0,2015.0,768.0,6.3373,0.0,0.0,0.0,0.0,1.0
2908,-119.04,35.37,44.0,1618.0,310.0,667.0,300.0,2.8750,0.0,1.0,0.0,0.0,0.0
14053,-117.13,32.75,24.0,1877.0,519.0,898.0,483.0,2.2264,0.0,0.0,0.0,0.0,1.0
20496,-118.70,34.28,27.0,3536.0,646.0,1837.0,580.0,4.4964,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
15174,-117.07,33.03,14.0,6665.0,1231.0,2026.0,1001.0,5.0900,1.0,0.0,0.0,0.0,0.0
12661,-121.42,38.51,15.0,7901.0,1422.0,4769.0,1418.0,2.8139,0.0,1.0,0.0,0.0,0.0
19263,-122.72,38.44,48.0,707.0,166.0,458.0,172.0,3.1797,1.0,0.0,0.0,0.0,0.0
19140,-122.70,38.31,14.0,3155.0,580.0,1208.0,501.0,4.1964,1.0,0.0,0.0,0.0,0.0


In [213]:
## Standardization
## OHE was applied first, so every column is expected to be numeric and no
## `select_dtypes` filtering is done before scaling.
## NOTE(review): the one-hot indicator columns are standardized too, so they
## are no longer 0/1 afterwards. Confirm this is intended for the KNN step.
scaler = StandardScaler()
scaler.set_output(transform="pandas")  # keep the result as a DataFrame
X_train_scaled_and_ohe = scaler.fit_transform(housing_cat_ohe)
X_train_scaled_and_ohe

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
12655,-0.941350,1.347438,0.027564,0.584777,0.635123,0.732602,0.556286,-0.893647,-0.887683,1.46218,-0.011006,-0.354889,-0.384217
15502,1.171782,-1.192440,-1.722018,1.261467,0.775677,0.533612,0.721318,1.292168,-0.887683,-0.68391,-0.011006,-0.354889,2.602693
2908,0.267581,-0.125972,1.220460,-0.469773,-0.545045,-0.674675,-0.524407,-0.525434,-0.887683,1.46218,-0.011006,-0.354889,-0.384217
14053,1.221738,-1.351474,-0.370069,-0.348652,-0.038567,-0.467617,-0.037297,-0.865929,-0.887683,-0.68391,-0.011006,-0.354889,2.602693
20496,0.437431,-0.635818,-0.131489,0.427179,0.269198,0.374060,0.220898,0.325752,1.126529,-0.68391,-0.011006,-0.354889,-0.384217
...,...,...,...,...,...,...,...,...,...,...,...,...,...
15174,1.251711,-1.220505,-1.165333,1.890456,1.686854,0.543471,1.341519,0.637374,1.126529,-0.68391,-0.011006,-0.354889,-0.384217
12661,-0.921368,1.342761,-1.085806,2.468471,2.149712,3.002174,2.451492,-0.557509,-0.887683,1.46218,-0.011006,-0.354889,-0.384217
19263,-1.570794,1.310018,1.538566,-0.895802,-0.894007,-0.862013,-0.865118,-0.365475,1.126529,-0.68391,-0.011006,-0.354889,-0.384217
19140,-1.560803,1.249211,-1.165333,0.249005,0.109257,-0.189747,0.010616,0.168261,1.126529,-0.68391,-0.011006,-0.354889,-0.384217


In [214]:
## Impute the missing values with a KNN imputer using k_value neighbours.
knn_imputer = KNNImputer(n_neighbors=k_value)
knn_imputer.set_output(transform="pandas")  # keep the result as a DataFrame
X_train_imputed_a = knn_imputer.fit_transform(X_train_scaled_and_ohe)
X_train_imputed_a

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
12655,-0.941350,1.347438,0.027564,0.584777,0.635123,0.732602,0.556286,-0.893647,-0.887683,1.46218,-0.011006,-0.354889,-0.384217
15502,1.171782,-1.192440,-1.722018,1.261467,0.775677,0.533612,0.721318,1.292168,-0.887683,-0.68391,-0.011006,-0.354889,2.602693
2908,0.267581,-0.125972,1.220460,-0.469773,-0.545045,-0.674675,-0.524407,-0.525434,-0.887683,1.46218,-0.011006,-0.354889,-0.384217
14053,1.221738,-1.351474,-0.370069,-0.348652,-0.038567,-0.467617,-0.037297,-0.865929,-0.887683,-0.68391,-0.011006,-0.354889,2.602693
20496,0.437431,-0.635818,-0.131489,0.427179,0.269198,0.374060,0.220898,0.325752,1.126529,-0.68391,-0.011006,-0.354889,-0.384217
...,...,...,...,...,...,...,...,...,...,...,...,...,...
15174,1.251711,-1.220505,-1.165333,1.890456,1.686854,0.543471,1.341519,0.637374,1.126529,-0.68391,-0.011006,-0.354889,-0.384217
12661,-0.921368,1.342761,-1.085806,2.468471,2.149712,3.002174,2.451492,-0.557509,-0.887683,1.46218,-0.011006,-0.354889,-0.384217
19263,-1.570794,1.310018,1.538566,-0.895802,-0.894007,-0.862013,-0.865118,-0.365475,1.126529,-0.68391,-0.011006,-0.354889,-0.384217
19140,-1.560803,1.249211,-1.165333,0.249005,0.109257,-0.189747,0.010616,0.168261,1.126529,-0.68391,-0.011006,-0.354889,-0.384217


In [215]:
## Check that no missing values remain after imputation (prints False when
## every cell is filled).
print(X_train_imputed_a.isna().any().any())

## Show the first rows that originally had missing values, now imputed.
## `null_rows_idx` is the boolean row mask computed on X_train before imputing.
## (A bare `X_train_imputed_a` expression used to sit between these two steps;
## only the last expression of a cell is displayed, so it did nothing and was
## removed.)
X_train_imputed_a.loc[null_rows_idx].head()

False


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
1606,-1.251077,1.048079,-0.211016,0.151734,0.080575,-0.533051,0.343342,-0.494985,-0.887683,-0.68391,-0.011006,2.817783,-0.384217
10915,0.852065,-0.89308,1.299986,-0.167671,-0.046215,0.493276,0.005292,-0.239693,1.126529,-0.68391,-0.011006,-0.354889,-0.384217
19150,-1.560803,1.267921,-1.165333,-0.144756,-0.238985,-0.417421,-0.266212,-0.049654,1.126529,-0.68391,-0.011006,-0.354889,-0.384217
4186,0.672224,-0.705981,1.538566,-0.614744,-0.577098,-0.524088,-0.540378,0.216926,1.126529,-0.68391,-0.011006,-0.354889,-0.384217
16885,-1.410935,0.907754,-0.211016,0.307929,-0.165944,-0.246217,-0.045282,1.303035,-0.887683,-0.68391,-0.011006,-0.354889,2.602693
