In [10]:
# import libraries
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler


## utils
def p(tag, value):
    """Print a labelled debug line showing the type and content of ``value``.

    Args:
        tag: Label printed in front of the type/value information.
        value: Object to inspect; both ``type(value)`` and the object itself
            are printed.
    """
    # print() joins the pieces with single spaces, after a leading blank line
    pieces = ("\n", tag, ":", "type", type(value), ",value", value)
    print(*pieces)


## Load the housing dataset from the local CSV file
housing = pd.read_csv("./data/housing.csv")

## Separate the target column (y) from the remaining feature columns (X)
y = housing["median_house_value"]
X = housing.drop("median_house_value", axis=1)
p("data X value:", X)
p("data y value:", y)

### Stratified sampling: the dataset is divided into groups called strata,
### and the requested percentage of samples is taken from every stratum
### instead of only from the dataset as a whole.
### Here every row is assigned to one of 5 categories based on `median_income`.
stratify = pd.cut(
    x=housing["median_income"],
    # Sequence of bin edges; np.inf leaves the last bin unbounded above
    bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],
    # One label per bin: 1, 2, 3, 4, 5
    labels=list(range(1, 6)),
)
p("stratify data:", stratify)

# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
## Goal --> hold out a test set so that over-fitting can be detected
X_train, X_test, y_train, y_test = train_test_split(
    X,  # features
    y,  # target
    test_size=0.2,  # 20% of the rows go to the test set
    random_state=42,  # fixed seed: controls the shuffling applied before the split
    stratify=stratify,  # keep the income-category proportions in both subsets
)
p("split X_train:", X_train)
p("split X_test:", X_test)
p("split y_train:", y_train)
p("split y_test:", y_test)

## Used later to check the effect of the imputation:
### boolean mask that is True for every training row containing at least one null value
null_rows_idx = X_train.isna().any(axis="columns")
p("split null_rows_idx", null_rows_idx)

# k is derived from the training-set size: square root of the row count, truncated to an integer
k_value = np.sqrt(len(X_train)).astype(int)
p("k_value:", k_value)

## OHE
### One-hot encode the categorical variable `ocean_proximity`.
cat_encoder = OneHotEncoder(sparse_output=False)
cat_encoder.set_output(transform="pandas")  # make the encoder return a DataFrame
X_train_ocean_proximity_ohe = cat_encoder.fit_transform(X_train[["ocean_proximity"]])
p("X_train_ocean_proximity_ohe", X_train_ocean_proximity_ohe)

### Rebuild the full feature table.
## The encoder only received `ocean_proximity`, so the remaining columns are
## re-attached by concatenating both DataFrames column-wise.
X_train_rest = X_train.drop("ocean_proximity", axis=1)
housing_cat_ohe = pd.concat(
    [X_train_rest, X_train_ocean_proximity_ohe],
    axis="columns",
)
p("housing_cat_ohe", housing_cat_ohe)

## Standardization
## After one-hot encoding, every column of `housing_cat_ohe` is expected to be
## numeric, so no `select_dtypes(include=[np.number])` filtering is applied.
## NOTE(review): the one-hot columns are standardized as well — confirm this is intended.
scaler = StandardScaler()
scaler.set_output(transform="pandas")  # so that the result is a DataFrame
X_train_scaled_and_ohe = scaler.fit_transform(housing_cat_ohe)
p("X_train_scaled_and_ohe", X_train_scaled_and_ohe)

## Impute missing values
## The fitted imputer is kept in a variable (like `cat_encoder` and `scaler`)
## so it can later transform other data (e.g. the test set) using what was
## learned from the training set, instead of being discarded after this call.
imputer = KNNImputer(n_neighbors=k_value).set_output(transform="pandas")
X_train_imputed_a = imputer.fit_transform(X_train_scaled_and_ohe)
p("X_train_imputed_a", X_train_imputed_a)

## Check that no null values remain (prints False when every gap was filled)
print(X_train_imputed_a.isna().any().any())

## Inspect the imputed values: show the first rows that originally had nulls
X_train_imputed_a.loc[null_rows_idx].head()


 data X value: : type <class 'pandas.core.frame.DataFrame'> ,value        longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0        -122.23     37.88                41.0        880.0           129.0   
1        -122.22     37.86                21.0       7099.0          1106.0   
2        -122.24     37.85                52.0       1467.0           190.0   
3        -122.25     37.85                52.0       1274.0           235.0   
4        -122.25     37.85                52.0       1627.0           280.0   
...          ...       ...                 ...          ...             ...   
20635    -121.09     39.48                25.0       1665.0           374.0   
20636    -121.21     39.49                18.0        697.0           150.0   
20637    -121.22     39.43                17.0       2254.0           485.0   
20638    -121.32     39.43                18.0       1860.0           409.0   
20639    -121.24     39.37                16.0       2785.0    

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
1606,-1.251077,1.048079,-0.211016,0.151734,0.080575,-0.533051,0.343342,-0.494985,-0.887683,-0.68391,-0.011006,2.817783,-0.384217
10915,0.852065,-0.89308,1.299986,-0.167671,-0.046215,0.493276,0.005292,-0.239693,1.126529,-0.68391,-0.011006,-0.354889,-0.384217
19150,-1.560803,1.267921,-1.165333,-0.144756,-0.238985,-0.417421,-0.266212,-0.049654,1.126529,-0.68391,-0.011006,-0.354889,-0.384217
4186,0.672224,-0.705981,1.538566,-0.614744,-0.577098,-0.524088,-0.540378,0.216926,1.126529,-0.68391,-0.011006,-0.354889,-0.384217
16885,-1.410935,0.907754,-0.211016,0.307929,-0.165944,-0.246217,-0.045282,1.303035,-0.887683,-0.68391,-0.011006,-0.354889,2.602693
