# Pipeline


In [126]:
# Imports
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.compose import (
    make_column_selector,
    make_column_transformer,
)
from sklearn.pipeline import make_pipeline

# Load the housing dataset.
df = pd.read_csv("../data/housing.csv")

# Bucket median income into five bands; the bands serve only as the
# stratification key for the split below.
income_band = pd.cut(
    df["median_income"],
    bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],
    labels=[1, 2, 3, 4, 5],
)

# Hold out 20% of the rows as a test set, stratified by income band, with a
# fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns="median_house_value"),
    df["median_house_value"],
    test_size=0.2,
    stratify=income_band,
    random_state=42,
)

In [127]:
# Move DF columns
# ---------------------------------------------------------------------------------#


def move_columns_to_end_by_len(df: pd.DataFrame, num_columns: int) -> pd.DataFrame:
    """Rotate the first ``num_columns`` columns of ``df`` to the end.

    Args:
        df: Frame whose columns are reordered.
        num_columns: How many leading columns to place behind the rest.

    Returns:
        A frame selected from ``df`` with the rotated column order. When
        ``num_columns`` is greater than or equal to the number of columns, a
        message is printed and ``df`` itself is returned unchanged.
    """
    column_names = list(df.columns)

    # Guard: the requested count covers every column, so there is nothing to rotate.
    if len(column_names) <= num_columns:
        print(
            "El número de columnas a mover excede el número total de columnas en el DataFrame."
        )
        return df

    leading = column_names[:num_columns]
    trailing = column_names[num_columns:]
    return df[trailing + leading]


def move_columns_to_end_by_name(
    df: pd.DataFrame, columns_to_move: list
) -> pd.DataFrame:
    """Move the named columns to the end of ``df``, keeping the rest in order.

    Args:
        df: Frame whose columns are reordered.
        columns_to_move: Column labels to place last, in the order given.
            Any iterable of labels is accepted (list, tuple, ``pd.Index``);
            a label that appears more than once is moved only once.

    Returns:
        A frame selected from ``df`` with the requested columns last. If any
        requested label is missing from ``df``, a message listing the missing
        labels is printed and ``df`` itself is returned unchanged.
    """
    # Materialise the labels once and drop repeats (first occurrence wins).
    # A tuple or pd.Index cannot be appended to a list with ``+`` (the Index
    # case performs element-wise addition instead of concatenation), and a
    # repeated label would otherwise duplicate that column in the result.
    requested = list(dict.fromkeys(columns_to_move))

    # Validate every requested label before touching the column order.
    invalid_columns = [col for col in requested if col not in df.columns]
    if invalid_columns:
        print(f"Las siguientes columnas no existen en el DataFrame: {invalid_columns}")
        return df

    # Set lookup keeps the filter linear in the number of columns.
    requested_set = set(requested)
    kept = [col for col in df.columns if col not in requested_set]
    return df[kept + requested]

In [128]:
# Attempt 1
# ---------------------------------------------------------------------------------#
# Work on a copy of the training features.
df = X_train.copy()

# Neighbour count passed to the KNN imputer: truncated square root of the row count.
k_value = int(np.sqrt(len(df)))

# Names of the categorical (object-dtype) columns.
categorical_columns = df.select_dtypes(include="object").columns
categorical_columns

Index(['ocean_proximity'], dtype='object')

In [129]:
# Names of the numeric (float64 / int64) columns.
numerical_columns = df.select_dtypes(include=["float64", "int64"]).columns

# Preprocessing pipeline, in order:
#   1. one-hot encode the object-dtype columns and pass the others through;
#   2. standardise every resulting column;
#   3. fill missing values with a k-nearest-neighbours imputer.
# NOTE(review): step 2 also rescales the one-hot indicator columns — confirm
# that this is intended.
encode_categoricals = make_column_transformer(
    (OneHotEncoder(), make_column_selector(dtype_include=object)),
    remainder="passthrough",
)
pipeline = make_pipeline(
    encode_categoricals,
    StandardScaler(),
    KNNImputer(n_neighbors=k_value),
)
pipeline

In [130]:
# Fit the pipeline on the training features and transform them in one step.
# NOTE(review): despite the `df_` prefix this holds the pipeline's raw array
# output; the next cell rebinds the same name to a labelled DataFrame.
df_transformed = pipeline.fit_transform(df)

df_transformed

array([[-0.88768266,  1.46218   , -0.0110063 , ...,  0.73260236,
         0.55628602, -0.8936472 ],
       [-0.88768266, -0.68391033, -0.0110063 , ...,  0.53361152,
         0.72131799,  1.292168  ],
       [-0.88768266,  1.46218   , -0.0110063 , ..., -0.67467519,
        -0.52440722, -0.52543365],
       ...,
       [ 1.1265287 , -0.68391033, -0.0110063 , ..., -0.86201341,
        -0.86511838, -0.36547546],
       [ 1.1265287 , -0.68391033, -0.0110063 , ..., -0.18974707,
         0.01061579,  0.16826095],
       [-0.88768266,  1.46218   , -0.0110063 , ..., -0.71232211,
        -0.79857323, -0.390569  ]])

In [131]:
# Fit the pipeline on the training features and keep the raw array output.
array_transformed = pipeline.fit_transform(df)

# Rebuild a labelled DataFrame from the array. The names are assembled by hand:
# the one-hot columns come first, followed by the numeric passthrough columns.
# NOTE(review): this assumes the passthrough columns are exactly
# `numerical_columns`, in that order — confirm if other dtypes are ever added.
fitted_encoder = pipeline.named_steps["columntransformer"].named_transformers_[
    "onehotencoder"
]
ohe_columns = fitted_encoder.get_feature_names_out(input_features=categorical_columns)
all_columns = np.concatenate([ohe_columns, numerical_columns])

# No index is passed, so the new frame gets a default 0..n-1 index rather than
# the index of the training split.
df_transformed = pd.DataFrame(data=array_transformed, columns=all_columns)

df_transformed

Unnamed: 0,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
0,-0.887683,1.46218,-0.011006,-0.354889,-0.384217,-0.941350,1.347438,0.027564,0.584777,0.635123,0.732602,0.556286,-0.893647
1,-0.887683,-0.68391,-0.011006,-0.354889,2.602693,1.171782,-1.192440,-1.722018,1.261467,0.775677,0.533612,0.721318,1.292168
2,-0.887683,1.46218,-0.011006,-0.354889,-0.384217,0.267581,-0.125972,1.220460,-0.469773,-0.545045,-0.674675,-0.524407,-0.525434
3,-0.887683,-0.68391,-0.011006,-0.354889,2.602693,1.221738,-1.351474,-0.370069,-0.348652,-0.038567,-0.467617,-0.037297,-0.865929
4,1.126529,-0.68391,-0.011006,-0.354889,-0.384217,0.437431,-0.635818,-0.131489,0.427179,0.269198,0.374060,0.220898,0.325752
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16507,1.126529,-0.68391,-0.011006,-0.354889,-0.384217,1.251711,-1.220505,-1.165333,1.890456,1.686854,0.543471,1.341519,0.637374
16508,-0.887683,1.46218,-0.011006,-0.354889,-0.384217,-0.921368,1.342761,-1.085806,2.468471,2.149712,3.002174,2.451492,-0.557509
16509,1.126529,-0.68391,-0.011006,-0.354889,-0.384217,-1.570794,1.310018,1.538566,-0.895802,-0.894007,-0.862013,-0.865118,-0.365475
16510,1.126529,-0.68391,-0.011006,-0.354889,-0.384217,-1.560803,1.249211,-1.165333,0.249005,0.109257,-0.189747,0.010616,0.168261


In [132]:
# Put the numeric features first and the one-hot columns last.
original_order = list(df_transformed.columns)
n_ohe = len(ohe_columns)
cols = [*original_order[n_ohe:], *original_order[:n_ohe]]
df_transformed_sort = df_transformed[cols]

In [133]:
# Display the reordered frame (numeric features first, one-hot columns last).
df_transformed_sort

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-0.941350,1.347438,0.027564,0.584777,0.635123,0.732602,0.556286,-0.893647,-0.887683,1.46218,-0.011006,-0.354889,-0.384217
1,1.171782,-1.192440,-1.722018,1.261467,0.775677,0.533612,0.721318,1.292168,-0.887683,-0.68391,-0.011006,-0.354889,2.602693
2,0.267581,-0.125972,1.220460,-0.469773,-0.545045,-0.674675,-0.524407,-0.525434,-0.887683,1.46218,-0.011006,-0.354889,-0.384217
3,1.221738,-1.351474,-0.370069,-0.348652,-0.038567,-0.467617,-0.037297,-0.865929,-0.887683,-0.68391,-0.011006,-0.354889,2.602693
4,0.437431,-0.635818,-0.131489,0.427179,0.269198,0.374060,0.220898,0.325752,1.126529,-0.68391,-0.011006,-0.354889,-0.384217
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16507,1.251711,-1.220505,-1.165333,1.890456,1.686854,0.543471,1.341519,0.637374,1.126529,-0.68391,-0.011006,-0.354889,-0.384217
16508,-0.921368,1.342761,-1.085806,2.468471,2.149712,3.002174,2.451492,-0.557509,-0.887683,1.46218,-0.011006,-0.354889,-0.384217
16509,-1.570794,1.310018,1.538566,-0.895802,-0.894007,-0.862013,-0.865118,-0.365475,1.126529,-0.68391,-0.011006,-0.354889,-0.384217
16510,-1.560803,1.249211,-1.165333,0.249005,0.109257,-0.189747,0.010616,0.168261,1.126529,-0.68391,-0.011006,-0.354889,-0.384217
