# Pipeline


In [287]:
# import data
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.compose import (
    make_column_selector,
    make_column_transformer,
    ColumnTransformer,
)
from sklearn.pipeline import make_pipeline, Pipeline

## assigned data
df = pd.read_csv("../data/housing.csv")
###  split data to train
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns="median_house_value"),  # features
    df["median_house_value"],  # target
    stratify=pd.cut(
        df["median_income"],
        bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],
        labels=[1, 2, 3, 4, 5],
    ),
    test_size=0.2,
    random_state=42,
)

In [288]:
# Move DF columns
# ---------------------------------------------------------------------------------#


def move_columns_to_end_by_len(df: pd.DataFrame, num_columns: int) -> pd.DataFrame:
    # Revisar si el número de columnas a mover es válido
    if num_columns >= len(df.columns):
        print(
            "El número de columnas a mover excede el número total de columnas en el DataFrame."
        )
        return df

    # Reordenar las columnas
    cols = list(df.columns)
    cols = cols[num_columns:] + cols[:num_columns]
    df = df[cols]

    return df


def move_columns_to_end_by_name(
    df: pd.DataFrame, columns_to_move: list
) -> pd.DataFrame:
    # Revisar si los nombres de columnas son válidos
    invalid_columns = [col for col in columns_to_move if col not in df.columns]
    if invalid_columns:
        print(f"Las siguientes columnas no existen en el DataFrame: {invalid_columns}")
        return df

    # Reordenar las columnas
    cols = [col for col in df.columns if col not in columns_to_move] + columns_to_move
    df = df[cols]
    return df

In [289]:
# try 1
# ---------------------------------------------------------------------------------#


def preprocess_and_impute(df):
    k_value = int(np.sqrt(df.shape[0]))
    # Identificar las columnas categóricas y numéricas
    categorical_columns = df.select_dtypes(include=["object"]).columns
    numerical_columns = df.select_dtypes(include=["float64", "int64"]).columns

    # Crear un ColumnTransformer que aplique OneHotEncoder a las columnas categóricas
    preprocessor = ColumnTransformer(
        transformers=[("cat", OneHotEncoder(), categorical_columns)],
        remainder="passthrough",  # Dejar pasar las columnas numéricas
    )

    # Crear un Pipeline que incluya el preprocesamiento y luego aplique StandardScaler
    pipeline = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            (
                "scaler",
                StandardScaler(with_mean=False),
            ),  # with_mean=False para manejar datos dispersos
        ]
    )

    # Aplicar el Pipeline al DataFrame
    df_transformed = pipeline.fit_transform(df)

    # Aplicar KNNImputer al resultado del Pipeline
    imputer = KNNImputer(n_neighbors=k_value)
    df_imputed = imputer.fit_transform(df_transformed)

    # Convertir el resultado imputado de nuevo a un DataFrame
    # Necesitamos obtener los nombres de las nuevas columnas después de OHE y escalar
    ohe_columns = (
        pipeline.named_steps["preprocessor"]
        .named_transformers_["cat"]
        .get_feature_names_out()
    )
    all_columns = np.concatenate([ohe_columns, numerical_columns])

    imputed_df = pd.DataFrame(df_imputed, columns=all_columns)

    return move_columns_to_end_by_len(imputed_df, len(ohe_columns))

In [290]:
try1 = preprocess_and_impute(X_train)
try1

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-60.676379,18.017697,2.306266,1.811208,1.931405,2.005146,1.879235,1.141074,0.000000,2.14609,0.0,0.0,0.00000
1,-58.563246,15.477819,0.556685,2.487897,2.071959,1.806156,2.044267,3.326889,0.000000,0.00000,0.0,0.0,2.98691
2,-59.467447,16.544287,3.499162,0.756657,0.751236,0.597869,0.798542,1.509287,0.000000,2.14609,0.0,0.0,0.00000
3,-58.513290,15.318785,1.908634,0.877779,1.257715,0.804927,1.285652,1.168792,0.000000,0.00000,0.0,0.0,2.98691
4,-59.297597,16.034441,2.147213,1.653610,1.565480,1.646604,1.543847,2.360473,2.014211,0.00000,0.0,0.0,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16507,-58.483317,15.449754,1.113370,3.116886,2.983136,1.816015,2.664468,2.672094,2.014211,0.00000,0.0,0.0,0.00000
16508,-60.656397,18.013020,1.192896,3.694901,3.445994,4.274718,3.774441,1.477211,0.000000,2.14609,0.0,0.0,0.00000
16509,-61.305823,17.980277,3.817268,0.330628,0.402275,0.410531,0.457831,1.669245,2.014211,0.00000,0.0,0.0,0.00000
16510,-61.295831,17.919470,1.113370,1.475435,1.405539,1.082797,1.333565,2.202982,2.014211,0.00000,0.0,0.0,0.00000


In [291]:
# try 2
# ---------------------------------------------------------------------------------#


def preprocess_and_impute2(df):
    ## Definir el número k para KNNImputer
    k_value = int(np.sqrt(df.shape[0]))
    all_numerical_features = df.select_dtypes(include=["int64", "float64"]).columns
    # print(all_numerical_features)
    all_categorical_features = df.select_dtypes(include=[object]).columns
    # print(all_categorical_features)

    numerical_features = [value for value in all_numerical_features]
    categorical_features = [value for value in all_categorical_features]

    print(numerical_features)
    print(categorical_features)
    transformer = make_column_transformer(
        (
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            make_column_selector(dtype_include="object"),
        ),  # Aplica OneHotEncoder a las columnas categóricas
        (
            StandardScaler(),
            make_column_selector(dtype_exclude="object"),
        ),
        # Aplica StandardScaler a las columnas numéricas
        remainder="passthrough",  # Conserva las columnas que no se transforman
        sparse_threshold=0.1,
        n_jobs=3,
        # remainder="drop",  # Conserva las columnas que no se transforman
    )
    # Crear un pipeline que incluya el ColumnTransformer y el KNNImputer
    pipeline = make_pipeline(transformer, KNNImputer(n_neighbors=k_value))

    # Aplicar el pipeline a los datos de entrenamiento
    X_train_imputed_b = pipeline.fit_transform(df)
    X_train_imputed_b
    # Convertir el resultado a DataFrame
    imputed_df = pd.DataFrame(
        X_train_imputed_b,
        columns=transformer.get_feature_names_out(),
        index=df.index,
    )
    # Obtener los nombres de las columnas de OneHotEncoder
    ohe_columns = transformer.transformers_[0][1].get_feature_names_out()
    return move_columns_to_end_by_len(imputed_df, len(ohe_columns))

In [292]:
try2 = preprocess_and_impute2(X_train)
try2

['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income']
['ocean_proximity']


Unnamed: 0,standardscaler__longitude,standardscaler__latitude,standardscaler__housing_median_age,standardscaler__total_rooms,standardscaler__total_bedrooms,standardscaler__population,standardscaler__households,standardscaler__median_income,onehotencoder__ocean_proximity_<1H OCEAN,onehotencoder__ocean_proximity_INLAND,onehotencoder__ocean_proximity_ISLAND,onehotencoder__ocean_proximity_NEAR BAY,onehotencoder__ocean_proximity_NEAR OCEAN
12655,-0.941350,1.347438,0.027564,0.584777,0.635123,0.732602,0.556286,-0.893647,0.0,1.0,0.0,0.0,0.0
15502,1.171782,-1.192440,-1.722018,1.261467,0.775677,0.533612,0.721318,1.292168,0.0,0.0,0.0,0.0,1.0
2908,0.267581,-0.125972,1.220460,-0.469773,-0.545045,-0.674675,-0.524407,-0.525434,0.0,1.0,0.0,0.0,0.0
14053,1.221738,-1.351474,-0.370069,-0.348652,-0.038567,-0.467617,-0.037297,-0.865929,0.0,0.0,0.0,0.0,1.0
20496,0.437431,-0.635818,-0.131489,0.427179,0.269198,0.374060,0.220898,0.325752,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
15174,1.251711,-1.220505,-1.165333,1.890456,1.686854,0.543471,1.341519,0.637374,1.0,0.0,0.0,0.0,0.0
12661,-0.921368,1.342761,-1.085806,2.468471,2.149712,3.002174,2.451492,-0.557509,0.0,1.0,0.0,0.0,0.0
19263,-1.570794,1.310018,1.538566,-0.895802,-0.894007,-0.862013,-0.865118,-0.365475,1.0,0.0,0.0,0.0,0.0
19140,-1.560803,1.249211,-1.165333,0.249005,0.109257,-0.189747,0.010616,0.168261,1.0,0.0,0.0,0.0,0.0


In [293]:
# try 3
# ---------------------------------------------------------------------------------#


def preprocess_and_impute3(df):
    ## Definir el número k para KNNImputer
    k_value = int(np.sqrt(df.shape[0]))
    # Identificar las columnas categóricas y numéricas
    categorical_columns = df.select_dtypes(include=["object"]).columns
    numerical_columns = df.select_dtypes(include=["float64", "int64"]).columns

    # Crear un ColumnTransformer con make_column_transformer
    preprocessor = make_column_transformer(
        (OneHotEncoder(), categorical_columns),
        remainder="passthrough",  # Dejar pasar las columnas numéricas
    )

    # Crear un Pipeline con make_pipeline
    pipeline = make_pipeline(
        preprocessor,
        StandardScaler(with_mean=False),  # with_mean=False para manejar datos dispersos
    )

    # Aplicar el Pipeline al DataFrame
    df_transformed = pipeline.fit_transform(df)

    # Aplicar KNNImputer al resultado del Pipeline
    imputer = KNNImputer(n_neighbors=k_value)
    df_imputed = imputer.fit_transform(df_transformed)

    # Obtener los nombres de las nuevas columnas después de OHE y escalar
    ohe_columns = preprocessor.named_transformers_[
        "onehotencoder"
    ].get_feature_names_out()
    all_columns = np.concatenate([ohe_columns, numerical_columns])

    # Convertir el resultado imputado de nuevo a un DataFrame
    imputed_df = pd.DataFrame(df_imputed, columns=all_columns)

    print(type(ohe_columns))
    print(ohe_columns)
    return move_columns_to_end_by_len(imputed_df, len(ohe_columns))
    # mc2 = move_columns_to_end_by_name(imputed_df, ohe_columns.tolist())

In [294]:
try3 = preprocess_and_impute3(X_train)
try3

<class 'numpy.ndarray'>
['ocean_proximity_<1H OCEAN' 'ocean_proximity_INLAND'
 'ocean_proximity_ISLAND' 'ocean_proximity_NEAR BAY'
 'ocean_proximity_NEAR OCEAN']


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-60.676379,18.017697,2.306266,1.811208,1.931405,2.005146,1.879235,1.141074,0.000000,2.14609,0.0,0.0,0.00000
1,-58.563246,15.477819,0.556685,2.487897,2.071959,1.806156,2.044267,3.326889,0.000000,0.00000,0.0,0.0,2.98691
2,-59.467447,16.544287,3.499162,0.756657,0.751236,0.597869,0.798542,1.509287,0.000000,2.14609,0.0,0.0,0.00000
3,-58.513290,15.318785,1.908634,0.877779,1.257715,0.804927,1.285652,1.168792,0.000000,0.00000,0.0,0.0,2.98691
4,-59.297597,16.034441,2.147213,1.653610,1.565480,1.646604,1.543847,2.360473,2.014211,0.00000,0.0,0.0,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16507,-58.483317,15.449754,1.113370,3.116886,2.983136,1.816015,2.664468,2.672094,2.014211,0.00000,0.0,0.0,0.00000
16508,-60.656397,18.013020,1.192896,3.694901,3.445994,4.274718,3.774441,1.477211,0.000000,2.14609,0.0,0.0,0.00000
16509,-61.305823,17.980277,3.817268,0.330628,0.402275,0.410531,0.457831,1.669245,2.014211,0.00000,0.0,0.0,0.00000
16510,-61.295831,17.919470,1.113370,1.475435,1.405539,1.082797,1.333565,2.202982,2.014211,0.00000,0.0,0.0,0.00000


In [295]:
def preprocess_and_impute4(df):
    ## Definir el número k para KNNImputer
    k_value = int(np.sqrt(df.shape[0]))
    categorical_columns = df.select_dtypes(include=["object"]).columns
    numerical_columns = df.select_dtypes(include=["float64", "int64"]).columns

    preprocessor = make_column_transformer(
        (OneHotEncoder(), categorical_columns), (StandardScaler(), numerical_columns)
    )

    pipe = make_pipeline(preprocessor, KNNImputer(n_neighbors=k_value))

    df_imputed = pipe.fit_transform(df)

    ohe_columns = preprocessor.named_transformers_[
        "onehotencoder"
    ].get_feature_names_out()
    all_columns = np.concatenate([ohe_columns, numerical_columns])

    imputed_df = pd.DataFrame(df_imputed, columns=all_columns)

    return move_columns_to_end_by_len(imputed_df, len(ohe_columns))

In [296]:
try4 = preprocess_and_impute4(X_train)
try4

Unnamed: 0,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
0,0.0,1.0,0.0,0.0,0.0,-0.941350,1.347438,0.027564,0.584777,0.635123,0.732602,0.556286,-0.893647
1,0.0,0.0,0.0,0.0,1.0,1.171782,-1.192440,-1.722018,1.261467,0.775677,0.533612,0.721318,1.292168
2,0.0,1.0,0.0,0.0,0.0,0.267581,-0.125972,1.220460,-0.469773,-0.545045,-0.674675,-0.524407,-0.525434
3,0.0,0.0,0.0,0.0,1.0,1.221738,-1.351474,-0.370069,-0.348652,-0.038567,-0.467617,-0.037297,-0.865929
4,1.0,0.0,0.0,0.0,0.0,0.437431,-0.635818,-0.131489,0.427179,0.269198,0.374060,0.220898,0.325752
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16507,1.0,0.0,0.0,0.0,0.0,1.251711,-1.220505,-1.165333,1.890456,1.686854,0.543471,1.341519,0.637374
16508,0.0,1.0,0.0,0.0,0.0,-0.921368,1.342761,-1.085806,2.468471,2.149712,3.002174,2.451492,-0.557509
16509,1.0,0.0,0.0,0.0,0.0,-1.570794,1.310018,1.538566,-0.895802,-0.894007,-0.862013,-0.865118,-0.365475
16510,1.0,0.0,0.0,0.0,0.0,-1.560803,1.249211,-1.165333,0.249005,0.109257,-0.189747,0.010616,0.168261
