In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# Load the raw training and test tables. Both paths are relative to the
# notebook's working directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/raw/train.csv", "../data/raw/test.csv")
)


In [3]:
# Partition the training columns by dtype: int64/float64 columns are treated as
# numeric features, object-dtype columns as categorical features.
# Nothing is excluded at this point, so a numeric target column is still listed.
num_features = list(train.select_dtypes(include=['int64', 'float64']).columns)
cat_features = list(train.select_dtypes(include='object').columns)


In [4]:
# Exclude the target from the numeric feature list.
# A filter is used instead of list.remove() so the cell is idempotent:
# re-running it (or running it when 'SalePrice' is already absent) is a no-op
# rather than a ValueError.
num_features = [col for col in num_features if col != 'SalePrice']

In [5]:
# Missing-value strategies: the numeric imputer fills with the column median,
# the categorical imputer fills with the most frequent value.
num_imputer, cat_imputer = (
    SimpleImputer(strategy='median'),
    SimpleImputer(strategy='most_frequent'),
)


In [None]:
# One-hot encoder for the categorical branch. It returns a dense array, and
# categories that were not seen during fit are ignored at transform time
# instead of raising an error.
cat_encoder = OneHotEncoder(
    sparse_output=False,
    handle_unknown='ignore',
)

In [9]:
# Standardizer used by the numeric pipeline. The two flags below are the
# library defaults, written out so the centring/scaling behaviour is explicit.
scaler = StandardScaler(with_mean=True, with_std=True)


In [10]:
# Numeric branch: impute missing values first, then standardize.
num_pipeline = Pipeline(steps=[
    ('imputer', num_imputer),
    ('scaler', scaler),
])


In [11]:
# Categorical branch: impute missing values first, then one-hot encode.
# The step name 'encoder' is looked up later to recover the output column names.
cat_pipeline = Pipeline(steps=[
    ('imputer', cat_imputer),
    ('encoder', cat_encoder),
])


In [12]:
# Route each column list to its pipeline. Output columns follow the order of
# the transformers below: numeric block first, then the encoded categoricals.
# `remainder` is left at its default, so columns in neither list are dropped.
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features),
])


In [13]:
# Separate the target from the predictors. The test frame is copied so that
# X_test is a distinct object from `test`.
y_train = train['SalePrice']
X_train = train.drop(columns='SalePrice')
X_test = test.copy()


In [14]:
# Fit the preprocessing parameters on the training predictors only, then apply
# the already-fitted transformer to the test predictors. The test set is never
# used for fitting, so no test information reaches the learned statistics.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

In [15]:
# Recover column names for the transformed matrices. The fitted one-hot encoder
# reports the names of the expanded categorical columns; the numeric columns
# keep their original names and come first, matching the transformer order.
fitted_encoder = preprocessor.named_transformers_['cat']['encoder']
cat_encoded_columns = fitted_encoder.get_feature_names_out(cat_features)
preprocessed_columns = [*num_features, *cat_encoded_columns]

In [16]:
# Wrap both transformed matrices in DataFrames that share the same labelled
# columns. Each frame gets a fresh default index.
X_train_df, X_test_df = (
    pd.DataFrame(matrix, columns=preprocessed_columns)
    for matrix in (X_train_preprocessed, X_test_preprocessed)
)

In [17]:
# Attach the target column to the training frame so features and label are
# stored together. `.values` assigns positionally, bypassing index alignment.
X_train_df['SalePrice'] = y_train.values

# Persist both frames without the index column. The file names are relative,
# so the CSVs are written to the current working directory.
X_train_df.to_csv(path_or_buf='train_preprocessed.csv', index=False)
X_test_df.to_csv(path_or_buf='test_preprocessed.csv', index=False)