In [19]:
%load_ext autoreload
%autoreload 2

import os
import sys

# Absolute path to the project's root directory (the parent of the current working directory).
proyecto_path = os.path.abspath(
    os.path.join(os.getcwd(), "..")
)

# Register the project root on the import path, unless it is already listed.
if sys.path.count(proyecto_path) == 0:
    sys.path.append(proyecto_path)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, QuantileTransformer
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

import pandas as pd
import numpy as np
import json

from src.preprocess import DropNullColumns, TrimmColumns

In [21]:
# Load the feature/target configuration. JSON is UTF-8 by specification, so the
# encoding is pinned instead of being left to the platform default.
with open('../src/config.json', 'r', encoding='utf-8') as f:
    config_dict = json.load(f)

# Column roles as declared in the config file.
y_column = config_dict['y_column']
numeric_features = config_dict['numeric_features']
categorical_features = config_dict['categorical_features']
# Numeric columns first, then categorical ones.
all_features = numeric_features + categorical_features


# Interim dataset; the file is semicolon-delimited.
df = pd.read_csv("../data/interim/data_v2.csv", sep=';')

# Fail fast, with the full list of offenders, when the config names columns the
# dataset does not contain. y_column may be a single name or a list of names.
configured_columns = list(all_features) + (
    [y_column] if isinstance(y_column, str) else list(y_column)
)
missing_columns = [col for col in configured_columns if col not in df.columns]
if missing_columns:
    raise KeyError(
        f"Columns listed in config.json are missing from data_v2.csv: {missing_columns}"
    )


In [22]:

# Row/column removal pipeline built from the project's custom transformers.
# NOTE(review): presumably drops null columns and trims the upper tail of AREA
# and PRICE — confirm against src/preprocess.py.
cutter = Pipeline([
    ('dropna', DropNullColumns()),
    ('trimmer', TrimmColumns(['AREA', 'PRICE'], tail='upper')),
])

# Numeric features: fill missing values with the column median, then standardize.
# (Disabled alternative: a log1p FunctionTransformer step before scaling.)
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical features: dense one-hot encoding; categories not seen during fit
# are ignored instead of raising at transform time.
categorical_transformer = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

# Route each column group to its transformer; the output concatenates the
# numeric block followed by the one-hot block.
X_preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Target transform: map the target onto a normal distribution via its quantiles.
# random_state is pinned because QuantileTransformer subsamples at random when
# the training set exceeds its `subsample` limit; without a seed the fitted
# quantiles could differ between runs.
# (Disabled alternatives: log1p/expm1 FunctionTransformer, StandardScaler.)
y_preprocessor = Pipeline([
    ("quantile", QuantileTransformer(output_distribution='normal', random_state=42)),
])

In [23]:
# The `cutter` pipeline is currently bypassed (`cutter.fit_transform(df)` is
# disabled), so the split works on an untouched copy of the loaded frame.
df_final = df.copy()

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df_final[all_features],
    df_final[y_column],
    test_size=0.2,
    random_state=42,
)

# Fit the feature preprocessor on the training split only, then reuse the fitted
# transformer on the test split.
X_train_processed = X_preprocessor.fit_transform(X_train)
X_test_processed = X_preprocessor.transform(X_test)

# Fit and apply the target transform on the training target.
# y_test is not transformed in this cell.
y_train_processed = y_preprocessor.fit_transform(y_train)

In [24]:
# Inspect the transformed training features (the notebook displays the last expression).
# NOTE(review): in the recorded output the first two columns hold identical values in
# every displayed row — check numeric_features for a duplicated or redundant column.
X_train_processed

array([[-0.01112705, -0.01112705, -0.01047677, ...,  0.        ,
         0.        ,  0.        ],
       [-0.01234849, -0.01234849, -0.01314327, ...,  0.        ,
         0.        ,  0.        ],
       [-0.01366211, -0.01366211, -0.01651414, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.01262504, -0.01262504, -0.01374701, ...,  0.        ,
         0.        ,  0.        ],
       [-0.01262504, -0.01262504, -0.01374701, ...,  0.        ,
         0.        ,  0.        ],
       [-0.01062004, -0.01062004, -0.00936992, ...,  0.        ,
         0.        ,  0.        ]], shape=(9148, 38))

In [25]:
# Inspect the transformed training target; the recorded output is a single-column 2-D array.
y_train_processed

array([[-0.11694087],
       [-0.07615383],
       [-0.99585764],
       ...,
       [-1.33406614],
       [-2.17595139],
       [ 0.1483936 ]], shape=(9148, 1))

In [26]:
# Inspect the test features, transformed with the preprocessor fitted on the training split.
X_test_processed

array([[-0.00937555, -0.00937555, -0.0066531 , ...,  0.        ,
         0.        ,  0.        ],
       [-0.01343165, -0.01343165, -0.01550791, ...,  0.        ,
         0.        ,  0.        ],
       [-0.00578037, -0.00578037, -0.01651414, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.01163406, -0.01163406, -0.01158362, ...,  0.        ,
         0.        ,  0.        ],
       [-0.01262504, -0.01262504, -0.01651414, ...,  0.        ,
         0.        ,  0.        ],
       [-0.00985952, -0.00985952, -0.00770964, ...,  0.        ,
         0.        ,  0.        ]], shape=(2287, 38))

In [27]:
import os
import pickle

# Bundle the processed splits together with the fitted preprocessors, so that
# downstream code can reuse the exact same transforms.
data_dict = {
    'X_train': X_train_processed,
    'X_test': X_test_processed,
    'y_train': y_train_processed,
    # Stored as returned by train_test_split; y_preprocessor is not applied to it.
    'y_test': y_test,
    # Raw input column names (numeric + categorical) as listed in the config.
    'feature_names': all_features,
    # Names of the columns actually present in X_train / X_test after
    # transformation (one-hot encoding expands the categorical columns).
    'processed_feature_names': list(X_preprocessor.get_feature_names_out()),
    'X_preprocessor': X_preprocessor,
    'y_preprocessor': y_preprocessor
}

# Create the output directory if needed so the save also works on a fresh checkout.
output_path = '../data/processed/ml_data.pkl'
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# NOTE: pickle files can execute arbitrary code when loaded; only load this
# artifact from a trusted location.
with open(output_path, 'wb') as f:
    pickle.dump(data_dict, f)
