In [26]:
%load_ext autoreload
%autoreload 2

import os
import sys

# Absolute path of the project root: the parent of the current working directory.
proyecto_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Register the project root on the import path once, so re-running this cell
# does not pile up duplicate entries.
if proyecto_path not in sys.path:
    sys.path.append(proyecto_path)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [27]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

import pandas as pd
import numpy as np
import json

from src.preprocess import DropNullColumns, TrimmColumns

In [28]:
# Read the project configuration that names the target and feature columns.
with open('../src/config.json', 'r') as config_file:
    config_dict = json.load(config_file)

# Pull the three column groups out of the configuration.
y_column, numeric_features, categorical_features = (
    config_dict[key]
    for key in ('y_column', 'numeric_features', 'categorical_features')
)

# Every model input column: numeric features first, then the categorical ones.
all_features = numeric_features + categorical_features

# Interim dataset; the file is semicolon-delimited.
df = pd.read_csv("../data/interim/data_v2.csv", sep=';')


In [29]:

# Pruning pipeline built from the project's own transformers.
# NOTE(review): presumably drops null-heavy columns and trims the upper tail of
# AREA and PRICE -- confirm against src.preprocess.
cutter = Pipeline(steps=[
    ('dropna', DropNullColumns()),
    ('trimmer', TrimmColumns(['AREA', 'PRICE'], tail='upper')),
])

# Numeric columns: fill missing values with the median, then standardise.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    # ("log_transform", FunctionTransformer(np.log1p, validate=True)),
    ("scaler", StandardScaler()),
])

# Categorical columns: dense one-hot encoding; categories not seen during fit
# are ignored instead of raising.
categorical_transformer = Pipeline(steps=[
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

# Route each feature group to its own sub-pipeline.
X_preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

# Target: natural log followed by standardisation. np.exp is registered as the
# inverse of the log step so predictions can be mapped back with
# inverse_transform.
y_preprocessor = Pipeline(steps=[
    ("log_transform", FunctionTransformer(func=np.log, inverse_func=np.exp)),
    ("scaler", StandardScaler()),
])

In [30]:
# Work on a copy of the loaded frame; the `cutter` pipeline is not applied here.
# df_final = cutter.fit_transform(df)
df_final = df.copy()

# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    df_final[all_features],
    df_final[y_column],
    test_size=0.2,
    random_state=42,
)

# Fit the feature preprocessor on the training split only, then reuse the
# fitted transformer on the test split.
X_train_processed = X_preprocessor.fit_transform(X_train)
X_test_processed = X_preprocessor.transform(X_test)

# NOTE(review): only the training target is transformed; y_test stays in its
# original scale -- confirm that downstream evaluation expects this.
y_train_processed = y_preprocessor.fit_transform(y_train)

In [31]:
# Display the training feature matrix produced by X_preprocessor.fit_transform.
X_train_processed

array([[ 0.13907964,  0.13907964, -0.00963163, ...,  0.        ,
         0.        ,  0.        ],
       [-0.50730151, -0.50730151, -0.01059335, ...,  0.        ,
         0.        ,  0.        ],
       [-0.64581176, -0.64581176, -0.01022832, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.73815192, -0.73815192, -0.01059335, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.459544  ,  1.459544  , -0.00862779, ...,  0.        ,
         0.        ,  0.        ],
       [-0.57193963, -0.57193963, -0.01017216, ...,  0.        ,
         0.        ,  0.        ]], shape=(20406, 35))

In [32]:
# Display the training target produced by y_preprocessor.fit_transform.
y_train_processed

array([[ 0.53556003],
       [-0.45346018],
       [-0.43538137],
       ...,
       [-1.57206254],
       [ 1.24112306],
       [-0.91787056]], shape=(20406, 1))

In [33]:
# Display the test feature matrix produced by X_preprocessor.transform.
X_test_processed

array([[ 0.86856695,  0.86856695, -0.01008792, ...,  0.        ,
         0.        ,  0.        ],
       [-0.2764511 , -0.2764511 , -0.01059335, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.15482146,  1.15482146, -0.01059335, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.56270561, -0.56270561, -0.01020726, ...,  0.        ,
         0.        ,  0.        ],
       [-0.02713265, -0.02713265, -0.00975799, ...,  0.        ,
         0.        ,  0.        ],
       [ 2.2444354 ,  2.2444354 , -0.00803111, ...,  0.        ,
         0.        ,  0.        ]], shape=(5102, 35))

In [35]:
import pickle

# Bundle the processed splits together with the fitted preprocessors so a
# later notebook can train, evaluate and invert the target transform.
# NOTE(review): 'feature_names' holds the raw input column names, which may not
# line up one-to-one with the columns of the processed matrices after one-hot
# encoding -- confirm what consumers expect.
# NOTE(review): 'y_test' is stored untransformed while 'y_train' is transformed.
data_dict = {
    'X_train': X_train_processed,
    'X_test': X_test_processed,
    'y_train': y_train_processed,
    'y_test': y_test,
    'feature_names': all_features,
    'X_preprocessor': X_preprocessor,
    'y_preprocessor': y_preprocessor
}

# Create the output directory on demand so the export does not fail with
# FileNotFoundError on a fresh checkout.
output_path = '../data/processed/ml_data.pkl'
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Pickle files can execute arbitrary code when loaded; only load this artifact
# from a trusted location.
with open(output_path, 'wb') as f:
    pickle.dump(data_dict, f)
