In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys

# Absolute path to the project's main directory (the parent of the current working directory)
proyecto_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Register the path on the import search path, skipping it when already present
# so that re-running this cell does not add duplicates
if sys.path.count(proyecto_path) == 0:
    sys.path.append(proyecto_path)

In [2]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, QuantileTransformer
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

import pandas as pd
import numpy as np
import json

from src.preprocess import DropNullColumns, TrimmColumns, clip_outliers

In [3]:
# Load the project configuration. The encoding is pinned to UTF-8 (the JSON
# interchange encoding) so that non-ASCII feature names are decoded the same way
# on every platform instead of depending on the locale's default encoding.
with open('../src/config.json', 'r', encoding='utf-8') as f:
    config_dict = json.load(f)

# Target and feature selections come straight from the config file
y_column = config_dict['y_column']
numeric_features = config_dict['numeric_features']
categorical_features = config_dict['categorical_features']
# Numeric features first, then categorical ones; the same order is used when
# selecting the model inputs from the frame
all_features = numeric_features + categorical_features


# Interim dataset; the file is semicolon-separated
df = pd.read_csv("../data/interim/data_v2.csv", sep=';')


In [4]:

# Deletion ("cutter") pipeline built from the project's custom transformers in
# src.preprocess; their exact behavior is defined there.
# NOTE(review): the only visible use of `cutter` (in the split cell) is commented
# out, so this pipeline is currently defined but not applied — confirm intent.
cutter = Pipeline([
    ('dropna', DropNullColumns()),
    ('trimmer', TrimmColumns(['AREA', 'PRICE'], tail='upper')),
])

# Numeric inputs: fill missing values with the column median, then standardize
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    # ("log_transform", FunctionTransformer(np.log1p, validate=True)),
    ("scaler", StandardScaler())
])

# Categorical inputs: dense one-hot encoding; categories unseen during fit are
# ignored at transform time instead of raising
categorical_transformer = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

# Route each feature group (as listed in the config) to its own transformer
X_preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Target pipeline: clip -> log1p -> quantile mapping to a normal distribution.
y_preprocessor = Pipeline([
    # clip_outliers comes from src.preprocess and is called with percentile=95
    # (presumably caps values at that percentile — confirm against its source).
    # With inverse_func=None this step is an identity on inverse_transform, so
    # the clipping is not undone when predictions are mapped back.
    ('winsorize', FunctionTransformer(clip_outliers, inverse_func=None, kw_args={'percentile': 95})),
    # log1p on the way in, expm1 on the way back
    ("log_transform", FunctionTransformer(np.log1p, inverse_func=np.expm1)),
    # random_state is fixed (same seed as the train/test split) so the fitted
    # quantiles stay reproducible even if the transformer has to subsample
    ("quantile", QuantileTransformer(output_distribution='normal', random_state=42)),
    #("scaler", StandardScaler())
])

In [5]:
# The cutter pipeline is bypassed for now; work on a copy of the loaded frame.
# df_final = cutter.fit_transform(df)
df_final = df.copy()

# Separate model inputs from the target, then hold out 20% of the rows
model_inputs = df_final[all_features]
model_target = df_final[y_column]
X_train, X_test, y_train, y_test = train_test_split(
    model_inputs,
    model_target,
    test_size=0.2,
    random_state=42,
)

# Fit the feature preprocessor on the training split only, then reuse the
# fitted transformer for the held-out split
X_train_processed = X_preprocessor.fit_transform(X_train)
X_test_processed = X_preprocessor.transform(X_test)

# Only the training target goes through the target pipeline; y_test is left as is
y_train_processed = y_preprocessor.fit_transform(y_train)

In [6]:
# Inspect the transformed training feature matrix (the output below has 19 columns).
# NOTE(review): the first two columns are identical in every displayed row —
# possibly a duplicated entry in numeric_features; confirm against the config.
X_train_processed

array([[-0.66862534, -0.66862534, -0.35708416, ...,  0.        ,
         0.        ,  1.        ],
       [-0.72347218, -0.72347218, -0.1641197 , ...,  0.        ,
         0.        ,  1.        ],
       [-1.03884154, -1.03884154, -0.35708416, ...,  1.        ,
         0.        ,  1.        ],
       ...,
       [ 1.84061781,  1.84061781, -0.35708416, ...,  0.        ,
         0.        ,  1.        ],
       [-0.66862534, -0.66862534, -0.14574214, ...,  0.        ,
         0.        ,  1.        ],
       [-0.97028298, -0.97028298, -0.35708416, ...,  1.        ,
         0.        ,  1.        ]], shape=(8512, 19))

In [7]:
# Inspect the transformed training target (the output below is 2-D with one column).
# NOTE(review): the 5.19933758 value looks like QuantileTransformer's upper bound
# for output_distribution='normal'; if the clipped top values all map there, the
# target has a point mass at that value — confirm this is intended.
y_train_processed

array([[-0.08031837],
       [-0.11944609],
       [-0.68001258],
       ...,
       [ 5.19933758],
       [-0.9127741 ],
       [-0.40880512]], shape=(8512, 1))

In [8]:
# Inspect the transformed test feature matrix (produced with transform only,
# using the preprocessor fitted on the training split).
X_test_processed

array([[-0.81945416, -0.81945416, -0.19628045, ...,  0.        ,
         0.        ,  1.        ],
       [-0.47666138, -0.47666138, -0.09060943, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.59285209,  0.59285209,  0.27694191, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.57914038,  0.57914038,  0.27234752, ...,  0.        ,
         0.        ,  1.        ],
       [-0.88801272, -0.88801272, -0.21006362, ...,  0.        ,
         1.        ,  0.        ],
       [-0.47666138, -0.47666138, -0.35708416, ...,  0.        ,
         1.        ,  0.        ]], shape=(2129, 19))

In [9]:
import pickle

# Column names of the *transformed* feature matrices. One-hot encoding expands
# the categorical inputs, so the raw names in `all_features` do not line up with
# the columns of X_train_processed / X_test_processed.
processed_feature_names = X_preprocessor.get_feature_names_out().tolist()

# Save (good for preprocessed data + metadata)
data_dict = {
    'X_train': X_train_processed,
    'X_test': X_test_processed,
    'y_train': y_train_processed,
    # y_test is stored as returned by the split: it was never passed through
    # y_preprocessor, unlike y_train
    'y_test': y_test,
    # Raw input column names (kept unchanged for existing consumers)
    'feature_names': all_features,
    # Names matching the columns of the saved X matrices
    'processed_feature_names': processed_feature_names,
    # Fitted transformers, so consumers can transform new data and invert
    # target predictions
    'X_preprocessor': X_preprocessor,
    'y_preprocessor': y_preprocessor
}

output_path = '../data/processed/ml_data.pkl'
# Create the destination folder on a fresh checkout instead of failing on open()
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, 'wb') as f:
    pickle.dump(data_dict, f)
