# Preprocesamiento y particion de datos

In [None]:
import json
import math
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import joblib

# Read the cleaned dataset produced by the data-cleaning stage and report
# its (rows, columns) shape.
data_path = Path("../../00_Data_Clean/Data_Set.csv")
df = pd.read_csv(data_path)
print("Forma del dataset limpio:", df.shape)

# Use seaborn's white-grid theme for every figure drawn after this point.
sns.set_theme(style="whitegrid")


In [None]:
# Build calendar features from the order date, drop identifier columns and
# separate the feature matrix (X) from the regression target (y).

df["Date"] = pd.to_datetime(df["Date"])
df["order_month"] = df["Date"].dt.month
df["order_dayofweek"] = df["Date"].dt.dayofweek  # pandas convention: 0 = Monday

# Identifier columns and the raw timestamp are removed once the calendar
# features have been extracted.
cols_drop = ["Order_ID", "Customer_ID", "Date"]
df = df.drop(columns=cols_drop)

objetivo = "Total_Amount"
X = df.drop(columns=[objetivo])
y = df[objetivo]

cols_cat = [
    "Gender",
    "City",
    "Product_Category",
    "Payment_Method",
    "Device_Type",
    "Is_Returning_Customer",
]
# Every feature not listed as categorical is handled as numeric.
# NOTE(review): assumes all remaining columns really are numeric - confirm
# against the dataset schema.
cols_num = [c for c in X.columns if c not in cols_cat]

# Cast categorical values to str while keeping missing entries as NaN.
# A plain astype(str) would turn NaN into the literal category "nan", which
# hides missing values from any later imputation step and gets encoded as
# if it were a genuine category.
X[cols_cat] = X[cols_cat].astype(str).where(X[cols_cat].notna())


In [None]:
# Quick distribution plots (histograms and boxplots) for the numeric features.
num_df = X[cols_num]

n_cols = 3
n_rows = math.ceil(len(cols_num) / n_cols)


def _save_plot_grid(draw, title_prefix, out_file):
    """Draw one subplot per numeric column and save the grid as a PNG.

    Reads the cell-level ``num_df``, ``cols_num``, ``n_rows`` and ``n_cols``.

    Args:
        draw: Callable ``(series, ax)`` that renders one column on ``ax``.
        title_prefix: Text placed before the column name in each title.
        out_file: Path of the image file to write.
    """
    if not cols_num:
        # plt.subplots rejects a grid with zero rows; nothing to plot.
        print(f"Sin columnas numericas; se omite {out_file}")
        return

    # squeeze=False always returns a 2-D array of axes, so flatten() is
    # valid for any grid shape.
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(14, 4 * n_rows), squeeze=False
    )
    axes = axes.flatten()
    for ax, col in zip(axes, cols_num):
        draw(num_df[col], ax)
        ax.set_title(f"{title_prefix} {col}")
    # Hide the unused cells of the last row.
    for ax in axes[len(cols_num):]:
        ax.axis("off")
    fig.tight_layout()
    fig.savefig(out_file, dpi=150)
    # Close this specific figure so it is not kept in memory or re-rendered.
    plt.close(fig)


_save_plot_grid(
    lambda serie, ax: sns.histplot(serie, kde=True, ax=ax),
    "Histograma de",
    "histogramas_numericas.png",
)
_save_plot_grid(
    lambda serie, ax: sns.boxplot(x=serie, ax=ax),
    "Boxplot de",
    "boxplots_numericas.png",
)


In [None]:
# Train/test split and preprocessing pipeline.
# The preprocessor is fitted on the training split only and then applied
# unchanged to the test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

# Numeric features: median imputation followed by standardisation.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical features: most-frequent imputation followed by dense one-hot
# encoding; categories unseen during fit are encoded as all zeros.
cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

# Columns listed in neither group are dropped from the output.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, cols_num),
        ("cat", cat_pipeline, cols_cat),
    ],
    remainder="drop",
)

X_train_proc = preprocessor.fit_transform(X_train)
X_test_proc = preprocessor.transform(X_test)
feature_names = preprocessor.get_feature_names_out()

train_df = pd.DataFrame(X_train_proc, columns=feature_names)
test_df = pd.DataFrame(X_test_proc, columns=feature_names)
# The processed frames carry a fresh 0..n-1 index, so the targets are
# re-indexed to line up row by row instead of by their original labels.
train_df[objetivo] = y_train.reset_index(drop=True)
test_df[objetivo] = y_test.reset_index(drop=True)

# Every artefact is written under output_dir so that changing this single
# path relocates all outputs together.
output_dir = Path(".")
output_dir.mkdir(parents=True, exist_ok=True)
train_df.to_csv(output_dir / "T_train_final_objetivo.csv", index=False)
test_df.to_csv(output_dir / "T_test_final_objetivo.csv", index=False)
train_df.drop(columns=[objetivo]).to_csv(output_dir / "T_train_final.csv", index=False)
test_df.drop(columns=[objetivo]).to_csv(output_dir / "T_test_final.csv", index=False)

# Persist the fitted preprocessor and the resulting column layout.
joblib.dump(preprocessor, output_dir / "preprocessor_cat.joblib")
with open(output_dir / "expected_columns.json", "w", encoding="utf-8") as f:
    json.dump(
        {"feature_names": feature_names.tolist(), "target": objetivo},
        f,
        ensure_ascii=False,
        indent=2,
    )

print("Guardados datasets procesados y preprocessor_cat.joblib")
