In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [25]:
# Load the three raw CSV tables. The identifier and small-integer columns are
# pinned to compact integer dtypes; any column not listed keeps the dtype that
# pandas infers on its own.

# One row per order.
orders = pd.read_csv(
    "data/orders.csv",
    dtype=dict(
        order_id="int32",
        user_id="int32",
        order_number="int16",
        order_dow="int8",
        order_hour_of_day="int8",
    ),
)

# One row per (order, product) line.
order_products = pd.read_csv(
    "data/order_products__prior.csv",
    dtype=dict(
        order_id="int32",
        product_id="int32",
        add_to_cart_order="int16",
        reordered="int8",
    ),
)

# Product catalogue.
products = pd.read_csv(
    "data/products.csv",
    dtype=dict(
        product_id="int32",
        aisle_id="int16",
        department_id="int16",
    ),
)


In [26]:
# Build the working table: start from the order lines, attach the order-level
# columns via order_id, then the product catalogue columns via product_id.
# Both joins are left joins, so every order line is kept.
df = pd.merge(
    pd.merge(order_products, orders, how="left", on="order_id"),
    products,
    how="left",
    on="product_id",
)


In [27]:
# Share of missing values in each column, largest first.
df.isnull().mean().sort_values(ascending=False)

days_since_prior_order    0.06407
product_id                0.00000
add_to_cart_order         0.00000
reordered                 0.00000
order_id                  0.00000
user_id                   0.00000
eval_set                  0.00000
order_dow                 0.00000
order_number              0.00000
order_hour_of_day         0.00000
product_name              0.00000
aisle_id                  0.00000
department_id             0.00000
dtype: float64

In [28]:
def winsorize(series, low=0.01, high=0.99):
    """Clip a Series to its own quantile range.

    Parameters
    ----------
    series : pandas.Series
        Values to clip.
    low, high : float
        Quantile levels (fractions in [0, 1]) whose values become the lower
        and upper clipping bounds.

    Returns
    -------
    pandas.Series
        A new Series in which values below the `low` quantile are raised to
        it and values above the `high` quantile are lowered to it.
    """
    lower_bound = series.quantile(low)
    upper_bound = series.quantile(high)
    return series.clip(lower=lower_bound, upper=upper_bound)

# Clip each column to its own default (1st-99th percentile) range and write
# the result back onto df under the same column name.
for _col in ("days_since_prior_order", "add_to_cart_order"):
    df[_col] = winsorize(df[_col])


In [29]:
def _build_user_features(items):
    """Aggregate the order-line table into one feature row per user.

    Parameters
    ----------
    items : pandas.DataFrame
        One row per (order, product) line with at least the columns
        user_id, order_id, order_number, reordered and
        days_since_prior_order.

    Returns
    -------
    pandas.DataFrame
        Columns user_id, total_orders, avg_basket_size, reorder_ratio,
        mean_days_between_orders, last_order_recency (one row per user,
        ordered by user_id).
    """
    # Item-level statistics: every row is one product inside one order.
    item_stats = items.groupby("user_id").agg(
        total_orders=("order_number", "max"),
        total_items=("order_id", "count"),
        reorder_ratio=("reordered", "mean"),
    )

    # Order-level statistics. days_since_prior_order is joined in from the
    # orders table, so it repeats on every line of the same order. Collapse
    # to one row per order first; otherwise the mean is weighted by basket
    # size. Sort by order_number so that "last" really is the most recent
    # order (it returns the last non-missing value in row order).
    order_level = (
        items[["user_id", "order_id", "order_number", "days_since_prior_order"]]
        .drop_duplicates(subset="order_id")
        .sort_values(["user_id", "order_number"])
    )
    order_stats = order_level.groupby("user_id").agg(
        observed_orders=("order_id", "count"),
        mean_days_between_orders=("days_since_prior_order", "mean"),
        last_order_recency=("days_since_prior_order", "last"),
    )

    features = item_stats.join(order_stats)
    # Average basket size = item lines per distinct order. A plain row count
    # per user would be the total number of items bought, not an average.
    features["avg_basket_size"] = (
        features["total_items"] / features["observed_orders"]
    )

    # Keep the original column set and order; drop the helper columns.
    return features[
        [
            "total_orders",
            "avg_basket_size",
            "reorder_ratio",
            "mean_days_between_orders",
            "last_order_recency",
        ]
    ].reset_index()


user_features = _build_user_features(df)

In [30]:
# Per-product statistics over all order lines:
#   product_reorder_rate - mean of the reordered flag
#   avg_cart_position    - mean add_to_cart_order
#   popularity           - number of order lines containing the product
product_features = df.groupby("product_id").agg(
    product_reorder_rate=pd.NamedAgg(column="reordered", aggfunc="mean"),
    avg_cart_position=pd.NamedAgg(column="add_to_cart_order", aggfunc="mean"),
    popularity=pd.NamedAgg(column="order_id", aggfunc="count"),
).reset_index()

In [31]:
# Per (user, product) pair: how many order lines the pair has, and the mean
# of the reordered flag over those lines.
# NOTE(review): both aggregates are computed over every line of the pair,
# including the line they are later merged back onto, so they include that
# line's own `reordered` value. Confirm this is acceptable when `reordered`
# is used as the prediction target.
user_product_features = df.groupby(["user_id", "product_id"]).agg(
    prior_purchases=pd.NamedAgg(column="order_id", aggfunc="count"),
    avg_user_product_reorder=pd.NamedAgg(column="reordered", aggfunc="mean"),
).reset_index()

In [32]:
# Broadcast the aggregated feature tables back onto every order line.
# Each step is a left join, so the number of order lines is unchanged as long
# as the join keys are unique in the feature tables (they are groupby keys).
# Note: this cell rebinds df, so running it twice would merge the feature
# columns a second time.
df = df.merge(user_features, how="left", on="user_id")
df = df.merge(product_features, how="left", on="product_id")
df = df.merge(user_product_features, how="left", on=["user_id", "product_id"])


In [33]:
# Calendar features derived from the order timestamp columns.
# NOTE(review): order_dow codes 0 and 6 are treated as the weekend; the mapping
# of codes to weekdays is not visible in this notebook - confirm.
_weekend_mask = df["order_dow"].isin([0, 6])
df["is_weekend"] = _weekend_mask.astype(int)

# Short aliases for the hour-of-day and day-of-week columns.
df["hour"], df["day"] = df["order_hour_of_day"], df["order_dow"]


In [34]:
# Put each user's order lines in chronological order so "previous rows"
# means "earlier purchases".
df = df.sort_values(["user_id", "order_number"])

# Rolling mean of the reordered flag over the three order lines *before* the
# current one, per user. The shift(1) keeps the current line's own `reordered`
# value out of the window: that value is the prediction target, so a window
# that includes it leaks the label into the feature.
# The first three lines of every user have no full window and stay NaN.
df["recent_reorder_rate"] = (
    df.groupby("user_id")["reordered"]
    .shift(1)
    .groupby(df["user_id"])
    .rolling(3)
    .mean()
    # rolling() on a groupby prepends the group key to the index; drop it so
    # the result aligns with df's own index on assignment.
    .reset_index(level=0, drop=True)
)


In [35]:
# log(1 + popularity): compresses the long right tail of the count feature.
df["log_popularity"] = df["popularity"].pipe(np.log1p)


In [36]:
# Frequency encoding: each product's share of all order lines, mapped back
# onto every line of that product.
_product_ids = df["product_id"]
product_freq = _product_ids.value_counts(normalize=True)
df["product_freq"] = _product_ids.map(product_freq)

In [37]:
from sklearn.model_selection import KFold

def kfold_target_encoding(df, col, target, n_splits=5, random_state=42):
    """Out-of-fold mean target encoding of a categorical column.

    The rows are split into `n_splits` shuffled folds. Each row is encoded
    with the mean of `target` for its category, computed only on the other
    folds, so a row's own target never contributes to its encoding.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing `col` and `target`.
    col : str
        Name of the categorical column to encode.
    target : str
        Name of the numeric target column.
    n_splits : int, default 5
        Number of folds.
    random_state : int, default 42
        Seed for the fold shuffle.

    Returns
    -------
    pandas.Series
        Float Series indexed like `df` holding the encoded values.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    encoded = pd.Series(index=df.index, dtype=float)

    for train_idx, val_idx in kf.split(df):
        train = df.iloc[train_idx]
        fold_means = train.groupby(col)[target].mean()
        # Fallback for categories absent from the training folds. It uses the
        # training-fold mean rather than the mean over all rows, so the
        # validation rows' targets stay out of their own encoding.
        fold_prior = train[target].mean()
        fold_values = df[col].iloc[val_idx].map(fold_means).fillna(fold_prior)
        # Assign by position: to_numpy() avoids any index alignment, which
        # would misbehave if df's index contained duplicate labels.
        encoded.iloc[val_idx] = fold_values.to_numpy()

    return encoded


In [38]:
# Out-of-fold target encoding of product_id against the reordered flag.
df["product_te"] = kfold_target_encoding(
    df,
    col="product_id",
    target="reordered",
)

In [39]:
# --- Model inputs -----------------------------------------------------------
target = "reordered"

# Columns that are scaled as continuous values.
numeric_features = [
    "mean_days_between_orders",
    "avg_cart_position",
    "prior_purchases",
    "recent_reorder_rate",
    "log_popularity",
    "product_freq",
]

# Columns that are one-hot encoded.
categorical_features = ["day", "is_weekend"]

# Note: the `hour` and `product_te` columns created in earlier cells are not
# part of either list, so they do not reach the model matrix.
X = df[[*numeric_features, *categorical_features]]
y = df[target]

# --- Per-type preprocessing pipelines ----------------------------------------
# Numeric: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical: fill gaps with the most frequent level, then one-hot encode;
# levels unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)


In [40]:
# Route each column group through its own pipeline. Columns not named in
# either list are not passed to any transformer here.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)


In [41]:
# Hold out 20% of the order lines for validation, keeping the class balance
# of `y` the same in both parts; the fixed seed makes the split repeatable.
# NOTE(review): the split is by row, so lines of the same user can land on
# both sides - confirm that is intended given the per-user features.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)


In [44]:
# Rows without enough purchase history have no rolling rate; encode that as 0.0.
# X_train / X_val were produced from X by the train_test_split cell that runs
# before this one, so filling X alone would leave their NaNs untouched and the
# saved splits would disagree with the saved X. Apply the same fill to all
# three. assign() returns a new frame, which also avoids writing into a frame
# that was sliced out of another one.
X = X.assign(recent_reorder_rate=X["recent_reorder_rate"].fillna(0.0))
X_train = X_train.assign(
    recent_reorder_rate=X_train["recent_reorder_rate"].fillna(0.0)
)
X_val = X_val.assign(
    recent_reorder_rate=X_val["recent_reorder_rate"].fillna(0.0)
)

In [45]:
import os
import joblib

# Make sure the output folder exists (no error if it already does).
os.makedirs("data/processed", exist_ok=True)

# Write the full matrix/target first, then the train/validation parts.
# Targets are Series, so they are wrapped into a one-column frame named
# "reordered" before being written.
_outputs = (
    (X, "data/processed/X.parquet"),
    (y.to_frame(name="reordered"), "data/processed/y.parquet"),
    (X_train, "data/processed/X_train.parquet"),
    (X_val, "data/processed/X_val.parquet"),
    (y_train.to_frame(name="reordered"), "data/processed/y_train.parquet"),
    (y_val.to_frame(name="reordered"), "data/processed/y_val.parquet"),
)
for _frame, _path in _outputs:
    _frame.to_parquet(_path)

# NOTE(review): no cell in this notebook calls preprocessor.fit, so what is
# persisted here is the unfitted ColumnTransformer definition - confirm the
# consumer fits it before calling transform.
joblib.dump(preprocessor, "data/processed/preprocessor.pkl")


['data/processed/preprocessor.pkl']