In [1]:
from dotenv import load_dotenv
from sqlalchemy import create_engine, text
import os
import pandas as pd

load_dotenv()
engine = create_engine(os.getenv("DATABASE_URL"))

def fetch_df(sql: str) -> pd.DataFrame:
    with engine.connect() as conn:
        return pd.read_sql(text(sql), conn)

In [2]:
print("orders:", fetch_df("SELECT COUNT(*) AS c FROM hpce.olist_orders;")["c"][0])
print("events:", fetch_df("SELECT COUNT(*) AS c FROM hpce.rees46_events;")["c"][0])


orders: 99441
events: 1048575


In [3]:
sql_orders = """
SELECT
    o.customer_id,
    DATE(o.order_purchase_ts) AS feature_date,
    COUNT(DISTINCT o.order_id)          AS daily_orders,
    COALESCE(SUM(p.payment_value), 0.0) AS daily_gmv
FROM hpce.olist_orders o
LEFT JOIN hpce.olist_order_payments p
  ON o.order_id = p.order_id
WHERE o.order_purchase_ts IS NOT NULL
  AND o.order_status NOT IN ('canceled', 'unavailable')
GROUP BY o.customer_id, DATE(o.order_purchase_ts)
;
"""
daily_orders = fetch_df(sql_orders)
print("daily_orders shape:", daily_orders.shape)
daily_orders.head()

daily_orders shape: (98207, 4)


Unnamed: 0,customer_id,feature_date,daily_orders,daily_gmv
0,00012a2ce6f8dcda20d059ce98491703,2017-11-14,1,114.74
1,000161a058600d5901f007fab4c27140,2017-07-16,1,67.41
2,0001fd6190edaaf884bcaf3d49edf079,2017-02-28,1,195.42
3,0002414f95344307404f0ace7a26f1d5,2017-08-16,1,179.35
4,000379cdec625522490c315e70c7a9fb,2018-04-02,1,107.01


In [4]:
sql_events = """
SELECT
    user_id AS customer_id,
    DATE(event_time) AS feature_date,
    SUM(CASE WHEN event_type = 'view'     THEN 1 ELSE 0 END) AS daily_views,
    SUM(CASE WHEN event_type = 'cart'     THEN 1 ELSE 0 END) AS daily_carts,
    SUM(CASE WHEN event_type = 'purchase' THEN 1 ELSE 0 END) AS daily_purchases,
    COUNT(*) AS daily_events_total
FROM hpce.rees46_events
WHERE event_time IS NOT NULL
  AND user_id IS NOT NULL
GROUP BY user_id, DATE(event_time)
;
"""
daily_events = fetch_df(sql_events)
print("daily_events shape:", daily_events.shape)
daily_events.head()


daily_events shape: (181485, 6)


Unnamed: 0,customer_id,feature_date,daily_views,daily_carts,daily_purchases,daily_events_total
0,00012a2ce6f8dcda20d059ce98491703,2017-01-07,6,1,0,7
1,000161a058600d5901f007fab4c27140,2017-03-23,5,0,0,5
2,000161a058600d5901f007fab4c27140,2017-03-24,4,0,0,4
3,000161a058600d5901f007fab4c27140,2017-07-11,3,0,0,3
4,000161a058600d5901f007fab4c27140,2017-07-16,7,2,1,10


In [5]:
import numpy as np

df = daily_orders.merge(
    daily_events,
    on=["customer_id", "feature_date"],
    how="outer",
)

print("merged shape:", df.shape)
df.head()

merged shape: (258159, 8)


Unnamed: 0,customer_id,feature_date,daily_orders,daily_gmv,daily_views,daily_carts,daily_purchases,daily_events_total
0,00012a2ce6f8dcda20d059ce98491703,2017-01-07,,,6.0,1.0,0.0,7.0
1,00012a2ce6f8dcda20d059ce98491703,2017-11-14,1.0,114.74,,,,
2,000161a058600d5901f007fab4c27140,2017-03-23,,,5.0,0.0,0.0,5.0
3,000161a058600d5901f007fab4c27140,2017-03-24,,,4.0,0.0,0.0,4.0
4,000161a058600d5901f007fab4c27140,2017-07-11,,,3.0,0.0,0.0,3.0


In [6]:
df = df.dropna(subset=["customer_id", "feature_date"])
df["feature_date"] = pd.to_datetime(df["feature_date"])
df = df.sort_values(["customer_id", "feature_date"])

daily_numeric_cols = [
    "daily_orders", "daily_gmv",
    "daily_views", "daily_carts",
    "daily_purchases", "daily_events_total"
]
daily_numeric_cols = [c for c in daily_numeric_cols if c in df.columns]

def expand_customer(g: pd.DataFrame) -> pd.DataFrame:
    if g.empty:
        return g
    min_date = g["feature_date"].min()
    max_date = g["feature_date"].max()
    full_dates = pd.date_range(start=min_date, end=max_date, freq="D")

    g = g.set_index("feature_date").reindex(full_dates)
    g.index.name = "feature_date"
    cid = g["customer_id"].dropna().iloc[0]
    g["customer_id"] = cid
    return g.reset_index()

panel = (
    df.groupby("customer_id", group_keys=False)
      .apply(expand_customer)
      .reset_index(drop=True)
)

for col in daily_numeric_cols:
    panel[col] = panel[col].fillna(0.0)

print("panel shape:", panel.shape)
panel.head()


  .apply(expand_customer)


panel shape: (21200435, 8)


Unnamed: 0,feature_date,customer_id,daily_orders,daily_gmv,daily_views,daily_carts,daily_purchases,daily_events_total
0,2017-01-07,00012a2ce6f8dcda20d059ce98491703,0.0,0.0,6.0,1.0,0.0,7.0
1,2017-01-08,00012a2ce6f8dcda20d059ce98491703,0.0,0.0,0.0,0.0,0.0,0.0
2,2017-01-09,00012a2ce6f8dcda20d059ce98491703,0.0,0.0,0.0,0.0,0.0,0.0
3,2017-01-10,00012a2ce6f8dcda20d059ce98491703,0.0,0.0,0.0,0.0,0.0,0.0
4,2017-01-11,00012a2ce6f8dcda20d059ce98491703,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
panel = panel.sort_values(["customer_id", "feature_date"]).copy()

for col in daily_numeric_cols:
    panel[col] = panel[col].fillna(0.0).astype(float)

g = panel.groupby("customer_id", group_keys=False)

panel["f_7"]  = g["daily_orders"].rolling(7,  min_periods=1).sum().reset_index(level=0, drop=True)
panel["f_30"] = g["daily_orders"].rolling(30, min_periods=1).sum().reset_index(level=0, drop=True)
panel["m_7"]  = g["daily_gmv"].rolling(7,  min_periods=1).sum().reset_index(level=0, drop=True)
panel["m_30"] = g["daily_gmv"].rolling(30, min_periods=1).sum().reset_index(level=0, drop=True)
panel["views_7"]  = g["daily_views"].rolling(7,  min_periods=1).sum().reset_index(level=0, drop=True)
panel["events_7"] = g["daily_events_total"].rolling(7,  min_periods=1).sum().reset_index(level=0, drop=True)

def compute_last_order_date(group):
    mask = group["daily_orders"] > 0
    last_order_date = group["feature_date"].where(mask).ffill()
    return last_order_date

panel["last_order_date"] = g.apply(compute_last_order_date).reset_index(level=0, drop=True)
panel["recency_days"] = (panel["feature_date"] - panel["last_order_date"]).dt.days
panel["recency_days"] = panel["recency_days"].fillna(999).astype(int)
panel = panel.drop(columns=["last_order_date"])

print("features shape:", panel.shape)
panel.head()


  panel["last_order_date"] = g.apply(compute_last_order_date).reset_index(level=0, drop=True)


features shape: (21200435, 15)


Unnamed: 0,feature_date,customer_id,daily_orders,daily_gmv,daily_views,daily_carts,daily_purchases,daily_events_total,f_7,f_30,m_7,m_30,views_7,events_7,recency_days
0,2017-01-07,00012a2ce6f8dcda20d059ce98491703,0.0,0.0,6.0,1.0,0.0,7.0,0.0,0.0,0.0,0.0,6.0,7.0,999
1,2017-01-08,00012a2ce6f8dcda20d059ce98491703,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,7.0,999
2,2017-01-09,00012a2ce6f8dcda20d059ce98491703,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,7.0,999
3,2017-01-10,00012a2ce6f8dcda20d059ce98491703,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,7.0,999
4,2017-01-11,00012a2ce6f8dcda20d059ce98491703,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,7.0,999


In [8]:
# overwrite the table with this DataFrame
panel.to_sql(
    name="customer_features_daily",
    con=engine,
    schema="hpce",
    if_exists="replace",   # replace table fully
    index=False,
    chunksize=10_000,
    method="multi",
)

21200435

In [9]:
fetch_df("SELECT COUNT(*) FROM hpce.customer_features_daily;")

Unnamed: 0,count
0,21200435


In [None]:
from dotenv import load_dotenv
from sqlalchemy import create_engine, text
import os, numpy as np, pandas as pd

load_dotenv()
engine = create_engine(os.getenv("DATABASE_URL"))

def fetch_df(sql: str) -> pd.DataFrame:
    with engine.connect() as conn:
        return pd.read_sql(text(sql), conn)

# 1) Load the current table you wrote (the one with daily_* only)
panel = fetch_df("SELECT * FROM hpce.customer_features_daily;")

panel["feature_date"] = pd.to_datetime(panel["feature_date"])
panel = panel.sort_values(["customer_id", "feature_date"])
