## Data Preparation

Source:
- Public, anonymized e-commerce behavior logs
- Monthly CSV files (2019-Oct, 2019-Nov)

Goal:
- Merge raw event logs
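- Enforce a clean data contract: keep only funnel events (view, cart, purchase); require non-null user_id, user_session, event_time, and event_type; allow missing prices but reject negative ones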
- Output a single experiment-ready dataset: clean_events.csv

In [1]:
import pandas as pd
from pathlib import Path

# Folder that holds the raw monthly exports.
raw_path = Path("data/raw")

# Monthly files to merge, in chronological order.
files = [raw_path / name for name in ("2019-Oct.csv", "2019-Nov.csv")]

# Only these columns are read from each file.
wanted_columns = [
    "event_time",
    "event_type",
    "product_id",
    "category_code",
    "brand",
    "price",
    "user_id",
    "user_session",
]

# Read every month fully into memory, then stack the frames with a fresh index.
dfs = []
for f in files:
    print(f"Loading {f} ...")
    df_part = pd.read_csv(f, usecols=wanted_columns)
    dfs.append(df_part)

df = pd.concat(dfs, ignore_index=True)

# Displayed as the cell output: (rows, columns) of the merged frame.
df.shape


Loading data\raw\2019-Oct.csv ...
Loading data\raw\2019-Nov.csv ...


(109950743, 8)

In [3]:
# Convert the timestamp column first, then the numeric price column.
# errors="coerce" is passed to both converters, exactly as before.
for column, convert in (("event_time", pd.to_datetime), ("price", pd.to_numeric)):
    df[column] = convert(df[column], errors="coerce")


In [7]:
import pandas as pd
from pathlib import Path

# Stream the raw monthly logs in chunks, apply the cleaning rules to each
# chunk, and append the surviving rows to a single CSV (data/clean_events.csv).
raw_path = Path("data/raw")
files = [raw_path / "2019-Oct.csv", raw_path / "2019-Nov.csv"]

out_path = Path("data/clean_events.csv")
out_path.parent.mkdir(parents=True, exist_ok=True)

usecols = [
    "event_time", "event_type", "product_id",
    "category_code", "brand", "price",
    "user_id", "user_session"
]

valid_events = {"view", "cart", "purchase"}

# Rows per chunk: larger is faster but uses more memory; 500_000 is the
# suggested starting point.
CHUNKSIZE = 500_000

# Remove the output of a previous run so append mode does not duplicate rows.
if out_path.exists():
    out_path.unlink()

total_written = 0

# Tracked separately from the row counter: a chunk whose rows are all filtered
# out still emits the header line, so "total_written == 0" would write the
# header again on the next chunk and leave a stray header row inside the data.
header_written = False

for f in files:
    print(f"Processing: {f}")
    reader = pd.read_csv(f, usecols=usecols, chunksize=CHUNKSIZE)

    for i, chunk in enumerate(reader, start=1):
        # Basic type conversion; values that cannot be parsed are coerced
        # rather than raising.
        chunk["event_time"] = pd.to_datetime(chunk["event_time"], errors="coerce")
        chunk["price"] = pd.to_numeric(chunk["price"], errors="coerce")

        # Minimal normalisation of event_type: lower-case and trim whitespace.
        chunk["event_type"] = (
            chunk["event_type"]
            .astype("string")
            .str.lower()
            .str.strip()
        )

        # Filter: keep funnel events only.
        chunk = chunk[chunk["event_type"].isin(valid_events)]

        # Required fields must be present.
        chunk = chunk.dropna(subset=["user_id", "user_session", "event_time", "event_type"])

        # price: missing values are allowed, negative values are not.
        chunk = chunk[(chunk["price"].isna()) | (chunk["price"] >= 0)]

        # Append to the output file; the header is written exactly once.
        chunk.to_csv(out_path, mode="a", header=not header_written, index=False)
        header_written = True

        total_written += len(chunk)

        if i % 10 == 0:
            print(f"  chunks processed: {i}, rows written so far: {total_written:,}")

print(f"Done. Saved: {out_path} | rows written: {total_written:,}")


Processing: data\raw\2019-Oct.csv
  chunks processed: 10, rows written so far: 5,000,000
  chunks processed: 20, rows written so far: 9,999,999
  chunks processed: 30, rows written so far: 14,999,999
  chunks processed: 40, rows written so far: 19,999,999
  chunks processed: 50, rows written so far: 24,999,999
  chunks processed: 60, rows written so far: 29,999,999
  chunks processed: 70, rows written so far: 34,999,998
  chunks processed: 80, rows written so far: 39,999,998
Processing: data\raw\2019-Nov.csv
  chunks processed: 10, rows written so far: 47,448,762
  chunks processed: 20, rows written so far: 52,448,762
  chunks processed: 30, rows written so far: 57,448,760
  chunks processed: 40, rows written so far: 62,448,760
  chunks processed: 50, rows written so far: 67,448,758
  chunks processed: 60, rows written so far: 72,448,758
  chunks processed: 70, rows written so far: 77,448,758
  chunks processed: 80, rows written so far: 82,448,758
  chunks processed: 90, rows written s