In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# 0) Paths / load
#
# The CSV is resolved relative to the current working directory, so the
# notebook must be started from the folder that contains `clean_vote_ver2/`.

ROOT = Path.cwd()
DATA_DIR = ROOT.joinpath("clean_vote_ver2")
csv_path = DATA_DIR.joinpath("processed_hackle_merge.csv")

# NOTE(review): the captured output shows a warning pointing at this call;
# it is presumably a mixed-dtype warning -- confirm, and consider passing an
# explicit `dtype=` mapping for the affected columns.
df = pd.read_csv(csv_path)
print(f"shape: {df.shape}")
print(df.columns)

  df = pd.read_csv(csv_path)


shape: (20203847, 16)
Index(['session_id', 'user_id', 'language', 'osname', 'osversion',
       'versionname', 'device_id', 'event_id', 'event_datetime', 'event_key',
       'item_name', 'page_name', 'friend_count', 'votes_count',
       'heart_balance', 'question_id'],
      dtype='object')


In [2]:
# 1) Preprocessing
#
# Builds `hackle`: the three columns the funnel needs, with an integer
# `user_id`, a second-resolution `event_datetime`, and rows ordered by
# (user_id, event_datetime).

need_cols = ["user_id", "event_key", "event_datetime"]
missing = [col for col in need_cols if col not in df.columns]
if missing:
    raise ValueError(f"필수 컬럼이 없어: {missing}")

hackle = (
    df.loc[:, need_cols]
    # user_id: drop nulls first, then coerce to numeric and drop whatever
    # failed to parse, and only then cast to int64.
    .loc[lambda d: d["user_id"].notna()]
    .assign(user_id=lambda d: pd.to_numeric(d["user_id"], errors="coerce"))
    .loc[lambda d: d["user_id"].notna()]
    .astype({"user_id": "int64"})
    # event_datetime: unparseable values become NaT and are dropped; the
    # remaining timestamps are truncated to whole seconds.
    .assign(event_datetime=lambda d: pd.to_datetime(d["event_datetime"], errors="coerce"))
    .loc[lambda d: d["event_datetime"].notna()]
    .assign(event_datetime=lambda d: d["event_datetime"].dt.floor("s"))
    # Order each user's log chronologically.
    .sort_values(["user_id", "event_datetime"])
    .reset_index(drop=True)
)

print("after clean:", hackle.shape)
print("user_id unique:", hackle["user_id"].nunique())

after clean: (20158855, 3)
user_id unique: 226365


In [3]:
# 2) Funnel definition (hackle only)
#
# Each funnel stage is identified by one or more `event_key` values.

# Engagement: the user completed a question.
eng_key = "complete_question"

# Purchase intent: either of these events counts.
intent_events = [
    "view_shop",
    "click_purchase",
]

# Revenue: the purchase was completed.
rev_key = "complete_purchase"

In [4]:
# 3) Engaged users: everyone with at least one `eng_key` event
is_eng_event = hackle["event_key"] == eng_key
eng_users = hackle.loc[is_eng_event, "user_id"].unique()
eng_cnt = len(eng_users)

# Full event log restricted to engaged users (all of their events, not
# only the engagement events themselves).
is_eng_user = hackle["user_id"].isin(eng_users)
eng_df = hackle.loc[is_eng_user].copy()

In [5]:
# 4) Retention: engaged users with at least one log after their first engagement

# First `eng_key` timestamp per user (Series indexed by user_id).
eng_time = (
    hackle.loc[hackle["event_key"] == eng_key]
    .groupby("user_id")["event_datetime"]
    .min()
)

# Make the cell safe to re-run: on a second execution `eng_df` already carries
# the derived columns, and merging again would collide on `event_datetime_eng`.
# Dropping them first restores the frame to its pre-merge columns; on a first
# run the columns do not exist and `errors="ignore"` makes this a no-op.
eng_df = eng_df.drop(columns=["event_datetime_eng", "after_eng"], errors="ignore")

# Attach each user's first engagement time as `event_datetime_eng`.
eng_df = eng_df.merge(eng_time, on="user_id", suffixes=("", "_eng"))

# Strictly later than the first engagement: the engagement event itself and
# anything sharing its exact timestamp do not count.
eng_df["after_eng"] = eng_df["event_datetime"] > eng_df["event_datetime_eng"]

# A user is retained when at least one of their rows is flagged.
has_later_log = eng_df.groupby("user_id")["after_eng"].any()
ret_users = has_later_log[has_later_log].index
ret_cnt = len(ret_users)

In [6]:
# 5) Purchase intent (among engaged users)
is_intent_event = hackle["event_key"].isin(intent_events)
by_engaged_user = hackle["user_id"].isin(eng_users)
intent_users = hackle.loc[is_intent_event & by_engaged_user, "user_id"].unique()
intent_cnt = len(intent_users)

In [7]:
# 6) Completed purchase (among engaged users)
is_rev_event = hackle["event_key"] == rev_key
from_engaged_user = hackle["user_id"].isin(eng_users)
rev_users = hackle.loc[is_rev_event & from_engaged_user, "user_id"].unique()
rev_cnt = len(rev_users)

In [8]:
# 7) Print the funnel summary
#
# Every rate uses the engaged-user count as its denominator, so the rate
# lines are only formatted when that count is non-zero.

print("---Funnel---")
print(f"참여 유저: {eng_cnt:,}")

if eng_cnt:
    print(f"리텐션 유저: {ret_cnt:,} | 전환율: {ret_cnt/eng_cnt:.2%}")
    print(f"구매의도 유저: {intent_cnt:,} | 전환율(참여 기준): {intent_cnt/eng_cnt:.2%}")
    print(f"구매완료 유저: {rev_cnt:,} | 전환율(참여 기준): {rev_cnt/eng_cnt:.2%}")
else:
    # One fallback line per funnel step (retention, intent, revenue).
    for _step in range(3):
        print("참여 유저 0")


---Funnel---
참여 유저: 48,981
리텐션 유저: 46,030 | 전환율: 93.98%
구매의도 유저: 9,119 | 전환율(참여 기준): 18.62%
구매완료 유저: 940 | 전환율(참여 기준): 1.92%
