In [60]:
import pandas as pd
import numpy as np
from pathlib import Path
import ast

# 0) 설정

BASE_PATH = "/Users/10moo/260128_proj"
START_DATE = "2023-05-13"
END_DATE   = "2024-05-07"

clean = Path(BASE_PATH) / "clean_vote_ver2"
dump  = Path(BASE_PATH) / "dump_vote_ver2"

start = pd.to_datetime(START_DATE)
end_excl = pd.to_datetime(END_DATE) + pd.Timedelta(days=1)

In [61]:
# 1) 로드

def read_csv(path: Path):
    if not path.exists():
        raise FileNotFoundError(f"파일 없음: {path}")
    try:
        return pd.read_csv(path)
    except UnicodeDecodeError:
        return pd.read_csv(path, encoding="utf-8-sig")
    except Exception:
        return pd.read_csv(path, encoding="cp949", errors="ignore")

accounts_user = read_csv(clean / "processed_accounts_user.csv")
accounts_school = read_csv(clean / "processed_accounts_school.csv")
accounts_group = read_csv(dump / "accounts_group.csv")
accounts_userquestionrecord = read_csv(clean / "processed_userquestionrecord.csv")
accounts_paymenthistory = read_csv(clean / "processed_accounts_paymenthistory.csv")
accounts_attendance = read_csv(dump / "accounts_attendance.csv")

In [62]:
# 2) pk & fk

USER_GROUP_FK = "group_id"
GROUP_PK = "id"
GROUP_SCHOOL_FK = "school_id"
SCHOOL_PK = "id" if "id" in accounts_school.columns and "school_id" not in accounts_school.columns else "school_id"

print("user_group FK:", USER_GROUP_FK)
print("group PK:", GROUP_PK)
print("group_school FK:", GROUP_SCHOOL_FK)
print("school PK:", SCHOOL_PK)

user_group FK: group_id
group PK: id
group_school FK: school_id
school PK: id


In [63]:
# 3) 기간 필터

def to_dt(df, col, name):
    df = df.copy()
    df[col] = pd.to_datetime(df[col], errors="coerce")
    nat = df[col].isna().sum()
    print(f"{name} > {col} NaT: {nat:,}, user_cnt: {len(df):,}")
    return df

accounts_user = to_dt(accounts_user, "created_at", "accounts_user")
accounts_userquestionrecord = to_dt(accounts_userquestionrecord, "created_at", "accounts_userquestionrecord")
accounts_paymenthistory = to_dt(accounts_paymenthistory, "created_at", "accounts_paymenthistory")

funnel_user = accounts_user[(accounts_user["created_at"] >= start) & (accounts_user["created_at"] < end_excl)].copy()
funnel_question = accounts_userquestionrecord[(accounts_userquestionrecord["created_at"] >= start) & (accounts_userquestionrecord["created_at"] < end_excl)].copy()
funnel_payment = accounts_paymenthistory[(accounts_paymenthistory["created_at"] >= start) & (accounts_paymenthistory["created_at"] < end_excl)].copy()

print("funnel_user:", len(funnel_user))
print("funnel_question:", len(funnel_question))
print("funnel_payment:", len(funnel_payment))

accounts_user > created_at NaT: 0, user_cnt: 676,978
accounts_userquestionrecord > created_at NaT: 0, user_cnt: 1,217,558
accounts_paymenthistory > created_at NaT: 0, user_cnt: 95,140
funnel_user: 363138
funnel_question: 809935
funnel_payment: 95137


In [64]:
# 4) 노출(팀원): 활성학교 student_count 합

active_school = accounts_school[accounts_school["is_active_school"] == True].copy()
exposure = int(active_school["student_count"].fillna(0).sum())

print("exposure:", exposure)

exposure: 655303


In [None]:
# 5) 유입: 기간 내 가입 + 활성학교 재학생만
# join: user.group_id -> group.id -> group.school_id -> school.(id or school_id)

for df, col in [
    (funnel_user, "user_id"),
    (funnel_user, "group_id"),
    (accounts_group, "id"),
    (accounts_group, "school_id"),
    (accounts_school, SCHOOL_PK),
]:
    df[col] = df[col].astype("string")

abcde_user = pd.merge(
    funnel_user,
    accounts_group[["id", "school_id"]],
    left_on="group_id",
    right_on="id",
    how="left"
)
abcde_user = pd.merge(
    abcde_user,
    accounts_school[[SCHOOL_PK, "is_active_school"]],
    left_on="school_id",
    right_on=SCHOOL_PK,
    how="left"
)

# 활성학교
abcde_user = abcde_user[abcde_user["is_active_school"] == True].copy()

# 반 내 가입자 수 >= 4
group_size = abcde_user.groupby("group_id")["user_id"].nunique().reset_index(name="n_signup_in_group")
abcde_user = abcde_user.merge(group_size, on="group_id", how="left")
abcde_user = abcde_user[abcde_user["n_signup_in_group"] >= 4].copy()

inflow_users = abcde_user["user_id"].dropna().drop_duplicates()

print("inflow users:", inflow_users.nunique())


inflow users: 301886


In [66]:
# 6) 참여: questionrecord 능동+수동 union

funnel_question["user_id"] = funnel_question["user_id"].astype("string")
funnel_question["chosen_user_id"] = funnel_question["chosen_user_id"].astype("string")

abcde_question = funnel_question[funnel_question["user_id"].isin(set(inflow_users.tolist()))].copy()

active_u = abcde_question["user_id"].dropna().drop_duplicates()
passive_u = abcde_question["chosen_user_id"].dropna().drop_duplicates()
participation_u = pd.concat([active_u, passive_u], ignore_index=True).drop_duplicates()

print("active:", active_u.nunique())
print("passive:", passive_u.nunique())
print("total:", participation_u.nunique())

active: 1705
passive: 6076
total: 6095


In [67]:
# 7) 리텐션: 최초 참여일 다음날부터 출석 있으면 True

def normalize_att(x):
    if pd.isna(x):
        return []
    if isinstance(x, list):
        return x
    s = str(x).strip()
    if s.startswith("[") and s.endswith("]"):
        try:
            v = ast.literal_eval(s)
            return v if isinstance(v, list) else []
        except Exception:
            return []
    return [s]

att = accounts_attendance.copy()
att["user_id"] = att["user_id"].astype("string")
att["attendance_date_list"] = att["attendance_date_list"].apply(normalize_att)
att = att.explode("attendance_date_list").dropna(subset=["attendance_date_list"])
att["attendance_date_list"] = pd.to_datetime(att["attendance_date_list"], errors="coerce")
att = att.dropna(subset=["attendance_date_list"])

# 출석도 같은 기간으로 필터
att = att[(att["attendance_date_list"] >= start) & (att["attendance_date_list"] < end_excl)].copy()

# 참여 "최초 시점" 계산(능동/수동 통합)
q = abcde_question[["user_id", "chosen_user_id", "created_at"]].copy()
q["created_at"] = pd.to_datetime(q["created_at"], errors="coerce")

first_choose = q[["user_id", "created_at"]].dropna()
first_chosen = q[["chosen_user_id", "created_at"]].dropna().rename(columns={"chosen_user_id": "user_id"})
first_question = pd.concat([first_choose, first_chosen], ignore_index=True).dropna()
first_question = first_question.sort_values("created_at").drop_duplicates(subset="user_id", keep="first")

att_part = att[att["user_id"].isin(set(participation_u.tolist()))].copy()
att_merge = pd.merge(att_part, first_question, on="user_id", how="left")

att_merge["first_day_plus1"] = att_merge["created_at"].dt.normalize() + pd.Timedelta(days=1)
att_merge["is_retention"] = att_merge["attendance_date_list"] >= att_merge["first_day_plus1"]

retention_users = att_merge.loc[att_merge["is_retention"] == True, "user_id"].dropna().drop_duplicates()

print("retention users:", retention_users.nunique())

retention users: 3177


In [68]:
# 8) 수익

funnel_payment["user_id"] = funnel_payment["user_id"].astype("string")

rev_ret = funnel_payment[funnel_payment["user_id"].isin(set(retention_users.tolist()))]["user_id"].dropna().drop_duplicates()
rev_be  = funnel_payment[funnel_payment["user_id"].isin(set(inflow_users.tolist()))]["user_id"].dropna().drop_duplicates()

print("revenue (retention base):", rev_ret.nunique())
print("revenue (inflow base b-e):", rev_be.nunique())

revenue (retention base): 472
revenue (inflow base b-e): 34369


In [69]:
accounts_school["is_active_school"].value_counts(dropna=False)

is_active_school
True     3897
False    2054
Name: count, dtype: int64

In [70]:
abcde_user["is_active_school"].isna().sum()

np.int64(0)

In [71]:
# 유입 후보(활성학교+기간가입) 상태에서 확인
tmp = abcde_user.copy()

print("group_id NaN:", tmp["group_id"].isna().sum())
print("group_id 공백:", (tmp["group_id"].astype(str).str.strip() == "").sum())

# group_id가 이상한 유저 20명만 보기
bad = tmp[tmp["group_id"].isna() | (tmp["group_id"].astype(str).str.strip() == "")]
bad[["user_id","group_id"]].head(20)


group_id NaN: 0
group_id 공백: 0


Unnamed: 0,user_id,group_id


In [72]:
dup = funnel_user.groupby("user_id").size().reset_index(name="cnt")
dup = dup[dup["cnt"] > 1].sort_values("cnt", ascending=False)
print("중복 user_id 수:", len(dup))
dup.head(20)


중복 user_id 수: 0


Unnamed: 0,user_id,cnt


In [73]:
# abcde_user에 n_signup_in_group 붙어있는 상태라고 가정
edge = abcde_user[abcde_user["n_signup_in_group"].isin([3,4,5])].copy()

print(edge["n_signup_in_group"].value_counts())

# 특히 4인 반 유저 목록
edge4 = edge[edge["n_signup_in_group"] == 4][["user_id","group_id"]].drop_duplicates()
edge4.head(30), len(edge4)


n_signup_in_group
5    17540
4    17196
Name: count, dtype: int64


(     user_id group_id
 23   1051061    16812
 35   1184904    42654
 36   1184905    47093
 43   1184913    51504
 47   1184918    51132
 53   1184924    29388
 56   1184927    44968
 76   1184949    51510
 79   1184952    17103
 87   1184961     7652
 109  1184983    13431
 118  1184993    24277
 144  1185026     2684
 156  1185040    51516
 166  1185051    45751
 180  1185065    13999
 184  1185070    41791
 220  1185114     9381
 230  1185124    43848
 239  1185135      962
 243  1185139    27955
 266  1185162    17786
 268  1185164    17309
 287  1185186    45987
 302  1185201    23045
 312  1185214    35318
 330  1185237    45987
 340  1185249    16220
 356  1185267    22821
 372  1185284    39529,
 17196)

In [74]:
# 4명 반 목록
g4 = abcde_user.loc[abcde_user["n_signup_in_group"] == 4, "group_id"].drop_duplicates()
print("4명 반 개수:", g4.nunique())

# 4명 반에 속한 유저 수(너가 본 17196과 같아야 정상)
u4 = abcde_user[abcde_user["group_id"].isin(set(g4.tolist()))]["user_id"].nunique()
print("4명 반 유저 수:", u4)

4명 반 개수: 4299
4명 반 유저 수: 17196


In [75]:
# 유입 유저의 가입 시각
signup_time = abcde_user.drop_duplicates("user_id")[["user_id","created_at"]].copy()
signup_time = signup_time.rename(columns={"created_at":"signup_at"})

# 참여 유저의 첫 투표 시각(능동/수동 둘 다)
q = abcde_question[["user_id","chosen_user_id","created_at"]].copy()
q["created_at"] = pd.to_datetime(q["created_at"], errors="coerce")

first_choose = q[["user_id","created_at"]].dropna().rename(columns={"created_at":"first_participate_at"})
first_chosen = q[["chosen_user_id","created_at"]].dropna().rename(columns={"chosen_user_id":"user_id","created_at":"first_participate_at"})
first_part = pd.concat([first_choose, first_chosen], ignore_index=True)
first_part = first_part.sort_values("first_participate_at").drop_duplicates("user_id", keep="first")

tmp = signup_time.merge(first_part, on="user_id", how="left")
tmp["days_to_participate"] = (tmp["first_participate_at"] - tmp["signup_at"]).dt.days

# 분포 확인
print(tmp["days_to_participate"].value_counts(dropna=False).sort_index().head(20))
print("가입 후 0일 내 참여율:", (tmp["days_to_participate"]==0).mean())
print("가입 후 7일 내 참여율:", (tmp["days_to_participate"]<=7).mean())


days_to_participate
0.0     2362
1.0      345
2.0      198
3.0      152
4.0      122
5.0       88
6.0       70
7.0       54
8.0       60
9.0       40
10.0      34
11.0      44
12.0      28
13.0      33
14.0      21
15.0      16
16.0       5
17.0       4
18.0       8
19.0       4
Name: count, dtype: int64
가입 후 0일 내 참여율: 0.007824145538382038
가입 후 7일 내 참여율: 0.011232716985882088


In [76]:
active_set = set(abcde_question["user_id"].dropna().astype(str).unique())
passive_set = set(abcde_question["chosen_user_id"].dropna().astype(str).unique())

both = active_set & passive_set
only_active = active_set - passive_set
only_passive = passive_set - active_set

print("능동만:", len(only_active))
print("수동만:", len(only_passive))
print("둘다:", len(both))
print("전체 참여:", len(active_set | passive_set))


능동만: 19
수동만: 4390
둘다: 1686
전체 참여: 6095


In [77]:
# 유입 유저 기준으로 세그먼트 테이블 만들기
base = abcde_user.drop_duplicates("user_id")[["user_id","group_id"]].copy()
base["is_participated"] = base["user_id"].isin(set(participation_u.astype(str).tolist())).astype(int)

seg = base.groupby("group_id").agg(
    inflow=("user_id","nunique"),
    participated=("is_participated","sum")
).reset_index()
seg["p_rate"] = seg["participated"] / seg["inflow"]

# 유입이 큰데 참여율 낮은 반 TOP
seg = seg.sort_values(["inflow","p_rate"], ascending=[False, True])
seg.head(20)


Unnamed: 0,group_id,inflow,participated,p_rate
24666,67097,80,0,0.0
16061,53560,66,0,0.0
20381,59890,55,0,0.0
7930,39451,52,0,0.0
9038,41946,52,0,0.0
8247,4021,51,0,0.0
17898,56063,50,0,0.0
14510,51408,48,5,0.104167
6453,35726,46,0,0.0
27991,73024,44,0,0.0


In [78]:
non_part = set(inflow_users.astype(str)) - set(participation_u.astype(str))
non_part = list(non_part)

# 비참여 유저 중 출석이라도 있는지(=앱 방문은 했는지)
att_u = set(accounts_attendance["user_id"].astype(str).unique())
print("비참여인데 출석은 있는 유저:", len(set(non_part) & att_u))


비참여인데 출석은 있는 유저: 183947


In [79]:
base = abcde_user.copy()

base["is_participated"] = base["user_id"].isin(
    set(participation_u.astype(str))
).astype(int)

summary = base.groupby("is_participated").agg(
    users=("user_id","nunique"),
    avg_group_size=("n_signup_in_group","mean"),
).reset_index()

summary


Unnamed: 0,is_participated,users,avg_group_size
0,0,298157,11.802624
1,1,3729,12.785465


In [80]:
group_effect = (
    base.groupby("n_signup_in_group")["is_participated"]
    .mean()
    .reset_index(name="participation_rate")
)

group_effect.sort_values("n_signup_in_group")


Unnamed: 0,n_signup_in_group,participation_rate
0,4,0.008316
1,5,0.009179
2,6,0.008269
3,7,0.010111
4,8,0.0108
5,9,0.01166
6,10,0.011231
7,11,0.012567
8,12,0.012562
9,13,0.01036


In [81]:
ready_users = base[
    (base["n_signup_in_group"] >= 4) &
    (base["is_participated"] == 0)
]

In [82]:
len(ready_users)

298157

In [83]:
# user_id 기준 투표 횟수
vote_cnt = (
    abcde_question
    .groupby("user_id")
    .size()
    .reset_index(name="vote_count")
)

# 2회 이상 투표 유저
repeat_voters = vote_cnt[vote_cnt["vote_count"] >= 2]["user_id"]

print("투표 2회 이상 유저 수:", repeat_voters.nunique())

투표 2회 이상 유저 수: 1697


In [84]:
repeat_voters_inflow = repeat_voters[
    repeat_voters.isin(set(inflow_users.astype(str)))
]

print("유입 → 투표 2회 이상:", repeat_voters_inflow.nunique())

유입 → 투표 2회 이상: 1697


In [85]:
active = (abcde_question[["user_id"]].rename(columns={"user_id":"user"}))

passive = (abcde_question[["chosen_user_id"]].rename(columns={"chosen_user_id":"user"}))

all_participation = pd.concat([active, passive], ignore_index=True)

participation_cnt = (all_participation.groupby("user").size().reset_index(name="participation_count")
)

repeat_participants = participation_cnt[
    participation_cnt["participation_count"] >= 2
]["user"]

print("참여 2회 이상 유저:", repeat_participants.nunique())


참여 2회 이상 유저: 5429


In [86]:
base = abcde_user[["user_id"]].drop_duplicates()

base["repeat_voter"] = base["user_id"].isin(set(repeat_voters)).astype(int)
base["payer"] = base["user_id"].isin(
    set(accounts_paymenthistory["user_id"].astype(str))
).astype(int)

pd.crosstab(base["repeat_voter"], base["payer"], normalize="index")

payer,0,1
repeat_voter,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.886222,0.113778
1,0.873895,0.126105


- 수익 관련 분석

In [87]:
accounts_paymenthistory.columns

Index(['user_id', 'productId', 'phone_type', 'created_at'], dtype='object')

In [88]:
candidates = {
    "accounts_paymenthistory": accounts_paymenthistory,
    "event_receipts": globals().get("event_receipts", None),
    "events": globals().get("events", None),
}

for name, df in candidates.items():
    if df is None:
        continue
    cols = [c.lower() for c in df.columns]
    money_cols = [c for c in df.columns if any(k in c.lower() for k in ["price","amount","value","revenue","cost","currency","money","paid"])]
    pid_cols = [c for c in df.columns if "product" in c.lower() or "sku" in c.lower() or "item" in c.lower()]
    print(f"\n[{name}] rows={len(df):,}")
    print("  money-ish:", money_cols)
    print("  product-ish:", pid_cols[:10])



[accounts_paymenthistory] rows=95,140
  money-ish: []
  product-ish: ['productId']

[event_receipts] rows=309
  money-ish: []
  product-ish: []

[events] rows=3
  money-ish: []
  product-ish: []


In [89]:
import pandas as pd
import os

base = "/Users/10moo/260128_proj"
dump = os.path.join(base, "dump_vote_ver2")

event_receipts = pd.read_csv(os.path.join(dump, "event_receipts.csv"))
events = pd.read_csv(os.path.join(dump, "events.csv"))

print(event_receipts.shape, events.shape)
print("event_receipts cols:", list(event_receipts.columns))
print("events cols:", list(events.columns))

(309, 5) (3, 6)
event_receipts cols: ['id', 'created_at', 'event_id', 'user_id', 'plus_point']
events cols: ['id', 'title', 'plus_point', 'event_type', 'is_expired', 'created_at']


- 정확한 금액을 알 수 있는 컬럼이나 테이블이 없었던 기억
- 지피티한테, 가격과 관련한 컬럼이 있는지 교차검증 할 수 있는 코드 짜달라고 해서 해클까지 확인
    - 구체적인 매출을 알 수 있는 방법은 없을듯 함

In [90]:
# 형변환, 날짜 설정

pay = accounts_paymenthistory.copy()

pay["user_id"] = pay["user_id"].astype(str)
pay["created_at"] = pd.to_datetime(pay["created_at"])

START = pd.to_datetime("2023-05-13")
END = pd.to_datetime("2024-05-07")

pay = pay[(pay["created_at"] >= START) & (pay["created_at"] <= END)]

In [91]:
inflow_set = set(inflow_users.astype(str))
part_set = set(participation_u.astype(str))

In [92]:
# 함수로 작성
def heart_kpis(base_set, label):

    # 전체 결제 중에서 유입/참여 유저가 한 결제만 가져옴
    sub = pay[pay["user_id"].isin(base_set)].copy()

    # 분석 유저 수
    users_cnt = len(base_set)
    # 실제로 하트를 한번이라도 산 사람
    buyers = sub["user_id"].nunique()
    # 결제 이벤트 수
    tx = len(sub)

    # 구매 횟수 유저별로 카운트
    user_tx = sub.groupby("user_id").size()

    # 평균 구매 횟수
    freq = tx / max(1, buyers)
    # 재구매 유저
    repurchase = (user_tx >= 2).sum()
    # 재구매율
    repurchase_rate = repurchase / max(1, buyers)

    print(f"{label} → 하트 수익")
    print("기준 유저:", users_cnt)
    print("하트 구매 유저:", buyers)
    print("구매 전환율:", buyers / max(1, users_cnt))
    print("총 구매 건수:", tx)
    print("구매자당 평균 구매:", freq)
    print("재구매 유저:", repurchase)
    print("재구매율:", repurchase_rate)

    print("\n상품 별 구매자 수")
    print(sub["productId"].value_counts().head(10))

In [93]:
heart_kpis(inflow_set, "유입")

유입 → 하트 수익
기준 유저: 301886
하트 구매 유저: 34369
구매 전환율: 0.1138476113499798
총 구매 건수: 57401
구매자당 평균 구매: 1.670138787861154
재구매 유저: 9971
재구매율: 0.29011609299077656

상품 별 구매자 수
productId
heart.777     33592
heart.1000    12354
heart.200     10128
heart.4000     1327
Name: count, dtype: int64


In [94]:
heart_kpis(part_set, "참여")

참여 → 하트 수익
기준 유저: 6095
하트 구매 유저: 673
구매 전환율: 0.11041837571780147
총 구매 건수: 1150
구매자당 평균 구매: 1.7087667161961366
재구매 유저: 193
재구매율: 0.2867756315007429

상품 별 구매자 수
productId
heart.777     663
heart.1000    261
heart.200     200
heart.4000     26
Name: count, dtype: int64


- 유입 퍼널의 결제 전환율: 11.3%, 재구매율: 29%
- 참여 퍼널의 결제 전환율: 11.0%, 재구매율: 28.6%
    - 구매는 참여의 깊이보다 서비스 진입 초반 상태에서 이미 결정됨
- 평균 구매 횟수: 약 1.7회
- 중간 가격 상품(777개)이 메인


In [95]:
# 일부 유저들만 구매를 하는건지

user_purchase = pay.groupby("user_id").size().reset_index(name="tx")

# 유저별 구매 횟수
# 10등급으로 나눔
# 각 그룹이 만든 총 구매량 합계
user_purchase["percentile"] = pd.qcut(user_purchase["tx"], q=10, labels=False, duplicates="drop")

user_purchase.groupby("percentile")["tx"].sum()

# 가장 구매를 적게 한 유저 그룹인데 전체 구매의 가장 많은 양을 차지

percentile
0    60212
1    10266
2    24658
Name: tx, dtype: int64

In [96]:
# 구매 속도

# 가입 시간
signup_time = accounts_user[["user_id","created_at"]].copy()
signup_time["created_at"] = pd.to_datetime(signup_time["created_at"])

# 구매 최초 시간
purchase_time = pay.groupby("user_id")["created_at"].min().reset_index()
purchase_time["created_at"] = pd.to_datetime(purchase_time["created_at"])

# 타입 통일
signup_time["user_id"] = signup_time["user_id"].astype(str)
purchase_time["user_id"] = purchase_time["user_id"].astype(str)

# merge
speed = signup_time.merge(purchase_time, on="user_id", how="inner", suffixes=("_signup", "_buy"))

# 구매까지 걸린 시간
# 가입일 ~ 첫 하트 구매까지 걸린 일수
speed["days_to_buy"] = (speed["created_at_buy"] - speed["created_at_signup"]).dt.days

speed["days_to_buy"].describe()

count    59177.000000
mean         9.206313
std         24.112832
min          0.000000
25%          1.000000
50%          3.000000
75%          8.000000
max        372.000000
Name: days_to_buy, dtype: float64

- 하트 구매까지 이어진 유저 수: 59177명(가입 -> 하트 구매 기록)
- 평균 구매까지 9.2일이 걸림
- 표준편차가 평균보다 훨씬 큼
    - max 구매 유저가 372일이므로 표준편차 차이가 나는 듯
    - 구매 타이밍이 매우 제각각, 유저 팽동 패턴이 하나가 아님
- 구매 유저 절반이 가입 후 3일 안에 결제

- 참여도 보자!!!

In [None]:
# 1) 질문/투표 로그 기간 필터 + 유입 유저만
q = funnel_question.copy()
q["created_at"] = pd.to_datetime(q["created_at"])
q = q[(q["created_at"] >= START) & (q["created_at"] <= END)].copy()

# 타입 통일(merge/isin 안정성)
q["user_id"] = q["user_id"].astype(str)
q["chosen_user_id"] = q["chosen_user_id"].astype(str)

abcde_signup_user = complete_signup["user_id"].unique()
signup_set = set(map(str, abcde_signup_user))
q = q[q["user_id"].isin(signup_set)].copy()

# 2) 참여 유저 정의 = 능동(user_id) + 수동(chosen_user_id)
active_users  = set(q["user_id"].unique())
passive_users = set(q["chosen_user_id"].unique())

part_users = active_users | passive_users

print("능동 유저:", len(active_users))
print("수동 유저:", len(passive_users))
print("참여(능동+수동) 유저:", len(part_users))

NameError: name 'complete_signup' is not defined

In [None]:
# 3) 첫 참여 시간(능동/수동 모두 포함)
# 능동: user_id 기준
active_time = (
    q.groupby("user_id")["created_at"].min()
     .reset_index()
     .rename(columns={"user_id": "user", "created_at": "first_part_at"})
)

# 수동: chosen_user_id 기준
passive_time = (
    q.groupby("chosen_user_id")["created_at"].min()
     .reset_index()
     .rename(columns={"chosen_user_id": "user", "created_at": "first_part_at"})
)

# 둘을 합쳐서 "유저별 첫 참여시간"으로 최종 집계
part_time = (
    pd.concat([active_time, passive_time], ignore_index=True)
      .groupby("user")["first_part_at"].min()
      .reset_index()
)

# 4) 구매 기간 필터 + 타입 통일
pay = accounts_paymenthistory.copy()
pay["created_at"] = pd.to_datetime(pay["created_at"])
pay = pay[(pay["created_at"] >= START) & (pay["created_at"] <= END)].copy()
pay["user_id"] = pay["user_id"].astype(str)

# 유저별 첫 구매
buy_time = (
    pay.groupby("user_id")["created_at"].min()
       .reset_index()
       .rename(columns={"user_id": "user", "created_at": "first_buy_at"})
)

In [None]:

# 5) 참여 → 구매 속도
speed_part = part_time.merge(buy_time, on="user", how="inner")
speed_part["days_to_buy"] = (speed_part["first_buy_at"] - speed_part["first_part_at"]).dt.days

print("참여→구매 유저 수:", len(speed_part))
print("음수(구매가 참여보다 빠름) 개수:", (speed_part["days_to_buy"] < 0).sum())

speed_part["days_to_buy"].describe()


In [None]:
# 참여 시간 만들기
active = abcde_question[["user_id", "created_at"]].rename(columns={"user_id": "user"})

passive = abcde_question[["chosen_user_id", "created_at"]].rename(columns={"chosen_user_id": "user"})

participation = pd.concat([active, passive], ignore_index=True)

participation["created_at"] = pd.to_datetime(participation["created_at"])
participation["user"] = participation["user"].astype(str)

# 첫 참여 시간
part_time = (participation.groupby("user")["created_at"].min().reset_index().rename(columns={"user": "user_id"}))

In [None]:
# 첫 구매 시간

purchase_time = pay.groupby("user_id")["created_at"].min().reset_index()

purchase_time["created_at"] = pd.to_datetime(purchase_time["created_at"])
purchase_time["user_id"] = purchase_time["user_id"].astype(str)


In [None]:
# 참여 → 구매 merge

speed_part = part_time.merge(purchase_time, on="user_id", how="inner", suffixes=("_part", "_buy"))

speed_part["days_to_buy"] = (speed_part["created_at_buy"] - speed_part["created_at_part"]).dt.days

speed_part["days_to_buy"].describe()


In [None]:
pay.groupby("user_id").size().value_counts().sort_index()

In [None]:
vote_cnt = all_participation.groupby("user").size().reset_index(name="vote_cnt")

buyer = pay["user_id"].unique()
vote_cnt["is_buyer"] = vote_cnt["user"].isin(buyer)

vote_cnt.groupby(pd.cut(vote_cnt["vote_cnt"],
                        bins=[0,1,3,5,10,50,999])
                )["is_buyer"].mean()


In [None]:
pay["created_at"] = pd.to_datetime(pay["created_at"])

pay = pay.sort_values(["user_id","created_at"])
pay["next_gap"] = pay.groupby("user_id")["created_at"].diff().dt.days

pay["next_gap"].describe()