In [46]:
import pandas as pd
import numpy as np
from pathlib import Path

# 0) Path setup: every input is resolved relative to the current working directory.

ROOT = Path.cwd()
DATA_DIR = ROOT.joinpath("clean_vote_ver2")
OUT_DIR = ROOT.joinpath("analysis_out")

HACKLE_PATH = DATA_DIR.joinpath("processed_hackle_merge.csv")
PAY_PATH = DATA_DIR.joinpath("processed_accounts_paymenthistory.csv")
FAILPAY_PATH = DATA_DIR.joinpath("processed_accounts_failpaymenthistory.csv")

# Report whether each input file exists, together with its resolved path.
for _label, _path in (
    ("HACKLE:", HACKLE_PATH),
    ("PAY  :", PAY_PATH),
    ("FAIL :", FAILPAY_PATH),
):
    print(_label, _path.exists(), _path)


# 1) Options for handling large files

for _option, _value in (("display.max_columns", 200), ("display.width", 200)):
    pd.set_option(_option, _value)


# Presumably the row count per chunk for chunked reads; not used in the cells
# visible here -- TODO confirm where it is consumed.
CHUNK = 500_000


HACKLE: True /Users/10moo/260128_proj/clean_vote_ver2/processed_hackle_merge.csv
PAY  : True /Users/10moo/260128_proj/clean_vote_ver2/processed_accounts_paymenthistory.csv
FAIL : True /Users/10moo/260128_proj/clean_vote_ver2/processed_accounts_failpaymenthistory.csv


In [47]:
# Load the three inputs through the path constants from the setup cell, so the
# files that were checked for existence are exactly the files that get read.
# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk, at the cost of a higher peak memory use while parsing.
# NOTE(review): the stray output under this cell looks like a mixed-dtype
# warning for the hackle file -- confirm, and prefer an explicit dtype= mapping
# once the offending column is known.
hackle = pd.read_csv(HACKLE_PATH, low_memory=False)
pay = pd.read_csv(PAY_PATH)
fail = pd.read_csv(FAILPAY_PATH)

  hackle = pd.read_csv("clean_vote_ver2/processed_hackle_merge.csv")


In [48]:
# Heart-balance outliers.
# The cutoff is chosen arbitrarily; user counts observed at each candidate cutoff:
#   >= 100,000        : 11 users   <- cutoff used below
#   >= 1,000,000      :  4 users
#   >= 10,000,000     :  2 users
#   >= 100,000,000    :  2 users
#   >= 1,000,000,000  :  0 users

THRESH = 100000

# Ids are cast to str, so float-typed ids come out as e.g. '1577938.0'.
over_threshold = hackle["heart_balance"] >= THRESH
out_users = hackle.loc[over_threshold, "user_id"].dropna().astype(str).unique()

print("이상치 유저 수:", len(out_users))
out_users[:10]

이상치 유저 수: 11


array(['1577938.0', '914589.0', '1577954.0', '1437875.0', '833041.0',
       '849763.0', '1290502.0', '967442.0', '1577930.0', '838541.0'],
      dtype=object)

In [49]:
# Payment and failed-payment records of the heart-balance outlier users.
# `out_users` (built in the outlier cell) holds float-formatted id strings such
# as '838541.0'; parse them back to numbers so they can match the numeric
# user_id column of the payment tables.
out_user_ids = pd.to_numeric(pd.Series(out_users), errors="coerce").dropna()

pay_out = pay[pay["user_id"].isin(out_user_ids)]
fail_out = fail[fail["user_id"].isin(out_user_ids)]

print("결제 기록 수:", len(pay_out))
print("실패 기록 수:", len(fail_out))

display(pay_out.head(10))
fail_out.tail()

결제 기록 수: 9
실패 기록 수: 0


Unnamed: 0,user_id,productId,phone_type,created_at
1918,838541,heart.200,I,2023-05-14 03:36:19
3132,838541,heart.777,I,2023-05-14 05:00:59
5757,833041,heart.4000,I,2023-05-14 07:59:46
13371,833041,heart.4000,I,2023-05-15 01:50:06
26856,838541,heart.200,I,2023-05-16 16:49:01
89610,1290502,heart.777,A,2023-06-23 06:48:40
89668,1563520,heart.777,A,2023-06-24 07:01:41
90060,1577930,heart.777,A,2023-07-09 07:27:17
90062,1577938,heart.777,I,2023-07-09 07:48:21


Unnamed: 0,id,user_id,productId,phone_type,created_at


In [50]:
# Activity logs of the heart-balance outlier users, newest first.
# `out_users` holds float-formatted id strings (e.g. '1290502.0'); convert them
# to numbers before matching against the numeric user_id column of the log.
out_user_ids = pd.to_numeric(pd.Series(out_users), errors="coerce").dropna()

out_logs = (
    hackle[hackle["user_id"].isin(out_user_ids)]
    .sort_values("event_datetime", ascending=False)
)

out_logs.head()
# Too many rows to inspect by eye.

Unnamed: 0,session_id,user_id,language,osname,osversion,versionname,device_id,event_id,event_datetime,event_key,item_name,page_name,friend_count,votes_count,heart_balance,question_id
17945534,ea0e4dd9-8447-4580-a140-6869d6ff39a5,1290502.0,ko,Android,13,2.0.0,ea0e4dd9-8447-4580-a140-6869d6ff39a5,59a31307-1504-4882-be22-f02d594f94d0,2023-08-10 23:24:18,view_timeline_tap,,,54.0,140.0,151907.0,
17945552,ea0e4dd9-8447-4580-a140-6869d6ff39a5,1290502.0,ko,Android,13,2.0.0,ea0e4dd9-8447-4580-a140-6869d6ff39a5,f6568435-994c-4bc5-8391-ac4c50e8bff1,2023-08-10 23:24:18,view_profile_tap,,,54.0,140.0,151907.0,
11395625,ea0e4dd9-8447-4580-a140-6869d6ff39a5,1290502.0,ko,Android,13,2.0.5,ea0e4dd9-8447-4580-a140-6869d6ff39a5,511be6f2-eb39-4108-8f98-9ec63d382a48,2023-08-10 23:24:18,click_bottom_navigation_profile,,,54.0,140.0,151907.0,
17945537,ea0e4dd9-8447-4580-a140-6869d6ff39a5,1290502.0,ko,Android,13,2.0.0,ea0e4dd9-8447-4580-a140-6869d6ff39a5,8ea7b0c7-46d7-4a44-be0e-47e21c83696c,2023-08-10 23:24:18,view_lab_tap,,,54.0,140.0,151907.0,
11395627,ea0e4dd9-8447-4580-a140-6869d6ff39a5,1290502.0,ko,Android,13,2.0.5,ea0e4dd9-8447-4580-a140-6869d6ff39a5,59a31307-1504-4882-be22-f02d594f94d0,2023-08-10 23:24:18,view_timeline_tap,,,54.0,140.0,151907.0,


In [51]:
# Per-user summary: number of events, max heart balance, min heart balance
def show_user_log(user_id, logs=None, n_tail=30):
    """Print a short summary of one user's events and return their latest rows.

    Parameters
    ----------
    user_id
        Value compared for equality against the ``user_id`` column.
    logs : pandas.DataFrame, optional
        Event log to search. When omitted, the notebook-level ``hackle`` frame
        is used.
    n_tail : int, default 30
        Number of rows to return from the end of the user's time-ordered log.

    Returns
    -------
    pandas.DataFrame
        The last ``n_tail`` rows of the user's log ordered by
        ``event_datetime``, restricted to the inspection columns. Returning the
        frame (instead of evaluating and discarding it) lets a notebook cell
        that ends with this call actually display it.
    """
    if logs is None:
        logs = hackle

    # Stable sort keeps the incoming order for events sharing a timestamp.
    user_log = logs[logs["user_id"] == user_id].sort_values(
        "event_datetime", kind="stable"
    )

    print(f"\n===== USER {user_id} =====")
    print("총 이벤트 수:", len(user_log))
    print("최대 하트:", user_log["heart_balance"].max())
    print("최소 하트:", user_log["heart_balance"].min())

    inspect_cols = [
        "event_datetime",
        "event_key",
        "heart_balance",
        "friend_count",
        "votes_count",
        "item_name",
        "page_name",
    ]
    return user_log[inspect_cols].tail(n_tail)

In [52]:
# `out_users` stores ids as float-formatted strings (e.g. '1563520.0') while
# hackle["user_id"] is numeric, so convert before the lookup.
show_user_log(float(out_users[10]))
# Why does this user top out at only ~200k hearts?
# What about the users with 700M / 800M hearts?
# Was heart_balance a cumulative record?


===== USER 1563520.0 =====
총 이벤트 수: 40
최대 하트: 208894.0
최소 하트: 208894.0


In [53]:
# Largest heart balance anywhere in the log -- the ~800M value is really there.
max_heart_balance = hackle["heart_balance"].max()
max_heart_balance

np.float64(884999804.0)

In [54]:
# Full event row at which the ~800M maximum balance occurs.
top_row_label = hackle["heart_balance"].idxmax()
hackle.loc[top_row_label]

session_id                NnVWxmwjHcfnMENN9y4SrTPfcG82
user_id                                       833041.0
language                                         ko-KR
osname                                             iOS
osversion                                       16.5.1
versionname                                      2.0.5
device_id         420F81D6-4206-4D72-B42D-207341327899
event_id          01a583d7-b2b2-490b-96f6-1645b172ac0e
event_datetime                     2023-07-24 20:58:40
event_key                               $session_start
item_name                                          NaN
page_name                                          NaN
friend_count                                      65.0
votes_count                                       97.0
heart_balance                              884999804.0
question_id                                        NaN
Name: 7323975, dtype: object

In [55]:
# Event history of the user who holds the ~800M maximum balance.
top_row_label = hackle["heart_balance"].idxmax()
uid = hackle.loc[top_row_label, "user_id"]

history_cols = ["user_id", "event_datetime", "event_key", "heart_balance"]
hackle.loc[hackle["user_id"] == uid, history_cols].sort_values("event_datetime")

Unnamed: 0,user_id,event_datetime,event_key,heart_balance
10310396,833041.0,2023-07-18 07:28:20,$session_start,884999804.0
10310427,833041.0,2023-07-18 07:28:20,launch_app,884999804.0
7323988,833041.0,2023-07-18 07:28:20,$session_start,884999804.0
7324019,833041.0,2023-07-18 07:28:20,launch_app,884999804.0
7324012,833041.0,2023-07-18 07:28:24,view_lab_tap,884999804.0
...,...,...,...,...
7323978,833041.0,2023-08-02 22:26:50,view_lab_tap,884999804.0
7324039,833041.0,2023-08-02 22:26:50,click_bottom_navigation_profile,884999804.0
7324094,833041.0,2023-08-02 22:26:50,view_profile_tap,884999804.0
10310386,833041.0,2023-08-02 22:26:50,view_lab_tap,884999804.0


In [56]:
# Only two purchase records exist for this user.
# `uid` is the id of the max-balance user, assigned in the preceding cell.
pay[pay["user_id"] == uid]

Unnamed: 0,user_id,productId,phone_type,created_at
5757,833041,heart.4000,I,2023-05-14 07:59:46
13371,833041,heart.4000,I,2023-05-15 01:50:06


In [57]:
# No failed-purchase records for this user.
# `uid` is the id of the max-balance user, assigned when the max row was looked up.
fail[fail["user_id"] == uid]

Unnamed: 0,id,user_id,productId,phone_type,created_at


In [58]:
# This user has 65 friends.
# `uid` is the id of the max-balance user, assigned when the max row was looked up.
hackle.loc[hackle["user_id"] == uid, "friend_count"].max()

np.float64(65.0)

- 하트 기록은 고정된 값이 아니라, 시간에 따라 변하는 값이었음!
- 따라서 하트 증가/감소폭이 급격한 유저를 찾아야 함

In [59]:
# Compare each outlier user's peak heart balance with their final heart balance.

target_users = out_users

# `out_users` holds float-formatted id strings (e.g. '833041.0'). Parsing that
# short list once gives the same matches as casting every row of the full log
# to str, without the per-row string conversion.
target_ids = pd.to_numeric(pd.Series(target_users), errors="coerce").dropna()

target_df = hackle[hackle["user_id"].isin(target_ids)].copy()
target_df = target_df.sort_values(["user_id", "event_datetime"])

summary = (
    target_df.groupby("user_id")
    .agg(
        event_cnt=("event_id", "count"),
        heart_max=("heart_balance", "max"),
        heart_avg=("heart_balance", "mean"),
        last_dt=("event_datetime", "max"),
    )
    .reset_index()
)

# Final heart balance per user = the last *recorded* balance in time order.
# Rows without a heart_balance are dropped first; otherwise a user whose final
# event carries no balance would get heart_last = NaN.
last_heart = (
    target_df.dropna(subset=["event_datetime", "heart_balance"])
    .groupby("user_id")
    .tail(1)[["user_id", "heart_balance"]]
    .rename(columns={"heart_balance": "heart_last"})
)

summary = summary.merge(last_heart, on="user_id", how="left") \
                 .sort_values("heart_max", ascending=False)

summary.head(15)
# The user who peaked at ~700M hearts ends with a very small final balance.

Unnamed: 0,user_id,event_cnt,heart_max,heart_avg,last_dt,heart_last
0,833041.0,268,884999804.0,884999800.0,2023-08-02 22:26:50,884999804.0
4,967442.0,45,703126260.0,445888100.0,2023-08-05 18:29:19,8517.0
2,849763.0,2702,9991115.0,6527275.0,2023-08-10 17:08:48,9991115.0
6,1437875.0,3129,9991115.0,8354790.0,2023-08-10 17:08:48,9991115.0
3,914589.0,1022,263783.0,263637.2,2023-08-10 18:16:19,263783.0
7,1563520.0,40,208894.0,208894.0,2023-07-25 22:13:16,208894.0
5,1290502.0,66,151907.0,151907.0,2023-08-10 23:24:18,151907.0
1,838541.0,568,100142.0,69171.3,2023-08-04 19:22:15,95474.0
8,1577930.0,568,100142.0,69171.3,2023-08-04 19:22:15,95474.0
9,1577938.0,1136,100142.0,69171.3,2023-08-04 19:22:15,95474.0


In [60]:
# Heart-balance change between consecutive events

hackle = hackle.sort_values(by=["user_id", "event_datetime"])

# Change versus the same user's previous row; rows with a missing balance are
# still included here.
balance_by_user = hackle.groupby("user_id")["heart_balance"]
hackle["heart_diff"] = balance_by_user.diff()

preview_cols = ["user_id", "event_datetime", "heart_balance", "heart_diff"]
hackle[preview_cols].head()

Unnamed: 0,user_id,event_datetime,heart_balance,heart_diff
16377238,831962.0,2023-07-20 18:50:09,,
16377244,831962.0,2023-07-20 18:50:09,,
16377243,831962.0,2023-07-20 18:50:14,,
16377249,831962.0,2023-07-20 18:50:14,,
16377240,831962.0,2023-07-20 18:50:16,,


In [61]:
# Why does every diff come out null?
# Count the rows whose heart_balance is missing.
missing_balance = hackle["heart_balance"].isna()
missing_balance.sum()

np.int64(875889)

In [62]:
# 0) Order each user's events chronologically
hackle = hackle.sort_values(by=["user_id", "event_datetime"])

# 1) Keep only the rows that carry a heart_balance
hb_cols = ["user_id", "event_datetime", "event_key", "heart_balance"]
has_balance = hackle["heart_balance"].notna()
hb = hackle.loc[has_balance, hb_cols].copy()

# 2) Per-user change versus the previous recorded balance
hb["heart_diff"] = hb.groupby("user_id")["heart_balance"].diff()

# 3) Spot-check a single user (the first one in sort order)
uid = hb["user_id"].iloc[0]
hb.loc[hb["user_id"] == uid].head(20)

Unnamed: 0,user_id,event_datetime,event_key,heart_balance,heart_diff
16377252,831962.0,2023-07-20 18:50:17,click_bottom_navigation_lab,1982.0,
16377254,831962.0,2023-07-20 18:50:17,view_lab_tap,1982.0,0.0
16377241,831962.0,2023-07-20 18:50:20,click_bottom_navigation_profile,1982.0,0.0
16377237,831962.0,2023-07-20 18:50:24,click_bottom_navigation_lab,1982.0,0.0
16377245,831962.0,2023-07-20 18:50:24,view_lab_tap,1982.0,0.0
16377248,831962.0,2023-07-20 18:50:25,click_bottom_navigation_profile,1982.0,0.0
16377246,831962.0,2023-07-20 18:50:26,view_lab_tap,1982.0,0.0
16377250,831962.0,2023-07-20 18:50:26,click_bottom_navigation_lab,1982.0,0.0
16377242,831962.0,2023-07-20 19:28:36,click_bottom_navigation_profile,1982.0,0.0
16377247,831962.0,2023-07-20 19:28:36,view_lab_tap,1982.0,0.0


In [63]:
# Rows where the balance moved by more than 4000 hearts in a single step.
# The most expensive heart pack is 4000, hence the cutoff of 4001.
TH = 4001

big_jump = hb["heart_diff"].abs().ge(TH)
jump_users = hb.loc[big_jump, ["user_id"]]

jump_users.head(20)

Unnamed: 0,user_id
5766176,836487.0
14430244,838541.0
14430133,838541.0
14430344,838541.0
14430226,838541.0
5967079,843185.0
2501776,849244.0
16323300,849763.0
8295626,849763.0
16323299,849763.0


In [None]:
# 814 events in which the heart balance changed by 4001 or more
jump_users.shape[0]

814

In [65]:
TH = 4001

hackle = hackle.sort_values(by=["user_id", "event_datetime"])

# Rows with a recorded balance, plus the per-user change versus the previous one.
hb_cols = ["user_id", "event_datetime", "event_key", "heart_balance"]
hb = hackle.loc[hackle["heart_balance"].notna(), hb_cols].copy()
hb["heart_diff"] = hb.groupby("user_id")["heart_balance"].diff()

# Distinct users with at least one jump of TH or more, as int64 ids.
jump_ids = hb.loc[hb["heart_diff"].abs().ge(TH), "user_id"]
jump_users = jump_ids.dropna().astype("int64").unique()

print("점프 유저 수:", len(jump_users))
jump_users[:50]

점프 유저 수: 388


array([836487, 838541, 843185, 849244, 849763, 852859, 859913, 860594,
       867575, 879916, 885945, 889629, 893651, 896349, 897148, 901798,
       902108, 904885, 905606, 909118, 913652, 913764, 917305, 924010,
       928431, 929393, 929425, 934103, 934983, 935764, 936072, 936203,
       937905, 939259, 940302, 945154, 945387, 955817, 955844, 958443,
       959102, 959305, 961722, 962413, 962619, 965306, 965867, 966185,
       966555, 967442])

In [66]:
TH = 4001

# Every event whose balance moved by TH or more, ordered per user and time,
# with user_id shown as an integer.
jump_cols = ["user_id", "event_datetime", "event_key", "heart_balance", "heart_diff"]
is_jump = hb["heart_diff"].abs().ge(TH)
jump_logs = hb.loc[is_jump, jump_cols].sort_values(by=["user_id", "event_datetime"])
jump_logs = jump_logs.astype({"user_id": "int64"})

print("점프 로그 수:", len(jump_logs))
jump_logs.head(50)


점프 로그 수: 814


Unnamed: 0,user_id,event_datetime,event_key,heart_balance,heart_diff
5766176,836487,2023-07-26 23:58:14,launch_app,613.0,-4393.0
14430244,838541,2023-07-21 19:15:25,$session_start,1844.0,-95059.0
14430133,838541,2023-07-21 19:15:26,click_bottom_navigation_profile,100142.0,98298.0
14430344,838541,2023-07-21 19:17:42,$session_start,975.0,-99167.0
14430226,838541,2023-07-23 17:41:28,click_bottom_navigation_lab,96953.0,95760.0
5967079,843185,2023-08-10 13:05:27,launch_app,1646.0,-4687.0
2501776,849244,2023-07-27 03:13:56,view_timeline_tap,14319.0,4971.0
16323300,849763,2023-07-19 01:00:49,$session_start,766.0,-9990125.0
8295626,849763,2023-07-19 01:46:42,$session_start,9990891.0,9990125.0
16323299,849763,2023-07-19 19:50:17,$session_start,766.0,-9990125.0


In [None]:
# Check whether heart_diff really is the change versus the immediately preceding log.
uid = 849763

u = hb.loc[hb["user_id"] == uid].copy()
u["heart_balance"].value_counts().head(20)
# Note: this only counts how many times each balance value appears.

heart_balance
9990891.0    908
779.0        305
9990927.0    202
97007.0      188
9991027.0    144
1193.0       137
9990993.0    112
96953.0      108
9991052.0     90
9991115.0     80
766.0         73
9991105.0     64
9990981.0     54
96977.0       40
9990864.0     38
96903.0       34
9990988.0     24
9991019.0     20
514.0         14
100142.0      13
Name: count, dtype: int64

In [None]:
# 
# Chronological balance history of the user selected in the preceding cell (`uid`).
u = hb[hb["user_id"] == uid].copy()

# Many events share a timestamp. A stable sort keeps their existing relative
# order from `hb`, so a diff recomputed on `u` lines up row-for-row with
# hb["heart_diff"]; the default sort for a single key is not stable.
u = u.sort_values("event_datetime", kind="stable")

u[["event_datetime", "event_key", "heart_balance"]].head(60)


Unnamed: 0,event_datetime,event_key,heart_balance
8295627,2023-07-18 00:24:39,click_bottom_navigation_timeline,9990864.0
14436270,2023-07-18 00:24:39,click_bottom_navigation_timeline,9990864.0
8295507,2023-07-18 00:24:40,click_bottom_navigation_questions,9990864.0
8295986,2023-07-18 00:24:40,click_bottom_navigation_lab,9990864.0
14436150,2023-07-18 00:24:40,click_bottom_navigation_questions,9990864.0
14436629,2023-07-18 00:24:40,click_bottom_navigation_lab,9990864.0
8295767,2023-07-18 00:24:41,view_lab_tap,9990864.0
14436410,2023-07-18 00:24:41,view_lab_tap,9990864.0
14436760,2023-07-18 01:02:29,click_bottom_navigation_timeline,9990864.0
14436537,2023-07-18 01:02:29,click_bottom_navigation_questions,9990864.0


In [None]:
# Recompute the step-to-step balance change for this single user.
u["heart_diff"] = u["heart_balance"].diff()

diff_cols = ["event_datetime", "event_key", "heart_balance", "heart_diff"]
u[diff_cols].head(50)

Unnamed: 0,event_datetime,event_key,heart_balance,heart_diff
8295627,2023-07-18 00:24:39,click_bottom_navigation_timeline,9990864.0,
14436270,2023-07-18 00:24:39,click_bottom_navigation_timeline,9990864.0,0.0
8295507,2023-07-18 00:24:40,click_bottom_navigation_questions,9990864.0,0.0
8295986,2023-07-18 00:24:40,click_bottom_navigation_lab,9990864.0,0.0
14436150,2023-07-18 00:24:40,click_bottom_navigation_questions,9990864.0,0.0
14436629,2023-07-18 00:24:40,click_bottom_navigation_lab,9990864.0,0.0
8295767,2023-07-18 00:24:41,view_lab_tap,9990864.0,0.0
14436410,2023-07-18 00:24:41,view_lab_tap,9990864.0,0.0
14436760,2023-07-18 01:02:29,click_bottom_navigation_timeline,9990864.0,0.0
14436537,2023-07-18 01:02:29,click_bottom_navigation_questions,9990864.0,0.0


In [76]:
# Show only the jump rows, using the same cutoff as the other jump filters
# (TH = 4001, i.e. strictly more than the largest 4000-heart pack) so that this
# per-user view agrees with jump_logs / jump_users.
u[u["heart_diff"].abs() >= TH][
    ["event_datetime", "event_key", "heart_balance", "heart_diff"]
]

Unnamed: 0,event_datetime,event_key,heart_balance,heart_diff
16323301,2023-07-19 01:00:49,launch_app,766.0,-9990125.0
14436269,2023-07-19 01:46:42,$session_start,9990891.0,9990125.0
16323299,2023-07-19 19:50:17,$session_start,766.0,-9990125.0
8296035,2023-07-19 20:09:29,$session_start,9990891.0,9990125.0
19400741,2023-07-20 16:00:50,click_bottom_navigation_profile,766.0,-9990125.0
...,...,...,...,...
14436653,2023-08-02 19:15:03,$session_end,9991052.0,9894045.0
17439863,2023-08-02 19:16:56,$session_end,97007.0,-9894045.0
8295630,2023-08-02 23:07:31,launch_app,9991052.0,9894045.0
17439671,2023-08-03 20:57:31,launch_app,97007.0,-9894045.0
