# 1. 환경설정

In [2]:
import numpy as np
import pandas as pd
# `plt` must be the pyplot submodule: the bare `matplotlib` package does not
# expose plotting functions such as plt.figure / plt.plot / plt.show.
import matplotlib.pyplot as plt
# NOTE(review): presumably imported only for its side effect of enabling Korean
# fonts in matplotlib figures — confirm; the name itself is not referenced here.
import koreanize_matplotlib
import seaborn as sns



In [3]:
# Show floats with two fixed decimals so large values are not rendered in
# scientific (exponent) notation.
pd.set_option('display.float_format', '{:.2f}'.format)

# 2. 데이터 불러오기


In [4]:
# Relative locations of the four CSV dumps read by the loading cell.
device_path, events_path, hackle_prop_path, user_path = (
    'dumps/hackle_csv/device_properties.csv',
    'dumps/hackle_csv/hackle_events.csv',
    'dumps/hackle_csv/hackle_properties.csv',
    'dumps/hackle_csv/user_properties.csv',
)

In [5]:
# Read each raw CSV dump into its own DataFrame.
raw_device = pd.read_csv(filepath_or_buffer=device_path)
raw_hackle_events = pd.read_csv(filepath_or_buffer=events_path)
# NOTE: the "propeties" spelling is kept because later cells use this name.
raw_hackle_propeties = pd.read_csv(filepath_or_buffer=hackle_prop_path)
# NOTE(review): dtype={0: str} presumably forces the first column (by position)
# of the user file to be read as strings — confirm against the CSV header.
raw_user = pd.read_csv(filepath_or_buffer=user_path, dtype={0: str})

# 3. 데이터 확인

## 기기 설정

In [6]:
# Preview the first rows of the device table.
raw_device.head()

Unnamed: 0,id,device_id,device_model,device_vendor
0,1,000007C9-E103-4EB5-9777-A9084D4952DF,"iPhone14,7",Apple
1,2,00002245-458F-4CDD-8533-B448CD43DBD2,"iPhone14,7",Apple
2,3,00012620-313A-4502-9F8D-8DAB7443215B,"iPhone14,5",Apple
3,4,000137bc-80de-4bb5-b61d-df7f217a4501,SM-F711N,samsung
4,5,000227D6-B782-4367-91C4-486B76DF9E37,"iPhone12,3",Apple


In [7]:
# Column dtypes and non-null counts: no missing values in the device table.
raw_device.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252380 entries, 0 to 252379
Data columns (total 4 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             252380 non-null  int64 
 1   device_id      252380 non-null  object
 2   device_model   252380 non-null  object
 3   device_vendor  252380 non-null  object
dtypes: int64(1), object(3)
memory usage: 7.7+ MB


In [8]:
# Count rows that are exact duplicates across every column: there are none.
raw_device.duplicated().sum()

np.int64(0)

In [9]:
# The number of unique device_id values is lower than the row count,
# i.e. some device_id values appear more than once.
raw_device['device_id'].nunique()

251720

In [10]:
# 660 rows carry a device_id that already appeared in an earlier row.
raw_device['device_id'].duplicated().sum()

np.int64(660)

In [11]:
# Inspect every row whose device_id occurs more than once: the same device_id
# shows up with a different device_model (the device changed).
# Open question: the model is not important here, so is it acceptable to keep
# only the last row per device_id (keep='last')?
(
    raw_device
    .loc[lambda frame: frame['device_id'].duplicated(keep=False)]
    .sort_values(by=['device_id', 'id'])
    .head(10)
)

Unnamed: 0,id,device_id,device_model,device_vendor
16,17,0006bed8-512b-48f9-9b33-f79bb1e225a8,SM-A325N,samsung
17,18,0006bed8-512b-48f9-9b33-f79bb1e225a8,SM-S911N,samsung
704,705,00bf3a4e-4ef9-4a6a-8558-cc5f68d4041d,SM-A546S,samsung
705,706,00bf3a4e-4ef9-4a6a-8558-cc5f68d4041d,SM-A716S,samsung
1718,1719,01BBBADC-073C-4AEC-8C00-52F0A29577B4,"iPhone12,8",Apple
1719,1720,01BBBADC-073C-4AEC-8C00-52F0A29577B4,"iPhone14,4",Apple
1824,1825,01D2DB25-5EF6-447C-B3FA-63156C4DCBCF,"iPhone12,1",Apple
1825,1826,01D2DB25-5EF6-447C-B3FA-63156C4DCBCF,"iPhone14,5",Apple
2134,2135,021fd2a9-a0e5-42e8-b9d1-0c2244101869,SM-A315N,samsung
2135,2136,021fd2a9-a0e5-42e8-b9d1-0c2244101869,SM-S918N,samsung


In [12]:
# Number of distinct device models.
raw_device['device_model'].nunique()

522

## 해클 이벤트

In [13]:
# Summary statistics for the numeric event columns.
raw_hackle_events.describe()

# votes_count max is about 3,000 — needs checking
# heart_balance max is about 880 million? — needs checking

Unnamed: 0,friend_count,votes_count,heart_balance,question_id
count,10688763.0,10686765.0,10712676.0,449484.0
mean,54.34,257.27,16269.29,2766.39
std,33.51,218.07,3317340.09,1599.97
min,0.0,0.0,0.0,99.0
25%,32.0,97.0,434.0,1393.0
50%,49.0,210.0,1249.0,2569.0
75%,71.0,362.0,3188.0,4459.0
max,1365.0,3017.0,884999804.0,5133.0


In [14]:
# Column dtypes and memory footprint of the events table.
raw_hackle_events.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11441319 entries, 0 to 11441318
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   event_id        object 
 1   event_datetime  object 
 2   event_key       object 
 3   session_id      object 
 4   id              object 
 5   item_name       object 
 6   page_name       object 
 7   friend_count    float64
 8   votes_count     float64
 9   heart_balance   float64
 10  question_id     float64
dtypes: float64(4), object(7)
memory usage: 960.2+ MB


In [15]:
# Missing-value count per column. These gaps probably depend on the event_key
# type, so they should be checked separately for each event_key.
raw_hackle_events.isna().sum()

event_id                 0
event_datetime           0
event_key                0
session_id               0
id                       0
item_name         11428280
page_name         10652540
friend_count        752556
votes_count         754554
heart_balance       728643
question_id       10991835
dtype: int64

In [16]:
# Print the distinct values (NaN included) of item_name and page_name.
for column in ('item_name', 'page_name'):
    print(raw_hackle_events[column].unique())

[nan '777 하트' '무료충전소' '1000 하트' '200 하트' '4000 하트']
[nan 'notice' 'home' 'profile' '학교선택' '학년선택' '반선택' '번호인증' '성별선택' '아이디입력'
 '프사설정' 'invite' '이름입력']


In [17]:
# Show the full event row that holds the largest heart_balance value.
raw_hackle_events.loc[raw_hackle_events['heart_balance'].idxmax()]

event_id          01a583d7-b2b2-490b-96f6-1645b172ac0e
event_datetime                     2023-07-24 20:58:40
event_key                               $session_start
session_id                NnVWxmwjHcfnMENN9y4SrTPfcG82
id                01a583d7-b2b2-490b-96f6-1645b172ac0e
item_name                                          NaN
page_name                                          NaN
friend_count                                     65.00
votes_count                                      97.00
heart_balance                             884999804.00
question_id                                        NaN
Name: 73195, dtype: object

In [18]:
# Join the properties rows to the events sharing the same session_id.
# how='inner' is the pandas default, spelled out here for readability.
raw_hackle_merge = raw_hackle_propeties.merge(
    raw_hackle_events,
    how='inner',
    on='session_id',
)

In [19]:
 # Top 10 users ranked by their highest recorded friend_count.
(
    raw_hackle_merge
    .groupby('user_id')
    .agg(max_friend=('friend_count', 'max'))
    .reset_index()
    .sort_values('max_friend', ascending=False)
    .head(10)
)

Unnamed: 0,user_id,max_friend
126401,1353849,1365.0
56968,1153340,1261.0
324039,xrQ3bwiTWCQ5Da9d77f686NDs113,1261.0
213737,877266,795.0
197902,1zc5NMP9LPehQ7ARWfRfJOG1SjJ2,757.0
169484,1496245,757.0
139212,1395312,753.0
211946,866386,694.0
234429,957607,671.0
242195,987572,619.0


In [20]:
# Top 10 users ranked by their highest recorded votes_count.
(
    raw_hackle_merge
    .groupby('user_id')
    .agg(max_vote=('votes_count', 'max'))
    .reset_index()
    .sort_values('max_vote', ascending=False)
    .head(10)
)

Unnamed: 0,user_id,max_vote
132898,1375334,3017.0
57664,1155373,2696.0
46740,1125597,2548.0
9650,1023199,2419.0
284786,YYPgkedYLmhjaTy9XmD1LxiScR72,2419.0
93567,1253878,2309.0
289171,bL0TbwZAaxYmWLVpHzm0CxZzP772,2309.0
245433,9LdChceXXHZd0PB0KGFKh6ZKzhr2,2210.0
21200,1055839,2210.0
225639,923457,2088.0


In [21]:
# Top 30 users ranked by their highest recorded heart_balance.
(
    raw_hackle_merge
    .groupby('user_id')
    .agg(max_balance=('heart_balance', 'max'))
    .reset_index()
    .sort_values('max_balance', ascending=False)
    .head(30)
)

# TODO: these balances should probably be cross-checked against purchase history.

Unnamed: 0,user_id,max_balance
207325,833041,884999804.0
237089,967442,703126260.0
209194,849763,9991115.0
152222,1437875,9991115.0
223365,914589,263783.0
190320,1563520,208894.0
105600,1290502,151907.0
207476,838541,100142.0
194799,1577938,100142.0
194795,1577930,100142.0
