In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc

# 그래프 기본 테마 설정
# https://coldbrown.co.kr/2023/07/%ED%8C%8C%EC%9D%B4%EC%8D%AC-%EC%8B%A4%EC%A0%84%ED%8E%B8-08-seaborn-sns-set%EC%9D%84-%ED%86%B5%ED%95%B4-%EC%8A%A4%ED%83%80%EC%9D%BC-%EC%84%A4%EC%A0%95%ED%95%98%EA%B8%B0/
sns.set()

# 그래프 기본 설정
plt.rcParams['font.family'] = 'Malgun Gothic'
# plt.rcParams['font.family'] = 'AppleGothic'
plt.rcParams['figure.figsize'] = 12, 6
plt.rcParams['font.size'] = 14
plt.rcParams['axes.unicode_minus'] = False


# 복잡한 통계 처리를 위한 라이브러리
from scipy import stats

In [2]:
df = pd.read_csv('data/merged_data.csv')

In [3]:
# 메모리 감소를 위한 event_time drop
df = df.drop(columns=['event_time'])

## price가 0인 data 탐색

In [8]:
df_zero = df[df['price'] == 0]

In [9]:
# purchase인 경우
df_zero[df_zero['event_type'] == 'purchase']

Unnamed: 0,event_type,product_id,category_id,brand,price,user_id,user_session,event_time_moscow,event_month,user_key
13767115,purchase,5911801,1487580005671109489,,0.0,562735465,3b8c48f2-b4bd-400e-a786-eace723dbffa,2020-01-13 13:24:06+03:00,1,5627354653b8c48f2-b4bd-400e-a786-eace723dbffa


- 가격이 0인 경우의 purchase는 1건뿐

In [11]:
# 가격이 0인 product별로 고유값 계산
df_zero['product_id'].nunique

# 고유값과 고유값별 개수 구하기
np.unique(df_zero['product_id'], return_counts=True)
products, counts = np.unique(df_zero['product_id'], return_counts=True)

# 딕셔너리 형태로
product_count_dict = dict(zip(products, counts))
product_count_dict

{np.int64(3763): np.int64(3),
 np.int64(3774): np.int64(6),
 np.int64(3776): np.int64(1),
 np.int64(3806): np.int64(4),
 np.int64(3865): np.int64(7),
 np.int64(3936): np.int64(7),
 np.int64(3945): np.int64(10),
 np.int64(3959): np.int64(7),
 np.int64(3978): np.int64(1),
 np.int64(4092): np.int64(6),
 np.int64(4102): np.int64(3),
 np.int64(4104): np.int64(1),
 np.int64(4131): np.int64(5),
 np.int64(4184): np.int64(1),
 np.int64(4185): np.int64(1),
 np.int64(4203): np.int64(5),
 np.int64(4210): np.int64(1),
 np.int64(4229): np.int64(1),
 np.int64(4246): np.int64(5),
 np.int64(4476): np.int64(2),
 np.int64(4497): np.int64(9),
 np.int64(4540): np.int64(6),
 np.int64(4542): np.int64(4),
 np.int64(4552): np.int64(3),
 np.int64(4554): np.int64(12),
 np.int64(4560): np.int64(8),
 np.int64(4569): np.int64(5),
 np.int64(4571): np.int64(14),
 np.int64(4572): np.int64(11),
 np.int64(4586): np.int64(4),
 np.int64(4587): np.int64(5),
 np.int64(4591): np.int64(15),
 np.int64(4594): np.int64(3),
 np.i

- product_id와 price가 0인 경우는 관련성이 높지는 않아 보임

In [12]:
# 등장 횟수 기준으로 내림차순 정렬된 리스트 (튜플 형태)
sorted_products = sorted(product_count_dict.items(), key=lambda x: x[1], reverse=True)

# 상위 10개만 보기
top_10_products = sorted_products[:10]
top_10_products

[(np.int64(5907812), np.int64(205)),
 (np.int64(5896186), np.int64(177)),
 (np.int64(5891052), np.int64(174)),
 (np.int64(5903628), np.int64(149)),
 (np.int64(5903915), np.int64(133)),
 (np.int64(5924418), np.int64(129)),
 (np.int64(5891053), np.int64(121)),
 (np.int64(5904031), np.int64(116)),
 (np.int64(5773605), np.int64(90)),
 (np.int64(5896187), np.int64(89))]

- 자주 나오는 product_id 상위 10개임 → 조사 필요

In [13]:
# user_session별 등장 횟수 확인
df_zero['user_session'].value_counts()

user_session
74206ba1-c863-4760-bbdc-b11777059e46    5411
d7998d70-7562-4ff0-8870-5ace290db2c9    2816
6b408b88-8a2e-4e84-a1ea-2b5e23deaba7    2383
15047a59-66a9-4f06-8c5a-bc86688eaf41    1407
39d88d60-43c1-47a5-9d2f-90d08be62044    1203
                                        ... 
08738b86-4642-4dbe-8da1-7ad9ee2bd76f       1
b7efa5a7-e422-4081-afaf-a6d4f01f97fb       1
825a19d7-9905-49d4-9a1a-3f651efd1aa1       1
b7087089-41f5-48bc-b7eb-52fea86fa22c       1
00849bd2-fcd2-4cb4-af31-4e264f151848       1
Name: count, Length: 26255, dtype: int64

- 5411의 이상한 기록 존재
- 반면 1인 경우도 있어, 전부 이상기록으로 보기엔 애매

In [15]:
# 5411건의 user_session을 추적해 보기
df_5411 = df[df['user_session'] == '74206ba1-c863-4760-bbdc-b11777059e46']
df_5411

Unnamed: 0,event_type,product_id,category_id,brand,price,user_id,user_session,event_time_moscow,event_month,user_key
18698675,cart,24755,1487580007759872977,,0.0,445777038,74206ba1-c863-4760-bbdc-b11777059e46,2020-02-15 19:08:37+03:00,2,44577703874206ba1-c863-4760-bbdc-b11777059e46
18698676,cart,5710986,1487580010821714008,,0.0,445777038,74206ba1-c863-4760-bbdc-b11777059e46,2020-02-15 19:08:37+03:00,2,44577703874206ba1-c863-4760-bbdc-b11777059e46
18698677,cart,24755,1487580007759872977,,0.0,445777038,74206ba1-c863-4760-bbdc-b11777059e46,2020-02-15 19:08:37+03:00,2,44577703874206ba1-c863-4760-bbdc-b11777059e46
18698678,cart,5711049,1487580010872045658,,0.0,445777038,74206ba1-c863-4760-bbdc-b11777059e46,2020-02-15 19:08:37+03:00,2,44577703874206ba1-c863-4760-bbdc-b11777059e46
18698679,cart,24755,1487580007759872977,,0.0,445777038,74206ba1-c863-4760-bbdc-b11777059e46,2020-02-15 19:08:37+03:00,2,44577703874206ba1-c863-4760-bbdc-b11777059e46
...,...,...,...,...,...,...,...,...,...,...
18704116,remove_from_cart,5799929,1658462125284131265,,0.0,445777038,74206ba1-c863-4760-bbdc-b11777059e46,2020-02-15 19:08:57+03:00,2,44577703874206ba1-c863-4760-bbdc-b11777059e46
18704117,remove_from_cart,5800002,1658462125284131265,,0.0,445777038,74206ba1-c863-4760-bbdc-b11777059e46,2020-02-15 19:08:57+03:00,2,44577703874206ba1-c863-4760-bbdc-b11777059e46
18704118,remove_from_cart,5800002,1658462125284131265,,0.0,445777038,74206ba1-c863-4760-bbdc-b11777059e46,2020-02-15 19:08:57+03:00,2,44577703874206ba1-c863-4760-bbdc-b11777059e46
18704119,remove_from_cart,5802406,1487580004857414477,,0.0,445777038,74206ba1-c863-4760-bbdc-b11777059e46,2020-02-15 19:08:57+03:00,2,44577703874206ba1-c863-4760-bbdc-b11777059e46


In [16]:
df_5411['event_type'].value_counts()

event_type
cart                3749
remove_from_cart    1662
Name: count, dtype: int64

In [17]:
df_5411['product_id'].value_counts()

product_id
5798000    25
5772624    24
5800002    22
5712572    22
5797994    22
           ..
5779252     5
5779254     5
5779178     3
5712568     1
5751383     1
Name: count, Length: 521, dtype: int64

In [18]:
df_5411['brand'].value_counts(dropna=False)

brand
NaN    5411
Name: count, dtype: int64

- 한 유저가 19:08:37부터 19:08:57까지 약 20초 동안 cart와 remove_from_cart 반복
- 전부 brand = NaN

In [19]:
# price 0인 건이 5411인 user_id가 price가 0이 아닌 다른 물건도 구매했는지 확인
df_445777038 = df[(df['user_id'] == 445777038) & (df['price'] != 0)]
df_445777038

Unnamed: 0,event_type,product_id,category_id,brand,price,user_id,user_session,event_time_moscow,event_month,user_key
15458167,remove_from_cart,5867954,1487580009445982239,,1.51,445777038,3fde3b34-e72e-4bd6-b3c9-3c95cbe85ecf,2020-01-25 00:16:13+03:00,1,4457770383fde3b34-e72e-4bd6-b3c9-3c95cbe85ecf
15458171,remove_from_cart,5790689,1487580009445982239,nitrimax,3.97,445777038,3fde3b34-e72e-4bd6-b3c9-3c95cbe85ecf,2020-01-25 00:16:16+03:00,1,4457770383fde3b34-e72e-4bd6-b3c9-3c95cbe85ecf
15458180,remove_from_cart,5822287,1487580005595612013,,3.00,445777038,3fde3b34-e72e-4bd6-b3c9-3c95cbe85ecf,2020-01-25 00:16:25+03:00,1,4457770383fde3b34-e72e-4bd6-b3c9-3c95cbe85ecf
15458181,remove_from_cart,5822291,1487580005595612013,,3.00,445777038,3fde3b34-e72e-4bd6-b3c9-3c95cbe85ecf,2020-01-25 00:16:26+03:00,1,4457770383fde3b34-e72e-4bd6-b3c9-3c95cbe85ecf
15458182,remove_from_cart,5822306,1487580005595612013,,3.00,445777038,3fde3b34-e72e-4bd6-b3c9-3c95cbe85ecf,2020-01-25 00:16:26+03:00,1,4457770383fde3b34-e72e-4bd6-b3c9-3c95cbe85ecf
...,...,...,...,...,...,...,...,...,...,...
15460197,remove_from_cart,5811818,1487580005671109489,masura,5.54,445777038,3fde3b34-e72e-4bd6-b3c9-3c95cbe85ecf,2020-01-25 00:42:11+03:00,1,4457770383fde3b34-e72e-4bd6-b3c9-3c95cbe85ecf
15460200,remove_from_cart,5813476,1487580005671109489,masura,3.95,445777038,3fde3b34-e72e-4bd6-b3c9-3c95cbe85ecf,2020-01-25 00:42:15+03:00,1,4457770383fde3b34-e72e-4bd6-b3c9-3c95cbe85ecf
15460201,remove_from_cart,5797969,1487580005671109489,masura,1.73,445777038,3fde3b34-e72e-4bd6-b3c9-3c95cbe85ecf,2020-01-25 00:42:16+03:00,1,4457770383fde3b34-e72e-4bd6-b3c9-3c95cbe85ecf
15460202,remove_from_cart,5839691,1487580005671109489,masura,3.95,445777038,3fde3b34-e72e-4bd6-b3c9-3c95cbe85ecf,2020-01-25 00:42:16+03:00,1,4457770383fde3b34-e72e-4bd6-b3c9-3c95cbe85ecf


In [20]:
df_445777038['event_type'].value_counts()

event_type
cart                101
remove_from_cart     50
view                  4
Name: count, dtype: int64

- 같은 유저가 price가 0이 아닌 다른 물건도 구매
- brand는 다양하며, 대부분 같은 날짜에 구매

### 같은 product_id면 같은 price인가?

In [21]:
# price가 0이 아닌 df 생성
df_nonzero = df[df['price'] != 0]

In [22]:
# product_id 와 price만 보기
df_product_price = df_nonzero[['product_id','price']]

In [23]:
# 중복 제거 후 product_id 기준으로 정렬
df_product_price.drop_duplicates(inplace=True)
df_product_price.sort_values('product_id')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_product_price.drop_duplicates(inplace=True)


Unnamed: 0,product_id,price
59024,3752,15.71
4826,3762,19.37
6901312,3762,18.40
40456,3763,16.03
6929263,3763,15.54
...,...,...
20520951,5932581,6.02
20506445,5932582,6.02
20510028,5932583,6.02
20522512,5932584,6.02


In [24]:
df.loc[6929263]

event_type                                                    view
product_id                                                    3763
category_id                                    1487580005411062629
brand                                                          cnd
price                                                        15.54
user_id                                                  540274846
user_session                  097889b8-0e7a-4970-8741-36317d9dbf42
event_time_moscow                        2019-11-21 15:50:40+03:00
event_month                                                     11
user_key             540274846097889b8-0e7a-4970-8741-36317d9dbf42
Name: 6929263, dtype: object

In [25]:
df.loc[40456]

event_type                                                    view
product_id                                                    3763
category_id                                    1487580005411062629
brand                                                          cnd
price                                                        16.03
user_id                                                  551319165
user_session                  4cfee1d6-8a6f-4564-87b8-dc7277027cac
event_time_moscow                        2019-10-01 13:04:29+03:00
event_month                                                     10
user_key             5513191654cfee1d6-8a6f-4564-87b8-dc7277027cac
Name: 40456, dtype: object

- 한 달간의 차이가 있는 것으로 보아, 상품의 가격 변동이 있는 것으로 추측

### 어디까지 0을 그냥 둘 것인가

In [34]:
zero_user_session_count = df_zero['user_session'].value_counts()
zero_user_session_count

user_session
74206ba1-c863-4760-bbdc-b11777059e46    5411
d7998d70-7562-4ff0-8870-5ace290db2c9    2816
6b408b88-8a2e-4e84-a1ea-2b5e23deaba7    2383
15047a59-66a9-4f06-8c5a-bc86688eaf41    1407
39d88d60-43c1-47a5-9d2f-90d08be62044    1203
                                        ... 
08738b86-4642-4dbe-8da1-7ad9ee2bd76f       1
b7efa5a7-e422-4081-afaf-a6d4f01f97fb       1
825a19d7-9905-49d4-9a1a-3f651efd1aa1       1
b7087089-41f5-48bc-b7eb-52fea86fa22c       1
00849bd2-fcd2-4cb4-af31-4e264f151848       1
Name: count, Length: 26255, dtype: int64

In [164]:
df_zero[df_zero['user_session'] == '74206ba1-c863-4760-bbdc-b11777059e46']

Unnamed: 0,event_type,product_id,category_id,brand,price,user_id,user_session,event_time_moscow,event_month,user_key
18698675,cart,24755,1487580007759872977,,0.0,445777038,74206ba1-c863-4760-bbdc-b11777059e46,2020-02-15 19:08:37+03:00,2,44577703874206ba1-c863-4760-bbdc-b11777059e46
18698676,cart,5710986,1487580010821714008,,0.0,445777038,74206ba1-c863-4760-bbdc-b11777059e46,2020-02-15 19:08:37+03:00,2,44577703874206ba1-c863-4760-bbdc-b11777059e46
18698677,cart,24755,1487580007759872977,,0.0,445777038,74206ba1-c863-4760-bbdc-b11777059e46,2020-02-15 19:08:37+03:00,2,44577703874206ba1-c863-4760-bbdc-b11777059e46
18698678,cart,5711049,1487580010872045658,,0.0,445777038,74206ba1-c863-4760-bbdc-b11777059e46,2020-02-15 19:08:37+03:00,2,44577703874206ba1-c863-4760-bbdc-b11777059e46
18698679,cart,24755,1487580007759872977,,0.0,445777038,74206ba1-c863-4760-bbdc-b11777059e46,2020-02-15 19:08:37+03:00,2,44577703874206ba1-c863-4760-bbdc-b11777059e46
...,...,...,...,...,...,...,...,...,...,...
18704116,remove_from_cart,5799929,1658462125284131265,,0.0,445777038,74206ba1-c863-4760-bbdc-b11777059e46,2020-02-15 19:08:57+03:00,2,44577703874206ba1-c863-4760-bbdc-b11777059e46
18704117,remove_from_cart,5800002,1658462125284131265,,0.0,445777038,74206ba1-c863-4760-bbdc-b11777059e46,2020-02-15 19:08:57+03:00,2,44577703874206ba1-c863-4760-bbdc-b11777059e46
18704118,remove_from_cart,5800002,1658462125284131265,,0.0,445777038,74206ba1-c863-4760-bbdc-b11777059e46,2020-02-15 19:08:57+03:00,2,44577703874206ba1-c863-4760-bbdc-b11777059e46
18704119,remove_from_cart,5802406,1487580004857414477,,0.0,445777038,74206ba1-c863-4760-bbdc-b11777059e46,2020-02-15 19:08:57+03:00,2,44577703874206ba1-c863-4760-bbdc-b11777059e46


In [96]:
# 2816 / 상위 2번째
df_zero[df_zero['user_session'] == 'd7998d70-7562-4ff0-8870-5ace290db2c9']

# 첫 번째 경우만 다르고 나머지는 거의 비슷한 시간
# 봇이 아닐까 추정

Unnamed: 0,event_type,product_id,category_id,brand,price,user_id,user_session,event_time_moscow,event_month,user_key
2481778,view,5896182,1487580009051717646,,0.0,454177715,d7998d70-7562-4ff0-8870-5ace290db2c9,2019-10-19 05:07:27+03:00,10,454177715d7998d70-7562-4ff0-8870-5ace290db2c9
19819214,cart,4874,1487580011157258342,,0.0,454177715,d7998d70-7562-4ff0-8870-5ace290db2c9,2020-02-24 06:44:06+03:00,2,454177715d7998d70-7562-4ff0-8870-5ace290db2c9
19819215,cart,4894,1487580008187692007,,0.0,454177715,d7998d70-7562-4ff0-8870-5ace290db2c9,2020-02-24 06:44:06+03:00,2,454177715d7998d70-7562-4ff0-8870-5ace290db2c9
19819216,cart,4901,1487580008162526182,,0.0,454177715,d7998d70-7562-4ff0-8870-5ace290db2c9,2020-02-24 06:44:06+03:00,2,454177715d7998d70-7562-4ff0-8870-5ace290db2c9
19819217,cart,4931,1487580008162526182,,0.0,454177715,d7998d70-7562-4ff0-8870-5ace290db2c9,2020-02-24 06:44:06+03:00,2,454177715d7998d70-7562-4ff0-8870-5ace290db2c9
...,...,...,...,...,...,...,...,...,...,...
19822040,remove_from_cart,5851621,1487580007483048900,,0.0,454177715,d7998d70-7562-4ff0-8870-5ace290db2c9,2020-02-24 06:44:36+03:00,2,454177715d7998d70-7562-4ff0-8870-5ace290db2c9
19822041,remove_from_cart,5860303,1487580007004898224,,0.0,454177715,d7998d70-7562-4ff0-8870-5ace290db2c9,2020-02-24 06:44:36+03:00,2,454177715d7998d70-7562-4ff0-8870-5ace290db2c9
19822042,remove_from_cart,5860306,1487580007004898224,,0.0,454177715,d7998d70-7562-4ff0-8870-5ace290db2c9,2020-02-24 06:44:36+03:00,2,454177715d7998d70-7562-4ff0-8870-5ace290db2c9
19822043,remove_from_cart,6906,1487580007634043851,,0.0,454177715,d7998d70-7562-4ff0-8870-5ace290db2c9,2020-02-24 06:44:36+03:00,2,454177715d7998d70-7562-4ff0-8870-5ace290db2c9


In [92]:
# 2383 / 상위 3번째
df_zero[df_zero['user_session'] == '6b408b88-8a2e-4e84-a1ea-2b5e23deaba7']

Unnamed: 0,event_type,product_id,category_id,brand,price,user_id,user_session,event_time_moscow,event_month,user_key
17441569,cart,5852677,1648815651034235876,,0.0,497593427,6b408b88-8a2e-4e84-a1ea-2b5e23deaba7,2020-02-06 21:10:37+03:00,2,4975934276b408b88-8a2e-4e84-a1ea-2b5e23deaba7
17441570,cart,5852678,1648815651034235876,,0.0,497593427,6b408b88-8a2e-4e84-a1ea-2b5e23deaba7,2020-02-06 21:10:37+03:00,2,4975934276b408b88-8a2e-4e84-a1ea-2b5e23deaba7
17441571,cart,5857862,1487580011627020412,,0.0,497593427,6b408b88-8a2e-4e84-a1ea-2b5e23deaba7,2020-02-06 21:10:37+03:00,2,4975934276b408b88-8a2e-4e84-a1ea-2b5e23deaba7
17441572,cart,5857863,1487580011627020412,,0.0,497593427,6b408b88-8a2e-4e84-a1ea-2b5e23deaba7,2020-02-06 21:10:37+03:00,2,4975934276b408b88-8a2e-4e84-a1ea-2b5e23deaba7
17441573,cart,5858219,1487580005511725929,,0.0,497593427,6b408b88-8a2e-4e84-a1ea-2b5e23deaba7,2020-02-06 21:10:37+03:00,2,4975934276b408b88-8a2e-4e84-a1ea-2b5e23deaba7
17441574,cart,5858220,1487580005511725929,,0.0,497593427,6b408b88-8a2e-4e84-a1ea-2b5e23deaba7,2020-02-06 21:10:37+03:00,2,4975934276b408b88-8a2e-4e84-a1ea-2b5e23deaba7
17441575,cart,5858221,1487580005511725929,,0.0,497593427,6b408b88-8a2e-4e84-a1ea-2b5e23deaba7,2020-02-06 21:10:37+03:00,2,4975934276b408b88-8a2e-4e84-a1ea-2b5e23deaba7
17441576,cart,5858223,1487580005511725929,,0.0,497593427,6b408b88-8a2e-4e84-a1ea-2b5e23deaba7,2020-02-06 21:10:37+03:00,2,4975934276b408b88-8a2e-4e84-a1ea-2b5e23deaba7
17441577,cart,5858224,1487580005511725929,,0.0,497593427,6b408b88-8a2e-4e84-a1ea-2b5e23deaba7,2020-02-06 21:10:37+03:00,2,4975934276b408b88-8a2e-4e84-a1ea-2b5e23deaba7
17441578,cart,5858225,1487580005511725929,,0.0,497593427,6b408b88-8a2e-4e84-a1ea-2b5e23deaba7,2020-02-06 21:10:37+03:00,2,4975934276b408b88-8a2e-4e84-a1ea-2b5e23deaba7


In [95]:
pd.reset_option('display.max_rows')
# price가 0인 user_session들의 분포 파악
zero_user_session_count[
    (zero_user_session_count >= 16) & (zero_user_session_count <= 17)
]

# 17 이상은 같은 시간대에 여러 개
# 16 이하는 시간대가 다른 경우가 다수

user_session
28161d84-155d-7180-2bd0-6e89bd4fdd11    17
2aa506ae-41a7-4d1f-b28e-67b470661065    17
fa6c9877-e34d-4176-a3b4-9c6fbb8dde2d    17
dcf18d11-6eb9-44c9-9534-4cadb59947a5    17
5ffc0af2-179a-4011-9f2c-0daa91b2f2a9    17
                                        ..
caefe03f-3650-4b1b-bfab-9d6c409e9b4d    16
5566efcd-2ec3-42b2-8776-e940e0357299    16
67a6fd3c-8b85-407a-bf6f-cb85bc9d3fca    16
4de9821e-afd8-4bbb-b002-be718cc69e70    16
f9279e13-e96e-4efb-8f8b-913322bcf651    16
Name: count, Length: 90, dtype: int64

In [79]:
# 16 가장 아래 user
session = 'f9279e13-e96e-4efb-8f8b-913322bcf651'
df_zero[df_zero['user_session'] == session]

Unnamed: 0,event_type,product_id,category_id,brand,price,user_id,user_session,event_time_moscow,event_month,user_key
11724708,cart,5911179,1597770225539875791,,0.0,478554505,f9279e13-e96e-4efb-8f8b-913322bcf651,2019-12-24 23:14:01+03:00,12,478554505f9279e13-e96e-4efb-8f8b-913322bcf651
11724937,cart,5910604,1783999072332415142,,0.0,478554505,f9279e13-e96e-4efb-8f8b-913322bcf651,2019-12-24 23:15:33+03:00,12,478554505f9279e13-e96e-4efb-8f8b-913322bcf651
11725052,cart,5910286,1783999072332415142,,0.0,478554505,f9279e13-e96e-4efb-8f8b-913322bcf651,2019-12-24 23:16:26+03:00,12,478554505f9279e13-e96e-4efb-8f8b-913322bcf651
11725181,cart,5909364,1487580011652186237,,0.0,478554505,f9279e13-e96e-4efb-8f8b-913322bcf651,2019-12-24 23:17:25+03:00,12,478554505f9279e13-e96e-4efb-8f8b-913322bcf651
11725249,view,5909354,1783999072332415142,,0.0,478554505,f9279e13-e96e-4efb-8f8b-913322bcf651,2019-12-24 23:17:51+03:00,12,478554505f9279e13-e96e-4efb-8f8b-913322bcf651
11725704,view,5910604,1783999072332415142,,0.0,478554505,f9279e13-e96e-4efb-8f8b-913322bcf651,2019-12-24 23:21:30+03:00,12,478554505f9279e13-e96e-4efb-8f8b-913322bcf651
11725775,view,5910286,1783999072332415142,,0.0,478554505,f9279e13-e96e-4efb-8f8b-913322bcf651,2019-12-24 23:22:04+03:00,12,478554505f9279e13-e96e-4efb-8f8b-913322bcf651
11725878,remove_from_cart,5910286,1783999072332415142,,0.0,478554505,f9279e13-e96e-4efb-8f8b-913322bcf651,2019-12-24 23:22:53+03:00,12,478554505f9279e13-e96e-4efb-8f8b-913322bcf651
11725885,remove_from_cart,5911179,1597770225539875791,,0.0,478554505,f9279e13-e96e-4efb-8f8b-913322bcf651,2019-12-24 23:22:55+03:00,12,478554505f9279e13-e96e-4efb-8f8b-913322bcf651
11725889,remove_from_cart,5910604,1783999072332415142,,0.0,478554505,f9279e13-e96e-4efb-8f8b-913322bcf651,2019-12-24 23:23:00+03:00,12,478554505f9279e13-e96e-4efb-8f8b-913322bcf651


In [89]:
# 17 가장 위 유저
session = '28161d84-155d-7180-2bd0-6e89bd4fdd11'
df_zero[df_zero['user_session'] == session]

Unnamed: 0,event_type,product_id,category_id,brand,price,user_id,user_session,event_time_moscow,event_month,user_key
17949790,cart,38745,1487580006317032337,,0.0,467753044,28161d84-155d-7180-2bd0-6e89bd4fdd11,2020-02-10 14:03:09+03:00,2,46775304428161d84-155d-7180-2bd0-6e89bd4fdd11
17949792,cart,49674,1487580006317032337,,0.0,467753044,28161d84-155d-7180-2bd0-6e89bd4fdd11,2020-02-10 14:03:09+03:00,2,46775304428161d84-155d-7180-2bd0-6e89bd4fdd11
17949793,cart,5587760,1487580006317032337,,0.0,467753044,28161d84-155d-7180-2bd0-6e89bd4fdd11,2020-02-10 14:03:09+03:00,2,46775304428161d84-155d-7180-2bd0-6e89bd4fdd11
17949794,cart,5773607,1487580006317032337,,0.0,467753044,28161d84-155d-7180-2bd0-6e89bd4fdd11,2020-02-10 14:03:09+03:00,2,46775304428161d84-155d-7180-2bd0-6e89bd4fdd11
17949795,cart,5773611,1487580006317032337,,0.0,467753044,28161d84-155d-7180-2bd0-6e89bd4fdd11,2020-02-10 14:03:09+03:00,2,46775304428161d84-155d-7180-2bd0-6e89bd4fdd11
17949796,cart,5810145,1487580006317032337,,0.0,467753044,28161d84-155d-7180-2bd0-6e89bd4fdd11,2020-02-10 14:03:09+03:00,2,46775304428161d84-155d-7180-2bd0-6e89bd4fdd11
17949797,cart,5815653,1487580006317032337,,0.0,467753044,28161d84-155d-7180-2bd0-6e89bd4fdd11,2020-02-10 14:03:09+03:00,2,46775304428161d84-155d-7180-2bd0-6e89bd4fdd11
17949798,cart,5815662,1487580006317032337,,0.0,467753044,28161d84-155d-7180-2bd0-6e89bd4fdd11,2020-02-10 14:03:09+03:00,2,46775304428161d84-155d-7180-2bd0-6e89bd4fdd11
17949799,cart,5815666,1487580006317032337,,0.0,467753044,28161d84-155d-7180-2bd0-6e89bd4fdd11,2020-02-10 14:03:09+03:00,2,46775304428161d84-155d-7180-2bd0-6e89bd4fdd11
17949800,cart,5820773,1487580006317032337,,0.0,467753044,28161d84-155d-7180-2bd0-6e89bd4fdd11,2020-02-10 14:03:09+03:00,2,46775304428161d84-155d-7180-2bd0-6e89bd4fdd11


### 파이프라인
- 가설
    - view는 같은 시간에 한 번에 여러 개를 할 수 없다.
    - cart, remove는 한 번에 여러 개를 할 수 있다.
    - 정상의 기준은 1분에 60개 (1초당 1개)로 설정


2. view만 있는 테이블 생성
3. view 제외한 테이블 생성
    - event_type, event_time 기준으로 drop.duplicate
4. 두 테이블 다시 병합
5. user_section이 1뿐인 경우
    - 나중에 max-min 계산할 때, 0이 되어버림
    - 제외 후, 나중에 
    5-1. 그 외에 max-min 계산시 0인 경우도 제거(여러 event_type이 한 번에 일어난 경우)
6. 같은 user_session의 (최종 시간) - (최초 시간) / 분을 했을 때, 1분에 60개 초과면 이상치로 보고 제거
    - 6-1. 남은 것들 중 (최종 시간) - (최초 시간) 이 며칠 단위인 경우 이상일 수 있으니 확인

In [132]:
df_zero

Unnamed: 0,event_type,product_id,category_id,brand,price,user_id,user_session,event_time_moscow,event_month,user_key
2454,view,5892084,1597770225539875791,,0.0,555458569,c048be2f-547d-4add-9bc6-09aa1e7ae9de,2019-10-01 05:05:17+03:00,10,555458569c048be2f-547d-4add-9bc6-09aa1e7ae9de
2614,view,5892052,1487580010377117763,,0.0,555455025,320f6021-30ac-4a58-ae17-bac1cc32aac3,2019-10-01 05:15:41+03:00,10,555455025320f6021-30ac-4a58-ae17-bac1cc32aac3
5686,view,5873432,2007399943458784057,,0.0,500054739,701220b4-45b4-4028-bcc3-5d77480253ff,2019-10-01 07:25:00+03:00,10,500054739701220b4-45b4-4028-bcc3-5d77480253ff
6086,view,5882605,1487580013522845895,,0.0,523154930,caaf1f68-7b97-4d27-9c3d-8b28c2920ffd,2019-10-01 07:33:15+03:00,10,523154930caaf1f68-7b97-4d27-9c3d-8b28c2920ffd
8344,view,5889621,1487580010561667147,,0.0,523988665,00849bd2-fcd2-4cb4-af31-4e264f151848,2019-10-01 08:16:30+03:00,10,52398866500849bd2-fcd2-4cb4-af31-4e264f151848
...,...,...,...,...,...,...,...,...,...,...
20671632,view,5932207,1487580005092295511,,0.0,326342054,85aa647f-6062-909a-806b-3e4e493e8fb1,2020-02-29 23:22:46+03:00,2,32634205485aa647f-6062-909a-806b-3e4e493e8fb1
20672023,view,5923106,1487580008246412266,,0.0,622047714,74f04dc6-2b3c-4565-beda-f575d73ed81c,2020-02-29 23:26:16+03:00,2,62204771474f04dc6-2b3c-4565-beda-f575d73ed81c
20673702,view,5932595,1487580013950664926,,0.0,586098065,97e4d28c-5954-4742-b9a9-8de24dc432f3,2020-02-29 23:40:38+03:00,2,58609806597e4d28c-5954-4742-b9a9-8de24dc432f3
20674535,view,5932595,1487580013950664926,,0.0,539621479,a03c8167-4528-4ef1-9b45-22731ce1c754,2020-02-29 23:47:07+03:00,2,539621479a03c8167-4528-4ef1-9b45-22731ce1c754


In [150]:
df_view = df_zero[df_zero['event_type'] == 'view'].copy()

In [151]:
df_non_view = df_zero[df_zero['event_type'] != 'view'].copy()
df_non_view = df_non_view.drop_duplicates(subset=['event_type', 'event_time_moscow'])
df_non_view

Unnamed: 0,event_type,product_id,category_id,brand,price,user_id,user_session,event_time_moscow,event_month,user_key
10358,cart,5892179,1487580013950664926,,0.0,465338762,36a90d24-5901-4424-adf0-fd84e0c53f34,2019-10-01 08:41:09+03:00,10,46533876236a90d24-5901-4424-adf0-fd84e0c53f34
47248,cart,5891962,1487580009311764506,,0.0,520610701,7b895931-ebea-43fb-aa99-c7c02059f4c4,2019-10-01 13:51:13+03:00,10,5206107017b895931-ebea-43fb-aa99-c7c02059f4c4
47506,cart,5891962,1487580009311764506,,0.0,520610701,7b895931-ebea-43fb-aa99-c7c02059f4c4,2019-10-01 13:52:58+03:00,10,5206107017b895931-ebea-43fb-aa99-c7c02059f4c4
49995,cart,5891927,1487580007675986893,,0.0,378981686,4bcb3d40-1751-4059-91c4-b421cd9ca011,2019-10-01 14:04:22+03:00,10,3789816864bcb3d40-1751-4059-91c4-b421cd9ca011
51270,cart,5891933,1487580008313521133,,0.0,469109024,80a03dab-aeae-4660-b8fa-48649872af06,2019-10-01 14:12:05+03:00,10,46910902480a03dab-aeae-4660-b8fa-48649872af06
...,...,...,...,...,...,...,...,...,...,...
20638315,cart,5724575,1487580005427839846,,0.0,226572682,deaaca4b-63c9-41c5-ba54-b7a41b4193c5,2020-02-29 19:50:08+03:00,2,226572682deaaca4b-63c9-41c5-ba54-b7a41b4193c5
20638329,remove_from_cart,5812120,1761186209054327497,,0.0,226572682,deaaca4b-63c9-41c5-ba54-b7a41b4193c5,2020-02-29 19:50:08+03:00,2,226572682deaaca4b-63c9-41c5-ba54-b7a41b4193c5
20660750,cart,5687139,1487580008145748965,,0.0,360192262,9eaf0f62-0e7f-42f6-9b22-88218577c3ee,2020-02-29 22:13:04+03:00,2,3601922629eaf0f62-0e7f-42f6-9b22-88218577c3ee
20660763,remove_from_cart,5809222,1487580007675986893,,0.0,360192262,9eaf0f62-0e7f-42f6-9b22-88218577c3ee,2020-02-29 22:13:04+03:00,2,3601922629eaf0f62-0e7f-42f6-9b22-88218577c3ee


In [152]:
df_new_zero = pd.concat([df_view, df_non_view], ignore_index=False)

In [167]:
# user_session이 1개뿐인 경우 파이프라인에서 제외
session_counts = df_new_zero['user_session'].value_counts()
single_sessions = session_counts[session_counts == 1].index

# user_section : 1인 df
df_single_sessions = df_new_zero[df_new_zero['user_session'].isin(single_sessions)].copy()

# user_section : 1이 제외된 df
df_new_zero = df_new_zero[~df_new_zero['user_session'].isin(single_sessions)].copy()

In [166]:
single_sessions

Index(['8b2bf9d8-43f0-43b2-bed3-13b2c956cada',
       '8d9aa6ed-53ca-4dcc-aa05-9c93de5fbbf8',
       '57476289-7b4e-4197-b155-268b931e180a',
       'eb3d4562-c590-488e-bb37-90bbfcb041a0',
       '5485c402-49b2-4c97-98e5-8e5ff2f29d88',
       'dd77ab70-4969-4548-838e-d4a7f80a70e9',
       '1fb93384-f074-49b3-a3ad-f7d33b05907d',
       '938de093-cfc0-47ad-ac61-de6755e1dc64',
       '7d64749a-a6a3-488b-8def-d360fa2bb6ae',
       '5e51a2e8-a82e-4622-ba16-bedf1cffcec6',
       ...
       '5272655c-e49e-4415-a08c-31753125ba68',
       '7c4f6972-ead8-46f1-85d4-e7d7028f06c3',
       '701220b4-45b4-4028-bcc3-5d77480253ff',
       '320f6021-30ac-4a58-ae17-bac1cc32aac3',
       'c048be2f-547d-4add-9bc6-09aa1e7ae9de',
       '8bf369b4-92c0-4fb8-88a5-8a2dd0947e46',
       'cc622d89-5915-482b-b603-aa5e7f7d28c0',
       '5c033304-ca82-4259-b17e-784151bc1480',
       '200a74a9-b790-4d91-a1d0-2b8fbb1f2b27',
       'fa184a6c-7f37-4d1f-bb6b-99aa4aa7307b'],
      dtype='object', name='user_session', lengt

In [154]:
df_new_zero['event_time_moscow'] = pd.to_datetime(df_new_zero['event_time_moscow'])

In [155]:
# max, min의 시간차 (분 단위)
session_time = df_new_zero.groupby('user_session')['event_time_moscow'].agg(['min', 'max'])
session_time['duration_min'] = (session_time['max'] - session_time['min']).dt.total_seconds() / 60

# user_session 개수
session_event_count = df_new_zero.groupby('user_session').size()

In [None]:
event_rate = session_event_count[valid_sessions] / session_time.loc[valid_sessions, 'duration_min']

In [None]:
abnormal_sessions = event_rate[event_rate > 60].index
df_new_zero = df_new_zero[~df_new_zero['user_session'].isin(abnormal_sessions)].copy()

---

In [156]:
zero_duration_sessions = session_time[session_time['duration_min'] == 0]
zero_duration_sessions

Unnamed: 0_level_0,min,max,duration_min
user_session,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
00a03cf5-b9bb-4d53-94ed-ac04ac8cd8de,2020-02-04 07:26:46+03:00,2020-02-04 07:26:46+03:00,0.0
01c3f29d-82e5-4439-a682-7b1cc9778861,2020-02-06 15:32:13+03:00,2020-02-06 15:32:13+03:00,0.0
0242507c-4ce3-4b11-ad90-38681ec04cd4,2020-02-01 10:50:55+03:00,2020-02-01 10:50:55+03:00,0.0
02f7d775-be99-442a-bce7-827e5797d3a3,2020-02-26 21:03:43+03:00,2020-02-26 21:03:43+03:00,0.0
0385bfed-fbc2-4f80-93fa-6718c61f5192,2020-02-09 13:57:24+03:00,2020-02-09 13:57:24+03:00,0.0
...,...,...,...
fda5d2e7-541d-44e4-ac72-d10dada5135a,2020-02-25 15:24:58+03:00,2020-02-25 15:24:58+03:00,0.0
fe6c7b53-b911-4c01-9748-ad7d5d2d1d74,2020-02-10 17:49:51+03:00,2020-02-10 17:49:51+03:00,0.0
fe8a0d4c-0d01-49b6-b03c-0ebfce63605e,2020-02-12 03:33:06+03:00,2020-02-12 03:33:06+03:00,0.0
ff0b05e0-b444-441f-b509-7bebe49fa9ff,2020-02-03 09:54:21+03:00,2020-02-03 09:54:21+03:00,0.0


In [165]:
# 1초 동안 cart - remove from cart 다수 처리
# 어렵다고 판단되어 제거
session = '00a03cf5-b9bb-4d53-94ed-ac04ac8cd8de'
df_zero[df_zero['user_session'] == session]

Unnamed: 0,event_type,product_id,category_id,brand,price,user_id,user_session,event_time_moscow,event_month,user_key
17018596,cart,5639734,1487580005595612013,,0.0,350828700,00a03cf5-b9bb-4d53-94ed-ac04ac8cd8de,2020-02-04 07:26:46+03:00,2,35082870000a03cf5-b9bb-4d53-94ed-ac04ac8cd8de
17018597,cart,5649238,1487580005511725929,,0.0,350828700,00a03cf5-b9bb-4d53-94ed-ac04ac8cd8de,2020-02-04 07:26:46+03:00,2,35082870000a03cf5-b9bb-4d53-94ed-ac04ac8cd8de
17018598,cart,5686081,1487580008145748965,,0.0,350828700,00a03cf5-b9bb-4d53-94ed-ac04ac8cd8de,2020-02-04 07:26:46+03:00,2,35082870000a03cf5-b9bb-4d53-94ed-ac04ac8cd8de
17018599,cart,5698325,1487580011752849537,,0.0,350828700,00a03cf5-b9bb-4d53-94ed-ac04ac8cd8de,2020-02-04 07:26:46+03:00,2,35082870000a03cf5-b9bb-4d53-94ed-ac04ac8cd8de
17018600,cart,5700955,1487580005092295511,,0.0,350828700,00a03cf5-b9bb-4d53-94ed-ac04ac8cd8de,2020-02-04 07:26:46+03:00,2,35082870000a03cf5-b9bb-4d53-94ed-ac04ac8cd8de
17018601,cart,5700956,1487580005268456287,,0.0,350828700,00a03cf5-b9bb-4d53-94ed-ac04ac8cd8de,2020-02-04 07:26:46+03:00,2,35082870000a03cf5-b9bb-4d53-94ed-ac04ac8cd8de
17018602,cart,5760349,1487580009261432856,,0.0,350828700,00a03cf5-b9bb-4d53-94ed-ac04ac8cd8de,2020-02-04 07:26:46+03:00,2,35082870000a03cf5-b9bb-4d53-94ed-ac04ac8cd8de
17018603,cart,5784085,1487580008145748965,,0.0,350828700,00a03cf5-b9bb-4d53-94ed-ac04ac8cd8de,2020-02-04 07:26:46+03:00,2,35082870000a03cf5-b9bb-4d53-94ed-ac04ac8cd8de
17018604,cart,5804433,1487580009286598681,,0.0,350828700,00a03cf5-b9bb-4d53-94ed-ac04ac8cd8de,2020-02-04 07:26:46+03:00,2,35082870000a03cf5-b9bb-4d53-94ed-ac04ac8cd8de
17018605,cart,5822368,1487580005595612013,,0.0,350828700,00a03cf5-b9bb-4d53-94ed-ac04ac8cd8de,2020-02-04 07:26:46+03:00,2,35082870000a03cf5-b9bb-4d53-94ed-ac04ac8cd8de


---