In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc

# 그래프 기본 테마 설정
# https://coldbrown.co.kr/2023/07/%ED%8C%8C%EC%9D%B4%EC%8D%AC-%EC%8B%A4%EC%A0%84%ED%8E%B8-08-seaborn-sns-set%EC%9D%84-%ED%86%B5%ED%95%B4-%EC%8A%A4%ED%83%80%EC%9D%BC-%EC%84%A4%EC%A0%95%ED%95%98%EA%B8%B0/
sns.set()

# 그래프 기본 설정
plt.rcParams['font.family'] = 'Malgun Gothic'
# plt.rcParams['font.family'] = 'AppleGothic'
plt.rcParams['figure.figsize'] = 12, 6
plt.rcParams['font.size'] = 14
plt.rcParams['axes.unicode_minus'] = False


# 복잡한 통계 처리를 위한 라이브러리
from scipy import stats

In [2]:
df = pd.read_csv('data/merged_data.csv')

In [3]:
# 메모리 감소를 위한 event_time drop
df = df.drop(columns=['event_time'])

## price가 0인 data 탐색

In [8]:
df_zero = df[df['price'] == 0]

In [9]:
# purchase인 경우
df_zero[df_zero['event_type'] == 'purchase']

Unnamed: 0,event_type,product_id,category_id,brand,price,user_id,user_session,event_time_moscow,event_month,user_key
13767115,purchase,5911801,1487580005671109489,,0.0,562735465,3b8c48f2-b4bd-400e-a786-eace723dbffa,2020-01-13 13:24:06+03:00,1,5627354653b8c48f2-b4bd-400e-a786-eace723dbffa


- 가격이 0인 경우의 purchase는 1건뿐

In [11]:
# 가격이 0인 product별로 고유값 계산
df_zero['product_id'].nunique

# 고유값과 고유값별 개수 구하기
np.unique(df_zero['product_id'], return_counts=True)
products, counts = np.unique(df_zero['product_id'], return_counts=True)

# 딕셔너리 형태로
product_count_dict = dict(zip(products, counts))
product_count_dict

{np.int64(3763): np.int64(3),
 np.int64(3774): np.int64(6),
 np.int64(3776): np.int64(1),
 np.int64(3806): np.int64(4),
 np.int64(3865): np.int64(7),
 np.int64(3936): np.int64(7),
 np.int64(3945): np.int64(10),
 np.int64(3959): np.int64(7),
 np.int64(3978): np.int64(1),
 np.int64(4092): np.int64(6),
 np.int64(4102): np.int64(3),
 np.int64(4104): np.int64(1),
 np.int64(4131): np.int64(5),
 np.int64(4184): np.int64(1),
 np.int64(4185): np.int64(1),
 np.int64(4203): np.int64(5),
 np.int64(4210): np.int64(1),
 np.int64(4229): np.int64(1),
 np.int64(4246): np.int64(5),
 np.int64(4476): np.int64(2),
 np.int64(4497): np.int64(9),
 np.int64(4540): np.int64(6),
 np.int64(4542): np.int64(4),
 np.int64(4552): np.int64(3),
 np.int64(4554): np.int64(12),
 np.int64(4560): np.int64(8),
 np.int64(4569): np.int64(5),
 np.int64(4571): np.int64(14),
 np.int64(4572): np.int64(11),
 np.int64(4586): np.int64(4),
 np.int64(4587): np.int64(5),
 np.int64(4591): np.int64(15),
 np.int64(4594): np.int64(3),
 np.i

- product_id와 price가 0인 경우는 관련성이 높지는 않아 보임

In [12]:
# 등장 횟수 기준으로 내림차순 정렬된 리스트 (튜플 형태)
sorted_products = sorted(product_count_dict.items(), key=lambda x: x[1], reverse=True)

# 상위 10개만 보기
top_10_products = sorted_products[:10]
top_10_products

[(np.int64(5907812), np.int64(205)),
 (np.int64(5896186), np.int64(177)),
 (np.int64(5891052), np.int64(174)),
 (np.int64(5903628), np.int64(149)),
 (np.int64(5903915), np.int64(133)),
 (np.int64(5924418), np.int64(129)),
 (np.int64(5891053), np.int64(121)),
 (np.int64(5904031), np.int64(116)),
 (np.int64(5773605), np.int64(90)),
 (np.int64(5896187), np.int64(89))]

- 자주 나오는 product_id 상위 10개임 → 조사 필요

In [13]:
# user_session별 등장 횟수 확인
df_zero['user_session'].value_counts()

user_session
74206ba1-c863-4760-bbdc-b11777059e46    5411
d7998d70-7562-4ff0-8870-5ace290db2c9    2816
6b408b88-8a2e-4e84-a1ea-2b5e23deaba7    2383
15047a59-66a9-4f06-8c5a-bc86688eaf41    1407
39d88d60-43c1-47a5-9d2f-90d08be62044    1203
                                        ... 
08738b86-4642-4dbe-8da1-7ad9ee2bd76f       1
b7efa5a7-e422-4081-afaf-a6d4f01f97fb       1
825a19d7-9905-49d4-9a1a-3f651efd1aa1       1
b7087089-41f5-48bc-b7eb-52fea86fa22c       1
00849bd2-fcd2-4cb4-af31-4e264f151848       1
Name: count, Length: 26255, dtype: int64

- 5411의 이상한 기록 존재
- 반면 1인 경우도 있어, 전부 이상기록으로 보기엔 애매

In [15]:
# 5411건의 user_session을 추적해 보기
df_5411 = df[df['user_session'] == '74206ba1-c863-4760-bbdc-b11777059e46']
df_5411

Unnamed: 0,event_type,product_id,category_id,brand,price,user_id,user_session,event_time_moscow,event_month,user_key
18698675,cart,24755,1487580007759872977,,0.0,445777038,74206ba1-c863-4760-bbdc-b11777059e46,2020-02-15 19:08:37+03:00,2,44577703874206ba1-c863-4760-bbdc-b11777059e46
18698676,cart,5710986,1487580010821714008,,0.0,445777038,74206ba1-c863-4760-bbdc-b11777059e46,2020-02-15 19:08:37+03:00,2,44577703874206ba1-c863-4760-bbdc-b11777059e46
18698677,cart,24755,1487580007759872977,,0.0,445777038,74206ba1-c863-4760-bbdc-b11777059e46,2020-02-15 19:08:37+03:00,2,44577703874206ba1-c863-4760-bbdc-b11777059e46
18698678,cart,5711049,1487580010872045658,,0.0,445777038,74206ba1-c863-4760-bbdc-b11777059e46,2020-02-15 19:08:37+03:00,2,44577703874206ba1-c863-4760-bbdc-b11777059e46
18698679,cart,24755,1487580007759872977,,0.0,445777038,74206ba1-c863-4760-bbdc-b11777059e46,2020-02-15 19:08:37+03:00,2,44577703874206ba1-c863-4760-bbdc-b11777059e46
...,...,...,...,...,...,...,...,...,...,...
18704116,remove_from_cart,5799929,1658462125284131265,,0.0,445777038,74206ba1-c863-4760-bbdc-b11777059e46,2020-02-15 19:08:57+03:00,2,44577703874206ba1-c863-4760-bbdc-b11777059e46
18704117,remove_from_cart,5800002,1658462125284131265,,0.0,445777038,74206ba1-c863-4760-bbdc-b11777059e46,2020-02-15 19:08:57+03:00,2,44577703874206ba1-c863-4760-bbdc-b11777059e46
18704118,remove_from_cart,5800002,1658462125284131265,,0.0,445777038,74206ba1-c863-4760-bbdc-b11777059e46,2020-02-15 19:08:57+03:00,2,44577703874206ba1-c863-4760-bbdc-b11777059e46
18704119,remove_from_cart,5802406,1487580004857414477,,0.0,445777038,74206ba1-c863-4760-bbdc-b11777059e46,2020-02-15 19:08:57+03:00,2,44577703874206ba1-c863-4760-bbdc-b11777059e46


In [16]:
df_5411['event_type'].value_counts()

event_type
cart                3749
remove_from_cart    1662
Name: count, dtype: int64

In [17]:
df_5411['product_id'].value_counts()

product_id
5798000    25
5772624    24
5800002    22
5712572    22
5797994    22
           ..
5779252     5
5779254     5
5779178     3
5712568     1
5751383     1
Name: count, Length: 521, dtype: int64

In [18]:
df_5411['brand'].value_counts(dropna=False)

brand
NaN    5411
Name: count, dtype: int64

- 한 유저가 19:08:37부터 19:08:57까지 약 20초 동안 cart와 remove_from_cart 반복
- 전부 brand = NaN

In [19]:
# price 0인 건이 5411인 user_id가 price가 0이 아닌 다른 물건도 구매했는지 확인
df_445777038 = df[(df['user_id'] == 445777038) & (df['price'] != 0)]
df_445777038

Unnamed: 0,event_type,product_id,category_id,brand,price,user_id,user_session,event_time_moscow,event_month,user_key
15458167,remove_from_cart,5867954,1487580009445982239,,1.51,445777038,3fde3b34-e72e-4bd6-b3c9-3c95cbe85ecf,2020-01-25 00:16:13+03:00,1,4457770383fde3b34-e72e-4bd6-b3c9-3c95cbe85ecf
15458171,remove_from_cart,5790689,1487580009445982239,nitrimax,3.97,445777038,3fde3b34-e72e-4bd6-b3c9-3c95cbe85ecf,2020-01-25 00:16:16+03:00,1,4457770383fde3b34-e72e-4bd6-b3c9-3c95cbe85ecf
15458180,remove_from_cart,5822287,1487580005595612013,,3.00,445777038,3fde3b34-e72e-4bd6-b3c9-3c95cbe85ecf,2020-01-25 00:16:25+03:00,1,4457770383fde3b34-e72e-4bd6-b3c9-3c95cbe85ecf
15458181,remove_from_cart,5822291,1487580005595612013,,3.00,445777038,3fde3b34-e72e-4bd6-b3c9-3c95cbe85ecf,2020-01-25 00:16:26+03:00,1,4457770383fde3b34-e72e-4bd6-b3c9-3c95cbe85ecf
15458182,remove_from_cart,5822306,1487580005595612013,,3.00,445777038,3fde3b34-e72e-4bd6-b3c9-3c95cbe85ecf,2020-01-25 00:16:26+03:00,1,4457770383fde3b34-e72e-4bd6-b3c9-3c95cbe85ecf
...,...,...,...,...,...,...,...,...,...,...
15460197,remove_from_cart,5811818,1487580005671109489,masura,5.54,445777038,3fde3b34-e72e-4bd6-b3c9-3c95cbe85ecf,2020-01-25 00:42:11+03:00,1,4457770383fde3b34-e72e-4bd6-b3c9-3c95cbe85ecf
15460200,remove_from_cart,5813476,1487580005671109489,masura,3.95,445777038,3fde3b34-e72e-4bd6-b3c9-3c95cbe85ecf,2020-01-25 00:42:15+03:00,1,4457770383fde3b34-e72e-4bd6-b3c9-3c95cbe85ecf
15460201,remove_from_cart,5797969,1487580005671109489,masura,1.73,445777038,3fde3b34-e72e-4bd6-b3c9-3c95cbe85ecf,2020-01-25 00:42:16+03:00,1,4457770383fde3b34-e72e-4bd6-b3c9-3c95cbe85ecf
15460202,remove_from_cart,5839691,1487580005671109489,masura,3.95,445777038,3fde3b34-e72e-4bd6-b3c9-3c95cbe85ecf,2020-01-25 00:42:16+03:00,1,4457770383fde3b34-e72e-4bd6-b3c9-3c95cbe85ecf


In [20]:
df_445777038['event_type'].value_counts()

event_type
cart                101
remove_from_cart     50
view                  4
Name: count, dtype: int64

- 같은 유저가 price가 0이 아닌 다른 물건도 구매
- brand는 다양하며, 대부분 같은 날짜에 구매

### 같은 product_id면 같은 price인가?

In [21]:
# price가 0이 아닌 df 생성
df_nonzero = df[df['price'] != 0]

In [22]:
# product_id 와 price만 보기
df_product_price = df_nonzero[['product_id','price']]

In [23]:
# 중복 제거 후 product_id 기준으로 정렬
df_product_price.drop_duplicates(inplace=True)
df_product_price.sort_values('product_id')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_product_price.drop_duplicates(inplace=True)


Unnamed: 0,product_id,price
59024,3752,15.71
4826,3762,19.37
6901312,3762,18.40
40456,3763,16.03
6929263,3763,15.54
...,...,...
20520951,5932581,6.02
20506445,5932582,6.02
20510028,5932583,6.02
20522512,5932584,6.02


In [24]:
df.loc[6929263]

event_type                                                    view
product_id                                                    3763
category_id                                    1487580005411062629
brand                                                          cnd
price                                                        15.54
user_id                                                  540274846
user_session                  097889b8-0e7a-4970-8741-36317d9dbf42
event_time_moscow                        2019-11-21 15:50:40+03:00
event_month                                                     11
user_key             540274846097889b8-0e7a-4970-8741-36317d9dbf42
Name: 6929263, dtype: object

In [25]:
df.loc[40456]

event_type                                                    view
product_id                                                    3763
category_id                                    1487580005411062629
brand                                                          cnd
price                                                        16.03
user_id                                                  551319165
user_session                  4cfee1d6-8a6f-4564-87b8-dc7277027cac
event_time_moscow                        2019-10-01 13:04:29+03:00
event_month                                                     10
user_key             5513191654cfee1d6-8a6f-4564-87b8-dc7277027cac
Name: 40456, dtype: object

- 한 달간의 차이가 있는 것으로 보아, 상품의 가격 변동이 있는 것으로 추측

### 어디까지 0을 그냥 둘 것인가

In [26]:
df_zero['user_id'].value_counts()

user_id
419558969    6002
445777038    5411
527021202    4286
454177715    2816
497593427    2485
             ... 
561571154       1
621949668       1
399387076       1
619482248       1
515250501       1
Name: count, Length: 12493, dtype: int64

In [34]:
zero_user_session_count = df_zero['user_session'].value_counts()
zero_user_session_count

user_session
74206ba1-c863-4760-bbdc-b11777059e46    5411
d7998d70-7562-4ff0-8870-5ace290db2c9    2816
6b408b88-8a2e-4e84-a1ea-2b5e23deaba7    2383
15047a59-66a9-4f06-8c5a-bc86688eaf41    1407
39d88d60-43c1-47a5-9d2f-90d08be62044    1203
                                        ... 
08738b86-4642-4dbe-8da1-7ad9ee2bd76f       1
b7efa5a7-e422-4081-afaf-a6d4f01f97fb       1
825a19d7-9905-49d4-9a1a-3f651efd1aa1       1
b7087089-41f5-48bc-b7eb-52fea86fa22c       1
00849bd2-fcd2-4cb4-af31-4e264f151848       1
Name: count, Length: 26255, dtype: int64

In [53]:
df_zero[df_zero['user_session'] == 'd7998d70-7562-4ff0-8870-5ace290db2c9']

Unnamed: 0,event_type,product_id,category_id,brand,price,user_id,user_session,event_time_moscow,event_month,user_key
2481778,view,5896182,1487580009051717646,,0.0,454177715,d7998d70-7562-4ff0-8870-5ace290db2c9,2019-10-19 05:07:27+03:00,10,454177715d7998d70-7562-4ff0-8870-5ace290db2c9
19819214,cart,4874,1487580011157258342,,0.0,454177715,d7998d70-7562-4ff0-8870-5ace290db2c9,2020-02-24 06:44:06+03:00,2,454177715d7998d70-7562-4ff0-8870-5ace290db2c9
19819215,cart,4894,1487580008187692007,,0.0,454177715,d7998d70-7562-4ff0-8870-5ace290db2c9,2020-02-24 06:44:06+03:00,2,454177715d7998d70-7562-4ff0-8870-5ace290db2c9
19819216,cart,4901,1487580008162526182,,0.0,454177715,d7998d70-7562-4ff0-8870-5ace290db2c9,2020-02-24 06:44:06+03:00,2,454177715d7998d70-7562-4ff0-8870-5ace290db2c9
19819217,cart,4931,1487580008162526182,,0.0,454177715,d7998d70-7562-4ff0-8870-5ace290db2c9,2020-02-24 06:44:06+03:00,2,454177715d7998d70-7562-4ff0-8870-5ace290db2c9
...,...,...,...,...,...,...,...,...,...,...
19822040,remove_from_cart,5851621,1487580007483048900,,0.0,454177715,d7998d70-7562-4ff0-8870-5ace290db2c9,2020-02-24 06:44:36+03:00,2,454177715d7998d70-7562-4ff0-8870-5ace290db2c9
19822041,remove_from_cart,5860303,1487580007004898224,,0.0,454177715,d7998d70-7562-4ff0-8870-5ace290db2c9,2020-02-24 06:44:36+03:00,2,454177715d7998d70-7562-4ff0-8870-5ace290db2c9
19822042,remove_from_cart,5860306,1487580007004898224,,0.0,454177715,d7998d70-7562-4ff0-8870-5ace290db2c9,2020-02-24 06:44:36+03:00,2,454177715d7998d70-7562-4ff0-8870-5ace290db2c9
19822043,remove_from_cart,6906,1487580007634043851,,0.0,454177715,d7998d70-7562-4ff0-8870-5ace290db2c9,2020-02-24 06:44:36+03:00,2,454177715d7998d70-7562-4ff0-8870-5ace290db2c9


In [51]:
df_zero[df_zero['user_session'] == '6b408b88-8a2e-4e84-a1ea-2b5e23deaba7']

Unnamed: 0,event_type,product_id,category_id,brand,price,user_id,user_session,event_time_moscow,event_month,user_key
17441569,cart,5852677,1648815651034235876,,0.0,497593427,6b408b88-8a2e-4e84-a1ea-2b5e23deaba7,2020-02-06 21:10:37+03:00,2,4975934276b408b88-8a2e-4e84-a1ea-2b5e23deaba7
17441570,cart,5852678,1648815651034235876,,0.0,497593427,6b408b88-8a2e-4e84-a1ea-2b5e23deaba7,2020-02-06 21:10:37+03:00,2,4975934276b408b88-8a2e-4e84-a1ea-2b5e23deaba7
17441571,cart,5857862,1487580011627020412,,0.0,497593427,6b408b88-8a2e-4e84-a1ea-2b5e23deaba7,2020-02-06 21:10:37+03:00,2,4975934276b408b88-8a2e-4e84-a1ea-2b5e23deaba7
17441572,cart,5857863,1487580011627020412,,0.0,497593427,6b408b88-8a2e-4e84-a1ea-2b5e23deaba7,2020-02-06 21:10:37+03:00,2,4975934276b408b88-8a2e-4e84-a1ea-2b5e23deaba7
17441573,cart,5858219,1487580005511725929,,0.0,497593427,6b408b88-8a2e-4e84-a1ea-2b5e23deaba7,2020-02-06 21:10:37+03:00,2,4975934276b408b88-8a2e-4e84-a1ea-2b5e23deaba7
...,...,...,...,...,...,...,...,...,...,...
17443974,remove_from_cart,5866149,2055161088059638328,,0.0,497593427,6b408b88-8a2e-4e84-a1ea-2b5e23deaba7,2020-02-06 21:10:50+03:00,2,4975934276b408b88-8a2e-4e84-a1ea-2b5e23deaba7
17443975,remove_from_cart,5866163,1487580007675986893,,0.0,497593427,6b408b88-8a2e-4e84-a1ea-2b5e23deaba7,2020-02-06 21:10:50+03:00,2,4975934276b408b88-8a2e-4e84-a1ea-2b5e23deaba7
17443976,remove_from_cart,5866164,1487580007675986893,,0.0,497593427,6b408b88-8a2e-4e84-a1ea-2b5e23deaba7,2020-02-06 21:10:50+03:00,2,4975934276b408b88-8a2e-4e84-a1ea-2b5e23deaba7
17443977,remove_from_cart,5866172,1487580007675986893,,0.0,497593427,6b408b88-8a2e-4e84-a1ea-2b5e23deaba7,2020-02-06 21:10:50+03:00,2,4975934276b408b88-8a2e-4e84-a1ea-2b5e23deaba7


In [43]:
zero_user_session_count

user_session
74206ba1-c863-4760-bbdc-b11777059e46    5411
d7998d70-7562-4ff0-8870-5ace290db2c9    2816
6b408b88-8a2e-4e84-a1ea-2b5e23deaba7    2383
15047a59-66a9-4f06-8c5a-bc86688eaf41    1407
39d88d60-43c1-47a5-9d2f-90d08be62044    1203
                                        ... 
08738b86-4642-4dbe-8da1-7ad9ee2bd76f       1
b7efa5a7-e422-4081-afaf-a6d4f01f97fb       1
825a19d7-9905-49d4-9a1a-3f651efd1aa1       1
b7087089-41f5-48bc-b7eb-52fea86fa22c       1
00849bd2-fcd2-4cb4-af31-4e264f151848       1
Name: count, Length: 26255, dtype: int64

In [58]:
# zero_user_session_count (price가 0인 user_session들의 분포 파악)
zero_user_session_count[zero_user_session_count > 20]

user_session
74206ba1-c863-4760-bbdc-b11777059e46    5411
d7998d70-7562-4ff0-8870-5ace290db2c9    2816
6b408b88-8a2e-4e84-a1ea-2b5e23deaba7    2383
15047a59-66a9-4f06-8c5a-bc86688eaf41    1407
39d88d60-43c1-47a5-9d2f-90d08be62044    1203
                                        ... 
389181e5-ce12-4a71-9568-83bd4e5500bc      21
60dcbf0c-8a23-4784-9127-87d8857bc423      21
add0f83e-e274-4165-9194-5c046c0d398f      21
98cb641e-d45c-49cf-abcc-e9515a06c713      21
81ee648d-fd7f-4f09-a7ae-18feb9fdc372      21
Name: count, Length: 611, dtype: int64

In [57]:
session = 'b526e453-27d4-4045-8844-799258d4d913'
df_zero[df_zero['user_session'] == session]

Unnamed: 0,event_type,product_id,category_id,brand,price,user_id,user_session,event_time_moscow,event_month,user_key
16548452,cart,5552332,1487580007592100809,,0.0,486545211,b526e453-27d4-4045-8844-799258d4d913,2020-02-01 10:28:24+03:00,2,486545211b526e453-27d4-4045-8844-799258d4d913
16548454,cart,5552336,1487580007592100809,,0.0,486545211,b526e453-27d4-4045-8844-799258d4d913,2020-02-01 10:28:24+03:00,2,486545211b526e453-27d4-4045-8844-799258d4d913
16548456,cart,5552346,1487580007592100809,,0.0,486545211,b526e453-27d4-4045-8844-799258d4d913,2020-02-01 10:28:24+03:00,2,486545211b526e453-27d4-4045-8844-799258d4d913
16548457,cart,5662640,1487580010754605141,,0.0,486545211,b526e453-27d4-4045-8844-799258d4d913,2020-02-01 10:28:24+03:00,2,486545211b526e453-27d4-4045-8844-799258d4d913
16548458,cart,5686925,1487580009311764506,,0.0,486545211,b526e453-27d4-4045-8844-799258d4d913,2020-02-01 10:28:24+03:00,2,486545211b526e453-27d4-4045-8844-799258d4d913
16548460,cart,5746635,1487580009445982239,,0.0,486545211,b526e453-27d4-4045-8844-799258d4d913,2020-02-01 10:28:24+03:00,2,486545211b526e453-27d4-4045-8844-799258d4d913
16548462,cart,5749145,1487580007592100809,,0.0,486545211,b526e453-27d4-4045-8844-799258d4d913,2020-02-01 10:28:24+03:00,2,486545211b526e453-27d4-4045-8844-799258d4d913
16548464,cart,5749146,1487580007592100809,,0.0,486545211,b526e453-27d4-4045-8844-799258d4d913,2020-02-01 10:28:24+03:00,2,486545211b526e453-27d4-4045-8844-799258d4d913
16548466,cart,5749148,1487580007592100809,,0.0,486545211,b526e453-27d4-4045-8844-799258d4d913,2020-02-01 10:28:24+03:00,2,486545211b526e453-27d4-4045-8844-799258d4d913
16548469,cart,5749153,1487580007592100809,,0.0,486545211,b526e453-27d4-4045-8844-799258d4d913,2020-02-01 10:28:24+03:00,2,486545211b526e453-27d4-4045-8844-799258d4d913
