In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc

# Apply seaborn's default theme first: it writes its own matplotlib rcParams
# (font settings included), so the overrides below have to come after it.
# Reference: https://coldbrown.co.kr/2023/07/%ED%8C%8C%EC%9D%B4%EC%8D%AC-%EC%8B%A4%EC%A0%84%ED%8E%B8-08-seaborn-sns-set%EC%9D%84-%ED%86%B5%ED%95%B4-%EC%8A%A4%ED%83%80%EC%9D%BC-%EC%84%A4%EC%A0%95%ED%95%98%EA%B8%B0/
sns.set_theme()

# Matplotlib defaults shared by every figure in this notebook.
plt.rcParams.update({
    # Font able to render Korean labels; swap in 'AppleGothic' where
    # 'Malgun Gothic' is not installed (e.g. macOS).
    'font.family': 'Malgun Gothic',
    'figure.figsize': (12, 6),
    'font.size': 14,
    # Draw negative numbers with an ASCII hyphen so the minus sign does not
    # depend on the font providing the Unicode minus glyph.
    'axes.unicode_minus': False,
})


# 복잡한 통계 처리를 위한 라이브러리
from scipy import stats

In [2]:
# 데이터프레임을 넣고 column별 특성 및 결측값, 고유값들을 확인하는 함수를 작성해본다.
# 필수는 아니지만 전체적인 흐름을 파악하기 쉬워진다.

def resumetable(df, n):
    """Print the frame's shape and return a per-column summary table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarize.
    n : int
        Number of leading rows to show as sample-value columns. If ``n`` is
        larger than the number of rows in ``df``, only the available rows
        are shown (an empty frame yields no sample columns).

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the column name, dtype, null
        count, unique-value count, and one extra column per sampled row.
    """
    print(f'데이터셋 크기: {df.shape}')

    # Start from the dtypes Series and turn its index (the column names of
    # ``df``) into a regular column.
    summary = (
        pd.DataFrame(df.dtypes, columns=['데이터 타입'])
        .reset_index()
        .rename(columns={'index': '피처'})
    )

    # ``.values`` drops the index so assignment is positional; both results
    # are ordered by the columns of ``df``, same as ``summary``.
    summary['결측값 개수'] = df.isnull().sum().values
    summary['고유값 개수'] = df.nunique().values

    # Never sample more rows than exist: ``df.iloc[i]`` raises IndexError for
    # an out-of-range position.
    rows_to_show = min(n, len(df))
    for i in range(rows_to_show):
        summary[f"{i+1}번째 값"] = df.iloc[i].values

    return summary

In [3]:
# Load the October 2019 event log (path is relative to the working directory).
df_oct = pd.read_csv('data/2019_Oct.csv')

In [4]:
# Load the November 2019 event log.
df_nov = pd.read_csv('data/2019_Nov.csv')

In [5]:
# Load the December 2019 event log.
df_dec = pd.read_csv('data/2019_Dec.csv')

In [6]:
# Load the January 2020 event log.
df_jan = pd.read_csv('data/2020_Jan.csv')

In [7]:
# Load the February 2020 event log.
# NOTE(review): this filename uses a lowercase 'feb' while the other four
# capitalize the month; confirm it matches the file on a case-sensitive filesystem.
df_feb = pd.read_csv('data/2020_feb.csv')

In [8]:
# Per-column summary of the October frame, with the first 3 rows as sample values.
resumetable(df_oct, 3)

데이터셋 크기: (4102283, 9)


Unnamed: 0,피처,데이터 타입,결측값 개수,고유값 개수,1번째 값,2번째 값,3번째 값
0,event_time,object,0,1782439,2019-10-01 00:00:00 UTC,2019-10-01 00:00:03 UTC,2019-10-01 00:00:07 UTC
1,event_type,object,0,4,cart,cart,cart
2,product_id,int64,0,41899,5773203,5773353,5881589
3,category_id,int64,0,490,1487580005134238553,1487580005134238553,2151191071051219817
4,category_code,object,4034806,11,,,
5,brand,object,1659261,240,runail,runail,lovely
6,price,float64,0,1820,2.62,2.62,13.48
7,user_id,int64,0,399664,463240011,463240011,429681830
8,user_session,object,637,873960,26dd6e6e-4dac-4778-8d2c-92e149dab885,26dd6e6e-4dac-4778-8d2c-92e149dab885,49e8d843-adf3-428b-a2c3-fe8bc6a307c9


In [9]:
# Per-column summary of the November frame, with the first 3 rows as sample values.
resumetable(df_nov, 3)

데이터셋 크기: (4635837, 9)


Unnamed: 0,피처,데이터 타입,결측값 개수,고유값 개수,1번째 값,2번째 값,3번째 값
0,event_time,object,0,1810735,2019-11-01 00:00:02 UTC,2019-11-01 00:00:09 UTC,2019-11-01 00:00:10 UTC
1,event_type,object,0,4,view,cart,view
2,product_id,int64,0,43419,5802432,5844397,5837166
3,category_id,int64,0,491,1487580009286598681,1487580006317032337,1783999064103190764
4,category_code,object,4560089,10,,,
5,brand,object,1986029,239,,,pnb
6,price,float64,0,2538,0.32,2.38,22.22
7,user_id,int64,0,368232,562076640,553329724,556138645
8,user_session,object,813,942022,09fafd6c-6c99-46b1-834f-33527f4de241,2067216c-31b5-455d-a1cc-af0575a34ffb,57ed222e-a54a-4907-9944-5a875c2d7f4f


In [10]:
# Per-column summary of the December frame, with the first 3 rows as sample values.
resumetable(df_dec, 3)

데이터셋 크기: (3533286, 9)


Unnamed: 0,피처,데이터 타입,결측값 개수,고유값 개수,1번째 값,2번째 값,3번째 값
0,event_time,object,0,1654771,2019-12-01 00:00:00 UTC,2019-12-01 00:00:00 UTC,2019-12-01 00:00:02 UTC
1,event_type,object,0,4,remove_from_cart,view,cart
2,product_id,int64,0,44624,5712790,5764655,4958
3,category_id,int64,0,482,1487580005268456287,1487580005411062629,1487580009471148064
4,category_code,object,3474821,10,,,
5,brand,object,1510289,252,f.o.x,cnd,runail
6,price,float64,0,2122,6.27,29.05,1.19
7,user_id,int64,0,370154,576802932,412120092,494077766
8,user_session,object,779,839812,51d85cb0-897f-48d2-918b-ad63965c12dc,8adff31e-2051-4894-9758-224bfa8aec18,c99a50e8-2fac-4c4d-89ec-41c05f114554


In [11]:
# Per-column summary of the January frame, with the first 3 rows as sample values.
resumetable(df_jan, 3)

데이터셋 크기: (4264752, 9)


Unnamed: 0,피처,데이터 타입,결측값 개수,고유값 개수,1번째 값,2번째 값,3번째 값
0,event_time,object,0,1811717,2020-01-01 00:00:00 UTC,2020-01-01 00:00:09 UTC,2020-01-01 00:00:19 UTC
1,event_type,object,0,4,view,view,view
2,product_id,int64,0,45484,5809910,5812943,5798924
3,category_id,int64,0,482,1602943681873052386,1487580012121948301,1783999068867920626
4,category_code,object,4190033,12,,,
5,brand,object,1775630,256,grattol,kinetics,zinger
6,price,float64,0,2097,5.24,3.97,3.97
7,user_id,int64,0,410073,595414620,595414640,595412617
8,user_session,object,1314,965351,4adb70bb-edbd-4981-b60f-a05bfd32683a,c8c5205d-be43-4f1d-aa56-4828b8151c8a,46a5010f-bd69-4fbe-a00d-bb17aa7b46f3


In [12]:
# Per-column summary of the February frame, with the first 3 rows as sample values.
resumetable(df_feb, 3)

데이터셋 크기: (4156682, 9)


Unnamed: 0,피처,데이터 타입,결측값 개수,고유값 개수,1번째 값,2번째 값,3번째 값
0,event_time,object,0,1723228,2020-02-01 00:00:01 UTC,2020-02-01 00:00:03 UTC,2020-02-01 00:00:08 UTC
1,event_type,object,0,4,cart,view,view
2,product_id,int64,0,48579,5844305,5769925,5817765
3,category_id,int64,0,487,1487580006317032337,1487580013841613016,1487580008246412266
4,category_code,object,4079497,11,,,
5,brand,object,1825908,268,,kapous,zeitun
6,price,float64,0,2199,2.14,4.22,11.03
7,user_id,int64,0,391055,485174092,594621622,495404942
8,user_session,object,1055,931668,4be9643a-420b-4c6b-83dd-a15e772fbf7a,a88baf11-9cd0-4362-bde4-1bfeed3f641d,3a569c8d-d848-4f09-a925-33f673d84c46


In [13]:
# Stack the five monthly frames (October through February) row-wise into one frame.
# ignore_index=True gives the result a fresh 0..N-1 index instead of repeating
# each file's own row numbers.
df = pd.concat([df_oct, df_nov, df_dec, df_jan, df_feb], axis=0, ignore_index=True)

In [14]:
# Per-column summary of the merged frame, with the first 3 rows as sample values.
resumetable(df,3)

데이터셋 크기: (20692840, 9)


Unnamed: 0,피처,데이터 타입,결측값 개수,고유값 개수,1번째 값,2번째 값,3번째 값
0,event_time,object,0,8782890,2019-10-01 00:00:00 UTC,2019-10-01 00:00:03 UTC,2019-10-01 00:00:07 UTC
1,event_type,object,0,4,cart,cart,cart
2,product_id,int64,0,54571,5773203,5773353,5881589
3,category_id,int64,0,525,1487580005134238553,1487580005134238553,2151191071051219817
4,category_code,object,20339246,12,,,
5,brand,object,8757117,273,runail,runail,lovely
6,price,float64,0,2860,2.62,2.62,13.48
7,user_id,int64,0,1639358,463240011,463240011,429681830
8,user_session,object,4598,4535941,26dd6e6e-4dac-4778-8d2c-92e149dab885,26dd6e6e-4dac-4778-8d2c-92e149dab885,49e8d843-adf3-428b-a2c3-fe8bc6a307c9


- category_code

In [15]:
# Share of rows per category_code value; dropna=False keeps NaN as its own
# bucket and normalize=True returns proportions instead of counts.
df['category_code'].value_counts(dropna=False, normalize=True)

category_code
NaN                                       9.829123e-01
appliances.environment.vacuum             7.208677e-03
stationery.cartrige                       2.886409e-03
apparel.glove                             2.540734e-03
furniture.living_room.cabinet             1.476839e-03
accessories.bag                           1.172048e-03
furniture.bathroom.bath                   1.156487e-03
appliances.personal.hair_cutter           2.601866e-04
accessories.cosmetic_bag                  1.760996e-04
appliances.personal.massager              1.592821e-04
appliances.environment.air_conditioner    3.421473e-05
furniture.living_room.chair               1.657578e-05
sport.diving                              1.933036e-07
Name: proportion, dtype: float64

In [16]:
# About 98% of category_code is missing, so the column is dropped.
# Rebinding with errors='ignore' (instead of inplace=True) keeps this cell safe
# to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns='category_code', errors='ignore')

- brand

In [17]:
# A brand counts as a "top" brand when it has strictly more than this many rows.
BRAND_COUNT_THRESHOLD = 50_000

# 1. Row count per brand in the October frame; dropna=False keeps rows with a
#    missing brand as their own NaN bucket.
brand_counts = df_oct['brand'].value_counts(dropna=False)

# 2. Keep only brands above the threshold.
top_brands = brand_counts[brand_counts > BRAND_COUNT_THRESHOLD]

# 3. Each selected brand's share of all October rows.
top_brand_ratio_each = top_brands / len(df_oct)

# 4. Show the per-brand shares and their combined share (NaN bucket included).
print(top_brand_ratio_each)
top_brand_ratio_total = top_brand_ratio_each.sum()
print(f"상위 브랜드 비율: {top_brand_ratio_total:.4f} ({top_brand_ratio_total:.2%})")

brand
NaN          0.404473
runail       0.074651
irisk        0.055235
masura       0.047576
grattol      0.032209
bpw.style    0.027836
ingarden     0.022007
estel        0.015709
pole         0.015017
kapous       0.014118
Name: count, dtype: float64
상위 브랜드 비율: 0.7088 (70.88%)


- 브랜드는 추후에 해석에 도움을 줄 수 있어 keep
- (10월 데이터 기준) 50_000건을 초과하는 브랜드가 전체의 70% 이상(NaN 40% 포함)
- 30_000건 이상인 브랜드가 전체의 80% 이상(NaN 40%)

In [18]:
# Share of rows whose user_session is missing, computed from the frame itself
# rather than from counts copied out of an earlier output.
df['user_session'].isnull().mean()

0.00022220246230096982

In [19]:
# Drop rows without a user_session. Both counts are captured before the drop so
# the sanity check below is derived from the data instead of hard-coded numbers.
rows_before = len(df)
missing_sessions = int(df['user_session'].isnull().sum())

df = df.dropna(subset=['user_session'])
print(f"삭제 후 행 수: {len(df)}")
print(f"정상적으로 삭제됐을 때 행 수 : {rows_before - missing_sessions}")

삭제 후 행 수: 20688242
정상적으로 삭제됐을 때 행 수 : 20688242


In [20]:
# Re-check the summary after dropping category_code and the rows without user_session.
resumetable(df,3)

데이터셋 크기: (20688242, 8)


Unnamed: 0,피처,데이터 타입,결측값 개수,고유값 개수,1번째 값,2번째 값,3번째 값
0,event_time,object,0,8782133,2019-10-01 00:00:00 UTC,2019-10-01 00:00:03 UTC,2019-10-01 00:00:07 UTC
1,event_type,object,0,4,cart,cart,cart
2,product_id,int64,0,54570,5773203,5773353,5881589
3,category_id,int64,0,525,1487580005134238553,1487580005134238553,2151191071051219817
4,brand,object,8755488,273,runail,runail,lovely
5,price,float64,0,2860,2.62,2.62,13.48
6,user_id,int64,0,1639151,463240011,463240011,429681830
7,user_session,object,0,4535941,26dd6e6e-4dac-4778-8d2c-92e149dab885,26dd6e6e-4dac-4778-8d2c-92e149dab885,49e8d843-adf3-428b-a2c3-fe8bc6a307c9


- event_time

In [21]:
# Parse the event_time strings into timezone-aware UTC timestamps.
df['event_time'] = pd.to_datetime(df['event_time'], utc=True)

In [22]:
# Add a calendar-month column for month-level analysis.
# NOTE(review): the month is taken from the UTC event_time, not from the
# Moscow-time column added later; events near a month boundary can fall in a
# different month between the two. Confirm which one is intended.
df['event_month'] = df['event_time'].dt.month

In [23]:
# The data runs from 2019-10 to 2020-02, so plain numeric order would put
# January and February first. Make event_month an ordered categorical that
# follows the chronological order instead.
month_order = [10, 11, 12, 1, 2]
chronological_months = pd.CategoricalDtype(categories=month_order, ordered=True)
df['event_month'] = df['event_month'].astype(chronological_months)

- 상위권을 차지하는 브랜드들이 러시아에서 제조되거나 주로 유통되는 뷰티 브랜드인 것으로 보아, 러시아 기반 이커머스 플랫폼일 확률이 높음
- 구글(Google)이 주도해 만든 오픈 소스 오픈 데이터 플랫폼인 Data Commons(https://datacommons.org) 기준 러시아의 인구는 모스크바, 상트페테르부르크에 집중되어 있음
- 두 도시 모두 UTC+3이므로 event_time을 모스크바 시간(UTC+3)으로 변환

In [24]:
# Store the same instants in Moscow local time (UTC+3) in a separate column;
# event_time itself stays in UTC.
df['event_time_moscow'] = df['event_time'].dt.tz_convert('Europe/Moscow')

In [25]:
# Re-check the summary after adding event_month and event_time_moscow.
resumetable(df,3)

데이터셋 크기: (20688242, 10)


Unnamed: 0,피처,데이터 타입,결측값 개수,고유값 개수,1번째 값,2번째 값,3번째 값
0,event_time,"datetime64[ns, UTC]",0,8782133,2019-10-01 00:00:00+00:00,2019-10-01 00:00:03+00:00,2019-10-01 00:00:07+00:00
1,event_type,object,0,4,cart,cart,cart
2,product_id,int64,0,54570,5773203,5773353,5881589
3,category_id,int64,0,525,1487580005134238553,1487580005134238553,2151191071051219817
4,brand,object,8755488,273,runail,runail,lovely
5,price,float64,0,2860,2.62,2.62,13.48
6,user_id,int64,0,1639151,463240011,463240011,429681830
7,user_session,object,0,4535941,26dd6e6e-4dac-4778-8d2c-92e149dab885,26dd6e6e-4dac-4778-8d2c-92e149dab885,49e8d843-adf3-428b-a2c3-fe8bc6a307c9
8,event_month,category,0,5,10,10,10
9,event_time_moscow,"datetime64[ns, Europe/Moscow]",0,8782133,2019-10-01 03:00:00+03:00,2019-10-01 03:00:03+03:00,2019-10-01 03:00:07+03:00


- price

In [26]:
# Quick range check on price: maximum, minimum and mean, printed in that order.
price_col = df['price']
for label, stat in (
    ("최댓값:", price_col.max()),
    ("최솟값:", price_col.min()),
    ("평균:", price_col.mean()),
):
    print(label, stat)

최댓값: 327.78
최솟값: -79.37
평균: 8.534892247006782


---

In [27]:
# # 임시 저장
# df.to_csv('data/merged_data.csv', index=False, encoding='utf-8-sig')

# AARRR 분석

In [28]:
# List the columns available for the analysis that follows.
df.columns

Index(['event_time', 'event_type', 'product_id', 'category_id', 'brand',
       'price', 'user_id', 'user_session', 'event_month', 'event_time_moscow'],
      dtype='object')

# 행동 기반 코호트 분석

In [29]:
# purchase_df = df[df["event_type"] == "purchase"]
# first_purchase = purchase_df.groupby("user_id")["event_time_moscow"].min().reset_index()
# first_purchase.columns = ["user_id", "first_purchase_time"]

In [30]:
# # 예시: 첫 구매 이후 30일 이내의 'view' 이벤트 수
# df = df.merge(first_purchase, on="user_id")
# df["days_since_first"] = (df["event_time"] - df["first_purchase_time"]).dt.days
# within_period = df[(df["days_since_first"] >= 0) & (df["days_since_first"] <= 30)]
# view_count = within_period[within_period["event_type"] == "view"].groupby("user_id").size()

In [31]:
# after_first_purchase = purchase_df.merge(first_purchase, on="user_id")
# after_first_purchase = after_first_purchase[after_first_purchase["event_time"] > after_first_purchase["first_purchase_time"]]
# repurchase_flag = after_first_purchase["user_id"].drop_duplicates()
# df["재구매여부"] = df["user_id"].isin(repurchase_flag)

In [32]:
# user_view_df = view_count.reset_index(name="view_count")
# user_view_df["repurchased"] = user_view_df["user_id"].isin(repurchase_flag)

# # 예시: view_count가 N회 이상 vs 미만일 때의 재구매율 비교
# user_view_df["활발사용자"] = user_view_df["view_count"] >= 5
# result = user_view_df.groupby("활발사용자")["repurchased"].mean()

In [33]:
# result