In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc

# Base plotting theme
sns.set_style("white")
sns.set_context("notebook")

# Font setup: use the TTF file that ships next to the notebook
import platform
from matplotlib import font_manager as fm
font_path = "font/NanumGothic.ttf"

# Register the TTF with matplotlib's font manager first. rcParams['font.family']
# is resolved by *name*, so a font that is only referenced by file path (and not
# installed system-wide) would otherwise not be found and matplotlib would fall
# back to its default family.
fm.fontManager.addfont(font_path)
font_prop = fm.FontProperties(fname=font_path)
plt.rcParams['font.family'] = font_prop.get_name()

# Figure defaults
plt.rcParams['figure.figsize'] = 12, 6
plt.rcParams['font.size'] = 14
plt.rcParams['axes.unicode_minus'] = False     # render the minus sign with the ASCII hyphen
plt.rcParams['axes.facecolor'] = 'white'       # white axes background
plt.rcParams['figure.facecolor'] = 'white'     # white figure background
plt.rcParams['axes.grid'] = False              # no grid lines
plt.rcParams['savefig.facecolor'] = 'white'    # saved images also get a white background

# Silence all warnings for the rest of the notebook
import warnings
warnings.filterwarnings('ignore')

# 복잡한 통계 처리를 위한 라이브러리
from scipy import stats

In [2]:
# Load the event log used by every cell below.
# NOTE(review): the file name suggests rows without a brand were dropped upstream — confirm.
df = pd.read_csv('data/merged_data_brand_dropna.csv')

In [7]:
# Display the loaded frame to inspect its columns and row count.
df

Unnamed: 0,event_type,product_id,category_id,brand,price,user_id,user_session,event_time_moscow,event_month,user_key
0,cart,5773203,1487580005134238553,runail,2.62,463240011,26dd6e6e-4dac-4778-8d2c-92e149dab885,2019-10-01 03:00:00+03:00,10,46324001126dd6e6e-4dac-4778-8d2c-92e149dab885
1,cart,5773353,1487580005134238553,runail,2.62,463240011,26dd6e6e-4dac-4778-8d2c-92e149dab885,2019-10-01 03:00:03+03:00,10,46324001126dd6e6e-4dac-4778-8d2c-92e149dab885
2,cart,5881589,2151191071051219817,lovely,13.48,429681830,49e8d843-adf3-428b-a2c3-fe8bc6a307c9,2019-10-01 03:00:07+03:00,10,42968183049e8d843-adf3-428b-a2c3-fe8bc6a307c9
3,cart,5723490,1487580005134238553,runail,2.62,463240011,26dd6e6e-4dac-4778-8d2c-92e149dab885,2019-10-01 03:00:07+03:00,10,46324001126dd6e6e-4dac-4778-8d2c-92e149dab885
4,cart,5881449,1487580013522845895,lovely,0.56,429681830,49e8d843-adf3-428b-a2c3-fe8bc6a307c9,2019-10-01 03:00:15+03:00,10,42968183049e8d843-adf3-428b-a2c3-fe8bc6a307c9
...,...,...,...,...,...,...,...,...,...,...
11925724,remove_from_cart,5710530,1487580009622143014,irisk,0.63,622026701,54926d92-0446-4577-b923-a7309a5a8a52,2020-02-29 23:59:58+03:00,2,62202670154926d92-0446-4577-b923-a7309a5a8a52
11925725,remove_from_cart,5710530,1487580009622143014,irisk,0.63,622026701,54926d92-0446-4577-b923-a7309a5a8a52,2020-02-29 23:59:58+03:00,2,62202670154926d92-0446-4577-b923-a7309a5a8a52
11925726,cart,5700037,1487580009286598681,runail,0.40,495882061,2ad87792-8da8-45e1-94ba-74cb133df9ab,2020-02-29 23:59:59+03:00,2,4958820612ad87792-8da8-45e1-94ba-74cb133df9ab
11925727,remove_from_cart,5700037,1487580009286598681,runail,0.40,495882061,2ad87792-8da8-45e1-94ba-74cb133df9ab,2020-02-29 23:59:59+03:00,2,4958820612ad87792-8da8-45e1-94ba-74cb133df9ab


In [7]:
# Brands included in the funnel analysis
target_brands = ['eunyul', 'severina', 'supertan', 'elskin', 'cosima']

# Derive a monthly period from the Moscow-time timestamp (this overwrites
# df['event_month'] in place), then keep only the target brands.
df['event_month'] = pd.to_datetime(df['event_time_moscow']).dt.to_period('M')
filtered = df[df['brand'].isin(target_brands)].copy()

# One row per (month, brand, user) for each event type
view_df = filtered[filtered['event_type'] == 'view'][['event_month', 'brand', 'user_id']].drop_duplicates()
cart_df = filtered[filtered['event_type'] == 'cart'][['event_month', 'brand', 'user_id']].drop_duplicates()
purchase_df = filtered[filtered['event_type'] == 'purchase'][['event_month', 'brand', 'user_id']].drop_duplicates()

# Distinct users per (month, brand) at each funnel stage
view_counts = view_df.groupby(['event_month', 'brand'])['user_id'].nunique().reset_index(name='view_users')
cart_counts = cart_df.groupby(['event_month', 'brand'])['user_id'].nunique().reset_index(name='cart_users')
purchase_counts = purchase_df.groupby(['event_month', 'brand'])['user_id'].nunique().reset_index(name='purchase_users')

# Left-join onto the view counts: only (month, brand) pairs that have viewers are kept
merged = view_counts.merge(cart_counts, on=['event_month', 'brand'], how='left') \
                    .merge(purchase_counts, on=['event_month', 'brand'], how='left')

# Pairs with no cart / purchase users come back as NaN from the left join -> 0 users
merged[['cart_users', 'purchase_users']] = merged[['cart_users', 'purchase_users']].fillna(0)

# Conversion rates. The three user sets are counted independently of each other
# (a cart user does not have to be among the viewers), so a rate can exceed 1.
merged['view_to_cart_rate'] = (merged['cart_users'] / merged['view_users']).round(4)
# Zero cart users -> NaN instead of a division by zero. np.nan keeps the column
# float64; pd.NA would switch it to object dtype, on which .round(4) is not applied.
merged['cart_to_purchase_rate'] = (merged['purchase_users'] / merged['cart_users'].replace(0, np.nan)).round(4)
merged['view_to_purchase_rate'] = (merged['purchase_users'] / merged['view_users']).round(4)

# Sort for readability
merged = merged.sort_values(['brand', 'event_month']).reset_index(drop=True)

In [8]:
# Display order for the brands
brand_order = ['eunyul', 'severina', 'supertan', 'elskin', 'cosima']

# Cast brand to an ordered categorical so that sorting follows brand_order
# instead of alphabetical order, then sort by brand and month.
merged = (
    merged
    .assign(brand=merged['brand'].astype(pd.CategoricalDtype(categories=brand_order, ordered=True)))
    .sort_values(by=['brand', 'event_month'])
    .reset_index(drop=True)
)

In [9]:
# Display the monthly funnel table, ordered by brand and month.
merged

Unnamed: 0,event_month,brand,view_users,cart_users,purchase_users,view_to_cart_rate,cart_to_purchase_rate,view_to_purchase_rate
0,2019-10,eunyul,268,447,142,1.6679,0.3177,0.5299
1,2019-11,eunyul,259,431,146,1.6641,0.3387,0.5637
2,2019-12,eunyul,266,326,118,1.2256,0.362,0.4436
3,2020-01,eunyul,243,341,118,1.4033,0.346,0.4856
4,2020-02,eunyul,173,280,100,1.6185,0.3571,0.578
5,2019-10,severina,2828,3390,1416,1.1987,0.4177,0.5007
6,2019-11,severina,3178,3689,1759,1.1608,0.4768,0.5535
7,2019-12,severina,2539,2829,1284,1.1142,0.4539,0.5057
8,2020-01,severina,3511,3798,1746,1.0817,0.4597,0.4973
9,2020-02,severina,3200,3395,1424,1.0609,0.4194,0.445


In [15]:
# Write the monthly brand funnel table to CSV, leaving out the row index.
merged.to_csv('data/13_monthly_brand_funnel.csv', index=False)

- gpt가 개선해준 방식 비교
- 위는 교집합을 이용한 방식이라 원하는 대로 유저 기반이 되긴 하는데 정석은 아닌 듯
- 이게 view 중 purchase 방식

In [10]:
import pandas as pd

# Brands included in the funnel analysis
target_brands = ['eunyul', 'severina', 'supertan', 'elskin', 'cosima']

# 1. Build the monthly period column (overwrites df['event_month']) and keep the target brands
df['event_month'] = pd.to_datetime(df['event_time_moscow']).dt.to_period('M')
filtered = df[df['brand'].isin(target_brands)].copy()

# 2. One row per (month, brand, user) for each event type
view_df = filtered[filtered['event_type'] == 'view'][['event_month', 'brand', 'user_id']].drop_duplicates()
cart_df = filtered[filtered['event_type'] == 'cart'][['event_month', 'brand', 'user_id']].drop_duplicates()
purchase_df = filtered[filtered['event_type'] == 'purchase'][['event_month', 'brand', 'user_id']].drop_duplicates()

# 3. Funnel base: one row per viewing user
funnel = view_df.copy()
funnel['view_flag'] = 1

# 4-5. Flag the viewers who also carted / purchased the same brand in the same month.
# The viewer key index is built once and reused for both membership checks.
funnel_keys = ['event_month', 'brand', 'user_id']
viewer_index = funnel.set_index(funnel_keys).index
funnel['to_cart'] = viewer_index.isin(cart_df.set_index(funnel_keys).index).astype(int)
funnel['to_purchase'] = viewer_index.isin(purchase_df.set_index(funnel_keys).index).astype(int)

# 6. Conversion rates = share of viewers carrying each flag
conversion = (
    funnel.groupby(['event_month', 'brand'])[['to_cart', 'to_purchase']]
    .mean()
    .reset_index()
    .rename(columns={
        'to_cart': 'view_to_cart_rate',
        'to_purchase': 'view_to_purchase_rate'
    })
)

# 7. User counts. cart_users / purchase_users are counted among the viewers only.
view_counts = funnel.groupby(['event_month', 'brand'])['user_id'].nunique().reset_index(name='view_users')
cart_counts = funnel[funnel['to_cart'] == 1].groupby(['event_month', 'brand'])['user_id'].nunique().reset_index(name='cart_users')
purchase_counts = funnel[funnel['to_purchase'] == 1].groupby(['event_month', 'brand'])['user_id'].nunique().reset_index(name='purchase_users')

# 8. Combine rates and counts
final = (
    conversion
    .merge(view_counts, on=['event_month', 'brand'], how='left')
    .merge(cart_counts, on=['event_month', 'brand'], how='left')
    .merge(purchase_counts, on=['event_month', 'brand'], how='left')
)

# 9. Groups with no flagged users are missing after the left joins -> 0 users
final[['cart_users', 'purchase_users']] = final[['cart_users', 'purchase_users']].fillna(0)

# 10. Cart-to-purchase rate. Zero cart users -> NaN rather than a division by zero.
# np.nan keeps the result float64 so .round(4) takes effect; with pd.NA the
# column became object dtype and the values were left unrounded.
final['cart_to_purchase_rate'] = (
    final['purchase_users'] / final['cart_users'].replace(0, np.nan)
).round(4)

# 11. Sort for readability
final = final.sort_values(['brand', 'event_month']).reset_index(drop=True)

# 12. Show the result
final

Unnamed: 0,event_month,brand,view_to_cart_rate,view_to_purchase_rate,view_users,cart_users,purchase_users,cart_to_purchase_rate
0,2019-10,cosima,0.761905,0.190476,21,16.0,4.0,0.25
1,2019-11,cosima,0.0,0.043478,23,0.0,1.0,
2,2019-12,cosima,0.266667,0.0,15,4.0,0.0,0.0
3,2020-01,cosima,0.375,0.1875,16,6.0,3.0,0.5
4,2020-02,cosima,0.4,0.15,20,8.0,3.0,0.375
5,2019-10,elskin,0.367647,0.125,272,100.0,34.0,0.34
6,2019-11,elskin,0.369231,0.161538,260,96.0,42.0,0.4375
7,2019-12,elskin,0.348548,0.182573,241,84.0,44.0,0.52381
8,2020-01,elskin,0.378788,0.166667,264,100.0,44.0,0.44
9,2020-02,elskin,0.324627,0.130597,268,87.0,35.0,0.402299


- 임세희님 양식

### brand별 구매전환율

In [39]:
import pandas as pd

# 1. Distinct viewing users per brand
view_users = (
    df.loc[df['event_type'].eq('view')]
    .groupby('brand')['user_id']
    .nunique()
    .rename('view_users')
    .reset_index()
)

# 2. Distinct purchasing users per brand
purchase_users = (
    df.loc[df['event_type'].eq('purchase')]
    .groupby('brand')['user_id']
    .nunique()
    .rename('purchase_users')
    .reset_index()
)

# 3-4. Left-join purchases onto views (brands without purchases get 0),
#      then compute purchase users / view users
brand_conv = view_users.merge(purchase_users, on='brand', how='left').fillna(0)
brand_conv = brand_conv.assign(
    conversion_rate=brand_conv['purchase_users'] / brand_conv['view_users']
)

# 5. Highest conversion rate first
brand_conv_sorted = (
    brand_conv
    .sort_values(by='conversion_rate', ascending=False)
    .reset_index(drop=True)
)

brand_conv_sorted

Unnamed: 0,brand,view_users,purchase_users,conversion_rate
0,eunyul,1175,603.0,0.513191
1,severina,14551,7172.0,0.492887
2,supertan,352,168.0,0.477273
3,elskin,1263,578.0,0.457641
4,cosima,93,40.0,0.430108
...,...,...,...,...
268,bodipure,29,0.0,0.000000
269,pueen,1,0.0,0.000000
270,busch,130,0.0,0.000000
271,shifei,3,0.0,0.000000


In [49]:
import pandas as pd

# 1. Month label as a 'YYYY-MM' string (overwrites df['event_month'])
df['event_month'] = pd.to_datetime(df['event_time_moscow']).dt.to_period('M').astype(str)

# 2. Distinct viewing users per (month, brand)
view_users_month = (
    df.loc[df['event_type'].eq('view')]
    .groupby(['event_month', 'brand'])['user_id']
    .nunique()
    .rename('view_users')
    .reset_index()
)

# 3. Distinct purchasing users per (month, brand)
purchase_users_month = (
    df.loc[df['event_type'].eq('purchase')]
    .groupby(['event_month', 'brand'])['user_id']
    .nunique()
    .rename('purchase_users')
    .reset_index()
)

# 4. Left-join purchases onto views; pairs without purchases get 0
brand_conv_month = (
    view_users_month
    .merge(purchase_users_month, on=['event_month', 'brand'], how='left')
    .fillna(0)
)

# 5. Conversion rate = purchase users / view users
brand_conv_month = brand_conv_month.assign(
    conversion_rate=brand_conv_month['purchase_users'] / brand_conv_month['view_users']
)

# 6. Within each month, highest conversion rate first
brand_conv_month_sorted = brand_conv_month.sort_values(
    by=['event_month', 'conversion_rate'],
    ascending=[True, False],
).reset_index(drop=True)

brand_conv_month_sorted.head(10)

Unnamed: 0,event_month,brand,view_users,purchase_users,conversion_rate
0,2019-10,eunyul,268,142.0,0.529851
1,2019-10,cosima,21,11.0,0.52381
2,2019-10,severina,2828,1416.0,0.500707
3,2019-10,nitrile,184,82.0,0.445652
4,2019-10,supertan,90,36.0,0.4
5,2019-10,elskin,272,108.0,0.397059
6,2019-10,dermal,291,115.0,0.395189
7,2019-10,nitrimax,877,322.0,0.367161
8,2019-10,smart,1448,441.0,0.304558
9,2019-10,igrobeauty,832,252.0,0.302885


In [46]:
import pandas as pd

# 0. Monthly period from the event timestamp (overwrites df['event_month']).
# The frame loaded in this notebook has no 'event_time' column, only
# 'event_time_moscow', so that column is used here, as in the other cells.
df['event_month'] = pd.to_datetime(df['event_time_moscow']).dt.to_period('M')

# 1. Drop remove_from_cart events
df_filtered = df[~df['event_type'].isin(['remove_from_cart'])]

# 2. Distinct users per (month, brand, event type)
agg_df = (
    df_filtered.groupby(['event_month', 'brand', 'event_type'])['user_id']
      .nunique()
      .reset_index(name='count')
)

# 3. Wide table: one column of user counts per event type (missing combinations -> 0)
pivot_df = (
    agg_df.pivot_table(index=['event_month', 'brand'],
                       columns='event_type',
                       values='count',
                       fill_value=0)
    .reset_index()
)

# 4. Step-wise rates: cart / view and purchase / cart.
# A zero denominator yields NaN rather than inf.
pivot_df['view_rate'] = 1.0
pivot_df['cart_rate'] = pivot_df['cart'] / pivot_df['view'].replace(0, np.nan)
pivot_df['purchase_rate'] = pivot_df['purchase'] / pivot_df['cart'].replace(0, np.nan)

# 5. Back to long format (rate_type, rate)
rate_map = {
    'view': 'view_rate',
    'cart': 'cart_rate',
    'purchase': 'purchase_rate'
}

rate_df = pivot_df.melt(
    id_vars=['event_month', 'brand'],
    value_vars=list(rate_map.values()),
    var_name='rate_type',
    value_name='rate'
)

# 6. Translate each rate column name back to its event type (inverse of rate_map)
rate_df['event_type'] = rate_df['rate_type'].map(
    {rate_col: event for event, rate_col in rate_map.items()}
)

# 7. Attach the rate to every (month, brand, event type) count row
final_df = (
    agg_df.merge(rate_df[['event_month', 'brand', 'event_type', 'rate']],
                 on=['event_month', 'brand', 'event_type'],
                 how='left')
    [['event_month', 'brand', 'event_type', 'count', 'rate']]
)

final_df

Unnamed: 0,event_month,brand,event_type,count,rate
0,2019-10,airnails,cart,1611,0.488330
1,2019-10,airnails,purchase,428,0.265673
2,2019-10,airnails,view,3299,1.000000
3,2019-10,almea,cart,144,0.260870
4,2019-10,almea,purchase,27,0.187500
...,...,...,...,...,...
3495,2020-02,zeitun,purchase,51,0.222707
3496,2020-02,zeitun,view,725,1.000000
3497,2020-02,zinger,cart,3452,0.510198
3498,2020-02,zinger,purchase,1344,0.389340


In [54]:
# 1. Brands to keep, in display order
brands_to_keep = [
    'eunyul', 'supertan', 'severina', 'elskin', 'cosima',
    'benovy', 'dermal', 'nitrimax', 'nitrile', 'smart'
]
brand_order = pd.CategoricalDtype(brands_to_keep, ordered=True)

# 2. Funnel stage order
event_order = pd.CategoricalDtype(['view', 'cart', 'purchase'], ordered=True)

# 3. Keep the selected brands, cast both columns to ordered categoricals so the
#    sort follows the orders above, then sort by month, stage and brand
filtered_df = (
    final_df.loc[final_df['brand'].isin(brands_to_keep)]
    .astype({'brand': brand_order, 'event_type': event_order})
    .sort_values(['event_month', 'event_type', 'brand'])
    .reset_index(drop=True)
)

filtered_df

Unnamed: 0,event_month,brand,event_type,count,rate
0,2019-10,eunyul,view,268,1.000000
1,2019-10,supertan,view,90,1.000000
2,2019-10,severina,view,2828,1.000000
3,2019-10,elskin,view,272,1.000000
4,2019-10,cosima,view,21,1.000000
...,...,...,...,...,...
145,2020-02,benovy,purchase,522,0.417600
146,2020-02,dermal,purchase,104,0.386617
147,2020-02,nitrimax,purchase,211,0.370826
148,2020-02,nitrile,purchase,78,0.329114


In [56]:
# Save the filtered table to CSV without the row index.
# The utf-8-sig codec writes a BOM at the start of the file
# (presumably so spreadsheet tools detect UTF-8 — confirm).
filtered_df.to_csv("data/14_filtered_brand_month_event.csv", index=False, encoding="utf-8-sig")