In [1]:
import pandas as pd

# Absolute location of the merged event CSV on the analyst's machine.
file_path = r"C:\Users\HR\Desktop\Workspace\데이터톤\마케팅\merged_data.csv"

# Read the whole file into memory using pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=file_path)

# Plain-text preview of the first five rows.
print(df.head(n=5))


                  event_time event_type  product_id          category_id  \
0  2019-10-01 00:00:00+00:00       cart     5773203  1487580005134238553   
1  2019-10-01 00:00:03+00:00       cart     5773353  1487580005134238553   
2  2019-10-01 00:00:07+00:00       cart     5881589  2151191071051219817   
3  2019-10-01 00:00:07+00:00       cart     5723490  1487580005134238553   
4  2019-10-01 00:00:15+00:00       cart     5881449  1487580013522845895   

    brand  price    user_id                          user_session  \
0  runail   2.62  463240011  26dd6e6e-4dac-4778-8d2c-92e149dab885   
1  runail   2.62  463240011  26dd6e6e-4dac-4778-8d2c-92e149dab885   
2  lovely  13.48  429681830  49e8d843-adf3-428b-a2c3-fe8bc6a307c9   
3  runail   2.62  463240011  26dd6e6e-4dac-4778-8d2c-92e149dab885   
4  lovely   0.56  429681830  49e8d843-adf3-428b-a2c3-fe8bc6a307c9   

           event_time_moscow  event_month  \
0  2019-10-01 03:00:00+03:00         10.0   
1  2019-10-01 03:00:03+03:00         1

In [2]:
import pandas as pd

# Assumes `df` has already been loaded from a CSV or Parquet file,
# e.g. df = pd.read_csv('your_file.csv') or pd.read_parquet('your_file.parquet')

# 1. Keep only rows whose price is strictly positive. Rows with a zero,
#    negative or missing price all fail the `> 0` comparison and are dropped.
# 2. Then keep only rows whose brand is not missing.
df = (
    df.loc[df['price'].gt(0)]
      .loc[lambda frame: frame['brand'].notna()]
)

# Sanity check (optional): remaining shape, smallest remaining price,
# and the number of missing brands (expected to be 0 after the filter).
print(df.shape, df['price'].min(), df['brand'].isna().sum(), sep='\n')

(11932733, 11)
0.06
0


In [3]:
# Count distinct users (user_key) per category for every event type.
# A category with no users for a given event type gets 0 via fill_value.
category_funnel = df.pivot_table(
    index='category_id',
    columns='event_type',
    values='user_key',
    aggfunc='nunique',
    fill_value=0
).reset_index()

# Because of fill_value=0 the denominators below can be 0. Dividing by 0 would
# produce +inf, which sorts to the top of the descending ranking further down
# and is written to the CSV as "inf". Replace zero denominators with NaN so the
# rate is reported as undefined instead (NaN sorts last by default).
funnel_view_users = category_funnel['view'].where(category_funnel['view'] > 0)
funnel_cart_users = category_funnel['cart'].where(category_funnel['cart'] > 0)

# Conversion rates between funnel stages.
# NOTE(review): users are counted independently for each event type, so the
# cart/purchase users are not guaranteed to be a subset of the view users and
# these ratios are not bounded by 1 (the saved output shows values above 1).
# Confirm that this is the intended definition of "conversion rate".
rate_columns = ['view_to_cart_rate', 'cart_to_purchase_rate', 'overall_conversion_rate']
category_funnel['view_to_cart_rate'] = category_funnel['cart'] / funnel_view_users
category_funnel['cart_to_purchase_rate'] = category_funnel['purchase'] / funnel_cart_users
category_funnel['overall_conversion_rate'] = category_funnel['purchase'] / funnel_view_users

# Round the rates to 4 decimal places.
category_funnel[rate_columns] = category_funnel[rate_columns].round(4)

# Save the full per-category table.
category_funnel.to_csv('category_funnel_analysis.csv', index=False)
print("✅ 'category_funnel_analysis.csv' 저장 완료!")

# Preview: the 10 categories with the highest overall conversion rate.
top10 = category_funnel.sort_values(by='overall_conversion_rate', ascending=False).head(10)
print("\n📊 전환률 높은 Top 10 카테고리:")
print(top10[['category_id', 'view', 'cart', 'purchase'] + rate_columns])


✅ 'category_funnel_analysis.csv' 저장 완료!

📊 전환률 높은 Top 10 카테고리:
event_type          category_id   view   cart  purchase  view_to_cart_rate  \
148         1487580009622143014    797   1950       818             2.4467   
424         2193074740552270669    487    721       298             1.4805   
47          1487580006509970331   5621   6924      3314             1.2318   
316         1752742615205281895   2437   3634      1300             1.4912   
147         1487580009605365797   3312   4818      1704             1.4547   
49          1487580006551913373  10260  10371      4946             1.0108   
139         1487580009362096156   3432   3877      1619             1.1297   
394         2055161088059638328   3906   4289      1834             1.0981   
67          1487580007281722301  13430  14132      6276             1.0523   
136         1487580009286598681  26524  29703     12354             1.1199   

event_type  cart_to_purchase_rate  overall_conversion_rate  
148              

In [4]:
# Keep only cart and purchase events, then count distinct users (user_key)
# per category for each of the two event types (0 where there are none).
category_dropoff = df[df['event_type'].isin(['cart', 'purchase'])].pivot_table(
    index='category_id',
    columns='event_type',
    values='user_key',
    aggfunc='nunique',
    fill_value=0
).reset_index()

# Drop-off rate: (cart users - purchase users) / cart users.
# A category can have purchasers but zero cart users (fill_value=0), in which
# case dividing by 0 would write -inf to the CSV. Use NaN as the denominator
# there so the rate is reported as undefined instead.
dropoff_cart_users = category_dropoff['cart'].where(category_dropoff['cart'] > 0)
category_dropoff['cart_dropoff_rate'] = (
    (category_dropoff['cart'] - category_dropoff['purchase']) / dropoff_cart_users
).round(4)

# Save the full per-category table.
category_dropoff.to_csv('category_cart_dropoff.csv', index=False)
print("✅ 'category_cart_dropoff.csv' 저장 완료!")

# Preview: the 10 categories with the highest cart -> purchase drop-off rate.
# NOTE(review): categories with very few cart users (the saved output shows
# cart counts of 1 and 7) reach a rate of 1.0 and dominate this ranking;
# consider whether a minimum cart-user threshold is appropriate.
top10_drop = category_dropoff.sort_values(by='cart_dropoff_rate', ascending=False).head(10)
print("\n📉 이탈률 높은 Top 10 카테고리 (Cart → Purchase):")
print(top10_drop[['category_id', 'cart', 'purchase', 'cart_dropoff_rate']])


✅ 'category_cart_dropoff.csv' 저장 완료!

📉 이탈률 높은 Top 10 카테고리 (Cart → Purchase):
event_type          category_id  cart  purchase  cart_dropoff_rate
180         1487580011828347011     7         0             1.0000
346         2053031020655018687     1         0             1.0000
359         2121383893343929118    35         1             0.9714
29          1487580006157648777    59         2             0.9661
109         1487580008951054345    18         1             0.9444
287         1783999072365969578    48         3             0.9375
4           1487580004983243602    76         5             0.9342
330         1977786601392047073   236        17             0.9280
324         1962525118928257818    41         3             0.9268
284         1783999071199952917    26         2             0.9231
