In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import psycopg2
from scipy import stats
import plotly.express as px
import plotly.graph_objects as go


from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from yellowbrick.cluster import KElbowVisualizer
from mpl_toolkits.mplot3d import Axes3D

In [2]:
import matplotlib

matplotlib.rcParams['font.family'] ='Malgun Gothic'

matplotlib.rcParams['axes.unicode_minus'] =False

In [3]:
# 데이터베이스 연결 설정
conn = psycopg2.connect(database="postgres", user="postgres", password="postgres123", host="postgres.c4brhfvhrm5g.ap-northeast-2.rds.amazonaws.com", port=5432)

# 빈 데이터 프레임 만들기
df_order = pd.DataFrame()
df_order_items = pd.DataFrame()
df_payments = pd.DataFrame()
df_products = pd.DataFrame()
df_customers = pd.DataFrame()
df_sellers = pd.DataFrame()
df_closed_deals = pd.DataFrame()
df_mql = pd.DataFrame()
df_geoloc = pd.DataFrame()
df_reviews = pd.DataFrame()

# DB에서 데이터를 가져와 데이터프레임으로 만들기
def make_df(df, db_table):
    cur = conn.cursor()
    query = 'SELECT * FROM ' + db_table
    cur.execute(query)
    temp = cur.fetchall()
    df = pd.DataFrame(temp, columns=[desc[0] for desc in cur.description])
    cur.close()
    return df

df_order = make_df(df_order, '"order"')
df_order_items = make_df(df_order_items, 'order_items')
df_payments = make_df(df_payments, 'payments')
df_products = make_df(df_products, 'products')
df_customers = make_df(df_customers, 'customers')
df_sellers = make_df(df_sellers, 'sellers')
df_closed_deals = make_df(df_closed_deals, 'closed_deals')
df_mql = make_df(df_mql, 'mql')
df_geoloc = make_df(df_geoloc, 'geoloc')
df_reviews = make_df(df_reviews, 'reviews')

# 데이터 정제

In [4]:
# 2018년 8월까지의 데이터만 사용하기
end_date = '2018-08-31'
df_order = df_order[df_order['order_purchase_timestamp'] <= end_date]

# 조건에 맞는 order_status를 가진 데이터만 추출
order_status = ["delivered", "shipped", "invoiced", "processing"]
df_order_filtered= df_order[df_order["order_status"].isin(order_status)]

In [7]:
print(df_closed_deals.columns)
print('------------------------------------------------------')
print(df_mql.columns)


Index(['mql_id', 'seller_id', 'sdr_id', 'sr_id', 'won_date',
       'business_segment', 'lead_type', 'lead_behaviour_profile',
       'business_type', 'declared_monthly_revenue'],
      dtype='object')
------------------------------------------------------
Index(['mql_id', 'first_contact_date', 'landing_page_id', 'origin'], dtype='object')


In [217]:
df_mql.origin.value_counts()

origin
organic_search       2296
paid_search          1586
social               1350
unknown              1159
direct_traffic        499
email                 493
referral              284
other                 150
display               118
other_publicities      65
Name: count, dtype: int64

## origin분석을 위한 df_origin생성

In [9]:
df_origin = pd.merge(df_closed_deals, df_mql, on='mql_id', how='inner')

In [15]:
 # 유료채널,무료채널로 나누어 'origin_seg'컬럼 생성 및 unknown, dark_traffic 필터링
def classify_origin(origin):
    if origin == 'organic_search':
        return 'free'
    elif origin in ['social', 'paid_search', 'email', 'referral', 'display', 'other_publicities']:
        return 'paid'
    else:
        return 'none' # unknown, dark_traffic 

df_origin['origin_seg'] = df_origin['origin'].apply(classify_origin)
df_origin_filtered = df_origin[df_origin['origin_seg'] != 'none'] # unknown, dark_traffic 필터링

In [16]:
df_origin_filtered.origin_seg.value_counts()

origin_seg
paid    318
free    271
Name: count, dtype: int64

In [17]:
# unknown, ohther은 포함하지 않음
paid_origins = ['paid_search', 'social', 'display','email','referral','other_publicities'] 
free_origins = ['organic_search']

# 유료 유입 경로에 따른 체결률 계산
paid_leads = df_mql[df_mql['origin'].isin(paid_origins)] # 3896명
paid_closed = pd.merge(paid_leads, df_origin_filtered, on='mql_id', how='inner') # 318명
paid_conversion_rate = len(paid_closed) / len(paid_leads)

# 무료 유입 경로에 따른 체결률 계산
free_leads = df_mql[df_mql['origin'].isin(free_origins)] # 2296명
free_closed = pd.merge(free_leads, df_origin_filtered, on='mql_id', how='inner') # 271명
free_conversion_rate = len(free_closed) / len(free_leads)

print(f"유료 유입 경로의 체결률: {paid_conversion_rate:.2%}")
print(f"무료 유입 경로의 체결률: {free_conversion_rate:.2%}")

유료 유입 경로의 체결률: 8.16%
무료 유입 경로의 체결률: 11.80%


In [22]:
# 파이차트 버전 시각화

# 유입경로별 mql 데이터 필터링
paid_mql = df_mql[df_mql['origin'].isin(paid_origins)]
free_mql = df_mql[df_mql['origin'].isin(free_origins)]

# 체결된 mql 데이터
closed_mql_paid = paid_mql.merge(df_closed_deals, on='mql_id', how='inner')
closed_mql_free = free_mql.merge(df_closed_deals, on='mql_id', how='inner')

# 체결률 계산
paid_conversion_rate = len(closed_mql_paid) / len(paid_mql)
free_conversion_rate = len(closed_mql_free) / len(free_mql)

# 시각화 1: 전체 유입경로 중 유료와 무료 각각의 비율 
labels = ['Paid', 'Free']
values = [len(paid_mql), len(free_mql)]
colors_1 = ['#4A55A2', '#A0BFE0']

fig1 = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.3)])
fig1.update_traces(marker=dict(colors=colors_1))
fig1.show()


In [23]:
# bar그래프 버전 시각화

channel_types = ["무료", "유료"]
conversion_rates = [11.803136, 8.162218]
colors = ['#4A55A2', '#6A75BB', '#7895CB', '#A0BFE0']

fig = go.Figure(data=[
    go.Bar(name='전환율', x=channel_types, y=conversion_rates, marker_color=[colors[0], colors[3]])
])

fig.update_layout(
    {
        "title": {
            "text": "<b>무료채널과 유료채널의 전환율</b>",
            "x": 0.5,
            "y": 0.9,
            "font": {"size": 15}
        },
        "xaxis_title": "채널 유형",
        "yaxis_title": "전환율 (%)"
    }
)


fig.show()

# 무료/유료 채널 유입 판매자별 특성분석

## origin_seg별 총 매출과 총 셀러 수

In [24]:
merged_data = pd.merge(df_origin_filtered, df_order_items, on='seller_id', how='left')

# origin_seg별 총 매출
total_sales_per_origin_seg = merged_data.groupby('origin_seg')['price'].sum()

# origin_seg별 총 seller 수 (중복 제거)
total_sellers_per_origin_seg = merged_data.groupby('origin_seg')['seller_id'].nunique()


In [25]:
fig = go.Figure()

# 막대 그래프 추가 (primary y-axis: Total Sales Value)
fig.add_trace(go.Bar(
    x=total_sales_per_origin_seg.index, 
    y=total_sales_per_origin_seg.values, 
    name='Total Sales Value',
    marker_color='blue'
))

# 라인 그래프 추가 (secondary y-axis: Number of Sellers)
fig.add_trace(go.Scatter(
    x=total_sellers_per_origin_seg.index, 
    y=total_sellers_per_origin_seg.values, 
    name='Number of Sellers',
    yaxis='y2',  # secondary y-axis
    mode='lines+markers',
    line=dict(color='red')
))

# 레이아웃 설정
fig.update_layout(
    yaxis=dict(
        title='Total Sales Value',
        titlefont=dict(color='blue'),
        tickfont=dict(color='blue')
    ),
    yaxis2=dict(
        title='Number of Sellers',
        titlefont=dict(color='red'),
        tickfont=dict(color='red'),
        overlaying='y',
        side='right'
    ),
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="left",
        x=0
    )
)

fig.show()


In [26]:
import plotly.graph_objects as go

fig = go.Figure()

# 막대 그래프 추가: Total Sales Value
fig.add_trace(go.Bar(
    x=total_sales_per_origin_seg.index, 
    y=total_sales_per_origin_seg.values, 
    name='Total Sales Value',
    marker_color='blue',
    yaxis='y1',
    offsetgroup=1  # 해당 막대의 위치 조절
))

# 막대 그래프 추가: Number of Sellers
fig.add_trace(go.Bar(
    x=total_sellers_per_origin_seg.index, 
    y=total_sellers_per_origin_seg.values, 
    name='Number of Sellers',
    marker_color='red',
    yaxis='y2',
    offsetgroup=2  # 해당 막대의 위치 조절
))

# 레이아웃 설정
fig.update_layout(
    barmode='group',
    yaxis=dict(
        title='Sales Value'
    ),
    yaxis2=dict(
        title='Seller Count',
        overlaying='y',
        side='right'
    ),
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="left",
        x=0
    )
)

fig.show()



## origin_seg별 가장 많은 Business segment

In [27]:
# origin_seg별 가장 많은 Business segment top1
origin_business_segment = df_origin_filtered.groupby('origin_seg')['business_segment'].value_counts().groupby(level=0).idxmax()
origin_business_segment

origin_seg
free       (free, home_decor)
paid    (paid, health_beauty)
Name: count, dtype: object

In [227]:
## origin_seg별 가장 많은 Business segment top1 시각화

# colors = ['#4A55A2', '#6A75BB', '#7895CB', '#A0BFE0', '#C5DFF8']
# fig1 = go.Figure()

# for idx, origin in enumerate(['paid', 'free']):
#     temp = df_origin_filtered[df_origin_filtered['origin_seg'] == origin]['business_segment'].value_counts().head(5)
#     for i, (business, count) in enumerate(temp.items()):
#         fig1.add_trace(go.Bar(x=[origin], 
#                               y=[count], 
#                               name=business,
#                               marker_color=colors[i]))

# fig1.update_layout(title_text='Top 5 Business Segment 별 Origin Segment', 
#                    xaxis_title='Origin Segment', 
#                    yaxis_title='Count of Business Segment',
#                    barmode='stack')
# fig1.show()


In [34]:
# origin_seg별 가장 많은 Business segment top 10
origin_business_segment_top10 = df_origin_filtered.groupby('origin_seg')['business_segment'].value_counts().groupby(level=0).head(10)
origin_business_segment_top10

origin_seg  business_segment               
free        home_decor                         44
            car_accessories                    26
            audio_video_electronics            21
            health_beauty                      21
            construction_tools_house_garden    19
            household_utilities                19
            sports_leisure                     11
            food_supplement                    11
            pet                                10
            food_drink                         10
paid        health_beauty                      42
            home_decor                         36
            car_accessories                    31
            household_utilities                28
            audio_video_electronics            26
            construction_tools_house_garden    25
            computers                          15
            pet                                12
            food_supplement                    11
      

In [35]:
df_plot = origin_business_segment_top10.reset_index()
df_plot.columns = ['origin_seg', 'business_segment', 'count']

# 상위 10개의 business_segment
top_10_segments = df_origin_filtered['business_segment'].value_counts().head(10).index.tolist()

# 필터링된 데이터 프레임 생성
df_filtered = df_plot[df_plot['business_segment'].isin(top_10_segments)]

# free와 paid 분리
df_free = df_filtered[df_filtered['origin_seg'] == 'free'].set_index('business_segment')
df_paid = df_filtered[df_filtered['origin_seg'] == 'paid'].set_index('business_segment')

# 누락된 business_segment에 대한 처리
for segment in top_10_segments:
    if segment not in df_free.index:
        df_free.loc[segment, 'count'] = 0
    if segment not in df_paid.index:
        df_paid.loc[segment, 'count'] = 0

df_free = df_free.reindex(top_10_segments)
df_paid = df_paid.reindex(top_10_segments)

# 시각화
fig = go.Figure()

fig.add_trace(go.Scatter(x=top_10_segments, y=df_free['count'], mode='lines+markers', name='무료채널', line=dict(color='#4A55A2')))
fig.add_trace(go.Scatter(x=top_10_segments, y=df_paid['count'], mode='lines+markers', name='유료채널', line=dict(color='#A0BFE0')))

fig.update_layout(
    title="유입채널별 상위 10개의 Business Segment ",
    xaxis_title="Business Segment",
    yaxis_title="Count"
)

fig.show()


In [31]:
df_origin_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 589 entries, 0 to 840
Data columns (total 14 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   mql_id                    589 non-null    int64         
 1   seller_id                 589 non-null    int64         
 2   sdr_id                    589 non-null    int64         
 3   sr_id                     589 non-null    int64         
 4   won_date                  589 non-null    datetime64[ns]
 5   business_segment          589 non-null    object        
 6   lead_type                 589 non-null    object        
 7   lead_behaviour_profile    589 non-null    object        
 8   business_type             589 non-null    object        
 9   declared_monthly_revenue  589 non-null    float64       
 10  first_contact_date        589 non-null    datetime64[ns]
 11  landing_page_id           589 non-null    int64         
 12  origin                    5

## origin_seg별 선언 월 매출(declared_monthly_revenue)

In [36]:
avg_monthly_revenue = df_origin_filtered.groupby('origin_seg')['declared_monthly_revenue'].mean()
avg_monthly_revenue 
# 근데 이건 판매자가 직접 본인의 월 매출을 말한거라 의미가 없을 듯 

origin_seg
free    189763.837638
paid     30455.974843
Name: declared_monthly_revenue, dtype: float64

## origin_seg별 월 체결 수

In [38]:
# 월별로 그룹화
df_origin_filtered['month'] = df_origin_filtered['first_contact_date'].dt.to_period('M')
monthly_closures = df_origin_filtered.groupby(['month', 'origin_seg']).size().reset_index(name='count')
monthly_closures 



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,month,origin_seg,count
0,2017-08,free,2
1,2017-08,paid,6
2,2017-09,free,5
3,2017-09,paid,2
4,2017-10,free,1
5,2017-10,paid,10
6,2017-11,free,4
7,2017-11,paid,10
8,2017-12,free,4
9,2017-12,paid,3


In [39]:
# Period 객체를 문자열로 변환
monthly_closures['month'] = monthly_closures['month'].astype(str)

# free와 paid 데이터 분리
free_data = monthly_closures[monthly_closures['origin_seg'] == 'free']
paid_data = monthly_closures[monthly_closures['origin_seg'] == 'paid']

# 시각화
fig = go.Figure()

# free 
fig.add_trace(go.Scatter(
    x=free_data['month'],
    y=free_data['count'],
    mode='lines+markers',
    name='무료채널',
    line=dict(color='#4A55A2')
))

# paid 
fig.add_trace(go.Scatter(
    x=paid_data['month'],
    y=paid_data['count'],
    mode='lines+markers',
    name='유료채널',
    line=dict(color='#A0BFE0')
))

# 모든 월 x축에 표시
all_months = sorted(monthly_closures['month'].unique())
fig.update_layout(
    title="유입채널별 월 체결 판매자 수",
    xaxis_title="Month",
    yaxis_title="Number of Sellers",
    xaxis=dict(
        tickvals=all_months,
        ticktext=all_months,
        tickangle=45
    )
)

fig.show()



## origin_seg별 lead_behaivour_profile

In [40]:
filter_values = ['cat', 'eagle', 'wolf', 'shark']

lead_behaviour_profile = df_origin_filtered[df_origin_filtered['lead_behaviour_profile'].isin(filter_values)]
grouped_lead_behaviour_profile = lead_behaviour_profile.groupby('origin_seg')['lead_behaviour_profile'].value_counts()

grouped_lead_behaviour_profile

origin_seg  lead_behaviour_profile
free        cat                       130
            eagle                      38
            wolf                       26
            shark                       4
paid        cat                       152
            eagle                      56
            wolf                       39
            shark                      13
Name: count, dtype: int64

In [41]:
# 데이터를 리셋 인덱스하여 데이터프레임으로 변환
df_grouped = grouped_lead_behaviour_profile.reset_index(name='count')


fig = go.Figure()

colors = ['#4A55A2', '#6A75BB', '#7895CB', '#A0BFE0']

for idx, animal in enumerate(filter_values):
    # 각 동물에 대한 데이터를 선택
    animal_data = df_grouped[df_grouped['lead_behaviour_profile'] == animal]
    
    fig.add_trace(go.Bar(
        x=animal_data['origin_seg'],
        y=animal_data['count'],
        name=animal,
        marker_color=colors[idx]
    ))


fig.update_layout(
    title="유입채널별 판매자 행동 유형",
    xaxis_title="Channel Type",
    yaxis_title="Count",
    barmode='group'
)

fig.show()


## origin_seg별 배송기간

In [42]:
df_merged = df_origin_filtered.merge(df_order_items, on='seller_id').merge(df_payments, on='order_id')

# df_order와 df_merged를 merge
df_full = pd.merge(df_merged, df_order[['order_id', 'order_purchase_timestamp', 'order_delivered_customer_date']], on='order_id', how='left')

In [44]:
# df_origin_filtered의 seller_id 목록 가져오기
origin_seller_ids = df_origin_filtered['seller_id'].unique()

# df_full에서 해당 seller_id만 필터링
df_full = df_full[df_full['seller_id'].isin(origin_seller_ids)]

# seller_id를 기준으로 중복 제거
df_full_filtered = df_full.drop_duplicates(subset='seller_id', keep='first')


In [45]:
# df_origin_filtered에서 seller_id와 그 외 필요한 컬럼들만 추출
origin_seller_data = df_origin_filtered[['seller_id', 'origin_seg', 'origin', 'business_segment', 'lead_type', 'lead_behaviour_profile', 'mql_id', 'declared_monthly_revenue', 'landing_page_id']]

# df_full에서 df_origin_filtered의 seller_id 목록에 있는 데이터만 가져오기
filtered_data_from_full = df_full[df_full['seller_id'].isin(origin_seller_ids)]

# 위의 데이터와 origin_seller_data를 합치기 (df_full에 없는 seller_id의 경우 origin_seller_data의 정보를 사용)
df_full_filtered = pd.concat([filtered_data_from_full, origin_seller_data]).drop_duplicates(subset='seller_id', keep='first')



In [46]:
df_full_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 589 entries, 0 to 840
Data columns (total 27 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   mql_id                         589 non-null    int64         
 1   seller_id                      589 non-null    int64         
 2   sdr_id                         262 non-null    float64       
 3   sr_id                          262 non-null    float64       
 4   won_date                       262 non-null    datetime64[ns]
 5   business_segment               589 non-null    object        
 6   lead_type                      589 non-null    object        
 7   lead_behaviour_profile         589 non-null    object        
 8   business_type                  262 non-null    object        
 9   declared_monthly_revenue       589 non-null    float64       
 10  first_contact_date             262 non-null    datetime64[ns]
 11  landing_page_id         

In [47]:
# datetime 형식으로 변환
df_full_filtered['order_purchase_timestamp'] = pd.to_datetime(df_full_filtered['order_purchase_timestamp'])
df_full_filtered['order_delivered_customer_date'] = pd.to_datetime(df_full_filtered['order_delivered_customer_date'])

# 배송 시간 계산
df_full_filtered['delivery_duration'] = (df_full_filtered['order_delivered_customer_date'] - df_full_filtered['order_purchase_timestamp']).dt.total_seconds() / 86400  # convert to days

avg_delivery_duration_by_type = df_full_filtered.groupby('origin_seg')['delivery_duration'].mean()
print(avg_delivery_duration_by_type)

origin_seg
free    8.611304
paid    8.756547
Name: delivery_duration, dtype: float64


## origin_seg별 체결까지 기간

In [48]:
# 체결까지 걸리는 기간을 계산
df_origin_filtered['conversion_duration'] = (df_origin_filtered['won_date'] - df_origin_filtered['first_contact_date']).dt.days

# 유료/무료 채널별 체결까지 걸리는 평균 기간 계산
avg_conversion_duration = df_origin_filtered.groupby('origin_seg')['conversion_duration'].mean().reset_index()

# 시각화
import plotly.graph_objects as go

fig = go.Figure()

fig.add_trace(go.Bar(
    x=avg_conversion_duration['origin_seg'],
    y=avg_conversion_duration['conversion_duration'],
    marker_color=['#4A55A2', '#A0BFE0']
))

fig.update_layout(
    title="Average Conversion Duration by Channel Type",
    xaxis_title="Channel Type",
    yaxis_title="Average Duration (Days)"
)

fig.show()




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



# 유입채널별 LTV,CAC

In [49]:
df_orders_payments = df_order.merge(df_payments, on="order_id")

df_orders_full = df_orders_payments.merge(df_order_items, on="order_id")

# 판매자별 총 수익
seller_revenue = df_orders_full.groupby("seller_id")["payment_value"].sum()


df_origin_filtered = df_origin_filtered.merge(seller_revenue, on="seller_id", how="left")

# origin_seg별 LTV
ltv = df_origin_filtered.groupby("origin_seg")["payment_value"].mean().round(2)
ltv


origin_seg
free    2599.21
paid    2357.21
Name: payment_value, dtype: float64

In [50]:
# CAC 

df_origin_filtered["marketing_cost"] = df_origin_filtered["payment_value"] * 0.09

num_sellers = df_origin_filtered["origin_seg"].value_counts()

cac = (df_origin_filtered.groupby("origin_seg")["marketing_cost"].sum()) / num_sellers
cac


origin_seg
free    97.542462
paid    99.402957
dtype: float64

In [52]:
# 'paid' 채널을 통한 판매자의 총 수익에서 마케팅 비용을 계산합니다.
df_origin_filtered.loc[df_origin_filtered["origin_seg"] == "paid", "marketing_cost"] = df_origin_filtered["payment_value"] * 0.09
df_origin_filtered.loc[df_origin_filtered["origin_seg"] == "free", "marketing_cost"] = 0

# 판매자 수를 계산하여 origin_seg 별로 나눕니다.
num_sellers = df_origin_filtered["origin_seg"].value_counts()

# CAC를 계산합니다.
cac2 = (df_origin_filtered.groupby("origin_seg")["marketing_cost"].sum()) / num_sellers
cac2 

origin_seg
free     0.000000
paid    99.402957
dtype: float64

In [53]:
# LTV 계산
ltv = df_origin_filtered.groupby("origin_seg")["payment_value"].sum() / num_sellers

# CAC 계산
df_origin_filtered.loc[df_origin_filtered["origin_seg"] == "paid", "marketing_cost"] = df_origin_filtered["payment_value"] * 0.09
df_origin_filtered.loc[df_origin_filtered["origin_seg"] == "free", "marketing_cost"] = 0
cac = (df_origin_filtered.groupby("origin_seg")["marketing_cost"].sum()) / num_sellers

# ROI 계산
roi = ((ltv - cac) / cac) * 100
roi


origin_seg
free            inf
paid    1011.111111
dtype: float64

## 전체 origin별 ltv,cac,roi

In [286]:
merged_data = pd.merge(df_origin_filtered, df_order_items, on='seller_id', how='left')

# 전체 판매자 수
total_sellers = merged_data['seller_id'].nunique()

# LTV 계산: 전체 매출 / 전체 판매자 수
ltv = merged_data['payment_value'].sum() / total_sellers

# CAC 계산 (마케팅비용 : 매출의  9% 로 가정)
marketing_cost = merged_data['payment_value'].sum() * 0.09
cac = marketing_cost / total_sellers

# ROI 계산
roi = ((ltv - cac) / cac) * 100


In [274]:
print(ltv)
print(cac)
print(roi)

70380.51494057725
6334.246344651952
1011.1111111111112


In [54]:
# 공식
# 평균 구매 금액 = 한 business_segment에 속한 seller들의 매출 합계 / seller 수
# 평균 구매 빈도 = 한 business_segment에 속한 seller들의 판매 횟수 / seller 수
# 평균 고객 수명 = (마지막 판매일 - 첫번째 판매일)의 평균

In [285]:
merged_data = pd.merge(df_origin_filtered, df_order_items, on='seller_id', how='left')

# origin 별 매출과 판매자 수
origin_sales = merged_data.groupby('origin')['payment_value'].sum()
origin_sellers = merged_data.groupby('origin')['seller_id'].nunique()

# LTV 계산: 각 origin 별 매출 / 각 origin 별 판매자 수
ltv_per_origin = origin_sales / origin_sellers

# CAC 계산 (마케팅비용 : 매출의  9% 로 가정)
marketing_cost_per_origin = origin_sales * 0.09
cac_per_origin = marketing_cost_per_origin / origin_sellers


# ROI 계산
roi_per_origin = ((ltv_per_origin - cac_per_origin) / cac_per_origin) * 100


result = pd.DataFrame({
    'LTV': ltv_per_origin,
    'CAC': cac_per_origin,
    'ROI': roi_per_origin
})

result.round(2)


Unnamed: 0_level_0,LTV,CAC,ROI
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
display,1087.97,97.92,1011.11
email,2944.3,264.99,1011.11
organic_search,38004.66,3420.42,1011.11
other_publicities,0.0,0.0,
paid_search,141642.73,12747.85,1011.11
referral,11200.79,1008.07,1011.11
social,42866.89,3858.02,1011.11


In [279]:
origin_sales #각 origin 별 매출

origin
display                  6527.80
email                   44164.46
organic_search       10299262.99
other_publicities           0.00
paid_search          27620332.44
referral               268819.00
social                3215016.61
Name: payment_value, dtype: float64

In [280]:
origin_sellers #각 origin 별 셀러수

origin
display                6
email                 15
organic_search       271
other_publicities      3
paid_search          195
referral              24
social                75
Name: seller_id, dtype: int64

In [294]:
# origin_sales와 origin_sellers를 결합하여 데이터프레임 생성
df_combined = pd.concat([origin_sales, origin_sellers], axis=1)
df_combined.columns = ['Sales', 'Sellers']

# plotly 그래프 생성
fig = go.Figure()

# 막대 그래프 추가 (primary y-axis: Sales Value)
fig.add_trace(go.Bar(
    x=df_combined.index, 
    y=df_combined['Sales'], 
    name='Sales Value',
    marker_color='blue',
))

# 라인 그래프 추가 (secondary y-axis: Number of Sellers)
fig.add_trace(go.Scatter(
    x=df_combined.index, 
    y=df_combined['Sellers'], 
    name='Number of Sellers',
    yaxis='y2',  # secondary y-axis
    mode='lines+markers',
    line=dict(color='red')
))

# 레이아웃 설정
fig.update_layout(
    yaxis=dict(
        title='Sales Value',
        titlefont=dict(color='blue'),
        tickfont=dict(color='blue')
    ),
    yaxis2=dict(
        title='Number of Sellers',
        titlefont=dict(color='red'),
        tickfont=dict(color='red'),
        overlaying='y',
        side='right'
    ),
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="left",
        x=0
    )
)

fig.show()


In [None]:
# ltv 시각화

In [313]:
# "organic search"를 제외하고 LTV 값을 기준으로 정렬
sorted_ltv = ltv_per_origin.drop('organic_search').sort_values(ascending=False)

# "organic search"를 앞에 추가
sorted_ltv = pd.concat([ltv_per_origin[ltv_per_origin.index == 'organic_search'], sorted_ltv])

# 색상을 매핑
colors = ['#4A55A2' if origin != 'organic_search' else '#A0BFE0' for origin in sorted_ltv.index]

fig = go.Figure()

# LTV 막대 그래프 추가
fig.add_trace(go.Bar(
    x=sorted_ltv.index, 
    y=sorted_ltv.values, 
    name='LTV',
    marker_color=colors,
))

# 레이아웃 설정
fig.update_layout(
    title='유입 채널에 따른 고객 생애 가치(LTV)',
    xaxis_title='유입 채널',
    yaxis_title='고객 생애 가치',
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    )
)

fig.show()


In [None]:
#범례추가하기

In [312]:
#cac시각화

cac_per_origin['organic search'] = 0

# "organic search"를 앞에 추가하여 정렬
sorted_cac = cac_per_origin.drop('organic search').sort_values(ascending=False)
sorted_cac = pd.concat([cac_per_origin[cac_per_origin.index == 'organic search'], sorted_cac])

# 시각화
fig = go.Figure()

colors = ['#A0BFE0' if index == 'organic search' else '#4A55A2' for index in sorted_cac.index]

fig.add_trace(go.Bar(x=sorted_cac.index, y=sorted_cac.values, marker_color=colors, name='CAC'))

fig.update_layout(title='유입 채널에 따른 고객 획득 비용(CAC) ',
                  xaxis_title='유입 채널',
                  yaxis_title='고객 획득 비용',
                  barmode='group')

fig.show()


In [315]:
#평균선 삽입버전
# LTV 그래프

avg_ltv = sorted_ltv.mean()

fig = go.Figure()

fig.add_trace(go.Bar(
    x=sorted_ltv.index, 
    y=sorted_ltv.values, 
    name='LTV',
    marker_color=colors,
))

# 평균선 추가
fig.add_shape(
    type="line", 
    x0=-0.5, 
    x1=len(sorted_ltv)-0.5, 
    y0=avg_ltv, 
    y1=avg_ltv,
    line=dict(color="red", dash="dot"),
    name="Average LTV"
)

fig.update_layout(
    title='유입 채널에 따른 고객 생애 가치(LTV)',
    xaxis_title='유입 채널',
    yaxis_title='고객 생애 가치',
    annotations=[dict(
        x=0.85,
        y=avg_ltv,
        xref="paper",
        yref="y",
        text=f"Average LTV: {avg_ltv:.2f}",
        showarrow=True,
        arrowhead=4,
        ax=0,
        ay=-40
    )]
)

fig.show()


# CAC 그래프

avg_cac = sorted_cac.mean()

fig = go.Figure()

fig.add_trace(go.Bar(
    x=sorted_cac.index,
    y=sorted_cac.values, 
    marker_color=colors, 
    name='CAC'
))

# 평균선 추가
fig.add_shape(
    type="line", 
    x0=-0.5, 
    x1=len(sorted_cac)-0.5, 
    y0=avg_cac, 
    y1=avg_cac,
    line=dict(color="red", dash="dot"),
    name="Average CAC"
)

fig.update_layout(
    title='유입 채널에 따른 고객 획득 비용(CAC)',
    xaxis_title='유입 채널',
    yaxis_title='고객 획득 비용',
    annotations=[dict(
        x=0.85,
        y=avg_cac,
        xref="paper",
        yref="y",
        text=f"Average CAC: {avg_cac:.2f}",
        showarrow=True,
        arrowhead=4,
        ax=0,
        ay=-40
    )]
)

fig.show()


In [325]:
# origin 별로 리드 수와 체결된 리드 수 계산
origin_leads = df_mql.groupby('origin').size()
origin_closed = pd.merge(df_mql, df_closed_deals, on='mql_id', how='inner').groupby('origin').size()

# 전환율 계산
conversion_rates = (origin_closed / origin_leads)*100

conversion_rates


origin
direct_traffic       11.222445
display               5.084746
email                 3.042596
organic_search       11.803136
other                 2.666667
other_publicities     4.615385
paid_search          12.295082
referral              8.450704
social                5.555556
unknown              16.652286
dtype: float64

In [320]:
# "organic search"를 제외하고 전환율 값을 기준으로 정렬
sorted_conversion_rates = conversion_rates.drop('organic_search').sort_values(ascending=False)

# "organic search"를 앞에 추가
sorted_conversion_rates = pd.concat([conversion_rates[conversion_rates.index == 'organic_search'], sorted_conversion_rates])

# 시각화
import plotly.graph_objects as go

colors = ['#4A55A2' if index == 'organic_search' else '#A0BFE0' for index in sorted_conversion_rates.index]

fig = go.Figure()

fig.add_trace(go.Bar(x=sorted_conversion_rates.index, y=sorted_conversion_rates.values, marker_color=colors, name='Conversion Rate'))

fig.update_layout(title='유입 채널에 따른 판매자 전환율',
                  xaxis_title='유입 채널',
                  yaxis_title='판매자 전환율 (%)',
                  barmode='group')

fig.show()


In [321]:
# 전환율의 평균 계산
avg_conversion_rate = conversion_rates.mean()

colors = ['#4A55A2' if index == 'organic_search' else '#A0BFE0' for index in sorted_conversion_rates.index]

fig = go.Figure()

fig.add_trace(go.Bar(x=sorted_conversion_rates.index, y=sorted_conversion_rates.values, marker_color=colors, name='Conversion Rate'))

# 평균선 추가
fig.add_shape(
    go.layout.Shape(
        type="line",
        x0=-0.5,
        y0=avg_conversion_rate,
        x1=len(sorted_conversion_rates.index)-0.5,
        y1=avg_conversion_rate,
        line=dict(
            color="red",
            width=1.5,
            dash="dashdot",
        )
    )
)

fig.add_annotation(
    x=len(sorted_conversion_rates.index)/2,
    y=avg_conversion_rate + 1,
    text=f"평균 전환율: {avg_conversion_rate:.2f}%",
    showarrow=False,
    font=dict(color="red")
)

fig.update_layout(title='유입 채널에 따른 판매자 전환율',
                  xaxis_title='유입 채널',
                  yaxis_title='판매자 전환율 (%)',
                  barmode='group')

fig.show()


In [None]:
# unknown, direct_taffic없는 전환률 

In [327]:
# 'unknown'과 'direct_traffic' 제거
conversion_rates_drop = conversion_rates.drop(['unknown', 'direct_traffic'])

# 전환율의 평균 계산
avg_conversion_rate = conversion_rates_drop.mean()

# 정렬
#sorted_conversion_rates = conversion_rates_drop.sort_values(ascending=False)
sorted_conversion_rates = pd.concat([conversion_rates_drop[conversion_rates_drop.index == 'organic_search'], sorted_conversion_rates.drop('organic_search')])

# 시각화

colors = ['#4A55A2' if index == 'organic_search' else '#A0BFE0' for index in sorted_conversion_rates.index]

fig = go.Figure()

fig.add_trace(go.Bar(x=sorted_conversion_rates.index, y=sorted_conversion_rates.values, marker_color=colors, name='Conversion Rate'))

# 평균선 추가
fig.add_shape(
    go.layout.Shape(
        type="line",
        x0=-0.5,
        y0=avg_conversion_rate,
        x1=len(sorted_conversion_rates.index)-0.5,
        y1=avg_conversion_rate,
        line=dict(
            color="red",
            width=1.5,
            dash="dashdot",
        )
    )
)

fig.add_annotation(
    x=len(sorted_conversion_rates.index)/2,
    y=avg_conversion_rate + 1,
    text=f"평균 전환율: {avg_conversion_rate:.2f}%",
    showarrow=False,
    font=dict(color="red")
)

fig.update_layout(title='유입 채널에 따른 판매자 전환율',
                  xaxis_title='유입 채널',
                  yaxis_title='판매자 전환율 (%)',
                  barmode='group')

fig.show()


In [329]:
from plotly.subplots import make_subplots
# 두 그래프를 그리기 위한 subplot 생성
fig = make_subplots(specs=[[{"secondary_y": True}]])

# CAC (꺾은선 그래프) 추가
fig.add_trace(go.Scatter(
    x=sorted_cac.index,
    y=sorted_cac.values,
    mode='lines+markers',
    name='CAC',
    line=dict(color="#4A55A2")
), secondary_y=False)

# 전환율 (막대 그래프) 추가
fig.add_trace(go.Bar(
    x=sorted_conversion_rates.index,
    y=sorted_conversion_rates.values,
    name='Conversion Rate',
    marker_color=colors
), secondary_y=True)

# 레이아웃 업데이트
fig.update_layout(
    title_text="유입 채널에 따른 CAC와 판매자 전환율"
)

# x 축 레이아웃 업데이트
fig.update_xaxes(title_text="유입 채널")

# y 축 레이아웃 업데이트
fig.update_yaxes(title_text="<b>CAC</b>", secondary_y=False)
fig.update_yaxes(title_text="<b>판매자 전환율 (%)</b>", secondary_y=True)

fig.show()

In [330]:
from plotly.subplots import make_subplots

# 'organic_search' 제외
sorted_cac = sorted_cac.drop('organic_search')
sorted_conversion_rates = sorted_conversion_rates.drop('organic_search')
colors = ['#A0BFE0' for index in sorted_conversion_rates.index]

# 두 그래프를 그리기 위한 subplot 생성
fig = make_subplots(specs=[[{"secondary_y": True}]])

# CAC (꺾은선 그래프) 추가
fig.add_trace(go.Scatter(
    x=sorted_cac.index,
    y=sorted_cac.values,
    mode='lines+markers',
    name='CAC',
    line=dict(color="#4A55A2", width=2.5),  # 라인의 굵기를 더 크게 설정
    marker=dict(size=8)  # 마커의 크기를 더 크게 설정
), secondary_y=False)

# 전환율 (막대 그래프) 추가
fig.add_trace(go.Bar(
    x=sorted_conversion_rates.index,
    y=sorted_conversion_rates.values,
    name='Conversion Rate',
    marker=dict(color=colors, opacity=0.6)  # 막대 그래프의 투명도 조절
), secondary_y=True)

# 레이아웃 업데이트
fig.update_layout(
    title_text="유입 채널에 따른 CAC와 판매자 전환율"
)

# x 축 레이아웃 업데이트
fig.update_xaxes(title_text="유입 채널")

# y 축 레이아웃 업데이트
fig.update_yaxes(title_text="<b>CAC</b>", secondary_y=False)
fig.update_yaxes(title_text="<b>판매자 전환율 (%)</b>", secondary_y=True)

fig.show()


In [334]:
sorted_cac

origin
organic search           0.000000
paid_search          12747.845742
social                3858.019932
referral              1008.071250
email                  264.986760
display                 97.917000
other_publicities        0.000000
dtype: float64

In [337]:
# organic_search를 맨 앞으로 정렬
sorted_conversion_rates = pd.concat([conversion_rates_drop[conversion_rates_drop.index == 'organic_search'], sorted_conversion_rates.drop('organic_search')])

fig = go.Figure()

# CAC 꺾은선 그래프 추가
fig.add_trace(go.Scatter(
    x=sorted_cac.index,
    y=sorted_cac.values,
    mode='lines+markers',
    name='CAC',
    line=dict(color="#4A55A2", width=2.5),
    marker=dict(size=8),
    yaxis='y1' # 첫 번째 y축 (왼쪽)
))

# 전환율 막대 그래프 추가
fig.add_trace(go.Bar(
    x=sorted_conversion_rates.index,
    y=sorted_conversion_rates.values,
    name='Conversion Rate',
    marker_color='#A0BFE0',
    yaxis='y2' # 두 번째 y축 (오른쪽)
))

# 레이아웃 설정
fig.update_layout(
    title="유입 채널에 따른 CAC와 판매자 전환율",
    xaxis=dict(title='유입 채널'),
    yaxis=dict(title='CAC', position=0.05),
    yaxis2=dict(title='판매자 전환율 (%)', overlaying='y', side='right'),
    barmode='overlay' # 꺾은선 그래프와 막대 그래프가 겹치도록 설정
)

fig.show()

