### 분석준비

In [1]:
# 필요 라이브러리

# 분석을 위한 라이브러리
import numpy as np
import pandas as pd

# 시각화를 위한 라이브러리
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime
import os

# 에러메세지 안나오게
import warnings
warnings.filterwarnings('ignore')

# pd.options.display.float_format = '{:.2f}'.format
pd.set_option('display.max_columns',None)
# pd.set_option('display.max_rows',None)

from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px

In [2]:
import matplotlib

matplotlib.rcParams['font.family'] ='Malgun Gothic'

matplotlib.rcParams['axes.unicode_minus'] =False

In [3]:
# 디비 연결
import psycopg2
import json

def rds_python_conn():
    # rds와 python 연동을 위한 정보 입력
    with open('postgres.info.json','r') as json_file:
        db_info = json.load(json_file)

    host = db_info.get('host')
    dbname = 'postgres' 
    user = 'postgres' 
    password = db_info.get('password')
    port = 5432

    connection = psycopg2.connect(host=host,
                                  dbname=dbname ,
                                  user=user ,
                                  password=password ,
                                  port=port)
    return connection

### 유입경로별 체결 건수와 비율

In [59]:
SQL = """
SELECT *
FROM mql m full outer join closed_deals j on m.mql_id=j.mql_id
;
"""
mql = pd.read_sql(SQL, rds_python_conn())
mql

Unnamed: 0,mql_id,first_contact_date,landing_page_id,origin,mql_id.1,seller_id,sdr_id,sr_id,won_date,business_segment,lead_type,lead_behaviour_profile,business_type,declared_monthly_revenue
0,5143,2018-02-01,490,social,,,,,NaT,,,,,
1,5819,2017-10-20,247,paid_search,,,,,NaT,,,,,
2,1369,2018-03-22,95,organic_search,,,,,NaT,,,,,
3,312,2018-01-22,435,email,,,,,NaT,,,,,
4,3690,2018-02-21,119,organic_search,3690.0,2187.0,17.0,2.0,2018-02-26 19:58:54,pet,online_medium,cat,reseller,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,2920,2018-05-22,300,paid_search,,,,,NaT,,,,,
7996,3895,2018-03-27,269,paid_search,,,,,NaT,,,,,
7997,7527,2017-08-27,119,organic_search,,,,,NaT,,,,,
7998,639,2017-10-06,489,organic_search,,,,,NaT,,,,,


In [60]:
# 체결 날짜의 연도-월 컬럼 생성
mql['won_date_new'] = pd.to_datetime(mql['won_date']).dt.strftime('%Y-%m')

# 컨택한 날짜의 연도-월 컬럼 생성
mql['first_contact_date_new'] = pd.to_datetime(mql['first_contact_date']).dt.strftime('%Y-%m')

In [61]:
won_counts = mql.groupby('origin')['seller_id'].count().sort_values(ascending=False).reset_index()
won_percentages = round(won_counts['seller_id'] / won_counts['seller_id'].sum() * 100, 1)

# make_subplots를 사용하여 서로 다른 유형의 그래프 생성
fig = make_subplots(specs=[[{"secondary_y": True}]])

# 첫 번째 그래프 추가 (막대 그래프)
fig.add_trace(
    go.Bar(
        x=won_counts['origin'],
        y=won_counts['seller_id'],
        name='체결 건수',
        marker_color='#A0BFE0'  # 막대 그래프 색상 지정
    ),
    secondary_y=False
)

# 두 번째 그래프 추가 (라인 플롯)
fig.add_trace(
    go.Scatter(
        x=won_counts['origin'],
        y=won_percentages,
        name='체결 비율',
        mode='lines+markers',
        line=dict(color='#4A55A2'),  # 라인 플롯 색상 지정
        marker=dict(color='#4A55A2')
    ),
    secondary_y=True
)

# 레이아웃 설정
fig.update_layout(
    title='유입경로별 체결 건수와 비율',  # 전체 그래프의 제목
    yaxis=dict(title='체결 건수'),
    yaxis2=dict(title='체결 비율 (%)', overlaying='y', side='right'),
    legend=dict(),
    width=800
)

fig.show()



### 유입경로별 체결 건수

In [62]:
import chart_studio
chart_studio.tools.set_credentials_file(username='ghktkf7788', api_key='D9ASyMzNm4ltyTqLQ8CH')


from chart_studio.plotly import plot, iplot


won_counts = mql.groupby('origin')['seller_id'].count().sort_values(ascending=False).reset_index()

# make_subplots를 사용하여 서로 다른 유형의 그래프 생성
fig = make_subplots(specs=[[{"secondary_y": True}]])

# 첫 번째 그래프 추가 (막대 그래프)
fig.add_trace(
    go.Bar(
        x=won_counts['origin'],
        y=won_counts['seller_id'],
        name='체결 건수',
        marker_color='#4A55A2'  # 막대 그래프 색상 지정
    ),
    secondary_y=False
)

# 레이아웃 설정
fig.update_layout(
    title='유입경로별 체결 건수',  # 전체 그래프의 제목
    xaxis=dict(title='유입경로'),
    yaxis=dict(title='체결 건수'),
    legend=dict(),
    width=800
)

fig.show()

plot(fig, filename = '유입경로별 체결 건수', auto_open=True)



'https://plotly.com/~ghktkf7788/4/'

### 유입경로별 체결 비율

In [9]:
won_percentages = round(won_counts['seller_id'] / won_counts['seller_id'].sum() * 100, 1)

color = ['#4A55A2','#BAC3F3','#BAC3F3','#BAC3F3','#BAC3F3','#BAC3F3','#BAC3F3','#BAC3F3','#BAC3F3','#BAC3F3']
fig = px.pie(values=won_percentages, names=won_counts['origin'])

fig.update_layout(
    title='유입경로별 체결 비율',  # 전체 그래프의 제목
    legend=dict(),
    width=650
)

fig.update_traces(pull=[0.1, 0, 0, 0],marker_colors=color)

fig.show()
plot(fig, filename = '유입경로별 체결 비율', auto_open=True)


'https://plotly.com/~ghktkf7788/6/'

### 유입경로별 체결,미체결된 잠재고객 비율과 효과

In [11]:
a = (mql.groupby('origin').size()/mql.shape[0]*100).reset_index(name='mql')
b = (mql[mql['won_date'].notnull()]['origin'].value_counts(normalize=True)*100).reset_index(name='closed_deals')
ab = a.merge(b, on='origin')
ab['difference'] = ab['closed_deals'] - ab['mql']
print(ab)

# make_subplots를 사용하여 서로 다른 유형의 그래프 생성
fig = make_subplots(specs=[[{"secondary_y": True}]])

# 첫 번째 그래프 추가 (라인 플롯)
fig.add_trace(
    go.Scatter(
        x=ab['origin'],
        y=ab['mql'],
        name='잠재고객비율',
        marker_color='#A0BFE0'  # 막대 그래프 색상 지정
    ),
    secondary_y=False
)

# 두 번째 그래프 추가 (라인 플롯)
fig.add_trace(
    go.Scatter(
        x=ab['origin'],
        y=ab['closed_deals'],
        name='체결된 잠재고객비율',
        mode='lines+markers',
        line=dict(color='#7895CB'),  # 라인 플롯 색상 지정
        marker=dict(color='#7895CB')
    )
)
# 세 번째 그래프 추가 (라인 플롯)
fig.add_trace(
    go.Scatter(
        x=ab['origin'],
        y=ab['difference'],
        name='유입경로별 효과',
        mode='lines+markers',
        line=dict(color='#4A55A2'),  # 라인 플롯 색상 지정
        marker=dict(color='#4A55A2')
    )
)

# 레이아웃 설정
fig.update_layout(
    title='유입경로별 체결,미체결된 잠재고객 비율과 효과',  # 전체 그래프의 제목
    yaxis=dict(title='비율'),
    legend=dict(),
    width=800
)

fig.show()

chart_studio.tools.set_credentials_file(username='ghktkf7788', api_key='D9ASyMzNm4ltyTqLQ8CH')
plot(fig, filename = '유입경로별 체결,미체결된 잠재고객 비율과 효과', auto_open=True)

              origin      mql  closed_deals  difference
0     direct_traffic   6.2375      6.650831    0.413331
1            display   1.4750      0.712589   -0.762411
2              email   6.1625      1.781473   -4.381027
3     organic_search  28.7000     32.185273    3.485273
4              other   1.8750      0.475059   -1.399941
5  other_publicities   0.8125      0.356295   -0.456205
6        paid_search  19.8250     23.159145    3.334145
7           referral   3.5500      2.850356   -0.699644
8             social  16.8750      8.907363   -7.967637
9            unknown  14.4875     22.921615    8.434115


'https://plotly.com/~ghktkf7788/8/'

### 유입경로별 효과(체결된잠재고객비율-잠재고객비율)

In [14]:
a = (mql.groupby('origin').size() / mql.shape[0] * 100).reset_index(name='mql')
b = (mql[mql['won_date'].notnull()]['origin'].value_counts(normalize=True) * 100).reset_index(name='closed_deals')
ab = a.merge(b, on='origin')
ab['difference'] = ab['closed_deals'] - ab['mql']
ab=ab.sort_values(by='difference',ascending=False)

# 첫 번째 그래프 추가 (막대 그래프)
fig = go.Figure(data=[go.Bar(
    x=ab['origin'],
    y=ab['difference'],
    marker_color='#4A55A2'  # 모든 막대의 색상을 #4A55A2로 설정
)])

# 레이아웃 설정
fig.update_layout(
    title='유입경로별 효과(체결된잠재고객비율-잠재고객비율)',  # 전체 그래프의 제목
    yaxis=dict(title='비율'),
    legend=dict(),
    width=800
)

fig.show()
chart_studio.tools.set_credentials_file(username='ghktkf7788', api_key='D9ASyMzNm4ltyTqLQ8CH')
plot(fig, filename = '유입경로별 효과(체결된잠재고객비율-잠재고객비율)', auto_open=True)

'https://plotly.com/~ghktkf7788/18/'

### 매출파악(구매자 결제건)

In [12]:
# ('delivered','shipped','invoiced','processing') 이경우만 가지고 파악.
SQL = """
SELECT *
FROM "order"
WHERE order_status in ('delivered','shipped','invoiced','processing') and order_purchase_timestamp < '2018-09-01'
order by order_id
"""


order = pd.read_sql(SQL, rds_python_conn())
order

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,0,10071,delivered,2017-02-13 16:12:43,2017-02-13 16:25:09,2017-02-14 10:58:44,2017-03-01 11:16:06,2017-03-24
1,1,36159,delivered,2018-01-07 21:13:42,2018-01-07 21:27:12,2018-01-10 18:36:40,2018-01-15 20:10:13,2018-01-31
2,2,23371,delivered,2018-08-23 13:56:50,2018-08-23 14:10:15,2018-08-23 15:42:00,2018-08-27 18:32:55,2018-09-04
3,3,31171,delivered,2017-12-12 08:39:09,2017-12-12 10:32:39,2017-12-13 18:38:53,2017-12-18 20:09:12,2018-01-11
4,4,53810,delivered,2017-11-08 15:02:42,2017-11-09 15:11:00,2017-11-16 19:34:27,2017-11-21 12:52:26,2017-11-22
...,...,...,...,...,...,...,...,...
98194,99436,27340,delivered,2018-05-08 19:42:13,2018-05-08 20:31:59,2018-05-09 12:39:00,2018-05-16 23:38:52,2018-05-22
98195,99437,87752,delivered,2017-07-05 11:56:09,2017-07-05 17:43:11,2017-07-11 16:34:32,2017-07-19 20:34:51,2017-08-04
98196,99438,92997,delivered,2017-07-05 18:06:31,2017-07-05 18:23:31,2017-07-07 13:49:57,2017-07-10 19:15:30,2017-07-21
98197,99439,97562,delivered,2017-10-27 12:07:08,2017-10-27 12:28:41,2017-10-30 13:21:46,2017-11-14 17:47:07,2017-11-21


In [13]:
SQL = """
SELECT customer_id, customer_unique_id
FROM customers
"""


uniqueid = pd.read_sql(SQL, rds_python_conn())
uniqueid

Unnamed: 0,customer_id,customer_unique_id
0,93596,86343
1,96387,55527
2,71593,44820
3,17589,36016
4,46972,96049
...,...,...
99436,40421,29126
99437,65930,3710
99438,83443,38830
99439,52687,79036


In [57]:
uniqueid.nunique()

customer_id           99441
customer_unique_id    96096
dtype: int64

In [14]:
order_uniqueid = order.merge(uniqueid,on='customer_id',how='left')
order_uniqueid

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,customer_unique_id
0,0,10071,delivered,2017-02-13 16:12:43,2017-02-13 16:25:09,2017-02-14 10:58:44,2017-03-01 11:16:06,2017-03-24,21081
1,1,36159,delivered,2018-01-07 21:13:42,2018-01-07 21:27:12,2018-01-10 18:36:40,2018-01-15 20:10:13,2018-01-31,9745
2,2,23371,delivered,2018-08-23 13:56:50,2018-08-23 14:10:15,2018-08-23 15:42:00,2018-08-27 18:32:55,2018-09-04,53790
3,3,31171,delivered,2017-12-12 08:39:09,2017-12-12 10:32:39,2017-12-13 18:38:53,2017-12-18 20:09:12,2018-01-11,60982
4,4,53810,delivered,2017-11-08 15:02:42,2017-11-09 15:11:00,2017-11-16 19:34:27,2017-11-21 12:52:26,2017-11-22,77280
...,...,...,...,...,...,...,...,...,...
98194,99436,27340,delivered,2018-05-08 19:42:13,2018-05-08 20:31:59,2018-05-09 12:39:00,2018-05-16 23:38:52,2018-05-22,84335
98195,99437,87752,delivered,2017-07-05 11:56:09,2017-07-05 17:43:11,2017-07-11 16:34:32,2017-07-19 20:34:51,2017-08-04,33367
98196,99438,92997,delivered,2017-07-05 18:06:31,2017-07-05 18:23:31,2017-07-07 13:49:57,2017-07-10 19:15:30,2017-07-21,48993
98197,99439,97562,delivered,2017-10-27 12:07:08,2017-10-27 12:28:41,2017-10-30 13:21:46,2017-11-14 17:47:07,2017-11-21,69845


In [20]:
# ('delivered','shipped','invoiced','processing') 이경우만 가지고 파악.
SQL = """
SELECT *
FROM "order" o join payments p on o.order_id=p.order_Id 
WHERE order_status in ('delivered','shipped','invoiced','processing') and order_purchase_timestamp < '2018-09-01'
order by o.order_id
"""


orderpayment = pd.read_sql(SQL, rds_python_conn())
orderpayment = orderpayment.loc[:, ~orderpayment.columns.duplicated()]
orderpayment

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,payment_sequential,payment_type,payment_installments,payment_value
0,0,10071,delivered,2017-02-13 16:12:43,2017-02-13 16:25:09,2017-02-14 10:58:44,2017-03-01 11:16:06,2017-03-24,1,credit_card,2,104.87
1,1,36159,delivered,2018-01-07 21:13:42,2018-01-07 21:27:12,2018-01-10 18:36:40,2018-01-15 20:10:13,2018-01-31,1,credit_card,3,126.54
2,2,23371,delivered,2018-08-23 13:56:50,2018-08-23 14:10:15,2018-08-23 15:42:00,2018-08-27 18:32:55,2018-09-04,1,credit_card,8,279.11
3,3,31171,delivered,2017-12-12 08:39:09,2017-12-12 10:32:39,2017-12-13 18:38:53,2017-12-18 20:09:12,2018-01-11,1,credit_card,7,73.64
4,4,53810,delivered,2017-11-08 15:02:42,2017-11-09 15:11:00,2017-11-16 19:34:27,2017-11-21 12:52:26,2017-11-22,1,credit_card,2,24.68
...,...,...,...,...,...,...,...,...,...,...,...,...
102560,99436,27340,delivered,2018-05-08 19:42:13,2018-05-08 20:31:59,2018-05-09 12:39:00,2018-05-16 23:38:52,2018-05-22,1,credit_card,10,1057.88
102561,99437,87752,delivered,2017-07-05 11:56:09,2017-07-05 17:43:11,2017-07-11 16:34:32,2017-07-19 20:34:51,2017-08-04,1,credit_card,1,264.66
102562,99438,92997,delivered,2017-07-05 18:06:31,2017-07-05 18:23:31,2017-07-07 13:49:57,2017-07-10 19:15:30,2017-07-21,1,credit_card,1,27.77
102563,99439,97562,delivered,2017-10-27 12:07:08,2017-10-27 12:28:41,2017-10-30 13:21:46,2017-11-14 17:47:07,2017-11-21,1,credit_card,1,61.05


In [82]:
# 객단가
# 가설 1 : 매출이 감소한건 객단가가 감소했기 때문이다.
# 전체기간기준
print(orderpayment['payment_value'].sum())
orderpayment['payment_value'].sum()/orderpayment.shape[0]

15738041.370000001


153.44456071759373

In [154]:
# (orderpayment.groupby(pd.to_datetime(orderpayment['order_purchase_timestamp']).dt.strftime('%Y-%m'))['payment_value'].sum()/orderpayment.groupby(pd.to_datetime(orderpayment['order_purchase_timestamp']).dt.strftime('%Y-%m'))['order_id'].size()).plot(label='연도-월별 객단가')
# plt.axhline(y=orderpayment['payment_value'].sum()/orderpayment.shape[0], color='gray', linestyle='--', label='전체객단가평균')
# plt.title('2018년-월별 객단가')
# plt.xlabel('연도-월')
# plt.ylabel('객단가')

# plt.legend()


### 연도-월별 고객의 주문 건수

In [17]:
a = order_uniqueid.groupby(pd.to_datetime(order_uniqueid['order_purchase_timestamp']).dt.strftime('%Y-%m'))['customer_unique_id'].size().reset_index()

# 첫 번째 그래프 추가 (라인 플롯)
fig = px.line(
        x=a['order_purchase_timestamp'],
        y=a['customer_unique_id'],
    )

# 라인 스타일과 색상 설정
fig.update_traces(
    line=dict(color='#4A55A2', dash='solid'),  # 선 스타일과 색상 설정
    marker=dict(color='#4A55A2'),  # 마커 색상 설정
)

# 레이아웃 설정
fig.update_layout(
    title='연도-월별 고객의 주문 건수',  # 전체 그래프의 제목
    xaxis=dict(title='날짜'),  # x축 레이블 설정
    yaxis=dict(title='주문 건수'),
    legend=dict(),
    width=800
)

fig.show()
chart_studio.tools.set_credentials_file(username='ghktkf7788', api_key='D9ASyMzNm4ltyTqLQ8CH')
plot(fig, filename = '연도-월별 고객의 주문 건수', auto_open=True)



'https://plotly.com/~ghktkf7788/10/'

### 연도-월별 매출

In [21]:
a = orderpayment.groupby(pd.to_datetime(orderpayment['order_purchase_timestamp']).dt.strftime('%Y-%m'))['payment_value'].sum().reset_index()

# 첫 번째 그래프 추가 (라인 플롯)
fig = px.line(
        x=a['order_purchase_timestamp'],
        y=a['payment_value'],
    )

# 라인 스타일과 색상 설정
fig.update_traces(
    line=dict(color='#4A55A2', dash='solid'),  # 선 스타일과 색상 설정
    marker=dict(color='#4A55A2'),  # 마커 색상 설정
)

# 레이아웃 설정
fig.update_layout(
    title='연도-월별 매출',  # 전체 그래프의 제목
    xaxis=dict(title='날짜'),  # x축 레이블 설정
    yaxis=dict(title='매출'),
    legend=dict(),
    width=800
)

fig.show()
chart_studio.tools.set_credentials_file(username='ghktkf7788', api_key='D9ASyMzNm4ltyTqLQ8CH')
plot(fig, filename = '연도-월별 매출', auto_open=True)

'https://plotly.com/~ghktkf7788/12/'

### 월별 매출 증가율

In [22]:
SQL = """
SELECT *
FROM "order" o join payments p on o.order_id=p.order_Id 
WHERE order_status in ('delivered','shipped','invoiced','processing') and order_purchase_timestamp < '2018-09-01'
order by o.order_id
"""


orderpayment = pd.read_sql(SQL, rds_python_conn())
orderpayment = orderpayment.loc[:, ~orderpayment.columns.duplicated()]
orderpayment

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,payment_sequential,payment_type,payment_installments,payment_value
0,0,10071,delivered,2017-02-13 16:12:43,2017-02-13 16:25:09,2017-02-14 10:58:44,2017-03-01 11:16:06,2017-03-24,1,credit_card,2,104.87
1,1,36159,delivered,2018-01-07 21:13:42,2018-01-07 21:27:12,2018-01-10 18:36:40,2018-01-15 20:10:13,2018-01-31,1,credit_card,3,126.54
2,2,23371,delivered,2018-08-23 13:56:50,2018-08-23 14:10:15,2018-08-23 15:42:00,2018-08-27 18:32:55,2018-09-04,1,credit_card,8,279.11
3,3,31171,delivered,2017-12-12 08:39:09,2017-12-12 10:32:39,2017-12-13 18:38:53,2017-12-18 20:09:12,2018-01-11,1,credit_card,7,73.64
4,4,53810,delivered,2017-11-08 15:02:42,2017-11-09 15:11:00,2017-11-16 19:34:27,2017-11-21 12:52:26,2017-11-22,1,credit_card,2,24.68
...,...,...,...,...,...,...,...,...,...,...,...,...
102560,99436,27340,delivered,2018-05-08 19:42:13,2018-05-08 20:31:59,2018-05-09 12:39:00,2018-05-16 23:38:52,2018-05-22,1,credit_card,10,1057.88
102561,99437,87752,delivered,2017-07-05 11:56:09,2017-07-05 17:43:11,2017-07-11 16:34:32,2017-07-19 20:34:51,2017-08-04,1,credit_card,1,264.66
102562,99438,92997,delivered,2017-07-05 18:06:31,2017-07-05 18:23:31,2017-07-07 13:49:57,2017-07-10 19:15:30,2017-07-21,1,credit_card,1,27.77
102563,99439,97562,delivered,2017-10-27 12:07:08,2017-10-27 12:28:41,2017-10-30 13:21:46,2017-11-14 17:47:07,2017-11-21,1,credit_card,1,61.05


In [24]:
lis = []
name=[]
a = orderpayment.groupby(pd.to_datetime(orderpayment['order_purchase_timestamp']).dt.strftime('%Y-%m'))['payment_value'].sum()
for i in range(len(a)-1):
    lis.append(round((a[i+1]-a[i])/a[i]*100,1))
    name.append(list(a.index)[i+1])
lis_name = pd.DataFrame({'lis':lis,
                        'name':name})

In [25]:
fig = px.line(
        lis_name[7:],
        x='name',
        y='lis'
    )

# 라인 스타일과 색상 설정
fig.update_traces(
    line=dict(color='#4A55A2'),  # 선 스타일과 색상 설정
    marker=dict(color='#4A55A2'),  # 마커 색상 설정
)

# 레이아웃 설정
fig.update_layout(
    title='매출의 전월 대비 증가율',  # 전체 그래프의 제목
    xaxis=dict(title='날짜'),  # x축 레이블 설정
    yaxis=dict(title='증가율', range=[-40, 60]),  # y축 범위 설정
    legend=dict(),
    width=800
)

fig.show()

chart_studio.tools.set_credentials_file(username='ghktkf7788', api_key='D9ASyMzNm4ltyTqLQ8CH')
plot(fig, filename = '매출의 전월 대비 증가율', auto_open=True)

'https://plotly.com/~ghktkf7788/14/'

### 재구매이력이 있는 고객의 주요구매요일

In [5]:
SQL = """
SELECT *
FROM customers
;
"""
customer = pd.read_sql(SQL, rds_python_conn())
customer

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,93596,86343,14409,franca,SP
1,96387,55527,9790,sao bernardo do campo,SP
2,71593,44820,1151,sao paulo,SP
3,17589,36016,8775,mogi das cruzes,SP
4,46972,96049,13056,campinas,SP
...,...,...,...,...,...
99436,40421,29126,3937,sao paulo,SP
99437,65930,3710,6764,taboao da serra,SP
99438,83443,38830,60115,fortaleza,CE
99439,52687,79036,92120,canoas,RS


In [18]:
SQL = """
SELECT *
FROM "order"
where order_status in ('delivered', 'shipped', 'invoiced', 'processing')
;
"""
order = pd.read_sql(SQL, rds_python_conn())
order

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,13974,65773,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18
1,83165,13301,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13
2,66104,44559,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04
3,64037,69256,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15
4,14183,35294,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26
...,...,...,...,...,...,...,...,...
98195,63791,92532,delivered,2017-03-09 09:54:05,2017-03-09 09:54:05,2017-03-10 11:18:03,2017-03-17 15:08:01,2017-03-28
98196,85704,30689,delivered,2018-02-06 12:58:58,2018-02-06 13:10:37,2018-02-07 23:22:42,2018-02-28 17:37:56,2018-03-02
98197,19817,23177,delivered,2017-08-27 14:46:43,2017-08-27 15:04:16,2017-08-28 20:52:26,2017-09-21 11:24:17,2017-09-27
98198,6896,84231,delivered,2018-01-08 21:28:27,2018-01-08 21:36:21,2018-01-12 15:35:03,2018-01-25 23:32:54,2018-02-15


In [41]:
repurcahse = customer.groupby('customer_unique_id').size().reset_index(name='count')
repurcahse_id = repurcahse[repurcahse['count']>1]
repurcahse_id

Unnamed: 0,customer_unique_id,count
3,3,2
22,22,2
36,36,2
69,69,2
80,80,2
...,...,...
95888,95888,2
95965,95965,2
95991,95991,2
96045,96045,2


In [31]:
repurcahse = customer.groupby('customer_unique_id').size().reset_index(name='count')
repurcahse_id = repurcahse[repurcahse['count']>1]
customer_filtered = customer[customer['customer_unique_id'].isin(repurcahse_id['customer_unique_id'])]
customer_filtered

# 재구매이력이 있는 customer_id

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
5,1911,35886,89254,jaragua do sul,SC
8,87023,93995,81560,curitiba,PR
13,22828,35246,5704,sao paulo,SP
32,48351,12200,42800,camacari,BA
33,94156,23164,27525,resende,RJ
...,...,...,...,...,...
99324,48147,80016,31565,belo horizonte,MG
99327,10527,45334,38050,uberaba,MG
99336,28315,59552,9530,sao caetano do sul,SP
99353,35480,78884,29134,viana,ES


In [39]:
order_filler = order.merge(customer_filtered, on='customer_id')
tmp = order_filler[['order_id','customer_id','order_purchase_timestamp','customer_unique_id']]
tmp['weekday'] = tmp['order_purchase_timestamp'].dt.weekday
tmp

Unnamed: 0,order_id,customer_id,order_purchase_timestamp,customer_unique_id,weekday
0,13974,65773,2017-10-02 10:56:33,39184,0
1,49147,69377,2018-06-07 19:03:12,82658,3
2,23562,55857,2018-01-02 19:00:43,70950,1
3,42089,10689,2018-06-04 16:44:48,30021,0
4,60832,17680,2017-05-14 20:28:25,88858,6
...,...,...,...,...,...
6200,73760,45583,2018-02-06 08:38:54,59737,1
6201,25000,97759,2018-05-07 09:01:58,45493,0
6202,72705,11559,2018-03-17 12:52:37,63141,5
6203,65776,67923,2017-10-02 08:45:35,15121,0


In [63]:
ab=tmp.groupby('weekday')['customer_unique_id'].size().reset_index()

weekday_mapping = {
    0: '월요일',
    1: '화요일',
    2: '수요일',
    3: '목요일',
    4: '금요일',
    5: '토요일',
    6: '일요일'
}
ab['weekday'] = ab['weekday'].map(weekday_mapping)

# Plotly로 그래프 그리기
fig = px.bar(ab, x='weekday', y='customer_unique_id', title='요일별 고객 수')

# 색상 설정
fig.update_traces(marker_color='#4A55A2')

# 레이아웃 설정
fig.update_layout(
    title='재구매고객의 주요구매요일',
    yaxis=dict(title='카운트'),
    legend=dict(),
    width=800
)


fig.show()

chart_studio.tools.set_credentials_file(username='ghktkf7788', api_key='D9ASyMzNm4ltyTqLQ8CH')
plot(fig, filename = '재구매고객의 주요구매요일', auto_open=True)

'https://plotly.com/~ghktkf7788/25/'