In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from google.oauth2 import service_account
from google.cloud import bigquery

import warnings
warnings.filterwarnings('ignore')

In [None]:
# BigQuery configuration.
# Path to the service-account key (JSON) used to authenticate.
SERVICE_ACCOUNT_FILE = "./api_key.json"
# Project the client is bound to; change this to your own project.
project_id = "bigquery-test-408414"

# Load the key file into credentials, then build the shared client that
# import_bigquery_data() submits its queries through.
credentials = service_account.Credentials.from_service_account_file(
    SERVICE_ACCOUNT_FILE
)
client = bigquery.Client(project=project_id, credentials=credentials)

def import_bigquery_data(query):
    """Run a SQL query on BigQuery and return the result as a DataFrame.

    Args:
        query: SQL text submitted through the module-level ``client``.

    Returns:
        Whatever ``to_dataframe()`` yields for the submitted query job.
    """
    return client.query(query).to_dataframe()

---

# 사용 데이터 및 목적
- 사용 데이터 : order_items, orders, products, users
- 목적 : 커머스 데이터에서 필수적으로 확인해야 할 지표를 살펴보고, 이를 바탕으로 비즈니스 통찰과 매출을 높이기 위한 인사이트를 얻는다

In [None]:
# Load the four source tables from CSV files (paths are relative), in the
# same order as the names on the left-hand side.
products, order_items, orders, users = map(
    pd.read_csv,
    ('products.csv', 'order_items.csv', 'orders.csv', 'users.csv'),
)

## 1. 매출
- 월별 매출액, 구매수(주문수), 구매자수, 평균 주문 금액(AOV) 및 구매자당 평균 주문수

In [None]:
# Preprocessing: drop cancelled/returned line items so only sales remain.
# .copy() yields an independent frame, so adding the 'month' column below
# does not write into a filtered slice of order_items (the pattern pandas
# reports as SettingWithCopyWarning).
using_order_items = order_items[
    ~order_items['status'].isin(['Cancelled', 'Returned'])
].copy()

# Characters 5-6 of created_at are used as the month; this assumes an
# ISO-like 'YYYY-MM-DD ...' string -- TODO confirm against the CSV.
# The vectorised .str slice leaves a missing created_at as NaN instead of
# raising on it.
# NOTE(review): the year is discarded, so the same month of different years
# would land in one bucket if the data spans more than one year.
using_order_items['month'] = using_order_items['created_at'].str[5:7]
using_order_items = using_order_items[['order_id', 'sale_price', 'month']]

using_orders = orders[['order_id', 'user_id', 'num_of_item']]

# Attach each order's user and item count to its line items.
using_merge_data = pd.merge(
    using_order_items,
    using_orders,
    how='inner',
    on='order_id'
)

# Revenue per merged row, rounded to a whole number.
# NOTE(review): this multiplies a line item's sale_price by the item count of
# the whole order. If order_items already holds one row per unit sold,
# multi-item orders are overstated -- confirm against the table schema.
using_merge_data['revenue'] = (
    using_merge_data['num_of_item'] * using_merge_data['sale_price']
).round()

In [None]:
# Monthly summary: distinct orders, distinct buyers and total revenue.
result1 = (
    using_merge_data
    .groupby('month')
    .agg(
        order_cnt=('order_id', 'nunique'),
        usr_cnt=('user_id', 'nunique'),
        revenue=('revenue', 'sum'),
    )
    .reset_index()
)

# Average order value: revenue per distinct order, rounded to a whole number.
result1['aov'] = (result1['revenue'] / result1['order_cnt']).round()
# Average number of orders per buyer, rounded to two decimals.
result1['평균주문수'] = (result1['order_cnt'] / result1['usr_cnt']).round(2)

In [None]:
# Display the monthly summary table built in the previous cell.
result1

Unnamed: 0,month,order_cnt,usr_cnt,revenue,aov,평균주문수
0,1,3153,3080,469593.0,149.0,1.02
1,2,3256,3175,523314.0,161.0,1.03
2,3,3663,3570,580410.0,158.0,1.03
3,4,3868,3743,615168.0,159.0,1.03
4,5,4361,4191,673755.0,154.0,1.04
5,6,4780,4549,765115.0,160.0,1.05
6,7,5908,5464,962919.0,163.0,1.08
7,8,6259,5192,1025531.0,164.0,1.21


## 2. 주요 고객은 누구인가?
- 출신 국가, 성별, 연령대 분포

In [None]:
# Attach user attributes to every revenue row; the right join keeps all rows
# of using_merge_data.
revenue_usr_data = pd.merge(
    users,
    using_merge_data,
    how='right',
    left_on='id',
    right_on='user_id',
)

# Top 10 (country, gender) pairs by number of distinct users.
(
    revenue_usr_data
    .groupby(['country', 'gender'])[['id']]
    .nunique()
    .sort_values(by='id', ascending=False)
    .head(10)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,id
country,gender,Unnamed: 2_level_1
China,F,4862
China,M,4622
United States,M,3199
United States,F,3118
Brasil,M,2088
Brasil,F,2005
South Korea,F,759
South Korea,M,746
France,F,659
United Kingdom,F,658


In [None]:
# Total revenue per gender.
revenue_usr_data.groupby('gender').agg({'revenue': 'sum'})

Unnamed: 0_level_0,revenue
gender,Unnamed: 1_level_1
F,2808781.0
M,2807024.0


In [None]:
# Bucket each age into a decade label: 0 (under 10), 10, 20, 30, 40, and 50
# for every age of 50 or more. The 50 bucket is open-ended, so its totals
# cover a wider age range than any single decade.
#
# Floor-dividing by 10 gives the decade; clipping to [0, 50] folds ages below
# 0 into 0 and ages of 50+ into 50, which reproduces the labels of the
# previous row-by-row if/elif chain without a Python-level loop.
# Unlike that chain, a missing age stays NaN instead of falling through to
# the 50 bucket, so rows without an age no longer inflate the 50+ figures.
revenue_usr_data['age_category'] = (
    revenue_usr_data['age'] // 10 * 10
).clip(lower=0, upper=50)

In [None]:
# Total revenue per age bucket, largest first.
(
    revenue_usr_data
    .groupby('age_category')
    .agg({'revenue': 'sum'})
    .sort_values('revenue', ascending=False)
)

Unnamed: 0_level_0,revenue
age_category,Unnamed: 1_level_1
50,2002828.0
30,950778.0
40,945827.0
20,928584.0
10,787788.0


In [None]:
# Number of distinct orders per age bucket, largest first.
(
    revenue_usr_data
    .groupby('age_category')
    .agg({'order_id': 'nunique'})
    .sort_values('order_id', ascending=False)
)

Unnamed: 0_level_0,order_id
age_category,Unnamed: 1_level_1
50,12500
30,5862
20,5781
40,5704
10,4763


## 3. 브랜드와 상품 카테고리의 분포

In [None]:
# Revenue and item quantity per brand, for line items that are neither
# cancelled nor returned, ordered by revenue (highest first). Queried from the
# public thelook_ecommerce tables: order_items joined to orders and products.
# NOTE(review): revenue multiplies each order_items row's sale_price by the
# whole order's num_of_item, and quantity sums num_of_item once per joined
# row. If order_items holds one row per unit sold, both figures are
# overstated for multi-item orders -- confirm against the table schema.
brand_category_data = import_bigquery_data('''
    SELECT
        C.brand AS brand,
        ROUND(SUM(A.sale_price * B.num_of_item), 2) AS revenue,
        SUM(B.num_of_item) AS quantity
    FROM `bigquery-public-data.thelook_ecommerce.order_items` A
    INNER JOIN `bigquery-public-data.thelook_ecommerce.orders` B ON A.order_id = B.order_id
    INNER JOIN `bigquery-public-data.thelook_ecommerce.products` C ON A.product_id = C.id
    WHERE A.status NOT IN ('Cancelled', 'Returned')
    GROUP BY C.brand
    ORDER BY revenue DESC
    ;
''')

In [None]:
# Display the first 10 rows; the query already orders by revenue descending,
# so these are the 10 highest-revenue brands.
brand_category_data.head(10)

Unnamed: 0,brand,revenue,quantity
0,Calvin Klein,295598.59,4478
1,Diesel,288618.29,2113
2,True Religion,263295.12,1300
3,7 For All Mankind,259057.76,1633
4,Carhartt,254961.7,3624
5,Tommy Hilfiger,179548.36,2478
6,Volcom,167418.59,2750
7,Joe's Jeans,164641.55,1106
8,Quiksilver,150270.6,2582
9,Columbia,141241.75,2115


In [None]:
# Revenue and item quantity per product category, for line items that are
# neither cancelled nor returned, ordered by revenue (highest first). Same
# joins as the brand query, grouped by products.category instead of brand.
# NOTE(review): revenue multiplies each order_items row's sale_price by the
# whole order's num_of_item, and quantity sums num_of_item once per joined
# row. If order_items holds one row per unit sold, both figures are
# overstated for multi-item orders -- confirm against the table schema.
goods_category_data = import_bigquery_data('''
    SELECT
        C.category AS category,
        ROUND(SUM(A.sale_price * B.num_of_item), 2) AS revenue,
        SUM(B.num_of_item) AS quantity
    FROM `bigquery-public-data.thelook_ecommerce.order_items` A
    INNER JOIN `bigquery-public-data.thelook_ecommerce.orders` B ON A.order_id = B.order_id
    INNER JOIN `bigquery-public-data.thelook_ecommerce.products` C ON A.product_id = C.id
    WHERE A.status NOT IN ('Cancelled', 'Returned')
    GROUP BY C.category
    ORDER BY revenue DESC
    ;
''')

In [None]:
# Display the first 10 rows; the query already orders by revenue descending,
# so these are the 10 highest-revenue categories.
goods_category_data.head(10)

Unnamed: 0,category,revenue,quantity
0,Outerwear & Coats,1878882.82,12918
1,Jeans,1793646.56,18191
2,Sweaters,1193029.88,15977
3,Suits & Sport Coats,932881.04,7286
4,Swim,932857.63,16348
5,Fashion Hoodies & Sweatshirts,918333.05,17152
6,Sleep & Lounge,791512.14,16041
7,Shorts,723847.38,15879
8,Tops & Tees,712285.89,17147
9,Dresses,657368.55,7940


## 4. 취소 및 반품


In [None]:
# Line items whose status is Returned or Cancelled, with the buyer's user
# attributes attached; the left join keeps every such line item.
cancel_return_data = order_items[
    order_items['status'].isin(['Returned', 'Cancelled'])
].merge(users, how='left', left_on='user_id', right_on='id')

In [None]:
# Distinct orders with a cancelled or returned line item, per country,
# largest first.
(
    cancel_return_data
    .groupby('country')
    .agg({'order_id': 'nunique'})
    .sort_values('order_id', ascending=False)
)

Unnamed: 0_level_0,order_id
country,Unnamed: 1_level_1
China,4022
United States,2590
Brasil,1711
South Korea,600
France,567
United Kingdom,526
Spain,519
Germany,487
Australia,283
Japan,261


## 5. 마케팅 채널별 구매 고객 수

In [None]:
# Number of distinct purchasing users per traffic source, largest first.
# Only line items that are neither cancelled nor returned are counted; the
# traffic source comes from the users table joined on the buyer's id.
mkt_channel_data = import_bigquery_data('''
    SELECT
        B.traffic_source AS traffic_source,
        COUNT(DISTINCT A.user_id) AS total_customer
    FROM `bigquery-public-data.thelook_ecommerce.order_items` A
    INNER JOIN `bigquery-public-data.thelook_ecommerce.users` B ON A.user_id = B.id
    WHERE A.status NOT IN ('Cancelled', 'Returned')
    GROUP BY B.traffic_source
    ORDER BY total_customer DESC
''')

In [None]:
# Display the customer count per traffic source.
mkt_channel_data

Unnamed: 0,traffic_source,total_customer
0,Search,46444
1,Organic,9979
2,Facebook,3935
3,Email,3325
4,Display,2690
