In [1]:
import pandas as pd
import numpy as np

# import necessary file
from datetime import datetime
from tqdm.notebook import tqdm, tnrange
tqdm.pandas(desc="Loading...")

# Setting the pyecharts config
from pyecharts.globals import CurrentConfig, NotebookType
CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_LAB

# Define pyecharts theme
from pyecharts.globals import ThemeType

# setting pyecharts' charts
from pyecharts.charts import Funnel, Bar, Line, Grid, TreeMap, Page, Pie
from pyecharts import options as opts

In [43]:
# Load the complete, header-less behavior log into one DataFrame.
csv_path = '../Data/UserBehavior_2019.csv'
df = pd.read_csv(csv_path, header=None, parse_dates=True)

row_count = len(df)
print('Finished loading! The file length is {} rows'.format(row_count))

Finished loading! The file length is 100150807 rows


In [44]:
# Give the positional columns descriptive names
print('Renaming Columns...')

column_names = ['User_Id', 'Item_Id', 'Category_Id', 'Behavior', 'Time']
df.columns = column_names

print('Finished Renaming column...')

Renaming Columns...
Finished Renaming column...


In [None]:
# Number of non-null Behavior entries logged for each User_Id
time = df.groupby('User_Id')['Behavior'].count()

In [81]:
# Display the per-user counts stored in `time`.
time

Unnamed: 0_level_0,Time
User_Id,Unnamed: 1_level_1
1,55
2,71
3,53
4,285
5,64
...,...
1018007,162
1018008,49
1018009,311
1018010,80


In [78]:
# t = time.value_counts()
# t.plot(kind='bar')

<function list.count>

In [15]:
# Convert dtypes: Behavior becomes a category, Time (seconds since the epoch) becomes datetime64
df['Behavior'] = df['Behavior'].astype('category')
df['Time'] = pd.to_datetime(df['Time'], unit="s")

# Drop the rows that fall outside the observation window
print('Filtering data...')

# NOTE(review): both bounds are inclusive, so 2017-12-03 contributes only rows
# stamped exactly 00:00:00 — confirm that excluding the rest of that day is intended.
window_start, window_end = datetime(2017, 11, 25), datetime(2017, 12, 3)
mask_1 = df['Time'].ge(window_start)
mask_2 = df['Time'].le(window_end)
df = df.loc[mask_1 & mask_2]

print('Filtering data Finished...')

Filtering data...
Filtering data Finished...


In [16]:
# Summarize the columns, dtypes and memory footprint of the filtered frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86953655 entries, 2 to 100150794
Data columns (total 5 columns):
 #   Column       Dtype         
---  ------       -----         
 0   User_Id      int64         
 1   Item_Id      int64         
 2   Category_Id  int64         
 3   Behavior     category      
 4   Time         datetime64[ns]
dtypes: category(1), datetime64[ns](1), int64(3)
memory usage: 3.3 GB


In [17]:
# Preview the first rows of the filtered frame.
df.head()

Unnamed: 0,User_Id,Item_Id,Category_Id,Behavior,Time
2,1,2576651,149192,pv,2017-11-25 01:21:25
3,1,3830808,4181361,pv,2017-11-25 07:04:53
4,1,4365585,2520377,pv,2017-11-25 07:49:06
5,1,4606018,2735466,pv,2017-11-25 13:28:01
6,1,230380,411153,pv,2017-11-25 21:22:22


## 用戶數據分析內容

### AARRR 模型

In [18]:
# Reduce the working set to the two columns the time-based charts need.
# .copy() makes df_behav an independent frame, so the Date / Hour / Week columns
# added to it afterwards do not trigger pandas' SettingWithCopyWarning.
df_behav = df[['Time', 'Behavior']].copy()

In [19]:
# Check the dtypes and memory footprint of the reduced frame.
df_behav.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86953655 entries, 2 to 100150794
Data columns (total 2 columns):
 #   Column    Dtype         
---  ------    -----         
 0   Time      datetime64[ns]
 1   Behavior  category      
dtypes: category(1), datetime64[ns](1)
memory usage: 1.4 GB


In [None]:
# Derive calendar columns from Time; this cell adds Date as a 'YYYY-MM-DD' string category.
# %time reports how long the conversion takes.
print('Transforming the data to seperate column as Week and Hour')
%time df_behav['Date'] = df_behav.Time.dt.strftime("%Y-%m-%d").astype('category')

In [None]:
# Hour of day formatted with %H (zero-padded two-digit string), stored as a category.
%time df_behav['Hour'] = df_behav.Time.dt.strftime("%H").astype('category')

In [None]:
# Weekday name formatted with %A, stored as a category in the column named Week.
%time df_behav['Week'] = df_behav.Time.dt.strftime("%A").astype('category')
print('Finished Transforming!')

In [None]:
# Inspect df_behav again now that the Date, Hour and Week columns exist.
df_behav.info()

In [None]:
# Load the JavaScript files that Pyecharts needs to render charts in the notebook
Bar().load_javascript()

In [None]:
def week_behav(col, data=None):
    '''
    Return the daily event counts of one behavior for 2017-11-26 .. 2017-12-02.

    Parameters
    ----------
    col : str
        Behavior to count: 'buy', 'cart', 'fav' or 'pv'.
    data : pandas.DataFrame, optional
        Frame with a datetime 'Time' column and a 'Behavior' column.
        Defaults to the notebook-level ``df_behav``.

    Returns
    -------
    list of int
        Exactly seven counts, one per calendar day in chronological order.
        A day without matching events is reported as 0, so the list always
        lines up with a seven-label weekday axis.

    Raises
    ------
    KeyError
        If ``col`` is not one of the four behavior names.
    '''
    if col not in ('buy', 'cart', 'fav', 'pv'):
        raise KeyError(col)

    frame = df_behav if data is None else data

    week_start = pd.Timestamp("2017-11-26")
    week_end = week_start + pd.Timedelta(days=7)  # exclusive upper bound (2017-12-03)
    week_days = pd.date_range(week_start, periods=7, freq="D")

    # Keep only the timestamps of the requested behavior inside the week.
    stamps = frame.loc[frame['Behavior'] == col, 'Time']
    stamps = stamps[(stamps >= week_start) & (stamps < week_end)]

    # Count per calendar day, then reindex onto the fixed seven-day axis so that
    # missing days become 0 instead of silently shortening the result.
    daily = stamps.dt.normalize().value_counts().reindex(week_days, fill_value=0)
    return daily.to_list()

In [None]:
def week_chart():
    '''
    Build the two-panel weekly chart: stacked bars of buy / cart / fav counts
    on top and a line of page-view counts below, both keyed by weekday label.
    '''
    week_day = ['Sun', 'Mon', 'Tues', 'Wed', 'Thur', 'Fri', 'Sat']

    # Upper panel: the three behaviors share one stack per weekday
    bar = Bar().add_xaxis(week_day)
    stacked_series = (("購買量", "buy"), ("購物車數量", "cart"), ("我的最愛", "fav"))
    for series_name, behavior in stacked_series:
        bar = bar.add_yaxis(series_name, week_behav(behavior), stack="stack_1", category_gap=60)
    bar = bar.set_series_opts(label_opts=opts.LabelOpts(is_show=True))
    bar = bar.set_global_opts(
        title_opts=opts.TitleOpts(title="用戶變化週期曲線"),
        legend_opts=opts.LegendOpts(pos_top="5%"),
    )

    # Lower panel: page views as a line
    line = Line().add_xaxis(week_day)
    line = line.add_yaxis("瀏覽人次", week_behav("pv"))
    line = line.set_global_opts(
        title_opts=opts.TitleOpts(title="用戶瀏覽人次", pos_top="48%"),
        legend_opts=opts.LegendOpts(pos_top="50%"),
    )

    # Combine the two charts into a single grid
    grid = Grid()
    grid = grid.add(bar, grid_opts=opts.GridOpts(pos_bottom="60%"))
    grid = grid.add(line, grid_opts=opts.GridOpts(pos_top="60%"))
    return grid

week_chart().render_notebook()

### 每天用戶行為變化：

In [None]:
def day_behav(date, col, data=None) -> list:
    '''
    Return the hourly event counts of one behavior for a single calendar day.

    Parameters
    ----------
    date : str or datetime-like
        Day to inspect, e.g. '2017-12-02'; any time-of-day part is ignored.
    col : str
        Behavior to count: 'buy', 'cart', 'fav' or 'pv'.
    data : pandas.DataFrame, optional
        Frame with a datetime 'Time' column and a 'Behavior' column.
        Defaults to the notebook-level ``df_behav``.

    Returns
    -------
    list of int
        Exactly 24 counts for hours 0..23; an hour without matching events
        is reported as 0, so the list always lines up with a 24-slot axis.

    Raises
    ------
    KeyError
        If ``col`` is not one of the four behavior names.
    '''
    if col not in ('buy', 'cart', 'fav', 'pv'):
        raise KeyError(col)

    frame = df_behav if data is None else data

    day_start = pd.to_datetime(date).normalize()
    day_end = day_start + pd.Timedelta(days=1)  # exclusive upper bound

    # Keep only the timestamps of the requested behavior inside the day.
    stamps = frame.loc[frame['Behavior'] == col, 'Time']
    stamps = stamps[(stamps >= day_start) & (stamps < day_end)]

    # Count per hour, then reindex onto 0..23 so that quiet hours become 0.
    hourly = stamps.dt.hour.value_counts().reindex(range(24), fill_value=0)
    return hourly.to_list()

In [None]:
def day_chart(date):
    '''
    Build the hourly two-panel chart for one day: stacked bars of
    buy / cart / fav counts on top and a line of page-view counts below.

    `date` is passed on to day_behav and embedded in the bar chart title.
    '''
    hours = list(range(24))

    # Upper panel: the three behaviors share one stack per hour
    bar = Bar().add_xaxis(hours)
    stacked_series = (("購買量", 'buy'), ("購物車數量", 'cart'), ("我的最愛", 'fav'))
    for series_name, behavior in stacked_series:
        bar = bar.add_yaxis(series_name, day_behav(date, behavior), stack="stack_1", category_gap=10)
    bar = bar.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    bar = bar.set_global_opts(
        title_opts=opts.TitleOpts(title="{} 用戶變化週期曲線".format(date)),
        legend_opts=opts.LegendOpts(pos_top="5%"),
    )

    # Lower panel: page views as a line
    line = Line().add_xaxis(hours)
    line = line.add_yaxis("瀏覽人次", day_behav(date, 'pv'))
    line = line.set_global_opts(
        title_opts=opts.TitleOpts(title="用戶瀏覽人次", pos_top="48%"),
        legend_opts=opts.LegendOpts(pos_top="50%"),
    )

    # Combine the two charts into a single grid
    grid = Grid()
    grid = grid.add(bar, grid_opts=opts.GridOpts(pos_bottom="60%"))
    grid = grid.add(line, grid_opts=opts.GridOpts(pos_top="60%"))
    return grid

day_chart('2017-12-02').render_notebook()

## 產品銷售樹圖
### 銷售最佳排行

In [None]:
import json

# Lookup table for the item rankings: only the Item_Id and Behavior columns of df
df_item = df[['Item_Id', 'Behavior']]

def search_item_sales(behavior_Type, top_n=20, data=None):
    '''
    Rank items by how often one behavior was recorded for them.

    Parameters
    ----------
    behavior_Type : str
        Behavior to rank by, e.g. 'buy', 'cart', 'fav' or 'pv'.
    top_n : int, default 20
        Maximum number of items to return.
    data : pandas.DataFrame, optional
        Frame with 'Item_Id' and 'Behavior' columns.
        Defaults to the notebook-level ``df_item``.

    Returns
    -------
    list of dict
        Up to ``top_n`` entries of the form ``{'value': count, 'name': item_id}``,
        ordered from most to least frequent. This is the node list that the
        pyecharts TreeMap consumes. Fewer entries (possibly none) are returned
        when fewer distinct items match.
    '''
    if data is None:
        data = df_item

    # Count matching rows per item and keep the most frequent ones.
    # .head() tolerates short results, unlike a fixed-length positional loop.
    counts = (
        data.loc[data['Behavior'] == behavior_Type]
        .groupby('Item_Id')['Behavior']
        .count()
        .sort_values(ascending=False)
        .head(top_n)
    )

    # Convert to plain Python values so the nodes can be serialized to JSON.
    return [
        {'value': int(count), 'name': item_id}
        for item_id, count in zip(counts.index.tolist(), counts.tolist())
    ]

In [None]:
def item_ranking(Behavior_Type) -> TreeMap:
    '''
    Build a TreeMap whose single root node is named after the behavior and
    whose children are the ranked items returned by search_item_sales.
    '''
    root_node = {
        "name": Behavior_Type,
        "children": search_item_sales(Behavior_Type),
    }

    chart = TreeMap(init_opts=opts.InitOpts(theme="light"))
    chart.add(
        "數量",
        [root_node],
        is_selected=True,
        roam='move',
        label_opts=opts.LabelOpts(position='inside'),
    )
    chart.set_global_opts(title_opts=opts.TitleOpts(title="產品銷售商品排行"))
    return chart
item_ranking('buy').render_notebook()

In [None]:
# Show the ranking of items most often added to the cart
item_ranking('cart').render_notebook()

In [None]:
# Show the ranking of items most often added to favorites
item_ranking('fav').render_notebook()

In [None]:
# Show the ranking of items by page-view count
item_ranking('pv').render_notebook()

## RFM Model
### R（Recency：距離最近一次購買的天數）

In [None]:
# Reduce the data to purchase events only; this is the input for the RFM scores.
# df itself has no Date column (Date is only added to df_behav), so selecting it
# raises a KeyError. The day string is derived from Time here instead.
df_rfm = (
    df.loc[df['Behavior'] == 'buy', ['User_Id', 'Time']]
    .assign(Date=lambda purchases: purchases['Time'].dt.strftime("%Y-%m-%d"))
)
df_rfm.info()

In [None]:
# Recency: whole days between the reference date 2017-12-03 and each user's latest purchase.
# The frame is built in one step, so no column is assigned into a sliced copy
# (which would trigger SettingWithCopyWarning).
last_purchase = df_rfm.groupby('User_Id')['Time'].max()
r_pivot = pd.DataFrame({
    'd': last_purchase.index.astype('str'),  # user id stored as text
    'R': (pd.to_datetime('2017-12-03') - last_purchase).dt.days.to_numpy(),
})
r_pivot.info()

### F（Frequency：每位用戶的購買次數）

In [None]:
# Frequency: number of purchase events per user.
# Counting per (user, day) and then summing per user equals a single per-user
# count, so one groupby is enough. It also avoids passing 'Date' to pivot_table
# as both an index key and the aggregated value.
f = (
    df_rfm.groupby('User_Id')['Time']
    .count()
    .rename('F')
    .rename_axis('d')
    .reset_index()
)
# Keep only users with at least one purchase.
f = f[f['F'] > 0].reset_index(drop=True)

In [None]:
# Store the user id column as text, then report the frame layout
f = f.assign(d=f['d'].astype('str'))
f.info()