In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import pyecharts
import numpy as np

In [2]:
df = pd.read_csv('../Data/UserBehavior.csv', nrows=2000000, header=None, parse_dates=True)

In [3]:
df.columns = ['User_Id', 'Item_Id', 'Category_Id', 'Behavior_Type', 'Time_Stamp']
df.head()

Unnamed: 0,User_Id,Item_Id,Category_Id,Behavior_Type,Time_Stamp
0,1,2268318,2520377,pv,1511544070
1,1,2333346,2520771,pv,1511561733
2,1,2576651,149192,pv,1511572885
3,1,3830808,4181361,pv,1511593493
4,1,4365585,2520377,pv,1511596146


In [4]:
df_time = pd.to_datetime(df['Time_Stamp'], unit='s')
df['Time_Stamp'] = df_time

In [5]:
df.head()

Unnamed: 0,User_Id,Item_Id,Category_Id,Behavior_Type,Time_Stamp
0,1,2268318,2520377,pv,2017-11-24 17:21:10
1,1,2333346,2520771,pv,2017-11-24 22:15:33
2,1,2576651,149192,pv,2017-11-25 01:21:25
3,1,3830808,4181361,pv,2017-11-25 07:04:53
4,1,4365585,2520377,pv,2017-11-25 07:49:06


In [6]:
df['Behavior_Type'] = df.Behavior_Type.astype('category')
df['Time_Stamp'] = df.Time_Stamp.astype('str')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 0 to 1999999
Data columns (total 5 columns):
User_Id          int64
Item_Id          int64
Category_Id      int64
Behavior_Type    category
Time_Stamp       object
dtypes: category(1), int64(3), object(1)
memory usage: 62.9+ MB


In [7]:
df['Date_split'] = df['Time_Stamp'].str.split(' ')

In [8]:
df['Date'] = df.Date_split.str.get(0)
df['Time'] = df.Date_split.str.get(1)

In [10]:
df = df.drop(['Date_split', 'Time_Stamp'], axis=1)

In [11]:
df.head()

Unnamed: 0,User_Id,Item_Id,Category_Id,Behavior_Type,Date,Time
0,1,2268318,2520377,pv,2017-11-24,17:21:10
1,1,2333346,2520771,pv,2017-11-24,22:15:33
2,1,2576651,149192,pv,2017-11-25,01:21:25
3,1,3830808,4181361,pv,2017-11-25,07:04:53
4,1,4365585,2520377,pv,2017-11-25,07:49:06


In [12]:
df.columns = ['User_Id', 'Item_Id', 'Category_Id', 'Behavior_Type', 'Date','Time']
df = df[['Date','Time', 'User_Id', 'Item_Id', 'Category_Id', 'Behavior_Type']]

In [13]:
df.User_Id.sort_values(ascending=False)
df.head()

Unnamed: 0,Date,Time,User_Id,Item_Id,Category_Id,Behavior_Type
0,2017-11-24,17:21:10,1,2268318,2520377,pv
1,2017-11-24,22:15:33,1,2333346,2520771,pv
2,2017-11-25,01:21:25,1,2576651,149192,pv
3,2017-11-25,07:04:53,1,3830808,4181361,pv
4,2017-11-25,07:49:06,1,4365585,2520377,pv


In [14]:
df.Behavior_Type.unique()

[pv, fav, buy, cart]
Categories (4, object): [pv, fav, buy, cart]

In [15]:
df['Date'] = pd.to_datetime(df['Date'])
df['Time'] = pd.to_datetime(df['Time'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 0 to 1999999
Data columns (total 6 columns):
Date             datetime64[ns]
Time             datetime64[ns]
User_Id          int64
Item_Id          int64
Category_Id      int64
Behavior_Type    category
dtypes: category(1), datetime64[ns](2), int64(3)
memory usage: 78.2 MB


In [16]:
total_day = df.Date.nunique()
print(total_day)
print(df.Time.unique())

32
['2020-01-10T17:21:10.000000000' '2020-01-10T22:15:33.000000000'
 '2020-01-10T01:21:25.000000000' ... '2020-01-10T21:24:28.000000000'
 '2020-01-10T21:34:13.000000000' '2020-01-10T21:34:48.000000000']


# Alibaba User Behavior Analysis

By Using AARRR and RFM Model to analyze the process of buying and user behavior.

# 0. 資料概述
* 時間：2017 年 11 月 25 日至 2017 年 12 月 3 日
* 資料類別：
1. 用戶 ID
2. 物品 ID
3. 品類 ID
4. 行為類別
5. 時間戳記

## 1. Building the Analytical Model

AARRR Model 主要作為分析用戶，在收到刺激與最後轉化的效益，以及評估在轉化過程中，是否有部分流程轉化率過低的問題。其中，流程可以分成：
1. 瀏覽
2. 收藏
3. 加入購物車
4. 購買

### A. 用戶總訪問量與用戶數與總用戶數分析
平均每個用戶在這 32 的數據，共有 102.33 次的月瀏覽量，平均每日的瀏覽量共有 3.2 次。

#### 跳離率數據呈現
跳離率：僅一次點擊的用戶總數 / 用戶總量。
資料統計為 32 天，在 19544 個用戶中，只有 55 個人瀏覽過一次便跳離，佔總訪問人數僅有 0.281%。說明淘寶擁有足夠的吸引力，讓用戶停留在 APP 中。

In [17]:
user_count = df.User_Id.nunique()
page_view = len(df['Behavior_Type'] == 'pv')
view_per_person = round(page_view/user_count, 2)
only_one_click = df.groupby(['User_Id'])['Behavior_Type'].count().iloc[:1][1]
print('User Number: ' + str(user_count))
print('Page View: ' + str(page_view))
print('Page View / User (One Month): ' + str(view_per_person) + ' times')
print('Page View / User (One Day): ' + str(round(view_per_person/total_day, 2)) + ' times')
print('Leaving Rate: {:.3f}%'.format(only_one_click/user_count * 100))

User Number: 19544
Page View: 2000000
Page View / User (One Month): 102.33 times
Page View / User (One Day): 3.2 times
Leaving Rate: 0.281%
