In [1]:
import pandas as pd
import numpy as np

# import necessary file
from datetime import datetime
from tqdm.notebook import tqdm, tnrange
tqdm.pandas(desc="Loading...")

# Setting the pyecharts config
from pyecharts.globals import CurrentConfig, NotebookType
CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_LAB

# Define pyecharts theme
from pyecharts.globals import ThemeType

# setting pyecharts' charts
from pyecharts.charts import Funnel, Bar, Line, Grid, TreeMap, Page, Pie
from pyecharts import options as opts

In [5]:
#importing all data
df = pd.read_csv('../Data/UserBehavior.csv', header=None, parse_dates=True)
print('Finished loading! The file length is {} rows'.format(len(df)))

Finished loading! The file length is 100150807 rows


In [6]:
# 重新命名欄位
print('Renaming Columns...')

df.columns = ['User_Id', 'Item_Id', 'Category_Id', 'Behavior', 'Time']

print('Finished Renaming column...')

Renaming Columns...
Finished Renaming column...


In [7]:
# 轉換資料型態
df['Behavior'] = df.Behavior.astype('category')
df['Time'] = pd.to_datetime(df['Time'], unit="s")

# 過濾超出時間範圍的資料
print('Filtering data...')

mask_1 = df['Time'] >= datetime(2017,11,25)
mask_2 = df['Time'] <= datetime(2017,12,3)
df = df[mask_1 & mask_2]

print('Filtering data Finished...')

Filtering data...
Filtering data Finished...


## 用戶數據分析內容

### AARRR 模型

In [8]:
# 縮減資料量
df = df[['Time', 'Behavior']]

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86953655 entries, 2 to 100150794
Data columns (total 2 columns):
 #   Column    Dtype         
---  ------    -----         
 0   Time      datetime64[ns]
 1   Behavior  category      
dtypes: category(1), datetime64[ns](1)
memory usage: 1.4 GB


In [24]:
# 轉換資料
print('Transforming the data to seperate column as Week and Hour')
df['Date'] = df.Time.apply(lambda time: '{}-{}-{}'.format(time.year, time.month, time.day))
# %time df['Date'] = df.Time.dt.strftime("%Y-%m-%d")

Transforming the data to seperate column as Week and Hour


KeyboardInterrupt: 

In [22]:
%time df[:20]['Time'].apply(lambda time: '{}-{}-{}'.format(time.year, time.month, time.day))

CPU times: user 2.04 ms, sys: 42 µs, total: 2.08 ms
Wall time: 3.16 ms


2     2017-11-25
3     2017-11-25
4     2017-11-25
5     2017-11-25
6     2017-11-25
7     2017-11-26
8     2017-11-26
9     2017-11-26
10    2017-11-27
11    2017-11-27
12    2017-11-27
13    2017-11-27
14    2017-11-27
15    2017-11-28
16    2017-11-28
17    2017-11-28
18    2017-11-28
19    2017-11-28
20    2017-11-28
21    2017-11-28
Name: Time, dtype: object

In [None]:
%time df['Hour'] = df.Time.dt.strftime("%H").astype('category')

In [None]:
%time df['Week'] = df.Time.dt.strftime("%A").astype('category')
print('Finished Transforming!')