# 淘宝用户购物行为数据分析

In [1]:
import pandas as pd

In [42]:
data = pd.read_csv('./user_action/user_action.csv')

In [3]:
data.head()

Unnamed: 0,user_id,item_id,behavior_type,item_category,time
0,98047837,232431562,1,4245,2014-12-06 02
1,97726136,383583590,1,5894,2014-12-09 20
2,98607707,64749712,1,2883,2014-12-18 11
3,98662432,320593836,1,6562,2014-12-06 10
4,98145908,290208520,1,13926,2014-12-16 21


## 任务目标一:从数据集角度分析

In [5]:
# 任务1
## 行数和列数
r = data.shape[0]
c = len(data.columns)

In [12]:
#唯一user_id的数量
data['user_id'].nunique()#去除重复数据之后的数据量

In [6]:
#唯一item_id的数量
data['item_id'].nunique()

2876947

In [7]:
# item_category的唯一值的数量
data['item_category'].nunique()

8916

In [43]:
dicts = {
    1:'浏览',
    2:'收藏',
    3:'加购物车',
    4:'购买'
}

In [44]:
data['behavior_type'] = data['behavior_type'].map(dicts)

In [10]:
data['behavior_type'].value_counts()

behavior_type
浏览      11550581
加购物车      343564
收藏        242556
购买        120205
Name: count, dtype: int64

In [45]:
#统计时间跨度
data['time'] = pd.to_datetime(data['time'])
data['time'].describe()#描述性值来计算时间跨度

count                         12256906
mean     2014-12-04 04:47:28.445699072
min                2014-11-18 00:00:00
25%                2014-11-26 15:00:00
50%                2014-12-04 14:00:00
75%                2014-12-11 23:00:00
max                2014-12-18 23:00:00
Name: time, dtype: object

In [15]:
#缺失值的处理
data.isna().sum()#计算每列缺失值的数量

Unnamed: 0,user_id,item_id,behavior_type,item_category,time
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False
...,...,...,...,...,...
12256901,False,False,False,False,False
12256902,False,False,False,False,False
12256903,False,False,False,False,False
12256904,False,False,False,False,False


In [16]:
#统计重复的行
duplicated_rows = data[data.duplicated()]

In [18]:
data['time'].dt.year.unique()

array([2014], dtype=int32)

In [20]:
data['time'].dt.month

0           12
1           12
2           12
3           12
4           12
            ..
12256901    12
12256902    12
12256903    12
12256904    12
12256905    12
Name: time, Length: 12256906, dtype: int32

In [46]:
data['day_name'] = data['time'].dt.day_name()##统计每天周几
data['month'] = data['time'].dt.month
data['year'] = data['time'].dt.year
data['month_day'] = data['time'].dt.strftime('%m-%d')

In [24]:
data['month_day'] = data['time'].apply(lambda x:x.strftime('%m-%d'))

In [25]:
data['day_name'].value_counts()

day_name
Thursday     1988870
Wednesday    1938026
Tuesday      1935266
Friday       1727332
Sunday       1586614
Monday       1557976
Saturday     1522822
Name: count, dtype: int64

## 任务目标2:PV和UV分析

In [47]:
# 要求进行11月和12月的PV量和UV量,并进行可视化工作
PV_count = data['month'].value_counts().reset_index().values.tolist()

In [54]:
data2 = data.drop_duplicates(subset=['user_id'])

In [55]:
UV_count = data2['month'].value_counts().reset_index().values.tolist()

In [51]:
PV_count

[[12, 7512976], [11, 4743930]]

In [56]:
UV_count

[[12, 6141], [11, 3859]]

In [60]:
itemstyle_pie = {
    'normal': {
        'borderRadius':15,
        'borderWidth':.5,
        'borderColor':'auto',
    }
}

In [59]:
from pyecharts import options as opts
from pyecharts.charts import Bar,Scatter,Pie,Timeline
from pyecharts.globals import ThemeType,ChartType,SymbolType,JsCode

In [63]:
from pyecharts.charts import Pie

pie=(Pie(init_opts=opts.InitOpts(theme=ThemeType.DARK,width='100%',height='95vh'))
    .add(series_name='PV量',data_pair=PV_count,
        rosetype='area', 
        radius=['20%','40%'],
        center=['20%','55%'],
        label_opts=opts.LabelOpts(formatter='{b}:{c}'),
        itemstyle_opts=itemstyle_pie,
        )
     .add(series_name='UV量',data_pair=UV_count,
        rosetype='area', 
        radius=['20%','40%'],
        center=['75%','55%'],
        label_opts=opts.LabelOpts(formatter='{b}:{c}'),
        itemstyle_opts=itemstyle_pie,
        )
     .set_global_opts(
         title_opts=[
              dict(text='PV量',top='53%',left='19%'),
              dict(text='UV量',top='53%',left='74%'),
         ],
         legend_opts=opts.LegendOpts(pos_top='5%'),
         tooltip_opts=opts.TooltipOpts(trigger='item'),
                      )
   )
pie.render('./result/Q1.html')

'C:\\Users\\wang\\Desktop\\pyLearn\\综合实训课程1\\淘宝用户行为分析\\result\\Q1.html'