# 淘宝用户购物行为数据分析

In [1]:
from calendar import day_name

import pandas as pd

In [2]:
data = pd.read_csv('./user_action/user_action.csv')

In [3]:
data.head()

Unnamed: 0,user_id,item_id,behavior_type,item_category,time
0,98047837,232431562,1,4245,2014-12-06 02
1,97726136,383583590,1,5894,2014-12-09 20
2,98607707,64749712,1,2883,2014-12-18 11
3,98662432,320593836,1,6562,2014-12-06 10
4,98145908,290208520,1,13926,2014-12-16 21


## 任务目标一:从数据集角度分析

In [5]:
# 任务1
## 行数和列数
r = data.shape[0]
c = len(data.columns)

In [12]:
#唯一user_id的数量
data['user_id'].nunique()#去除重复数据之后的数据量

In [6]:
#唯一item_id的数量
data['item_id'].nunique()

2876947

In [7]:
# item_category的唯一值的数量
data['item_category'].nunique()

8916

In [3]:
dicts = {
    1:'浏览',
    2:'收藏',
    3:'加购物车',
    4:'购买'
}

In [4]:
data['behavior_type'] = data['behavior_type'].map(dicts)

In [10]:
data['behavior_type'].value_counts()

behavior_type
浏览      11550581
加购物车      343564
收藏        242556
购买        120205
Name: count, dtype: int64

In [3]:
#统计时间跨度
data['time'] = pd.to_datetime(data['time'])
data['time'].describe()#描述性值来计算时间跨度

count                         12256906
mean     2014-12-04 04:47:28.445699072
min                2014-11-18 00:00:00
25%                2014-11-26 15:00:00
50%                2014-12-04 14:00:00
75%                2014-12-11 23:00:00
max                2014-12-18 23:00:00
Name: time, dtype: object

In [15]:
#缺失值的处理
data.isna().sum()#计算每列缺失值的数量

Unnamed: 0,user_id,item_id,behavior_type,item_category,time
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False
...,...,...,...,...,...
12256901,False,False,False,False,False
12256902,False,False,False,False,False
12256903,False,False,False,False,False
12256904,False,False,False,False,False


In [16]:
#统计重复的行
duplicated_rows = data[data.duplicated()]

In [18]:
data['time'].dt.year.unique()

array([2014], dtype=int32)

In [20]:
data['time'].dt.month

0           12
1           12
2           12
3           12
4           12
            ..
12256901    12
12256902    12
12256903    12
12256904    12
12256905    12
Name: time, Length: 12256906, dtype: int32

In [4]:
data['day_name'] = data['time'].dt.day_name()##统计每天周几
data['month'] = data['time'].dt.month
data['year'] = data['time'].dt.year
data['month_day'] = data['time'].dt.strftime('%m-%d')

In [24]:
data['month_day'] = data['time'].apply(lambda x:x.strftime('%m-%d'))

In [5]:
data['hour'] = data['time'].dt.hour

In [6]:
data.to_csv('./user_action/clear_data.csv')

## 任务目标2:PV和UV分析

In [47]:
# 要求进行11月和12月的PV量和UV量,并进行可视化工作
PV_count = data['month'].value_counts().reset_index().values.tolist()

In [54]:
data2 = data.drop_duplicates(subset=['user_id'])

In [55]:
UV_count = data2['month'].value_counts().reset_index().values.tolist()

In [51]:
PV_count

[[12, 7512976], [11, 4743930]]

In [56]:
UV_count

[[12, 6141], [11, 3859]]

In [60]:
itemstyle_pie = {
    'normal': {
        'borderRadius':15,
        'borderWidth':.5,
        'borderColor':'auto',
    }
}

In [39]:
from pyecharts import options as opts
from pyecharts.charts import Bar,Scatter,Pie,Timeline
from pyecharts.globals import ThemeType,ChartType,SymbolType,JsCode

In [63]:
from pyecharts.charts import Pie

pie=(Pie(init_opts=opts.InitOpts(theme=ThemeType.DARK,width='100%',height='95vh'))
    .add(series_name='PV量',data_pair=PV_count,
        rosetype='area', 
        radius=['20%','40%'],
        center=['20%','55%'],
        label_opts=opts.LabelOpts(formatter='{b}:{c}'),
        itemstyle_opts=itemstyle_pie,
        )
     .add(series_name='UV量',data_pair=UV_count,
        rosetype='area', 
        radius=['20%','40%'],
        center=['75%','55%'],
        label_opts=opts.LabelOpts(formatter='{b}:{c}'),
        itemstyle_opts=itemstyle_pie,
        )
     .set_global_opts(
         title_opts=[
              dict(text='PV量',top='53%',left='19%'),
              dict(text='UV量',top='53%',left='74%'),
         ],
         legend_opts=opts.LegendOpts(pos_top='5%'),
         tooltip_opts=opts.TooltipOpts(trigger='item'),
                      )
   )
pie.render('./result/Q1.html')

'C:\\Users\\wang\\Desktop\\pyLearn\\综合实训课程1\\淘宝用户行为分析\\result\\Q1.html'

In [8]:
from pyecharts.globals import CurrentConfig  # 加载全局配置
CurrentConfig.ONLINE_HOST = "http://127.0.0.1:8000/assets/"  # 设定静态资源地址
from pyecharts.datasets import register_url  # 部分地图文件注册
# 注册本地服务器的地图数据源
register_url("http://127.0.0.1:8000/") 

In [9]:
dp4 = data.groupby(['month_day'])['user_id'].agg([('pv','count'),('uv','nunique')])

In [16]:
dp4

Unnamed: 0_level_0,pv,uv
month_day,Unnamed: 1_level_1,Unnamed: 2_level_1
11-18,366701,6343
11-19,358823,6420
11-20,353429,6333
11-21,333104,6276
11-22,361355,6187
11-23,382702,6373
11-24,378342,6513
11-25,370239,6351
11-26,360896,6357
11-27,371384,6359


In [10]:
dp4_pv = dp4['pv'].reset_index().values.tolist()
dp4_uv = dp4['uv'].reset_index().values.tolist()

Index(['11-18', '11-19', '11-20', '11-21', '11-22', '11-23', '11-24', '11-25',
       '11-26', '11-27', '11-28', '11-29', '11-30', '12-01', '12-02', '12-03',
       '12-04', '12-05', '12-06', '12-07', '12-08', '12-09', '12-10', '12-11',
       '12-12', '12-13', '12-14', '12-15', '12-16', '12-17', '12-18'],
      dtype='object', name='month_day')

In [14]:
bar_itemstyle1 = {
    'normal': {
        'color': JsCode("""new echarts.graphic.LinearGradient(0, 0, 1, 1, [
          { offset: 0, color: 'rgb(49, 141, 234)' },
          { offset: 1, color: 'rgb(224, 62, 76)' }
        ],)"""),
        'barBorderRadius': [10,10,10,10],      #  柱子的四个角圆角设计​
        'shadowColor':'rgba(108,80,243,0.9)', # 阴影的颜色​
        'shadowBlur':5, # 阴影的宽度​
    }
}
bar_itemstyle = {
    'normal': {
        'color': JsCode("""new echarts.graphic.LinearGradient(0, 0, 1, 1, [
          { offset: 0, color: 'rgb(255, 191, 0)' },
          { offset: 1, color: 'rgb(224, 62, 76)' }
        ],)"""),
        'barBorderRadius': [10,10,10,10],      #  柱子的四个角圆角设计
        'shadowColor':'rgba(108,80,243,0.9)', # 阴影的颜色
        'shadowBlur':5, # 阴影的宽度
    }
}

line_itemstyle1 = {
    'normal': {
        'color': JsCode("""new echarts.graphic.LinearGradient(0, 0, 1, 1, [
          { offset: 0, color: 'rgb(49, 141, 234)' },
          { offset: 1, color: 'rgb(224, 62, 76)' }
        ],)"""),
        'barBorderRadius': [10,10,10,10],      #  柱子的四个角圆角设计​
        'shadowColor':'rgba(108,80,243,0.9)', # 阴影的颜色​
        'shadowBlur':5, # 阴影的宽度​
        'width':2,
    }
}
line_itemstyle = {
    'normal': {
        'color': JsCode("""new echarts.graphic.LinearGradient(0, 0, 1, 1, [
          { offset: 0, color: 'rgb(255, 191, 0)' },
          { offset: 1, color: 'rgb(224, 62, 76)' }
        ],)"""),
        'barBorderRadius': [10,10,10,10],      #  柱子的四个角圆角设计
        'shadowColor':'rgba(108,80,243,0.9)', # 阴影的颜色
        'shadowBlur':5, # 阴影的宽度
                'width':2,
    }
}

In [15]:
from pyecharts.charts import Line,Grid
bar = (Bar()
    .add_xaxis([i[0] for i in dp4_pv])
    .add_yaxis('pv',[i[1] for i in dp4_pv],z=0,
            itemstyle_opts=bar_itemstyle,
              )
    .set_global_opts(tooltip_opts=opts.TooltipOpts(trigger='axis',axis_pointer_type='cross'),
            axispointer_opts=opts.AxisPointerOpts(is_show=True,link=
                    [{'xAxisIndex':'all'},],),
            datazoom_opts=opts.DataZoomOpts(xaxis_index=[0,1],type_='inside'),
            xaxis_opts=opts.AxisOpts(splitline_opts={'show':False},
                        axislabel_opts=opts.LabelOpts(font_weight='bold',color = 'white')),
            yaxis_opts=opts.AxisOpts(splitline_opts={'show':False},
                        axislabel_opts=opts.LabelOpts(font_weight='bold',color = 'white'),)                         
                                                 
                    
                                     
                    )
    .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
      )
bar1 = (Bar()
    .add_xaxis([i[0] for i in dp4_uv])
    .add_yaxis('uv',[i[1] for i in dp4_uv],z=0,
            itemstyle_opts=bar_itemstyle1,
              )
        
    .set_global_opts(legend_opts=opts.LegendOpts(pos_left='center',pos_top='5%' ))
    .set_series_opts(label_opts=opts.LabelOpts(is_show=False))    
      )
line = (Line()
    .add_xaxis([i[0] for i in dp4_pv])
    .add_yaxis('pv-line',[i[1] for i in dp4_pv],z=1,
              linestyle_opts=line_itemstyle
              )
    .set_series_opts(label_opts=opts.LabelOpts(is_show=False))   
       )
line1 = (Line()
    .add_xaxis([i[0] for i in dp4_uv])
    .add_yaxis('uv-line',[i[1] for i in dp4_uv],z=1,
                            linestyle_opts=line_itemstyle1
              )
    .set_series_opts(label_opts=opts.LabelOpts(is_show=False))   
       )
bar.overlap(line)
bar1.overlap(line1)
grid = (Grid()
    .add(bar,opts.GridOpts(pos_left='7%',pos_top='15%',pos_bottom='53%',pos_right='3%'))
    .add(bar1,opts.GridOpts(pos_left='7%',pos_top='55%',pos_bottom='5%',pos_right='3%'))
       )
grid.render('./result/Q2_按天分析.html')

'C:\\Users\\wang\\Desktop\\pyLearn\\综合实训课程1\\淘宝用户行为分析\\result\\Q2_按天分析.html'

In [27]:
hour_pv = data.groupby(by = 'hour')['user_id'].count().reset_index().values.tolist()


In [30]:
hour_uv = data.groupby(by = 'hour')['user_id'].nunique().reset_index().values.tolist()

In [61]:
from pyecharts.charts import Line,Grid
time_list = [f'{hour:02d}:00' for hour in range(24)]
line = (Line(init_opts=opts.InitOpts(theme=ThemeType.DARK))
       .add_xaxis(time_list)
         .add_yaxis('每小时的UV量',[i[1] for i in hour_uv],
                   linestyle_opts=line_itemstyle1,
                    is_smooth=True,
                    markline_opts=opts.MarkLineOpts(data=[
                   opts.MarkLineItem(type_='average',name='均值线')
               ]))
       .add_yaxis('每小时的PV量',[i[1] for i in hour_pv],
                  yaxis_index=1,
                  linestyle_opts=line_itemstyle,
                  is_smooth=True,
                  markline_opts=opts.MarkLineOpts(data=[
                   opts.MarkLineItem(type_='average',name='均值线')
               ]
                  ))
        .extend_axis(yaxis=opts.AxisOpts(type_ = 'value',position='right'))
        .set_series_opts(label_opts=opts.LabelOpts(is_show=False),
                         ) 
        .set_global_opts(            
            yaxis_opts=opts.AxisOpts(max_ = 12000),
        tooltip_opts=opts.TooltipOpts(trigger='axis'),
    xaxis_opts=opts.AxisOpts(splitline_opts={'show':False},
                        axislabel_opts=opts.LabelOpts(rotate = 45,font_weight='bold',color = 'white')),
)
      

)
line.render('./result/hour_pv.html')

'C:\\Users\\wang\\Desktop\\pyLearn\\综合实训课程1\\淘宝用户行为分析\\result\\hour_pv.html'

In [28]:
day_name_pv = data.groupby(by = ['day_name','hour'])['user_id'].count().reset_index().values.tolist()
day_name_uv = data.groupby(by = ['day_name','hour'])['user_id'].nunique().reset_index().values.tolist()

In [72]:
data.groupby(by = ['day_name','hour'])['user_id'].count().reset_index()

Unnamed: 0,day_name,hour,user_id
0,Friday,0,94556
1,Friday,1,44804
2,Friday,2,23410
3,Friday,3,14848
4,Friday,4,12818
...,...,...,...
163,Wednesday,19,118976
164,Wednesday,20,153873
165,Wednesday,21,177736
166,Wednesday,22,172632


In [30]:
weeks = data['day_name'].unique().tolist()

In [None]:
day_name_pv

In [31]:
weeks

['Saturday', 'Tuesday', 'Thursday', 'Wednesday', 'Friday', 'Monday', 'Sunday']

In [27]:
hours = [
    '12a', '1a', '2a', '3a', '4a', '5a', '6a',
    '7a', '8a', '9a', '10a', '11a',
    '12p', '1p', '2p', '3p', '4p', '5p',
    '6p', '7p', '8p', '9p', '10p', '11p'
]

In [34]:
dp6 = data.groupby(['day_name','hour'])['user_id'].agg(
    [('pv', 'count'),('uv', 'nunique')]).reset_index()

from sklearn.preprocessing import MinMaxScaler
sca = MinMaxScaler(feature_range=(3,40))
dp6[['pv_s','uv_s']] = sca.fit_transform(dp6[['uv','pv']]).astype('int')
weekday_mapping = {
    'Monday': '周一',
    'Tuesday': '周二',
    'Wednesday': '周三',
    'Thursday': '周四',
    'Friday': '周五',
    'Saturday': '周六',
    'Sunday': '周日'
}
dp6['day_name'] = dp6['day_name'].map(weekday_mapping)

In [49]:
[int(i[2] )for i in day_name_pv]

[94556,
 44804,
 23410,
 14848,
 12818,
 14421,
 28124,
 48088,
 60178,
 72053,
 81840,
 72393,
 76553,
 85377,
 80673,
 80580,
 76066,
 69310,
 74671,
 96247,
 118266,
 139113,
 141212,
 121731,
 60981,
 31162,
 16812,
 11462,
 8802,
 10418,
 16955,
 31167,
 44525,
 57073,
 68744,
 69080,
 72408,
 81993,
 77520,
 80477,
 73684,
 64246,
 69570,
 97080,
 125400,
 143508,
 140237,
 104672,
 67049,
 34015,
 19445,
 13198,
 9937,
 11233,
 22779,
 39368,
 54273,
 60866,
 67239,
 63525,
 60432,
 69466,
 71945,
 73470,
 72160,
 64951,
 68504,
 89811,
 114534,
 131266,
 134449,
 108907,
 67631,
 37366,
 20379,
 14324,
 10084,
 9663,
 18541,
 39527,
 58235,
 66235,
 71752,
 67308,
 61800,
 71075,
 74819,
 75909,
 78299,
 65493,
 73194,
 94683,
 120171,
 141942,
 141717,
 106467,
 76750,
 39989,
 21456,
 15763,
 13588,
 13979,
 24343,
 43767,
 58731,
 76743,
 88601,
 86336,
 86297,
 96613,
 97855,
 97364,
 90983,
 82421,
 88214,
 120680,
 153216,
 178935,
 185903,
 150343,
 75367,
 40871,
 24540

In [36]:
allinfo = dp6.values.tolist()
weeks = ['周一','周二','周三','周四','周五','周六','周日']
hours = [
    '12a', '1a', '2a', '3a', '4a', '5a', '6a',
    '7a', '8a', '9a', '10a', '11a',
    '12p', '1p', '2p', '3p', '4p', '5p',
    '6p', '7p', '8p', '9p', '10p', '11p'
]

In [46]:

from pyecharts.charts import Scatter,Tab
single_axis, titles = [], []     #  创建两个空列表,用于存放所有的轴,以及标题
sca = Scatter(init_opts=opts.InitOpts(width='100%')) # 实例化一个散点图对象
for idx, day in enumerate(weeks):       # 按天遍历数据
    sca.add_xaxis(xaxis_data=hours)  # 添加x轴
    single_axis.append({    # 设定每个轴的样式
        'left': 100,        # 坐标轴距离坐标点位置
         'nameGap': 20,    # 坐标轴名称距离轴线的位置
        'nameLocation': 'start',   # 坐标轴名称的位置
        'type': 'category',     # 设定为分类数据
        'data': hours,         # 每个轴上面的x轴数据
        'top': f"{idx * 100 / 7 + 5}%",   # z坐标轴距离顶部的距离
        'height':f"{100 / 7 - 10}%",    # 坐标轴的高度
        'gridIndex':idx,      # 因为有多个轴,因此需要设定轴索引
        'axisLabel': {'interval':2, 'color':'red'}, # 轴标签的配置,设定间隔为2,颜色为红色
    })
    titles.append(dict(text=day,top=f'{idx * 100 / 7 + 5}%',left='2%')) # 每个轴有一个标题,循环添加多个
    sca.add_yaxis('',   # 绘制y轴数据,设定seriesname为空
                 y_axis=[int(item[1]) for item in  day_name_pv if item[0] == day], # 设定y轴的数据
                 symbol_size=JsCode('function(x){console.log(x);return x[1] }'), # 设定点的大小,采用回调进行设置
                 label_opts=opts.LabelOpts(is_show=False), # 关闭点上面的数字
                  yaxis_index=0,
                 )    
    sca.options['series'][idx]['singleAxisIndex'] =  idx  # 轴索引设置
    sca.options['series'][idx]['coordinateSystem'] = 'singleAxis'  # 单轴设定

sca.options['singleAxis'] = single_axis  # 设定单轴
sca.set_global_opts(
    xaxis_opts=opts.AxisOpts(is_show=False),
    yaxis_opts=opts.AxisOpts(is_show=False),
    title_opts = titles,
)
sca.render()

IndexError: list index out of range

In [32]:
data.head()

Unnamed: 0,user_id,item_id,behavior_type,item_category,time,day_name,month,year,month_day,hour
0,98047837,232431562,浏览,4245,2014-12-06 02:00:00,Saturday,12,2014,12-06,2
1,97726136,383583590,浏览,5894,2014-12-09 20:00:00,Tuesday,12,2014,12-09,20
2,98607707,64749712,浏览,2883,2014-12-18 11:00:00,Thursday,12,2014,12-18,11
3,98662432,320593836,浏览,6562,2014-12-06 10:00:00,Saturday,12,2014,12-06,10
4,98145908,290208520,浏览,13926,2014-12-16 21:00:00,Tuesday,12,2014,12-16,21


In [51]:
data_view = data.groupby(by = ['behavior_type','item_id'])['user_id'].nunique().reset_index().set_index('behavior_type')

In [63]:
data_view.index.unique()

Index(['加购物车', '收藏', '浏览', '购买'], dtype='object', name='behavior_type')

In [54]:
view_list = data_view.loc['浏览'].sort_values('user_id',ascending=False).iloc[:10].values.tolist()

In [64]:
dicts = ['浏览','加购物车','购买','收藏']

In [78]:
colors_list3 = ['pink', 'magenta', 'violet', 'lavender']

In [79]:
bar = Bar()
for index,item in enumerate(dicts):
    data_pair = data_view.loc[item].sort_values('user_id',ascending=False).iloc[:10].values.tolist()
    bar.add_xaxis([i[0] for i in data_pair])
    bar.add_yaxis(f'{item}排名',[i[1] for i in data_pair],
                  itemstyle_opts=opts.ItemStyleOpts(color=colors_list3[index],))
    bar.reversal_axis()
bar.set_global_opts(legend_opts=opts.LegendOpts(selected_mode='single'))
bar.set_global_opts(
         title_opts=opts.TitleOpts(title='浏览收藏加购物车购买量', pos_left='center'),
         legend_opts=opts.LegendOpts(pos_top='5%',
                                    textstyle_opts=opts.TextStyleOpts(color='auto'),  # 文字样式
                                    ),
         tooltip_opts=opts.TooltipOpts(trigger='axis',axis_pointer_type='shadow',
                                      ),
         yaxis_opts=opts.AxisOpts(
                                 axislabel_opts=opts.LabelOpts(font_weight='bold',color='white',##粗体白色
                                                                ),
                                 )
     )
bar.render()

'C:\\Users\\wang\\Desktop\\pyLearn\\综合实训课程1\\淘宝用户行为分析\\render.html'

In [69]:
itemstyle_pie = {
    'normal': {
        'borderRadius':15,
        'borderWidth':.5,
        'borderColor':'auto',
    }
}

In [75]:
itemstyle = {
    'normal': {
        'color': JsCode("""new echarts.graphic.LinearGradient(0, 0, 1, 1, [
          { offset: 0, color: 'rgb(255, 191, 0)' },
          { offset: 1, color: 'rgb(224, 62, 76)' }
        ],)"""),
        'barBorderRadius': [20,20,20,20],      #  柱子的四个角圆角设计
        'shadowColor':'rgba(108,80,243,0.9)', # 阴影的颜色
        'shadowBlur':5, # 阴影的宽度
        'width': 4,
    }
}

In [73]:
pie=(Pie(init_opts=opts.InitOpts(theme=ThemeType.DARK,width='100%',height='95vh'))
    .add(series_name='浏览',data_pair= data_view.loc['浏览'].sort_values('user_id',ascending=False).iloc[:10].values.tolist(),
        rosetype='area', 
        radius=['20%','40%'],
        center=['20%','25%'],
        label_opts=opts.LabelOpts(formatter='{b}:{c}\n百分占比{d}%'),
        itemstyle_opts=itemstyle_pie,
        )
     .add(series_name='加购物车',data_pair=data_view.loc['加购物车'].sort_values('user_id',ascending=False).iloc[:10].values.tolist(),
        rosetype='area', 
        radius=['20%','40%'],
        center=['75%','25%'],
        label_opts=opts.LabelOpts(formatter='{b}:{c}\n百分占比{d}%'),
        itemstyle_opts=itemstyle_pie,
        )
     .add(series_name='收藏',data_pair=data_view.loc['收藏'].sort_values('user_id',ascending=False).iloc[:10].values.tolist(),
        rosetype='area', 
        radius=['20%','40%'],
        center=['20%','72%'],
        label_opts=opts.LabelOpts(formatter='{b}:{c}\n百分占比{d}%'),
        itemstyle_opts=itemstyle_pie,
        )
    .add(series_name='购买',data_pair=data_view.loc['购买'].sort_values('user_id',ascending=False).iloc[:10].values.tolist(),
        rosetype='area', 
        radius=['20%','40%'],
        center=['75%','72%'],
        label_opts=opts.LabelOpts(formatter='{b}:{c}\n百分占比{d}%'),
        itemstyle_opts=itemstyle_pie,
        )
     .set_global_opts(
         title_opts=[
              dict(text='浏览',top='23%',left='18%'),
              dict(text='加购物车',top='23%',left='73%'),
              dict(text='收藏',top='70%',left='18%'),
              dict(text='购买',top='70%',left='73%'),
         ],
         legend_opts=opts.LegendOpts(pos_top='5%'),
         tooltip_opts=opts.TooltipOpts(trigger='item'),
   )
)
pie.render()

'C:\\Users\\wang\\Desktop\\pyLearn\\综合实训课程1\\淘宝用户行为分析\\render.html'

In [76]:
dp1 = data_view.loc['浏览'].sort_values('user_id',ascending=False).iloc[:10].values.tolist()
dp2 = data_view.loc['购买'].sort_values('user_id',ascending=False).iloc[:10].values.tolist()
dp3 = data_view.loc['加购物车'].sort_values('user_id',ascending=False).iloc[:10].values.tolist()
dp4 = data_view.loc['收藏'].sort_values('user_id',ascending=False).iloc[:10].values.tolist()
tab = Tab()
bar = (Bar()
       .add_xaxis([i[0] for i in dp1])
       .add_yaxis('浏览量',[i[1] for i in dp1],
                  itemstyle_opts=itemstyle)
       )
bar1 = (Bar()
       .add_xaxis([i[0] for i in dp2])
       .add_yaxis('购买',[i[1] for i in dp2],
                  itemstyle_opts=itemstyle)
       )
bar2 = (Bar()
       .add_xaxis([i[0] for i in dp3])
       .add_yaxis('加购物车',[i[1] for i in dp3],
                  itemstyle_opts=itemstyle)
       )
bar3 = (Bar()
       .add_xaxis([i[0] for i in dp4])
       .add_yaxis('收藏',[i[1] for i in dp4],
                  itemstyle_opts=itemstyle)
       )
tab.add(bar,'浏览')
tab.add(bar1,'购买')
tab.add(bar2,'加购物车')
tab.add(bar3,'收藏')
tab.render()

'C:\\Users\\wang\\Desktop\\pyLearn\\综合实训课程1\\淘宝用户行为分析\\render.html'

In [8]:
data.columns

Index(['user_id', 'item_id', 'behavior_type', 'item_category', 'time',
       'day_name', 'month', 'year', 'month_day', 'hour'],
      dtype='object')

In [10]:
data[['item_id','user_id']] = data[['item_id','user_id']].astype('int32')

data['item_category'] = data['item_category'].astype('int16')

In [14]:
data['behavior_type'] = data['behavior_type'].astype('int8')
data['day_name'] = data['day_name'].map({
    'Monday': 0,
    'Tuesday': 1,
    'Wednesday': 2,
    'Thursday': 3,
    'Friday': 4,
    'Saturday': 5,
    'Sunday': 6
}).astype('int8')

In [13]:
data.head()

Unnamed: 0,user_id,item_id,behavior_type,item_category,time,day_name,month,year,month_day,hour
0,98047837,232431562,1,4245,2014-12-06 02:00:00,Saturday,12,2014,12-06,2
1,97726136,383583590,1,5894,2014-12-09 20:00:00,Tuesday,12,2014,12-09,20
2,98607707,64749712,1,2883,2014-12-18 11:00:00,Thursday,12,2014,12-18,11
3,98662432,320593836,1,6562,2014-12-06 10:00:00,Saturday,12,2014,12-06,10
4,98145908,290208520,1,13926,2014-12-16 21:00:00,Tuesday,12,2014,12-16,21


In [15]:
data.drop('time',axis=1,inplace=True)

In [16]:
data.to_csv('./user_action/clear_data.csv',index=None)