In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

df = pd.read_csv('goods_list.csv')
df.dropna(inplace=True)
df.head()

Unnamed: 0,关键词,店铺名称,地理位置,产品名称,产品价格,付款人数,商品ID,浏览停留时间(min),同类商品浏览次数,评论数,收藏数
0,咖啡机,德颐电器旗舰店,江苏苏州,德颐DE-320触屏一键花式咖啡机家用全自动商用高压意式蒸汽小型,2998.0,88人付款,46551720000.0,7,1,2825,2304
1,咖啡机,飞利浦官方旗舰店,江苏无锡,Philips飞利浦EP3146意式全自动咖啡机家用办公室研磨一体打奶泡,5599.0,242人付款,604246000000.0,6,1,4744,5842
2,咖啡机,delonghi德龙旗舰店,上海,Delonghi/德龙D3T咖啡机全自动进口家用研磨意式触屏办公室小型,5790.0,201人付款,589764000000.0,5,14,3485,2976
3,咖啡机,nespresso官方旗舰店,江苏苏州,【赵又廷同款】NESPRESSOEssenzaMini迷你全自动进口胶囊咖啡机,866.0,741人付款,557677000000.0,3,0,2354,7150
4,咖啡机,sparllo官方海外旗舰店,香港香港岛,德国Derlla全半自动意式浓缩咖啡机家用小型奶泡机一体迷你复古,699.0,737人付款,623583000000.0,11,14,1452,6153


# 数据处理、筛选
#### 将付款人数字段转化为数字格式；筛选付款人数为准确数值（不包含+/万+）；筛选产品价格为后75%（<=Q3）。

In [2]:
df['付款人数'] = df['付款人数'].str[:-3]
df = df[~df['付款人数'].str.contains('\+')]
df = df[df['产品价格']<=df.describe()['产品价格']['75%']]
df['付款人数'] = df['付款人数'].astype(int)
df = df[['产品价格','付款人数','浏览停留时间(min)','同类商品浏览次数','评论数','收藏数']]
df.rename(columns = {'浏览停留时间(min)':'停留时长', '同类商品浏览次数':'对比次数'}, inplace = True)
df

Unnamed: 0,产品价格,付款人数,停留时长,对比次数,评论数,收藏数
3,866.0,741,3,0,2354,7150
4,699.0,737,11,14,1452,6153
12,199.0,866,13,6,4328,910
14,149.0,1452,10,4,725,5722
16,299.0,1624,13,14,4691,4156
...,...,...,...,...,...,...
2557,88.0,3324,1,1,1480,1442
2560,209.0,2877,14,13,238,443
2563,209.0,4248,1,11,4915,4961
2564,138.0,2175,10,10,3345,7100


# 产品价格
#### 产品价格与付款人数是负相关关系，随着价格的增长，购买的人数逐渐减少。

In [3]:
import altair as alt
alt.Chart(df, width=500, height=400).mark_circle(opacity=0.5, size=50).encode(
    alt.X('产品价格'),
    alt.Y('sum(付款人数)', title='总付款人数'),
    alt.Tooltip('产品价格')
).configure_axis(
    labelFontSize=12, titleFontSize=16) 

In [4]:
df_price = df.groupby(by='产品价格', as_index=False)['付款人数'].sum()
df_price.corr()

Unnamed: 0,产品价格,付款人数
产品价格,1.0,-0.101491
付款人数,-0.101491,1.0


# 停留时长
#### 停留时长与付款人数为正相关关系，用户在商品页面停留的时间越长，越有可能下单该商品。

In [5]:
alt.Chart(df).mark_line(interpolate = 'monotone').encode(
    alt.X('停留时长'),
    alt.Y('sum(付款人数)', title='总付款人数', scale = alt.Scale(zero=False))
).configure_axis(
    labelFontSize=12, titleFontSize=16) 

In [6]:
df_stay = df.groupby(by='停留时长', as_index=False)['付款人数'].sum()
df_stay.corr()

Unnamed: 0,停留时长,付款人数
停留时长,1.0,0.400063
付款人数,0.400063,1.0


# 与同类产品对比次数
#### 与同类产品对比次数与购买人数的关系极弱。

In [7]:
alt.Chart(df).mark_line(interpolate = 'monotone').encode(
    alt.X('对比次数'),
    alt.Y('sum(付款人数)', title='总付款人数', scale = alt.Scale(zero=False))
).configure_axis(
    labelFontSize=12, titleFontSize=16) 

In [8]:
df_comp = df.groupby(by='对比次数', as_index=False)['付款人数'].sum()
df_comp.corr()

Unnamed: 0,对比次数,付款人数
对比次数,1.0,0.009898
付款人数,0.009898,1.0


# 热门程度（评论数、收藏数）
#### 评论数对用户购买决策的影响极小，`应该将评论数区分为好评数和差评数来分析`。
#### 收藏数与购买人数成正相关关系，但关系较弱。

In [9]:
alt.Chart(df, width=500, height=400).mark_circle(opacity = 0.5, size=50).encode(
    alt.X('评论数'),
    alt.Y('sum(付款人数)', title='总付款人数', scale = alt.Scale(zero=False))
).configure_axis(
    labelFontSize=12, titleFontSize=16) 

In [10]:
df_comment = df.groupby(by='评论数', as_index=False)['付款人数'].sum()
df_comment.corr()

Unnamed: 0,评论数,付款人数
评论数,1.0,-0.001643
付款人数,-0.001643,1.0


In [11]:
alt.Chart(df, width=500, height=400).mark_circle(opacity = 0.5, size=50).encode(
    alt.X('收藏数'),
    alt.Y('sum(付款人数)', title='总付款人数', scale = alt.Scale(zero=False))
).configure_axis(
    labelFontSize=12, titleFontSize=16) 

In [12]:
df_save = df.groupby(by='收藏数', as_index=False)['付款人数'].sum()
df_save.corr()

Unnamed: 0,收藏数,付款人数
收藏数,1.0,0.014112
付款人数,0.014112,1.0


#### 将评论数和收藏数各切分为10个等级（区间）。
#### 从新生成的图表和关联性来看，相比于评论数，收藏数对用户购买决策的影响较大。商品的收藏数越高，用户越有可能购买。

In [13]:
df['评论数_bins'] = pd.cut(x=df['评论数'], bins=[x for x in range(100,5001,490)])
df['评论数等级'] = pd.Categorical(df['评论数_bins']).codes
df['收藏数_bins'] = pd.cut(x=df['收藏数'], bins=[x for x in range(-1,8001,800)])
df['收藏数等级'] = pd.Categorical(df['收藏数_bins']).codes
df

Unnamed: 0,产品价格,付款人数,停留时长,对比次数,评论数,收藏数,评论数_bins,评论数等级,收藏数_bins,收藏数等级
3,866.0,741,3,0,2354,7150,"(2060, 2550]",4,"(6399, 7199]",8
4,699.0,737,11,14,1452,6153,"(1080, 1570]",2,"(5599, 6399]",7
12,199.0,866,13,6,4328,910,"(4020, 4510]",8,"(799, 1599]",1
14,149.0,1452,10,4,725,5722,"(590, 1080]",1,"(5599, 6399]",7
16,299.0,1624,13,14,4691,4156,"(4510, 5000]",9,"(3999, 4799]",5
...,...,...,...,...,...,...,...,...,...,...
2557,88.0,3324,1,1,1480,1442,"(1080, 1570]",2,"(799, 1599]",1
2560,209.0,2877,14,13,238,443,"(100, 590]",0,"(-1, 799]",0
2563,209.0,4248,1,11,4915,4961,"(4510, 5000]",9,"(4799, 5599]",6
2564,138.0,2175,10,10,3345,7100,"(3040, 3530]",6,"(6399, 7199]",8


In [14]:
save = df.groupby(by='收藏数等级', as_index=False)['付款人数'].sum()
comment = df.groupby(by='评论数等级', as_index=False)['付款人数'].sum()
chart1 = alt.Chart(save, width=300, height=250).mark_circle().encode(
    alt.X('收藏数等级'),
    alt.Y('付款人数', title = '总付款人数', scale = alt.Scale(zero=False)))
chart2 = alt.Chart(comment, width=300, height=250).mark_circle().encode(
    alt.X('评论数等级'),
    alt.Y('付款人数', title = '总付款人数', scale = alt.Scale(zero=False)))
chart1 | chart2

In [15]:
save.corr(), comment.corr()

(          收藏数等级      付款人数
 收藏数等级  1.000000  0.167483
 付款人数   0.167483  1.000000,
           评论数等级      付款人数
 评论数等级  1.000000  0.067634
 付款人数   0.067634  1.000000)

# 随机森林
#### 综上所述，对购买决策影响较大的因素有：产品价格、停留时长和收藏数。根据这三个特征建立随机森林模型对购买人数进行预测

In [16]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
X = df[['产品价格','停留时长','收藏数']]
y = df['付款人数']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
score_rf = rf.score(X_train, y_train)
score_rf

0.844369421606009