In [27]:
# 引入库
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# 设置基本参数
pd.set_option('display.width', 60)

In [28]:
# 导入完整数据
df = pd.read_excel('../data/supermarket_data_clean_min.xlsx', sheet_name="Data")

# 过滤数据类型错误的信息所在的行
def is_number(value):
    return isinstance(value, (int, float))
def is_string(value):
    return isinstance(value, (str,))
df = df[df["Quantity"].map(lambda x: isinstance(x, (int, float)))]
df = df[df["Sub-Category"].map(is_string)]
print(df)

      Order Date Order Date Year Month  Order Date Year  \
0       1/1/2011               2011-01             2011   
1       1/2/2011               2011-01             2011   
2       1/3/2011               2011-01             2011   
3       1/3/2011               2011-01             2011   
4       1/4/2011               2011-01             2011   
...          ...                   ...              ...   
2559  31-12-2013               2013-12             2013   
2560  31-12-2013               2013-12             2013   
2561  31-12-2014               2014-12             2014   
2562  31-12-2014               2014-12             2014   
2563  31-12-2014               2014-12             2014   

      Order Date Month  Order Date Day Ship Date  \
0                    1               1  6/1/2011   
1                    1               2  7/2/2011   
2                    1               3  8/3/2011   
3                    1               3  6/3/2011   
4                    1         

In [29]:
# 查看每个用户的订单数量 [与下方代码无关]
cc = df.groupby("Customer Name").agg({"count"})
print(cc)

                   Order Date Order Date Year Month  \
                        count                 count   
Customer Name                                         
Aaron Bergman               5                     5   
Aaron Hawkins               4                     4   
Aaron Smayling              2                     2   
Adam Bellavance             4                     4   
Adam Hart                   3                     3   
...                       ...                   ...   
Xylona Preis                7                     7   
Yana Sorensen               3                     3   
Yoseph Carroll              2                     2   
Zuschuss Carroll            3                     3   
Zuschuss Donatelli          1                     1   

                   Order Date Year Order Date Month  \
                             count            count   
Customer Name                                         
Aaron Bergman                    5                5   
Aaron Haw

In [30]:
# 筛选指定用户的有效信息 [与下方代码无关]

u1 = df[df['Customer Name'] == 'Aaron Bergman'][['Sub-Category', 'Sales', 'Quantity', 'Order Date']]
print(u1)

     Sub-Category     Sales  Quantity  Order Date
627        Phones    221.98       2.0  11/11/2013
1175  Accessories    225.24       3.0  15-01-2014
1255       Chairs     610.6       2.0  15-12-2014
2364  Furnishings  561.5379       7.0  28-12-2012
2366      Binders   23.3064       4.0  28-12-2012


In [31]:
# 获取指定用户在各类别上的购买次数作为特征 [与下方代码无关]
u1sc = u1[["Quantity", 'Sub-Category']].groupby('Sub-Category').agg({'sum'})
print(u1sc)

             Quantity
                  sum
Sub-Category         
Accessories       3.0
Binders           4.0
Chairs            2.0
Furnishings       7.0
Phones            2.0


In [32]:
# 获取所有类别的总购买次数
sc = df[['Sub-Category', 'Quantity']].groupby('Sub-Category').agg({'sum'})
sc.columns = ['All']
print(sc)

                                                    All
Sub-Category                                           
Accessories                                     500.000
Acme Box Cutter, Easy Grip                        0.000
Advantus Door Stop, Durable                       0.000
Appliances                                      273.000
Art                                             830.000
Avery Binder, Recycled                            0.000
BIC Markers, Fluorescent                          0.400
BIC Sketch Pad, Water Color                       0.000
Binders                                         964.000
Bookcases                                       392.000
Chairs                                          617.000
Copiers                                         369.000
Eaton Parchment Paper, Premium                    0.000
Eldon Clock, Durable                              0.400
Eldon Trays, Industrial                           0.400
Elite Shears, Easy Grip                         

In [33]:
# 指定用户 在每个类别上的 累计购买次数
user = "Aaron Bergman"
u1 = df[df['Customer Name'] == user][['Sub-Category', 'Sales', 'Quantity', 'Order Date']]
u1sc = u1[["Quantity", 'Sub-Category']].groupby('Sub-Category').agg({'sum'})
u1sc.columns = [user]
print(u1sc)

              Aaron Bergman
Sub-Category               
Accessories             3.0
Binders                 4.0
Chairs                  2.0
Furnishings             7.0
Phones                  2.0


In [34]:
# 获取各个用户的购买次数特征并拼接至一张表上
users = ["Aaron Bergman", "Aaron Hawkins", "Aaron Smayling", "Adam Bellavance"]
for user in users:
    u1 = df[df['Customer Name'] == user][['Sub-Category', 'Sales', 'Quantity', 'Order Date']]
    u1sc = u1[["Quantity", 'Sub-Category']].groupby('Sub-Category').agg({'sum'})
    u1sc.columns = [user]
    sc = pd.concat([sc, u1sc], axis=1)
sc = sc.fillna(0)
print(sc)

                                                    All  \
Sub-Category                                              
Accessories                                     500.000   
Acme Box Cutter, Easy Grip                        0.000   
Advantus Door Stop, Durable                       0.000   
Appliances                                      273.000   
Art                                             830.000   
Avery Binder, Recycled                            0.000   
BIC Markers, Fluorescent                          0.400   
BIC Sketch Pad, Water Color                       0.000   
Binders                                         964.000   
Bookcases                                       392.000   
Chairs                                          617.000   
Copiers                                         369.000   
Eaton Parchment Paper, Premium                    0.000   
Eldon Clock, Durable                              0.400   
Eldon Trays, Industrial                           0.400 

In [35]:
# 生成表格每列之间的相似度情况
similar = sc.corr(method = 'pearson', min_periods=1)
print(similar)

                      All  Aaron Bergman  Aaron Hawkins  \
All              1.000000       0.460703       0.276046   
Aaron Bergman    0.460703       1.000000       0.016339   
Aaron Hawkins    0.276046       0.016339       1.000000   
Aaron Smayling   0.330839       0.252276       0.040287   
Adam Bellavance  0.354117       0.237492      -0.005988   

                 Aaron Smayling  Adam Bellavance  
All                    0.330839         0.354117  
Aaron Bergman          0.252276         0.237492  
Aaron Hawkins          0.040287        -0.005988  
Aaron Smayling         1.000000         0.401543  
Adam Bellavance        0.401543         1.000000  


In [36]:
# 筛选出与指定用户最相似的用户
name = "Aaron Bergman"
us = similar[[name]]
us = us.drop('All')
us = us.drop(name)
similar_user = us.iloc[us[name].argmax()].index.values[0]
print(similar_user)

Aaron Bergman


In [37]:
# 获取最相似的用户最喜欢的商品
suf = sc[[similar_user]]
line = suf.iloc[suf[similar_user].argmax()]
print(line)

Aaron Bergman    7.0
Name: Furnishings, dtype: float64


In [38]:
# 学生可自行拓展 获取最相似的几个用户喜欢的几件商品等