In [13]:
import pandas as pd

# 读取需要合并的数据表

In [14]:
# Load the order table and the date table (both GBK-encoded CSV files), then
# attach the date attributes to every order row with a left join.
order_list = pd.read_csv("./订单表.csv", encoding="gbk")
date_list = pd.read_csv("./日期表.csv", encoding="gbk")
order_list = (
    order_list
    # The order table calls the join key "订单日期"; rename it so that it
    # matches the "日期" column used as the merge key below.
    .rename(columns={"订单日期": "日期"})
    .merge(date_list, on=["日期"], how="left")
)

# 根据客户ID整理数据

In [12]:
# Build a customer x product indicator matrix (one row per 客户ID, one 0/1
# column per product) to feed the frequent-itemset miner.
#
# Step 1: collapse every customer's purchased product names into a single
# space-separated string.
# NOTE(review): this assumes product names never contain a space themselves;
# otherwise a single product would be split into several columns -- confirm
# against the data.
products_per_customer = order_list.groupby(by="客户ID")["产品名称"].agg(" ".join)

# Step 2: split that string back on spaces and one-hot encode the tokens.
# `str.get_dummies` keeps the 客户ID index, so the earlier
# `drop("产品名称", 1).join(...)` round-trip is unnecessary. It was also broken
# on pandas >= 2.0, where `DataFrame.drop` no longer accepts `axis` as a
# positional argument.
customer_list = products_per_customer.str.get_dummies(" ")
print(customer_list)

       公路自行车  头盔  山地自行车  帽子  手套  挂架  挡泥板  旅行自行车  水壶和水壶架  水袋背包  清洁剂  短裤  背心  \
客户ID                                                                         
11000      0   1      1   0   0   0    1      1       0     0    0   0   0   
11001      1   1      1   1   0   0    1      0       1     0    0   0   0   
11002      0   1      1   0   0   0    0      1       0     0    0   0   0   
11003      0   0      1   1   0   0    0      1       1     0    0   0   0   
11004      0   1      1   0   0   0    1      1       0     0    0   0   0   
...      ...  ..    ...  ..  ..  ..  ...    ...     ...   ...  ...  ..  ..   
29479      0   0      1   0   0   0    0      0       0     0    0   0   0   
29480      0   1      0   1   0   0    0      1       1     0    0   0   0   
29481      0   0      1   0   0   0    0      0       0     0    0   0   0   
29482      0   0      1   0   0   0    0      0       0     0    0   0   0   
29483      0   0      1   0   0   0    0      0       0     0   

In [15]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# 挖掘频繁项集

In [16]:
# Mine frequent itemsets from the customer x product matrix, keeping only the
# itemsets whose support is at least 0.05, and list them from the highest
# support to the lowest.
banner = "*" * 20
itemsets = apriori(
    customer_list,
    min_support=0.05,
    use_colnames=True,  # report product names instead of column positions
).sort_values(by="support", ascending=False)
print(banner, "频繁项集", banner)
print(itemsets)

******************** 频繁项集 ********************
     support         itemsets
9   0.459316          (车胎和内胎)
0   0.346083          (公路自行车)
1   0.322441             (头盔)
7   0.246051         (水壶和水壶架)
2   0.221218          (山地自行车)
10  0.172690            (骑行服)
19  0.158029      (车胎和内胎, 头盔)
6   0.115938          (旅行自行车)
3   0.115343             (帽子)
5   0.114153            (挡泥板)
12  0.111069   (山地自行车, 公路自行车)
11  0.110961      (头盔, 公路自行车)
13  0.086020  (水壶和水壶架, 公路自行车)
18  0.078609     (水壶和水壶架, 头盔)
14  0.078446   (车胎和内胎, 公路自行车)
16  0.075092      (山地自行车, 头盔)
4   0.074443             (手套)
20  0.067410   (山地自行车, 旅行自行车)
21  0.067410  (山地自行车, 水壶和水壶架)
15  0.062432     (骑行服, 公路自行车)
23  0.062216     (车胎和内胎, 骑行服)
22  0.059890   (车胎和内胎, 山地自行车)
17  0.057239      (旅行自行车, 头盔)
8   0.055129             (短裤)


# 计算关联规则

In [17]:
# Derive association rules from the frequent itemsets, keeping only the rules
# whose lift is at least 1, and list them from the highest lift to the lowest.
banner = "*" * 20
rules = association_rules(
    itemsets,
    metric="lift",
    min_threshold=1,
).sort_values(by="lift", ascending=False)
print(banner, "关联规则", banner)
print(rules)

******************** 关联规则 ********************
   antecedents consequents  antecedent support  consequent support   support  \
9      (旅行自行车)     (山地自行车)            0.115938            0.221218  0.067410   
8      (山地自行车)     (旅行自行车)            0.221218            0.115938  0.067410   
14     (旅行自行车)        (头盔)            0.115938            0.322441  0.057239   
15        (头盔)     (旅行自行车)            0.322441            0.115938  0.057239   
2      (山地自行车)     (公路自行车)            0.221218            0.346083  0.111069   
3      (公路自行车)     (山地自行车)            0.346083            0.221218  0.111069   
10     (山地自行车)    (水壶和水壶架)            0.221218            0.246051  0.067410   
11    (水壶和水壶架)     (山地自行车)            0.246051            0.221218  0.067410   
0      (车胎和内胎)        (头盔)            0.459316            0.322441  0.158029   
1         (头盔)     (车胎和内胎)            0.322441            0.459316  0.158029   
6      (山地自行车)        (头盔)            0.221218            0.322441  0.075