In [1]:
# Load data: read the bakery transaction log into a DataFrame and preview it.
import pandas as pd

csv_path = './BreadBasket.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Date,Time,Transaction,Item
0,2016/10/30,9:58:11,1,NONE
1,2016/10/30,10:05:34,2,Scandinavian
2,2016/10/30,10:05:34,2,Scandinavian
3,2016/10/30,10:07:57,3,Hot chocolate
4,2016/10/30,10:07:57,3,Jam


In [3]:
# Preprocessing
# Normalise every item name to lower case so that e.g. 'NONE' and 'none' compare equal.
lowered_items = df['Item'].str.lower()
df['Item'] = lowered_items
# Remove the placeholder rows whose item is 'none' (dropped by index label).
none_row_labels = df.index[df['Item'] == 'none']
df = df.drop(index=none_row_labels)

In [6]:
# Count how many times each item occurs inside each transaction.
# The result is a Series indexed by (Transaction, Item).
by_transaction_item = df.groupby(['Transaction', 'Item'])
df1 = by_transaction_item['Item'].count()
df1

Transaction  Item                
2            scandinavian            2
3            cookies                 1
             hot chocolate           1
             jam                     1
4            muffin                  1
5            bread                   1
             coffee                  1
             pastry                  1
6            medialuna               1
             muffin                  1
             pastry                  1
7            coffee                  1
             medialuna               1
             pastry                  1
             tea                     1
8            bread                   1
             pastry                  1
9            bread                   1
             muffin                  1
10           medialuna               1
             scandinavian            1
11           bread                   2
             medialuna               1
12           coffee                  1
             jam              

In [7]:
import time
# 采用efficient_apriori工具包
def rule1():
    """Mine frequent itemsets and association rules with efficient_apriori.

    Reads the notebook-level DataFrame ``df`` (columns ``Transaction`` and
    ``Item``), turns it into one set of items per transaction id, runs
    apriori with ``min_support=0.02`` and ``min_confidence=0.5``, and prints
    the itemsets, the rules and the elapsed wall-clock time.

    Returns:
        None. All results are printed.
    """
    from efficient_apriori import apriori
    start = time.time()
    # One-dimensional view: index is the Transaction id, value is the Item.
    orders_series = df.set_index('Transaction')['Item']
    print(orders_series.head())
    # Build one item set per transaction id. Grouping on the index (instead of
    # watching for the id to change between consecutive rows) keeps a basket
    # whole even when its rows are not adjacent, and does not depend on a
    # sentinel start value that a real transaction id could collide with.
    transactions = [set(items) for _, items in orders_series.groupby(level=0)]
    # Mine frequent itemsets and association rules.
    item_sets, rules = apriori(transactions, min_support=0.02, min_confidence=0.5)
    print('频繁项集:',item_sets)
    print('关联规则:',rules)
    end = time.time()
    print('用时:',end-start)
    
# Run the efficient_apriori variant; it prints the itemsets, rules and elapsed time.
rule1()

Transaction
1             none
2     scandinavian
2     scandinavian
3    hot chocolate
3              jam
Name: Item, dtype: object
频繁项集: {1: {('alfajores',): 344, ('bread',): 3096, ('brownie',): 379, ('cake',): 983, ('coffee',): 4528, ('cookies',): 515, ('farm house',): 371, ('hot chocolate',): 552, ('juice',): 365, ('medialuna',): 585, ('muffin',): 364, ('none',): 754, ('pastry',): 815, ('sandwich',): 680, ('scandinavian',): 275, ('scone',): 327, ('soup',): 326, ('tea',): 1350, ('toast',): 318, ('truffles',): 192}, 2: {('bread', 'cake'): 221, ('bread', 'coffee'): 852, ('bread', 'none'): 196, ('bread', 'pastry'): 276, ('bread', 'tea'): 266, ('cake', 'coffee'): 518, ('cake', 'tea'): 225, ('coffee', 'cookies'): 267, ('coffee', 'hot chocolate'): 280, ('coffee', 'juice'): 195, ('coffee', 'medialuna'): 333, ('coffee', 'none'): 401, ('coffee', 'pastry'): 450, ('coffee', 'sandwich'): 362, ('coffee', 'tea'): 472, ('coffee', 'toast'): 224}}
关联规则: [{cake} -> {coffee}, {cookies} -> {coffee}, {h

In [22]:
# 采用mlxtend.frequent_patterns工具包
def encode_units(x):
    """Binarise an item count into a 0/1 presence flag.

    Args:
        x: Number of times an item occurs in a transaction.

    Returns:
        int: 1 when ``x >= 1``, otherwise 0. Inputs that are neither
        ``>= 1`` nor ``<= 0`` (a fraction between 0 and 1, or NaN) are
        treated as "not present" and yield 0, so the result is never None.
    """
    # A single comparison covers every numeric input; NaN compares False.
    return 1 if x >= 1 else 0
    
def rule2():
    """Mine frequent itemsets and association rules with mlxtend.

    Reads the notebook-level DataFrame ``df`` (columns ``Transaction`` and
    ``Item``), pivots it into a transaction-by-item boolean matrix, runs
    mlxtend's apriori with ``min_support=0.02`` and derives association rules
    filtered on ``lift >= 0.5``. Prints the itemsets, the rules and the
    elapsed wall-clock time.

    Side effects:
        Sets ``pd.options.display.max_columns`` to 100 for the session.

    Returns:
        None. All results are printed.
    """
    from mlxtend.frequent_patterns import apriori
    from mlxtend.frequent_patterns import association_rules
    pd.options.display.max_columns=100
    start = time.time()
    # Rows: transactions, columns: items, cells: occurrence count. Missing
    # (transaction, item) pairs are filled with 0 directly by unstack.
    item_counts = df.groupby(['Transaction','Item'])['Item'].count().unstack(fill_value=0)
    # Vectorised presence flag (count >= 1). A boolean frame avoids the
    # deprecated element-wise DataFrame.applymap.
    hot_encoded_df = item_counts >= 1
    frequent_items = apriori(hot_encoded_df,min_support=0.02,use_colnames=True)
    # NOTE(review): rule1 filters rules on confidence >= 0.5 while this call
    # filters on lift >= 0.5, so the two rule lists are not directly
    # comparable -- confirm which metric is intended.
    rules = association_rules(frequent_items,metric='lift',min_threshold=0.5)
    print('频繁项集',frequent_items)
    print('关联规则',rules)
    end = time.time()
    print('用时:',end-start)
# Glossary of the rule metrics: http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/
rule2()

频繁项集      support                 itemsets
0   0.036093              (alfajores)
1   0.324835                  (bread)
2   0.039765                (brownie)
3   0.103137                   (cake)
4   0.475081                 (coffee)
5   0.054034                (cookies)
6   0.038926             (farm house)
7   0.057916          (hot chocolate)
8   0.038296                  (juice)
9   0.061379              (medialuna)
10  0.038191                 (muffin)
11  0.079110                   (none)
12  0.085510                 (pastry)
13  0.071346               (sandwich)
14  0.028853           (scandinavian)
15  0.034309                  (scone)
16  0.034204                   (soup)
17  0.141643                    (tea)
18  0.033365                  (toast)
19  0.020145               (truffles)
20  0.023187            (cake, bread)
21  0.089393          (bread, coffee)
22  0.020564            (bread, none)
23  0.028958          (pastry, bread)
24  0.027909             (bread, tea)
25  0.0