In [44]:
import numpy as np
import pandas as pd

from libs.Apriori import apriori, generateRules
from libs.fpgrowth import find_frequent_patterns, generate_association_rules

# 实验目的
1. 使用Apriori/FPtree中的函数，对数据进行关联规则挖掘，熟悉两种关联规则挖掘算法。
2. 分析、比较Apriori/FPtree在实验中挖掘的性能与结果。分析不同参数设置对结果的影响。
# 实验数据集
## Kosarak.dat
+ 用户浏览网页新闻的数据，每行数据代表一个用户浏览的新闻页面对应的ID；共99万左右的记录
## Transactions.xls
+ 交易数据集；每行数据代表一个用户购物（对20种食品）的交易记录；共计1万条记录,属于稀疏数据。
# 实验过程
## 2. 对Transactions.xls挖掘关联规则
### 数据预处理
+ xls文件处理困难，先用excel转换为csv文件然后再处理。
+ 读取方法同上个数据集。

In [45]:
def preprocess(filename):
    with open(filename) as f:
        lines = f.read().splitlines()
        all_data = [line.split(',') for line in lines]
        products = all_data[0]
        data = []
        for d in all_data[1:]:
            things = []
            for i, buy in enumerate(d):
                if buy == '1':
                    things.append(products[i])
            data.append(things)
    return data

In [46]:
# super params
FILE = 'data/Transactions.csv'
SUPPORT_THRESHOLD = 0.01
CONFIDENCE_THRESHOLD = 0.5

In [47]:
%%time
data = preprocess(FILE)

Wall time: 45 ms


In [48]:
NUM_DATA = int(len(data))
SUPPORT_THRESHOLD_NUM = NUM_DATA * SUPPORT_THRESHOLD

### 使用apriori算法挖掘关联规则

In [49]:
%%time
# use apriori
L, supportData = apriori(data, SUPPORT_THRESHOLD)
apriori_rules = generateRules(L, supportData, CONFIDENCE_THRESHOLD)


frozenset({'yoghurt'}) --> frozenset({'milk'}) conf: 0.5197421434327155
frozenset({'brioches'}) --> frozenset({'milk'}) conf: 0.5037650602409639
frozenset({'tomato souce'}) --> frozenset({'milk'}) conf: 0.5308285163776494
frozenset({'tomato souce'}) --> frozenset({'pasta'}) conf: 0.5539499036608864
frozenset({'juices'}) --> frozenset({'milk'}) conf: 0.6123076923076923
frozenset({'biscuits'}) --> frozenset({'milk'}) conf: 0.5174102285092492
Wall time: 414 ms


In [50]:
# output
for cause, effect, conf in apriori_rules:
    print(list(cause), ' --> ', list(effect), '; conf =', conf, '; support = {:.3f}'.format(supportData[cause]))

['yoghurt']  -->  ['milk'] ; conf = 0.5197421434327155 ; support = 0.124
['brioches']  -->  ['milk'] ; conf = 0.5037650602409639 ; support = 0.133
['tomato souce']  -->  ['milk'] ; conf = 0.5308285163776494 ; support = 0.104
['tomato souce']  -->  ['pasta'] ; conf = 0.5539499036608864 ; support = 0.104
['juices']  -->  ['milk'] ; conf = 0.6123076923076923 ; support = 0.065
['biscuits']  -->  ['milk'] ; conf = 0.5174102285092492 ; support = 0.184


### 使用fpgrowth算法挖掘关联规则

In [51]:
%%time
patterns = find_frequent_patterns(data, SUPPORT_THRESHOLD_NUM)
rules = generate_association_rules(patterns, CONFIDENCE_THRESHOLD)

Wall time: 167 ms


In [52]:
# output
for cause, (effect, conf) in rules.items():
    # print(i, ' --> ', rules[i])
    print(list(cause), ' --> ', list(effect), '; conf =', conf, '; support = {:.3f}'.format(patterns[cause]/NUM_DATA))

['biscuits', 'juices']  -->  ['milk'] ; conf = 0.6491228070175439 ; support = 0.017
['brioches', 'juices']  -->  ['milk'] ; conf = 0.6549707602339181 ; support = 0.017
['juices', 'water']  -->  ['milk'] ; conf = 0.7009803921568627 ; support = 0.020
['juices', 'pasta']  -->  ['milk'] ; conf = 0.689922480620155 ; support = 0.026
['rice', 'water']  -->  ['pasta'] ; conf = 0.6 ; support = 0.017
['milk', 'rice']  -->  ['pasta'] ; conf = 0.5493421052631579 ; support = 0.030
['beer', 'pasta']  -->  ['milk'] ; conf = 0.5555555555555556 ; support = 0.031
['coke', 'tomato souce']  -->  ['pasta'] ; conf = 0.6538461538461539 ; support = 0.016
['biscuits', 'coke']  -->  ['milk'] ; conf = 0.5531914893617021 ; support = 0.019
['coke', 'pasta']  -->  ['milk'] ; conf = 0.5213032581453634 ; support = 0.040
['tomato souce', 'yoghurt']  -->  ['milk'] ; conf = 0.6578947368421053 ; support = 0.015
['milk', 'tomato souce', 'water']  -->  ['pasta'] ; conf = 0.6829268292682927 ; support = 0.021
['pasta', 'toma

In [53]:
print(patterns)

{('mozzarella',): 163, ('oil', 'pasta'): 141, ('milk', 'oil'): 151, ('frozen fish', 'pasta'): 132, ('frozen fish', 'milk'): 147, ('crackers', 'pasta'): 142, ('crackers', 'milk'): 161, ('juices', 'tunny'): 102, ('coke', 'juices'): 116, ('coffee', 'juices'): 124, ('juices', 'yoghurt'): 147, ('biscuits', 'juices'): 171, ('biscuits', 'juices', 'milk'): 111, ('brioches', 'juices'): 171, ('brioches', 'juices', 'milk'): 112, ('juices', 'water'): 204, ('juices', 'milk', 'water'): 143, ('juices', 'pasta'): 258, ('juices', 'milk', 'pasta'): 178, ('juices', 'milk'): 398, ('coffee', 'rice'): 119, ('biscuits', 'rice'): 155, ('rice', 'water'): 170, ('pasta', 'rice', 'water'): 102, ('milk', 'rice'): 304, ('milk', 'pasta', 'rice'): 167, ('pasta', 'rice'): 335, ('beer', 'brioches'): 122, ('beer', 'coffee'): 126, ('beer', 'biscuits'): 141, ('beer', 'coke'): 174, ('beer', 'pasta', 'water'): 120, ('beer', 'milk', 'water'): 153, ('beer', 'pasta'): 306, ('beer', 'milk', 'pasta'): 170, ('beer', 'milk'): 365,

# 小结
Transactions的数据集量级明显比kosarak要小。即使是在这么小的数据集上，fpgrowth算法依然具有平均四倍于apriori的性能。
最早我使用了默认的Support threshold=0.01和Confidence threshold=0.7的默认值，结果一条规则都没有发现。多次调参（炼丹），得到了当前的超参数，应该是比较好的结果：既具有比较高的测度，也有比较大的输出量。