In [1]:
import numpy as np
import pandas as pd

from libs.Apriori import apriori, generateRules
from libs.fpgrowth import find_frequent_patterns, generate_association_rules

# 实验目的
1. 使用Apriori/FPtree中的函数，对数据进行关联规则挖掘，熟悉两种关联规则挖掘算法。
2. 分析、比较Apriori/FPtree在实验中挖掘的性能与结果。分析不同参数设置对结果的影响。

# 实验数据集

## Kosarak.dat
用户浏览网页新闻的数据，每行数据代表一个用户浏览的新闻页面对应的ID；共99万左右的记录

## Transactions.xls
交易数据集；每行数据代表一个用户购物（对20种食品）的交易记录；共计1万条记录，属于稀疏数据。

# 实验过程

## 1. 对Kosarak.dat挖掘关联规则

### 数据预处理
由于kosarak.dat文件每行长度不一且没有标题，所以选择通过普通的文件I/O读取为列表。
由于两种算法的工具都是传入列表，所以不做df或array的组装。

In [2]:
def load_data(filename):
    """Load a transaction file into a list of item lists.

    Each non-empty line of the file is one transaction whose items are
    separated by whitespace.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one entry per non-empty line; each entry is the list of
        item strings found on that line.
    """
    # Explicit encoding so the result does not depend on the platform default.
    with open(filename, encoding='utf-8') as f:
        # Iterating the file avoids holding the whole text and the split copy
        # in memory at once.  split() without an argument ignores repeated or
        # trailing whitespace, so no empty-string "items" are produced.
        transactions = (line.split() for line in f)
        # Blank lines would otherwise become empty transactions.
        return [items for items in transactions if items]

In [14]:
# Hyperparameters for the Kosarak experiment
KOSARAK_FILE = 'data/kosarak.dat'  # path of the data file handed to load_data
KOSARAK_SUPPORT_THRESHOLD = 0.01  # minimum support expressed as a fraction of all transactions
KOSARAK_CONFIDENCE_THRESHOLD = 0.7  # minimum confidence passed to both rule generators

In [None]:
%%time
# Read the Kosarak file into memory as a list of transactions
# (each transaction is a list of item-ID strings).
kosarak_list = load_data(KOSARAK_FILE)

In [15]:
# Number of transactions loaded from the data file.
NUM_DATA = len(kosarak_list)
# Relative support threshold scaled by the transaction count; this value is
# what gets passed to find_frequent_patterns (presumably as an absolute
# support count -- confirm against libs.fpgrowth).
KOSARAK_SUPPORT_THRESHOLD_NUM = KOSARAK_SUPPORT_THRESHOLD * NUM_DATA

### 使用apriori算法挖掘关联规则
记录：
+ 使用原始的Apriori算法，在本地 i7-7700HQ CPU 上运行了半个小时仍未得到结果

In [None]:
%%time
# Mine frequent itemsets with Apriori, then derive the rules that meet the
# confidence threshold.
# NOTE(review): apriori receives the relative threshold (0.01) here, whereas
# the FP-growth cell passes the scaled value KOSARAK_SUPPORT_THRESHOLD_NUM --
# presumably each library expects that form; confirm against libs.Apriori.
L, supportData = apriori(kosarak_list, KOSARAK_SUPPORT_THRESHOLD)
apriori_rules = generateRules(L, supportData, KOSARAK_CONFIDENCE_THRESHOLD)


In [None]:
# Print each Apriori rule as "antecedent --> consequent" with its confidence.
# NOTE(review): the "support" shown is looked up for the antecedent alone
# (supportData[cause]), not for the complete rule (cause together with effect).
for rule in apriori_rules:
    cause, effect, conf = rule
    support_text = '; support = ' + format(supportData[cause], '.3f')
    print(list(cause), ' --> ', list(effect), '; conf =', conf, support_text)

### 使用fpgrowth算法挖掘关联规则
记录：
+ 10%的支持度阈值，耗时7.6秒
+ 1%的支持度阈值，耗时7min 36s

In [16]:
%%time
# Mine frequent patterns with FP-growth using the scaled support threshold,
# then derive the rules that meet the confidence threshold.
patterns = find_frequent_patterns(kosarak_list, KOSARAK_SUPPORT_THRESHOLD_NUM)
rules = generate_association_rules(patterns, KOSARAK_CONFIDENCE_THRESHOLD)

Wall time: 7min 36s


In [17]:
# Print each FP-growth rule as "antecedent --> consequent" with its confidence.
# NOTE(review): the "support" shown is the antecedent's count divided by the
# number of transactions (patterns[cause] / NUM_DATA), not the support of the
# complete rule.
for cause in rules:
    effect, conf = rules[cause]
    antecedent_support = patterns[cause] / NUM_DATA
    print(list(cause), ' --> ', list(effect), '; conf =', conf, '; support = ' + format(antecedent_support, '.3f'))

['423']  -->  ['6'] ; conf = 0.9711862861311207 ; support = 0.011
['32']  -->  ['6'] ; conf = 0.9124582166410696 ; support = 0.011
['155']  -->  ['6'] ; conf = 0.9627631807899774 ; support = 0.012
['378']  -->  ['6'] ; conf = 0.9316996871741398 ; support = 0.012
['14']  -->  ['6'] ; conf = 0.8916171275954166 ; support = 0.012
['11', '364']  -->  ['6'] ; conf = 0.9959506363285769 ; support = 0.010
['364', '6']  -->  ['11'] ; conf = 0.8787749893662271 ; support = 0.012
['11', '25']  -->  ['6'] ; conf = 0.995909259947936 ; support = 0.011
['25', '6']  -->  ['11'] ; conf = 0.8767392371910296 ; support = 0.012
['11', '49']  -->  ['6'] ; conf = 0.9961464354527938 ; support = 0.011
['49', '6']  -->  ['11'] ; conf = 0.8892620198214432 ; support = 0.012
['11', '512']  -->  ['6'] ; conf = 0.9958539882830104 ; support = 0.011
['512', '6']  -->  ['11'] ; conf = 0.8901152018045597 ; support = 0.013
['7', '897']  -->  ['6'] ; conf = 0.9367743959836837 ; support = 0.013
['6', '897']  -->  ['7'] ; con

In [19]:
# Dump the whole frequent-pattern dict returned by find_frequent_patterns.
# NOTE(review): this prints every entry at once; the output is very long.
print(patterns)

{('229',): 10034, ('254',): 10077, ('28',): 10111, ('361',): 10122, ('667',): 10616, ('423',): 10967, ('423', '6'): 10651, ('32',): 11069, ('32', '6'): 10100, ('155',): 11494, ('155', '6'): 11066, ('378',): 11508, ('378', '6'): 10722, ('14',): 11607, ('14', '6'): 10349, ('11', '364'): 10372, ('11', '364', '6'): 10330, ('364', '6'): 11755, ('91',): 12248, ('11', '25'): 10756, ('11', '25', '6'): 10712, ('25', '6'): 12218, ('11', '49'): 10899, ('11', '49', '6'): 10857, ('49', '6'): 12209, ('11', '512'): 11095, ('11', '512', '6'): 11049, ('512', '6'): 12413, ('27', '897'): 10270, ('11', '7', '897'): 9934, ('11', '6', '897'): 10626, ('7', '897'): 12748, ('6', '7', '897'): 11942, ('6', '897'): 13068, ('11', '504'): 10109, ('11', '504', '6'): 9918, ('504', '6'): 12555, ('11', '314'): 10991, ('11', '314', '6'): 10844, ('314', '6'): 13281, ('11', '56'): 13483, ('11', '56', '6'): 13418, ('56', '6'): 15151, ('11', '273'): 12555, ('11', '273', '6'): 12363, ('273', '6'): 15270, ('11', '737'): 12606

# 小结
在较大的数据集上运行了两种算法：fpgrowth可以在数分钟内完成关联规则挖掘，并且剪枝效率极高，在阈值较高的情况下甚至可以在数秒内完成；而apriori虽然前期在小型数据集上测试时表现良好，但一旦应用在kosarak上，表现就十分差劲，以至于没有跑出结果。可见，在关联规则挖掘的问题上，fpgrowth算法确实更为适合。