# 1. 安装工具包
在命令提示行输入：
> pip install efficient-apriori

# 2. 测试

In [1]:
from efficient_apriori import apriori

In [2]:
transactions = [('eggs', 'bacon', 'soup'),
                ('eggs', 'bacon', 'apple'),
                ('soup', 'bacon', 'banana')]

In [3]:
itemsets, rules = apriori(transactions, min_support=0.5,  min_confidence=1)

In [4]:
print(rules)  # [{eggs} -> {bacon}, {soup} -> {bacon}]

[{eggs} -> {bacon}, {soup} -> {bacon}]


In [5]:
print(itemsets)

{1: {('bacon',): 3, ('eggs',): 2, ('soup',): 2}, 2: {('bacon', 'eggs'): 2, ('bacon', 'soup'): 2}}


In [6]:
# Print out every rule with 2 items on the left hand side,
# 1 item on the right hand side, sorted by lift
rules_rhs = filter(lambda rule: len(rule.lhs) == 1 and len(rule.rhs) == 1, rules)

In [7]:
list(rules_rhs)

[{eggs} -> {bacon}, {soup} -> {bacon}]

In [8]:
print(dir(rules[0]))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_decimals', '_pf', 'confidence', 'conviction', 'count_full', 'count_lhs', 'count_rhs', 'lhs', 'lift', 'num_transactions', 'rhs', 'support']


In [9]:
rules_rhs = filter(lambda rule: len(rule.lhs) == 1 and len(rule.rhs) == 1, rules)
result = sorted(rules_rhs, key=lambda rule: rule.lift)
print(list(result))

[{eggs} -> {bacon}, {soup} -> {bacon}]


In [10]:
for rule in result:
    print(rule) # Prints the rule and its confidence, support, lift, ...

{eggs} -> {bacon} (conf: 1.000, supp: 0.667, lift: 1.000, conv: 0.000)
{soup} -> {bacon} (conf: 1.000, supp: 0.667, lift: 1.000, conv: 0.000)


Lift(A→B) = (Confidence (A→B))/(Support (B))  
refers to the increase in the ratio of sale of B when A is sold  
P(A,B)/(P(A)P(B))

# 3. PPT中的例子

In [11]:
from efficient_apriori import apriori
transactions = [('A', 'C', 'D'),
                ('B', 'C', 'E'),
                ('A', 'B', 'C', 'E'),
                ('B', 'E')]

In [12]:
itemsets, rules = apriori(transactions, min_support=0.5,  min_confidence=1)

In [13]:
itemsets

{1: {('A',): 2, ('B',): 3, ('C',): 3, ('E',): 3},
 2: {('A', 'C'): 2, ('B', 'C'): 2, ('B', 'E'): 3, ('C', 'E'): 2},
 3: {('B', 'C', 'E'): 2}}

In [14]:
rules

[{A} -> {C}, {E} -> {B}, {B} -> {E}, {C, E} -> {B}, {B, C} -> {E}]

# 4. 大数据关联规则

如果要分析的数据较大，无法直接导入内存，可以传递一个返回生成器的函数，而不是交易的列表

In [15]:
import pandas as pd

In [16]:
def data_generator(filename):
    """
    Data generator, needs to return a generator to be called several times.
    """
    def data_gen():
        with open(filename) as file:
            for line in file:
                yield tuple(k.strip() for k in line.split(','))
    return data_gen

In [17]:
# file_path = "https://github.com/seratch/apriori.js/blob/master/dataset.csv"
transactions = data_generator("dataset.csv")
itemsets, rules = apriori(transactions, min_support=0.5,  min_confidence=1)

In [18]:
itemsets

{1: {('A',): 2, ('B',): 3, ('C',): 3, ('E',): 3},
 2: {('A', 'C'): 2, ('B', 'C'): 2, ('B', 'E'): 3, ('C', 'E'): 2},
 3: {('B', 'C', 'E'): 2}}

In [19]:
rules

[{A} -> {C}, {E} -> {B}, {B} -> {E}, {C, E} -> {B}, {B, C} -> {E}]

In [20]:
transactions_2 = data_generator("store_data.csv")
itemsets_2, rules_2 = apriori(transactions_2, min_support=0.0045,  min_confidence=0.2)

In [21]:
rules

[{A} -> {C}, {E} -> {B}, {B} -> {E}, {C, E} -> {B}, {B, C} -> {E}]

In [22]:
for rule in rules[:10]:
    print(rule)

{A} -> {C} (conf: 1.000, supp: 0.500, lift: 1.333, conv: 250000000.000)
{E} -> {B} (conf: 1.000, supp: 0.750, lift: 1.333, conv: 250000000.000)
{B} -> {E} (conf: 1.000, supp: 0.750, lift: 1.333, conv: 250000000.000)
{C, E} -> {B} (conf: 1.000, supp: 0.500, lift: 1.333, conv: 250000000.000)
{B, C} -> {E} (conf: 1.000, supp: 0.500, lift: 1.333, conv: 250000000.000)
