# 数据准备

In [1]:
import numpy as np
# Sample transactions for the Apriori demo; each inner list is one basket.
# The rows have different lengths, so dtype=object is spelled out: NumPy >= 1.24
# raises ValueError when asked to build a ragged array implicitly. The result
# is a 1-D object array whose elements are the original Python lists.
dataset = np.array([[1, 3, 4],
                    [2, 3, 5],
                    [1, 2, 3, 5],
                    [2, 5]], dtype=object)

In [2]:
def create_1st_itemset(dataset):
    """Build the candidate 1-itemsets from the raw transactions.

    Args:
        dataset: Iterable of transactions, each an iterable of hashable,
            mutually orderable items.

    Returns:
        A list holding one single-element ``frozenset`` per distinct item,
        in ascending item order. ``frozenset`` is used because it is
        hashable and can therefore serve as a dictionary key.
    """
    # A set deduplicates in O(1) per item instead of rescanning a list.
    unique_items = {item for transaction in dataset for item in transaction}
    return [frozenset([item]) for item in sorted(unique_items)]

In [3]:
def itemset_filter(dataset, itemsets, min_sup=0.5):
    """Keep the candidate itemsets whose support reaches ``min_sup``.

    The support of a candidate is the number of transactions containing all
    of its items, divided by the total number of transactions.

    Args:
        dataset: Iterable of transactions, each an iterable of hashable items.
        itemsets: Iterable of candidate itemsets given as ``frozenset``s.
        min_sup: Minimum support (inclusive) for a candidate to be frequent.

    Returns:
        A ``(freq_sets, sup_dict)`` tuple. ``freq_sets`` lists the candidates
        with support >= ``min_sup``. ``sup_dict`` maps every candidate that
        occurs in at least one transaction to its support; candidates that
        never occur are absent from it.
    """
    # Convert each transaction to a set once, so the subset tests below do
    # not rebuild it for every candidate.
    transactions = [frozenset(transaction) for transaction in dataset]
    # Drop repeated candidates (keeping first-seen order); otherwise a
    # duplicate would be counted once per occurrence and inflate its support.
    candidates = list(dict.fromkeys(itemsets))

    # Count, for each candidate, the transactions that contain it.
    counts = {}
    for transaction in transactions:
        for candidate in candidates:
            if candidate.issubset(transaction):
                counts[candidate] = counts.get(candidate, 0) + 1

    # count / total = support. With no transactions, counts is empty and the
    # division is never reached.
    total = len(transactions)
    freq_sets = []
    sup_dict = {}
    for candidate, count in counts.items():
        support = count / total
        sup_dict[candidate] = support
        if support >= min_sup:
            freq_sets.append(candidate)

    return freq_sets, sup_dict

In [4]:
def extend_itemset(freq_sets):
    """Generate candidate (k+1)-itemsets from frequent k-itemsets.

    Two itemsets are joined when, with their items in sorted order, they
    agree on everything except the last item. Joining distinct itemsets this
    way yields each candidate at most once.

    Args:
        freq_sets: Sequence of itemsets (``frozenset``s) of equal size whose
            items are mutually orderable.

    Returns:
        A list with the union of every pair of joined itemsets, ordered by
        the position of the pair in ``freq_sets``.
    """
    # frozenset iteration order is arbitrary, so each itemset is sorted
    # before its head is taken; comparing unsorted prefixes can miss valid
    # joins. Heads are computed once here rather than once per pair.
    heads = [sorted(itemset)[:-1] for itemset in freq_sets]

    res = []
    for i, head in enumerate(heads):
        for j in range(i + 1, len(heads)):    # every unordered pair
            if head == heads[j]:
                res.append(freq_sets[i] | freq_sets[j])
    return res


# itemsets_1st= create_1st_itemset(dataset)
# freq_sets_1st,_=itemset_filter(dataset, itemsets_1st, 0.5)
# itemsets_2nd = extend_itemset(freq_sets_1st)
# freq_sets_2nd, _ = itemset_filter(dataset, itemsets_2nd, 0.5)
# itemsets_3th = extend_itemset(freq_sets_2nd)
# freq_sets_3th, _ = itemset_filter(dataset, itemsets_3th, 0.5)
# freq_sets_3th

In [5]:
def apriori(dataset, min_sup=0.5):
    """Mine frequent itemsets from ``dataset`` one level at a time.

    Starting from the single-item candidates, each round extends the latest
    frequent itemsets into new candidates and filters them by ``min_sup``,
    until a round yields no frequent itemsets.

    Args:
        dataset: The transactions to mine.
        min_sup: Minimum support passed to every filtering round.

    Returns:
        A ``(levels, supports)`` tuple. ``levels`` starts with an empty
        placeholder list, followed by the frequent itemsets found in each
        round; its last entry is the empty round that ended the search.
        ``supports`` merges the support dictionaries of all rounds.
    """
    singletons = create_1st_itemset(dataset)
    level, supports = itemset_filter(dataset, singletons, min_sup)
    levels = [[], level]

    # Keep extending while the most recent round still found something.
    while len(level) > 0:
        candidates = extend_itemset(level)
        level, level_supports = itemset_filter(dataset, candidates, min_sup)
        supports.update(level_supports)
        levels.append(level)

    return levels, supports


# Run Apriori on the sample dataset with the default min_sup of 0.5.
apriori(dataset)

([[],
  [frozenset({1}), frozenset({3}), frozenset({2}), frozenset({5})],
  [frozenset({1, 3}), frozenset({2, 3}), frozenset({3, 5}), frozenset({2, 5})],
  [frozenset({2, 3, 5})],
  []],
 {frozenset({1}): 0.5,
  frozenset({3}): 0.75,
  frozenset({4}): 0.25,
  frozenset({2}): 0.75,
  frozenset({5}): 0.75,
  frozenset({1, 3}): 0.5,
  frozenset({2, 3}): 0.5,
  frozenset({3, 5}): 0.5,
  frozenset({2, 5}): 0.75,
  frozenset({1, 2}): 0.25,
  frozenset({1, 5}): 0.25,
  frozenset({2, 3, 5}): 0.5})