In [18]:
def loadDataSet():
    return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]

def createC1(dataSet):
    C1 = []  #创建空列表，用来存储所有不重复项值
    for transaction in dataSet:
        for item in transaction:
            if not [item] in C1:
                C1.append([item])  #添加只包含该项值的一个列表
    C1.sort()
    return list(map(frozenset, C1))  ##对C1中每个项构成一个不可改变的集合，因为之后必须将这些集合作为字典键值使用

def scanD(D, Ck, minSupport):
    '''
    该函数用于从C1生成L1,同时会返回一个包含支持度值的字典以备后用
    D:数据集
    Ck:候选项集列表
    minSupport:感兴趣项集的最小支持度
    '''
    ssCnt = {}  #创建一个空字典
    for tid in D:
        for can in Ck:
            if can.issubset(tid):
                if can not in ssCnt: ssCnt[can] = 1
                else: ssCnt[can] += 1
    D = list(map(set,D))
    numItems = float(len(D))
    retList = []
    supportData = {}
    for key in ssCnt:
        support = ssCnt[key]/numItems  ## 计算所有项集的支持度
        if support >= minSupport:
            retList.insert(0, key)
        supportData[key] = support
    return retList, supportData



In [19]:
dataSet = loadDataSet()
print(dataSet)
C1 = createC1(dataSet)
print(C1)

[[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]
[frozenset({1}), frozenset({2}), frozenset({3}), frozenset({4}), frozenset({5})]


In [20]:
D = list(map(set, dataSet))
print(D)

[{1, 3, 4}, {2, 3, 5}, {1, 2, 3, 5}, {2, 5}]


In [21]:
L1, suppData0 = scanD(D, C1, 0.5)
print(L1)

[frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})]


In [22]:
def aprioriGen(Lk, k):
    '''
    Args:
     Lk: 频繁项集列表Lk
     k:项集元素个数k
     
    Returns:
    Ck: 以{0}，{1}，{2}作为输入，得到{0,1}，{1,2}，{0,2}
    '''
    retList = []
    lenLk = len(Lk) #计算Lk的元素数目
    for i in range(lenLk):
        for j in range(i+1, lenLk):
            L1 = list(Lk[i])[:k-2]
            L2 = list(Lk[j])[:k-2]
            L1.sort()
            L2.sort()
            if L1==L2:
                retList.append(Lk[i] | Lk[j])  #前k-2项相同时，将两个集合合并
    return retList

def apriori(dataSet, minSupport = 0.5):
    '''
    Args:
      dataSet: 数据集
      minSupport: 最小支持度
    
    Returns:
      
    '''
    C1 = createC1(dataSet)
    D = list(map(set, dataSet)) #列表集合
    L1, supportData = scanD(D, C1, minSupport)
    L = [L1]  #L1放入列表L
    k = 2
    while (len(L[k-2]) > 0):
        Ck = aprioriGen(L[k-2], k)  #创建Ck:候选项集列表
        Lk, supK = scanD(D, Ck, minSupport)  #扫描数据集，丢掉不满足最小支持度要求的项集，从Ck得到Lk
        supportData.update(supK)
        L.append(Lk)
        k += 1
    return L, supportData

In [23]:
L,suppData = apriori(dataSet)
print(L)

[[frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})], [frozenset({2, 3}), frozenset({3, 5}), frozenset({2, 5}), frozenset({1, 3})], [frozenset({2, 3, 5})], []]


In [24]:
L[0]

[frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})]

In [25]:
L[1]

[frozenset({2, 3}), frozenset({3, 5}), frozenset({2, 5}), frozenset({1, 3})]

In [26]:
L[2]

[frozenset({2, 3, 5})]

In [27]:
L[3]

[]

In [28]:
aprioriGen(L[0], 2)

[frozenset({2, 5}),
 frozenset({3, 5}),
 frozenset({1, 5}),
 frozenset({2, 3}),
 frozenset({1, 2}),
 frozenset({1, 3})]

In [29]:
L, suppData = apriori(dataSet, minSupport=0.7)
print(L)

[[frozenset({5}), frozenset({2}), frozenset({3})], [frozenset({2, 5})], []]


In [30]:
suppData

{frozenset({1}): 0.5,
 frozenset({3}): 0.75,
 frozenset({4}): 0.25,
 frozenset({2}): 0.75,
 frozenset({5}): 0.75,
 frozenset({2, 5}): 0.75,
 frozenset({3, 5}): 0.5,
 frozenset({2, 3}): 0.5}

频繁项集的量化定义：满足最小支持度要求。关联规则，也有类似量化方法，这种指标称为可信度。
如果某条规则并不满足最小可信度要求，那么该规则的所有子集也不会满足最小可信度要求。可以使用该性质减少需要测试的规则数目


In [36]:
def generateRules(L, supportData, minConf=0.7):
    '''
    主函数
    Args:
      L: 频繁项集列表
      supportData:包含频繁项集支持数据的字典
      minConf:最小可信度阈值，默认为0.7
    
    Returns:
      bigRuleList: 生成一个包含可信度的规则列表
    
    '''
    bigRuleList = []
    for i in range(1, len(L)):
        for freqSet in L[i]:  #无法从单元素项集中构建关联规则，所以要从包含两个或更多元素的项集开始
            H1 = [frozenset([item]) for item in freqSet]  #遍历L中的每个频繁项集并对每个频繁项集创建只包含单个元素集合的列表H1
            if i > 1:
                rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
            else:
                calcConf(freqSet, H1, supportData, bigRuleList, minConf)
    return bigRuleList

def calcConf(freqSet, H, supportData, brl, minConf=0.7):
    '''
    对规则进行评估：计算规则的可信度及找到满足最小可信度要求的规则
    
    Returns:
      prunedH:返回满足最小可信度要求的规则列表
    '''
    prunedH = []  #创建空列表
    for conseq in H:  #遍历所有项集并计算可信度
        conf = supportData[freqSet]/supportData[freqSet-conseq]
        if conf >= minConf:  #如果满足规则，输出到屏幕
            print(freqSet - conseq,'--->',conseq, 'conf:', conf)
            brl.append((freqSet-conseq, conseq, conf)) #对brl进行填充
            prunedH.append(conseq)
    return prunedH

def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    '''
    生成候选规则集合
    Args:
      freqSet:
    '''
    m = len(H[0])  #计算H中的频繁集大小m
    if(len(freqSet) > (m + 1)):
        Hmp1 = aprioriGen(H, m+1)  #使用aprioriGen()生成H中元素的无重复组合，保存在Hmp1中，并做为下次迭代的列表
        Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf) #测试可信度以确定规则是否满足要求
        if (len(Hmp1) > 1): #如果不止一条规则满足要求
            rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf) #调用函数判断是否可以进一步组合这些规则
            

In [32]:
L1, suppData1 = apriori(dataSet, minSupport=0.5)
print(L1)
print(suppData1)

[[frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})], [frozenset({2, 3}), frozenset({3, 5}), frozenset({2, 5}), frozenset({1, 3})], [frozenset({2, 3, 5})], []]
{frozenset({1}): 0.5, frozenset({3}): 0.75, frozenset({4}): 0.25, frozenset({2}): 0.75, frozenset({5}): 0.75, frozenset({1, 3}): 0.5, frozenset({2, 5}): 0.75, frozenset({3, 5}): 0.5, frozenset({2, 3}): 0.5, frozenset({1, 5}): 0.25, frozenset({1, 2}): 0.25, frozenset({2, 3, 5}): 0.5}


In [37]:
rules = generateRules(L1, suppData1, minConf=0.7)
print(rules)

frozenset({5}) ---> frozenset({2}) conf: 1.0
frozenset({2}) ---> frozenset({5}) conf: 1.0
frozenset({1}) ---> frozenset({3}) conf: 1.0
[(frozenset({5}), frozenset({2}), 1.0), (frozenset({2}), frozenset({5}), 1.0), (frozenset({1}), frozenset({3}), 1.0)]


In [38]:
rules2 = generateRules(L1, suppData1, minConf=0.5)
print(rules2)

frozenset({3}) ---> frozenset({2}) conf: 0.6666666666666666
frozenset({2}) ---> frozenset({3}) conf: 0.6666666666666666
frozenset({5}) ---> frozenset({3}) conf: 0.6666666666666666
frozenset({3}) ---> frozenset({5}) conf: 0.6666666666666666
frozenset({5}) ---> frozenset({2}) conf: 1.0
frozenset({2}) ---> frozenset({5}) conf: 1.0
frozenset({3}) ---> frozenset({1}) conf: 0.6666666666666666
frozenset({1}) ---> frozenset({3}) conf: 1.0
frozenset({5}) ---> frozenset({2, 3}) conf: 0.6666666666666666
frozenset({3}) ---> frozenset({2, 5}) conf: 0.6666666666666666
frozenset({2}) ---> frozenset({3, 5}) conf: 0.6666666666666666
[(frozenset({3}), frozenset({2}), 0.6666666666666666), (frozenset({2}), frozenset({3}), 0.6666666666666666), (frozenset({5}), frozenset({3}), 0.6666666666666666), (frozenset({3}), frozenset({5}), 0.6666666666666666), (frozenset({5}), frozenset({2}), 1.0), (frozenset({2}), frozenset({5}), 1.0), (frozenset({3}), frozenset({1}), 0.6666666666666666), (frozenset({1}), frozenset(

示例：发现毒蘑菇的相似特征

* 第一个特征表示有毒或者可食用。样本有毒，值为2。下一个特征为蘑菇伞的形状。

In [39]:
mushDatSet = [line.split() for line in open('mushroom.dat').readlines()]
print(mushDatSet[5])

['2', '3', '10', '14', '23', '26', '34', '36', '39', '41', '52', '55', '59', '63', '67', '76', '85', '86', '90', '93', '98', '108', '114']


In [41]:
print(mushDatSet[1])

['2', '3', '9', '14', '23', '26', '34', '36', '39', '40', '52', '55', '59', '63', '67', '76', '85', '86', '90', '93', '99', '108', '114']


In [43]:
L2,suppData2 = apriori(mushDatSet, minSupport=0.3)
for item in L2[1]:
    if item.intersection('2'): print(item)

frozenset({'28', '2'})
frozenset({'53', '2'})
frozenset({'23', '2'})
frozenset({'34', '2'})
frozenset({'36', '2'})
frozenset({'59', '2'})
frozenset({'63', '2'})
frozenset({'67', '2'})
frozenset({'76', '2'})
frozenset({'85', '2'})
frozenset({'86', '2'})
frozenset({'90', '2'})
frozenset({'93', '2'})
frozenset({'39', '2'})


In [44]:
print(suppData2)

{frozenset({'1'}): 0.48202855736090594, frozenset({'107'}): 0.1536189069423929, frozenset({'113'}): 0.04529788281634663, frozenset({'13'}): 0.2811422944362383, frozenset({'23'}): 0.4155588380108321, frozenset({'25'}): 0.03151157065484983, frozenset({'3'}): 0.4500246184145741, frozenset({'34'}): 0.9741506646971935, frozenset({'36'}): 0.8385032003938946, frozenset({'38'}): 0.30920728705071393, frozenset({'40'}): 0.050221565731166914, frozenset({'52'}): 0.4327917282127031, frozenset({'54'}): 0.137863121614968, frozenset({'59'}): 0.637124569177745, frozenset({'63'}): 0.6075824716888233, frozenset({'67'}): 0.5494830132939439, frozenset({'76'}): 0.5396356474643033, frozenset({'85'}): 1.0, frozenset({'86'}): 0.9753815854258986, frozenset({'9'}): 0.31462333825701627, frozenset({'90'}): 0.9217134416543574, frozenset({'93'}): 0.48842934515017233, frozenset({'98'}): 0.23042836041358936, frozenset({'108'}): 0.049236829148202856, frozenset({'114'}): 0.26440177252584934, frozenset({'14'}): 0.1319547

In [45]:
for item in L2[3]:
    if item.intersection('2'): print(item)

frozenset({'28', '63', '2', '34'})
frozenset({'28', '2', '85', '34'})
frozenset({'28', '90', '2', '34'})
frozenset({'28', '2', '34', '53'})
frozenset({'28', '59', '2', '34'})
frozenset({'28', '59', '63', '2'})
frozenset({'28', '59', '2', '85'})
frozenset({'28', '59', '2', '90'})
frozenset({'28', '63', '2', '85'})
frozenset({'28', '86', '2', '34'})
frozenset({'28', '86', '59', '2'})
frozenset({'28', '86', '63', '2'})
frozenset({'28', '86', '2', '85'})
frozenset({'28', '86', '2', '90'})
frozenset({'28', '86', '2', '53'})
frozenset({'28', '2', '85', '90'})
frozenset({'28', '2', '90', '53'})
frozenset({'28', '2', '85', '53'})
frozenset({'28', '2', '39', '53'})
frozenset({'28', '2', '39', '34'})
frozenset({'28', '59', '2', '39'})
frozenset({'28', '63', '2', '39'})
frozenset({'28', '2', '39', '85'})
frozenset({'28', '86', '2', '39'})
frozenset({'28', '2', '39', '90'})
frozenset({'2', '85', '34', '53'})
frozenset({'86', '2', '34', '53'})
frozenset({'86', '85', '2', '53'})
frozenset({'86', '2'

In [47]:
for item in L2[2]:
    if item.intersection('2'): 
        print(item)

frozenset({'39', '53', '2'})
frozenset({'90', '53', '2'})
frozenset({'86', '53', '2'})
frozenset({'2', '53', '85'})
frozenset({'34', '53', '2'})
frozenset({'39', '28', '2'})
frozenset({'28', '53', '2'})
frozenset({'28', '90', '2'})
frozenset({'28', '86', '2'})
frozenset({'28', '2', '85'})
frozenset({'28', '63', '2'})
frozenset({'28', '59', '2'})
frozenset({'28', '34', '2'})
frozenset({'39', '93', '2'})
frozenset({'39', '90', '2'})
frozenset({'39', '86', '2'})
frozenset({'39', '2', '85'})
frozenset({'39', '2', '67'})
frozenset({'39', '63', '2'})
frozenset({'39', '59', '2'})
frozenset({'39', '36', '2'})
frozenset({'39', '34', '2'})
frozenset({'39', '23', '2'})
frozenset({'93', '90', '2'})
frozenset({'85', '93', '2'})
frozenset({'93', '63', '2'})
frozenset({'93', '34', '2'})
frozenset({'85', '90', '2'})
frozenset({'90', '63', '2'})
frozenset({'90', '34', '2'})
frozenset({'86', '93', '2'})
frozenset({'86', '90', '2'})
frozenset({'85', '86', '2'})
frozenset({'67', '86', '2'})
frozenset({'86