### Aprior算法实现

算法实现要点：

* 通过pandas的`DataFrame`来实现对`项集（itemset）`的支持度计数
* 项集的集合`C`和`L`都是DataFrame对象

----------
#### Implementation start

In [1]:
import pandas as pd
import numpy as np

def get_C1(data: pd.DataFrame):
    
    C1 = {}
    for col in data:
        for i in data[col]:
            C1.setdefault(i, 0)
            C1[i] = C1[i] + 1

    if np.nan in C1:
        del C1[np.nan]
        
    C1 = pd.DataFrame({'itemset': [set([s]) for s in list(C1.keys())], 'count': list(C1.values())}) #注意这一步将dict的key转化为set的做法
    
    C1 = C1[['itemset','count']]
    
    return C1

In [2]:
def trim_C(C, min_sup):
    
    L =  C[C['count'] >= min_sup]
    L = L[['itemset','count']]
    return L

import itertools

def _findsubsets(s,m):
    return set(itertools.combinations(s, m))


def _connect(L: pd.DataFrame):
    pre_C  = []
    Lkeys = list(L.itemset)
    
    # 需要限制可以进行连接的情况
    lengthOfL = len(Lkeys)
    for i in range(lengthOfL-1):
        listed_i = list(Lkeys[i])
        listed_i.sort()
        for j in range(i+1, lengthOfL):
            listed_j = list(Lkeys[j])
            listed_j.sort()
            if listed_i[:-1] == listed_j[:-1]:
                pre_C.append(Lkeys[i] | Lkeys[j])
    return pre_C


def _remove_candidate_has_infrequent_subset(pre_C: list, k, L_: dict):
    for s in pre_C: 
        for subset in _findsubsets(s, k-1):
            if set(subset) not in list(L_[k-1].itemset):
                if pre_C != []: #防止pre_C已空的情况下,继续remove报错
                    pre_C.remove(s)
    return pre_C


def _count_support(pre_C, D):
    # 创建C的骨架(dataframe)
    C = pd.DataFrame(columns=['itemset', 'count'])
    for i in pre_C:
        C = C.append([{'itemset':i, 'count':0}], ignore_index=True)#非in-place操作,注意赋值回去给C
    
    for index, row in D.iterrows():
        row = list(row)
        for cb in pre_C:
            if cb <= set(row):
                C.loc[C.itemset==cb, 'count'] += 1
    return C


# 由L1生成C2
#apriori_gen
def L2C(L: pd.DataFrame, D: pd.DataFrame, k: int, L_: list):
    
    # 1. 连接
    pre_C = _connect(L)

    #2. 剪枝, 删除非频繁候选
    pre_C = _remove_candidate_has_infrequent_subset(pre_C, k, L_)
     
    #3. 支持度计数
    C= _count_support(pre_C, D)
    
    return C

In [3]:
def confidence(L: pd.DataFrame, L_):
    itemset_ses = L['itemset']
    k = len(itemset_ses[0])
    conf_df = pd.DataFrame()
    for itemset in itemset_ses:        
        for ik in range(1, k):
            subsets = _findsubsets(itemset, ik)
            for subset in subsets:
                subset = set(subset)
                diffset = itemset - subset #求出差集
                c_itemset = L.loc[L['itemset'] == itemset, 'count'].values[0]
                _L = L_[ik]
                c_subset = _L.loc[_L['itemset'] == subset, 'count'].values[0]
                conf = c_itemset / c_subset
                conf_df= conf_df.append(
                    {'itemset': itemset, 'start': subset, 'subset_count': c_subset,  'end': diffset, 'itemset_count': c_itemset, 'conf': "{0:.0f}%".format(conf*100)},
                    ignore_index=True)
                # conf为numpy.float64对象
                # python内置round()函数针对numpy.float64不能正确工作，方法是先将numpy.float64转换为python原生的float
    conf_df = conf_df[['itemset', 'start','end', 'subset_count',  'itemset_count', 'conf']]
    return conf_df

#### Implementation end

--------------

In [4]:
inputfile = '../PPDAM/chapter5/demo/data/DMCT_menu_orders.xls'

data = pd.read_excel(inputfile, header=None)

In [5]:
C1 = get_C1(data)
L1 = trim_C(C1,2)
#L1 = L1[['itemset','count']] #调整L1列的顺序, 顺序调整: count itemset -> itemset count
L_ = {1: L1}
C_ = {1: C1}
k = 1

while not L_[k].empty:
    #print(L_[k])
    C_[k+1] = L2C(L_[k], data, k+1, L_)
    L_[k+1] = trim_C(C_[k+1], 2) #指定最小支持度计数(min_sup)为2
    k += 1

In [6]:
from IPython.display import display
for i in L_:
    display(L_[i])

Unnamed: 0,itemset,count
0,{I1},6
1,{I2},7
2,{I4},2
3,{I3},6
4,{I5},2


Unnamed: 0,itemset,count
0,"{I1, I2}",4
2,"{I1, I3}",4
3,"{I1, I5}",2
4,"{I4, I2}",2
5,"{I3, I2}",4
6,"{I5, I2}",2


Unnamed: 0,itemset,count
0,"{I3, I1, I2}",2
1,"{I1, I5, I2}",2


Unnamed: 0,itemset,count


In [7]:
L2 = L_[2]

In [8]:
L2 = L_[2]

In [9]:
L2

Unnamed: 0,itemset,count
0,"{I1, I2}",4
2,"{I1, I3}",4
3,"{I1, I5}",2
4,"{I4, I2}",2
5,"{I3, I2}",4
6,"{I5, I2}",2


In [10]:
confidence(L2, L_)

Unnamed: 0,itemset,start,end,subset_count,itemset_count,conf
0,"{I1, I2}",{I2},{I1},7.0,4.0,57%
1,"{I1, I2}",{I1},{I2},6.0,4.0,67%
2,"{I1, I3}",{I3},{I1},6.0,4.0,67%
3,"{I1, I3}",{I1},{I3},6.0,4.0,67%
4,"{I1, I5}",{I1},{I5},6.0,2.0,33%
5,"{I1, I5}",{I5},{I1},2.0,2.0,100%
6,"{I4, I2}",{I2},{I4},7.0,2.0,29%
7,"{I4, I2}",{I4},{I2},2.0,2.0,100%
8,"{I3, I2}",{I3},{I2},6.0,4.0,67%
9,"{I3, I2}",{I2},{I3},7.0,4.0,57%


In [11]:
def sort_by_conf(df: pd.DataFrame):
    return df.reindex(index=df.conf.str.rstrip('%').astype(float).sort_values(ascending=False).index)


L3 = L_[3]
df = confidence(L3, L_)
df = sort_by_conf(df)

In [12]:
df

Unnamed: 0,itemset,start,end,subset_count,itemset_count,conf
11,"{I1, I5, I2}","{I1, I5}",{I2},2.0,2.0,100%
10,"{I1, I5, I2}","{I5, I2}",{I1},2.0,2.0,100%
8,"{I1, I5, I2}",{I5},"{I1, I2}",2.0,2.0,100%
9,"{I1, I5, I2}","{I1, I2}",{I5},4.0,2.0,50%
5,"{I3, I1, I2}","{I1, I3}",{I2},4.0,2.0,50%
4,"{I3, I1, I2}","{I2, I3}",{I1},4.0,2.0,50%
3,"{I3, I1, I2}","{I1, I2}",{I3},4.0,2.0,50%
7,"{I1, I5, I2}",{I1},"{I5, I2}",6.0,2.0,33%
1,"{I3, I1, I2}",{I1},"{I2, I3}",6.0,2.0,33%
0,"{I3, I1, I2}",{I3},"{I1, I2}",6.0,2.0,33%


In [13]:
sort_by_conf(df[df['itemset'] == set(['I1','I2','I5'])])

Unnamed: 0,itemset,start,end,subset_count,itemset_count,conf
8,"{I1, I5, I2}",{I5},"{I1, I2}",2.0,2.0,100%
10,"{I1, I5, I2}","{I5, I2}",{I1},2.0,2.0,100%
11,"{I1, I5, I2}","{I1, I5}",{I2},2.0,2.0,100%
9,"{I1, I5, I2}","{I1, I2}",{I5},4.0,2.0,50%
7,"{I1, I5, I2}",{I1},"{I5, I2}",6.0,2.0,33%
6,"{I1, I5, I2}",{I2},"{I1, I5}",7.0,2.0,29%


### FP-Growth算法实现

步骤1. 第一次扫描事物数据库D，得到频繁1项集L，其中itemset按照计数递减排列

In [14]:
L = C_[1].sort_values(by='count', ascending=False)
L

Unnamed: 0,itemset,count
1,{I2},7
0,{I1},6
3,{I3},6
2,{I4},2
4,{I5},2


In [15]:
order =  [list(s)[0]  for s in L.itemset]

步骤2. 第二次扫描数据库D，构造`FP tree`和`headTable`

In [16]:
data

Unnamed: 0,0,1,2,3
0,I1,I2,I5,
1,I2,I4,,
2,I2,I3,,
3,I1,I2,I4,
4,I1,I3,,
5,I2,I3,,
6,I1,I3,,
7,I1,I2,I3,I5
8,I1,I2,I3,


In [17]:

for r in data.iterrows():
    # r是一个tuple，第一项是该行Index，第二行是该行内容的Series
    print(
        sorted([i for i in r[1].dropna()], key=lambda v: order.index(v))
    )

['I2', 'I1', 'I5']
['I2', 'I4']
['I2', 'I3']
['I2', 'I1', 'I4']
['I1', 'I3']
['I2', 'I3']
['I1', 'I3']
['I2', 'I1', 'I3', 'I5']
['I2', 'I1', 'I3']


#### Implementation start

credits: https://blog.csdn.net/gamer_gyt/article/details/51113753

------------

In [18]:
class treeNode:
    def __init__(self, nameValue, numOccur, parentNode):
        self.name = nameValue
        self.count = numOccur
        self.nodeLink = None
        self.parent = parentNode
        self.children = {}
 
    def inc(self, numOccur):
        self.count += numOccur
 
    def disp(self, ind=1): #ind: indentation
        print('*' * ind, self.name, ' ', self.count)
        for child in self.children.values():
            child.disp(ind + 1)

In [19]:
rootNode = treeNode('pyramid', 9, None)

In [20]:
rootNode.children['eye'] = treeNode('eye', 13, None)

In [21]:
rootNode.children['phoenix'] = treeNode('phoenix', 3, None)

In [22]:
rootNode.disp()

* pyramid   9
** eye   13
** phoenix   3


In [23]:
def createTree(dataSet, minSup=2):
    ''' 创建FP树 '''
    # 第一次遍历数据集，创建头指针表
    headerTable = {}
    for trans in dataSet:
        for item in trans:
            headerTable[item] = headerTable.get(item, 0) + dataSet[trans]
    # 移除不满足最小支持度的元素项
    for k in list(headerTable.keys()):
        if headerTable[k] < minSup:
            del(headerTable[k])
    # 空元素集，返回空
    freqItemSet = set(headerTable.keys())
    if len(freqItemSet) == 0:
        return None, None
    # 增加一个数据项，用于存放指向相似元素项指针
    for k in headerTable:
        headerTable[k] = [headerTable[k], None]
    retTree = treeNode('Null Set', 1, None) # 根节点
    # 第二次遍历数据集，创建FP树
    for tranSet, count in dataSet.items():
        localD = {} # 对一个项集tranSet，记录其中每个元素项的全局频率，用于排序
        for item in tranSet:
            if item in freqItemSet:
                localD[item] = headerTable[item][0] # 注意这个[0]，因为之前加过一个数据项
        if len(localD) > 0:
            orderedItems = [v[0] for v in sorted(localD.items(), key=lambda p: p[1], reverse=True)] # 排序
            updateTree(orderedItems, retTree, headerTable, count) # 更新FP树
    return retTree, headerTable

In [24]:
def updateTree(items, inTree, headerTable, count):
    if items[0] in inTree.children:
        # 有该元素项时计数值+1
        inTree.children[items[0]].inc(count)
    else:
        # 没有这个元素项时创建一个新节点
        inTree.children[items[0]] = treeNode(items[0], count, inTree)
        # 更新头指针表或前一个相似元素项节点的指针指向新节点
        if headerTable[items[0]][1] == None:
            headerTable[items[0]][1] = inTree.children[items[0]]
        else:
            updateHeader(headerTable[items[0]][1], inTree.children[items[0]])
 
    if len(items) > 1:
        # 对剩下的元素项迭代调用updateTree函数
        updateTree(items[1::], inTree.children[items[0]], headerTable, count)

In [25]:
def updateHeader(nodeToTest, targetNode):
    while (nodeToTest.nodeLink != None):
        nodeToTest = nodeToTest.nodeLink
    nodeToTest.nodeLink = targetNode

In [26]:
def loadSimpDat(data):
#     simpDat = [['r', 'z', 'h', 'j', 'p'],
#                ['z', 'y', 'x', 'w', 'v', 'u', 't', 's'],
#                ['z'],
#                ['r', 'x', 'n', 'o', 's'],
#                ['y', 'r', 'x', 'z', 'q', 't', 'p'],
#                ['y', 'z', 'x', 'e', 'q', 's', 't', 'm']]
    simpDat = []
    for r in data.iterrows():
        simpDat.append([i for i in r[1].dropna()])
    return simpDat
 
def createInitSet(dataSet):
    retDict = {}
    for trans in dataSet:
        retDict[frozenset(trans)] = 1
    return retDict

In [27]:
simpDat = loadSimpDat(data)
initSet = createInitSet(simpDat)
myFPtree, myHeaderTab = createTree(initSet, 2)
myFPtree.disp()

* Null Set   1
** I2   6
*** I1   4
**** I5   1
**** I4   1
**** I3   2
***** I5   1
*** I4   1
*** I3   1
** I1   1
*** I3   1


In [28]:
def findPrefixPath(basePat, treeNode):
    ''' 创建前缀路径 '''
    condPats = {}
    while treeNode != None:
        prefixPath = []
        ascendTree(treeNode, prefixPath)
        if len(prefixPath) > 1:
            condPats[frozenset(prefixPath[1:])] = treeNode.count
        treeNode = treeNode.nodeLink
    return condPats

In [29]:
def ascendTree(leafNode, prefixPath):
    if leafNode.parent != None:
        prefixPath.append(leafNode.name)
        ascendTree(leafNode.parent, prefixPath)

In [30]:
def mineTree(inTree, headerTable, minSup, preFix, freqItemList):
    #print('headerTable', headerTable)
    bigL = [v[0] for v in sorted(list(headerTable.items()), key=lambda p: p[1][0])] #注意headerTable的构造
    for basePat in bigL:
        newFreqSet = preFix.copy()
        newFreqSet.add(basePat)
        freqItemList.append(newFreqSet)
        condPattBases = findPrefixPath(basePat, headerTable[basePat][1])
        myCondTree, myHead = createTree(condPattBases, minSup)
 
        if myHead != None:
            # 用于测试
            print('conditional tree for:', newFreqSet)
            myCondTree.disp()
 
            mineTree(myCondTree, myHead, minSup, newFreqSet, freqItemList)

In [31]:
def fpGrowth(dataSet, minSup=2):
    initSet = createInitSet(dataSet)
    myFPtree, myHeaderTab = createTree(initSet, minSup)
    freqItems = []
    mineTree(myFPtree, myHeaderTab, minSup, set([]), freqItems)
    return freqItems

In [32]:
>>> dataSet = loadSimpDat(data)
>>> freqItems = fpGrowth(dataSet)
>>> freqItems

conditional tree for: {'I5'}
* Null Set   1
** I1   1
*** I2   1
** I2   1
*** I1   1
conditional tree for: {'I4'}
* Null Set   1
** I2   2
conditional tree for: {'I3'}
* Null Set   1
** I2   1
** I1   3
*** I2   2
conditional tree for: {'I2', 'I3'}
* Null Set   1
** I1   2
conditional tree for: {'I1'}
* Null Set   1
** I2   4


[{'I5'},
 {'I1', 'I5'},
 {'I2', 'I5'},
 {'I4'},
 {'I2', 'I4'},
 {'I3'},
 {'I2', 'I3'},
 {'I1', 'I2', 'I3'},
 {'I1', 'I3'},
 {'I1'},
 {'I1', 'I2'},
 {'I2'}]

#### Implementation end

------------