In [1]:
import json
import pandas as pd
import numpy as np
from mlxtend.preprocessing import TransactionEncoder

In [2]:
with open('/home/Data/locate/data_release/topology/sys_and_nodes.json') as f:
    sys_node = json.load(f)
with open('/home/Data/locate/data_release/topology/topology_edges_node.json') as f:
    node_node = json.load(f)
with open('/home/Data/locate/data_release/topology/topology_edges_sys.json') as f:
    sys_sys = json.load(f)

In [3]:
nodes = list(node_node.keys())
edges = []
for i in node_node:
    for j in node_node[i]:
        edges.append([i,j])

In [4]:
list_edge = []
for i in range(len(edges)):
    list_edge.append(edges[i])
    for j in range(len(edges)):
        if edges[i][0] == edges[j][1]:
            edge = [edges[j][0],edges[i][0],edges[i][1]]
            list_edge.append(edge)

In [5]:
# 获取候选1项集，dataSet为事务集。返回一个list，每个元素都是set集合
# C1 是大小为1的所有候选项集的集合
def createC1(dataSet):
    C1 = []   # 元素个数为1的项集（非频繁项集，因为还没有同最小支持度比较）
    for transaction in dataSet:
        for item in transaction:
            if not [item] in C1:
                C1.append([item])
    C1.sort()  # 这里排序是为了，生成新的候选集时可以直接认为两个n项候选集前面的部分相同
    # 因为除了候选1项集外其他的候选n项集都是以二维列表的形式存在，所以要将候选1项集的每一个元素都转化为一个单独的集合。
    return list(map(frozenset, C1))   #map(frozenset, C1)的语义是将C1由Python列表转换为不变集合（frozenset，Python中的数据结构）

In [6]:
# 找出候选集中的频繁项集
# dataSet为全部数据集，Ck为大小为k（包含k个元素）的候选项集，minSupport为设定的最小支持度
def scanD(dataSet, Ck, minSupport):
    ssCnt = {}   # 记录每个候选项的个数
    for tid in dataSet:
        for can in Ck:
            if can.issubset(tid):
                ssCnt[can] = ssCnt.get(can, 0) + 1   # 计算每一个项集出现的频率
    numItems = float(len(dataSet))
    retList = []
    supportData = {}
    for key in ssCnt:
        support = ssCnt[key] / numItems
        if support >= minSupport:
            retList.insert(0, key)  #将频繁项集插入返回列表的首部
        supportData[key] = support
    return retList, supportData   #retList为在Ck中找出的频繁项集（支持度大于minSupport的），supportData记录各频繁项集的支持度

In [7]:
#total apriori
def aprioriGen(Lk, k): #组合，向上合并
    #creates Ck 参数：频繁项集列表 Lk 与项集元素个数 k
    retList = []
    lenLk = len(Lk)
    for i in range(lenLk):
        for j in range(i+1, lenLk): #两两组合遍历
            L1 = list(Lk[i])[:k-2]; L2 = list(Lk[j])[:k-2]
            L1.sort(); L2.sort()
            if L1==L2: #若两个集合的前k-2个项相同时,则将两个集合合并
                retList.append(Lk[i] | Lk[j]) #set union
    return retList

In [8]:
#apriori
def apriori(dataSet, minSupport = 0.5):
    C1 = createC1(dataSet)
    D = list(map(set, dataSet)) #python3
    L1, supportData = scanD(D, C1, minSupport)#单项最小支持度判断 0.5，生成L1
    L = [L1]
    k = 2
    while (len(L[k-2]) > 0):#创建包含更大项集的更大列表,直到下一个大的项集为空
        Ck = aprioriGen(L[k-2], k)#Ck
        Lk, supK = scanD(D, Ck, minSupport)#get Lk
        supportData.update(supK)
        L.append(Lk)
        k += 1 #继续向上合并 生成项集个数更多的
    return L, supportData

In [9]:
#生成关联规则
# 创建关联规则
def generateRules(fileName, L, supportData, minConf=0.7):  # supportData是从scanD获得的字段
    bigRuleList = []
    for i in range(1, len(L)):  # 只获得又有2个或以上的项目的集合
        for freqSet in L[i]:
            H1 = [frozenset([item]) for item in freqSet]
            if (i > 1):
                rulesFromConseq(fileName, freqSet, H1, supportData, bigRuleList, minConf)
            else:
                calcConf(fileName, freqSet, H1, supportData, bigRuleList, minConf)
    return bigRuleList

In [10]:
# 实例数、支持度、置信度和提升度评估
def calcConf(fileName, freqSet, H, supportData, brl, minConf=0.7):
    prunedH = []
    D = fileName
    numItems = float(len(D))
    for conseq in H:
        conf = supportData[freqSet] / supportData[freqSet - conseq]  # 计算置信度
        if conf >= minConf:
            instances = numItems * supportData[freqSet]  # 计算实例数
            liftvalue = conf / supportData[conseq]  # 计算提升度
            brl.append((freqSet - conseq, conseq, int(instances), round(supportData[freqSet], 4), round(conf, 4),
                        round(liftvalue, 4)))  # 支持度已经在SCAND中计算得出
            prunedH.append(conseq)
    return prunedH

In [11]:
# 生成候选规则集
def rulesFromConseq(fileName, freqSet, H, supportData, brl, minConf=0.7):
    m = len(H[0])
    if (len(freqSet) > (m + 1)):
        Hmp1 = aprioriGen(H, m + 1)
        Hmp1 = calcConf(fileName, freqSet, Hmp1, supportData, brl, minConf)
        if (len(Hmp1) > 1):
            rulesFromConseq(fileName, freqSet, Hmp1, supportData, brl, minConf)

In [12]:

import pandas as pd
#from graphviz import Digraph

# 设置最小支持度阈值
minS = 0.00
# 设置最小置信度阈值
minC = 0.00

data = list_edge

# 计算符合最小支持度的规则
L, suppdata = apriori(data, minSupport=minS)

# 计算满足最小置信度规则
rules = generateRules(data, L, suppdata, minConf=minC)

### 关联结果评估###
model_summary = 'data record: {1} \nassociation rules count: {0}'  # 展示数据集记录数和满足阈值定义的规则数量
print (model_summary.format(len(rules), len(data)))  # 使用str.format做格式化输出
df =  pd.DataFrame(rules,  columns=['item1',  'itme2',  'instance',  'support', 'confidence', 'lift'])  # 创建频繁规则数据框
df_lift = df[df['lift'] > 0.1]  # 只选择提升度>1的规则
df_lift.sort_values('instance', ascending=False)

data record: 1936 
association rules count: 6368


Unnamed: 0,item1,itme2,instance,support,confidence,lift
1311,(node_34),(node_62),16,0.0083,0.2222,4.3022
1288,(node_81),(node_62),16,0.0083,0.2222,4.3022
1369,(node_62),(node_72),16,0.0083,0.1600,4.3022
1368,(node_72),(node_62),16,0.0083,0.2222,4.3022
1330,(node_72),(node_77),16,0.0083,0.2222,4.3022
...,...,...,...,...,...,...
3313,(node_18),"(node_84, node_21)",1,0.0005,0.0250,9.6800
3314,(node_18),"(node_55, node_84)",1,0.0005,0.0250,9.6800
3315,(node_84),"(node_55, node_18)",1,0.0005,0.0147,3.5588
3316,(node_55),"(node_84, node_18)",1,0.0005,0.0156,4.3214
