## 实验二：关联规则挖掘分析

&emsp;&emsp;关联规则主要用于寻找数据集中项集之间的关联关系。它揭示了数据项间的未知关系，基于样本的统计规律，进行关联规则挖掘。根据所挖掘的关联关系，可以从一个属性信息来推断另一个属性的信息。当置信度或提升度达到某一阈值时，则认为规则成立。  
&emsp;&emsp;关联规则挖掘主要分为两个部分：第一步是找出事务数据库中所有支持度大于等于事先设定的最小支持度的项集（即频繁项集）；第二步是利用频繁项集生成所需要的关联规则，根据事先设定的最小置信度进行取舍，最后得到强关联规则。

#### Apriori 算法的主要步骤如下：

(1)扫描全部数据，产生候选 1-项集的集合$C_{1}$。  
(2)根据最小支持度，由候选 1-项集的集合$C_{1}$产生频繁 1-项集的集合$L_{1}$。  
(3)令 k=1，重复执行步骤(4)-(6)。  
(4)由$L_{k}$执行连接和剪枝操作，产生候选（k+1）-项集的集合$C_{k+1}$。  
(5)根据最小支持度，由候选（k+1）-项集的集合$C_{k+1}$，产生频繁（k+1）-项集的集合$L_{k+1}$。  
(6)若 $L_{k+1} \neq \varnothing$，则 k=k+1，转到步骤(4)；否则，转到步骤(7)。  
(7)根据最小置信度，由频繁项集产生强关联规则，结束。


### 1. 应用 Python 语言编写程序，挖掘数据集“腹泻数据.xlsx”中所有频繁项集，支持度阈值为 0.1。


In [None]:
import pandas as pd

# Read the CSV with header=None so the first line is kept as data, not names.
trans = pd.read_csv("腹泻数据.csv", header=None, encoding="utf-8")
# NOTE(review): head() is not the last expression of the cell, so its result
# is presumably discarded here — confirm whether a preview was intended.
trans.head()
# Convert the table to a list of row lists; load_data() consumes it later.
trans_list = trans.values.tolist()


def load_data(x):
    """Keep only the string cells of every row.

    Args:
        x: Iterable of rows, each an iterable of cell values.

    Returns:
        A list with one list per input row, containing that row's cells whose
        exact type is ``str``, in their original order. Every other cell is
        dropped (presumably the NaN padding of short rows — confirm).
    """
    return [[cell for cell in row if type(cell) is str] for row in x]


def createC1(dataSet):
    """Build the candidate 1-itemsets C1 from the transactions.

    Args:
        dataSet: Iterable of transactions, each an iterable of hashable,
            mutually comparable items.

    Returns:
        A list of single-item frozensets, one per distinct item, ordered by
        the items' natural sort order.
    """
    # A set removes duplicates in one pass; the original scanned a growing
    # list for every item, which is quadratic in the number of distinct items.
    unique_items = {item for transaction in dataSet for item in transaction}
    return [frozenset([item]) for item in sorted(unique_items)]


def scanD(D, Ck, minSupport):
    """Count candidate itemsets in the transactions and filter by support.

    Args:
        D: List of transactions, each a set of items.
        Ck: List of candidate itemsets (frozensets).
        minSupport: Minimum fraction of transactions an itemset must occur in.

    Returns:
        A ``(frequent, supportData)`` tuple. ``supportData`` maps every
        candidate that occurred at least once to its support. ``frequent``
        lists the candidates whose support is >= ``minSupport``, in the
        reverse of the order in which they were first seen.
    """
    occurrences = {}
    for transaction in D:
        for candidate in Ck:
            if candidate.issubset(transaction):
                occurrences[candidate] = occurrences.get(candidate, 0) + 1

    total = float(len(D))
    supportData = {cand: hits / total for cand, hits in occurrences.items()}

    frequent = [cand for cand, sup in supportData.items() if sup >= minSupport]
    # Equivalent to inserting every hit at the front of the result list.
    frequent.reverse()
    return frequent, supportData


def aprioriGen(Lk, k):
    """Join frequent (k-1)-itemsets into candidate k-itemsets.

    Two itemsets are merged when their first ``k - 2`` items, taken in sorted
    order, are identical; this yields every candidate exactly once.

    Args:
        Lk: List of frequent itemsets (frozensets), all of size ``k - 1``.
        k: Size of the candidate itemsets to generate.

    Returns:
        A list of candidate frozensets of size ``k``.
    """
    prefix_len = k - 2
    # Sort BEFORE slicing. A frozenset has no defined iteration order, so
    # slicing first (as the original did) compared arbitrary items, which
    # could both miss candidates and emit the same candidate several times.
    prefixes = [sorted(itemset)[:prefix_len] for itemset in Lk]

    retList = []
    for i, left in enumerate(Lk):
        for j in range(i + 1, len(Lk)):
            if prefixes[i] == prefixes[j]:
                retList.append(left | Lk[j])
    return retList


def apriori(dataSet, minSupport=0.1):
    """Mine all frequent itemsets with the Apriori algorithm.

    Args:
        dataSet: List of transactions, each an iterable of hashable items.
        minSupport: Minimum support (fraction of transactions) an itemset
            needs to be considered frequent.

    Returns:
        A ``(L, supportData)`` tuple. ``L[i]`` is the list of frequent
        itemsets of size ``i + 1``; the last entry of ``L`` is always an empty
        list because the loop stops once a level has no frequent itemsets.
        ``supportData`` maps every counted candidate to its support.
    """
    C1 = createC1(dataSet)
    D = list(map(set, dataSet))
    # Use the caller's threshold for the first level too; it used to be
    # hard-coded to 0.1, so any other minSupport gave inconsistent levels.
    L1, supportData = scanD(D, C1, minSupport)
    L = [L1]
    k = 2
    while len(L[k - 2]) > 0:
        Ck = aprioriGen(L[k - 2], k)
        Lk, supK = scanD(D, Ck, minSupport)
        supportData.update(supK)
        L.append(Lk)
        k += 1
    return L, supportData


def generateRules(L, supportData, minConf=0.7):
    """Generate association rules from the frequent itemsets.

    Args:
        L: List of frequent-itemset lists as returned by ``apriori``;
            ``L[i]`` holds the itemsets of size ``i + 1``.
        supportData: Mapping from itemset (frozenset) to its support.
        minConf: Minimum confidence a rule needs to be kept.

    Returns:
        A list of ``(antecedent, consequent, confidence)`` tuples. The list is
        empty when ``L`` has no itemsets of size two or more.
    """
    bigRuleList = []
    # L[0] holds single items, which cannot be split into a rule; start at 1.
    for i in range(1, len(L)):
        for freqSet in L[i]:
            H1 = [frozenset([item]) for item in freqSet]
            if i > 1:
                # Itemsets with three or more items: test the single-item
                # consequents first (they were previously never evaluated),
                # then grow larger consequents from the ones that passed.
                # A larger consequent can never reach a higher confidence
                # than the smaller consequents it contains.
                kept = calcConf(freqSet, H1, supportData, bigRuleList, minConf)
                if len(kept) > 1:
                    rulesFromConseq(
                        freqSet, kept, supportData, bigRuleList, minConf
                    )
            else:
                calcConf(freqSet, H1, supportData, bigRuleList, minConf)
    # Return only after every itemset size has been processed; the return
    # used to sit inside the loop and stopped after the first level.
    return bigRuleList


def calcConf(freqSet, H, supportData, br1, minConf=0.7):
    """Evaluate candidate consequents of one frequent itemset.

    For each consequent in ``H`` the rule ``freqSet - consequent -->
    consequent`` is scored as ``support(freqSet) / support(antecedent)``.
    Rules reaching ``minConf`` are printed and appended to ``br1``.

    Args:
        freqSet: The frequent itemset (frozenset) the rules are built from.
        H: Iterable of candidate consequents (frozensets).
        supportData: Mapping from itemset to support.
        br1: List that receives ``(antecedent, consequent, confidence)``.
        minConf: Minimum confidence a rule needs to be kept.

    Returns:
        The consequents whose rule met ``minConf``, in input order.
    """
    surviving = []
    for consequent in H:
        antecedent = freqSet - consequent
        confidence = supportData[freqSet] / supportData[antecedent]
        if confidence >= minConf:
            print(antecedent, "-->", consequent, "conf:", confidence)
            rule = (antecedent, consequent, confidence)
            br1.append(rule)
            surviving.append(consequent)
    return surviving


def rulesFromConseq(freqSet, H, supportData, br1, minConf=0.7):
    """Grow rule consequents by one item and keep the rules that qualify.

    The consequents in ``H`` (all of the same size ``m``) are joined into
    candidates of size ``m + 1``; each candidate is scored by ``calcConf``.
    The search continues recursively with the surviving consequents.

    Args:
        freqSet: The frequent itemset (frozenset) the rules are built from.
        H: List of consequents (frozensets) of equal size.
        supportData: Mapping from itemset to support.
        br1: List that receives ``(antecedent, consequent, confidence)``.
        minConf: Minimum confidence a rule needs to be kept.

    Returns:
        None; qualifying rules are appended to ``br1``.
    """
    if not H:
        # Nothing to extend; also avoids indexing H[0] on an empty list.
        return
    m = len(H[0])
    # The antecedent must stay non-empty after the consequent grows to m + 1.
    if len(freqSet) > (m + 1):
        # Join consequents with aprioriGen (the candidate generator), not
        # apriori (the full miner), whose return value calcConf cannot use.
        Hmp1 = aprioriGen(H, m + 1)
        Hmp1 = calcConf(freqSet, Hmp1, supportData, br1, minConf)
        if len(Hmp1) > 1:
            # Recurse on the survivors; recursing on the unchanged H would
            # repeat the same step forever.
            rulesFromConseq(freqSet, Hmp1, supportData, br1, minConf)


if __name__ == "__main__":
    # Drop the non-string cells from every row before mining.
    dataSet = load_data(trans_list)
    # Mine the frequent itemsets with a support threshold of 0.1.
    L, supportData = apriori(dataSet, 0.1)
    # Generate rules with a confidence threshold of 0.7; calcConf prints each
    # rule it accepts.
    rules = generateRules(L, supportData, minConf=0.7)
    # print(rules)
    # Print the frequent itemsets, one inner list per itemset size.
    print(L)

### 2. 利用 efficient_apriori 库中的 apriori 函数，挖掘“腹泻数据.xlsx”的频繁项集和关联规则，支持度阈值 0.1，置信度阈值 0.7。


In [None]:
from efficient_apriori import apriori

# Re-read the CSV with header=None so the first line is kept as data.
trans = pd.read_csv("腹泻数据.csv", header=None, encoding="utf-8")
# NOTE(review): head() is not the last expression of the cell, so its result
# is presumably discarded here — confirm whether a preview was intended.
trans.head()
# Convert the table to a list of row lists.
trans_list = trans.values.tolist()


def load_data(x):
    """Strip every non-string cell from each row.

    Args:
        x: Iterable of rows, each an iterable of cell values.

    Returns:
        A list of lists: for each input row, the cells whose exact type is
        ``str``, in their original order.
    """
    cleaned = []
    for row in x:
        cleaned.append([value for value in row if type(value) is str])
    return cleaned


# Keep only the string cells of each row.
trans_final = load_data(trans_list)
# Last expression of the cell, so a notebook displays the cleaned rows.
trans_final

In [None]:
# NOTE(review): `apriori` here is the efficient_apriori import from the
# previous cell, which rebinds the hand-written apriori() defined earlier.
itemsets, rules = apriori(trans_final, min_support=0.1, min_confidence=0.7)
# itemsets is a dict keyed by itemset size; each value is itself a dict that
# maps a frequent itemset to its support count.
print(itemsets, "\n")
# rules is stored as a list.
print(rules)

### 3. 利用 mlxtend 库函数，挖掘“腹泻数据.xlsx”的频繁项集和关联规则，支持度阈值 0.1，置信度阈值 0.7。


In [None]:
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

In [None]:
# Wrap the cleaned transactions in a DataFrame.
# NOTE(review): df is not referenced by any of the visible cells — confirm it
# is still needed.
df = pd.DataFrame(trans_final)

In [None]:
# Encode the transaction lists as a table with one column per item
# (column labels come from te.columns_).
te = TransactionEncoder()
trans_new = te.fit_transform(trans_final)
trans_new = pd.DataFrame(trans_new, columns=te.columns_)
# Last expression of the cell, so a notebook displays the encoded table.
trans_new

In [None]:
# NOTE(review): `apriori` is now the mlxtend import, which rebinds the
# efficient_apriori function used in the earlier cell.
# use_colnames=True reports item names instead of column indices.
frequent_itemset = apriori(trans_new, min_support=0.1, use_colnames=True)
frequent_itemset

In [None]:
# Display the itemsets ordered by increasing support; the result is not
# assigned, so frequent_itemset itself keeps its original order.
frequent_itemset.sort_values(by="support", ascending=True)

In [None]:
# Derive rules from the frequent itemsets, filtering on the confidence metric
# with a threshold of 0.7.
rules_mlxtend = association_rules(
    frequent_itemset, metric="confidence", min_threshold=0.7
)
rules_mlxtend