In [6]:
import os,itertools
import numpy as np
import pandas as pd

class Apriori(object):
    """Mine frequent itemsets and association rules with the Apriori algorithm.

    Itemsets are keyed by the comma-joined, sorted names of their members
    (for example ``'a,b'``).  Items therefore have to be strings that do not
    themselves contain a comma.

    Attributes set by this class:
        item: sorted list of the distinct items found in ``itemSets``.
        data: DataFrame with one row per transaction and one column per item;
            a cell is 1 when the transaction contains the item, NaN otherwise.
        L1 / L1_support_selects: columns of ``data`` for the frequent single
            items, and a mapping of those items to their support.
        support_selects: support of every frequent itemset with two or more
            items.
        item_Conf: ``L1_support_selects`` merged with ``support_selects``.
        confidence_select: ``{antecedent: {consequent: {'Support': ...,
            'Confidence': ...}}}`` for the rules that reach ``minConf``.
        rules: DataFrame built by the most recent call to :meth:`show`.
        apriori_rules: copy of the full rule table taken by :meth:`update`.

    Args:
        itemSets: sequence of transactions, each an iterable of item names.
        minSupport: minimum support of a frequent itemset.
        minConf: minimum confidence of a rule.
        sort: when true, :meth:`show` always returns the rules sorted by
            support and confidence in descending order.
    """

    def __init__(self, itemSets, minSupport=0.5, minConf=0.7, sort = False):
        self.itemSets = itemSets
        self.minSupport = minSupport
        self.minConf = minConf
        self.sort = sort
        self.__Initialize()

    def __Initialize(self):
        """Build the item list and transaction matrix, then mine the rules."""
        self.__item()
        self.__creat_matrix()
        self.update(minSupport=self.minSupport, minConf=self.minConf)

    @staticmethod
    def __excel_path(file_name):
        """Return ``file_name`` with an ``.xlsx`` suffix appended if missing."""
        if file_name.endswith(".xlsx"):
            return f'{file_name}'
        return f'{file_name}.xlsx'

    def __item(self):
        """Collect the sorted list of distinct items."""
        # A set makes de-duplication linear instead of scanning a list
        # for every item of every transaction.
        self.item = sorted({item for itemSet in self.itemSets for item in itemSet})

    def __creat_matrix(self):
        """Convert the transactions into a 1/NaN membership DataFrame."""
        column_of = {item: position for position, item in enumerate(self.item)}
        # Fill a plain array and wrap it once; enlarging a DataFrame one row
        # at a time with .loc re-allocates the frame on every transaction.
        matrix = np.full((len(self.itemSets), len(self.item)), np.nan)
        for row, itemSet in enumerate(self.itemSets):
            for item in itemSet:
                matrix[row, column_of[item]] = 1
        self.data = pd.DataFrame(matrix, columns=self.item)

    def __candidate_itemsets_l1(self):
        """Select the frequent single items (L1) and record their support."""
        support = self.data.sum(axis=0) / len(self.itemSets)
        frequent = support >= self.minSupport
        self.L1 = self.data.loc[:, frequent]
        # Used only as denominators when computing confidence.
        self.L1_support_selects = dict(support[frequent])

    def __candidate_itemsets_lk(self):
        """Grow frequent itemsets level by level, starting from L1.

        Each frequent k-itemset is extended by one frequent item at a time;
        only extensions of frequent itemsets are considered because no
        superset of an infrequent itemset can be frequent.
        """
        total = len(self.itemSets)
        # Column order instead of set order keeps the result order
        # reproducible between runs.
        frequent_items = list(self.L1.columns)
        last_support_selects = dict(self.L1_support_selects)
        while last_support_selects:
            new_support_selects = {}
            evaluated = set()  # candidates already counted at this level
            for last_support_select in last_support_selects:
                members = last_support_select.split(',')
                member_set = set(members)
                for L1_support_name in frequent_items:
                    if L1_support_name in member_set:
                        continue
                    columns = sorted(members + [L1_support_name])
                    candidate = ','.join(columns)
                    # The same candidate is reachable from each of its
                    # subsets; count it only once.
                    if candidate in evaluated:
                        continue
                    evaluated.add(candidate)
                    # A row supports the candidate when all its columns are 1.
                    count = (self.L1.loc[:, columns].sum(axis=1) == len(columns)).sum()
                    support = count / total
                    if support >= self.minSupport:
                        new_support_selects[candidate] = support
            self.support_selects.update(new_support_selects)
            last_support_selects = new_support_selects  # next level's seeds

    def __support_selects(self):
        """Compute all frequent itemsets and the combined support lookup."""
        self.__candidate_itemsets_l1()
        self.__candidate_itemsets_lk()
        self.item_Conf = self.L1_support_selects.copy()
        self.item_Conf.update(self.support_selects)

    def __confidence_selects(self):
        """Generate the association rules that reach ``minConf``.

        For every frequent itemset each non-empty proper subset is tried as
        the recommended (consequent) part; the remaining items form the
        antecedent, and confidence = support(itemset) / support(antecedent).
        """
        for groups, Supp_groups in self.support_selects.items():
            groups_list = groups.split(',')
            members = set(groups_list)
            for recommend_len in range(1, len(groups_list)):
                for recommend in itertools.combinations(groups_list, recommend_len):
                    items = ','.join(sorted(members - set(recommend)))
                    Conf = Supp_groups / self.item_Conf[items]
                    if Conf >= self.minConf:
                        self.confidence_select.setdefault(items, {}).setdefault(
                            ','.join(recommend),
                            {'Support': Supp_groups, 'Confidence': Conf})

    def show(self,**kwargs):
        """Return a rule mapping as a DataFrame and store it in ``self.rules``.

        Keyword Args:
            data: mapping shaped like ``confidence_select``.  When omitted
                (or ``None``) ``self.confidence_select`` is used.  An empty
                mapping is honoured and yields an empty table.
            sort: when true, return the table sorted by support and
                confidence in descending order.

        Returns:
            DataFrame indexed by (``Items``, ``Recommend``) with the columns
            ``Support`` and ``Confidence``.
        """
        data = kwargs.get('data')
        # Test against None: an empty dict is a legitimate "no rules" input
        # and must not fall back to the full rule set.
        select = self.confidence_select if data is None else data
        items = []
        value = []
        for antecedent, consequents in select.items():
            for consequent, metrics in consequents.items():
                items.append((antecedent, consequent))
                value.append([metrics['Support'], metrics['Confidence']])
        index = pd.MultiIndex.from_tuples(items, names=['Items', 'Recommend'])
        self.rules = pd.DataFrame(value, index=index, columns=['Support', 'Confidence'], dtype=float)
        if self.sort or kwargs.get('sort'):
            return self.rules.sort_values(by=['Support', 'Confidence'], ascending=False)
        return self.rules.copy()

    def update(self, **kwargs):
        """Re-mine with new thresholds, print the rules and optionally save.

        Keyword Args:
            minSupport: new minimum support; frequent itemsets are recomputed.
            minConf: new minimum confidence.
            file_name: when given, the rule table is written to this Excel
                file (``.xlsx`` is appended if missing).

        The rules are regenerated whenever either threshold is supplied, so
        they never lag behind a changed ``minSupport``.  A threshold of 0 is
        accepted.
        """
        min_support = kwargs.get('minSupport')
        min_conf = kwargs.get('minConf')
        if min_support is not None:
            self.minSupport = min_support
            self.support_selects = {}  # frequent itemsets meeting minSupport
            self.__support_selects()
        if min_conf is not None:
            self.minConf = min_conf
        if min_support is not None or min_conf is not None:
            self.confidence_select = {}  # rules meeting minConf
            self.__confidence_selects()
        result = self.show()
        print(result)
        file_name = kwargs.get('file_name')
        if file_name:
            result.to_excel(self.__excel_path(file_name))
        self.apriori_rules = self.rules.copy()

    def __get_Recommend_list(self,itemSet):
        """Collect into ``recommend_selects`` the rules triggered by ``itemSet``.

        Only the given items that occur on their own as a rule antecedent
        are used as starting points.
        """
        self.recommend_selects = {}
        itemSet = set(itemSet) & set(self.apriori_rules.index.levels[0])
        # Sorted iteration keeps the order of the collected rules stable.
        for start_str in sorted(itemSet):
            start_list = start_str.split(',')
            for end_str in self.apriori_rules.loc[start_str].index:
                self.__creat_Recommend_list(start_list, end_str.split(','), itemSet)

    def __creat_Recommend_list(self,start_list,end_list,itemSet):
        """Follow one rule ``start_list -> end_list`` recursively.

        If every recommended item is already in ``itemSet`` the rule adds
        nothing new, so antecedent and consequent are merged and the rules of
        the merged antecedent are explored.  If none of the recommended items
        is in ``itemSet`` the rule is recorded.  Rules whose consequent only
        partly overlaps ``itemSet`` are skipped.
        """
        if set(end_list).issubset(itemSet):
            merged = sorted(start_list + end_list)
            start_str = ','.join(merged)
            if start_str in self.apriori_rules.index.levels[0]:
                for end_str in self.apriori_rules.loc[start_str].index:
                    self.__creat_Recommend_list(merged, end_str.split(','), itemSet)
        elif not set(end_list) & itemSet:
            start_str = ','.join(start_list)
            end_str = ','.join(end_list)
            self.recommend_selects.setdefault(start_str, {}).setdefault(
                end_str,
                {'Support': self.apriori_rules.loc[(start_str, end_str), 'Support'],
                 'Confidence': self.apriori_rules.loc[(start_str, end_str), 'Confidence']})

    def get_Recommend(self,itemSet,**kwargs):
        """Return weighted recommendations for the items in ``itemSet``.

        Every triggered rule contributes ``Support * Confidence`` scaled by
        ``mean(Support) / sum(Support)`` of its antecedent's rules and split
        evenly over the recommended items.  The summed weights are
        normalised to sum to 1 and passed through a logistic function.

        Args:
            itemSet: iterable of item names.

        Keyword Args:
            file_name: when given, the weights, the triggered rules and the
                complete rule tables are written to this Excel workbook
                (``.xlsx`` is appended if missing).

        Returns:
            Series named ``Weight`` indexed by ``Recommend``, sorted in
            descending order; empty when no rule is triggered.

        Note:
            ``self.rules`` is left holding the triggered rules unless
            ``file_name`` is given, in which case it ends up holding the
            complete rule table again.
        """
        self.recommend = {}
        self.__get_Recommend_list(itemSet)
        self.show(data = self.recommend_selects)
        for item_str in self.rules.index.levels[0]:
            group = self.rules.loc[item_str]  # rules sharing this antecedent
            mean_support = group['Support'].mean()
            total_support = group['Support'].sum()
            for recommends_str in group.index:
                recommends_list = recommends_str.split(',')
                weight = (group.at[recommends_str, 'Support']
                          * group.at[recommends_str, 'Confidence']
                          * mean_support / (total_support * len(recommends_list)))
                for recommend_str in recommends_list:
                    self.recommend[recommend_str] = self.recommend.get(recommend_str, 0) + weight
        # An explicit dtype keeps the arithmetic below valid for an empty result.
        result = pd.Series(self.recommend, name='Weight', dtype=float).sort_values(ascending=False)
        result.index.name = 'Recommend'
        result = result/result.sum()
        result = 1/(1+np.exp(-result))
        print(result)
        file_name = kwargs.get('file_name')
        if file_name:
            # The context manager saves and closes the workbook;
            # ExcelWriter.save() is no longer available in current pandas.
            with pd.ExcelWriter(self.__excel_path(file_name)) as excel_writer:
                result.to_excel(excel_writer, sheet_name='推荐项目及权重')
                self.rules.to_excel(excel_writer, sheet_name='关联规则树状表')
                self.show().to_excel(excel_writer, sheet_name='总关联规则树状表')
                self.show(sort = True).to_excel(excel_writer, sheet_name='总关联规则排序表')
        return result

def str2itemsets(strings, split=','):
    """Split each string on ``split`` and return the sorted parts per string.

    Args:
        strings: iterable of delimiter-separated strings.
        split: delimiter passed to ``str.split``.

    Returns:
        A list holding, for every input string, the sorted list of its parts.
    """
    return [sorted(text.split(split)) for text in strings]

if __name__ == '__main__':
    # 1. Load the data and drop duplicated rows: buying the same product
    #    several times on one order is not relevant for association rules.
    data = pd.read_excel('/home/weijunfei/算法/Apriori/apriori算法实现.xlsx' ).drop_duplicates()

    # 2. Group by sales order and keep the product codes of every order
    #    with at least two rows; single-product orders carry no association.
    groups = data.groupby(by='销售单明细')
    itemSets = [
        order['商品编码'].tolist()
        for _, order in groups
        if len(order) >= 2
    ]

    # 3. Train Apriori and ask for recommendations for one basket.
    ap = Apriori(itemSets, minSupport=0.03, minConf=0.5)
    ap.get_Recommend('2BYP206,2BYW001-,2BYW013,2BYX029'.split(','))

                                                   Support  Confidence
Items                            Recommend                            
2BPM007                          1ADS002          0.047619    0.666667
                                 2BJH005          0.047619    0.666667
                                 2BPY448          0.047619    0.666667
                                 2BJH005,2BPY448  0.047619    0.666667
1ADS002                          2BPM007          0.047619    1.000000
...                                                    ...         ...
2BYP206,2BYW001-,2BYW013,2BYX029 2BYD011          0.047619    1.000000
2BYD011,2BYW001-,2BYW013,2BYX029 2BYP206          0.047619    1.000000
2BYD011,2BYP206,2BYW013,2BYX029  2BYW001-         0.047619    1.000000
2BYD011,2BYP206,2BYW001-,2BYX029 2BYW013          0.047619    1.000000
2BYD011,2BYP206,2BYW001-,2BYW013 2BYX029          0.047619    1.000000

[467 rows x 2 columns]
Recommend
2BYD011    0.724888
2BYY023    0.502948
2BY

In [None]:
class apriori_algorithm:
    """Apriori frequent-itemset mining and rule generation on frozensets."""

    def __init__(self, minSupport, dataSet):
        """Store the minimum support and the data set."""
        self.minSupport = minSupport  # minimum support
        self.dataSet = dataSet  # data set

    def loaddata(self):
        """Return the module-level ``edges`` object (defined outside this class)."""
        return edges

    def generateC1(self, dataSet):
        """Return the sorted list of single-item candidates.

        Args:
            dataSet: iterable of transactions, each an iterable of items.

        Returns:
            ``[[item], ...]`` with one entry per distinct item, sorted.
        """
        # A set de-duplicates in linear time instead of scanning the
        # candidate list for every item.
        distinct = {item for data in dataSet for item in data}
        return [[item] for item in sorted(distinct)]

    def generateLk_by_Ck(self, dataSet, Ck, minSupport, support_data):
        """Keep the candidates of ``Ck`` that reach ``minSupport``.

        Args:
            dataSet: iterable of transactions.
            Ck: iterable of candidate itemsets.
            minSupport: minimum support.
            support_data: dict updated in place; maps every frequent itemset
                (as a frozenset) to its support.

        Returns:
            List of the frequent itemsets as frozensets, in reverse order of
            their first occurrence while scanning the transactions.
        """
        # Materialise both inputs once so they can be walked repeatedly and
        # the transaction count does not need a second pass over dataSet.
        transactions = [set(record) for record in dataSet]
        candidates = [frozenset(candidate) for candidate in Ck]
        countData = {}
        for record in transactions:
            for candidate in candidates:
                if candidate.issubset(record):  # subset test, not membership
                    countData[candidate] = countData.get(candidate, 0) + 1

        numItems = float(len(transactions))
        returnList = []
        for key, count in countData.items():
            support = count / numItems
            if support >= minSupport:
                returnList.append(key)
                support_data[key] = support
        # Same order as repeatedly inserting at the front, without the
        # quadratic cost of list.insert(0, ...).
        returnList.reverse()
        return returnList

    def generate_L(self, dataSet, k, min_support):
        """Generate all frequent itemsets with up to ``k`` items.

        Args:
            dataSet: iterable of transactions (iterated once per level).
            k: maximum number of items in a frequent itemset.
            min_support: minimum support.

        Returns:
            ``(L, support_data)`` where ``L[i]`` is the list of frequent
            itemsets with ``i + 1`` items (possibly empty) and
            ``support_data`` maps each frequent itemset to its support.
        """
        support_data = {}
        C1 = self.generateC1(dataSet)
        Lksub1 = self.generateLk_by_Ck(dataSet, C1, min_support, support_data)

        L = [Lksub1]
        for i in range(2, k + 1):
            Ci = self.generateCK(Lksub1, i)
            Lksub1 = self.generateLk_by_Ck(dataSet, Ci, min_support, support_data)
            L.append(Lksub1)
        return L, support_data

    def generateCK(self, Lk, k):
        """Build the candidate ``k``-itemsets from the frequent ``(k-1)``-itemsets.

        Two itemsets are joined when their sorted items agree on the first
        ``k - 2`` positions; the union is kept only if :meth:`isCk` accepts it.

        Args:
            Lk: iterable of frequent itemsets (frozensets).
            k: number of items in the candidates to produce.

        Returns:
            Set of candidate frozensets.
        """
        list_Lk = list(Lk)
        known = set(list_Lk)  # O(1) membership for the subset check
        # Sort each itemset once instead of inside the pair loop.
        prefixes = [sorted(itemset)[:k - 2] for itemset in list_Lk]
        Ck = set()
        for i in range(len(list_Lk)):
            # Start after i: a self-pair can never pass isCk and the
            # mirrored pair yields the same union.
            for j in range(i + 1, len(list_Lk)):
                if prefixes[i] == prefixes[j]:
                    Ck_item = list_Lk[i] | list_Lk[j]
                    if self.isCk(Ck_item, known):
                        Ck.add(Ck_item)
        return Ck

    def isCk(self, Ck_item, list_Lk):
        """Return True if every subset of ``Ck_item`` lacking one item is in ``list_Lk``."""
        return all(Ck_item - frozenset([item]) in list_Lk for item in Ck_item)

    def generate_big_rules(self, L, support_data, min_conf):
        """Generate and print the rules whose confidence reaches ``min_conf``.

        Args:
            L: list of lists of frequent itemsets, as returned by
                :meth:`generate_L`.
            support_data: support of every frequent itemset.
            min_conf: minimum confidence.

        Returns:
            List of ``(antecedent, consequent, confidence)`` tuples.
        """
        big_rule_list = []
        seen_rules = set()  # O(1) duplicate check alongside the ordered list
        sub_set_list = []   # frequent itemsets visited so far
        for level in L:
            for freq_set in level:
                for sub_set in sub_set_list:
                    if sub_set.issubset(freq_set):
                        antecedent = freq_set - sub_set
                        conf = support_data[freq_set] / support_data[antecedent]
                        big_rule = (antecedent, sub_set, conf)

                        if conf >= min_conf and big_rule not in seen_rules:
                            print(antecedent, " => ", sub_set, "conf: ", conf)
                            seen_rules.add(big_rule)
                            big_rule_list.append(big_rule)
                sub_set_list.append(freq_set)
        return big_rule_list
 
 
if __name__ == '__main__':
    # Mine itemsets of up to four items from `edges` (defined elsewhere)
    # with a very low support threshold.
    minS = 0.0001
    dataSet = edges
    apriori = apriori_algorithm(dataSet=dataSet, minSupport=minS)

    L, support_data = apriori.generate_L(dataSet, k=4, min_support=minS)

    print(support_data)
    # A minimum confidence of 0 keeps every rule.
    big_rule_list = apriori.generate_big_rules(L, support_data, min_conf=0)