In [254]:
import pandas as pd
import itertools

In [88]:
data_path = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx'
df = pd.read_excel(data_path)

In [89]:
df

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680.0,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France


In [90]:
df = df.replace('BANK CHARGES', 'BANK')
df = df[df['Country'] == 'France']
df['StockCode'] = df['StockCode'].astype(str)
product_code = df['StockCode'].unique()
product_name = df['Description'].unique()
product = dict(zip(product_code, product_name))

multiple_sets = df.groupby('InvoiceNo')['StockCode'].agg(' '.join).reset_index()
multiple_sets = multiple_sets.rename(columns={'InvoiceNo': 'Номер транзакции T', 'StockCode': 'Набор объектов'})
multiple_sets

Unnamed: 0,Номер транзакции T,Набор объектов
0,536370,22728 22727 22726 21724 21883 10002 21791 2103...
1,536852,22549 22544 22539 22661 21791 21786 POST
2,536974,15056BL 15056P 20679 21915 22659 22352 22892 2...
3,537065,22837 22846 22892 22968 84678 20665 21786 2178...
4,537463,22961 21224 22326 21124 21121 21137 22041 2203...
...,...,...
456,C579532,POST 22890
457,C579562,23084 21731
458,C580161,POST
459,C580263,M M 70007 85175 84821 84819 84817 72586 22232 ...


In [91]:
object = multiple_sets['Набор объектов']
object_list = [line.split() for line in object]

In [193]:
def generate_candidates(frequent_items, lvl):
    candidates = set()
    for i in range(len(frequent_items)):
        for j in range(i + 1, len(frequent_items)):
            union = frequent_items[i][0].union(frequent_items[j][0])

            if len(union) == lvl:
                candidates.add(union)

    return candidates


def apriori(transactions, min_supp):
    items = {}
    for transaction in transactions:
        for item in transaction:
            if item in items:
                items[item] += 1
            else:
                items[item] = 1

    items = {k: v for k, v in items.items() if v >= min_supp}

    frequent_items = [(frozenset([item]), freq)
                      for item, freq in items.items()]

    candidates = set(items.keys())
    cur_lvl = 2

    while (True):
        new_items = {}
        candidates = generate_candidates(frequent_items, cur_lvl)

        for transaction in transactions:
            for candidate in candidates:
                if candidate.issubset(transaction):
                    if candidate in new_items:
                        new_items[candidate] += 1
                    else:
                        new_items[candidate] = 1

        new_frequent_items = [(item, support) for item,
        support in new_items.items() if support >= min_supp]

        frequent_items.extend(new_frequent_items)

        cur_lvl += 1

        if not new_frequent_items:
            return frequent_items

In [266]:
frequent_items_1 = apriori(object_list, 20)
frequent_items_1

[(frozenset({'22728'}), 40),
 (frozenset({'22727'}), 37),
 (frozenset({'22726'}), 39),
 (frozenset({'22326'}), 65),
 (frozenset({'22629'}), 50),
 (frozenset({'22659'}), 27),
 (frozenset({'22631'}), 27),
 (frozenset({'22661'}), 26),
 (frozenset({'21731'}), 72),
 (frozenset({'22492'}), 41),
 (frozenset({'POST'}), 311),
 (frozenset({'21915'}), 27),
 (frozenset({'22352'}), 57),
 (frozenset({'22027'}), 37),
 (frozenset({'20749'}), 27),
 (frozenset({'20750'}), 55),
 (frozenset({'22748'}), 24),
 (frozenset({'20725'}), 61),
 (frozenset({'20726'}), 47),
 (frozenset({'21121'}), 30),
 (frozenset({'47566'}), 22),
 (frozenset({'22467'}), 23),
 (frozenset({'21559'}), 48),
 (frozenset({'22423'}), 54),
 (frozenset({'22634'}), 21),
 (frozenset({'22554'}), 68),
 (frozenset({'22551'}), 55),
 (frozenset({'22961'}), 21),
 (frozenset({'20719'}), 29),
 (frozenset({'21212'}), 40),
 (frozenset({'20682'}), 28),
 (frozenset({'22908'}), 20),
 (frozenset({'22895'}), 28),
 (frozenset({'22383'}), 26),
 (frozenset({'

In [250]:
class FPNode:
    def __init__(self, item, count, parent):
        self.item = item
        self.count = count
        self.parent = parent
        self.children = {}

    def increment(self, count):
        self.count += count


def find_frequent_items(node, minsupp):
    frequent_items = []
    if node.item is not None and node.count >= minsupp:
        frequent_items.append((frozenset({node.item}), node.count))
    for child in node.children.values():
        child_items = find_frequent_items(child, minsupp)
        for itemset, count in child_items:
            new_itemset = frozenset({node.item}) | itemset
            new_itemset = frozenset(
                filter(lambda x: x is not None, new_itemset))
            frequent_items.append((new_itemset, count))

    return frequent_items


def fpgrowth(transactions, minsupp):
    items_counts = {}
    for transaction in transactions:
        for item in transaction:
            if item in items_counts:
                items_counts[item] += 1
            else:
                items_counts[item] = 1

    items_counts = {k: v for k, v in items_counts.items() if v >= minsupp
                    }
    frequent_items = set(items_counts.keys())

    root = FPNode(None, 0, None)
    for transaction in transactions:
        sorted_items = [item for item in transaction if item in frequent_items]
        sorted_items.sort(key=lambda item: items_counts[item], reverse=True)
        current_node = root
        for item in sorted_items:
            if item not in current_node.children:
                new_node = FPNode(item, 1, current_node)
                current_node.children[item] = new_node
                current_node = new_node
            else:
                current_node.children[item].increment(1)
                current_node = current_node.children[item]

    return find_frequent_items(root, minsupp)

In [269]:
frequent_items_2 = fpgrowth(object_list, 10)
frequent_items_2

[(frozenset({'POST'}), 311),
 (frozenset({'21731', 'POST'}), 46),
 (frozenset({'21731', '22554', 'POST'}), 10),
 (frozenset({'21731', '22556', 'POST'}), 13),
 (frozenset({'20725', 'POST'}), 16),
 (frozenset({'22554', 'POST'}), 18),
 (frozenset({'22326', 'POST'}), 21),
 (frozenset({'22556', 'POST'}), 18),
 (frozenset({'22423', 'POST'}), 14),
 (frozenset({'21086', 'POST'}), 20),
 (frozenset({'21080', '21086', 'POST'}), 14),
 (frozenset({'21080', '21086', '21094', 'POST'}), 14),
 (frozenset({'23084', 'POST'}), 65),
 (frozenset({'21731', '23084', 'POST'}), 16),
 (frozenset({'23084'}), 10)]

In [271]:
def generate_rules(frequent_items, transactions, min_conf):
    rules = []
    for itemset, supp in frequent_items:
        for i in range(1, len(itemset)):
            subsets = itertools.combinations(itemset, i)
            for subset in subsets:
                left_side = frozenset(subset)
                right_side = itemset - left_side

                left_sup = sum(1 for transaction in transactions if left_side.issubset(transaction))
                conf = supp / (left_sup or 1)

                if conf >= min_conf:
                    rules.append((left_side, right_side, supp, conf))

    return rules

In [273]:
generate_rules(frequent_items_1, object_list, 0.5)
generate_rules(frequent_items_2, object_list, 0.5)

[(frozenset({'21731'}), frozenset({'POST'}), 46, 0.6388888888888888),
 (frozenset({'21731', '22556'}), frozenset({'POST'}), 13, 0.65),
 (frozenset({'23084'}), frozenset({'POST'}), 65, 0.8666666666666667),
 (frozenset({'21731', '23084'}), frozenset({'POST'}), 16, 0.7272727272727273)]