In [1]:
import numpy as np
import pandas as pd

In [274]:
def read_data(filename, max_rows=1000000):
    """Yield one basket (a set of product ids) per order found in a CSV file.

    The first CSV column is treated as the order id and the second as the
    product id; any further columns are ignored.

    :param filename: path of the CSV file, handed to ``pandas.read_csv``.
    :param max_rows: only order ids that occur within the first ``max_rows``
        rows are emitted (``None`` means every row).  Defaults to the value
        that used to be hard-coded.  A basket still collects that order's
        products from the *whole* file, so an order that straddles the
        cut-off is not truncated.
    :return: generator of ``set`` objects, in ascending order-id order.
    """
    df = pd.read_csv(filename)
    data = np.array(df)[:, 0:2]
    # np.unique returns the ids sorted and de-duplicated.
    order_ids = np.unique(data[:max_rows, 0]).tolist()
    # Group the rows in a single pass.  The previous implementation built a
    # boolean mask over the full array once per order id, which is quadratic
    # in practice (~95k orders x ~1.4M rows).
    baskets = {order_id: set() for order_id in order_ids}
    for order_id, product_id in data.tolist():
        basket = baskets.get(order_id)
        if basket is not None:  # ids first seen after the cut-off are skipped
            basket.add(product_id)
    for order_id in order_ids:
        yield baskets[order_id]
    
# Materialise the generator so the baskets can be counted and iterated repeatedly.
orders = [basket for basket in read_data('./data/order_products__train.csv')]
print(len(orders))

94954


In [11]:
# Data ->                 [{x}, {y}, ...]
# C1 -> L1 ->             [{1}, {1}, ...]
# C2 -> L2 ->             [{2}, {2}, ...]
# C3 -> L3 -> ...         [{3}, {3}, ...]
# C: candidate itemsets, L: frequent itemsets

# [{1}, {2}, {3}] -> [{1, 2}, {2, 3}, {1, 3}]
# [{1, 2}, {1, 3}, {2, 3}, {1, 4}] -> [{1, 2, 3}] 
# Apriori property: if an itemset is infrequent, then all of its supersets are infrequent too

In [275]:
def calcC1(orders):
    """
    Build the level-1 candidates: one single-item frozenset per distinct item.

    :param orders: iterable of baskets (each an iterable of items)
    :return: C1 [{a}, {b}, ...] as frozensets, in ascending item order
    """
    every_item = [product for basket in orders for product in basket]
    # np.unique already returns the distinct items sorted.
    distinct_items = np.unique(every_item).tolist()
    # frozenset (not set) so the candidates are hashable and can be dict keys.
    return [frozenset([product]) for product in distinct_items]

def calcLn(orders, Ck, min_support):
    """
    Count the support of every candidate and keep the frequent ones.

    support(X) = #(orders containing X) / #(orders)

    :param orders: sized collection of baskets (sets of items)
    :param Ck: iterable of candidate itemsets (hashable, e.g. frozenset)
    :param min_support: minimum support a candidate needs to be kept
    :return: (Ln, supports) where Ln lists the candidates whose support is
        >= min_support (pruned, in candidate order) and supports maps *every*
        candidate, frequent or not, to its support
    """
    cnts = {item: 0 for item in Ck}
    for order in orders:
        # Iterate the de-duplicated keys, not Ck itself: a candidate that
        # appears twice in Ck must not be counted twice per order.
        for item in cnts:
            if item.issubset(order):  # candidate found in this order
                cnts[item] += 1
    orders_size = len(orders)
    Ln = []  # candidates that satisfy min_support (Apriori filter)
    supports = {}  # support of every candidate
    for item, cnt in cnts.items():
        # With no orders at all nothing can be supported; avoid dividing by 0.
        support = cnt / orders_size if orders_size else 0.0
        if support >= min_support:  # prune
            Ln.append(item)
        supports[item] = support
    return Ln, supports

def calcCn(Ln_1):
    """
    Generate the size-n candidates from the frequent (n-1)-itemsets.

    Join step: union every pair of (n-1)-itemsets and keep unions of size n.
    Prune step: keep a candidate only if *all* of its (n-1)-subsets are in
    Ln_1, e.g. [{1, 2}, {1, 3}] yields nothing, while
    [{1, 2}, {1, 3}, {2, 3}] yields [{1, 2, 3}].

    :param Ln_1: list of frozensets, all of the same size n-1
    :return: Cn, list of distinct frozensets of size n (empty if Ln_1 is empty)
    """
    if len(Ln_1) == 0:
        return []
    n = len(Ln_1[0]) + 1
    Cn = []
    for i, L_l in enumerate(Ln_1):
        for L_r in Ln_1[i + 1:]:
            new_item = L_l | L_r
            if len(new_item) == n:  # the pair differs in exactly one element
                Cn.append(new_item)
    Cn = list(set(Cn))  # the same union can come from several pairs
    # Hash lookup for the prune step; testing membership against the Ln_1
    # list was a linear scan for every subset of every candidate.
    known = set(map(frozenset, Ln_1))
    true_Cn = [
        item for item in Cn
        if all(item - {el} in known for el in item)
    ]
    # true_Cn is already duplicate-free; the set round-trip is kept only so
    # the returned ordering stays the same as before.
    return list(set(true_Cn))

def calcFrequentItemset(orders, min_support):
    """
    Run the level-wise Apriori search.

    :param orders: sized collection of baskets (sets of items)
    :param min_support: minimum support for an itemset to count as frequent
    :return: (levels, supports) where levels[k] lists the frequent
        (k+1)-itemsets and supports maps every candidate that was counted
        (frequent or not) to its support
    """
    candidates = calcC1(orders)  # [{a}, {b}, ...]
    frequent, supports = calcLn(orders, candidates, min_support)
    levels = [frequent]
    # Grow one level at a time until a level yields no frequent itemset.
    while len(levels[-1]) != 0:
        candidates = calcCn(levels[-1])
        frequent, level_supports = calcLn(orders, candidates, min_support)
        supports.update(level_supports)
        if len(frequent) == 0:
            break
        levels.append(frequent)
    return levels, supports

In [269]:
def calcH1(frequentItem, supports, min_confidence):
    """
    Build the rules of one frequent itemset that have a single-item consequent.

    For every element e the rule (frequentItem - {e}) -> {e} is scored with
    confidence = supports[frequentItem] / supports[frequentItem - {e}].

    :param frequentItem: the frequent itemset to split into rules
    :param supports: dict mapping frozenset -> support
    :param min_confidence: minimum confidence for a rule to be kept
    :return: (antecedents, consequents, confidences); the two lists are
        aligned rule by rule.  NOTE: confidences is keyed by the consequent
        alone, so rules that share a consequent share one entry.
    """
    whole = frozenset(frequentItem)
    antecedents = []
    consequents = []
    confidences = {}
    for element in frequentItem:
        consequent = frozenset({element})
        antecedent = whole - consequent
        confidence = supports[whole] / supports[antecedent]
        if confidence >= min_confidence:
            antecedents.append(antecedent)
            consequents.append(consequent)
            confidences[consequent] = confidence
    return antecedents, consequents, confidences

def calcHn(Hn_1, frequentItem, supports, min_confidence):
    """
    Grow the consequents of one frequent itemset by one element and score them.

    The kept consequents of the previous round are merged with calcCn, e.g.
    [{1}, {2}, {3}] -> [{1, 2}, {1, 3}, {2, 3}]; each merged consequent c
    gives the rule (frequentItem - c) -> c with
    confidence = supports[frequentItem] / supports[frequentItem - c].

    :param Hn_1: consequents kept in the previous round (list of frozensets)
    :param frequentItem: the frequent itemset the rules are derived from
    :param supports: dict mapping frozenset -> support
    :param min_confidence: minimum confidence for a rule to be kept
    :return: (antecedents, consequents, confidences); the two lists are
        aligned rule by rule and confidences is keyed by the consequent alone
    """
    antecedents = []
    consequents = []
    confidences = {}
    for consequent in calcCn(Hn_1):
        antecedent = frequentItem - consequent
        confidence = supports[frequentItem] / supports[antecedent]
        if confidence >= min_confidence:
            antecedents.append(antecedent)
            consequents.append(consequent)
            confidences[consequent] = confidence
    return antecedents, consequents, confidences

def genRule(frequentItems, supports, min_confidence):
    """
    Derive association rules from the frequent itemsets of every level.

    confidence(x -> y) = #(x | y) / #(x) = support(x | y) / support(x)

    :param frequentItems: frequent itemsets grouped by size,
        [[{a}, {b}, ...], [{a, b}, ...], ...]
    :param supports: dict mapping frozenset -> support
    :param min_confidence: minimum confidence for a rule to be kept
    :return: (antecedents, consequents, confidences); the first two are lists
        of per-round lists aligned with each other, the last is a dict keyed
        by the consequent alone
    """
    all_antecedents = []
    all_consequents = []
    all_confidences = {}
    for level in frequentItems:
        # A rule needs at least two items, so empty and size-1 levels are skipped.
        if len(level) == 0 or len(level[0]) <= 1:
            continue
        for itemset in level:
            # Round 1: single-item consequents.
            lhs, rhs, scores = calcH1(itemset, supports, min_confidence)
            all_antecedents.append(lhs)
            all_consequents.append(rhs)
            all_confidences.update(scores)
            # Later rounds: widen the consequents until only one item would
            # be left on the antecedent side, or nothing survives.
            while len(rhs) != 0 and len(rhs[0]) != len(itemset) - 1:
                lhs, rhs, scores = calcHn(rhs, itemset, supports, min_confidence)
                if len(rhs) != 0:
                    all_antecedents.append(lhs)
                    all_consequents.append(rhs)
                    all_confidences.update(scores)
    return all_antecedents, all_consequents, all_confidences

In [276]:
import datetime
from pprint import pprint

# Timestamps before and after the run show how long the mining takes.
print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

# Small sanity-check input that can be used instead of `orders`:
# tmp = list(map(frozenset, [[1,3,4],[2,3,5],[1,2,3,5],[2,5]]))
frequentItems, Sn = calcFrequentItemset(orders, 0.02)
antecedents, consequents, Cn = genRule(frequentItems, Sn, 0.1)

print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

# Flatten the per-level lists before reporting.
frequentItems = [x for y in frequentItems for x in y]
print('\nFrequent Itemsets: ', len(frequentItems))
pprint({L: Sn[L] for L in Sn if L in frequentItems})
antecedents = [x for y in antecedents for x in y]
consequents = [x for y in consequents for x in y]
print('\nAssociation Rule: ', len(consequents))
for f, h in zip(antecedents, consequents):
    # Recompute the confidence of this exact rule from the supports.  Cn is
    # keyed by the consequent only, so rules from different itemsets that
    # share a consequent overwrite each other there and Cn[h] can report the
    # confidence of a different rule.
    print(f, '->', h, ':', Sn[f | h] / Sn[f])

2019-11-26 17:23:15
2019-11-26 17:30:49

Frequent Itemsets:  40
{frozenset({4605}): 0.02890873475577648,
 frozenset({4920}): 0.030720138172167576,
 frozenset({5450}): 0.024043220928028308,
 frozenset({5876}): 0.02636013227457506,
 frozenset({8424}): 0.022579354213619226,
 frozenset({8518}): 0.029182551551277462,
 frozenset({13176}): 0.11680392611159088,
 frozenset({16797}): 0.04938180592708048,
 frozenset({19057}): 0.02216862902036776,
 frozenset({21137}): 0.08249257535227583,
 frozenset({21616}): 0.022463508646291888,
 frozenset({21903}): 0.07465720243486319,
 frozenset({22935}): 0.032815889799271226,
 frozenset({24184}): 0.02205278345304042,
 frozenset({24852}): 0.14186869431514207,
 frozenset({24964}): 0.031562651389093664,
 frozenset({26209}): 0.04639088400699286,
 frozenset({26604}): 0.02148408703161531,
 frozenset({27104}): 0.02011500305411041,
 frozenset({27845}): 0.03728120984897951,
 frozenset({27966}): 0.04217831792236241,
 frozenset({28204}): 0.024948922636223857,
 frozenset