In [1]:
import numpy as np
import pandas as pd
from itertools import chain, combinations
from tqdm import tqdm

In [2]:
def association_rules(df, metric="confidence", min_threshold=0.8):
    """
    A lighter version of mlxtend/association_rules.

    Splits every frequent itemset into all antecedent -> consequent pairs and
    keeps the rules whose ``metric`` score reaches ``min_threshold``.

    Parameters
    ----------
    df : pd.DataFrame
        Frequent itemsets with an ``itemsets`` column (iterables of hashable
        items) and a ``support`` column.
    metric : str
        Key of ``metric_dict`` used to filter the rules: "antecedent support",
        "consequent support", "support", "confidence" or "lift".
    min_threshold : float
        Minimum score (inclusive) a rule needs in order to be kept.

    Returns
    -------
    pd.DataFrame
        One row per rule: ``antecedents`` and ``consequents`` (frozensets)
        followed by the metric columns listed in ``columns_ordered``. The
        frame is empty, with the same columns, when no rule qualifies or when
        ``df`` has no rows.

    Raises
    ------
    KeyError
        If ``metric`` is not a key of ``metric_dict`` (raised when the first
        candidate rule is scored), or if the antecedent/consequent of a rule
        is not itself listed in ``df`` so its support cannot be looked up.
    """
    metric_dict = {
        "antecedent support": lambda _, sA, __: sA,
        "consequent support": lambda _, __, sC: sC,
        "support": lambda sAC, _, __: sAC,
        "confidence": lambda sAC, sA, _: sAC/sA,
        "lift": lambda sAC, sA, sC: metric_dict["confidence"](sAC, sA, sC)/sC,
        }

    columns_ordered = ["antecedent support", "consequent support",
                       "support",
                       "confidence", "lift",
                       ]

    # get dict of {frequent itemset} -> support
    # A plain comprehension is used instead of np.vectorize because vectorize
    # raises ValueError on a size-0 input, which made an empty ``df`` crash.
    frequent_items_dict = {
        frozenset(itemset): support
        for itemset, support in zip(df['itemsets'].values, df['support'].values)
    }

    # prepare buckets to collect frequent rules
    rule_antecedents = []
    rule_consequents = []
    rule_supports = []

    # iterate over all frequent itemsets
    for k, sAC in frequent_items_dict.items():
        # to find all possible combinations
        for idx in range(len(k)-1, 0, -1):
            # of antecedent and consequent
            for c in combinations(k, r=idx):
                antecedent = frozenset(c)
                consequent = k.difference(antecedent)
                sA = frequent_items_dict[antecedent]
                sC = frequent_items_dict[consequent]

                score = metric_dict[metric](sAC, sA, sC)
                if score >= min_threshold:
                    rule_antecedents.append(antecedent)
                    rule_consequents.append(consequent)
                    rule_supports.append([sAC, sA, sC])

    # check if frequent rule was generated
    if not rule_supports:
        return pd.DataFrame(
            columns=["antecedents", "consequents"] + columns_ordered)

    # generate metrics: one row each for sAC, sA and sC after the transpose
    rule_supports = np.array(rule_supports).T.astype(float)
    df_res = pd.DataFrame(
        data=list(zip(rule_antecedents, rule_consequents)),
        columns=["antecedents", "consequents"])

    sAC = rule_supports[0]
    sA = rule_supports[1]
    sC = rule_supports[2]
    for m in columns_ordered:
        df_res[m] = metric_dict[m](sAC, sA, sC)
    return df_res

In [3]:
def subsets_by_len(dataframe: pd.DataFrame):
    """Group every distinct subset of every transaction by its size.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Must expose an ``item`` column; each value is an iterable of hashable
        items and is expanded with ``powerset`` (which omits the empty subset).

    Returns
    -------
    dict
        Maps a subset length to the list of distinct subsets (tuples) of that
        length found across all transactions.
    """
    all_subset_lists = dataframe.item.apply(lambda x: tuple(powerset(x))).unique()

    # Track what has been collected in a running set instead of rebuilding
    # set(all_subsets) on every iteration (which was quadratic overall).
    seen = set()
    all_subsets = []
    for subsets in all_subset_lists:
        fresh = set(subsets) - seen
        all_subsets.extend(fresh)
        seen |= fresh

    # Group subsets by length
    len_subsets = {}
    for subset in all_subsets:
        # setdefault replaces the former bare ``except:`` used to create the
        # bucket, which would also have hidden unrelated errors.
        len_subsets.setdefault(len(subset), []).append(subset)
    return len_subsets

def list_contains(BigList, SmallList):
    """Return True when every element of SmallList also occurs in BigList.

    Both arguments are converted to sets first, so duplicates and ordering
    are ignored and the elements must be hashable.
    """
    haystack = set(BigList)
    needles = set(SmallList)
    return needles.issubset(haystack)

def count_item(origin_value_counts, c_item_set, init=False):
    """Compute the support of each candidate itemset.

    Parameters
    ----------
    origin_value_counts : pd.Series
        Occurrence count per distinct transaction; each index label is the
        transaction itself (an iterable of hashable items).
    c_item_set : iterable
        Candidate itemsets. With ``init=True`` every element is a single item
        and is treated as a one-element itemset.
    init : bool
        Whether the candidates are bare items rather than iterables of items.

    Returns
    -------
    list
        One support value per candidate, in input order: the summed count of
        the transactions containing every item of the candidate, divided by
        ``origin_value_counts.sum()``.
    """
    total = origin_value_counts.sum()

    # Build each transaction's item set once, and take the count straight
    # from the (label, count) pair. The previous code looked the count up
    # again with a *sorted* copy of the label, which raised KeyError for any
    # transaction whose items were not already in sorted order.
    transactions = [
        (set(items), occurrences)
        for items, occurrences in origin_value_counts.items()
    ]

    itemset_count = []
    for c in c_item_set:
        wanted = {c} if init else set(c)
        count = sum(
            occurrences
            for basket, occurrences in transactions
            if wanted <= basket
        )
        itemset_count.append(count/total)
    return itemset_count


def check_all_subset_inside(df_item_set_list, itemset_list, length):
    """Keep the candidates whose immediate sub-itemsets are all present.

    Parameters
    ----------
    df_item_set_list : iterable
        Itemsets retained at the previous level. As called from ``apriori``
        these are bare items when ``length`` is 2 and tuples otherwise.
    itemset_list : iterable
        Candidate itemsets of size ``length``.
    length : int
        Size of the candidates being checked.

    Returns
    -------
    list
        The surviving candidates, each as a sorted tuple, in input order.
    """
    # Build the lookup set once; it used to be rebuilt for every candidate.
    previous_level = set(df_item_set_list)

    c_item_set = []
    for sets in tqdm(itemset_list):
        sets = tuple(sorted(sets))
        if length > 2:
            # every (length - 1)-sized subset must be in the previous level
            subsets = combinations(sets, length - 1)
        else:
            # pairs (and smaller) are checked against the individual items
            subsets = sets
        if all(subset in previous_level for subset in subsets):
            c_item_set.append(sets)
    return c_item_set

def powerset(iterable):
    """Yield every non-empty subset of ``iterable`` as a tuple, shortest first.

    powerset([1,2,3]) --> (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)

    The empty subset () is deliberately left out.
    """
    pool = list(iterable)
    sizes = range(1, len(pool) + 1)
    return chain.from_iterable(combinations(pool, size) for size in sizes)
    

def apriori(data, min_support:float, max_length):
    """Mine frequent itemsets level by level (Apriori).

    Parameters
    ----------
    data : pd.DataFrame
        Must expose an ``item`` column holding one iterable of hashable items
        per transaction.
    min_support : float
        Minimum support (inclusive) an itemset needs in order to be kept.
    max_length : int
        Largest itemset size to examine.

    Returns
    -------
    pd.DataFrame
        Columns ``itemsets`` (tuples) and ``support``. Rows keep the index
        labels they had inside their own level, so labels can repeat.
    """
    subsets_len_dict = subsets_by_len(data)
    origin_itemset_valcount = data.item.value_counts()

    # Level 1: collect every distinct item, using a running set rather than
    # rebuilding set(c_item_set) for each transaction.
    seen_items = set()
    c_item_set = []
    for transaction in data.item:
        fresh = set(transaction) - seen_items
        c_item_set.extend(fresh)
        seen_items |= fresh

    level_df = pd.DataFrame({
        "itemsets": c_item_set,
        "support": count_item(origin_itemset_valcount, c_item_set, init=True)
    })
    # prune; level-1 itemsets stay as bare items because
    # check_all_subset_inside compares pairs against individual items
    apriori_dict = {1: level_df[level_df["support"] >= min_support]}

    for i in range(2, max_length + 1):
        previous = apriori_dict[i - 1]
        if previous.empty:
            # every larger candidate needs a frequent subset one size down,
            # so nothing further can qualify
            break
        # .get(): max_length may exceed the longest transaction, in which
        # case there are simply no candidates of that size
        c_item_set = check_all_subset_inside(
            previous.itemsets, subsets_len_dict.get(i, []), i)

        level_df = pd.DataFrame({
            "itemsets": c_item_set,
            "support": count_item(origin_itemset_valcount, c_item_set)
        })
        apriori_dict[i] = level_df[level_df["support"] >= min_support]

    # DataFrame.append was removed in pandas 2.0; concatenate the non-empty
    # levels in one go instead.
    frames = [frame for frame in apriori_dict.values() if not frame.empty]
    if not frames:
        return pd.DataFrame({"itemsets":[], "support":[]})
    result_df = pd.concat(frames)

    # Wrap the bare level-1 items into 1-tuples so every itemset is a tuple.
    # This previously read and wrote the undefined name ``result`` instead of
    # ``result_df``.
    result_df["itemsets"] = result_df["itemsets"].apply(
        lambda x: x if isinstance(x, tuple) else (int(x), ))

    return result_df

In [4]:
# Read the raw rows, then collapse them into one tuple of item_id values per
# transaction_id and record how many items each transaction holds.
odf = pd.read_csv("../../data/ibm.csv")
df = (
    odf.groupby(['transaction_id'])["item_id"]
    .apply(tuple)
    .reset_index(name='item')
)
df["length"] = df["item"].apply(len)

In [5]:
# Sanity check: value_counts() tallies each distinct item tuple, so the sum of
# the tallies is the number of (non-null) transactions.
df.item.value_counts().sum()

828

In [6]:
# combinations(..., 0) yields exactly one result, the empty tuple. The value
# used to be computed and silently discarded; assert it instead so the cell
# documents why powerset() skips r == 0.
assert list(combinations([1, 2, 3], 0)) == [()]
list(powerset([1,2,3]))

[(1,), (2,), (3,), (1, 2), (1, 3), (2, 3), (1, 2, 3)]

In [7]:
# Top-level walk-through of the same steps subsets_by_len(df) performs, kept
# here so the intermediate collections can be inspected.
all_subset_lists = df.item.apply(lambda x: tuple(powerset(x))).unique()
all_subsets = []
len_subsets = {}
# Running set of what is already collected; avoids rebuilding
# set(all_subsets) on every iteration.
_seen_subsets = set()
for subsets in all_subset_lists:
    _new_subsets = set(subsets) - _seen_subsets
    all_subsets.extend(_new_subsets)
    _seen_subsets |= _new_subsets

# Group subsets by length (setdefault instead of a bare ``except:``)
for subsets in all_subsets:
    len_subsets.setdefault(len(subsets), []).append(subsets)




In [8]:
# Mine frequent itemsets with at least 10% support and at most 15 items.
result = apriori(df, min_support=0.1, max_length=15)

100%|██████████| 818/818 [00:00<00:00, 303945.84it/s]
100%|██████████| 4249/4249 [00:00<00:00, 267298.57it/s]
100%|██████████| 11426/11426 [00:00<00:00, 275865.13it/s]
100%|██████████| 20097/20097 [00:00<00:00, 331491.28it/s]
100%|██████████| 25548/25548 [00:00<00:00, 434069.28it/s]
100%|██████████| 24645/24645 [00:00<00:00, 471178.82it/s]
100%|██████████| 18556/18556 [00:00<00:00, 446259.86it/s]
100%|██████████| 11093/11093 [00:00<00:00, 441549.68it/s]
100%|██████████| 5299/5299 [00:00<00:00, 417022.23it/s]
100%|██████████| 2004/2004 [00:00<00:00, 404591.35it/s]
100%|██████████| 581/581 [00:00<00:00, 350027.38it/s]
100%|██████████| 121/121 [00:00<00:00, 286729.26it/s]
100%|██████████| 16/16 [00:00<00:00, 183859.90it/s]
100%|██████████| 1/1 [00:00<00:00, 27060.03it/s]


In [10]:
# Display the frequent itemsets and their support as returned by apriori().
result

Unnamed: 0,itemsets,support
0,"(307,)",0.138889
1,"(723,)",0.141304
2,"(470,)",0.212560
3,"(443,)",0.190821
4,"(973,)",0.158213
...,...,...
3,"(3, 111, 451, 488, 523)",0.355072
4,"(3, 111, 487, 488, 523)",0.355072
5,"(3, 451, 487, 488, 523)",0.352657
6,"(3, 111, 451, 487, 523)",0.357488


In [11]:
# Derive rules from the frequent itemsets; the filter is the function's
# default (confidence >= 0.8), spelled out here for the reader.
association_rules(result, metric="confidence", min_threshold=0.8)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift
0,(723),(443),0.141304,0.190821,0.124396,0.880342,4.613437
1,(307),(443),0.138889,0.190821,0.123188,0.886957,4.648101
2,(723),(470),0.141304,0.212560,0.126812,0.897436,4.222028
3,(973),(470),0.158213,0.212560,0.143720,0.908397,4.273595
4,(307),(470),0.138889,0.212560,0.123188,0.886957,4.172727
...,...,...,...,...,...,...,...
753,(451),"(3, 487, 488, 523, 111)",0.404589,0.355072,0.342995,0.847761,2.387572
754,(487),"(3, 451, 488, 523, 111)",0.404589,0.355072,0.342995,0.847761,2.387572
755,(488),"(3, 451, 487, 523, 111)",0.403382,0.357488,0.342995,0.850299,2.378540
756,(523),"(3, 451, 487, 488, 111)",0.411836,0.351449,0.342995,0.832845,2.369743
