#!/usr/bin/env python
# coding: utf-8

Author: Bao Cai

Course: Machine Learning for Descriptive Problems

Topic: Frequent Itemsets

Start Date: 2020-02-27

Last Save: 2020-02-27

1. Download the groceries.csv file from itslearning (also the file is in the folder homework inside the zip of this lecture)

2. Find the frequent pair of items (2-tuples) using the naïve, A-priori and PCY algorithms. For each of these compare the time of execution and results for supports s=10, 50, 100. Comment your results.

3. For the PCY algorithm, create up to 5 compact hash tables. What is the difference in results and time of execution for 1, 2, 3, 4 and 5 tables? Comment your results.

4. Find the final list of k-frequent items (k-tuples) for k=3, 4 and 5. Experiment a bit and describe the best value for the support in each case. *Warning*: You can use any of the three algorithms, but be careful, because the algorithm can take too long if you don't choose it properly.

5. Using one of the results of the previous item, for one k (k=3, 4 or 5) find the possible clusters using the 1-NN criterion. Comment your results.

In [1]:
# Libraries
import itertools
from datetime import datetime
from time import time

import numpy as np

In [2]:
# global variables
path_data = 'Data/groceries.csv'

In [3]:
# Function

def read_baskets(file, k=2, verbose=True):
    """
    Read a basket file and enumerate every k-item combination per basket.

    Each line of the file is one basket of comma-separated items.

    Parameters:
    ----------

    file: str
        Path to file.

    k: int
        Size of the itemsets generated from each basket.

    verbose: boolean
        Choose to report on progress or not.

    Returns:
    -------

    basket_of_k: list
        List of frozensets of size k, one entry per (basket, combination)
        pair, i.e. an itemset appears once for every basket containing it.
    """
    basket_of_k = []
    n = 0
    # Stream the file line by line instead of loading all lines up front.
    with open(file) as f:
        for basket in f:
            # Drop empty fields (blank lines, trailing commas) so that ''
            # is never counted as an item, and de-duplicate while keeping
            # first-seen order: a repeated item would otherwise produce
            # frozensets smaller than k and count some itemsets twice for
            # a single basket.
            items = list(dict.fromkeys(
                item
                for item in basket.replace('\n', '').split(',')
                if item
            ))
            for itemset in itertools.combinations(items, k):
                basket_of_k.append(frozenset(itemset))
            if verbose:
                n += 1
                if n % 1000 == 0:
                    print(n, 'baskets processed')
    return basket_of_k

def naive_frequency(baskets):
    """
    Count how many times each itemset occurs in the given list.

    Parameters:
    ----------

    baskets: list
        List of hashable itemsets (e.g. frozensets).

    Returns:
    -------

    basket_frequency: dict
        Maps each distinct itemset to its number of occurrences, in
        first-seen order.
    """
    counts = {}
    for itemset in baskets:
        counts[itemset] = counts.get(itemset, 0) + 1
    return counts

def frequency_threshold(basket_frequency, s=100):
    """
    Return the itemsets whose frequency reaches the support threshold.

    Also prints a one-line summary with the number of surviving itemsets.

    Parameters:
    ----------

    basket_frequency: dict
        Maps each itemset to its frequency.

    s: int
        Support threshold; itemsets with frequency >= s are kept.

    Returns:
    -------

    exceed_frequency: dict
        The entries of basket_frequency whose frequency is at least s.
    """
    # The itemset size is only used for the report. Read it from the first
    # key, falling back to 0 so an empty input (no candidate survived an
    # earlier filter) no longer raises IndexError.
    first_key = next(iter(basket_frequency), None)
    k = 0 if first_key is None else len(first_key)

    exceed_frequency = {
        key: value
        for key, value in basket_frequency.items()
        if value >= s
    }
    print('{} itemsets of size {} with frequency exceed {}'.format(
        len(exceed_frequency), k, s
    ))
    return exceed_frequency

def a_piori_preset(
    k,
    s=100,
    larger_set=None,
    smaller_set=None
):
    """
    Build the candidate k-itemsets from two collections of frequent itemsets.

    Every itemset of larger_set is combined with every itemset of
    smaller_set; only unions that contain exactly k items are kept.

    Parameters:
    ----------

    k: int
        Size of the candidate itemsets to produce.

    s: int
        Support threshold. Unused here; kept for a signature parallel to
        the filter functions.

    larger_set: dict
        Frequent itemsets (frozenset keys) of one size.

    smaller_set: dict
        Frequent itemsets (frozenset keys) of another size; the two sizes
        must add up to k.

    Returns:
    -------

    frequent_preset: set
        Candidate itemsets of size k. Empty when either input is empty or
        None. None is returned (after printing a message) when the input
        sizes do not add up to k.
    """
    # Nothing can be combined from an empty side; avoids indexing into an
    # empty key list.
    if not larger_set or not smaller_set:
        return set()

    basket_size = len(next(iter(larger_set))) + len(next(iter(smaller_set)))
    if basket_size != k:
        print(
            'The given sets cannot be combined',
            'to produce a set of', k
        )
        return None

    # Since the two sizes add up to k, a union has exactly k items only
    # when the operands share no item. Overlapping pairs would yield
    # undersized sets that can never match a k-combination, so they are
    # skipped instead of bloating the candidate set.
    return {
        a.union(b)
        for a in larger_set
        for b in smaller_set
        if a.isdisjoint(b)
    }

def a_piori_filter(
    file,
    k,
    s=100,
    frequent_preset=None,
    read_baskets=read_baskets,
    naive_frequency=naive_frequency,
    frequency_threshold=frequency_threshold
):
    """
    Count the k-itemsets of a basket file that belong to a candidate set.

    Parameters:
    ----------

    file: str
        Path to the basket file.

    k: int
        Itemset size. For k == 1 every item is counted and
        frequent_preset is ignored.

    s: int
        Support threshold.

    frequent_preset: set
        Candidate itemsets; only itemsets found in it are counted
        (used when k != 1).

    read_baskets, naive_frequency, frequency_threshold: callable
        Helper functions, bound as defaults.

    Returns:
    -------

    dict
        Result of frequency_threshold applied to the counted itemsets.
    """
    itemsets = read_baskets(file, k, False)

    # First pass: no candidate set exists yet, so count everything.
    if k == 1:
        return frequency_threshold(naive_frequency(itemsets), s)

    counts = {}
    for itemset in itemsets:
        if itemset in frequent_preset:
            counts[itemset] = counts.get(itemset, 0) + 1

    return frequency_threshold(counts, s)

def PCY_hash(
    file,
    k,
    n_hash=2,
    s=100,
    read_baskets=read_baskets
):
    """
    Build the frequent-bucket sets used by the PCY algorithm.

    Every k-itemset of the basket file is hashed into n_hash tables; each
    table uses the built-in hash() with its own modulus. A bucket is
    frequent when at least s itemsets fall into it.

    Parameters:
    ----------

    file: str
        Path to the basket file.

    k: int
        Size of the itemsets to hash.

    n_hash: int
        Number of hash tables to build.

    s: int
        Support threshold applied to the bucket counts.

    read_baskets: callable
        Function used to enumerate the k-itemsets of the file.

    Returns:
    -------

    hash_tables: list
        One (frequent_buckets, max_hash) tuple per table, where
        frequent_buckets is a set of bucket indices with count >= s and
        max_hash is the modulus of that table.

    Notes:
    -----
    Python randomizes str hashes per process (unless PYTHONHASHSEED is
    set), so the returned bucket indices are only meaningful inside the
    process that produced them.
    """
    # Hash every k-itemset once; the tables differ only in their modulus.
    hashes = np.fromiter(
        (hash(basket) for basket in read_baskets(file, k, False)),
        dtype=np.int64
    )

    hash_tables = []
    for i in range(n_hash):
        max_hash = 5*1000000 + i*1024
        # numpy's % has the sign of the divisor, like Python's, so bucket
        # indices match `hash(basket) % max_hash` computed elsewhere.
        bucket_counts = np.bincount(hashes % max_hash, minlength=max_hash)
        # Keep only the indices of frequent buckets, as plain ints, so
        # membership tests are cheap and the count array can be freed.
        frequent_buckets = set(np.flatnonzero(bucket_counts >= s).tolist())
        hash_tables.append((frequent_buckets, max_hash))
    return hash_tables

def PCY_filter(
    file,
    k,
    s=100,
    hash_tables=None,
    frequent_preset=None,
    read_baskets=read_baskets,
    frequency_threshold=frequency_threshold
):
    """
    Count the candidate k-itemsets that also hash to frequent buckets.

    Parameters:
    ----------

    file: str
        Path to the basket file.

    k: int
        Itemset size.

    s: int
        Support threshold.

    hash_tables: list
        Iterable of (frequent_buckets, max_hash) pairs; an itemset is
        kept only if hash(itemset) % max_hash is in frequent_buckets for
        every pair.

    frequent_preset: set
        Candidate itemsets; itemsets outside it are skipped.

    read_baskets, frequency_threshold: callable
        Helper functions, bound as defaults.

    Returns:
    -------

    dict
        Result of frequency_threshold applied to the counted itemsets.
    """
    counts = {}
    for itemset in read_baskets(file, k, False):
        if itemset not in frequent_preset:
            continue
        # all() stops at the first table whose bucket is not frequent.
        in_frequent_buckets = all(
            hash(itemset) % size in buckets
            for buckets, size in hash_tables
        )
        if in_frequent_buckets:
            counts[itemset] = counts.get(itemset, 0) + 1
    return frequency_threshold(counts, s)

In [4]:
# Load every basket (one line each) as a frozenset of its items.
# Uses the shared path_data constant rather than repeating the literal path.
with open(path_data) as f:
    baskets = [
        frozenset(
            basket.replace('\n', '').split(',')
        ) for basket in f
    ]

In [5]:
# Enumerate all 2-item combinations of every basket (progress is printed
# every 1000 baskets). NOTE: this rebinds `baskets`, replacing the
# per-basket frozensets built in the previous cell.
baskets = read_baskets(path_data, k=2)

1000 baskets processed
2000 baskets processed
3000 baskets processed
4000 baskets processed
5000 baskets processed
6000 baskets processed
7000 baskets processed
8000 baskets processed
9000 baskets processed


In [6]:
%%time
# Naive method, pairs: count every pair, then keep those with support >= 100
# (the default threshold of frequency_threshold).
naive_itemsets = naive_frequency(baskets)
naive_100_threshold = frequency_threshold(naive_itemsets)

207 itemsets of size 2 with frequency exceed 100
CPU times: user 85 ms, sys: 2.59 ms, total: 87.6 ms
Wall time: 83 ms


In [7]:
%%time
# Naive method, pairs, support >= 50. The counting is repeated so that the
# cell timing covers the whole method.
naive_itemsets = naive_frequency(baskets)
naive_50_threshold = frequency_threshold(naive_itemsets, 50)

605 itemsets of size 2 with frequency exceed 50
CPU times: user 74.4 ms, sys: 7.66 ms, total: 82.1 ms
Wall time: 77.7 ms


In [8]:
%%time
# Naive method, pairs, support >= 20.
naive_itemsets = naive_frequency(baskets)
naive_20_threshold = frequency_threshold(naive_itemsets, 20)

1674 itemsets of size 2 with frequency exceed 20
CPU times: user 78.6 ms, sys: 2.89 ms, total: 81.5 ms
Wall time: 78.2 ms


In [9]:
%%time
# Naive method, pairs, support >= 10.
naive_itemsets = naive_frequency(baskets)
naive_10_threshold = frequency_threshold(naive_itemsets, 10)

2981 itemsets of size 2 with frequency exceed 10
CPU times: user 79.3 ms, sys: 1.78 ms, total: 81.1 ms
Wall time: 78.1 ms


In [10]:
%%time
# Naive method, triples: re-read the file as 3-item combinations, count
# them all, then keep those with support >= 100.
naive_k3_itemsets = naive_frequency(read_baskets(path_data, 3, False))
naive_k3_100_threshold = frequency_threshold(naive_k3_itemsets, 100)

31 itemsets of size 3 with frequency exceed 100
CPU times: user 1.07 s, sys: 75.8 ms, total: 1.15 s
Wall time: 1.14 s


In [11]:
%%time
# Naive method, triples, support >= 20.
naive_k3_itemsets = naive_frequency(read_baskets(path_data, 3, False))
naive_k3_20_threshold = frequency_threshold(naive_k3_itemsets, 20)

1991 itemsets of size 3 with frequency exceed 20
CPU times: user 1.48 s, sys: 34.7 ms, total: 1.52 s
Wall time: 1.51 s


In [12]:
%%time
# Naive method, 4-itemsets, support >= 20.
naive_k4_itemsets = naive_frequency(read_baskets(path_data, 4, False))
naive_k4_20_threshold = frequency_threshold(naive_k4_itemsets, 20)

395 itemsets of size 4 with frequency exceed 20
CPU times: user 3.36 s, sys: 120 ms, total: 3.48 s
Wall time: 3.48 s


In [13]:
%%time
# Naive method, 5-itemsets, support >= 20.
naive_k5_itemsets = naive_frequency(read_baskets(path_data, 5, False))
naive_k5_20_threshold = frequency_threshold(naive_k5_itemsets, 20)

16 itemsets of size 5 with frequency exceed 20
CPU times: user 11.8 s, sys: 1.46 s, total: 13.2 s
Wall time: 13.2 s


### A-Priori

In [14]:
# First pass (k=1): frequent single items at supports 100, 50 and 10.
# With k == 1 a_piori_filter counts every item; no candidate set is needed.
apiori_k1_s100 = a_piori_filter(path_data, 1)
apiori_k1_s50 = a_piori_filter(path_data, 1, 50)
apiori_k1_s10 = a_piori_filter(path_data, 1, 10)

88 itemsets of size 1 with frequency exceed 100
120 itemsets of size 1 with frequency exceed 50
157 itemsets of size 1 with frequency exceed 10


In [15]:
# Candidate pairs for s=100: unions of the frequent single items.
# NOTE: the following cell rebinds this same name to the counted result.
apiori_k2_s100 = a_piori_preset(
    2,
    100,
    apiori_k1_s100,
    apiori_k1_s100
)

In [16]:
%%time
# Second pass for s=100: count only the candidate pairs and keep the
# frequent ones. Rebinds apiori_k2_s100 from the candidate set to the
# {itemset: count} result.
apiori_k2_s100 = a_piori_filter(
    path_data,
    2,
    100,
    apiori_k2_s100
)

207 itemsets of size 2 with frequency exceed 100
CPU times: user 277 ms, sys: 2.97 ms, total: 280 ms
Wall time: 278 ms


In [17]:
# Candidate pairs for s=50: unions of the frequent single items.
# NOTE: the following cell rebinds this same name to the counted result.
apiori_k2_s50 = a_piori_preset(
    2,
    50,
    apiori_k1_s50,
    apiori_k1_s50
)

In [18]:
%%time
# Second pass for s=50: count the candidate pairs and keep the frequent ones.
apiori_k2_s50 = a_piori_filter(
    path_data,
    2,
    50,
    apiori_k2_s50
)

605 itemsets of size 2 with frequency exceed 50
CPU times: user 271 ms, sys: 2.84 ms, total: 274 ms
Wall time: 271 ms


In [19]:
# Candidate pairs for s=10: unions of the frequent single items.
# NOTE: the following cell rebinds this same name to the counted result.
apiori_k2_s10 = a_piori_preset(
    2,
    10,
    apiori_k1_s10,
    apiori_k1_s10
)

In [20]:
%%time
# Second pass for s=10: count the candidate pairs and keep the frequent ones.
apiori_k2_s10 = a_piori_filter(
    path_data,
    2,
    10,
    apiori_k2_s10
)

2981 itemsets of size 2 with frequency exceed 10
CPU times: user 340 ms, sys: 1.91 ms, total: 342 ms
Wall time: 339 ms


In [21]:
# Candidate triples for s=100: unions of a frequent pair with a frequent
# single item. The following cell rebinds this name to the counted result.
apiori_k3_s100 = a_piori_preset(
    3,
    100,
    apiori_k2_s100,
    apiori_k1_s100
)

In [22]:
%%time
# Third pass for s=100: count the candidate triples and keep the frequent ones.
apiori_k3_s100 = a_piori_filter(
    path_data,
    3,
    100,
    apiori_k3_s100
)

31 itemsets of size 3 with frequency exceed 100
CPU times: user 2.84 s, sys: 33.4 ms, total: 2.87 s
Wall time: 2.85 s


In [23]:
# Support 20 pipeline: frequent single items first.
apiori_k1_s20 = a_piori_filter(path_data, 1, 20)
# Candidate pairs from the frequent single items ...
apiori_k2_s20 = a_piori_preset(
    2,
    20,
    apiori_k1_s20,
    apiori_k1_s20
)
# ... then counted; the name now holds the frequent pairs with their counts.
apiori_k2_s20 = a_piori_filter(
    path_data,
    2,
    20,
    apiori_k2_s20
)

# Candidate triples: frequent pair combined with a frequent single item.
apiori_k3_s20 = a_piori_preset(
    3,
    20,
    apiori_k2_s20,
    apiori_k1_s20
)

# Candidate 4-itemsets: frequent pair combined with another frequent pair.
apiori_k4_s20 = a_piori_preset(
    4,
    20,
    apiori_k2_s20,
    apiori_k2_s20
)

147 itemsets of size 1 with frequency exceed 20
1674 itemsets of size 2 with frequency exceed 20


In [24]:
%%time
# Count the candidate triples at s=20. Rebinds apiori_k3_s20 from the
# candidate set to the {itemset: count} result.
apiori_k3_s20 = a_piori_filter(
    path_data,
    3,
    20,
    apiori_k3_s20
)

1991 itemsets of size 3 with frequency exceed 20
CPU times: user 10.5 s, sys: 32.7 s, total: 43.1 s
Wall time: 1min 21s


In [25]:
# Candidate 5-itemsets: frequent triple combined with a frequent pair.
apiori_k5_s20 = a_piori_preset(
    5,
    20,
    apiori_k3_s20,
    apiori_k2_s20
)

In [26]:
%%time
# Count the candidate 4-itemsets at s=20. Rebinds apiori_k4_s20 from the
# candidate set to the {itemset: count} result.
apiori_k4_s20 = a_piori_filter(
    path_data,
    4,
    20,
    apiori_k4_s20
)

395 itemsets of size 4 with frequency exceed 20
CPU times: user 28.3 s, sys: 1min 43s, total: 2min 11s
Wall time: 4min 58s


In [None]:
%%time
# Count the candidate 5-itemsets at s=20.
# NOTE(review): no output is recorded for this cell; it looks like it was
# never run to completion. Confirm before relying on apiori_k5_s20.
apiori_k5_s20 = a_piori_filter(
    path_data,
    5,
    20,
    apiori_k5_s20
)

The number of frequent itemsets of size 2 and 3 seemed large, so I cross-checked them against the naive method above, and the counts match.

Also, I think the more advanced algorithms pay off when the number of items in a single basket is relatively large, or comparable to the number of unique items. The reasoning is that this dataset does not contain many baskets with 5 or more items (a guess; I did not inspect many of them), so the naive method can simply run through them and count quite cheaply.