#!/usr/bin/env python
# coding: utf-8

Author: Bao Cai

Course: Machine Learning for Descriptive Problems

Topic: Frequent Itemsets

Start Date: 2020-02-27

Last Save: 2020-02-27

1. Download the groceries.csv file from itslearning (also the file is in the folder homework inside the zip of this lecture)

2. Find the frequent pairs of items (2-tuples) using the naïve, A-priori and PCY algorithms. For each of these compare the time of execution and results for supports s=10, 50, 100. Comment your results.

3. For the PCY algorithm, create up to 5 compact hash tables. What is the difference in results and time of execution for 1, 2, 3, 4 and 5 tables? Comment your results.

4. Find the final list of k-frequent items (k-tuples) for k=3, 4 and 5. Experiment a bit and describe the best value for the support in each case. *Warning*: You can use any of the three algorithms, but be careful, because the algorithm can take too long if you don't choose it properly.

5. Using one of the results of the previous item, for one k (k=3, 4 or 5) find the possible clusters using the 1-NN criterion. Comment your results.

In [1]:
# Libraries
import itertools
from time import time
from datetime import datetime

In [2]:
# global variables
path_data = 'Data/groceries.csv'

In [3]:
# Function

def read_baskets(file, k=2, verbose=True):
    """
    Read a basket file and list every k-item combination of each basket.

    Each line of the file is one basket whose items are separated by
    commas. Empty fields (blank lines, doubled or trailing commas) and
    repeated items within a basket are ignored, so every returned itemset
    contains exactly k distinct items.

    Parameters:
    ----------

    file: str
        Path to file.

    k: int
        Size of the itemsets generated from each basket.

    verbose: boolean
        Choose to report on progress or not.

    Returns:
    -------

    basket_of_k: list
        One frozenset per k-combination per basket, in file order. The same
        itemset appears once for every basket that contains it.
    """
    basket_of_k = []
    with open(file) as f:
        # Stream the file line by line instead of loading it whole.
        for n, basket in enumerate(f, start=1):
            # dict.fromkeys removes repeated items while keeping their
            # first-seen order; the `if item` filter drops empty fields.
            items = list(dict.fromkeys(
                item for item in basket.rstrip('\n').split(',') if item
            ))
            basket_of_k.extend(
                frozenset(itemset)
                for itemset in itertools.combinations(items, k)
            )
            if verbose and n % 1000 == 0:
                print(n, 'baskets processed')
    return basket_of_k

def naive_frequency(baskets):
    """
    Tally how many times each basket occurs in the given list.

    Parameters:
    ----------

    baskets: list
        Hashable baskets (for example frozensets of items) to count.

    Returns:
    -------

    basket_frequency: dict
        Maps every distinct basket to its number of occurrences.
    """
    counts = {}
    for itemset in baskets:
        # A basket seen for the first time starts from zero.
        counts[itemset] = counts.get(itemset, 0) + 1
    return counts

def frequency_threshold(basket_frequency, s=100):
    """
    Keep only the itemsets whose frequency reaches the support threshold.

    Prints how many itemsets were kept.

    Parameters:
    ----------

    basket_frequency: dict
        Maps each itemset to its frequency.

    s: int
        Support threshold; an itemset is kept when its frequency is
        greater than or equal to s.

    Returns:
    -------

    exceed_frequency: dict
        The entries of basket_frequency whose frequency is at least s.
    """
    exceed_frequency = {
        itemset: count
        for itemset, count in basket_frequency.items()
        if count >= s
    }
    print('{} itemsets with frequency exceed {}'.format(
        len(exceed_frequency), s
    ))
    return exceed_frequency

def a_piori_filter(
    file,
    k,
    s=100,
    larger_set=None,
    smaller_set=None,
    read_baskets=read_baskets,
    naive_frequency=naive_frequency,
    frequency_threshold=frequency_threshold
):
    """
    Count frequent k-itemsets, restricted to candidates built from two
    dicts of smaller frequent itemsets.

    For k == 1 every single item in the file is counted directly. For
    larger k a candidate is the union of one key of larger_set with one key
    of smaller_set; only candidates of exactly k items are counted.

    Parameters:
    ----------

    file: str
        Path to the basket file.

    k: int
        Size of the itemsets to count.

    s: int
        Support threshold passed to frequency_threshold.

    larger_set: dict
        Frequent itemsets (frozenset keys) of the larger size. Not used
        when k == 1.

    smaller_set: dict
        Frequent itemsets (frozenset keys) of the smaller size. Not used
        when k == 1.

    read_baskets, naive_frequency, frequency_threshold: callable
        The helpers used to read, count and filter itemsets.

    Returns:
    -------

    dict or None
        The result of frequency_threshold for the counted itemsets, or
        None when the key sizes of the two sets do not add up to k.

    Raises:
    ------

    ValueError
        If k > 1 and larger_set or smaller_set is not given.
    """
    if k == 1:
        return frequency_threshold(
            naive_frequency(read_baskets(file, k, False)),
            s
        )
    if larger_set is None or smaller_set is None:
        raise ValueError('larger_set and smaller_set are required when k > 1')
    # With no frequent itemsets at a lower level there are no candidates;
    # skip the pass over the file instead of failing on the size lookup.
    if not larger_set or not smaller_set:
        return frequency_threshold({}, s)
    # Sizes are read from the first key of each dict only.
    basket_size = len(next(iter(larger_set))) + len(next(iter(smaller_set)))
    if basket_size != k:
        print(
            'The given sets cannot be combined',
            'to produce a set of', k
        )
        return None
    # Overlapping keys give unions with fewer than k items; those are not
    # k-itemsets, so they are left out of the candidate set.
    frequent_preset = set()
    for a in larger_set:
        for b in smaller_set:
            candidate = a.union(b)
            if len(candidate) == k:
                frequent_preset.add(candidate)
    filtered_set = {}
    for basket in read_baskets(file, k, False):
        if basket in frequent_preset:
            filtered_set[basket] = filtered_set.get(basket, 0) + 1
    return frequency_threshold(filtered_set, s)

In [4]:
# Load every basket as a frozenset of its items (one basket per line,
# items separated by commas). Uses the shared path_data constant so the
# data location is defined in one place only.
with open(path_data) as f:
    baskets = [
        frozenset(line.replace('\n', '').split(','))
        for line in f
    ]

In [5]:
# Expand every basket into its 2-item combinations; progress is printed
# every 1000 baskets. This rebinds `baskets`, replacing the list of whole
# baskets assigned in the previous cell.
baskets = read_baskets(path_data, k=2)

1000 baskets processed
2000 baskets processed
3000 baskets processed
4000 baskets processed
5000 baskets processed
6000 baskets processed
7000 baskets processed
8000 baskets processed
9000 baskets processed


In [6]:
%time
naive_itemsets = naive_frequency(baskets)
naive_100_threshold = frequency_threshold(naive_itemsets)

CPU times: user 6 µs, sys: 2 µs, total: 8 µs
Wall time: 15.3 µs
207 itemsets with frequency exceed 100


In [7]:
%time
naive_itemsets = naive_frequency(baskets)
naive_50_threshold = frequency_threshold(naive_itemsets, 50)

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 12.4 µs
605 itemsets with frequency exceed 50


In [8]:
%time
naive_itemsets = naive_frequency(baskets)
naive_10_threshold = frequency_threshold(naive_itemsets, 10)

CPU times: user 5 µs, sys: 1 µs, total: 6 µs
Wall time: 11.7 µs
2981 itemsets with frequency exceed 10


In [9]:
%time
naive_k3_itemsets = naive_frequency(read_baskets(path_data, 3, False))
naive_k3_100_threshold = frequency_threshold(naive_k3_itemsets, 100)

CPU times: user 6 µs, sys: 1 µs, total: 7 µs
Wall time: 12.4 µs
31 itemsets with frequency exceed 100


In [10]:
# First pass: frequent single items for each support threshold
# (s=100 is the default of a_piori_filter). These dicts provide the
# candidates for the larger itemsets computed in the following cells.
apiori_k1_s100 = a_piori_filter(path_data, 1)
apiori_k1_s50 = a_piori_filter(path_data, 1, 50)
apiori_k1_s10 = a_piori_filter(path_data, 1, 10)

88 itemsets with frequency exceed 100
120 itemsets with frequency exceed 50
157 itemsets with frequency exceed 10


In [11]:
%time
apiori_k2_s100 = a_piori_filter(
    path_data,
    2,
    100,
    apiori_k1_s100,
    apiori_k1_s100
)

CPU times: user 5 µs, sys: 1 µs, total: 6 µs
Wall time: 14.1 µs
207 itemsets with frequency exceed 100


In [12]:
%time
apiori_k2_s50 = a_piori_filter(
    path_data,
    2,
    50,
    apiori_k1_s50,
    apiori_k1_s50
)

CPU times: user 6 µs, sys: 0 ns, total: 6 µs
Wall time: 12.9 µs
605 itemsets with frequency exceed 50


In [13]:
%time
apiori_k2_s10 = a_piori_filter(
    path_data,
    2,
    10,
    apiori_k1_s10,
    apiori_k1_s10
)

CPU times: user 6 µs, sys: 1e+03 ns, total: 7 µs
Wall time: 12.6 µs
2981 itemsets with frequency exceed 10


In [14]:
%time
apiori_k3_s100 = a_piori_filter(
    path_data,
    3,
    100,
    apiori_k2_s100,
    apiori_k1_s100
)

CPU times: user 6 µs, sys: 0 ns, total: 6 µs
Wall time: 12.4 µs
31 itemsets with frequency exceed 100


In [15]:
%%time
# Frequent pairs at s=10, recomputed here so that the timing of this cell
# covers both passes.
apiori_k2_s10 = a_piori_filter(
    path_data,
    2,
    10,
    apiori_k1_s10,
    apiori_k1_s10
)

# Frequent 4-itemsets at s=10: candidates are unions of two frequent pairs.
apiori_k4_s10 = a_piori_filter(
    path_data,
    4,
    10,
    apiori_k2_s10,
    apiori_k2_s10
)

2981 itemsets with frequency exceed 10
3137 itemsets with frequency exceed 10
CPU times: user 29.5 s, sys: 2.11 s, total: 31.7 s
Wall time: 31.5 s


In [None]:
%%time
# Frequent triples at s=10: candidates are frequent pairs combined with
# frequent single items.
apiori_k3_s10 = a_piori_filter(
    path_data,
    3,
    10,
    apiori_k2_s10,
    apiori_k1_s10
)

# Frequent 5-itemsets at s=10: candidates are frequent triples combined
# with frequent pairs.
# NOTE(review): the recorded output shows only the k=3 count and no timing
# line, so this second step apparently did not finish in the saved run.
apiori_k5_s10 = a_piori_filter(
    path_data,
    5,
    10,
    apiori_k3_s10,
    apiori_k2_s10
)

6831 itemsets with frequency exceed 10
