# Exercises Week 10 - Association Rule Mining

In [2]:
import numpy as np

# Local imports
import sys
from utilities.load_data import load_market_basket

## Exercise 1.

Consider a dataset of transactions $D$, and let $D'$ be a dataset derived from $D$ by independently deleting items from transactions in $D$. In particular, any item in any transaction in $D$ is deleted with probability $p$.

1. Given an itemset $S$, compute its expected support in $D'$ as a function of its support in $D$.
2. Compute the probability that an itemset $S$, which is frequent in $D$, is also frequent in $D'$, under the same minimum support threshold.

### Solutions
1.  To solve this, we need to think of what the probability is for loosing each "supporting" transaction.
    Suppose that $|S| = k$ and consider a transaction $t_i$ of which $S \subseteq i(t_i)$, then since each item in $t_i$ is deleted independently with probability $p$, the probability of keeping the support from $t_i$ is $Pr[\text{not deleting any item } x\in S \text{ from }t_i] = \prod_{x \in S} (1-p) = (1-p)^k$.  
    Thus computing the expected value of the support for $S$ in $D'$ amounts to
    $$
        \mathbb{E}\left[Sup(S, D')\right] = \sum_{t_i \in D} (1-p)^k 1 = (1-p)^k Sup(S, D)
    $$
    
2.  To compute this probability, we again consider an itemset $S$ with size $k$, i.e., $|S| = k$.
    Again, we can realize that a transaction $t_i$ is going to be supportive to $S$ if all $x \in S$ of $t_i$ are kept. This has probability $\hat p = (1-p)^k$.
    We can think of $\hat p$ as a bernouli distribution.  
    
    Now we need to quantify the probability of keeping at least the minimum support ($m$) of the supporting transactions. 
    Each of the $n := Sup(S, D)$ supporting transactions will be kept with probability $\hat p$ so we can think of this setup as a binomial distribution with $n$ trials.
    The binomial distribution has probability mass function $Pr[X = k] = {n \choose k }\hat p^k(1-\hat p)^{n-k}$ for bernouli trials with $n$ samples (transactions in this case) and exactly $k$ positive outcomes.
    
    Now we just need to sum up the probabilities of all $k \geq m$.
    
    $$
        Pr[sup(S, D') \geq m] = \sum_{k=m}^{n} {n \choose k}\hat p^k (1-\hat p)^{n - k}
    $$

## Exercise 2.

The work you do in this exercise will be useful also in your hand-in.

We learned the Apriori algorithm in class. Make your own implementation. 

We will use the anonymized real-world retail market basket data from: http://fimi.ua.ac.be/data/.

This data comes from an anonymous Belgian retail store, and was donated by Tom Brijs from Limburgs Universitair Centrum, Belgium. The original data contains 16,470 different items and 88,162 transactions. You may only work with the top-50 items in terms of occurrence frequency.

Your task is to:
1. Implement the Apriori algorithm.
2. Apply the Apriori algorithm on these data to find association rules with minimum support 0.05 and minimum confidence 0.7. Write down the rules in descending order of confidence.


In [3]:
from itertools import combinations

### TODO Your code here
def compute_candidates(prev_itemset):
    Ck = set()
    # Join step
    for itemset in prev_itemset:
        its1 = tuple(sorted(itemset))
        for itemset2 in prev_itemset:
            its2 = tuple(sorted(itemset2))
            if its1[:-1] == its2[:-1]:
                if its1[-1] < its2[-1]: Ck.add(its1 + its2[-1:])

    # Pruning step
    to_remove = set()
    for c in Ck:
        for subset in combinations(c, len(c)-1):
            if not subset in prev_itemset:
                to_remove.add(c)
                break
    for c in to_remove:
        Ck.remove(c)
    
    return Ck


def apriori_algorithm_paper(T, support=0.05, min_confidence=0.7):
    # We are going to keep dictionaries of dictionaries:
    # itemsets: Dict(int, Dict(tuple, int)) where the outer int is the size of the itemset, 
    # the tuple is the itemset and the last int is the count of that itemset 
    
    itemsets = dict()
    
    # 1. Count size one itemsets
    num_transactions = len(T)
    
    counts = dict()
    for t in T:
        for i in t:
            k = (i,)
            if k in counts: counts[k] += 1
            else:           counts[k]  = 1
    
    itemsets[1] = {
        i: c for (i, c) in counts.items() if (c / num_transactions) >= support
    }
    
    still_applicable = [True] * len(T) # Remember which rows are still candidates for matching itemsets (for efficiency)
    
    # 2. Construct larger itemsets
    k = 2
    while k-1 in itemsets and itemsets[k-1]:
        prev_is = set(itemsets[k-1].keys())
        
        Ck = compute_candidates(prev_is)
        
        # Count occurences of candidate sets in the transaction database
        counts = dict()
        for row, t in enumerate(T): 
            if not still_applicable[row]: continue
            
            found_any = False
            for c in Ck:
                if set.issubset(set(c), t):
                    if c in counts: counts[c] += 1
                    else:           counts[c] = 1
                    found_any = True
            
            still_applicable[row] = found_any
            
        # Keep candidate sets with proper support
        itemsets[k] = {
            c: counts[c] for c in Ck if (counts[c] / num_transactions) >= support
        }
        k += 1
    
    # 3. generate rules from itemsets
    count = lambda i: itemsets[len(i)][i]                      # For easier look up of counts
    rule  = lambda lhs, rhs: "%s => %s" % (str(lhs), str(rhs)) # For generating rule strings
    rules = []                                                 # Collection of rules
    
    def apriori_rule_generation(itemset, Hm):
        """
            Recursive formulation for generating rules. Starting from itemsets of size 2.
        """
        if (len(itemset)-1) <= len(Hm[0]): return # Stop recursion if itemset is only one larger than candidates

        Hm = list(compute_candidates(Hm))         # Same computation as for building itemsets
        to_remove = []
        
        confidence = lambda lhs: count(itemset) / count(lhs)

        for hm in Hm:
            lhs = tuple(sorted(set(itemset).difference(set(hm))))
            
            conf = confidence(lhs)
            if conf >= min_confidence:
                rules.append((rule(lhs, hm), conf))
            else:
                to_remove.append(hm)

        for hm in to_remove:
            Hm.remove(hm)
         
        if Hm: apriori_rule_generation(itemset, Hm)
    
    # Generate rules for all k>=2 itemsets 
    for size in itemsets.keys():
        if size < 2: continue
            
        # For every itemset of this size
        for itemset in itemsets[size].keys():
            confidence = lambda lhs: count(itemset) / count(lhs)
            
            # Capture rules {others} -> {a single item}
            for rhs in combinations(itemset, 1):
                # Compute the left hand side
                remaining = set(itemset).difference(set(rhs))
                lhs = tuple(sorted(remaining))

                # If the confidence is high enough, yield the rule
                conf = confidence(lhs)
                
                if conf >= min_confidence:
                    rules.append((rule(lhs, rhs), conf))
            
            # Recurse on H_m, where m > 1
            H1 = list(combinations(itemset, 1))
            apriori_rule_generation(itemset, H1)
            
    return itemsets, rules
### TODO Your code here

In [6]:
from itertools import chain, combinations
rule  = lambda lhs, rhs: "%s => %s" % (str(lhs), str(rhs)) # For generating rule strings

def compute_support(Ck, T, still_applicable=None):
    if still_applicable is None: still_applicable = [True] * len(T)
    
    counts = {}
    for row, t in enumerate(T): 
        if not still_applicable[row]: continue

        found_any = False
        for c in Ck:
            if set.issubset(set(c), t):
                if c in counts: counts[c] += 1
                else:           counts[c] = 1
                found_any = True

        still_applicable[row] = found_any            
    return counts

def extend_prefix_tree(Ck_prev):
    Ck = set()
    # Join step
    for itemset in Ck_prev:
        its1 = tuple(sorted(itemset))
        for itemset2 in Ck_prev:
            its2 = tuple(sorted(itemset2))
            if its1[:-1] == its2[:-1] and its1[-1] < its2[-1]: Ck.add(its1 + its2[-1:])

    # Pruning step
    to_remove = set()
    for c in Ck:
        for subset in combinations(c, len(c)-1):
            if not subset in Ck_prev:
                to_remove.add(c)
                break
    for c in to_remove:
        Ck.remove(c)
    return Ck

def powerset(iterable):
    s = list(iterable)
    return chain.from_iterable(combinations(s, r) for r in range(1, len(s)))

def apriori_algorithm_book(T, support=0.5, min_confidence=0.7):
    n = len(T)
    
    # Compute Itemsets
    itemsets = {}
    C1 = set()
    for t in T:
        for ti in t: C1.add((ti,))
    
    still_applicable = [True] * n
    Ck = C1
    k = 1
    while Ck:
        itemsets[k] = compute_support(Ck, T, still_applicable)
        Ck_copy = Ck.copy()
        for itemset in Ck:
            if itemsets[k][itemset] / n < support:
                del itemsets[k][itemset]
                Ck_copy.remove(itemset)
         
        Ck = extend_prefix_tree(Ck_copy)
        k += 1
     
    # Construct Rules
    k = 2
    rules = []
    while k <= max(itemsets.keys()):
        for itemset in itemsets[k].keys():
            for rhs in powerset(itemset):
                remaining = set(itemset).difference(set(rhs))
                lhs = tuple(sorted(remaining))
                
                conf = itemsets[k][itemset] / itemsets[len(lhs)][lhs]
                if conf >= min_confidence:
                    rules.append((rule(lhs, rhs), conf))
        k+= 1
    return itemsets, rules

In [9]:
# Load the retail data
transactions = load_market_basket()

def book_example():
    return [
        [1, 2, 4, 5],
        [2, 3, 5],
        [1, 2, 4, 5],
        [1, 2, 3, 5],
        [1, 2, 3, 4, 5], 
        [2, 3, 4],
    ]
    
def filter_transactions(T, k=50):
    """
        Keep only the top k items in the transactions.
        Remove transactions that become empty.
    """
    # Count occurences of each item
    counts = [0] * 16470
    for t in T:
        for i in t:
            counts[i] += 1

    # Sort and select top k
    counts = np.array(counts)
    order  = np.argsort(counts)[::-1] # reverse the sorted order

    indexes_to_keep = order[:k]       # Keep the top k items
    index_set = set(indexes_to_keep)  # Convert to python set for efficiency

    # Filter transactions
    T_new = [t_ for t_ in  [list(filter(lambda i: i in index_set, t)) for t in T]  if t_]
    return T_new

T = filter_transactions(transactions, k=100)

# Example 8.1 from the book
# T = book_example()

book = True
if book:
    apriori_algorithm = apriori_algorithm_book
else:
    apriori_algorithm = apriori_algorithm_paper

itemsets, rules = apriori_algorithm(T, support=0.05, min_confidence=0.7)
rules = sorted(rules, key=lambda x: x[1], reverse=True)

print("%-8s \t %s" % ("Conf.", "Rule"))
for r in rules:
    print("%7.4f%% \t %s" % r[::-1])


Conf.    	 Rule
 0.8168% 	 (41, 48) => (39,)
 0.7681% 	 (38, 48) => (39,)
 0.7637% 	 (41,) => (39,)


## Exercise 3.

We have learned how to mine frequent itemsets and association rules from a transaction database where each transaction consists of a simple set of items. You are asked to propose a framework for mining association rules from transaction data, in which each item in a transaction is associated with an integer number that counts how many times the items appears in the transaction. In a market basket, this count number indicates the number of copies of a product in a customer’s basket. For example, we do not only care whether a customer bought fish or not, but how many pieces of fish they bought. You need to:

1. Define (extend) the notion of an itemset and an association rule in the case of such data.

2. Describe an efficient algorithm that mines itemsets and association rules as defined in (1). Illustrate the pruning strategies used in your algorithm and explain how they relate to the Apriori principle.

3. Extend your implementation of the Apriori algorithm to handle this case.

### Solution:
1. 
    1. The simple solution would be to let the "identifiers" be the item and the count. This would give rules like [(A, 2), (C, 3) => (D, 1)].
    2. One could also keep the original formulation of an itemset but change the count by counting as many supports as the min count of the items in each transaction. This woul
    3. Finally, one could compute mean counts in supporting transactions, i.e., [A:2, B:3.5, D:1.2] and if I then make a rule say "E => ABD", We could then give an additional information on how many items one could expect.
2. For the third (and more interesting) solution above, mining would be roughly the same as the original algorithm. The differrence would be the additional book keeping for keeping the means. Note that we can compute a running mean.
3. Alongside with the count variables, we will keep sum variables.
    When producing rules, we will output sum / count along with confidence in order to indicate how many of each items one can expect.

In [3]:
### TODO your code here
# You can copy pase the code from above and adjust it
### TODO your code here

# Optional
Test of expectation computations above.

In [99]:
import numpy as np
from tqdm import tqdm

itemsets, _ = apriori_algorithm(T)

p = 0.1

def sample_itemset(p=0.1):
    T_ = []
    for t in T: 
        rs = np.random.rand(len(t))
        new = []
        for x, pr in zip(t, rs):
            if pr < p: continue
            else:      new.append(x)
        T_.append(new)
    itemsets, _ = apriori_algorithm(T_)
    return itemsets


I2 = [sample_itemset() for _ in tqdm(range(10))]
I += I2

100%|██████████| 10/10 [00:07<00:00,  1.30it/s]


In [100]:
for k in range(1, max(itemsets.keys())):
    print(" _ - - - - -  ", k, " - - - -- - ")
    for key in itemsets[k].keys():
        counts = []
        for itemsets2 in I:
            if key in itemsets2[k]: counts.append(itemsets2[k][key])
        print("%s \t %-10d %-10d %-10.2f %-5d" % (str(key), itemsets[k][key], itemsets[k][key] * (1-p)**k, np.mean(counts), len(counts)))
        

 _ - - - - -   1  - - - -- - 
(32,) 	 15167      13650      13635.63   30   
(38,) 	 15596      14036      14031.97   30   
(39,) 	 50675      45607      45622.53   30   
(41,) 	 14945      13450      13450.37   30   
(48,) 	 42135      37921      37915.73   30   
(65,) 	 4472       4024       4023.66    29   
 _ - - - - -   2  - - - -- - 
(38, 39) 	 10345      8379       8378.83    30   
(32, 39) 	 8455       6848       6839.50    30   
(41, 48) 	 9018       7304       7310.60    30   
(32, 48) 	 8034       6507       6496.17    30   
(39, 48) 	 29142      23605      23606.07   30   
(38, 48) 	 7944       6434       6424.03    30   
(39, 41) 	 11414      9245       9246.13    30   
 _ - - - - -   3  - - - -- - 
(32, 39, 48) 	 5402       3938       nan        0    
(38, 39, 48) 	 6102       4448       4439.00    30   
(39, 41, 48) 	 7366       5369       5372.37    30   
