# Apriori
## imports

In [1]:
import pandas as pd
from itertools import combinations
import time

## Read csv

In [2]:
data = pd.read_csv("Sales Transaction v.4a.csv")
print(data.head(10))

  TransactionNo       Date ProductNo                          ProductName  \
0        581482  12/9/2019     22485        Set Of 2 Wooden Market Crates   
1        581475  12/9/2019     22596  Christmas Star Wish List Chalkboard   
2        581475  12/9/2019     23235             Storage Tin Vintage Leaf   
3        581475  12/9/2019     23272    Tree T-Light Holder Willie Winkie   
4        581475  12/9/2019     23239    Set Of 4 Knick Knack Tins Poppies   
5        581475  12/9/2019     21705              Bag 500g Swirly Marbles   
6        581475  12/9/2019     22118             Joy Wooden Block Letters   
7        581475  12/9/2019     22119           Peace Wooden Block Letters   
8        581475  12/9/2019     22217          T-Light Holder Hanging Lace   
9        581475  12/9/2019     22216            T-Light Holder White Lace   

   Price  Quantity  CustomerNo         Country  
0  21.47        12     17490.0  United Kingdom  
1  10.65        36     13069.0  United Kingdom  
2  11

## Group by transactions

In [3]:
transactions = data.groupby('TransactionNo')['ProductNo'].apply(set)

print(transactions.head(10))

TransactionNo
536365    {85123A, 84406B, 84029G, 71053, 84029E, 22752,...
536366                                       {22632, 22633}
536367    {21754, 22623, 48187, 22622, 21777, 22310, 227...
536368                         {22912, 22914, 22960, 22913}
536369                                              {21756}
536370    {21731, 10002, 21913, 21791, 22726, 21035, 225...
536371                                              {22086}
536372                                       {22632, 22633}
536373    {82483, 21068, 85123A, 20679, 84406B, 71053, 8...
536374                                              {21258}
Name: ProductNo, dtype: object


## Set minimum support and confidence
The minimum confidence was chosen because of time reasons otherwise the execution would take too long

In [4]:
min_support = len(transactions) * 0.02
min_confidence = 0.6

## 1-itemset

In [5]:
start_time = time.time()
counts = pd.Series([item for transaction in transactions for item in transaction]).value_counts()
itemset_1 = counts[counts > min_support]
current_items = set(itemset_1.index)
itemset_1.index = [(i,) for i in itemset_1.index]
itemsets = [itemset_1]

## Next itemsets

In [6]:
k = 2
while True:
    sets = list(combinations(current_items, k))
    sets = [tuple(sorted(tup)) for tup in sets]
    counts = pd.Series([s for s in sets for transaction in transactions if set(s).issubset(transaction)]).value_counts()
    itemset_k = counts[counts > min_support]
    current_items = {item for tup in itemset_k.index for item in tup}
    k += 1
    if itemset_k.empty or k > len(itemset_1):
        break
    else:
        itemsets.append(itemset_k)

## Find rules

In [7]:
rules = []
for counts in itemsets[1:]:
    for itemset in counts.index:
        sets = set()
        for i in range(1, len(itemset)):
            sets.update(combinations(itemset, i))
        for s in sets:
            confidence = counts[itemset]/itemsets[len(s)-1][s]
            if confidence > min_confidence:
                rules.append((s, set(s).symmetric_difference(itemset), confidence))
basic_apriori_time = time.time() - start_time
print(f"Time for basic apriori: {basic_apriori_time}")
for antecedent, consequent,  confidence in rules:
    print(f"Rule: {antecedent} -> {consequent} | Confidence: {confidence:.4f}")

Time for basic apriori: 198.6903212070465
Rule: ('22386',) -> {'85099B'} | Confidence: 0.6767
Rule: ('22697',) -> {'22699'} | Confidence: 0.7417
Rule: ('22699',) -> {'22697'} | Confidence: 0.7000
Rule: ('21931',) -> {'85099B'} | Confidence: 0.6103
Rule: ('22726',) -> {'22727'} | Confidence: 0.6454
Rule: ('22697',) -> {'22698'} | Confidence: 0.6093
Rule: ('22698',) -> {'22697'} | Confidence: 0.8040
Rule: ('22698',) -> {'22699'} | Confidence: 0.7665
Rule: ('85099C',) -> {'85099B'} | Confidence: 0.6262
Rule: ('22910',) -> {'22086'} | Confidence: 0.6671
Rule: ('23300',) -> {'23301'} | Confidence: 0.7176
Rule: ('21928',) -> {'85099B'} | Confidence: 0.6691
Rule: ('85099F',) -> {'85099B'} | Confidence: 0.6504
Rule: ('22630',) -> {'22629'} | Confidence: 0.6256
Rule: ('20712',) -> {'85099B'} | Confidence: 0.6170
Rule: ('22356',) -> {'20724'} | Confidence: 0.6921
Rule: ('20719',) -> {'20724'} | Confidence: 0.6014
Rule: ('21733',) -> {'85123A'} | Confidence: 0.6658
Rule: ('20723',) -> {'20724'} |

# FP-growth algorithm

In [40]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth,apriori,association_rules



encoder = TransactionEncoder()
preproc = encoder.fit(transactions).transform(transactions)
data = pd.DataFrame(preproc, columns=encoder.columns_)

start_time = time.time()

frequent_itemsets_fpgrowth = fpgrowth(data, min_support=0.02, use_colnames=True)

rules_fpgrowth = association_rules(frequent_itemsets_fpgrowth, metric="confidence", min_threshold=0.6)
fp_time = time.time() - start_time

print(f"Time for basic apriori: {fp_time}")

print("\nFrequent Itemsets (FP-Growth):")
print(frequent_itemsets_fpgrowth)
print("\nAssociation Rules (FP-Growth):")
print(rules_fpgrowth)

Time for basic apriori: 0.8176543712615967

Frequent Itemsets (FP-Growth):
      support         itemsets
0    0.099595         (85123A)
1    0.020945          (22633)
2    0.063222          (84879)
3    0.033917          (21754)
4    0.027194          (21755)
..        ...              ...
271  0.020988   (23298, 47566)
272  0.022324   (23203, 23202)
273  0.025082  (85099B, 23203)
274  0.021763   (23209, 23203)
275  0.023660   (23301, 23300)

[276 rows x 2 columns]

Association Rules (FP-Growth):
       antecedents     consequents  antecedent support  consequent support  \
0          (22726)         (22727)            0.043139            0.046587   
1          (21733)        (85123A)            0.032494            0.099595   
2          (22386)        (85099B)            0.053051            0.092010   
3          (21931)        (85099B)            0.051758            0.092010   
4         (85099C)        (85099B)            0.040812            0.092010   
5          (20723)         (2

# Better apriori implementation

In [41]:
start_time = time.time()
frequent_itemsets_apriori = apriori(data, min_support=0.02, use_colnames=True)
rules_apriori = association_rules(frequent_itemsets_apriori, metric="confidence", min_threshold=0.6)
better_apriori_time = time.time() - start_time

print(f"Time for better apriori: {better_apriori_time}")

print("\nFrequent Itemsets (Apriori):")
print(frequent_itemsets_apriori)
print("\nAssociation Rules (Apriori):")
print(rules_apriori)

Time for better apriori: 2.7918524742126465

Frequent Itemsets (Apriori):
      support               itemsets
0    0.022496                (15036)
1    0.023574               (15056N)
2    0.030340                (20685)
3    0.022841                (20711)
4    0.037580                (20712)
..        ...                    ...
271  0.023660         (23301, 23300)
272  0.023487        (82494L, 82482)
273  0.025556       (85099B, 85099C)
274  0.023487       (85099B, 85099F)
275  0.023660  (22698, 22697, 22699)

[276 rows x 2 columns]

Association Rules (Apriori):
       antecedents     consequents  antecedent support  consequent support  \
0          (20712)        (85099B)            0.037580            0.092010   
1          (20719)         (20724)            0.036330            0.045251   
2          (20723)         (20724)            0.031632            0.045251   
3          (22356)         (20724)            0.032753            0.045251   
4          (21733)        (85123A)    