In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('Groceries_dataset.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38765 entries, 0 to 38764
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Member_number    38765 non-null  int64 
 1   Date             38765 non-null  object
 2   itemDescription  38765 non-null  object
dtypes: int64(1), object(2)
memory usage: 908.7+ KB


In [3]:
print("First few rows:")
print(data.head())
print("\nUnique items:")
print(data['itemDescription'].nunique())
print("\nSample items:")
print(data['itemDescription'].unique()[:10])

First few rows:
   Member_number        Date   itemDescription
0           1808  21-07-2015    tropical fruit
1           2552  05-01-2015        whole milk
2           2300  19-09-2015         pip fruit
3           1187  12-12-2015  other vegetables
4           3037  01-02-2015        whole milk

Unique items:
167

Sample items:
['tropical fruit' 'whole milk' 'pip fruit' 'other vegetables' 'rolls/buns'
 'pot plants' 'citrus fruit' 'beef' 'frankfurter' 'chicken']


In [5]:
# Convert data to transaction format
transactions = data.groupby('Member_number')['itemDescription'].apply(list).values.tolist()
print(f"Total transactions: {len(transactions)}")
print("Sample transaction:", transactions[0])

Total transactions: 3898
Sample transaction: ['soda', 'canned beer', 'sausage', 'sausage', 'whole milk', 'whole milk', 'pickled vegetables', 'misc. beverages', 'semi-finished bread', 'hygiene articles', 'yogurt', 'pastry', 'salty snack']


In [8]:
# Apriori Algorithm Implementation
from itertools import combinations

def get_support(itemset, transactions):
    count = 0
    for transaction in transactions:
        if set(itemset).issubset(set(transaction)):
            count += 1
    return count / len(transactions)

def get_frequent_1_itemsets(transactions, min_support):
    items = {}
    for transaction in transactions:
        for item in transaction:
            items[item] = items.get(item, 0) + 1
    
    frequent_items = []
    for item, count in items.items():
        support = count / len(transactions)
        if support >= min_support:
            frequent_items.append(([item], support))
    
    return frequent_items

def generate_candidates(frequent_itemsets, k):
    candidates = []
    n = len(frequent_itemsets)
    
    for i in range(n):
        for j in range(i + 1, n):
            itemset1 = frequent_itemsets[i][0]
            itemset2 = frequent_itemsets[j][0]
            
            if k == 2:
                candidates.append(sorted(itemset1 + itemset2))
            elif itemset1[:-1] == itemset2[:-1]:
                candidates.append(sorted(itemset1 + [itemset2[-1]]))
    
    return candidates

def apriori(transactions, min_support):
    frequent_itemsets = {}
    
    # Generate frequent 1-itemsets
    frequent_1 = get_frequent_1_itemsets(transactions, min_support)
    frequent_itemsets[1] = frequent_1
    
    k = 2
    while frequent_itemsets.get(k-1):
        candidates = generate_candidates(frequent_itemsets[k-1], k)
        frequent_k = []
        
        for candidate in candidates:
            support = get_support(candidate, transactions)
            if support >= min_support:
                frequent_k.append((candidate, support))
        
        if frequent_k:
            frequent_itemsets[k] = frequent_k
            k += 1
        else:
            break
    
    return frequent_itemsets

In [9]:
# Run Apriori algorithm
min_support = 0.01  # 1% minimum support
frequent_itemsets = apriori(transactions, min_support)

print("Frequent Itemsets:")
for k, itemsets in frequent_itemsets.items():
    print(f"\n{k}-itemsets:")
    for itemset, support in itemsets[:10]:  # Show first 10
        print(f"  {itemset}: {support:.4f}")
    if len(itemsets) > 10:
        print(f"  ... and {len(itemsets) - 10} more")

Frequent Itemsets:

1-itemsets:
  ['soda']: 0.3884
  ['canned beer']: 0.1839
  ['sausage']: 0.2370
  ['whole milk']: 0.6419
  ['pickled vegetables']: 0.0344
  ['misc. beverages']: 0.0611
  ['semi-finished bread']: 0.0364
  ['hygiene articles']: 0.0534
  ['yogurt']: 0.3422
  ['pastry']: 0.2014
  ... and 106 more

2-itemsets:
  ['canned beer', 'soda']: 0.0546
  ['sausage', 'soda']: 0.0772
  ['soda', 'whole milk']: 0.1511
  ['pickled vegetables', 'soda']: 0.0118
  ['misc. beverages', 'soda']: 0.0172
  ['semi-finished bread', 'soda']: 0.0123
  ['hygiene articles', 'soda']: 0.0169
  ['soda', 'yogurt']: 0.0975
  ['pastry', 'soda']: 0.0631
  ['salty snack', 'soda']: 0.0205
  ... and 1116 more

3-itemsets:
  ['canned beer', 'sausage', 'soda']: 0.0164
  ['canned beer', 'soda', 'whole milk']: 0.0303
  ['canned beer', 'soda', 'yogurt']: 0.0180
  ['canned beer', 'pastry', 'soda']: 0.0128
  ['canned beer', 'frankfurter', 'soda']: 0.0105
  ['canned beer', 'rolls/buns', 'soda']: 0.0226
  ['canned bee

In [9]:
# Association Rules Generation
def generate_association_rules(frequent_itemsets, min_confidence):
    rules = []
    
    for k in range(2, len(frequent_itemsets) + 1):
        if k not in frequent_itemsets:
            continue
            
        for itemset, support in frequent_itemsets[k]:
            # Generate all possible antecedent-consequent combinations
            for i in range(1, len(itemset)):
                for antecedent in combinations(itemset, i):
                    antecedent = list(antecedent)
                    consequent = [item for item in itemset if item not in antecedent]
                    
                    # Calculate confidence
                    antecedent_support = get_support(antecedent, transactions)
                    if antecedent_support > 0:
                        confidence = support / antecedent_support
                        
                        if confidence >= min_confidence:
                            # Calculate lift
                            consequent_support = get_support(consequent, transactions)
                            lift = confidence / consequent_support if consequent_support > 0 else 0
                            
                            rules.append({
                                'antecedent': antecedent,
                                'consequent': consequent,
                                'support': support,
                                'confidence': confidence,
                                'lift': lift
                            })
    
    return rules

# Generate association rules
min_confidence = 0.5
rules = generate_association_rules(frequent_itemsets, min_confidence)

print(f"Generated {len(rules)} association rules with confidence >= {min_confidence}")
print("\nTop 20 Association Rules:")
print("=" * 80)

# Sort rules by lift in descending order
rules_sorted = sorted(rules, key=lambda x: x['lift'], reverse=True)

for i, rule in enumerate(rules_sorted[:20]):
    ant = " & ".join(rule['antecedent'])
    con = " & ".join(rule['consequent'])
    print(f"{i+1:2d}. {ant} => {con}")
    print(f"    Support: {rule['support']:.4f}, Confidence: {rule['confidence']:.4f}, Lift: {rule['lift']:.4f}")
    print()

Generated 1117 association rules with confidence >= 0.5

Top 20 Association Rules:
 1. rolls/buns & sausage & tropical fruit => yogurt
    Support: 0.0110, Confidence: 0.5309, Lift: 1.8761

 2. curd & sausage & whole milk => yogurt
    Support: 0.0100, Confidence: 0.5270, Lift: 1.8625

 3. other vegetables & rolls/buns & sausage & whole milk => yogurt
    Support: 0.0136, Confidence: 0.5196, Lift: 1.8363

 4. domestic eggs & meat => whole milk
    Support: 0.0103, Confidence: 0.7843, Lift: 1.7118

 5. canned beer & coffee => soda
    Support: 0.0113, Confidence: 0.5366, Lift: 1.7116

 6. UHT-milk & soda & whole milk => other vegetables
    Support: 0.0100, Confidence: 0.6393, Lift: 1.6977

 7. frozen meals & rolls/buns & whole milk => other vegetables
    Support: 0.0100, Confidence: 0.6393, Lift: 1.6977

 8. UHT-milk & sausage => other vegetables
    Support: 0.0118, Confidence: 0.6389, Lift: 1.6965

 9. frankfurter & sausage => soda
    Support: 0.0164, Confidence: 0.5289, Lift: 1.68

In [10]:
# Summary Statistics
print("APRIORI ALGORITHM RESULTS SUMMARY")
print("=" * 50)
print(f"Dataset: {len(transactions)} transactions")
print(f"Unique items: {data['itemDescription'].nunique()}")
print(f"Min support threshold: {min_support}")
print(f"Min confidence threshold: {min_confidence}")
print()

total_frequent = sum(len(itemsets) for itemsets in frequent_itemsets.values())
print(f"Total frequent itemsets found: {total_frequent}")

for k, itemsets in frequent_itemsets.items():
    print(f"  {k}-itemsets: {len(itemsets)}")

print(f"\nTotal association rules: {len(rules)}")

# Top rules by different metrics
print("\nTOP 5 RULES BY CONFIDENCE:")
top_conf = sorted(rules, key=lambda x: x['confidence'], reverse=True)[:5]
for i, rule in enumerate(top_conf):
    ant = " & ".join(rule['antecedent'])
    con = " & ".join(rule['consequent'])
    print(f"{i+1}. {ant} => {con} (Conf: {rule['confidence']:.3f})")

print("\nTOP 5 RULES BY LIFT:")
top_lift = sorted(rules, key=lambda x: x['lift'], reverse=True)[:5]
for i, rule in enumerate(top_lift):
    ant = " & ".join(rule['antecedent'])
    con = " & ".join(rule['consequent'])
    print(f"{i+1}. {ant} => {con} (Lift: {rule['lift']:.3f})")

print("\nTOP 5 RULES BY SUPPORT:")
top_supp = sorted(rules, key=lambda x: x['support'], reverse=True)[:5]
for i, rule in enumerate(top_supp):
    ant = " & ".join(rule['antecedent'])
    con = " & ".join(rule['consequent'])
    print(f"{i+1}. {ant} => {con} (Supp: {rule['support']:.3f})")

APRIORI ALGORITHM RESULTS SUMMARY
Dataset: 3898 transactions
Unique items: 167
Min support threshold: 0.01
Min confidence threshold: 0.5

Total frequent itemsets found: 3016
  1-itemsets: 116
  2-itemsets: 1126
  3-itemsets: 1459
  4-itemsets: 311
  5-itemsets: 4

Total association rules: 1117

TOP 5 RULES BY CONFIDENCE:
1. domestic eggs & meat => whole milk (Conf: 0.784)
2. chocolate & fruit/vegetable juice => whole milk (Conf: 0.750)
3. bottled water & other vegetables & rolls/buns & yogurt => whole milk (Conf: 0.745)
4. bottled water & pip fruit & yogurt => whole milk (Conf: 0.741)
5. brown bread & rolls/buns & yogurt => whole milk (Conf: 0.735)

TOP 5 RULES BY LIFT:
1. rolls/buns & sausage & tropical fruit => yogurt (Lift: 1.876)
2. curd & sausage & whole milk => yogurt (Lift: 1.863)
3. other vegetables & rolls/buns & sausage & whole milk => yogurt (Lift: 1.836)
4. domestic eggs & meat => whole milk (Lift: 1.712)
5. canned beer & coffee => soda (Lift: 1.712)

TOP 5 RULES BY SUPPORT