# 🛒 Market Basket Analysis & Association Rules

**Steps 6-7: Prepare Baskets and Compute Association Rules**

This notebook covers:
- 6.1 Focus on Top Products
- 6.2 Create Baskets
- 7.1 Count Frequencies
- 7.2 Calculate Support, Confidence, and Lift
- Filter and save association rules

---

In [1]:
from pathlib import Path
def project_root(start: Path = None) -> Path:
    """Locate the project root by walking upward from ``start``.

    The root is the nearest directory (``start`` itself first, then each of
    its parents in turn) that contains a ``data_raw``, ``outputs`` or
    ``data_clean`` entry.

    Args:
        start: Directory to begin the search from. The current working
            directory is used when this is omitted.

    Returns:
        The first directory that contains one of the marker entries, or the
        starting directory itself when none of the candidates does.
    """
    markers = ("data_raw", "outputs", "data_clean")
    origin = start if start else Path.cwd()
    for candidate in (origin, *origin.parents):
        if any((candidate / marker).exists() for marker in markers):
            return candidate
    # No marker found anywhere up the tree: fall back to where we started.
    return origin
# Resolve the project root once, then derive the shared folders from it.
ROOT = project_root()
DATA_CLEAN, OUTPUTS = ROOT / "data_clean", ROOT / "outputs"

# Create both folders up front (no error if they already exist).
for _folder in (DATA_CLEAN, OUTPUTS):
    _folder.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so the reader can confirm them.
print("ROOT:", ROOT)
print("DATA_CLEAN:", DATA_CLEAN)
print("OUTPUTS:", OUTPUTS)


ROOT: /Users/alihasan/retail-pricing-mba
DATA_CLEAN: /Users/alihasan/retail-pricing-mba/data_clean
OUTPUTS: /Users/alihasan/retail-pricing-mba/outputs


In [2]:
# Import required libraries
import pandas as pd
import numpy as np
from collections import Counter
import itertools
import warnings
# NOTE(review): this silences every warning category from here on, including
# deprecation notices from the libraries above; consider narrowing the filter
# to the specific warning that prompted it.
warnings.filterwarnings('ignore')

print("✅ Libraries imported successfully!")

✅ Libraries imported successfully!


In [3]:
# Load data
#
# Paths are built from DATA_CLEAN / OUTPUTS (resolved in the first cell) instead
# of hard-coded '../' strings, so this cell reads the same files no matter which
# folder the kernel was started in.
df = pd.read_csv(DATA_CLEAN / 'transactions.csv')
top_500_products = pd.read_csv(OUTPUTS / 'top_500_products.csv')

# Keep only the transaction rows whose product appears in the product list
# loaded from top_500_products.csv; .copy() detaches the result from `df`.
top_list = top_500_products['Description'].tolist()
df_top = df[df['Description'].isin(top_list)].copy()

print(f"✅ Filtered dataset: {len(df_top):,} rows")

✅ Filtered dataset: 197,031 rows


In [4]:
# Create baskets
#
# One basket per invoice: the sorted, de-duplicated list of product
# descriptions that appear on that invoice. Groups are visited in the
# groupby's key order.
baskets = [
    sorted(set(descriptions.tolist()))
    for _, descriptions in df_top.groupby('Invoice')['Description']
]

print(f"✅ Created {len(baskets):,} market baskets")

✅ Created 18,062 market baskets


In [5]:
# Count frequencies
#
# item_counts: number of occurrences of each item across all baskets.
# pair_counts: number of baskets in which each unordered item pair co-occurs.
item_counts = Counter()
pair_counts = Counter()
total_baskets = len(baskets)

for basket in baskets:
    # One increment per item occurrence in this basket.
    item_counts.update(basket)
    # combinations() yields nothing for baskets with fewer than two items, so
    # no explicit length check is needed. Sorting each pair gives a canonical
    # key, so (A, B) and (B, A) share a single counter entry.
    pair_counts.update(
        tuple(sorted(combo)) for combo in itertools.combinations(basket, 2)
    )

print(f"✅ Counted {len(item_counts):,} individual items")
print(f"✅ Counted {len(pair_counts):,} item pairs")

✅ Counted 500 individual items
✅ Counted 116,112 item pairs


In [6]:
# Calculate association rules
#
# For every unordered pair (A, B) in pair_counts, emit two directed rules
# (A -> B and B -> A) with:
#   support    = pair_count / total_baskets
#   confidence = pair_count / count(antecedent)
#   lift       = pair_count * total_baskets / (count(A) * count(B))
#
# Confidence and lift are computed directly from the integer counts rather than
# as quotients of already-rounded float supports. Algebraically the values are
# the same, but each is now rounded only once, so a rule sitting exactly on a
# filter threshold (e.g. 20 of 100 baskets -> confidence 0.20) cannot be pushed
# to the wrong side of it by accumulated floating-point error.
association_rules = []

for (item_a, item_b), pair_count in pair_counts.items():
    count_a = item_counts[item_a]
    count_b = item_counts[item_b]

    support_ab = pair_count / total_baskets

    # Guards mirror the original behaviour: fall back to 0 if an item count is
    # unexpectedly zero instead of raising ZeroDivisionError.
    confidence_ab = pair_count / count_a if count_a > 0 else 0
    confidence_ba = pair_count / count_b if count_b > 0 else 0

    # Lift is symmetric, so both directions share the same value. Python ints
    # are arbitrary precision, so numerator and denominator are exact.
    if count_a * count_b > 0:
        lift_ab = (pair_count * total_baskets) / (count_a * count_b)
    else:
        lift_ab = 0

    # Add both directions
    association_rules.append({
        'antecedent': item_a, 'consequent': item_b,
        'support': support_ab, 'confidence': confidence_ab, 'lift': lift_ab,
        'pair_count': pair_count
    })
    association_rules.append({
        'antecedent': item_b, 'consequent': item_a,
        'support': support_ab, 'confidence': confidence_ba, 'lift': lift_ab,
        'pair_count': pair_count
    })

print(f"✅ Calculated metrics for {len(association_rules):,} association rules")

✅ Calculated metrics for 232,224 association rules


In [7]:
# Filter and save rules
#
# Thresholds a rule must meet to be kept.
MIN_CONFIDENCE = 0.20  # pair_count is at least 20% of the antecedent's count
MIN_LIFT = 1.0         # strictly above 1: pair co-occurs more than independence predicts
MIN_SUPPORT = 0.001    # pair appears in at least 0.1% of all baskets

# Naming the columns explicitly keeps the frame well-formed even when no rules
# were produced; without it an empty list gives a column-less frame and the
# filter below raises KeyError.
RULE_COLUMNS = ['antecedent', 'consequent', 'support', 'confidence', 'lift', 'pair_count']
rules_df = pd.DataFrame(association_rules, columns=RULE_COLUMNS)

filtered_rules = rules_df[
    (rules_df['confidence'] >= MIN_CONFIDENCE) &
    (rules_df['lift'] > MIN_LIFT) &
    (rules_df['support'] >= MIN_SUPPORT)
].copy()

# Write through OUTPUTS (resolved in the first cell) rather than a hard-coded
# '../outputs' string, so the file lands in the project's outputs folder
# regardless of the kernel's working directory.
filtered_rules.to_csv(OUTPUTS / 'assoc_rules_pairs.csv', index=False)
print(f"✅ Association rules saved: {len(filtered_rules):,} rules")

✅ Association rules saved: 2,118 rules
