### ASSOCIATION RULES - ALL APPROACHES

This notebook generates and evaluates association rules using 3 approaches:
1. By Department (intra-department rules)
2. Cross-Department (global rules across all departments)
3. By Segment (RFM segment-specific rules)

## I. Import and prepare data

### I.1. Import raw data

In [1]:
import pandas as pd 
import sys
import os

sys.path.append('../scripts')  # To import from parent directory

from load_data import load_instacart_data
from split_data import temporal_split_instacart
from functions_association_rules import (
    prepare_transactions,
    generate_association_rules,
    evaluate_rules,
    print_evaluation_results
)

In [3]:
data = load_instacart_data()

orders = data['orders']
order_products_prior = data['order_products_prior']
order_products_train = data['order_products_train']
products = data['products']
departments = data['departments']

### I.2. Generate train/testing set

In [4]:
# Check if split already exists
if os.path.exists('../data/processed/train.csv') and os.path.exists('../data/processed/test.csv'):
    print("  Split files already exist, loading from disk...")
    train = pd.read_csv('../data/processed/train.csv')
    test = pd.read_csv('../data/processed/test.csv')
else:
    print("  Creating new split...")
    splits = temporal_split_instacart(
        order_products_prior=order_products_prior,
        order_products_train=order_products_train,
        orders=orders,
        products=products,
        departments=departments,
        train_ratio=0.7,  # 70% train, 30% test
        save_path='../data/processed/'
    )
    
    train = splits['train']
    test = splits['test']

  Creating new split...

[1/6] Merging prior + train datasets...
  Total rows: 33,819,106
  Unique orders: 3,346,083
  Unique products: 49,685

[2/6] Temporal split based on order_number...
  Total orders: 3,421,083
  Train orders: 2,394,758 (70.0%)
  Test orders: 1,026,325 (30.0%)

  Train rows: 23,558,148
  Test rows: 10,260,958

[3/6] Checking basket size distributions...

  TRAIN - Basket Size:
    Mean: 10.07, Median: 8, Std: 7.53
    Range: [1, 145]

  TEST - Basket Size:
    Mean: 10.19, Median: 8, Std: 7.57
    Range: [1, 137]

  ðŸ“Š Distribution Comparison:
    Train vs Test: Î” mean = 0.11 âœ…

[4/6] Checking department diversity distributions...

  TRAIN - Department Diversity:
    Mean: 4.76, Median: 4, Std: 2.57

  TEST - Department Diversity:
    Mean: 4.70, Median: 4, Std: 2.50

  ðŸ“Š Distribution Comparison:
    Train vs Test: Î” mean = 0.06 âœ…

[6/6] Saving splits...
  âœ… Train saved: ../data/processed/train.csv (23,558,148 rows)
  âœ… Test saved: ../data/processed

In [None]:
import gc 

print("  Enriching train/test with product and department info...")    
# Merge train/test with product and department info
train_enriched = train.merge(products[['product_id', 'product_name', 'department_id']], 
    on='product_id').merge(departments[['department_id', 'department']], on='department_id')

del train
gc.collect()

test_enriched = test.merge(products[['product_id', 'product_name', 'department_id']], 
        on='product_id').merge(departments[['department_id', 'department']], on='department_id')

del test
gc.collect()

  Enriching train/test with product and department info...


NameError: name 'train' is not defined

: 

In [15]:
train_enriched.to_csv('../data/processed/train_enriched.csv', index=False)
test_enriched.to_csv('../data/processed/test_enriched.csv', index=False)

## II. Association rules on top N products (no filtering)

In [None]:
# Without department filter, just on top global products
# Prepare transactions (no department filter, top global products)
transactions_general = prepare_transactions(
    train_enriched,
    top_n_products=200  # Top 200 products across ALL departments
)

print(f"  Total transactions: {len(transactions_general):,}")

# Generate rules
general_rules = generate_association_rules(
    transactions_general,
    min_support=0.005,
    min_confidence=0.15,
    min_lift=1.3,
    max_transactions=200_000
)

if general_rules is not None:
    print("\n General top products rules...")
    general_rules.to_csv('../data/processed/rules_top_products.csv', index=False)
    print(f"\nTotal top products rules: {len(general_rules):,}")
    
    # Evaluate
    print("\nEvaluating top products rules...")
    metrics_general = evaluate_rules(
        rules=general_rules,
        test_data=test_enriched,
        k=10
    )
    print_evaluation_results(metrics_general)
else:
    print("No top products rules generated")
    metrics_general = None

  Total transactions: 1,831,459

Total top products rules: 82

Evaluating top products rules...
  Precision@10: 8.70%
  Recall@10: 3.19%
  Coverage: 48.87%
  Average hits: 0.22
  Baskets evaluated: 10,000
  Baskets with recommendations: 4,887


Not very interesting => only associations between products from 'Produce' department

## III. Association rules inside each departments

In [14]:
# Rules inside each department

# Configuration tiers
TIER_1 = ['produce', 'dairy eggs', 'snacks', 'beverages', 'frozen']
TIER_2 = ['pantry', 'household', 'personal care', 'bakery', 'dry goods pasta']
TIER_3 = ['deli', 'meat seafood', 'canned goods', 'international', 'breakfast', 'alcohol', 'babies', 'pets']

dept_config = {
    **{d: {'n_products': 100, 'max_trans': 150_000, 'supp': 0.003, 'conf': 0.15, 'lift': 1.3} for d in TIER_1},
    **{d: {'n_products': 80, 'max_trans': 100_000, 'supp': 0.003, 'conf': 0.10, 'lift': 1.3} for d in TIER_2},
    **{d: {'n_products': 60, 'max_trans': 50_000, 'supp': 0.005, 'conf': 0.12, 'lift': 1.3} for d in TIER_3}
}

all_dept_rules = []

for i, (dept, config) in enumerate(dept_config.items(), 1):
    print(f"  [{i}/{len(dept_config)}] {dept}...", end=' ')
    
    # Prepare transactions for this department
    transactions = prepare_transactions(
        train_enriched,
        filter_column='department',
        filter_values=dept,
        top_n_products=config['n_products']
    )
    
    # Generate rules
    rules = generate_association_rules(
        transactions,
        min_support=config['supp'],
        min_confidence=config['conf'],
        min_lift=config['lift'],
        max_transactions=config['max_trans']
    )
    
    if rules is not None:
        rules['department'] = dept
        all_dept_rules.append(rules)
        print(f"{len(rules)} rules")
    else:
        print("No rules")

# Consolidate
dept_rules = pd.concat(all_dept_rules, ignore_index=True)
dept_rules.to_csv('../data/processed/rules_by_department.csv', index=False)

print(f"\nTotal rules by department: {len(dept_rules):,}")

# Evaluate
print("\nEvaluating rules by department...")
metrics_dept = evaluate_rules(
    rules=dept_rules,
    test_data=test_enriched,
    groupby_column='department',
    k=10
)
print_evaluation_results(metrics_dept)

  [1/18] produce... 180 rules
  [2/18] dairy eggs... 20 rules
  [3/18] snacks... 4 rules
  [4/18] beverages... 20 rules
  [5/18] frozen... 6 rules
  [6/18] pantry... No rules
  [7/18] household... 3 rules
  [8/18] personal care... 4 rules
  [9/18] bakery... No rules
  [10/18] dry goods pasta... 2 rules
  [11/18] deli... No rules
  [12/18] meat seafood... No rules
  [13/18] canned goods... No rules
  [14/18] international... No rules
  [15/18] breakfast... No rules
  [16/18] alcohol... 6 rules
  [17/18] babies... 166 rules
  [18/18] pets... 66 rules

Total rules by department: 477

Evaluating rules by department...


TypeError: unhashable type: 'list'

## IV. Association rules for each segment

In [None]:
segments=pd.read_csv('../data/processed/rfm_customer_segments.csv', usecols=['user_id', 'segment'])

In [None]:
# merge train_enriched and test_enriched with segments to get segment info for each transaction
# ATTENTION ! : peut-Ãªtre sauvgarder train_enriched et test_enriched plutÃ´t que test/train

In [None]:
# Merge train/test with segments

# Merge orders with segments
orders_with_segments = orders.merge(segments, on='user_id', how='left')

# Add segments to train set
train_with_segments = train.merge(
    orders_with_segments[['order_id', 'user_id', 'segment']], 
    on='order_id', 
    how='left'
)

del train, orders # for memory management
gc.collect()

# Add segments to test set
test_with_segments = test.merge(
    orders_with_segments[['order_id', 'user_id', 'segment']].drop_duplicates('order_id'),
    on='order_id', 
    how='left'
)

del test, orders_with_segments # for memory management
gc.collect()

print("\n  Merge completed successfully")

In [None]:
train_with_segments.to_csv('../data/processed/train_with_segments.csv', index=False)
test_with_segments.to_csv('../data/processed/test_with_segments.csv', index=False)

In [None]:
# APPROACH 3: RULES BY SEGMENT
# Load segment data
train_seg = pd.read_csv('../data/processed/train_with_segments.csv')
test_seg = pd.read_csv('../data/processed/test_with_segments.csv')

In [None]:
# Add product names to segment data
import gc

train_seg_enriched = train_seg.merge(
    products[['product_id', 'product_name']], 
    on='product_id'
)

del train_seg  # Free memory
gc.collect()

test_seg_enriched = test_seg.merge(
    products[['product_id', 'product_name']], 
    on='product_id'
)

del test_seg  # Free memory
gc.collect()


In [None]:
# Get segments
segments = train_seg_enriched['segment'].dropna().unique()
print(f"  Segments: {len(segments)}")

all_seg_rules = []

for i, segment in enumerate(segments, 1):
    print(f"  [{i}/{len(segments)}] {segment}...", end=' ')
    
    # Prepare transactions for this segment
    transactions = prepare_transactions(
        train_seg_enriched,
        filter_column='segment',
        filter_values=segment,
        top_n_products=80
    )
    
    # Generate rules
    rules = generate_association_rules(
        transactions,
        min_support=0.005,
        min_confidence=0.15,
        min_lift=1.3,
        max_transactions=100_000
    )
    
    if rules is not None:
        rules['segment'] = segment
        all_seg_rules.append(rules)
        print(f"{len(rules)} rules")
    else:
        print("No rules")

# Consolidate
if all_seg_rules:
    segment_rules = pd.concat(all_seg_rules, ignore_index=True)
    segment_rules.to_csv('../data/processed/rules_by_segment.csv', index=False)
    
    print(f"\nTotal rules by segment: {len(segment_rules):,}")
    
    # Evaluate
    print("\nEvaluating rules by segment...")
    metrics_seg = evaluate_rules(
        rules=segment_rules,
        test_data=test_seg_enriched,
        groupby_column='segment',
        k=10
    )
    print_evaluation_results(metrics_seg)
else:
    print("No segment rules generated")
    metrics_seg = None


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# COMPARE RULES BETWEEN SEGMENTS

print("Loading rules by segment...")
segment_rules = pd.read_csv('../data/processed/rules_by_segment.csv')

print(f"Total rules: {len(segment_rules):,}")
print(f"Segments: {segment_rules['segment'].nunique()}")

# 1. CREATE RULE IDENTIFIER

# Create unique rule identifier (antecedent -> consequent)
segment_rules['rule_id'] = segment_rules['antecedent'] + ' -> ' + segment_rules['consequent']

print(f"\nUnique rules (across all segments): {segment_rules['rule_id'].nunique()}")

# 2. RULES SHARED BETWEEN SEGMENTS

# Count how many segments share each rule
rule_counts = segment_rules.groupby('rule_id')['segment'].apply(lambda x: list(x)).reset_index()
rule_counts['n_segments'] = rule_counts['segment'].apply(len)
rule_counts['segments_list'] = rule_counts['segment'].apply(lambda x: ', '.join(sorted(x)))

# Distribution
print("\nRule sharing distribution:")
sharing_dist = rule_counts['n_segments'].value_counts().sort_index()
for n_seg, count in sharing_dist.items():
    pct = count / len(rule_counts) * 100
    print(f"  Rules in {n_seg} segment(s): {count:4d} ({pct:5.1f}%)")

# Unique vs shared
unique_rules = rule_counts[rule_counts['n_segments'] == 1]
shared_rules = rule_counts[rule_counts['n_segments'] > 1]

print(f"\nSummary:")
print(f"  Unique rules (1 segment only): {len(unique_rules)} ({len(unique_rules)/len(rule_counts)*100:.1f}%)")
print(f"  Shared rules (2+ segments): {len(shared_rules)} ({len(shared_rules)/len(rule_counts)*100:.1f}%)")


# 3. TOP SHARED RULES

print("\n" + "="*70)
print("TOP 20 MOST SHARED RULES")
print("="*70)

most_shared = rule_counts.sort_values('n_segments', ascending=False).head(20)

for idx, row in most_shared.iterrows():
    print(f"\nRule: {row['rule_id']}")
    print(f"  Shared by {row['n_segments']} segments: {row['segments_list']}")


# 4. UNIQUE RULES PER SEGMENT


for seg in segments:
    # Rules unique to this segment
    seg_rules = set(segment_rules[segment_rules['segment'] == seg]['rule_id'])
    
    # Rules from other segments
    other_rules = set(segment_rules[segment_rules['segment'] != seg]['rule_id'])
    
    # Unique rules
    unique_to_seg = seg_rules - other_rules
    
    n_total = len(seg_rules)
    n_unique = len(unique_to_seg)
    pct_unique = (n_unique / n_total * 100) if n_total > 0 else 0
    
    print(f"\n{seg}:")
    print(f"  Total rules: {n_total}")
    print(f"  Unique rules: {n_unique} ({pct_unique:.1f}%)")
    

# 5. VISUALIZATION

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Plot 1: Rule sharing distribution
ax1 = axes[0] 
sharing_dist.plot(kind='bar', ax=ax1, color='steelblue')
ax1.set_xlabel('Number of segments sharing the rule')
ax1.set_ylabel('Number of rules')
ax1.set_title('Distribution of Rule Sharing')
ax1.grid(axis='y', alpha=0.3)

# Plot 2: Rules per segment (unique vs shared)
ax2 = axes[1]
seg_stats = []
for seg in segments:
    seg_rules = set(segment_rules[segment_rules['segment'] == seg]['rule_id'])
    other_rules = set(segment_rules[segment_rules['segment'] != seg]['rule_id'])
    unique = len(seg_rules - other_rules)
    shared = len(seg_rules & other_rules)
    seg_stats.append({'segment': seg, 'unique': unique, 'shared': shared})

seg_stats_df = pd.DataFrame(seg_stats).set_index('segment')
seg_stats_df.plot(kind='barh', stacked=True, ax=ax2, color=['steelblue', 'coral'])
ax2.set_xlabel('Number of rules')
ax2.set_title('Rules per Segment (Unique vs Shared)')
ax2.legend(['Unique', 'Shared with other segments'])


In [None]:
# Comparison of approaches


comparison = pd.DataFrame([
    {
        'Approach': 'By Department',
        'Rules': len(dept_rules),
        'Precision@10': f"{metrics_dept['precision@K']:.2%}",
        'Recall@10': f"{metrics_dept['recall@K']:.2%}",
        'Coverage': f"{metrics_dept['coverage']:.2%}",
        'Avg Hits': f"{metrics_dept['avg_hits']:.2f}"
    },
    {
        'Approach': 'Cross-Department',
        'Rules': len(cross_rules) if cross_rules is not None else 0,
        'Precision@10': f"{metrics_cross['precision@K']:.2%}" if metrics_cross else 'N/A',
        'Recall@10': f"{metrics_cross['recall@K']:.2%}" if metrics_cross else 'N/A',
        'Coverage': f"{metrics_cross['coverage']:.2%}" if metrics_cross else 'N/A',
        'Avg Hits': f"{metrics_cross['avg_hits']:.2f}" if metrics_cross else 'N/A'
    },
    {
        'Approach': 'By Segment',
        'Rules': len(segment_rules) if all_seg_rules else 0,
        'Precision@10': f"{metrics_seg['precision@K']:.2%}" if metrics_seg else 'N/A',
        'Recall@10': f"{metrics_seg['recall@K']:.2%}" if metrics_seg else 'N/A',
        'Coverage': f"{metrics_seg['coverage']:.2%}" if metrics_seg else 'N/A',
        'Avg Hits': f"{metrics_seg['avg_hits']:.2f}" if metrics_seg else 'N/A'
    }
])

print(comparison.to_string(index=False))