In [8]:
import pandas as pd
import numpy as np
import os
import gc
from collections import defaultdict
from tqdm.notebook import tqdm
from IPython.display import display, Markdown

In [None]:
# This is a straightforward way to evaluate your rules based on standard thresholds

rules = pd.read_csv('../outputs/association_rules.csv')

print("=" * 50)
print("SIMPLE ASSOCIATION RULE EVALUATION")
print("=" * 50)

# Count rules by quality
total_rules = len(rules)
rules_lift_above_1 = len(rules[rules['lift'] > 1])      # Better than random
rules_lift_above_2 = len(rules[rules['lift'] > 2])      # Good associations
rules_confident = len(rules[rules['confidence'] > 0.2]) # Reliable rules

print(f"\nTotal rules found: {total_rules}")
print(f"\n--- LIFT (measures association strength) ---")
print(f"  Lift > 1 (better than random): {rules_lift_above_1} ({100*rules_lift_above_1/total_rules:.1f}%)")
print(f"  Lift > 2 (strong association): {rules_lift_above_2} ({100*rules_lift_above_2/total_rules:.1f}%)")

print(f"\n--- CONFIDENCE (measures reliability) ---")
print(f"  Confidence > 20%: {rules_confident} ({100*rules_confident/total_rules:.1f}%)")

print(f"\n--- KEY STATISTICS ---")
print(f"  Average Lift: {rules['lift'].mean():.2f}")
print(f"  Best Lift: {rules['lift'].max():.2f}")
print(f"  Average Confidence: {rules['confidence'].mean():.1%}")

# Simple verdict
print("\n" + "=" * 50)
if rules_lift_above_1 == total_rules and rules['lift'].mean() > 1.5:
    print("✓ SUCCESS: All rules show positive associations (lift > 1)")
    print("  Your rules are meaningful and not random!")
else:
    print("⚠ Some rules may be weak. Consider raising min_support.")
print("=" * 50)

SIMPLE ASSOCIATION RULE EVALUATION

Total rules found: 953

--- LIFT (measures association strength) ---
  Lift > 1 (better than random): 953 (100.0%)
  Lift > 2 (strong association): 573 (60.1%)

--- CONFIDENCE (measures reliability) ---
  Confidence > 20%: 213 (22.4%)

--- KEY STATISTICS ---
  Average Lift: 3.02
  Best Lift: 63.56
  Average Confidence: 14.6%

✓ SUCCESS: All rules show positive associations (lift > 1)
  Your rules are meaningful and not random!


In [None]:
# =============================================================================
# RULE VALIDATION ON TRAINING DATA
# =============================================================================
# Check: Do the generated rules actually match real transactions?

# Load rules
rules = pd.read_csv('../outputs/association_rules.csv')
rules['antecedents'] = rules['antecedents'].apply(lambda x: eval(x))
rules['consequents'] = rules['consequents'].apply(lambda x: eval(x))

# Load training baskets (reuse test_ground_truth if available, or reload)
data_path = '../data/'
orders = pd.read_csv(os.path.join(data_path, 'orders.csv'))
op = pd.read_csv(os.path.join(data_path, 'order_products_train.csv'))
products = pd.read_csv(os.path.join(data_path, 'products.csv'))

op = op.merge(products[['product_id', 'product_name']], on='product_id')
baskets = op.groupby('order_id')['product_name'].apply(set).tolist()

print(f"Total baskets: {len(baskets):,}")
print(f"Total rules to validate: {len(rules)}")

# Sample for speed
import random
random.seed(42)
sample_size = 30000
sample_baskets = random.sample(baskets, min(sample_size, len(baskets)))

# Validation: count rule matches
fires = 0      # antecedent found in basket
hits = 0       # antecedent AND consequent both found

for basket in tqdm(sample_baskets, desc="Validating rules"):
    for _, rule in rules.iterrows():
        if rule['antecedents'].issubset(basket):
            fires += 1
            if rule['consequents'].issubset(basket):
                hits += 1

validation_rate = hits / fires if fires > 0 else 0
expected_rate = rules['confidence'].mean()

print("\n" + "=" * 50)
print("VALIDATION RESULTS")
print("=" * 50)
print(f"Rule antecedent found in basket: {fires:,} times")
print(f"Consequent also present: {hits:,} times")
print(f"\nObserved Validation Rate: {validation_rate:.2%}")
print(f"Expected (avg confidence):  {expected_rate:.2%}")

# Check if validation matches expectation
diff = abs(validation_rate - expected_rate)
print("\n" + "=" * 50)
if diff < 0.05:
    print("✓ SUCCESS: Rules validated!")
    print("  Observed rate matches expected confidence.")
else:
    print(f"~ Difference of {diff:.1%} from expected.")
print("=" * 50)

del orders, op, products, baskets
gc.collect()

Total baskets: 131,209
Total rules to validate: 953


Validating rules:   0%|          | 0/30000 [00:00<?, ?it/s]


VALIDATION RESULTS
Rule antecedent found in basket: 915,193 times
Consequent also present: 111,794 times

Observed Validation Rate: 12.22%
Expected (avg confidence):  14.56%

✓ SUCCESS: Rules validated!
  Observed rate matches expected confidence.


NameError: name 'op_prior' is not defined

In [25]:
print("\n-Top Business Actionable Rules (Interpretation for Merchandising)")

# Reload rules before frozenset conversion to display nicely
rules_display = pd.read_csv('../outputs/association_rules.csv')

# Filter for rules that represent strong cross-sell opportunities (high lift and strong confidence)
actionable_rules = rules_display[(rules_display['lift'] > 1.5) & (rules_display['confidence'] > 0.2)].head(5)

for i, row in actionable_rules.iterrows():
    antecedent = str(row['antecedents']).replace('frozenset(', '').replace(')', '').replace("'", "")
    consequent = str(row['consequents']).replace('frozenset(', '').replace(')', '').replace("'", "")
    
    print(f"\nRecommendation #{i+1}:")
    print(f"  Condition (Antecedent): IF Customer Buys {antecedent}")
    print(f"  Suggestion (Consequent): THEN Recommend {consequent}")
    print(f"  Confidence: {row['confidence']:.2f}, Lift: {row['lift']:.2f}")

    if len(eval(row['antecedents'])) >= 2:
        print("  -> **Business Use:** Perfect for a **Pre-built Bundle Discount** (Triplet Rule).")
    else:
        print("  -> **Business Use:** Ideal for a 'Frequently Bought Together' widget on the product page.")




-Top Business Actionable Rules (Interpretation for Merchandising)

Recommendation #1:
  Condition (Antecedent): IF Customer Buys {Non Fat Raspberry Yogurt}
  Suggestion (Consequent): THEN Recommend {Icelandic Style Skyr Blueberry Non-fat Yogurt}
  Confidence: 0.42, Lift: 63.56
  -> **Business Use:** Ideal for a 'Frequently Bought Together' widget on the product page.

Recommendation #2:
  Condition (Antecedent): IF Customer Buys {Icelandic Style Skyr Blueberry Non-fat Yogurt}
  Suggestion (Consequent): THEN Recommend {Non Fat Raspberry Yogurt}
  Confidence: 0.36, Lift: 63.56
  -> **Business Use:** Ideal for a 'Frequently Bought Together' widget on the product page.

Recommendation #3:
  Condition (Antecedent): IF Customer Buys {Vanilla Skyr Nonfat Yogurt}
  Suggestion (Consequent): THEN Recommend {Icelandic Style Skyr Blueberry Non-fat Yogurt}
  Confidence: 0.35, Lift: 53.38
  -> **Business Use:** Ideal for a 'Frequently Bought Together' widget on the product page.

Recommendation #4:

In [26]:
# Manual Cross-Departmental Exploration

# We need to reload products/departments.

# Load products and auxiliary tables needed for department names
products = pd.read_csv(os.path.join(data_path, 'products.csv'))
departments = pd.read_csv(os.path.join(data_path, 'departments.csv'))
products = products.merge(departments[['department_id', 'department']], on='department_id', how='left')

# Reload the rules display DataFrame
rules = pd.read_csv('../outputs/association_rules.csv')


# Helper Function to Extract Item Name (Only works for single-item sets)
def get_item_name(frozenset_str):
    try:
        items = list(eval(frozenset_str))
        return items[0] if len(items) == 1 else None
    except:
        return None

# Apply Department Mapping to Rules
rules['Antecedent_Item'] = rules['antecedents'].apply(get_item_name)
rules['Consequent_Item'] = rules['consequents'].apply(get_item_name)

# Merge Antecedent Department
rules = rules.merge(products[['product_name', 'department']], 
                    left_on='Antecedent_Item', 
                    right_on='product_name', 
                    suffixes=('_ant', '_con'),
                    how='left')
rules.rename(columns={'department': 'Antecedent_Dept'}, inplace=True)

# Merge Consequent Department
rules = rules.merge(products[['product_name', 'department']], 
                    left_on='Consequent_Item', 
                    right_on='product_name', 
                    suffixes=('_ant', '_con'),
                    how='left')
rules.rename(columns={'department': 'Consequent_Dept'}, inplace=True)

# Find Cross-Departmental Rules and Search
cross_dept_rules = rules[
    (rules['Antecedent_Dept'] != rules['Consequent_Dept']) &
    (rules['Antecedent_Item'].notna())
].copy()

cross_dept_rules_sorted = cross_dept_rules.sort_values(['lift', 'support'], ascending=[False, True])

print("\n--- Top 10 Cross-Departmental Rules (Manual Exploration) ---")
print(cross_dept_rules_sorted[['antecedents', 'consequents', 'support', 'confidence', 'lift', 'Antecedent_Dept', 'Consequent_Dept']].to_markdown(index=False))

# Search for "Diapers" or "Beer" Themes
print("\n--- Searching for 'Diapers' or 'Beer' Themes ---")
theme_rules = rules[
    (rules['Antecedent_Item'].str.contains('diaper|beer|wine', case=False, na=False)) |
    (rules['Consequent_Item'].str.contains('diaper|beer|wine', case=False, na=False))
].sort_values('lift', ascending=False)

if not theme_rules.empty:
    print(theme_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift', 'Antecedent_Dept', 'Consequent_Dept']].head(5).to_markdown(index=False))
else:
    print("No strong rules found containing 'diaper', 'beer', or 'wine'.")

cross_dept_rules.to_csv('../outputs/cross_dept_association_rules.csv', index=False)
print("\nSaved final association rules to outputs/cross_dept_association_rules.csv")

# Cleanup
del products, departments
gc.collect()


--- Top 10 Cross-Departmental Rules (Manual Exploration) ---
| antecedents                                                                   | consequents                                                   |    support |   confidence |    lift | Antecedent_Dept   | Consequent_Dept   |
|:------------------------------------------------------------------------------|:--------------------------------------------------------------|-----------:|-------------:|--------:|:------------------|:------------------|
| frozenset({'Blueberries'})                                                    | frozenset({'Raspberries'})                                    | 0.00209201 |    0.110676  | 5.47073 | frozen            | produce           |
| frozenset({'Raspberries'})                                                    | frozenset({'Blueberries'})                                    | 0.00209201 |    0.103408  | 5.47073 | produce           | frozen            |
| frozenset({'Organic Raspberries'})      

1322