In [15]:
import pandas as pd
import numpy as np
import os
import gc
from collections import defaultdict
from tqdm.notebook import tqdm
from IPython.display import display, Markdown

# Rule Conversion Function
# This function takes the string representation of a frozenset 
# (e.g., "frozenset({'Milk'})") and converts it back into a usable frozenset object.
def convert_frozenset_string(set_string):
    # This evaluates the string as Python code, returning the frozenset object directly.
    # It relies on the string being well-formed (which it should be, as we created it).
    return eval(set_string)


# Load the Generated Rules
rules = pd.read_csv('../outputs/association_rules.csv')

# Apply the function to the antecedent and consequent columns
rules['antecedents'] = rules['antecedents'].apply(convert_frozenset_string)
rules['consequents'] = rules['consequents'].apply(convert_frozenset_string)

print(f"Loaded {len(rules)} association rules. First antecedent type: {type(rules['antecedents'].iloc[0])}")

# Load the Instacart Test Data
data_path = '../data/'
orders = pd.read_csv(os.path.join(data_path, 'orders.csv'))
op_train = pd.read_csv(os.path.join(data_path, 'order_products_train.csv'))
products = pd.read_csv(os.path.join(data_path, 'products.csv'))

Loaded 953 association rules. First antecedent type: <class 'frozenset'>


In [16]:
# Identify all unique products contained in the generated rules
rule_products = set()
for s in rules['antecedents']:
    rule_products.update(s)
for s in rules['consequents']:
    rule_products.update(s)

# Filter the TEST TRANSACTIONS (op_train) to keep only products that are in our rules.
# This ensures the test set only contains items the model actually knows about.

# NOTE: This part requires modification of Cell 2.

# In Cell 2, before the merge with 'products', add this filter:
#op_train_filtered = op_train[op_train['product_id'].isin(products[products['product_name'].isin(rule_products)]['product_id'])]

In [17]:
# Prepare Test Set Transactions
test_orders = orders[orders['eval_set'] == 'train'][['order_id', 'user_id']]
test_transactions = test_orders.merge(op_train[['order_id', 'product_id']], on='order_id', how='left')
test_transactions = test_transactions.merge(products[['product_id', 'product_name']], on='product_id', how='left')

# Clean up memory
del orders, op_train, products
gc.collect()

# Create a dictionary mapping Order ID -> List of Products Bought
# IMPORTANT: Ensure the items are strings within the set.
test_ground_truth = test_transactions.groupby('order_id')['product_name'].apply(set).to_dict()

print(f"Prepared {len(test_ground_truth)} test baskets for evaluation.")

Prepared 131209 test baskets for evaluation.


In [18]:
def generate_recommendations(current_basket, rules_df, k=5):
    """Generates top K recommendations based on association rules."""
    
    # CRITICAL: Filter rules where the antecedent is a subset of the current basket.
    # The 'current_basket' is a set (from the ground truth).
    relevant_rules = rules_df[
        rules_df['antecedents'].apply(lambda x: x.issubset(current_basket))
    ].copy()
    
    relevant_rules['score'] = relevant_rules['confidence'] * relevant_rules['lift']
    
    recommendations = defaultdict(float)
    
    for _, row in relevant_rules.iterrows():
        # Consequent is a frozenset, but we assume single-item consequents
        # The item 'consequent_item' is now a string (e.g., 'Banana')
        consequent_item = list(row['consequents'])[0] 
        
        # Only recommend items NOT already in the basket (CRITICAL FILTER)
        if consequent_item not in current_basket:
            recommendations[consequent_item] = max(recommendations[consequent_item], row['score'])
            
    top_k = sorted(recommendations.items(), key=lambda item: item[1], reverse=True)[:k]
    return [item[0] for item in top_k]

# Offline Evaluation Execution

K = 5 
all_orders = list(test_ground_truth.keys())
SAMPLE_SIZE = 5000 
sample_orders = all_orders[:SAMPLE_SIZE] 

hit_count = 0
total_recommendations = 0

print(f"\n--- Starting Offline Evaluation (Precision@{K}) on {SAMPLE_SIZE} orders ---")

for order_id in tqdm(sample_orders, desc="Evaluating Test Orders"):
    ground_truth = test_ground_truth[order_id]
    
    # Ensure current_basket is a set of strings, matching the structure of rule items
    current_basket = ground_truth 
    
    recs = generate_recommendations(current_basket, rules, k=K)
    
    # Check hits (items recommended AND purchased in the same order)
    # The intersection of two sets (recs is list of strings, ground_truth is set of strings)
    hits = len(set(recs) & current_basket)
    hit_count += hits
    
    if len(recs) > 0:
        total_recommendations += len(recs)

# Calculate Precision@K
if total_recommendations > 0:
    precision_at_k = hit_count / total_recommendations
else:
    precision_at_k = 0

print(f"\n--- Offline Evaluation Results ---")
print(f"Total baskets evaluated: {len(sample_orders)}")
print(f"Precision@{K} (Accuracy): {precision_at_k:.4f}")


--- Starting Offline Evaluation (Precision@5) on 5000 orders ---


Evaluating Test Orders:   0%|          | 0/5000 [00:00<?, ?it/s]


--- Offline Evaluation Results ---
Total baskets evaluated: 5000
Precision@5 (Accuracy): 0.0000


In [23]:
print("\n-Top Business Actionable Rules (Interpretation for Merchandising)")

# Reload rules before frozenset conversion to display nicely
rules_display = pd.read_csv('../outputs/association_rules.csv')

# Filter for rules that represent strong cross-sell opportunities (high lift and strong confidence)
actionable_rules = rules_display[(rules_display['lift'] > 1.5) & (rules_display['confidence'] > 0.2)].head(5)

for i, row in actionable_rules.iterrows():
    antecedent = str(row['antecedents']).replace('frozenset(', '').replace(')', '').replace("'", "")
    consequent = str(row['consequents']).replace('frozenset(', '').replace(')', '').replace("'", "")
    
    print(f"\nRecommendation #{i+1}:")
    print(f"  Condition (Antecedent): IF Customer Buys {antecedent}")
    print(f"  Suggestion (Consequent): THEN Recommend {consequent}")
    print(f"  Confidence: {row['confidence']:.2f}, Lift: {row['lift']:.2f}")

    if len(eval(row['antecedents'])) >= 2:
        print("  -> **Business Use:** Perfect for a **Pre-built Bundle Discount** (Triplet Rule).")
    else:
        print("  -> **Business Use:** Ideal for a 'Frequently Bought Together' widget on the product page.")




-Top Business Actionable Rules (Interpretation for Merchandising)

Recommendation #1:
  Condition (Antecedent): IF Customer Buys {Non Fat Raspberry Yogurt}
  Suggestion (Consequent): THEN Recommend {Icelandic Style Skyr Blueberry Non-fat Yogurt}
  Confidence: 0.42, Lift: 63.56
  -> **Business Use:** Ideal for a 'Frequently Bought Together' widget on the product page.

Recommendation #2:
  Condition (Antecedent): IF Customer Buys {Icelandic Style Skyr Blueberry Non-fat Yogurt}
  Suggestion (Consequent): THEN Recommend {Non Fat Raspberry Yogurt}
  Confidence: 0.36, Lift: 63.56
  -> **Business Use:** Ideal for a 'Frequently Bought Together' widget on the product page.

Recommendation #3:
  Condition (Antecedent): IF Customer Buys {Vanilla Skyr Nonfat Yogurt}
  Suggestion (Consequent): THEN Recommend {Icelandic Style Skyr Blueberry Non-fat Yogurt}
  Confidence: 0.35, Lift: 53.38
  -> **Business Use:** Ideal for a 'Frequently Bought Together' widget on the product page.

Recommendation #4:

In [28]:
# Manual Cross-Departmental Exploration

# We need to reload products/departments.

# Load products and auxiliary tables needed for department names
products = pd.read_csv(os.path.join(data_path, 'products.csv'))
departments = pd.read_csv(os.path.join(data_path, 'departments.csv'))
products = products.merge(departments[['department_id', 'department']], on='department_id', how='left')

# Reload the rules display DataFrame
rules = pd.read_csv('../outputs/association_rules.csv')


# Helper Function to Extract Item Name (Only works for single-item sets)
def get_item_name(frozenset_str):
    try:
        items = list(eval(frozenset_str))
        return items[0] if len(items) == 1 else None
    except:
        return None

# Apply Department Mapping to Rules
rules['Antecedent_Item'] = rules['antecedents'].apply(get_item_name)
rules['Consequent_Item'] = rules['consequents'].apply(get_item_name)

# Merge Antecedent Department
rules = rules.merge(products[['product_name', 'department']], 
                    left_on='Antecedent_Item', 
                    right_on='product_name', 
                    suffixes=('_ant', '_con'),
                    how='left')
rules.rename(columns={'department': 'Antecedent_Dept'}, inplace=True)

# Merge Consequent Department
rules = rules.merge(products[['product_name', 'department']], 
                    left_on='Consequent_Item', 
                    right_on='product_name', 
                    suffixes=('_ant', '_con'),
                    how='left')
rules.rename(columns={'department': 'Consequent_Dept'}, inplace=True)

# Find Cross-Departmental Rules and Search
cross_dept_rules = rules[
    (rules['Antecedent_Dept'] != rules['Consequent_Dept']) &
    (rules['Antecedent_Item'].notna())
].copy()

cross_dept_rules_sorted = cross_dept_rules.sort_values(['lift', 'support'], ascending=[False, True])

print("\n--- Top 10 Cross-Departmental Rules (Manual Exploration) ---")
print(cross_dept_rules_sorted[['antecedents', 'consequents', 'support', 'confidence', 'lift', 'Antecedent_Dept', 'Consequent_Dept']].to_markdown(index=False))

# Search for "Diapers" or "Beer" Themes
print("\n--- Searching for 'Diapers' or 'Beer' Themes ---")
theme_rules = rules[
    (rules['Antecedent_Item'].str.contains('diaper|beer|wine', case=False, na=False)) |
    (rules['Consequent_Item'].str.contains('diaper|beer|wine', case=False, na=False))
].sort_values('lift', ascending=False)

if not theme_rules.empty:
    print(theme_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift', 'Antecedent_Dept', 'Consequent_Dept']].head(5).to_markdown(index=False))
else:
    print("No strong rules found containing 'diaper', 'beer', or 'wine'.")

cross_dept_rules.to_csv('../outputs/cross_dept_association_rules.csv', index=False)
print("\nSaved final association rules to outputs/cross_dept_association_rules.csv")

# Cleanup
del products, departments
gc.collect()


--- Top 10 Cross-Departmental Rules (Manual Exploration) ---
| antecedents                                                                   | consequents                                                   |    support |   confidence |    lift | Antecedent_Dept   | Consequent_Dept   |
|:------------------------------------------------------------------------------|:--------------------------------------------------------------|-----------:|-------------:|--------:|:------------------|:------------------|
| frozenset({'Blueberries'})                                                    | frozenset({'Raspberries'})                                    | 0.00209201 |    0.110676  | 5.47073 | frozen            | produce           |
| frozenset({'Raspberries'})                                                    | frozenset({'Blueberries'})                                    | 0.00209201 |    0.103408  | 5.47073 | produce           | frozen            |
| frozenset({'Organic Raspberries'})      

36