In [1]:
import time
from collections import defaultdict
from itertools import combinations

import numpy as np
import pandas as pd

In [3]:
def load_and_preprocess_data(file_path):
    """Read the transaction CSV, print a short summary, and parse timestamps.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; it is passed straight to ``pd.read_csv``.
        The file must provide the TransactionID, CustomerID, ProductCode,
        Category and DateTime columns used below.

    Returns
    -------
    pandas DataFrame
        The loaded rows, with the DateTime column converted to a datetime
        dtype when it was not one already.
    """
    transactions = pd.read_csv(file_path)

    # Quick sanity summary: row count first, then distinct values per key column.
    print(f"Total rows: {len(transactions)}")
    summary_columns = (
        ("Unique transactions", 'TransactionID'),
        ("Unique customers", 'CustomerID'),
        ("Unique products", 'ProductCode'),
        ("Categories", 'Category'),
    )
    for label, column in summary_columns:
        print(f"{label}: {transactions[column].nunique()}")

    # Nothing to convert when the column already holds datetimes.
    timestamps = transactions['DateTime']
    if pd.api.types.is_datetime64_any_dtype(timestamps):
        return transactions

    transactions['DateTime'] = pd.to_datetime(timestamps)
    return transactions

In [4]:
def memory_efficient_basket_analysis(df, min_support_count=10, min_confidence=0.1, max_basket_items=50):
    """
    Perform memory-efficient basket analysis for product recommendations.

    Only single items and item pairs are counted, so every rule produced has
    exactly one antecedent product and one consequent product.

    Parameters:
    -----------
    df : pandas DataFrame
        Transaction data with TransactionID and ProductCode columns
    min_support_count : int
        Minimum number of baskets an item or item pair must appear in
    min_confidence : float
        Minimum confidence for an association rule
    max_basket_items : int
        Maximum number of unique items to consider in a basket (for memory efficiency);
        only the first max_basket_items distinct codes of a transaction are kept

    Returns:
    --------
    rules_df : pandas DataFrame
        Association rules with antecedent, consequent, support, confidence, lift
        and count columns, sorted by lift in descending order. Rules with equal
        lift keep the order in which they were generated. The frame is empty
        (but has these columns) when no rule passes the thresholds.
    frequent_items : dict
        Maps every product code that appears in at least min_support_count
        baskets to the number of baskets containing it.
    """
    start_time = time.time()

    # Step 1: Create transaction baskets
    print("Creating transaction baskets...")
    # Limit basket size for memory efficiency
    transaction_baskets = {
        tid: set(codes.unique()[:max_basket_items])
        for tid, codes in df.groupby('TransactionID')['ProductCode']
    }
    n_baskets = len(transaction_baskets)

    # Step 2: Calculate item frequencies (number of baskets containing each item)
    print("Calculating item frequencies...")
    item_freq = defaultdict(int)
    for basket in transaction_baskets.values():
        for item in basket:
            item_freq[item] += 1

    # Step 3: Keep only frequent items (above min_support_count)
    print("Filtering frequent items...")
    frequent_items = {item: freq for item, freq in item_freq.items()
                      if freq >= min_support_count}
    print(f"Found {len(frequent_items)} frequent items out of {len(item_freq)} total items")

    # Step 4: Count co-occurring pairs of frequent items in one pass over the baskets
    print("Generating frequent pairs...")
    pair_counts = defaultdict(int)
    for basket in transaction_baskets.values():
        # Sorting once per basket gives every pair a canonical (smaller, larger)
        # key and makes the counting order independent of set iteration order,
        # so the resulting rule order is reproducible between runs.
        frequent_in_basket = sorted(item for item in basket if item in frequent_items)
        for pair in combinations(frequent_in_basket, 2):
            pair_counts[pair] += 1

    # Filter pairs by support
    frequent_pairs = {pair: count for pair, count in pair_counts.items()
                      if count >= min_support_count}
    print(f"Found {len(frequent_pairs)} frequent pairs")

    # Step 5: Generate association rules in both directions for every frequent pair
    print("Generating association rules...")
    rules = []
    for (item1, item2), pair_count in frequent_pairs.items():
        support = pair_count / n_baskets
        for antecedent, consequent in ((item1, item2), (item2, item1)):
            # confidence = P(consequent | antecedent)
            confidence = pair_count / item_freq[antecedent]
            if confidence >= min_confidence:
                # lift = confidence relative to the consequent's base rate
                lift = confidence / (item_freq[consequent] / n_baskets)
                rules.append({
                    'antecedent': antecedent,
                    'consequent': consequent,
                    'support': support,
                    'confidence': confidence,
                    'lift': lift,
                    'count': pair_count
                })

    # Convert to DataFrame and sort by lift
    if rules:
        # mergesort is stable, so rules with equal lift stay in generation order
        rules_df = pd.DataFrame(rules).sort_values('lift', ascending=False, kind='mergesort')
    else:
        # Create an empty DataFrame with the correct columns if no rules are found
        rules_df = pd.DataFrame(columns=['antecedent', 'consequent', 'support', 'confidence', 'lift', 'count'])

    elapsed_time = time.time() - start_time
    print(f"Analysis completed in {elapsed_time:.2f} seconds")
    print(f"Generated {len(rules)} association rules")

    return rules_df, frequent_items

In [5]:
def create_product_mapping(df):
    """Build a ProductCode -> ItemName lookup dictionary from the transactions.

    Only distinct (ProductCode, ItemName) combinations are considered. When a
    code is paired with more than one name, the name seen last in the frame is
    the one kept in the dictionary.
    """
    code_name_pairs = df[['ProductCode', 'ItemName']].drop_duplicates()
    names_by_code = code_name_pairs.set_index('ProductCode')['ItemName']
    return names_by_code.to_dict()

In [6]:
def recommend_products_for_basket(basket, rules_df, frequent_items, product_map, top_n=5):
    """Recommend products based on current basket items.

    Parameters
    ----------
    basket : iterable
        Product codes currently in the basket. Repeated codes are scored once.
    rules_df : pandas DataFrame
        Association rules with at least antecedent, consequent and lift columns.
    frequent_items : dict or set
        Product codes that may act as rule antecedents; other basket items
        are ignored when looking up rules.
    product_map : dict
        Maps product codes to display names.
    top_n : int
        Maximum number of recommendations to return.

    Returns
    -------
    list of (str, float)
        Up to top_n (product name, score) tuples, highest score first. The
        score of a product is the summed lift of every rule that leads to it
        from a basket item. Codes missing from product_map are shown as
        "Product <code>".
    """
    # If no rules, return empty list
    if rules_df.empty:
        return []

    basket = list(basket)
    # Set membership for the "already in basket" check below.
    in_basket = set(basket)
    # Keep frequent items only, dropping repeats (first occurrence wins) so a
    # product listed twice does not have its rules counted twice.
    antecedents = list(dict.fromkeys(item for item in basket if item in frequent_items))

    # Calculate recommendation scores
    scores = defaultdict(float)
    for item in antecedents:
        # Find rules where this item is the antecedent
        item_rules = rules_df[rules_df['antecedent'] == item]

        # Walk the two needed columns directly instead of iterrows(): iterrows
        # builds one Series per row, which upcasts integer product codes to
        # float when all rule columns are numeric ("Product 123.0" labels).
        for consequent, lift in zip(item_rules['consequent'], item_rules['lift']):
            if consequent not in in_basket:  # Don't recommend items already in basket
                # Use lift as the score
                scores[consequent] += lift

    # Get top N recommendations
    top_items = sorted(scores.items(), key=lambda entry: entry[1], reverse=True)[:top_n]

    # Convert product codes to names
    return [(product_map.get(prod_id, f"Product {prod_id}"), score)
            for prod_id, score in top_items]

In [7]:
def recommend_by_category(basket, df, product_map, top_n=1):
    """Recommend top products from categories not in the basket.

    For every category that none of the basket's products belongs to, the
    best-selling product (largest summed Quantity) is picked. Categories are
    then ranked by that best seller's total quantity, highest first, and the
    first top_n are returned. Ties keep the sorted order of the category
    values, so the result does not depend on set/hash iteration order.

    Parameters
    ----------
    basket : list-like
        Product codes currently in the basket.
    df : pandas DataFrame
        Transaction data with ProductCode, Category and Quantity columns.
    product_map : dict
        Maps product codes to display names.
    top_n : int
        Maximum number of categories to recommend from.

    Returns
    -------
    list of (category, str)
        (category, product name) tuples; codes missing from product_map are
        shown as "Product <code>".
    """
    # Categories already represented by at least one basket product
    basket_categories = df.loc[df['ProductCode'].isin(basket), 'Category'].unique()

    # Rows belonging to categories the basket does not cover yet
    candidates = df[~df['Category'].isin(basket_categories)]

    ranked = []
    # One pass over the candidate rows; groupby yields categories in sorted order.
    for category, category_rows in candidates.groupby('Category'):
        # Get most popular product in this category
        product_counts = category_rows.groupby('ProductCode')['Quantity'].sum()
        if product_counts.empty:
            continue
        ranked.append((category, product_counts.idxmax(), product_counts.max()))

    # list.sort is stable (also with reverse=True), so equal totals stay in category order
    ranked.sort(key=lambda entry: entry[2], reverse=True)

    # Select top N categories and convert product codes to names
    return [(category, product_map.get(prod_id, f"Product {prod_id}"))
            for category, prod_id, _ in ranked[:top_n]]

In [8]:
def main():
    """Run the end-to-end recommendation demo.

    Loads 'customer_transactions_coles.csv', mines pairwise association rules
    (lowering the support threshold until at least one rule is found), prints
    the strongest rules plus example recommendations for a sample basket and a
    test basket, and, when rules exist, writes 'association_rules.csv' and
    'frequent_items.csv' to the working directory.
    """
    # File path
    file_path = 'customer_transactions_coles.csv'

    # Load and preprocess data
    df = load_and_preprocess_data(file_path)

    # Create product mapping
    product_map = create_product_mapping(df)

    def shorten(name, limit=40):
        """Cut a product name to `limit` characters, marking the cut with '...'."""
        return name[:limit] + "..." if len(name) > limit else name

    # Run memory-efficient basket analysis
    # Try with different support thresholds if needed
    for min_support_count in [10, 5, 2]:
        print(f"\nRunning memory-efficient basket analysis with min_support_count={min_support_count}...")
        rules_df, frequent_items = memory_efficient_basket_analysis(
            df,
            min_support_count=min_support_count,  # Items must appear in at least X baskets
            min_confidence=0.01,
            max_basket_items=50    # Limit basket size for memory efficiency
        )

        if not rules_df.empty:
            print(f"Success! Generated {len(rules_df)} rules with min_support_count={min_support_count}")
            break
        else:
            print(f"No rules generated with min_support_count={min_support_count}, trying lower threshold...")

    # Check if any rules were generated
    if rules_df.empty:
        print("\nNo association rules could be generated with any threshold.")
        print("Using category-based recommendations only.")
    else:
        # Display top rules
        print("\nTop 10 association rules:")
        top_rules = rules_df.head(10)
        # Iterate the columns directly rather than with iterrows(): iterrows
        # upcasts integer product codes to float when every column is numeric,
        # which would turn the fallback label into e.g. "Product 123.0".
        rule_columns = zip(top_rules['antecedent'], top_rules['consequent'],
                           top_rules['support'], top_rules['confidence'], top_rules['lift'])
        for ant_code, cons_code, support, confidence, lift in rule_columns:
            ant_name = shorten(product_map.get(ant_code, f"Product {ant_code}"))
            cons_name = shorten(product_map.get(cons_code, f"Product {cons_code}"))
            print(f"{ant_name} → {cons_name} "
                  f"(Support: {support:.4f}, Confidence: {confidence:.4f}, Lift: {lift:.4f})")

    # Example: Get recommendations for a sample basket (the first transaction in the file)
    sample_transaction_id = df['TransactionID'].iloc[0]
    sample_basket = df[df['TransactionID'] == sample_transaction_id]['ProductCode'].unique().tolist()

    print(f"\nSample basket contains {len(sample_basket)} items:")
    for item in sample_basket[:5]:  # Show first 5 items
        print(f"- {product_map.get(item, f'Product {item}')}")
    if len(sample_basket) > 5:
        # Only signal truncation when items were actually left out
        print("...")

    # Get association rule-based recommendations
    recommendations = recommend_products_for_basket(sample_basket, rules_df, frequent_items, product_map)

    print("\nTop recommendations based on association rules:")
    if recommendations:
        for product, score in recommendations:
            print(f"- {product} (Score: {score:.2f})")
    else:
        print("No association rule-based recommendations available.")

    # Get category-based recommendations
    category_recommendations = recommend_by_category(sample_basket, df, product_map, top_n=3)

    print("\nRecommendations for missing categories:")
    for category, product in category_recommendations:
        print(f"- {category}: {product}")

    # Create a recommendation function for use at checkout
    def get_recommendations_for_checkout(basket_items):
        """Get up to three recommendations for a given basket at checkout.

        Association-rule recommendations come first; when fewer than three are
        available, the remaining slots are filled from categories the basket
        does not cover.
        """
        # Get association-based recommendations
        assoc_recommendations = recommend_products_for_basket(
            basket_items, rules_df, frequent_items, product_map, top_n=3)

        all_recommendations = [
            f"{product} (Score: {score:.2f})" for product, score in assoc_recommendations
        ]

        # If we don't have enough association recommendations, add category-based ones
        if len(assoc_recommendations) < 3:
            cat_recommendations = recommend_by_category(
                basket_items, df, product_map, top_n=3-len(assoc_recommendations))
            all_recommendations += [
                f"{product} (from {category})" for category, product in cat_recommendations
            ]

        return all_recommendations

    # Example usage
    print("\nCheckout recommendation example:")
    test_basket = [3476925, 3532695, 5009272]  # Example product codes
    print("Test basket:", [product_map.get(item, f"Product {item}") for item in test_basket])
    test_recommendations = get_recommendations_for_checkout(test_basket)
    print("Checkout recommendations:", test_recommendations)

    # Save the model components for future use
    if not rules_df.empty:
        print("\nSaving model components for future use...")
        rules_df.to_csv('association_rules.csv', index=False)
        pd.DataFrame(list(frequent_items.items()),
                   columns=['ProductCode', 'Frequency']).to_csv('frequent_items.csv', index=False)
        print("Model components saved successfully.")

        print("\nTo use this model in production, you can load the saved rules and frequent items:")
        print("""
# Load saved model components
rules_df = pd.read_csv('association_rules.csv')
frequent_items_df = pd.read_csv('frequent_items.csv')
frequent_items = dict(zip(frequent_items_df['ProductCode'], frequent_items_df['Frequency']))

# Create product mapping from your product database
product_map = {...}  # Map of ProductCode to ItemName

# Function to get recommendations for a basket
def recommend_products(basket):
    return recommend_products_for_basket(basket, rules_df, frequent_items, product_map)
        """)

In [9]:
# Entry point: run the full demo (load data, mine rules, print and save
# results) only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

Total rows: 477906
Unique transactions: 29408
Unique customers: 1000
Unique products: 19779
Categories: 10

Running memory-efficient basket analysis with min_support_count=10...
Creating transaction baskets...
Calculating item frequencies...
Filtering frequent items...
Found 10705 frequent items out of 19779 total items
Generating frequent pairs...
Found 124 frequent pairs
Generating association rules...
Analysis completed in 9.24 seconds
Generated 248 association rules
Success! Generated 248 rules with min_support_count=10

Top 10 association rules:
Helga's Sourdough Bread | 650g → Coles Thin & Crispy Medium Pizza Bases 9... (Support: 0.0004, Confidence: 0.0769, Lift: 13.6274)
Coles Thin & Crispy Medium Pizza Bases 9... → Helga's Sourdough Bread | 650g (Support: 0.0004, Confidence: 0.0723, Lift: 13.6274)
Simson's Pantry Low Carb Spinach And Her... → Mission Crust Plain Pizza | 400g (Support: 0.0003, Confidence: 0.0625, Lift: 10.5632)
Mission Crust Plain Pizza | 400g → Simson's Pantry 

------