In [22]:
#Import required libraries
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori
from mlxtend.preprocessing import TransactionEncoder
import itertools

In [23]:
import os

# Define desktop path for saving files
def get_desktop_path():
    """
    Return the location of the current user's Desktop folder.

    The path is formed by appending 'Desktop' to the expanded home
    directory ('~'). Nothing here checks that the folder exists.
    """
    home_dir = os.path.expanduser('~')
    return os.path.join(home_dir, 'Desktop')

# Resolve the output folder once; every CSV written below goes here
DESKTOP_PATH = get_desktop_path()
# The Desktop folder is not guaranteed to exist (headless machines, relocated
# desktops), and to_csv does not create missing directories, so create it
# up front instead of failing at the first save.
os.makedirs(DESKTOP_PATH, exist_ok=True)
print(f"Saving files to: {DESKTOP_PATH}")

Saving files to: C:\Users\User\Desktop


In [24]:

# Seed NumPy's global random state so the np.random calls used to build the
# transactions below produce the same data on every run
np.random.seed(42)

# Define the pool of supermarket items
def generate_item_pool():
    """
    Build the catalogue of 30 distinct supermarket products.

    Returns a flat list of item names, ordered category by category
    (produce, dairy, meat, bakery, beverages, household).
    """
    # Category -> products; dict insertion order fixes the order of the pool
    categories = {
        'fruits_veggies': ['Apple', 'Banana', 'Orange', 'Tomato', 'Potato', 'Lettuce', 'Carrot'],
        'dairy': ['Milk', 'Cheese', 'Yogurt', 'Butter', 'Eggs'],
        'meat': ['Chicken', 'Beef', 'Fish', 'Pork'],
        'bakery': ['Bread', 'Croissant', 'Bagel', 'Cake'],
        'beverages': ['Coffee', 'Tea', 'Soda', 'Juice', 'Water'],
        'household': ['Soap', 'Shampoo', 'Toothpaste', 'Detergent', 'Toilet Paper'],
    }

    # Flatten the per-category lists into a single pool
    return [product for products in categories.values() for product in products]

# Generate simulated transactions
def generate_transactions(num_transactions=3000, min_items=2, max_items=7):
    """
    Generate supermarket transactions with random items.

    Each transaction holds between ``min_items`` and ``max_items`` (inclusive)
    distinct items drawn from the pool returned by ``generate_item_pool``.
    Draws use NumPy's global random state, so seed it beforehand for
    reproducible data.

    Parameters
    ----------
    num_transactions : int
        Number of transactions to generate.
    min_items, max_items : int
        Inclusive bounds on the number of items per transaction.

    Returns
    -------
    tuple
        ``(transactions, items)`` where ``transactions`` is a list of
        alphabetically sorted lists of plain ``str`` item names and ``items``
        is the full item pool.

    Raises
    ------
    ValueError
        If the size bounds are inconsistent or exceed the pool size (items are
        sampled without replacement, so a transaction cannot be larger than
        the pool).
    """
    # Get the item pool
    items = generate_item_pool()

    # Fail early with a clear message instead of an error from inside NumPy
    if not 0 <= min_items <= max_items <= len(items):
        raise ValueError(
            f"Need 0 <= min_items <= max_items <= {len(items)}; "
            f"got min_items={min_items}, max_items={max_items}"
        )

    transactions = []
    for _ in range(num_transactions):
        # randint's upper bound is exclusive, hence the + 1
        basket_size = np.random.randint(min_items, max_items + 1)

        # Sample distinct items for this transaction
        basket = np.random.choice(items, size=basket_size, replace=False)

        # tolist() turns NumPy string scalars into plain Python str so printed
        # and saved transactions read 'Cheese' rather than np.str_('Cheese')
        transactions.append(sorted(basket.tolist()))

    return transactions, items

# Generate the transaction data
print("Generating supermarket transactions...")
transactions, item_pool = generate_transactions(3000)

# Create DataFrame and save raw transactions.
# Items are normalised to plain str first: the CSV stores each list as its
# Python repr, and NumPy string scalars would otherwise be written as
# np.str_('...') entries that cannot be parsed back as a list literal.
transactions_df = pd.DataFrame({
    'transaction_id': range(len(transactions)),
    'items': [[str(item) for item in transaction] for transaction in transactions],
})
transactions_df.to_csv(os.path.join(DESKTOP_PATH, 'supermarket_transactions.csv'), index=False)

Generating supermarket transactions...


In [25]:

# Display transaction statistics: how many transactions were generated, the
# size of the item pool, and the first transaction as a sample
print(f"Generated {len(transactions)} transactions")
print(f"Item pool size: {len(item_pool)} items")
print(f"Sample transaction: {transactions[0]}")

# Preprocessing: One-Hot Encoding
def preprocess_transactions(transactions):
    """
    One-hot encode a list of transactions.

    A TransactionEncoder is fitted on ``transactions`` and then used to
    transform the same data. The result is returned as a DataFrame whose
    columns are the encoder's ``columns_`` (one per item it learned) and
    whose rows follow the order of ``transactions``.
    """
    encoder = TransactionEncoder()

    # Learn the item vocabulary, then encode the same transactions with it
    encoder.fit(transactions)
    encoded_matrix = encoder.transform(transactions)

    return pd.DataFrame(encoded_matrix, columns=encoder.columns_)

# Apply one-hot encoding to the generated transactions
print("Applying one-hot encoding...")
one_hot_df = preprocess_transactions(transactions)


# Display one-hot encoded data info: (rows, columns) and the column labels
print(f"One-hot encoded DataFrame shape: {one_hot_df.shape}")
print(f"Columns: {list(one_hot_df.columns)}")

# Generate Frequent Itemsets using Apriori
def generate_frequent_itemsets(one_hot_df, min_support=0.05):
    """
    Generate frequent itemsets using the Apriori algorithm.

    Parameters
    ----------
    one_hot_df : pandas.DataFrame
        One-hot encoded transactions (one column per item).
    min_support : float
        Minimum support threshold passed to ``apriori``.

    Returns
    -------
    pandas.DataFrame
        The ``apriori`` result with ``itemsets`` converted to sorted tuples,
        an added ``length`` column (number of items in the itemset), and rows
        ordered by ``support`` from highest to lowest.
    """
    # Apply Apriori algorithm with minimum support
    frequent_itemsets = apriori(one_hot_df,
                                min_support=min_support,
                                use_colnames=True)

    # Convert itemsets from frozenset to tuple for easier handling.
    # The items are sorted because frozenset iteration order depends on string
    # hashing, which Python randomises per session; unsorted tuples would make
    # multi-item itemsets (and the saved CSV) differ from run to run.
    # key=str keeps the sort usable even if column labels have mixed types.
    frequent_itemsets['itemsets'] = frequent_itemsets['itemsets'].apply(
        lambda itemset: tuple(sorted(itemset, key=str))
    )

    # Add length of itemsets for filtering
    frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)

    # Sort by support in descending order
    frequent_itemsets = frequent_itemsets.sort_values('support', ascending=False)

    return frequent_itemsets

# Generate frequent itemsets at a 5% minimum support.
# NOTE(review): in the recorded run below every frequent itemset has length 1
# (30 itemsets, all single items), so the closed and maximal results further
# down coincide with the frequent set. Consider a lower min_support if
# multi-item itemsets are wanted.
print("Generating frequent itemsets with Apriori...")
frequent_itemsets = generate_frequent_itemsets(one_hot_df, min_support=0.05)


# Save frequent itemsets to CSV in the output folder
frequent_itemsets.to_csv(os.path.join(DESKTOP_PATH, 'frequent_itemsets.csv'), index=False)

# Display top 10 frequent itemsets (the frame is already sorted by support)
print("\nTop 10 Frequent Itemsets:")
print(frequent_itemsets.head(10))

Generated 3000 transactions
Item pool size: 30 items
Sample transaction: [np.str_('Cheese'), np.str_('Croissant'), np.str_('Pork'), np.str_('Toilet Paper'), np.str_('Water')]
Applying one-hot encoding...
One-hot encoded DataFrame shape: (3000, 30)
Columns: [np.str_('Apple'), np.str_('Bagel'), np.str_('Banana'), np.str_('Beef'), np.str_('Bread'), np.str_('Butter'), np.str_('Cake'), np.str_('Carrot'), np.str_('Cheese'), np.str_('Chicken'), np.str_('Coffee'), np.str_('Croissant'), np.str_('Detergent'), np.str_('Eggs'), np.str_('Fish'), np.str_('Juice'), np.str_('Lettuce'), np.str_('Milk'), np.str_('Orange'), np.str_('Pork'), np.str_('Potato'), np.str_('Shampoo'), np.str_('Soap'), np.str_('Soda'), np.str_('Tea'), np.str_('Toilet Paper'), np.str_('Tomato'), np.str_('Toothpaste'), np.str_('Water'), np.str_('Yogurt')]
Generating frequent itemsets with Apriori...

Top 10 Frequent Itemsets:
     support    itemsets  length
8   0.157333   (Cheese,)       1
0   0.156333    (Apple,)       1
24  0.

In [26]:
# Identify Closed Frequent Itemsets
def find_closed_itemsets(frequent_itemsets):
    """
    Identify closed frequent itemsets.

    An itemset is closed when no proper superset among the frequent itemsets
    has the same support.

    Parameters
    ----------
    frequent_itemsets : pandas.DataFrame
        Must provide an ``itemsets`` column (iterables of items) and a
        ``support`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``itemsets``, ``support`` and ``length`` (integer item count),
        sorted by ``support`` in descending order. Empty input yields an empty
        frame with the same columns.
    """
    # Build each itemset's frozenset once; the pairwise scan below would
    # otherwise rebuild two sets for every pair it compares.
    candidates = [
        (itemset, frozenset(itemset), support)
        for itemset, support in zip(frequent_itemsets['itemsets'],
                                    frequent_itemsets['support'])
    ]

    closed_itemsets = []
    for itemset, members, support in candidates:
        # `<` on sets is the proper-subset test. The tolerance absorbs
        # floating point noise when comparing supports.
        has_equal_support_superset = any(
            members < other_members and abs(support - other_support) < 1e-10
            for _, other_members, other_support in candidates
        )

        # If no superset has same support, the itemset is closed
        if not has_equal_support_superset:
            closed_itemsets.append((itemset, support))

    # Convert to DataFrame
    closed_df = pd.DataFrame(closed_itemsets, columns=['itemsets', 'support'])
    # Explicit dtype keeps `length` an integer column even when nothing is closed
    closed_df['length'] = pd.Series(
        [len(itemset) for itemset in closed_df['itemsets']],
        index=closed_df.index,
        dtype='int64',
    )
    closed_df = closed_df.sort_values('support', ascending=False)

    return closed_df

# Find closed frequent itemsets among the frequent itemsets mined above
print("Identifying closed frequent itemsets...")
closed_itemsets = find_closed_itemsets(frequent_itemsets)

# Save closed itemsets to CSV in the output folder
closed_itemsets.to_csv(os.path.join(DESKTOP_PATH, 'closed_itemsets.csv'), index=False)

# Display top 10 closed itemsets (find_closed_itemsets returns them sorted by support)
print("\nTop 10 Closed Frequent Itemsets:")
print(closed_itemsets.head(10))

Identifying closed frequent itemsets...

Top 10 Closed Frequent Itemsets:
     itemsets   support  length
0   (Cheese,)  0.157333       1
1    (Apple,)  0.156333       1
2      (Tea,)  0.156333       1
3   (Potato,)  0.154333       1
4  (Chicken,)  0.154000       1
5  (Shampoo,)  0.154000       1
6     (Soda,)  0.153333       1
7    (Bagel,)  0.153333       1
8     (Cake,)  0.152333       1
9    (Water,)  0.152333       1


In [27]:

# Identify Maximal Frequent Itemsets
def find_maximal_itemsets(frequent_itemsets):
    """
    Identify maximal frequent itemsets.

    An itemset is maximal when no proper superset of it appears among the
    frequent itemsets.

    Parameters
    ----------
    frequent_itemsets : pandas.DataFrame
        Must provide an ``itemsets`` column (iterables of items) and a
        ``support`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``itemsets``, ``support`` and ``length`` (integer item count),
        sorted by ``support`` in descending order. Empty input yields an empty
        frame with the same columns.
    """
    # Build each itemset's frozenset once; the pairwise scan below would
    # otherwise rebuild two sets for every pair it compares.
    candidates = [
        (itemset, frozenset(itemset), support)
        for itemset, support in zip(frequent_itemsets['itemsets'],
                                    frequent_itemsets['support'])
    ]
    all_members = [members for _, members, _ in candidates]

    maximal_itemsets = []
    for itemset, members, support in candidates:
        # `<` on sets is the proper-subset test: any hit means a strictly
        # larger frequent itemset contains this one.
        has_frequent_superset = any(members < other for other in all_members)

        # If no frequent superset exists, the itemset is maximal
        if not has_frequent_superset:
            maximal_itemsets.append((itemset, support))

    # Convert to DataFrame
    maximal_df = pd.DataFrame(maximal_itemsets, columns=['itemsets', 'support'])
    # Explicit dtype keeps `length` an integer column even when the result is empty
    maximal_df['length'] = pd.Series(
        [len(itemset) for itemset in maximal_df['itemsets']],
        index=maximal_df.index,
        dtype='int64',
    )
    maximal_df = maximal_df.sort_values('support', ascending=False)

    return maximal_df

# Find maximal frequent itemsets among the frequent itemsets mined above
print("Identifying maximal frequent itemsets...")
maximal_itemsets = find_maximal_itemsets(frequent_itemsets)

# Save maximal itemsets to CSV in the output folder
maximal_itemsets.to_csv(os.path.join(DESKTOP_PATH, 'maximal_itemsets.csv'), index=False)

# Display top 10 maximal itemsets (find_maximal_itemsets returns them sorted by support)
print("\nTop 10 Maximal Frequent Itemsets:")
print(maximal_itemsets.head(10))

Identifying maximal frequent itemsets...

Top 10 Maximal Frequent Itemsets:
     itemsets   support  length
0   (Cheese,)  0.157333       1
1    (Apple,)  0.156333       1
2      (Tea,)  0.156333       1
3   (Potato,)  0.154333       1
4  (Chicken,)  0.154000       1
5  (Shampoo,)  0.154000       1
6     (Soda,)  0.153333       1
7    (Bagel,)  0.153333       1
8     (Cake,)  0.152333       1
9    (Water,)  0.152333       1


In [28]:

# Analysis and Summary Statistics
def generate_summary_statistics(frequent_itemsets, closed_itemsets, maximal_itemsets):
    """
    Print summary statistics for the itemset mining results.

    Parameters
    ----------
    frequent_itemsets : pandas.DataFrame
        Needs ``itemsets``, ``support`` and ``length`` columns.
    closed_itemsets, maximal_itemsets : pandas.DataFrame
        Only their row counts are reported.

    Returns
    -------
    None
        The report is written to standard output.
    """
    print("=" * 50)
    print("FREQUENT ITEMSETS MINING - SUMMARY STATISTICS")
    print("=" * 50)

    print(f"Total Frequent Itemsets: {len(frequent_itemsets)}")
    print(f"Total Closed Itemsets: {len(closed_itemsets)}")
    print(f"Total Maximal Itemsets: {len(maximal_itemsets)}")

    support = frequent_itemsets['support']
    lengths = frequent_itemsets['length']

    print(f"\nSupport Range: {support.min():.3f} - {support.max():.3f}")
    # Report the real smallest size rather than assuming it is always 1
    print(f"Itemset Size Range: {lengths.min()} - {lengths.max()}")

    # Count itemsets by length, smallest length first
    print("\nFrequent Itemsets by Length:")
    for length, count in lengths.value_counts().sort_index().items():
        print(f"  Length {length}: {count} itemsets")

    # Most frequent individual items. Sort here instead of trusting the caller
    # to pass a frame already ordered by support. The stable sort keeps the
    # incoming order among items with equal support.
    single_items = frequent_itemsets[lengths == 1].sort_values(
        'support', ascending=False, kind='stable'
    )
    print("\nTop 5 Most Frequent Individual Items:")
    for _, row in single_items.head().iterrows():
        # next(iter(...)) works for tuple and frozenset itemsets alike
        item = next(iter(row['itemsets']))
        print(f"  {item}: {row['support']:.3f}")

# Print the summary report for the frequent, closed and maximal itemsets
generate_summary_statistics(frequent_itemsets, closed_itemsets, maximal_itemsets)

FREQUENT ITEMSETS MINING - SUMMARY STATISTICS
Total Frequent Itemsets: 30
Total Closed Itemsets: 30
Total Maximal Itemsets: 30

Support Range: 0.137 - 0.157
Itemset Size Range: 1 - 1

Frequent Itemsets by Length:
  Length 1: 30 itemsets

Top 5 Most Frequent Individual Items:
  Cheese: 0.157
  Apple: 0.156
  Tea: 0.156
  Potato: 0.154
  Chicken: 0.154


In [29]:

# Verification and Validation
def validate_results(frequent_itemsets, closed_itemsets, maximal_itemsets):
    """
    Print three sanity checks on the mined itemsets.

    1. Every maximal itemset also appears among the closed itemsets.
    2. Every closed itemset also appears among the frequent itemsets.
    3. The largest ``length`` among the maximal itemsets equals the largest
       ``length`` among the frequent itemsets.

    Each check prints a PASSED or FAILED line; nothing is returned or raised
    for a failed check.
    """
    print("VALIDATION CHECKS")

    # Check 1: maximal itemsets must be a subset of closed itemsets
    closed_set = set(closed_itemsets['itemsets'])
    maximal_set = set(maximal_itemsets['itemsets'])
    all_maximal_closed = maximal_set <= closed_set
    print(
        " Validation 1 PASSED: All maximal itemsets are closed"
        if all_maximal_closed
        else " Validation 1 FAILED: Some maximal itemsets are not closed"
    )

    # Check 2: closed itemsets must be a subset of frequent itemsets
    frequent_set = set(frequent_itemsets['itemsets'])
    all_closed_frequent = closed_set <= frequent_set
    print(
        " Validation 2 PASSED: All closed itemsets are frequent"
        if all_closed_frequent
        else " Validation 2 FAILED: Some closed itemsets are not frequent"
    )

    # Check 3: the longest frequent itemset must show up among the maximal ones
    longest_frequent = frequent_itemsets['length'].max()
    longest_maximal = maximal_itemsets['length'].max()
    print(
        " Validation 3 PASSED: Maximal itemsets include the largest itemsets"
        if longest_frequent == longest_maximal
        else " Validation 3 FAILED: Maximal itemsets don't include largest itemsets"
    )

# Run validation checks; each check prints a PASSED/FAILED line
validate_results(frequent_itemsets, closed_itemsets, maximal_itemsets)

print("\nAnalysis complete. All output files have been generated.")

VALIDATION CHECKS
 Validation 1 PASSED: All maximal itemsets are closed
 Validation 2 PASSED: All closed itemsets are frequent
 Validation 3 PASSED: Maximal itemsets include the largest itemsets

Analysis complete. All output files have been generated.
