In [25]:
# pip install mlxtend

In [26]:
import pandas as pd
import random
from itertools import combinations
from mlxtend.frequent_patterns import apriori, association_rules
import warnings
warnings.filterwarnings("ignore")

In [27]:
frequent_itemset_2 = [{'milk', 'bread'}, {'bread', 'butter'}, {'milk', 'egg'}]

In [28]:
# Generate C3

flatten_freq_itemst = []

for subset in frequent_itemset_2:
    for i in subset:
        flatten_freq_itemst.append(i)
        
print('Flattened 2 frequent itemset', flatten_freq_itemst)

Flattened 2 frequent itemset ['bread', 'milk', 'bread', 'butter', 'egg', 'milk']


In [29]:
unique_items = set(flatten_freq_itemst)
print("unique items in 2 frequen itemset: ", unique_items)

unique items in 2 frequen itemset:  {'egg', 'milk', 'bread', 'butter'}


In [30]:
frequent_itemset_3 = list(combinations(unique_items, 3))
print("C3: ", frequent_itemset_3)

C3:  [('egg', 'milk', 'bread'), ('egg', 'milk', 'butter'), ('egg', 'bread', 'butter'), ('milk', 'bread', 'butter')]


In [31]:
candidates_3 = list(combinations(set(i for subset in frequent_itemset_2 for i in subset), 3))
print("C3:", candidates_3)

print(candidates_3 == frequent_itemset_3)

C3: [('egg', 'milk', 'bread'), ('egg', 'milk', 'butter'), ('egg', 'bread', 'butter'), ('milk', 'bread', 'butter')]
True


In [32]:
transaction_data = pd.read_excel("./micro_data_transactions_comma_separated.xlsx")
transaction_data

Unnamed: 0,Transaction,Items
0,1,"item_D,item_F"
1,2,"item_D,item_C,item_A,item_E"
2,3,"item_D,item_F,item_C,item_A"
3,4,"item_A,item_F,item_C,item_B"
4,5,"item_B,item_A,item_F,item_D,item_E"
5,6,item_E
6,7,"item_D,item_C,item_A,item_F,item_B"
7,8,"item_E,item_A,item_F,item_D,item_C"
8,9,"item_E,item_C,item_F,item_A,item_D"
9,10,"item_E,item_C"


In [33]:
# split the csv into a list
transaction_data['Items'] = transaction_data['Items'].apply(lambda x: x.split(','))
transaction_data['Items']

0                            [item_D, item_F]
1            [item_D, item_C, item_A, item_E]
2            [item_D, item_F, item_C, item_A]
3            [item_A, item_F, item_C, item_B]
4    [item_B, item_A, item_F, item_D, item_E]
5                                    [item_E]
6    [item_D, item_C, item_A, item_F, item_B]
7    [item_E, item_A, item_F, item_D, item_C]
8    [item_E, item_C, item_F, item_A, item_D]
9                            [item_E, item_C]
Name: Items, dtype: object

In [34]:
# get the list of unique items in the entire list
all_items = sorted(set(item for subset in transaction_data['Items'] for item in subset))
all_items

['item_A', 'item_B', 'item_C', 'item_D', 'item_E', 'item_F']

In [35]:
# creating a DataFrame for one hot encoding with all 0s
one_hot_encoded_data = pd.DataFrame(0, index = transaction_data['Transaction'], columns=all_items)
one_hot_encoded_data

Unnamed: 0_level_0,item_A,item_B,item_C,item_D,item_E,item_F
Transaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0
5,0,0,0,0,0,0
6,0,0,0,0,0,0
7,0,0,0,0,0,0
8,0,0,0,0,0,0
9,0,0,0,0,0,0
10,0,0,0,0,0,0


In [36]:
for index, row in transaction_data.iterrows():
    for item in row['Items']:
        one_hot_encoded_data.at[row['Transaction'], item] = 1

one_hot_encoded_data

Unnamed: 0_level_0,item_A,item_B,item_C,item_D,item_E,item_F
Transaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,0,0,1,0,1
2,1,0,1,1,1,0
3,1,0,1,1,0,1
4,1,1,1,0,0,1
5,1,1,0,1,1,1
6,0,0,0,0,1,0
7,1,1,1,1,0,1
8,1,0,1,1,1,1
9,1,0,1,1,1,1
10,0,0,1,0,1,0


In [37]:
# Assign the one-hot encoded DataFrame (where each column represents an item and each row a transaction) to df
data = one_hot_encoded_data

# Set the minimum support count threshold. In this case, an itemset must appear in at least 2 transactions to be considered frequent
min_sup_count = 2

def get_support_count(data, itemset):
    """ 
    Function to calculate the support count of a given itemset (number of transactions in which the itemset appears).
    
    Parameters:
    df: The DataFrame where each row represents a transaction and each column represents an item (1 if present, 0 if absent).
    itemset: A tuple of items to check support for.
    
    Returns:
    The support count of the given itemset (how many transactions contain all items in the itemset).
    """
    # Create a boolean mask that checks if all items in the itemset have a value of 1 (present) in each transaction
    mask = data[list(itemset).all(axis = 1)]
    # Sum the mask (True = 1, False = 0) to count how many transactions contain the itemset
    return mask.sum()

def apriori_incremental(data, min_sup_count):
    """ 
    Function to generate frequent itemsets using an incremental approach (the Apriori algorithm).
    
    Parameters:
    df: The DataFrame where each row represents a transaction, and each column represents an item (1 if present, 0 if absent).
    min_sup_count: The minimum support count required for an itemset to be considered frequent.
    
    This function generates and displays all frequent itemsets incrementally, starting with single items and then moving to pairs, triples, etc.
    """
    # Extract the column names from the DataFrame (each column represents an item)
    items = data.columns
    # Initialize k to represent the size of the itemsets (starting with k=1 for single items)
    k = 1
    # Initialize an empty list to store frequent itemsets at each level
    frequent_itemsets = []

    # Loop indefinitely until no more frequent itemsets are found
    while True:
        if k == 1:
            # For k = 1, generate 1-frequent itemsets, which are just individual items
            candidates = [(items,) for item in items]
        else:
            # For k > 1, generate k-frequent itemsets by combining the items from the previous level
            # It does this by combining all previous frequent itemsets (flattening them and taking combinations of k items)
            candidates = list(combinations(set(i for subset in frequent_itemsets for i in subset), k))
            
        # Initialize lists to store all itemsets and their support counts for this level
        all_itemsets = []
        itemset_support = []
        
        
        # Loop through each candidate itemset and calculate its support count
        for itemset in candidates:
            # Sort and convert the itemset to a tuple to ensure consistent ordering and to avoid duplicates
            print(itemset)
            itemset = tuple(sorted(set(itemset)))
            # Get the support count for the current itemset by calling the get_support_count function
            support_count = get_support_count(data, itemset)
            # Store the itemset along with its support count
            all_itemsets.append((itemset, support_count))
            

        # Display all k-frequent itemsets before filtering by the minimum support count
        print(f"\n{k}-frequent itemsets (before filtering by min_sup_count):")
        for itemset, support_count in all_itemsets:
            print(f"Itemset: {itemset}, Support Count: {support_count}")
        

        # Filter the itemsets, keeping only those that meet or exceed the minimum support count
        
        valid_itemsets = [(itemset, support_count) for itemset, support_count in all_itemsets if support_count >= min_sup_count]
        
        # If there are no valid itemsets left (none meet the support threshold), break out of the loop
        if not valid_itemsets:
            break

        # Display the filtered frequent itemsets and their support counts
        print(f"\n{k}-frequent itemsets (after filtering by min_sup_count):")
        for itemset, support_count in valid_itemsets:
            print(f"Itemset: {itemset}, Support Count: {support_count}")
        
        # Update the frequent itemsets list with the valid itemsets for the next iteration
        frequent_itemsets = [itemset for itemset, support_count in valid_itemsets]
        # Increment k to move to the next level of itemsets (e.g., from pairs to triples)
        k += 1

# Call the apriori_incremental function to run the Apriori algorithm and find frequent itemsets
apriori_incremental(data, min_sup_count)


(Index(['item_A', 'item_B', 'item_C', 'item_D', 'item_E', 'item_F'], dtype='object'),)


TypeError: unhashable type: 'Index'