In [7]:
%pip install mlxtend



Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
#(Student:Allan)
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
df = pd.read_csv(r'C:\Users\user\supermarket_transactions.csv') # Uses pandas to read the CSV file from the specified path.
                                 # The 'r' prefix makes it a raw string, correctly handling the backslashes in Windows paths.
                                 # The file contains columns like TransactionID, CustomerName, Date, and Items.
                                 # The 'Items' column holds a string of comma-separated items for each transaction.
df['Items'] = df['Items'].str.split(',') # Accesses the 'Items' column of the DataFrame.
                                 # The '.str' accessor allows string operations on the entire column.
                                 # '.split(',')' splits each string value in the column by the comma character.
                                 # This converts each row's 'Items' value from a string like "A,B,C" into a list like ['A', 'B', 'C'].
                                 # This is the format required by TransactionEncoder.

# Extract the list of itemsets (transactions)
transaction_list = df['Items'].tolist() # Converts the 'Items' column (now containing lists) from the DataFrame
                                 # into a standard Python list of lists. Each inner list represents one transaction's items.
encoder = TransactionEncoder()    # Creates an instance of the TransactionEncoder class.
                                 # This object will be used to fit the transaction data and then transform it.
one_hot_encoded_matrix = encoder.fit(transaction_list).transform(transaction_list)
                                 # First, '.fit(transaction_list)' analyzes the list of itemsets to identify all unique items
                                 # and their potential presence across transactions.
                                 # Then, '.transform(transaction_list)' converts the list of itemsets into a binary matrix.
                                 # Each row in the matrix corresponds to a transaction.
                                 # Each column corresponds to a unique item.
                                 # A cell contains True if the item was present in that transaction, False otherwise.
df_encoded = pd.DataFrame(one_hot_encoded_matrix, columns=encoder.columns_)
                                 # Converts the resulting binary numpy matrix into a pandas DataFrame.
                                 # 'encoder.columns_' provides the column names (the unique items found during fitting).
                                 # This DataFrame 'df_encoded' is the final input format required by the Apriori algorithm.

In [12]:
# --- (Student:Allan) 2. Find Frequent Itemsets using Apriori ---
from mlxtend.frequent_patterns import apriori, association_rules
min_support_threshold = 0.01     # Defines the minimum support threshold as a decimal (0.01 = 1%).
                                 # An itemset must appear in at least 1% of all transactions to be considered 'frequent'.
frequent_itemsets = apriori(df_encoded, min_support=min_support_threshold, use_colnames=True)
                                 # Calls the Apriori algorithm from mlxtend.
                                 # 'df_encoded': The one-hot encoded DataFrame is passed as the dataset.
                                 # 'min_support=0.01': Sets the minimum support threshold.
                                 # 'use_colnames=True': Ensures the output DataFrame uses the actual item names
                                 # (e.g., 'Bread', 'Milk') as the elements in the itemsets, instead of column indices.
                                 # The result is a pandas DataFrame with columns 'support' and 'itemsets'.
                                 # 'support' contains the support value for each frequent itemset.
                                 # 'itemsets' contains the actual set of items (as a frozenset if use_colnames=True).

# Display the total number of frequent itemsets found
print(f"Total number of frequent itemsets (support >= {min_support_threshold}): {len(frequent_itemsets)}")
                                 # Prints the total count of itemsets found by Apriori that meet the minimum support.
print("\nFirst few frequent itemsets found by Apriori:")
                                 # Prints a header for the next output.
print(frequent_itemsets.head(10)) # Prints the first 10 rows of the 'frequent_itemsets' DataFrame to preview results.

Total number of frequent itemsets (support >= 0.01): 465

First few frequent itemsets found by Apriori:
   support   itemsets
0   0.1484   (Apples)
1   0.1552  (Bananas)
2   0.1440     (Beef)
3   0.1574    (Bread)
4   0.1450   (Butter)
5   0.1466  (Carrots)
6   0.1478   (Cereal)
7   0.1460   (Cheese)
8   0.1486  (Chicken)
9   0.1524   (Coffee)


In [13]:
# --- (Student:Allan) 3. Identify Maximal Frequent Itemsets ---
# Convert the 'itemsets' column of the frequent_itemsets DataFrame into a Python set of frozensets.
frequent_itemsets_set = set(frequent_itemsets['itemsets'].apply(frozenset))
                                 # Creates a Python 'set' containing all the frequent itemsets found by Apriori.
                                 # 'frequent_itemsets['itemsets']' accesses the column of itemsets.
                                 # '.apply(frozenset)' ensures each itemset is a frozenset (immutable set).
                                 # Using a 'set' allows for very fast O(1) average time complexity
                                 # when checking if a specific itemset exists within it later.
                                 # This is crucial for the efficiency of the maximal check loop.

# Iterate through each frequent itemset found by Apriori
maximal_itemsets = []            # Initializes an empty list to store the itemsets that are determined to be maximal.
for itemset in frequent_itemsets_set: # Loops through every single frequent itemset found by Apriori.
    is_maximal = True            # Assumes the current 'itemset' is maximal until proven otherwise.
    for item in df_encoded.columns: # Loops through every unique item present in the entire dataset
                                 # (as represented by the columns of the encoded DataFrame).
        # Form a potential superset by adding the item
        if item not in itemset:  # Checks if the current 'item' from the dataset is NOT already in the 'itemset' being checked.
            potential_superset = frozenset(itemset | {item}) # Creates a new frozenset by taking the current 'itemset'
                                 # and adding the current 'item' to it. This new set is a potential superset of 'itemset'.
            if potential_superset in frequent_itemsets_set: # Checks if this potential superset exists in our collection
                                 # of *all* frequent itemsets found previously by Apriori.
                is_maximal = False # If the potential superset is frequent, then the original 'itemset' cannot be maximal.
                break            # Exits the inner loop over items, as we've already proven it's not maximal.

    # After checking all possible items for the current itemset,
    # if 'is_maximal' is still True, it means no frequent superset was found.
    if is_maximal:               # If the loop over all items completed without finding a frequent superset...
        maximal_itemsets.append(itemset) # ...then the 'itemset' is maximal, and it's added to the list.

# Create a DataFrame with the 'itemsets' column containing the maximal itemsets
maximal_df = pd.DataFrame({'itemsets': maximal_itemsets})
                                 # Converts the list of maximal itemsets (frozensets) into a new pandas DataFrame.
                                 # This creates the structure for the final output, similar to the Apriori result.

In [14]:
# --- (Student:Allan) 4. Retrieving Support Values for Maximal Itemsets ---
# Initialize an empty list for the support values of maximal itemsets
maximal_supports = []            # Initializes an empty list to store the support values corresponding to the maximal itemsets.

for idx, row in maximal_df.iterrows(): # Iterates through each row of the 'maximal_df' DataFrame.
    max_itemset_frozenset = row['itemsets'] # Gets the frozenset itemset from the current row.
    support_value = frequent_itemsets[frequent_itemsets['itemsets'].apply(frozenset) == max_itemset_frozenset]['support'].iloc[0]
                                 # Finds the corresponding support value for the current maximal itemset.
                                 # It does this by searching the original 'frequent_itemsets' DataFrame.
                                 # It filters rows where the 'itemsets' column (converted to frozenset for comparison)
                                 # matches the 'max_itemset_frozenset'.
                                 # The ['support'] accesses the support column of the filtered result.
                                 # '.iloc[0]' gets the first (and should be only) value from the support series.
    maximal_supports.append(support_value) # Adds the found support value to the list.

# Add the calculated support values as a new column to the maximal DataFrame
maximal_df['support'] = maximal_supports
                                 # Adds the list of support values as a new column named 'support' to the 'maximal_df'.
                                 # Now, 'maximal_df' has both the itemsets and their corresponding support values.

In [15]:
# --- (Student:Allan) 5. Output the Results ---
print(f"\n\nTotal number of MAXIMAL frequent itemsets (support >= {min_support_threshold}): {len(maximal_df)}")
                                 # Prints the total count of itemsets found to be maximal.
print("\nMaximal Frequent Itemsets:")
                                 # Prints a header for the final output.
print(maximal_df)                # Prints the entire DataFrame containing the maximal itemsets and their supports.

# Print the length (number of items) of the largest maximal itemset found
if not maximal_df.empty:         # Checks if the maximal DataFrame is not empty (i.e., any maximal itemsets were found).
    max_length = maximal_df['itemsets'].apply(len).max() # Calculates the length of each itemset in the 'itemsets' column
                                 # and finds the maximum length among them.
    print(f"\nThe longest maximal itemset found contains {max_length} items.")
                                 # Prints the maximum length found.



Total number of MAXIMAL frequent itemsets (support >= 0.01): 435

Maximal Frequent Itemsets:
               itemsets  support
0        (Butter, Salt)   0.0206
1       (Bread, Yogurt)   0.0206
2       (Salt, Bananas)   0.0226
3       (Bread, Grapes)   0.0176
4       (Sugar, Cereal)   0.0214
..                  ...      ...
430     (Bread, Cereal)   0.0202
431  (Coffee, Potatoes)   0.0220
432       (Sugar, Milk)   0.0226
433   (Grapes, Chicken)   0.0218
434   (Juice, Tomatoes)   0.0214

[435 rows x 2 columns]

The longest maximal itemset found contains 2 items.


In [16]:
# --- 6. Save the Results to CSV ---
# [Student: Allan] Save the maximal frequent itemsets DataFrame as a CSV file to the specified directory
maximal_df.to_csv(r'C:\Users\user\maximal_frequent_itemsets.csv', index=False)
# The 'index=False' parameter ensures that the row indices are not saved as a separate column in the CSV file.
# The raw string (r"") is used again to correctly handle the Windows file path.
print(f"\nMaximal frequent itemsets have been saved to 'C:\\Users\\user\\maximal_frequent_itemsets.csv'")


Maximal frequent itemsets have been saved to 'C:\Users\user\maximal_frequent_itemsets.csv'


In [None]:
# --- (Student:Allan) 2. Find Frequent Itemsets using Apriori ---
from mlxtend.frequent_patterns import apriori, association_rules
min_support_threshold = 0.01     # Defines the minimum support threshold as a decimal (0.01 = 1%).
                                 # An itemset must appear in at least 1% of all transactions to be considered 'frequent'.
frequent_itemsets = apriori(df_encoded, min_support=min_support_threshold, use_colnames=True)
                                 # Calls the Apriori algorithm from mlxtend.
                                 # 'df_encoded': The one-hot encoded DataFrame is passed as the dataset.
                                 # 'min_support=0.01': Sets the minimum support threshold.
                                 # 'use_colnames=True': Ensures the output DataFrame uses the actual item names
                                 # (e.g., 'Bread', 'Milk') as the elements in the itemsets, instead of column indices.
                                 # The result is a pandas DataFrame with columns 'support' and 'itemsets'.
                                 # 'support' contains the support value for each frequent itemset.
                                 # 'itemsets' contains the actual set of items (as a frozenset if use_colnames=True).

# Display the total number of frequent itemsets found
print(f"Total number of frequent itemsets (support >= {min_support_threshold}): {len(frequent_itemsets)}")
                                 # Prints the total count of itemsets found by Apriori that meet the minimum support.
print("\nFirst few frequent itemsets found by Apriori:")
                                 # Prints a header for the next output.
print(frequent_itemsets.head(10)) # Prints the first 10 rows of the 'frequent_itemsets' DataFrame to preview results.

Total number of frequent itemsets (support >= 0.01): 465

First few frequent itemsets found by Apriori:
   support   itemsets
0   0.1484   (Apples)
1   0.1552  (Bananas)
2   0.1440     (Beef)
3   0.1574    (Bread)
4   0.1450   (Butter)
5   0.1466  (Carrots)
6   0.1478   (Cereal)
7   0.1460   (Cheese)
8   0.1486  (Chicken)
9   0.1524   (Coffee)


In [17]:
# Save directly to the Git repository folder
maximal_df.to_csv(r'C:\Users\user\-frequent-itemsets-group-BATAM\maximal_frequent_itemsets.csv', index=False)

In [None]:
# --- (Student:Allan) 2. Find Frequent Itemsets using Apriori ---
from mlxtend.frequent_patterns import apriori, association_rules
min_support_threshold = 0.01     # Defines the minimum support threshold as a decimal (0.01 = 1%).
                                 # An itemset must appear in at least 1% of all transactions to be considered 'frequent'.
frequent_itemsets = apriori(df_encoded, min_support=min_support_threshold, use_colnames=True)
                                 # Calls the Apriori algorithm from mlxtend.
                                 # 'df_encoded': The one-hot encoded DataFrame is passed as the dataset.
                                 # 'min_support=0.01': Sets the minimum support threshold.
                                 # 'use_colnames=True': Ensures the output DataFrame uses the actual item names
                                 # (e.g., 'Bread', 'Milk') as the elements in the itemsets, instead of column indices.
                                 # The result is a pandas DataFrame with columns 'support' and 'itemsets'.
                                 # 'support' contains the support value for each frequent itemset.
                                 # 'itemsets' contains the actual set of items (as a frozenset if use_colnames=True).

# Display the total number of frequent itemsets found
print(f"Total number of frequent itemsets (support >= {min_support_threshold}): {len(frequent_itemsets)}")
                                 # Prints the total count of itemsets found by Apriori that meet the minimum support.
print("\nFirst few frequent itemsets found by Apriori:")
                                 # Prints a header for the next output.
print(frequent_itemsets.head(10)) # Prints the first 10 rows of the 'frequent_itemsets' DataFrame to preview results.

Total number of frequent itemsets (support >= 0.01): 465

First few frequent itemsets found by Apriori:
   support   itemsets
0   0.1484   (Apples)
1   0.1552  (Bananas)
2   0.1440     (Beef)
3   0.1574    (Bread)
4   0.1450   (Butter)
5   0.1466  (Carrots)
6   0.1478   (Cereal)
7   0.1460   (Cheese)
8   0.1486  (Chicken)
9   0.1524   (Coffee)
