In [13]:
# Step 0: Install necessary libraries
!pip install pandas mlxtend openpyxl


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [14]:
# Step 1: Import necessary libraries
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules


In [15]:
data = pd.read_csv('groceries.csv')
print(data)


                MILK,BREAD,BISCUIT
0    BREAD,MILK,BISCUIT,CORNFLAKES
1              BREAD,TEA,BOURNVITA
2             JAM,MAGGI,BREAD,MILK
3                MAGGI,TEA,BISCUIT
4              BREAD,TEA,BOURNVITA
..                             ...
145          BREAD,TEA,COFFEE,MILK
146              BREAD,COFFEE,COCK
147         BREAD,TEA,COFFEE,SUGER
148                 BREAD,MILK,TEA
149              BREAD,COFFEE,COCK

[150 rows x 1 columns]


In [20]:
# Function to preprocess data
def preprocess_data(data):
    # Get the column name
    column_name = data.columns[0]
    print(f"Step 1: Original Data\n{data}\n")
    
    # Split the transactions into separate items
    transactions = data[column_name].str.split(',')
    print(f"Step 2: Transactions Split into Items\n{transactions}\n")
    
    # Initialize TransactionEncoder
    te = TransactionEncoder()
    print(f"Step 3: TransactionEncoder Object\n{te}\n")
        
    # Fit and transform data with TransactionEncoder
    te_ary = te.fit(transactions).transform(transactions)
    print(f"Step 4: Transformed Data\n{te_ary}\n")
    
    # Convert transformed data into a pandas DataFrame
    df = pd.DataFrame(te_ary, columns=te.columns_)
    # print(f"Step 5: Final DataFrame\n{df}\n")
    
    return df

# Preprocess the data
df = preprocess_data(data)

# Print the preprocessed data
print(f"Preprocessed Data:\n{df}")


Step 1: Original Data
                MILK,BREAD,BISCUIT
0    BREAD,MILK,BISCUIT,CORNFLAKES
1              BREAD,TEA,BOURNVITA
2             JAM,MAGGI,BREAD,MILK
3                MAGGI,TEA,BISCUIT
4              BREAD,TEA,BOURNVITA
..                             ...
145          BREAD,TEA,COFFEE,MILK
146              BREAD,COFFEE,COCK
147         BREAD,TEA,COFFEE,SUGER
148                 BREAD,MILK,TEA
149              BREAD,COFFEE,COCK

[150 rows x 1 columns]

Step 2: Transactions Split into Items
0      [BREAD, MILK, BISCUIT, CORNFLAKES]
1                 [BREAD, TEA, BOURNVITA]
2               [JAM, MAGGI, BREAD, MILK]
3                   [MAGGI, TEA, BISCUIT]
4                 [BREAD, TEA, BOURNVITA]
                      ...                
145            [BREAD, TEA, COFFEE, MILK]
146                 [BREAD, COFFEE, COCK]
147           [BREAD, TEA, COFFEE, SUGER]
148                    [BREAD, MILK, TEA]
149                 [BREAD, COFFEE, COCK]
Name: MILK,BREAD,BISCUIT, Length:

In [25]:
# Step 3: Apply the Apriori algorithm to find frequent itemsets
frequent_itemsets = apriori(df, min_support=0.1, use_colnames=True)
print(frequent_itemsets)


     support                     itemsets
0   0.313333                    (BISCUIT)
1   0.126667                  (BOURNVITA)
2   0.640000                      (BREAD)
3   0.253333                       (COCK)
4   0.686667                     (COFFEE)
5   0.166667                 (CORNFLAKES)
6   0.386667                       (MILK)
7   0.506667                      (SUGER)
8   0.573333                        (TEA)
9   0.146667              (BISCUIT, COCK)
10  0.220000            (BISCUIT, COFFEE)
11  0.120000        (CORNFLAKES, BISCUIT)
12  0.140000             (SUGER, BISCUIT)
13  0.153333               (BISCUIT, TEA)
14  0.113333             (TEA, BOURNVITA)
15  0.140000                (BREAD, COCK)
16  0.406667              (BREAD, COFFEE)
17  0.273333                (BREAD, MILK)
18  0.280000               (SUGER, BREAD)
19  0.340000                 (TEA, BREAD)
20  0.253333               (COFFEE, COCK)
21  0.120000         (CORNFLAKES, COFFEE)
22  0.226667               (MILK, 

In [23]:
# Step 4: Generate association rules from the frequent itemsets
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
print(rules.to_string())


             antecedents           consequents  antecedent support  consequent support   support  confidence      lift  leverage  conviction  zhangs_metric
0              (BISCUIT)                (COCK)            0.313333            0.253333  0.146667    0.468085  1.847704  0.067289    1.403733       0.668138
1                 (COCK)             (BISCUIT)            0.253333            0.313333  0.146667    0.578947  1.847704  0.067289    1.630833       0.614448
2              (BISCUIT)              (COFFEE)            0.313333            0.686667  0.220000    0.702128  1.022516  0.004844    1.051905       0.032068
3               (COFFEE)             (BISCUIT)            0.686667            0.313333  0.220000    0.320388  1.022516  0.004844    1.010381       0.070277
4           (CORNFLAKES)             (BISCUIT)            0.166667            0.313333  0.120000    0.720000  2.297872  0.067778    2.452381       0.677778
5              (BISCUIT)          (CORNFLAKES)            0.3133