In [1]:
# Step 0: Install necessary libraries
%pip install pandas mlxtend 


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Step 1: Import necessary libraries
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules


In [3]:
data = pd.read_csv('groceries.csv')
print(data)


                MILK,BREAD,BISCUIT
0    BREAD,MILK,BISCUIT,CORNFLAKES
1              BREAD,TEA,BOURNVITA
2             JAM,MAGGI,BREAD,MILK
3                MAGGI,TEA,BISCUIT
4              BREAD,TEA,BOURNVITA
..                             ...
145          BREAD,TEA,COFFEE,MILK
146              BREAD,COFFEE,COCK
147         BREAD,TEA,COFFEE,SUGER
148                 BREAD,MILK,TEA
149              BREAD,COFFEE,COCK

[150 rows x 1 columns]


In [4]:
# Function to preprocess data
def preprocess_data(data):
    # Get the column name
    column_name = data.columns[0]
    print(f"Step 1: Original Data\n{data}\n")
    
    # Split the transactions into separate items
    transactions = data[column_name].str.split(',')
    print(f"Step 2: Transactions Split into Items\n{transactions}\n")
    
    # Initialize TransactionEncoder
    te = TransactionEncoder()
    print(f"Step 3: TransactionEncoder Object\n{te}\n")
        
    # Fit and transform data with TransactionEncoder
    te_ary = te.fit(transactions).transform(transactions)
    print(f"Step 4: Transformed Data\n{te_ary}\n")
    
    # Convert transformed data into a pandas DataFrame
    df = pd.DataFrame(te_ary, columns=te.columns_)
    # print(f"Step 5: Final DataFrame\n{df}\n")
    
    return df

# Preprocess the data
df = preprocess_data(data)

# Print the preprocessed data
print(f"Preprocessed Data:\n{df}")


Step 1: Original Data
                MILK,BREAD,BISCUIT
0    BREAD,MILK,BISCUIT,CORNFLAKES
1              BREAD,TEA,BOURNVITA
2             JAM,MAGGI,BREAD,MILK
3                MAGGI,TEA,BISCUIT
4              BREAD,TEA,BOURNVITA
..                             ...
145          BREAD,TEA,COFFEE,MILK
146              BREAD,COFFEE,COCK
147         BREAD,TEA,COFFEE,SUGER
148                 BREAD,MILK,TEA
149              BREAD,COFFEE,COCK

[150 rows x 1 columns]

Step 2: Transactions Split into Items
0      [BREAD, MILK, BISCUIT, CORNFLAKES]
1                 [BREAD, TEA, BOURNVITA]
2               [JAM, MAGGI, BREAD, MILK]
3                   [MAGGI, TEA, BISCUIT]
4                 [BREAD, TEA, BOURNVITA]
                      ...                
145            [BREAD, TEA, COFFEE, MILK]
146                 [BREAD, COFFEE, COCK]
147           [BREAD, TEA, COFFEE, SUGER]
148                    [BREAD, MILK, TEA]
149                 [BREAD, COFFEE, COCK]
Name: MILK,BREAD,BISCUIT, Length:

In [5]:
# Step 3: Apply the Apriori algorithm to find frequent itemsets
frequent_itemsets = apriori(df, min_support=0.1, use_colnames=True)
print(frequent_itemsets)


     support                     itemsets
0   0.313333                    (BISCUIT)
1   0.126667                  (BOURNVITA)
2   0.640000                      (BREAD)
3   0.253333                       (COCK)
4   0.686667                     (COFFEE)
5   0.166667                 (CORNFLAKES)
6   0.386667                       (MILK)
7   0.506667                      (SUGER)
8   0.573333                        (TEA)
9   0.146667              (BISCUIT, COCK)
10  0.220000            (BISCUIT, COFFEE)
11  0.120000        (BISCUIT, CORNFLAKES)
12  0.140000             (BISCUIT, SUGER)
13  0.153333               (BISCUIT, TEA)
14  0.113333             (TEA, BOURNVITA)
15  0.140000                (BREAD, COCK)
16  0.406667              (BREAD, COFFEE)
17  0.273333                (BREAD, MILK)
18  0.280000               (BREAD, SUGER)
19  0.340000                 (BREAD, TEA)
20  0.253333               (COFFEE, COCK)
21  0.120000         (COFFEE, CORNFLAKES)
22  0.226667               (COFFEE

In [6]:
# Step 4: Generate association rules from the frequent itemsets
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)

# Simplify and format the output of the association rules
def format_rules(rules):
    simplified_rules = rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']]
    simplified_rules['antecedents'] = simplified_rules['antecedents'].apply(lambda x: ', '.join(list(x)))
    simplified_rules['consequents'] = simplified_rules['consequents'].apply(lambda x: ', '.join(list(x)))
    simplified_rules = simplified_rules.round(2)
    return simplified_rules

formatted_rules = format_rules(rules)
print(formatted_rules.to_string(index=False))


       antecedents        consequents  support  confidence  lift
           BISCUIT               COCK     0.15        0.47  1.85
              COCK            BISCUIT     0.15        0.58  1.85
           BISCUIT             COFFEE     0.22        0.70  1.02
            COFFEE            BISCUIT     0.22        0.32  1.02
           BISCUIT         CORNFLAKES     0.12        0.38  2.30
        CORNFLAKES            BISCUIT     0.12        0.72  2.30
               TEA          BOURNVITA     0.11        0.20  1.56
         BOURNVITA                TEA     0.11        0.89  1.56
             BREAD               MILK     0.27        0.43  1.10
              MILK              BREAD     0.27        0.71  1.10
            COFFEE               COCK     0.25        0.37  1.46
              COCK             COFFEE     0.25        1.00  1.46
            COFFEE         CORNFLAKES     0.12        0.17  1.05
        CORNFLAKES             COFFEE     0.12        0.72  1.05
            COFFEE       

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  simplified_rules['antecedents'] = simplified_rules['antecedents'].apply(lambda x: ', '.join(list(x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  simplified_rules['consequents'] = simplified_rules['consequents'].apply(lambda x: ', '.join(list(x)))
