In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv(r"D:\Ultimate Programming\Data Bases\Groceries_dataset.csv")

In [3]:
df.head(3)

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit


In [4]:
df['itemDescription']

0               tropical fruit
1                   whole milk
2                    pip fruit
3             other vegetables
4                   whole milk
                 ...          
38760            sliced cheese
38761                    candy
38762                 cake bar
38763    fruit/vegetable juice
38764                 cat food
Name: itemDescription, Length: 38765, dtype: object

In [6]:
items = df['itemDescription'].value_counts().reset_index()
items.columns = ['item', 'counts']

In [8]:
items.head(3)

Unnamed: 0,item,counts
0,whole milk,2502
1,other vegetables,1898
2,rolls/buns,1716


In [12]:
x = items.item
x.head(2)

0          whole milk
1    other vegetables
Name: item, dtype: object

## Transaction Encoder

In [14]:
from mlxtend.preprocessing import TransactionEncoder

In [15]:
tr = TransactionEncoder()
tr.fit(x)

In [17]:
te_x = pd.DataFrame(tr.transform(x), columns=tr.columns_)
te_x.head(3)

Unnamed: 0,Unnamed: 1,(,),-,.,/,H,I,T,U,...,p,q,r,s,t,u,v,w,y,z
0,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1,True,False,False,False,False,False,False,False,False,False,...,False,False,True,True,True,False,True,False,False,False
2,False,False,False,False,False,True,False,False,False,False,...,False,False,True,True,False,True,False,False,False,False


In [18]:
te_x.shape

(167, 35)

In [24]:
from mlxtend.frequent_patterns import apriori

ap = apriori(te_x, min_support=0.004, use_colnames=True).sort_values(by='support')

In [25]:
ap

Unnamed: 0,support,itemsets
94281,0.005988,"(m, b, e, i, h, r)"
640913,0.005988,"(l, n, r, b, , t, y, c, u, i, k)"
640914,0.005988,"(l, r, b, p, , t, c, u, i, o, k)"
640915,0.005988,"(l, r, b, p, , t, y, c, i, o, k)"
640916,0.005988,"(l, r, b, p, , y, c, u, i, o, k)"
...,...,...
0,0.592814,( )
28,0.604790,(s)
10,0.622754,(a)
27,0.634731,(r)


## FP Growth Algorithm

In [26]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from mlxtend.frequent_patterns import apriori, association_rules

# ---------------------------------------------
# 2. Load Dataset
# ---------------------------------------------
df = pd.read_csv(r"D:\\Ultimate Programming\\Data Bases\\Groceries_dataset.csv")
df.head()

# ---------------------------------------------
# 3. Check Item Column
# ---------------------------------------------
df['itemDescription'].head()

# ---------------------------------------------
# 4. Prepare Dataset for Apriori
# ---------------------------------------------

# We need basket format → each transaction = list of items
basket = df.groupby(['Member_number', 'Date'])['itemDescription'].apply(list).reset_index()

# ---------------------------------------------
# 5. Convert to One-Hot Encoding
# ---------------------------------------------
# Create a dummy variable for each item
all_items = sorted(set(df['itemDescription']))

def encode_transaction(items):
    return pd.Series([1 if item in items else 0 for item in all_items], index=all_items)

df_hot_encoded = basket['itemDescription'].apply(encode_transaction)
df_hot_encoded.head()

# ---------------------------------------------
# 6. Apriori Algorithm (use_colnames=True → item names)
# ---------------------------------------------
frequent_itemsets = apriori(df_hot_encoded, 
                            min_support=0.004, 
                            use_colnames=True)

frequent_itemsets = frequent_itemsets.sort_values(by='support', ascending=False)
frequent_itemsets.head()

# ---------------------------------------------
# 7. Generate Association Rules
# ---------------------------------------------
rules = association_rules(frequent_itemsets, 
                          metric="lift", 
                          min_threshold=1)

rules = rules.sort_values(by='lift', ascending=False)
rules.head()

# ---------------------------------------------
# 8. Show Top Rules
# ---------------------------------------------
print("Top Association Rules:")
rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head(10)


Top Association Rules:


Unnamed: 0,antecedents,consequents,support,confidence,lift
4,(frankfurter),(other vegetables),0.005146,0.136283,1.11615
5,(other vegetables),(frankfurter),0.005146,0.042146,1.11615
3,(yogurt),(sausage),0.005748,0.066926,1.108986
2,(sausage),(yogurt),0.005748,0.095238,1.108986
0,(soda),(sausage),0.005948,0.061253,1.014975
1,(sausage),(soda),0.005948,0.09856,1.014975
6,(citrus fruit),(yogurt),0.004611,0.086792,1.010642
7,(yogurt),(citrus fruit),0.004611,0.053696,1.010642


In [27]:
basket = df.groupby(['Member_number', 'Date'])['itemDescription'].apply(list).reset_index()

In [28]:
basket

Unnamed: 0,Member_number,Date,itemDescription
0,1000,15-03-2015,"[sausage, whole milk, semi-finished bread, yog..."
1,1000,24-06-2014,"[whole milk, pastry, salty snack]"
2,1000,24-07-2015,"[canned beer, misc. beverages]"
3,1000,25-11-2015,"[sausage, hygiene articles]"
4,1000,27-05-2015,"[soda, pickled vegetables]"
...,...,...,...
14958,4999,24-01-2015,"[tropical fruit, berries, other vegetables, yo..."
14959,4999,26-12-2015,"[bottled water, herbs]"
14960,5000,09-03-2014,"[fruit/vegetable juice, onions]"
14961,5000,10-02-2015,"[soda, root vegetables, semi-finished bread]"
