# 02. Data Preprocessing

Prepare the data for the Apriori algorithm by converting it into a transaction format (one-hot encoded).

In [2]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
import os

In [3]:
# Location of the raw orders CSV, relative to the current working directory.
data_path = os.path.join(
    "..",
    "data",
    "raw",
    "grocery_orders.csv",
)

# Read the raw orders into a DataFrame, then preview the first five rows.
df = pd.read_csv(filepath_or_buffer=data_path)
df.head(n=5)

Unnamed: 0,Member_number,Date,itemDescription
0,1000,2014-06-24,whole milk
1,1000,2014-06-24,pastry
2,1000,2014-06-24,salty snack
3,1000,2015-03-15,sausage
4,1000,2015-03-15,whole milk


## 1. Create Transactions
Group by `Member_number` and `Date`, treating all items a member bought on the same day as a single transaction.

In [4]:
# Each (member, date) group becomes one basket: a plain list of the item
# descriptions recorded for that member on that day, in group-key order.
transactions = [
    list(items)
    for _, items in df.groupby(['Member_number', 'Date'])['itemDescription']
]

# Quick sanity check: basket count and the contents of the first basket.
print(f"Total transactions: {len(transactions)}")
print(f"Sample transaction: {transactions[0]}")

Total transactions: 14963
Sample transaction: ['whole milk', 'pastry', 'salty snack']


## 2. One-Hot Encoding

In [5]:
# Fit the encoder on the baskets and transform them into an array with one
# row per transaction and one column per distinct item.
te = TransactionEncoder()
te_ary = te.fit_transform(transactions)

# Wrap the array in a DataFrame whose columns are the item names learned
# by the encoder, then preview the first five rows.
df_encoded = pd.DataFrame(data=te_ary, columns=te.columns_)
df_encoded.head(n=5)

Unnamed: 0,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,beverages,bottled beer,...,uht-milk,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


## 3. Save Processed Data

In [8]:
# Destination for the encoded matrix, relative to the current working directory.
output_path = os.path.join("..", "data", "processed", "grocery_encoded.csv")

# Create the target folder first: to_csv does not create missing parent
# directories, so a fresh checkout without data/processed would fail here.
# exist_ok=True makes this a no-op when the folder is already present.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# index=False leaves the row index out of the file, so it contains only
# the item columns.
df_encoded.to_csv(output_path, index=False)
print(f"Saved to {output_path}")

Saved to ..\data\processed\grocery_encoded.csv
