__import re__

* Used for string cleaning and normalization.


__import csv__

* Allows reading and writing CSV files in plain Python.
* Save cleaned transactions row by row or read CSVs without pandas.

__import os__

* Provides tools to interact with the filesystem.

_Used for_

* Creating directories (os.makedirs)

* Joining file paths (os.path.join)

* Checking if directories exist

__from collections import Counter, defaultdict__

* `Counter`: counts the frequency of each item in a list or other iterable.

__defaultdict:__

* Like a normal dictionary but provides default values automatically for missing keys.

__from typing import List, Tuple, Dict, Any, Optional__

* Provides type hints to make code more readable and help with debugging.

In [178]:
import re                      
import csv                      
import os                      
from collections import Counter, defaultdict
from typing import List, Tuple, Dict, Any, Optional

import pandas as pd           
from mlxtend.preprocessing import TransactionEncoder  


This block sets all paths, parameters, and directories needed for the preprocessing pipeline so that your code can read the raw CSV, clean it, and save the outputs safely.

In [179]:
# Directory that receives every CSV written by this notebook.
EXPORT_DIR = "preprocessed_outputs"
# Use the local CSV in the repository by default. Change this to an absolute path if needed.
LOAD_PATH = "supermarket_transactions.csv"
# Inclusive bounds on the number of items per transaction; these are the
# default limits applied by clean_transactions().
MIN_ITEMS_PER_TX = 2
MAX_ITEMS_PER_TX = 7
# File names (created inside EXPORT_DIR) for the exported CSVs.
OHE_CSV_NAME = "one_hot_transactions.csv"
CLEAN_CSV_NAME = "clean_transactions.csv"
SUMMARY_CSV_NAME = "preprocessing_summary.csv"

# Create the export directory up front; exist_ok=True makes re-running the cell harmless.
os.makedirs(EXPORT_DIR, exist_ok=True)


_str(item)_  : ensures the input is a string (handles None or numbers).

_.strip()_  : removes spaces at the beginning or end (" Milk " → "Milk").

_.lower()_ :  everything to lowercase ("Milk" → "milk").

_re.sub(pattern, replacement, string)_ : replaces all characters matching the pattern with a space.

_\s+_ : matches one or more whitespace characters

Replaces them with a single space

_s.replace_ : "&" to " and "


In [180]:
def normalize_item_name(item: str) -> str:
    """Return a canonical, lowercase form of an item name.

    Steps, in order:
      1. Convert to ``str`` (so ``None`` or numbers do not raise), trim and
         lowercase.
      2. Replace the separators ``. - _ / \\ ( )`` with a space.
      3. Expand ``&`` to the word ``and``.
      4. Collapse every run of whitespace into a single space and trim.

    Args:
        item: Raw item name; any object is accepted and stringified.

    Returns:
        The normalised name, possibly an empty string.
    """
    s = str(item).strip().lower()
    s = re.sub(r"[\.\-_/\\\(\)]", " ", s)  # separators become spaces
    # Expand "&" before collapsing whitespace: the replacement itself adds
    # spaces, so collapsing first would leave "a & b" as "a  and  b".
    s = s.replace("&", " and ")
    s = re.sub(r"\s+", " ", s)
    return s.strip()

__Example use__

In [181]:
# Demo: normalise two sample baskets and print each one before and after.
# This rebinds the notebook-level name `transactions`.
transactions = [
    ["Fish", "Salt", "Sugar", "Cereal", "Soap", "Toothpaste"],
    ["Shampoo", "Oranges", "Potatoes"]
]
for tx in transactions:
    cleaned_tx = [normalize_item_name(item) for item in tx]
    print(f"Original: {tx}")
    print(f"Cleaned : {cleaned_tx}")
    # NOTE(review): "" * 40 is the empty string, so this prints a blank line;
    # a visible separator such as "-" * 40 may have been intended.
    print("" * 40)

Original: ['Fish', 'Salt', 'Sugar', 'Cereal', 'Soap', 'Toothpaste']
Cleaned : ['fish', 'salt', 'sugar', 'cereal', 'soap', 'toothpaste']

Original: ['Shampoo', 'Oranges', 'Potatoes']
Cleaned : ['shampoo', 'oranges', 'potatoes']



This ensures no empty items or None values remain in a single transaction.

In [182]:
def clean_transaction(tx: List[str]) -> List[str]:
    """Normalise the items of one transaction and drop the empty ones.

    Falsy entries (``None``, ``""``) are skipped before normalisation, and
    entries whose normalised name is empty are skipped afterwards.

    Args:
        tx: Raw item names of a single transaction.

    Returns:
        A new list of normalised, non-empty item names in their original order.
    """
    cleaned = []
    for it in tx:
        if not it:
            continue
        # Normalise once and reuse the result for both the emptiness check
        # and the output value.
        name = normalize_item_name(it)
        if name:
            cleaned.append(name)
    return cleaned

_min items_ / _max items_: transactions with fewer than `min_items` (default 2) or more than `max_items` (default 7) items are filtered out.

transactions outside the allowed item range are dropped.

only transactions with an acceptable number of items are kept.

In [183]:
def clean_transactions(transactions: List[List[str]],
                       min_items: int = MIN_ITEMS_PER_TX,
                       max_items: int = MAX_ITEMS_PER_TX) -> List[List[str]]:
    """Clean every transaction and keep only those of an acceptable size.

    Args:
        transactions: Raw transactions, each a list of item names.
        min_items: Smallest number of cleaned items a transaction may have.
        max_items: Largest number of cleaned items a transaction may have.

    Returns:
        The cleaned transactions whose length lies in
        ``[min_items, max_items]`` (both bounds inclusive), in input order.
    """
    candidates = (clean_transaction(raw) for raw in transactions)
    return [
        basket
        for basket in candidates
        if min_items <= len(basket) <= max_items
    ]

__Sorts the items in alphabetical order:__

In [184]:
def sort_items_in_transactions(transactions: List[List[str]]) -> List[List[str]]:
    """Return a copy of the transactions with each item list sorted ascending.

    The input lists are left untouched; the order of the transactions
    themselves is preserved.
    """
    ordered = []
    for basket in transactions:
        items = list(basket)
        items.sort()
        ordered.append(items)
    return ordered

In [185]:
# Demo: sort the items inside three sample baskets and print each result.
# This rebinds the notebook-level name `transactions`.
transactions = [
    ["Milk", "Bread", "Eggs"],
    ["Juice", "Soap", "Tea"],
    ["Potatoes", "Coffee", "Onions"]
]

sorted_tx = sort_items_in_transactions(transactions)
for tx in sorted_tx:
    print(tx)

['Bread', 'Eggs', 'Milk']
['Juice', 'Soap', 'Tea']
['Coffee', 'Onions', 'Potatoes']


`TransactionEncoder()` comes from mlxtend.

It converts transactions into a boolean array  for each unique item.

`fit(transactions)` finds all unique items in your dataset.

`transform(transactions)` creates a 2D array where:

Rows = transactions

Columns = unique items

Value = True if the item exists in that transaction, else False.

In [186]:
def one_hot_encode_transactions(transactions: List[List[str]]) -> pd.DataFrame:
    """Build the 0/1 item-membership table used as input for Apriori.

    Args:
        transactions: Transactions, each a list of item names.

    Returns:
        A DataFrame with one row per transaction and one column per item
        reported by the fitted ``TransactionEncoder``; a cell is 1 when the
        transaction contains the item and 0 otherwise.
    """
    encoder = TransactionEncoder()
    encoder.fit(transactions)
    membership = encoder.transform(transactions)
    frame = pd.DataFrame(membership, columns=encoder.columns_)
    # The encoder yields booleans; cast so the table holds 1/0.
    return frame.astype(int)

In [187]:
# Demo: one-hot encode three small baskets and print the resulting table.
# NOTE(review): this rebinds the notebook-level names `transactions` and
# `ohe_df`; later cells that only check whether `ohe_df` exists will pick up
# this 3-row demo table unless the pipeline has been run afterwards.
transactions = [
    ['milk', 'bread', 'eggs'],
    ['juice', 'soap'],
    ['potatoes', 'coffee', 'onions']
]
ohe_df = one_hot_encode_transactions(transactions)
print(ohe_df)

   bread  coffee  eggs  juice  milk  onions  potatoes  soap
0      1       0     1      0     1       0         0     0
1      0       0     0      1     0       0         0     1
2      0       1     0      0     0       1         1     0


__1 → the transaction contains that item.__

__0 → the transaction does not contain that item__

In [188]:
def check_data_quality(transactions: List[List[str]]) -> Dict[str, Any]:
    """Compute descriptive statistics for a list of transactions.

    Args:
        transactions: Transactions, each a list of item names.

    Returns:
        A dict with these keys:
          - ``num_transactions``: number of transactions.
          - ``avg_tx_length`` / ``min_tx_length`` / ``max_tx_length``:
            mean, smallest and largest transaction size (all 0 when the
            input is empty).
          - ``num_unique_items``: number of distinct items.
          - ``unique_items_sample``: the first 30 distinct items in sorted
            order.
          - ``top_20_items``: up to 20 ``(item, count)`` pairs as returned by
            ``Counter.most_common``.
          - ``item_counts``: mapping of every item to its occurrence count.
    """
    tx_count = len(transactions)
    sizes = list(map(len, transactions))

    if tx_count:
        mean_size = sum(sizes) / tx_count
        shortest, longest = min(sizes), max(sizes)
    else:
        # No transactions: report zeros instead of dividing by zero.
        mean_size = shortest = longest = 0

    # Count items in encounter order so ties in most_common() keep that order.
    frequencies = Counter(item for basket in transactions for item in basket)
    distinct = sorted(frequencies)

    return {
        "num_transactions": tx_count,
        "avg_tx_length": mean_size,
        "min_tx_length": shortest,
        "max_tx_length": longest,
        "num_unique_items": len(distinct),
        "unique_items_sample": distinct[:30],
        "top_20_items": frequencies.most_common(20),
        "item_counts": dict(frequencies),
    }
# Demo: summarise the sample `transactions` currently bound in the notebook.
# The bare name on the last line makes the notebook display the dict.
# This binds the notebook-level name `summary_stats`.
summary_stats = check_data_quality(transactions)
summary_stats

{'num_transactions': 3,
 'avg_tx_length': 2.6666666666666665,
 'min_tx_length': 2,
 'max_tx_length': 3,
 'num_unique_items': 8,
 'unique_items_sample': ['bread',
  'coffee',
  'eggs',
  'juice',
  'milk',
  'onions',
  'potatoes',
  'soap'],
 'top_20_items': [('milk', 1),
  ('bread', 1),
  ('eggs', 1),
  ('juice', 1),
  ('soap', 1),
  ('potatoes', 1),
  ('coffee', 1),
  ('onions', 1)],
 'item_counts': {'milk': 1,
  'bread': 1,
  'eggs': 1,
  'juice': 1,
  'soap': 1,
  'potatoes': 1,
  'coffee': 1,
  'onions': 1}}

In [189]:
def save_clean_transactions_csv(df: pd.DataFrame, export_dir: str, filename: str):
    """Write the cleaned-transactions frame to ``export_dir/filename`` as CSV.

    The directory is created when missing, the index is not written, and the
    destination path is printed afterwards.
    """
    target = os.path.join(export_dir, filename)
    os.makedirs(export_dir, exist_ok=True)
    df.to_csv(target, index=False)
    print(f"Cleaned transactions CSV saved to: {target}")

In [190]:
def save_ohe_csv(ohe_df: pd.DataFrame, export_dir: str, filename: str):
    """Write the one-hot encoded frame to ``export_dir/filename`` as CSV.

    The directory is created when missing, the index is not written, and the
    destination path is printed afterwards.
    """
    target = os.path.join(export_dir, filename)
    os.makedirs(export_dir, exist_ok=True)
    ohe_df.to_csv(target, index=False)
    print(f"One-hot encoded CSV saved to: {target}")

In [191]:
def save_summary_csv(summary: Dict[str, Any], export_dir: str, filename: str):
    """Write a readable CSV built from the summary dict.

    The five scalar statistics form a one-row frame (missing keys default to
    0). When ``top_20_items`` is present and non-empty, its pairs are placed
    alongside as ``item`` / ``count`` columns, so only the first row carries
    the scalar values and the remaining rows are empty in those columns.

    Args:
        summary: Summary dict, e.g. as produced by ``check_data_quality``.
        export_dir: Output directory; created when missing.
        filename: Name of the CSV file inside ``export_dir``.
    """
    os.makedirs(export_dir, exist_ok=True)

    scalar_keys = (
        "num_transactions",
        "avg_tx_length",
        "min_tx_length",
        "max_tx_length",
        "num_unique_items",
    )
    scalars = pd.DataFrame([{key: summary.get(key, 0) for key in scalar_keys}])

    ranked = summary.get("top_20_items", [])
    if not ranked:
        df_summary = scalars
    else:
        # Side-by-side concatenation: stats in row 0, one top item per row.
        ranking = pd.DataFrame(ranked, columns=["item", "count"])
        df_summary = pd.concat([scalars, ranking], axis=1)

    target = os.path.join(export_dir, filename)
    df_summary.to_csv(target, index=False)
    print(f"Summary CSV saved to: {target}")


In [192]:
# Demo: build the one-row scalar summary frame from the `summary_stats`
# currently bound in the notebook (missing keys default to 0).
flat = {
        "num_transactions": summary_stats.get("num_transactions", 0),
        "avg_tx_length": summary_stats.get("avg_tx_length", 0),
        "min_tx_length": summary_stats.get("min_tx_length", 0),
        "max_tx_length": summary_stats.get("max_tx_length", 0),
        "num_unique_items": summary_stats.get("num_unique_items", 0)
    }
df_flat = pd.DataFrame([flat])

In [193]:
# Build and display a readable summary DataFrame if `summary_stats` exists
# NOTE(review): `display` is not imported in the visible cells; presumably it
# is the helper a Jupyter/IPython session provides. Outside such a session the
# call raises NameError too, and the handler below would then report the
# wrong cause.
try:
    # One-row frame with the scalar statistics (missing keys default to 0).
    flat = {
        "num_transactions": summary_stats.get("num_transactions", 0),
        "avg_tx_length": summary_stats.get("avg_tx_length", 0),
        "min_tx_length": summary_stats.get("min_tx_length", 0),
        "max_tx_length": summary_stats.get("max_tx_length", 0),
        "num_unique_items": summary_stats.get("num_unique_items", 0)
    }
    df_flat = pd.DataFrame([flat])
    top_items = summary_stats.get("top_20_items", [])
    if top_items:
        # Column-wise concat: the stats stay in row 0, the top items fill
        # one row each, so the stats columns are empty from row 1 onwards.
        df_top = pd.DataFrame(top_items, columns=["item", "count"])
        df_summary = pd.concat([df_flat, df_top], axis=1)
    else:
        df_summary = df_flat
    display(df_summary)
except NameError:
    print("summary_stats is not defined. Run the preprocessing pipeline first.")


Unnamed: 0,num_transactions,avg_tx_length,min_tx_length,max_tx_length,num_unique_items,item,count
0,3.0,2.666667,2.0,3.0,8.0,milk,1
1,,,,,,bread,1
2,,,,,,eggs,1
3,,,,,,juice,1
4,,,,,,soap,1
5,,,,,,potatoes,1
6,,,,,,coffee,1
7,,,,,,onions,1


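`num_transactions`	Total number of transactions in your dataset. Only the first row holds this value: the one-row summary statistics are concatenated side by side with the top-items table, so the remaining rows are empty (NaN) in the statistics columns. For the full dataset this is 5000 transactions (the small demo table above shows 3).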

`avg_tx_length`	Average number of items per transaction. Here 4.4904 items on average.

`min_tx_length`	Smallest transaction length (number of items). Here 2.0 items.

`max_tx_length`	Largest transaction length (number of items). Here 7.0 items.

`num_unique_items`	Total number of unique items across all transactions. Here 30. 

`item`	_Name of the item_. This comes from the list of the top 20 most frequent items; each row holds one item.

`count`	How many times that item appears across all transactions. Example: "soap" occurs 788 times.

In [194]:
def preprocess_pipeline(LOAD_PATH: Optional[str] = None) -> Tuple[pd.DataFrame, pd.DataFrame, Dict[str, Any]]:
    """Load the raw CSV and run cleaning, sorting, encoding and summarising.

    The last column of the CSV is read as the transaction column; each cell
    is split on commas into item names.

    Args:
        LOAD_PATH: Path of the CSV to read. When omitted (``None``) the
            module-level ``LOAD_PATH`` constant is used.

    Returns:
        A tuple ``(df_cleaned, ohe_df, summary_stats)``:
          - ``df_cleaned``: one column ``cleaned_items`` holding each kept
            transaction as a ``"; "``-joined string of sorted items.
          - ``ohe_df``: the one-hot encoded transactions.
          - ``summary_stats``: the dict returned by ``check_data_quality``.

    Raises:
        ValueError: If no path is given and no module-level ``LOAD_PATH``
            is defined. Errors raised by ``pd.read_csv`` (for example a
            missing file) propagate unchanged.
    """
    # The parameter shadows the module-level constant of the same name, so
    # the default has to be looked up through globals().
    csv_path = LOAD_PATH if LOAD_PATH is not None else globals().get("LOAD_PATH")
    if csv_path is None:
        raise ValueError("No CSV path given and no module-level LOAD_PATH is defined.")

    df_original = pd.read_csv(csv_path)
    # Drop missing cells first: str(NaN) would otherwise become a literal
    # "nan" item in a one-item transaction.
    item_column = df_original.iloc[:, -1].dropna()
    transactions_raw = [str(cell).split(",") for cell in item_column]

    cleaned_tx = clean_transactions(transactions_raw)
    sorted_tx = sort_items_in_transactions(cleaned_tx)

    df_cleaned = pd.DataFrame({"cleaned_items": ["; ".join(tx) for tx in sorted_tx]})

    ohe_df = one_hot_encode_transactions(sorted_tx)
    summary_stats = check_data_quality(sorted_tx)

    return df_cleaned, ohe_df, summary_stats


In [195]:
# Ensure preprocessing has been executed before saving outputs
# NOTE(review): the guard only tests that the three names are bound. The demo
# cells earlier in the notebook also bind `ohe_df` and `summary_stats`, so if
# `df_cleaned` survives from an earlier run while a demo cell was re-run, the
# demo values would be saved here. Call preprocess_pipeline(LOAD_PATH)
# directly when in doubt.
try:
    df_cleaned, ohe_df, summary_stats  # check if already present
except NameError:
    df_cleaned, ohe_df, summary_stats = preprocess_pipeline(LOAD_PATH)

# Write the three result files into EXPORT_DIR.
save_clean_transactions_csv(df_cleaned, EXPORT_DIR, CLEAN_CSV_NAME)
save_ohe_csv(ohe_df, EXPORT_DIR, OHE_CSV_NAME)
save_summary_csv(summary_stats, EXPORT_DIR, SUMMARY_CSV_NAME)


Cleaned transactions CSV saved to: preprocessed_outputs\clean_transactions.csv
One-hot encoded CSV saved to: preprocessed_outputs\one_hot_transactions.csv
Summary CSV saved to: preprocessed_outputs\preprocessing_summary.csv


In [196]:
# Entry point: run the whole pipeline on LOAD_PATH (rebinding the
# notebook-level results) and print a short report.
if __name__ == "__main__":
    df_cleaned, ohe_df, summary_stats = preprocess_pipeline(LOAD_PATH)
    
    print(f"Number of transactions: {summary_stats['num_transactions']}")
    # 'unique_items_sample' holds at most 30 names; show the first 10.
    print(f"Unique items (sample): {summary_stats['unique_items_sample'][:10]}")
    print("Top 10 items (item, count):")
    # 'top_20_items' is a list of (item, count) pairs; show the first 10.
    print(summary_stats["top_20_items"][:10])


Number of transactions: 5000
Unique items (sample): ['apples', 'bananas', 'beef', 'bread', 'butter', 'carrots', 'cereal', 'cheese', 'chicken', 'coffee']
Top 10 items (item, count):
[('soap', 788), ('bread', 787), ('salt', 786), ('toothpaste', 780), ('fish', 778), ('bananas', 776), ('shampoo', 773), ('tomatoes', 767), ('coffee', 762), ('flour', 761)]


**Apriori — generate & export frequent itemsets**

This cell runs the Apriori algorithm on the one-hot-encoded transactions (`ohe_df`) and saves the top 10 frequent itemsets.

- **Purpose:** find frequent item combinations using `mlxtend.frequent_patterns.apriori` with `min_support=0.05`.
- **Input:** the one-hot encoded DataFrame `ohe_df`. If `ohe_df` is missing the cell will call `preprocess_pipeline(LOAD_PATH)` to create it.
- **Output:** a `frequent_itemsets` DataFrame containing `itemsets` (frozensets) and `support` (fraction of transactions). The cell also creates `itemset_str` (readable string) for display and export.
- **Display:** shows the top 10 itemsets sorted by `support` (readable `itemset_str` and `support`).
- **Export:** writes `preprocessed_outputs/top10_itemsets.csv` with columns `itemset_str` and `support`.

Notes / quick tips:
- To restrict to itemsets of size ≥ 2: `frequent_itemsets[frequent_itemsets['itemsets'].apply(lambda s: len(s) >= 2)]`.
- To generate association rules afterwards, use:
  `from mlxtend.frequent_patterns import association_rules`
  `rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.6)` and filter by `lift` or `confidence` as needed.
- If you change `LOAD_PATH`, ensure it points to the correct CSV (absolute path or `supermarket_transactions.csv` in the repo).

Run instructions: execute preprocessing cells first (or run this cell alone — it will call the preprocessing pipeline automatically if `ohe_df` is not defined).


In [197]:
# Apriori: generate frequent itemsets and export top 10
#
# Reads from the notebook namespace: `ohe_df` (one-hot encoded transactions),
# `EXPORT_DIR`, `LOAD_PATH` and `preprocess_pipeline`.
# Binds: `frequent_itemsets`, `frequent_itemsets_sorted`, `top10_itemsets`
# and `export_path`.
from mlxtend.frequent_patterns import apriori
import os

# Ensure the preprocessing pipeline has been run and `ohe_df` exists.
# NOTE(review): this only checks that the name is bound. The one-hot demo
# cell binds `ohe_df` as well, so running this cell right after that demo
# would mine the 3-row sample instead of the real data.
try:
    ohe_df  # noqa: F821
except NameError:
    df_cleaned, ohe_df, summary_stats = preprocess_pipeline(LOAD_PATH)

# Run apriori to get frequent itemsets with minimum support 0.05
frequent_itemsets = apriori(ohe_df, min_support=0.05, use_colnames=True)

# Add readable itemset string for display/export; sorting the members makes
# the string independent of the set's iteration order.
frequent_itemsets['itemset_str'] = frequent_itemsets['itemsets'].apply(lambda s: ', '.join(sorted(s)))

# Sort by support and select top 10
frequent_itemsets_sorted = frequent_itemsets.sort_values(by='support', ascending=False).reset_index(drop=True)
top10_itemsets = frequent_itemsets_sorted.head(10)

# Display top 10 itemsets (itemset string and support)
# NOTE(review): `display` is not imported here; presumably supplied by the
# Jupyter/IPython session.
print(f"Total frequent itemsets found: {len(frequent_itemsets)}")
display(top10_itemsets[['itemset_str', 'support']])

# Export the top 10 itemsets to CSV
os.makedirs(EXPORT_DIR, exist_ok=True)
export_path = os.path.join(EXPORT_DIR, 'top10_itemsets.csv')
top10_itemsets[['itemset_str', 'support']].to_csv(export_path, index=False)
print(f"Top 10 itemsets exported to: {export_path}")


Total frequent itemsets found: 30




Unnamed: 0,itemset_str,support
0,soap,0.1576
1,bread,0.1574
2,salt,0.1572
3,toothpaste,0.156
4,fish,0.1556
5,bananas,0.1552
6,shampoo,0.1546
7,tomatoes,0.1534
8,coffee,0.1524
9,flour,0.1522


Top 10 itemsets exported to: preprocessed_outputs\top10_itemsets.csv
