**Loading libraries.**

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

from collections import Counter
from itertools import combinations
import ast


**Set Parameters.**

In [None]:

# --- Reproducibility ---
rng_seed = 42            # seed for the random price generator

# --- Analysis thresholds ---
top_n_items = 15         # how many individual items to chart
min_count_pairs = 20     # minimum support count for a pair to be kept
min_count_triples = 5    # minimum support count for a triple to be kept
top_k = 10               # how many pairs/triples to report

# --- Price range (inclusive lower bound, exclusive upper bound of the uniform draw) ---
price_min = 0.50
price_max = 15.00

# --- Input / output files ---
groc_path = 'groceries.csv'
clean_path = 'transactions_clean.csv'
prices_path = 'product_prices.csv'
transaction_path = 'transactions_priced.csv'



**PART A**


1.   Load groceries.csv
2.   Transform schema to a canonical format
3.   Print a data dictionary
4.   Basic EDA




1. Loading groceries.csv.

In [None]:
# Read the raw basket file with no header row; each CSV field becomes one column.
# NOTE(review): assumes every row has the same number of fields (or is padded) —
# a ragged file would fail to parse with these settings; confirm against groceries.csv.
groc = pd.read_csv(groc_path, header=None)
# Preview the first ten rows.
groc.head(10)


2. Transform schema to a canonical format.

In [None]:
# Use the row position as the transaction identifier.
groc['TransactionID'] = groc.index

# Pick the raw item columns by name instead of by position (`iloc[:, :-1]`):
# the positional slice only works while TransactionID is the last column, so
# re-running this cell after 'Items' exists would leak the integer ID into the baskets.
item_columns = [col for col in groc.columns if col not in ('TransactionID', 'Items')]

# Collapse each row into a list of its non-missing item values.
groc['Items'] = groc[item_columns].apply(lambda row: row.dropna().tolist(), axis=1)

In [None]:
# Keep only the canonical columns; take an explicit copy so later column
# assignments on `groc` write to an independent frame rather than a slice.
groc = groc[['TransactionID', 'Items']].copy()

In [None]:
groc

3. Print a data dictionary.

In [None]:
# One record per column of the canonical table: (name, type, meaning).
data_dict = pd.DataFrame(
    [
        ('TransactionID', 'int', 'Unique identifier'),
        ('Items', 'list of str', 'List of grocery items in that basket'),
    ],
    columns=['Column', 'Type', 'Description'],
)

In [None]:
data_dict

4. Basic EDA.

a. Number of transactions and unique products.

In [None]:
# Number of baskets = number of rows.
transac_number = groc.shape[0]
# Distinct product names across all baskets.
unique_prod = {product for basket in groc['Items'] for product in basket}



In [None]:
unique_prod

b. Basket size distribution (min/median/95th percentile).

In [None]:
# Items per basket, then the three summary statistics requested above.
size_basket = groc['Items'].map(len)
min_size, median_size = size_basket.min(), size_basket.median()
size_basket_95th_percentile = size_basket.quantile(0.95)

In [None]:
min_size

In [None]:
median_size

In [None]:
size_basket_95th_percentile

c. Top 20 products by frequency.

In [None]:
# Tally every item occurrence basket by basket.
count_prod = Counter()
for basket in groc['Items']:
    count_prod.update(basket)

# The 20 most frequent products as (item, count) pairs.
top_20 = count_prod.most_common(20)

In [None]:
# One "product: frequency" line per entry, most frequent first.
for product, frequency in top_20:
    print(f"{product}: {frequency}")

**PART B**


1. Standardize item names (lowercase, strip whitespace; optional: replace spaces with underscores).
2. Remove empty/invalid items; drop baskets with fewer than 2 items.
3. Create a canonical transactions table with columns: transaction ID, items (list of strings),
basket size. Persist as `transactions_clean.csv`.


1. Standardize item names. Remove empty/invalid items.

In [None]:
def standardize_row(row):
    """Return the cleaned item names for one transaction row.

    Reads ``row['Items']`` and keeps only entries that are strings with
    non-whitespace content. Each kept name is stripped, lower-cased, and has
    its spaces replaced by underscores. Input order is preserved.
    """
    return [
        raw.strip().lower().replace(' ', '_')
        for raw in row['Items']
        if isinstance(raw, str) and raw.strip()
    ]




In [None]:
# Replace each basket with its cleaned item list (standardize_row is applied row by row).
groc['Items'] = groc.apply(standardize_row, axis=1)


Printing the result to check.


In [None]:
groc

 2. Drop baskets with fewer than 2 items.

In [None]:
# Keep baskets with at least two items. The explicit copy detaches the result
# from the unfiltered frame, so adding columns afterwards does not raise
# SettingWithCopyWarning.
groc = groc[groc['Items'].map(len) >= 2].copy()

**Explanation**:

Empty or invalid items (blank strings or missing values) were removed to ensure data consistency.  
Transactions with fewer than 2 items were dropped because they cannot contribute to pairwise or triple co-occurrence analysis (no possible combinations).  
This filtering step keeps only meaningful baskets for association rule mining.


In [None]:
groc

3. Create a canonical transactions table with columns.

In [None]:
# Number of items in each cleaned basket.
groc['Basket_Size'] = groc['Items'].map(len)

In [None]:
# Renumber rows after filtering and make TransactionID match the new 0..n-1 index.
groc = (
    groc
    .reset_index(drop=True)
    .assign(TransactionID=lambda frame: frame.index)
)

In [None]:
groc

4. Persist as `transactions_clean.csv`.

In [None]:
# Write the cleaned table without the index column.
# The list-valued Items column is stored as text and is parsed back with ast.literal_eval after reload.
groc.to_csv(clean_path, index=False)

Load transactions_clean.csv.

In [None]:
# Reload the persisted clean transactions; all later parts work on `fd`.
fd = pd.read_csv(clean_path)

In [None]:
fd

**PART C**

1. Create a product-level price map: assign each unique product a random unit price in a
reasonable range (e.g., $0.50–$15.00) using a fixed random seed for reproducibility. Save as `product_prices.csv` with columns product, price.
2. Compute basket totals by summing unit prices for items in each transaction (assume quantity=
1 unless specified).
3. Add a basket total column to the transactions table and export as `transactions_priced.csv`.


1. Convert Items from string form (loaded from CSV) back to a real Python list.

In [None]:
# After the CSV round-trip each basket is the string form of a list.
# Parse cell by cell rather than checking only the first row: this works on an
# empty frame (no `iloc[0]` IndexError) and leaves already-parsed lists untouched
# if the cell is re-run.
fd['Items'] = fd['Items'].apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
)

2. Assign each unique product a random unit price in a reasonable range.

In [None]:
# Alphabetically ordered product list, so the price assignment is reproducible.
unique_items = sorted({product for basket in fd['Items'] for product in basket})

# One uniform draw per product from a generator seeded with rng_seed.
price_rng = np.random.default_rng(rng_seed)
random_prices = price_rng.uniform(price_min, price_max, len(unique_items))


In [None]:
unique_items

In [None]:
random_prices

3. Save as `product_prices.csv`.

In [None]:
# Product/price table with prices rounded to cents.
price = pd.DataFrame({
    'item': unique_items,
    'price': np.round(random_prices, 2),
})


In [None]:
price

In [None]:
# The task asks for columns `product, price`; the in-memory frame names the
# first column 'item', so rename it on export only (the frame itself is unchanged).
price.rename(columns={'item': 'product'}).to_csv(prices_path, index=False)

4. Create a product-level price map.

In [None]:
# Lookup table: product name -> unit price.
price_map = dict(zip(price['item'], price['price']))


In [None]:
price_map

5. Compute basket totals.

In [None]:
def matching_prices(items, prices=None):
    """Return the total price of a basket, rounded to cents.

    Parameters
    ----------
    items : iterable of str
        Product names in the basket; each occurrence counts as quantity 1.
    prices : mapping of str -> float, optional
        Unit price per product. Defaults to the notebook-level ``price_map``.

    Returns
    -------
    float
        Sum of the unit prices, rounded to 2 decimals so that binary
        floating-point accumulation (e.g. 0.1 + 0.2 + 0.3) does not leak
        into the currency totals.

    Raises
    ------
    KeyError
        If an item has no entry in the price mapping.
    """
    if prices is None:
        prices = price_map
    return round(sum(prices[i] for i in items), 2)



6. Add a basket total column.

In [None]:
# Price every basket by summing the unit prices of its items (one unit per listed item).
fd['Basket_Total'] = fd['Items'].apply(matching_prices)

7. Export as `transactions_priced.csv`.

In [None]:
# Write the priced transactions table without the index column.
fd.to_csv(transaction_path, index=False)


**PART D**

1. Count pairs and triples of items that occur in the same basket. Define support count (number
of baskets containing the itemset).
2. Make min count configurable (default 20). Return all pairs/triples meeting the threshold.
3. Compute top-k pairs and top-k triples by frequency (default k=10), with ties broken deterministically
(alphabetical).
4. Report both support count and support fraction for each itemset.

1. Set a Counter.

In [None]:
# Empty tallies keyed by item tuples: one for 2-item sets, one for 3-item sets.
pair_count, triple_count = Counter(), Counter()

2. Count pairs and triples of items that occur in the same basket.

In [None]:
# Start from empty tallies so re-running this cell does not double-count baskets.
pair_count.clear()
triple_count.clear()

for items in fd['Items']:
    # De-duplicate and sort so each itemset has one canonical tuple and is
    # counted at most once per basket (support count). A local name is used so
    # the notebook-level `unique_items` product list is not overwritten.
    basket_items = sorted(set(items))
    pair_count.update(combinations(basket_items, 2))
    # combinations() yields nothing for baskets with fewer than 3 items.
    triple_count.update(combinations(basket_items, 3))

3. Return all pairs meeting the threshold.

In [None]:
# One row per observed pair with its support count.
pairs = pd.DataFrame(pair_count.items(), columns=['Pairs', 'Count'])
# Keep pairs meeting the threshold. The explicit copy matches how `triples` is
# built and lets columns be added later without SettingWithCopyWarning.
pairs = pairs[pairs['Count'] >= min_count_pairs].copy()

In [None]:
pairs

4. Return all triples meeting the threshold.

In [None]:
# One row per observed triple with its support count, filtered to the threshold.
triples = (
    pd.DataFrame(triple_count.items(), columns=['Tripleset', 'Count'])
    .loc[lambda frame: frame['Count'] >= min_count_triples]
    .copy()
)

In [None]:
triples

5. Support Fraction for pairs and triples.

In [None]:
# Support fraction = support count / total number of baskets.
baskets_number = fd.shape[0]
pairs['Support_Fraction'] = pairs['Count'].div(baskets_number)
triples['Support_Fraction'] = triples['Count'].div(baskets_number)

6. Compute top-k pairs and top-k triples by frequency.

In [None]:
# Highest count first; ties broken alphabetically by the pair tuple. Keep the first top_k rows.
pairs_ranked = pairs.sort_values(by=['Count', 'Pairs'], ascending=[False, True])
pairs = pairs_ranked.head(top_k)

In [None]:
pairs

In [None]:
# Highest count first; ties broken alphabetically by the triple tuple. Keep the first top_k rows.
triples_ranked = triples.sort_values(by=['Count', 'Tripleset'], ascending=[False, True])
triples = triples_ranked.head(top_k)

In [None]:
triples

**PART E**
1. Bar chart of the top 15 individual items by frequency.
2. Bar chart of top-k pairs by support fraction.
3. Heatmap of a co-occurrence matrix for the 25 most frequent items.
4. Distribution plot of basket size and basket total (histogram or ECDF).

Recompute clean summaries from fd (post-clean, post-pricing).

In [None]:
# Item frequencies over the cleaned baskets.
clean_counts = Counter()
for basket in fd['Items']:
    clean_counts.update(basket)

# The top_n_items most frequent products as a two-column table.
top_items_df = pd.DataFrame(
    clean_counts.most_common(top_n_items),
    columns=['Item', 'Count'],
)

# Items per basket after cleaning.
size_basket_clean = fd['Items'].str.len()

Adding percentage columns.

In [None]:
# Express support as a percentage for the charts (column name is not a valid
# keyword, hence the dict unpacking).
pairs = pairs.assign(**{'Support_%': pairs['Support_Fraction'].mul(100)})
triples = triples.assign(**{'Support_%': triples['Support_Fraction'].mul(100)})


1. Bar chart of the top 15 individual items by frequency.

In [None]:
# Horizontal bar chart of the most frequent individual products.
plt.figure(figsize=(10,5))
sns.barplot(y='Item', x='Count', data=top_items_df)
# Take the number from the config value so the title always matches the data shown.
plt.title(f"Top {top_n_items} Most Frequent Products")
plt.xlabel("Frequency")
plt.ylabel("Product")
plt.tight_layout()
plt.show()

2. Bar chart of top-k pairs by support fraction.

In [None]:
# Readable "a, b" labels for the y-axis.
pair_labels = pairs['Pairs'].apply(lambda combo: ', '.join(combo))

fig, ax = plt.subplots(figsize=(10,5))
sns.barplot(x='Support_%', y=pair_labels, data=pairs, ax=ax)
ax.set_title(f'Top {top_k} Item Pairs by Support')
ax.set_xlabel('Support (%)')
ax.set_ylabel('Item Pair')
fig.tight_layout()
plt.show()

Bar chart of top-k triples by support fraction.

In [None]:
# Readable "a, b, c" labels for the y-axis.
triple_labels = triples['Tripleset'].apply(lambda combo: ', '.join(combo))

fig, ax = plt.subplots(figsize=(10,5))
sns.barplot(x='Support_%', y=triple_labels, data=triples, ax=ax)
ax.set_title(f'Top {top_k} Triples by Support')
ax.set_xlabel('Support (%)')
ax.set_ylabel('Triples')
fig.tight_layout()
plt.show()

3. Heatmap of a co-occurrence matrix for the 25 most frequent items.

In [None]:
# The 25 most frequent products define the rows and columns of the matrix.
top25_products = [item for item, _ in clean_counts.most_common(25)]
# Set for O(1) membership checks inside the loop (the list is kept for axis order).
top25_lookup = set(top25_products)

co_matrix = pd.DataFrame(0, index=top25_products, columns=top25_products, dtype=int)
for items in fd['Items']:
    # Distinct top-25 products in this basket; a separate name avoids rebinding
    # the loop variable.
    basket_top = sorted(top25_lookup.intersection(items))
    for a, b in combinations(basket_top, 2):
        # Symmetric matrix: count the pair in both directions. `.at` is the
        # scalar accessor and avoids the overhead of `.loc` for single cells.
        co_matrix.at[a, b] += 1
        co_matrix.at[b, a] += 1

plt.figure(figsize=(10,8))
sns.heatmap(co_matrix, square=True)
plt.title('Co-occurrence Heatmap (Top 25 Products)')
plt.xlabel('Product'); plt.ylabel('Product')
plt.tight_layout()
plt.show()

4. Distribution plot of basket size.

In [None]:
# Histogram of the number of items per cleaned basket.
fig, ax = plt.subplots(figsize=(6,4))
ax.hist(size_basket_clean, bins=20, edgecolor='black')
ax.set_title("Distribution of Basket Sizes")
ax.set_xlabel("Basket Size")
ax.set_ylabel("Frequency")
fig.tight_layout()
plt.show()

Distribution plot of basket total.

In [None]:
# Histogram of the monetary total per basket.
fig, ax = plt.subplots(figsize=(6,4))
sns.histplot(fd['Basket_Total'], bins=30, ax=ax)
ax.set_title("Distribution of Basket Totals ($)")
ax.set_xlabel("Total Basket Value ($)")
ax.set_ylabel("Frequency")
fig.tight_layout()
plt.show()