In [1]:
import pandas as pd
import numpy as np
import networkx as nx

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth, fpmax, association_rules

import matplotlib as mpl
import matplotlib.pyplot as plt
from itertools import chain

In [3]:
# Load the raw grocery transactions. The path is relative, so the CSV must be
# in the notebook's working directory.
all_data = pd.read_csv('groceries - groceries.csv')
# Bare last expression: lets the notebook render a preview of the frame.
all_data

Unnamed: 0,Item(s),Item 1,Item 2,Item 3,Item 4,Item 5,Item 6,Item 7,Item 8,Item 9,...,Item 23,Item 24,Item 25,Item 26,Item 27,Item 28,Item 29,Item 30,Item 31,Item 32
0,4,citrus fruit,semi-finished bread,margarine,ready soups,,,,,,...,,,,,,,,,,
1,3,tropical fruit,yogurt,coffee,,,,,,,...,,,,,,,,,,
2,1,whole milk,,,,,,,,,...,,,,,,,,,,
3,4,pip fruit,yogurt,cream cheese,meat spreads,,,,,,...,,,,,,,,,,
4,4,other vegetables,whole milk,condensed milk,long life bakery product,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9830,17,sausage,chicken,beef,hamburger meat,citrus fruit,grapes,root vegetables,whole milk,butter,...,,,,,,,,,,
9831,1,cooking chocolate,,,,,,,,,...,,,,,,,,,,
9832,10,chicken,citrus fruit,other vegetables,butter,yogurt,frozen dessert,domestic eggs,rolls/buns,rum,...,,,,,,,,,,
9833,4,semi-finished bread,bottled water,soda,bottled beer,,,,,,...,,,,,,,,,,


In [7]:
# Reduce every row of the frame to a plain list of its item names.
# basket[1:] drops the first column, and the isinstance check keeps only
# string cells, so non-string values (e.g. the NaN that pandas uses for empty
# cells) are discarded.
np_data = [
    [item for item in basket[1:] if isinstance(item, str)]
    for basket in all_data.to_numpy()
]

In [5]:
# Collect every distinct item name that occurs in any transaction by
# flattening the list of per-transaction lists into a single set.
unique_items = set(chain.from_iterable(np_data))

In [10]:
# Show the distinct item names. A set has no defined order, so the order of
# the printed names is arbitrary.
print(unique_items)

{'abrasive cleaner', 'soap', 'skin care', 'liver loaf', 'prosecco', 'cream', 'kitchen utensil', 'sugar', 'fruit/vegetable juice', 'packaged fruit/vegetables', 'zwieback', 'male cosmetics', 'flower (seeds)', 'canned vegetables', 'fish', 'ketchup', 'red/blush wine', 'chewing gum', 'decalcifier', 'cream cheese', 'organic products', 'honey', 'softener', 'brown bread', 'liqueur', 'mayonnaise', 'long life bakery product', 'salty snack', 'pudding powder', 'margarine', 'dessert', 'yogurt', 'herbs', 'curd cheese', 'turkey', 'detergent', 'hard cheese', 'other vegetables', 'brandy', 'ready soups', 'tropical fruit', 'organic sausage', 'baby food', 'baby cosmetics', 'berries', 'pastry', 'hamburger meat', 'pet care', 'salad dressing', 'cereals', 'rubbing alcohol', 'light bulbs', 'toilet cleaner', 'meat', 'misc. beverages', 'rolls/buns', 'mustard', 'pasta', 'bags', 'napkins', 'baking powder', 'syrup', 'dental care', 'house keeping products', 'Instant food products', 'cooking chocolate', 'spices', 'di

In [8]:
# Number of distinct item names across all transactions.
len(unique_items)

169

In [11]:
# Encode the transaction lists as a table with one column per item label
# learned by the encoder (te.columns_), then wrap the encoded array in a
# DataFrame so the frequent-pattern functions can consume it.
te = TransactionEncoder()
te_ary = te.fit_transform(np_data)
data = pd.DataFrame(te_ary, columns=te.columns_)

In [12]:
# Mine frequent itemsets with FP-Growth. min_support=0.03 is the minimum
# support an itemset needs in order to be returned; use_colnames = True
# reports itemsets by the column labels of `data` rather than column indices.
result = fpgrowth(data, min_support=0.03, use_colnames = True)
# Bare last expression: renders the support/itemsets table.
result

Unnamed: 0,support,itemsets
0,0.082766,(citrus fruit)
1,0.058566,(margarine)
2,0.139502,(yogurt)
3,0.104931,(tropical fruit)
4,0.058058,(coffee)
...,...,...
58,0.033249,"(whole milk, pastry)"
59,0.047382,"(other vegetables, root vegetables)"
60,0.048907,"(whole milk, root vegetables)"
61,0.030605,"(sausage, rolls/buns)"


In [13]:
def min_max_support(result):
    """Return the smallest and largest support seen for each itemset size.

    Parameters
    ----------
    result : pandas.DataFrame
        Frame with a "support" column and an "itemsets" column whose values
        are sized collections (anything ``len()`` accepts). The two columns
        are looked up by name, so additional columns and the column order do
        not matter.

    Returns
    -------
    dict
        Maps each itemset size (``len`` of the itemset) to a dict of the form
        ``{"min": <lowest support>, "max": <highest support>}``. Sizes appear
        in order of first occurrence; an empty frame yields an empty dict.

    Raises
    ------
    KeyError
        If ``result`` lacks a "support" or an "itemsets" column.
    """
    support_stats = {}
    # Read the two columns by name instead of unpacking whole rows by
    # position: positional unpacking breaks as soon as the frame carries an
    # extra column or lists the columns in a different order.
    for support, itemsets in zip(result["support"], result["itemsets"]):
        set_size = len(itemsets)
        # First itemset of a given size seeds both bounds with its support.
        stats = support_stats.setdefault(set_size, {"min": support, "max": support})
        stats["min"] = min(support, stats["min"])
        stats["max"] = max(support, stats["max"])
    return support_stats

In [15]:
# The dict returned by min_max_support is keyed by itemset size, so those
# sizes become columns; .T flips the frame so each size is a row with "min"
# and "max" columns.
pd.DataFrame(min_max_support(result)).T

Unnamed: 0,min,max
1,0.030402,0.255516
2,0.030097,0.074835


In [16]:
# Mine itemsets with fpmax (per the mlxtend docs it returns maximal frequent
# itemsets) at min_support=0.03, ordered from highest to lowest support.
# Note: this rebinds `result`, replacing the fpgrowth output assigned earlier.
result = (
    fpmax(data, min_support=0.03, use_colnames=True)
    .sort_values('support', ascending=False)
)
# Bare last expression: renders the sorted support/itemsets table.
result

Unnamed: 0,support,itemsets
35,0.098526,(shopping bags)
31,0.080529,(bottled beer)
30,0.079817,(newspapers)
29,0.077682,(canned beer)
49,0.074835,"(other vegetables, whole milk)"
27,0.072293,(fruit/vegetable juice)
25,0.06487,(brown bread)
24,0.063447,(domestic eggs)
23,0.058973,(frankfurter)
22,0.058566,(margarine)


In [18]:
# Per-size min/max support for whatever `result` currently holds (the fpmax
# output when cells are run top to bottom). .T puts itemset sizes on the rows
# and "min"/"max" on the columns.
pd.DataFrame(min_max_support(result)).T

Unnamed: 0,min,max
1,0.030402,0.098526
2,0.030097,0.074835
