In [40]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

In [29]:
# Read the basket matrix from CSV; the file's unnamed first column becomes the row index,
# leaving one boolean column per item.
df = pd.read_csv('basket_analysis.csv', index_col='Unnamed: 0')
print(f"Loaded {df.shape[0]} transactions with {df.shape[1]} items")
df.head()

Loaded 999 transactions with 16 items


Unnamed: 0,Apple,Bread,Butter,Cheese,Corn,Dill,Eggs,Ice cream,Kidney Beans,Milk,Garlic,Onion,Sugar,Broccoli,Yogurt,Chocolate
0,False,True,False,False,True,True,False,True,False,False,False,False,True,False,True,True
1,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False
2,True,False,True,False,False,True,False,True,False,True,False,False,False,False,True,True
3,False,False,True,True,False,True,False,False,False,True,True,True,False,False,False,False
4,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [None]:
# Run the Apriori algorithm with a minimum support of 0.1 (itemsets present in at least
# 10% of transactions); use_colnames=True reports items by column name rather than by
# column position.
frequent_itemsets = apriori(df, min_support=0.1, use_colnames=True)
# Record how many items each itemset contains. `len` is passed directly; wrapping it in
# a lambda adds nothing.
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)

print(f"Found {len(frequent_itemsets)} frequent itemsets")


Found 169 frequent itemsets


In [31]:
# Inspect the itemsets table (columns: support, itemsets, length); the notebook renders a
# truncated view of it.
frequent_itemsets

Unnamed: 0,support,itemsets,length
0,0.383383,(Apple),1
1,0.384384,(Bread),1
2,0.420420,(Butter),1
3,0.404404,(Cheese),1
4,0.407407,(Corn),1
...,...,...,...
164,0.101101,"(Chocolate, Garlic, Ice cream)",3
165,0.101101,"(Chocolate, Onion, Ice cream)",3
166,0.100100,"(Garlic, Milk, Kidney Beans)",3
167,0.101101,"(Yogurt, Garlic, Kidney Beans)",3


In [25]:
# Rank the one-item itemsets from highest to lowest support
print("Single Items:")
single_items = (
    frequent_itemsets
    .loc[frequent_itemsets['length'].eq(1)]
    .sort_values(by='support', ascending=False)
)
single_items

Single Items:


Unnamed: 0,support,itemsets,length
15,0.421421,(Chocolate),1
2,0.42042,(Butter),1
14,0.42042,(Yogurt),1
7,0.41041,(Ice cream),1
12,0.409409,(Sugar),1
8,0.408408,(Kidney Beans),1
4,0.407407,(Corn),1
9,0.405405,(Milk),1
3,0.404404,(Cheese),1
11,0.403403,(Onion),1


In [None]:
# Print the ten most frequent single items as a numbered list with support in percent.
print("MOST FREQUENT ITEMS:")
# enumerate(..., start=1) supplies the 1-based rank; iterrows() yields
# (index label, row Series) pairs and the index label is not needed here.
for rank, (_, row) in enumerate(single_items.head(10).iterrows(), start=1):
    # 'itemsets' holds a frozenset; these rows were filtered to length == 1, so pull out
    # the only element directly instead of building a list first.
    item = next(iter(row['itemsets']))

    print(f"{rank}. {item} - {row['support']*100:.1f}%")

MOST FREQUENT ITEMS:
1. Chocolate - 42.1%
2. Butter - 42.0%
3. Yogurt - 42.0%
4. Ice cream - 41.0%
5. Sugar - 40.9%
6. Kidney Beans - 40.8%
7. Corn - 40.7%
8. Milk - 40.5%
9. Cheese - 40.4%
10. Onion - 40.3%


In [33]:
# Show the ten most frequent pairs, highest support first
print("MOST BOUGHT TOGETHER:")
pairs = frequent_itemsets[frequent_itemsets['length'] == 2].sort_values('support', ascending=False)

for rank, (_, row) in enumerate(pairs.head(10).iterrows(), start=1):
    # Itemsets are frozensets, whose iteration order for strings can change between
    # kernel sessions (string hash randomization). Sorting the items keeps the printed
    # label stable across runs; the elements are then joined with ' + '.
    items = ' + '.join(sorted(row['itemsets']))
    print(f"{rank}. {items} - {row['support']*100:.1f}%")


MOST BOUGHT TOGETHER:
1. Chocolate + Milk - 21.1%
2. Butter + Ice cream - 20.7%
3. Chocolate + Ice cream - 20.2%
4. Butter + Chocolate - 20.2%
5. Butter + Kidney Beans - 20.2%
6. Cheese + Kidney Beans - 20.0%
7. Milk + Kidney Beans - 19.9%
8. Dill + Chocolate - 19.9%
9. Chocolate + Yogurt - 19.8%
10. Butter + Milk - 19.8%


In [None]:
# Derive association rules from the frequent itemsets, filtering on the confidence
# metric with a minimum threshold of 0.5
rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.5,
)
print(f"Generated {rules.shape[0]} association rules")

Generated 97 association rules


In [37]:
# Order the rules from highest to lowest confidence and display the resulting table
rules_sorted = rules.sort_values(by='confidence', ascending=False)
print("STRONGEST RULES:")
rules_sorted

STRONGEST RULES:


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
81,"(Dill, Broccoli)",(Chocolate),0.168168,0.421421,0.101101,0.601190,1.426578,0.030231,1.450764,0.359474
75,"(Dill, Milk)",(Chocolate),0.190190,0.421421,0.114114,0.600000,1.423753,0.033964,1.446446,0.367532
43,"(Dill, Cheese)",(Onion),0.177177,0.403403,0.102102,0.576271,1.428523,0.030628,1.407968,0.364570
74,"(Dill, Chocolate)",(Milk),0.199199,0.405405,0.114114,0.572864,1.413065,0.033358,1.392051,0.365033
15,"(Ice cream, Kidney Beans)",(Butter),0.196196,0.420420,0.110110,0.561224,1.334913,0.027625,1.320902,0.312125
...,...,...,...,...,...,...,...,...,...,...
84,"(Chocolate, Ice cream)",(Garlic),0.202202,0.401401,0.101101,0.500000,1.245636,0.019937,1.197197,0.247177
87,"(Chocolate, Ice cream)",(Onion),0.202202,0.403403,0.101101,0.500000,1.239454,0.019532,1.193193,0.242158
11,"(Butter, Kidney Beans)",(Corn),0.202202,0.407407,0.101101,0.500000,1.227273,0.018722,1.185185,0.232120
32,"(Butter, Kidney Beans)",(Garlic),0.202202,0.401401,0.101101,0.500000,1.245636,0.019937,1.197197,0.247177


In [38]:
# Print the ten highest-confidence rules as "antecedent → consequent (confidence%)".
for rank, (_, rule) in enumerate(rules_sorted.head(10).iterrows(), start=1):
    # Antecedents and consequents are frozensets; sort their items so each label is
    # rendered in the same order on every run instead of following set iteration order.
    antecedent = ' + '.join(sorted(rule['antecedents']))
    consequent = ' + '.join(sorted(rule['consequents']))
    print(f"{rank}. {antecedent} → {consequent} ({rule['confidence']*100:.1f}%)")


1. Dill + Broccoli → Chocolate (60.1%)
2. Dill + Milk → Chocolate (60.0%)
3. Dill + Cheese → Onion (57.6%)
4. Dill + Chocolate → Milk (57.3%)
5. Ice cream + Kidney Beans → Butter (56.1%)
6. Dill + Ice cream → Chocolate (55.7%)
7. Cheese + Ice cream → Kidney Beans (55.6%)
8. Onion + Cheese → Dill (55.1%)
9. Butter + Broccoli → Ice cream (54.9%)
10. Garlic + Milk → Kidney Beans (54.9%)


In [39]:
# Summary: dataset dimensions and the number of itemsets / rules mined above
print("SUMMARY:")  # plain literal; it has no placeholders, so no f-prefix is needed
print(f"Transactions: {len(df)}")
print(f"Items: {len(df.columns)}")
print(f"Frequent itemsets: {len(frequent_itemsets)}")
print(f"Association rules: {len(rules)}")


SUMMARY:
Transactions: 999
Items: 16
Frequent itemsets: 169
Association rules: 97
