## Import Libraries

In [39]:

import sys
import os

import numpy as np
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth, association_rules

sys.path.append(os.path.abspath('..'))
from utils.db import get_db

In [41]:
engine = get_db()

## Gather Data

select sale_date, sale_id, product_code, product_description, qty, total_sales, sub_department_description,
sale_type, item_ring_type
from sales
where store_number='440'
and sale_date='5/15/2025'
and sale_type in ('Refunded', 'Sale')
and item_ring_type in ('ITEM')

In [145]:
query = """select sale_date, sale_id, product_code, product_description, qty, total_sales, sub_department_description,
sale_type, item_ring_type
from sales
where store_number='440'
and sale_date between '5/10/2025' and '5/15/2025'
and sale_type in ('Refunded', 'Sale')
and item_ring_type in ('ITEM')"""

In [None]:
#ds = pd.read_csv('../data/local_daily_basket_analysis_for_items_purchased_together.csv')

In [146]:
ds = pd.read_sql(query,engine)

2025-06-11 16:03:08,205 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-06-11 16:03:08,205 INFO sqlalchemy.engine.Engine SELECT pg_catalog.pg_class.relname 
FROM pg_catalog.pg_class JOIN pg_catalog.pg_namespace ON pg_catalog.pg_namespace.oid = pg_catalog.pg_class.relnamespace 
WHERE pg_catalog.pg_class.relname = %(table_name)s AND pg_catalog.pg_class.relkind = ANY (ARRAY[%(param_1)s, %(param_2)s, %(param_3)s, %(param_4)s, %(param_5)s]) AND pg_catalog.pg_table_is_visible(pg_catalog.pg_class.oid) AND pg_catalog.pg_namespace.nspname != %(nspname_1)s
2025-06-11 16:03:08,205 INFO sqlalchemy.engine.Engine [cached since 1.143e+04s ago] {'table_name': "select sale_date, sale_id, product_code, product_description, qty, total_sales, sub_department_description,\nsale_type, item_ring_type\nfrom sales\nwhere store_number='440'\nand sale_date between '5/10/2025' and '5/15/2025'\nand sale_type in ('Refunded', 'Sale')\nand item_ring_type in ('ITEM')", 'param_1': 'r', 'param_2': 'p', 'param_3': 'f'

In [147]:
ds.head()

Unnamed: 0,sale_date,sale_id,product_code,product_description,qty,total_sales,sub_department_description,sale_type,item_ring_type
0,2025-05-15,374576,467.0,ACE HARDWARE,3.0,0.57,Hillman Fasteners,Sale,ITEM
1,2025-05-15,106319,4011.0,BANANA LB,1.0,1.81,Wic Produce,Sale,ITEM
2,2025-05-10,102508,2.0,UNLEADED,1.0,54.65,Unleaded,Sale,ITEM
3,2025-05-10,102505,2.0,UNLEADED,1.0,23.4,Unleaded,Sale,ITEM
4,2025-05-10,102501,2.0,UNLEADED,1.0,20.1,Unleaded,Sale,ITEM


In [148]:
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41030 entries, 0 to 41029
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   sale_date                   41030 non-null  datetime64[ns]
 1   sale_id                     41030 non-null  int64         
 2   product_code                41030 non-null  object        
 3   product_description         41030 non-null  object        
 4   qty                         41030 non-null  float64       
 5   total_sales                 41030 non-null  float64       
 6   sub_department_description  41030 non-null  object        
 7   sale_type                   41030 non-null  object        
 8   item_ring_type              41030 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(5)
memory usage: 2.8+ MB


## Clean Data

In [149]:
# lets filter this down to just items
ds = ds[ds['item_ring_type'] == 'ITEM']

In [150]:
#now lets get rid of any empty products
ds.dropna(subset=['product_description', 'sale_id'], inplace=True)

In [151]:
# convert sale_id to string
ds['sale_id'] = ds['sale_id'].astype(str)

In [152]:
transactions = ds.groupby('sale_id')['product_description'].apply(list)

In [153]:
transactions.head()

sale_id
102481     [PREMIUM]
102482    [UNLEADED]
102483    [UNLEADED]
102484    [UNLEADED]
102486    [UNLEADED]
Name: product_description, dtype: object

In [154]:
# filter out single item transactions as these wont apply
transactions_filtered = transactions[transactions.apply(lambda x: len(x) > 1)]
transaction_list = transactions_filtered.tolist()


In [155]:
len(transaction_list)

4808

In [156]:
transaction_list[:-20]

[['KEB ZESTA SALTINES 16 OZ',
  'BST CH GRD A LRG 18P EGG 1.5 DZ',
  'DAISY 4% CTGE CHS/PINAPL 6 OZ',
  'MC PURE VANILLA EXTRACT 1 OZ',
  'VEL SHRP CHDR SNGLS 12 OZ',
  'LEAN GROUND BEEF LB',
  'LAYS LIGHTLY SALTED 7.75 OZ',
  'LEAN GROUND BEEF LB',
  'SUMMER SAUSAGE W/JALAPENO AND CHEESE',
  'BANANA LB',
  'APPLE WA HONEYCRISP LB',
  'DAISY 4% CTGE CHS/PINAPL 6 OZ',
  'DAISY PEACH COTT CHS 6 OZ',
  'DAISY PEACH COTT CHS 6 OZ',
  'CARD MDAY ANYONE',
  'PRAIRIE FARM 2% MILK GL 128 OZ',
  'BUNNY HONEY WHEAT 20 OZ'],
 ['YOU ARE MY SUNSHINE BQT BQT', 'COUNTRY GIRL CHARM FLOWER BQT'],
 ['BRKSTN RICOTTA CHEESE 15 OZ',
  'BRKSTN RICOTTA CHEESE 15 OZ',
  'SCOPE MW OUTLAST FRESH MINT 16.9 OZ',
  'PRAIRIE FARM WHOLE MILK 128 OZ',
  'MICK CHOC TRUFFLE MINT EACH',
  'DARK CHOC TRUFFLE EACH',
  'BORDEN HEAVY WHIPNG CREAM 16 OZ',
  'BORDEN HEAVY WHIPNG CREAM 16 OZ'],
 ['VARIETY DOZEN DONUT',
  'MISC FLORAL GIFTWARE',
  'MISC FLORAL GIFTWARE',
  'CHOCOLATE CHIP COOKIE',
  'MISC BALLOON ORDER'],
 ['AI

## Preprocessing

In [157]:
# one-hot encode the transactions
te = TransactionEncoder()
te_array = te.fit(transaction_list).transform(transaction_list)
ds_encoded = pd.DataFrame(te_array, columns=te.columns_)

In [158]:
ds_encoded.head()

Unnamed: 0,BUTTERFLY PORK CHOPS LB,FRYER LIVERS LB,FRYER THIGHS LB,FRYER WHOLE LB,FRYER WINGS LB,SMOKED PORK HOCKS LB,1/16 CAKE EACH,1/2 FRESH APPLE PIE,1/2 SLAB RIB COLD EACH,1/4 WATERMELON LB,...,ZIP FREEZER BAG GALLON 28 CT,ZIP FREEZER BAG GL 14 CT,ZIP FRZ BAG GAL 16 CT,ZIP SNDWICH BAG 40 CT,ZIP STORAGE BAG QT 24 CT,ZIPL SNDWCH BAG 50 CT,ZOOM LIZARD 6 IN BLACK BLUE 9 PK,ZSGR RC COLA 2LTR 67.62 OZ,ZYRTEC 10MG TABLETS 5 CT,ZZZQUIL ULTR DOX TAB 12 CT
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


## Create Itemsets

In [159]:
frequent_itemsets = fpgrowth(ds_encoded, min_support=0.002, use_colnames=True)

In [160]:
frequent_itemsets.tail()

Unnamed: 0,support,itemsets
668,0.00208,"(GALA APPLES 3 LB, BANANA LB)"
669,0.00208,"(STRAWBERRIES 1 LB, DESSERT SHELLS 6 CT)"
670,0.002496,"(SQUASH YELLOW LB, SQUASH ZUCCHINI LB)"
671,0.002288,"(JJ BERRY FRT SNK PIE 4 OZ, JJ PEACH SNACK PIE..."
672,0.00208,"(BANANA LB, LEWIS 1/2 LOAF NUTTY OAT 12 OZ)"


In [161]:
fi = frequent_itemsets.sort_values(by='support', ascending=False)
fi.tail(20)

Unnamed: 0,support,itemsets
368,0.00208,(CILANTRO LB)
206,0.00208,(TENN PRIDE MILD SAUSAGE 1 LB)
205,0.00208,(LAYS BBQ PRTY SZ 12.5 OZ)
71,0.00208,(BORDN HALF AND HALF 32 OZ)
156,0.00208,(MTLF MSHD POTATO EACH)
604,0.00208,"(DAISY SOUR CREAM 16 OZ, BANANA LB)"
341,0.00208,(BOB EVANS SAUS ROLLS 1LB)
350,0.00208,(BST CH BACON BITS REAL 3 OZ)
333,0.00208,(BST CH TORTILLA SOFT TACO 6 IN 10 CT 10)
642,0.00208,"(MISC HOT FOODS EACH, TENDERS 1 LB)"


## Create rules

In [164]:
rules = association_rules(frequent_itemsets,metric="lift", min_threshold=0.5)
print(rules.sort_values(by='lift', ascending=False).head(10))

                                      antecedents  \
125                            (MG POTTING MIX S)   
124                 (MG POTTING MIX SOIL 2 CF EA)   
224               (GM HONEY NUT CHEERIOS 10.8 OZ)   
225  (CPN: Buy 2 Save $2 on GM Cereals 5/14-5/20)   
238                   (JJ BERRY FRT SNK PIE 4 OZ)   
239                     (JJ PEACH SNACK PIE 4 OZ)   
236                            (SQUASH YELLOW LB)   
237                          (SQUASH ZUCCHINI LB)   
175                            (PEPPER YELLOW EA)   
174                            (PEPPER ORANGE EA)   

                                      consequents  antecedent support  \
125                 (MG POTTING MIX SOIL 2 CF EA)            0.002288   
124                            (MG POTTING MIX S)            0.002288   
224  (CPN: Buy 2 Save $2 on GM Cereals 5/14-5/20)            0.004576   
225               (GM HONEY NUT CHEERIOS 10.8 OZ)            0.002496   
238                     (JJ PEACH SNACK PIE 4 OZ)  

In [165]:
def format_rule(row):
    antecedent = ' + '.join(sorted(row['antecedents']))
    consequent = ', '.join(sorted(row['consequents']))
    return f"If a customer buys {antecedent}, they also buy {consequent} (Lift: {row['lift']:.2f}, Confidence: {row['confidence']:.0%})"


In [166]:
rules['rule_summary'] = rules.apply(format_rule, axis=1)

In [167]:
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski,rule_summary
0,(BUNNY HONEY WHEAT 20 OZ),(BANANA LB),0.015391,0.085067,0.002496,0.162162,1.906297,1.0,0.001187,1.092017,0.482855,0.025478,0.084264,0.095751,"If a customer buys BUNNY HONEY WHEAT 20 OZ, th..."
1,(BANANA LB),(BUNNY HONEY WHEAT 20 OZ),0.085067,0.015391,0.002496,0.02934,1.906297,1.0,0.001187,1.01437,0.519626,0.025478,0.014167,0.095751,"If a customer buys BANANA LB, they also buy BU..."
2,(CARD MDAY ANYONE),(FRESH ARRANGEMENT OR STEMS),0.010815,0.018303,0.00208,0.192308,10.506993,1.0,0.001882,1.215435,0.914718,0.076923,0.177249,0.152972,"If a customer buys CARD MDAY ANYONE, they also..."
3,(FRESH ARRANGEMENT OR STEMS),(CARD MDAY ANYONE),0.018303,0.010815,0.00208,0.113636,10.506993,1.0,0.001882,1.116003,0.921695,0.076923,0.103945,0.152972,"If a customer buys FRESH ARRANGEMENT OR STEMS,..."
4,(APPLE WA HONEYCRISP LB),(BANANA LB),0.006864,0.085067,0.002288,0.333333,3.9185,1.0,0.001704,1.3724,0.749948,0.025522,0.27135,0.180114,"If a customer buys APPLE WA HONEYCRISP LB, the..."


In [169]:
rules.to_csv('../data/association_rules.csv', index=False)

In [168]:
print(rules[['rule_summary', 'lift']].sort_values(by='lift', ascending=False).head(20))

                                          rule_summary        lift
125  If a customer buys MG POTTING MIX S, they also...  437.090909
124  If a customer buys MG POTTING MIX SOIL 2 CF EA...  437.090909
224  If a customer buys GM HONEY NUT CHEERIOS 10.8 ...  182.121212
225  If a customer buys CPN: Buy 2 Save $2 on GM Ce...  182.121212
238  If a customer buys JJ BERRY FRT SNK PIE 4 OZ, ...  135.610256
239  If a customer buys JJ PEACH SNACK PIE 4 OZ, th...  135.610256
236  If a customer buys SQUASH YELLOW LB, they also...   75.125000
237  If a customer buys SQUASH ZUCCHINI LB, they al...   75.125000
175  If a customer buys PEPPER YELLOW EA, they also...   57.612200
174  If a customer buys PEPPER ORANGE EA, they also...   57.612200
102  If a customer buys DORITOS COOL RANCH 9.25 OZ,...   45.695868
103  If a customer buys DORITOS NACHO CHEESE 9.25 O...   45.695868
173  If a customer buys PEPPER YELLOW EA, they also...   42.737778
172  If a customer buys PEPPER  RED EA, they also b...   42.73

In [170]:
sorted_rules = rules[['rule_summary', 'lift', 'confidence']].sort_values(by='lift', ascending=False)

In [171]:
sorted_rules.to_csv('../data/sorted_association_rules.csv', index=False)