## Import Libraries

In [1]:

import sys
import os

import numpy as np
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth, association_rules

sys.path.append(os.path.abspath('..'))
from utils.db import get_db

In [2]:
engine = get_db()

## Gather Data

select sale_date, sale_id, product_code, product_description, qty, total_sales, sub_department_description,
sale_type, item_ring_type
from sales
where store_number='440'
and sale_date='5/15/2025'
and sale_type in ('Refunded', 'Sale')
and item_ring_type in ('ITEM')

In [3]:
query = """select sale_date, sale_id, product_code, product_description, qty, total_sales, sub_department_description,
sale_type, item_ring_type
from sales
where store_number='440'
and sale_date between '5/10/2025' and '5/15/2025'
and sale_type in ('Refunded', 'Sale')
and item_ring_type in ('ITEM')"""

In [None]:
#ds = pd.read_csv('../data/local_daily_basket_analysis_for_items_purchased_together.csv')

In [4]:
ds = pd.read_sql(query,engine)

2025-06-12 14:08:59,186 INFO sqlalchemy.engine.Engine select pg_catalog.version()
2025-06-12 14:08:59,186 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-06-12 14:08:59,241 INFO sqlalchemy.engine.Engine select current_schema()
2025-06-12 14:08:59,243 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-06-12 14:08:59,296 INFO sqlalchemy.engine.Engine show standard_conforming_strings
2025-06-12 14:08:59,296 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-06-12 14:08:59,353 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-06-12 14:08:59,353 INFO sqlalchemy.engine.Engine SELECT pg_catalog.pg_class.relname 
FROM pg_catalog.pg_class JOIN pg_catalog.pg_namespace ON pg_catalog.pg_namespace.oid = pg_catalog.pg_class.relnamespace 
WHERE pg_catalog.pg_class.relname = %(table_name)s AND pg_catalog.pg_class.relkind = ANY (ARRAY[%(param_1)s, %(param_2)s, %(param_3)s, %(param_4)s, %(param_5)s]) AND pg_catalog.pg_table_is_visible(pg_catalog.pg_class.oid) AND pg_catalog.pg_namespace.nspname != %(nspname

In [5]:
ds.head()

Unnamed: 0,sale_date,sale_id,product_code,product_description,qty,total_sales,sub_department_description,sale_type,item_ring_type
0,2025-05-11,478636,8259219415.0,NAKED WB STWBAN 15.2 OZ,1.0,3.99,Produce,Sale,ITEM
1,2025-05-11,478776,4062.0,CUCUMBER EA,2.0,2.38,Wic Produce,Sale,ITEM
2,2025-05-11,478683,85002770272.0,OLOPOP RIDGE RUSH SODA 12 OZ,1.0,2.49,Produce,Sale,ITEM
3,2025-05-11,374139,2718256264.0,JALAPENO JACK INFUSED GROUND BEEF PATTY,1.0,10.99,Meat,Sale,ITEM
4,2025-05-11,123807,3100012610.0,BANQ MEGA BOWL BUF CHKN 14 OZ,1.0,3.99,Frozen Food,Sale,ITEM


In [9]:
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41030 entries, 0 to 41029
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   sale_date                   41030 non-null  datetime64[ns]
 1   sale_id                     41030 non-null  int64         
 2   product_code                41030 non-null  object        
 3   product_description         41030 non-null  object        
 4   qty                         41030 non-null  float64       
 5   total_sales                 41030 non-null  float64       
 6   sub_department_description  41030 non-null  object        
 7   sale_type                   41030 non-null  object        
 8   item_ring_type              41030 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(5)
memory usage: 2.8+ MB


In [40]:
ds['item'] = ds['product_code'].astype(str).str.replace('.0', '', regex=False) + "::" + ds['product_description']

In [41]:
ds.head()

Unnamed: 0,sale_date,sale_id,product_code,product_description,qty,total_sales,sub_department_description,sale_type,item_ring_type,item
0,2025-05-11,478636,8259219415.0,NAKED WB STWBAN 15.2 OZ,1.0,3.99,Produce,Sale,ITEM,8259219415::NAKED WB STWBAN 15.2 OZ
1,2025-05-11,478776,4062.0,CUCUMBER EA,2.0,2.38,Wic Produce,Sale,ITEM,4062::CUCUMBER EA
2,2025-05-11,478683,85002770272.0,OLOPOP RIDGE RUSH SODA 12 OZ,1.0,2.49,Produce,Sale,ITEM,85002770272::OLOPOP RIDGE RUSH SODA 12 OZ
3,2025-05-11,374139,2718256264.0,JALAPENO JACK INFUSED GROUND BEEF PATTY,1.0,10.99,Meat,Sale,ITEM,2718256264::JALAPENO JACK INFUSED GROUND BEEF ...
4,2025-05-11,123807,3100012610.0,BANQ MEGA BOWL BUF CHKN 14 OZ,1.0,3.99,Frozen Food,Sale,ITEM,3100012610::BANQ MEGA BOWL BUF CHKN 14 OZ


## Clean Data

In [31]:
# lets filter this down to just items
ds = ds[ds['item_ring_type'] == 'ITEM']

In [42]:
#now lets get rid of any empty products
ds.dropna(subset=['product_description', 'sale_id', 'product_code'], inplace=True)

In [43]:
# convert sale_id to string
ds['sale_id'] = ds['sale_id'].astype(str)

In [44]:
transactions = ds.groupby('sale_id')['item'].apply(list)

In [45]:
transactions.head()

sale_id
102481     [4::PREMIUM]
102482    [2::UNLEADED]
102483    [2::UNLEADED]
102484    [2::UNLEADED]
102486    [2::UNLEADED]
Name: item, dtype: object

In [46]:
# filter out single item transactions as these wont apply
transactions_filtered = transactions[transactions.apply(lambda x: len(x) > 1)]
transaction_list = transactions_filtered.tolist()


In [47]:
len(transaction_list)

4808

In [48]:
transaction_list[:-20]

[['2412601435::BUNNY HONEY WHEAT 20 OZ',
  '7342053121::DAISY 4% CTGE CHS/PINAPL 6 OZ',
  '7342053124::DAISY PEACH COTT CHS 6 OZ',
  '7003837286::BST CH GRD A LRG 18P EGG 1.5 DZ',
  '7342053124::DAISY PEACH COTT CHS 6 OZ',
  '7342053121::DAISY 4% CTGE CHS/PINAPL 6 OZ',
  '2100004489::VEL SHRP CHDR SNGLS 12 OZ',
  '28029400000::LEAN GROUND BEEF LB',
  '5210007086::MC PURE VANILLA EXTRACT 1 OZ',
  '2840020059::LAYS LIGHTLY SALTED 7.75 OZ',
  '28029400000::LEAN GROUND BEEF LB',
  '9908746090::SUMMER SAUSAGE W/JALAPENO AND CHEESE',
  '4011::BANANA LB',
  '3283::APPLE WA HONEYCRISP LB',
  '3010000133::KEB ZESTA SALTINES 16 OZ',
  '79590254088::CARD MDAY ANYONE',
  '7273022110::PRAIRIE FARM 2% MILK GL 128 OZ'],
 ['84186689452::YOU ARE MY SUNSHINE BQT BQT',
  '84186601109::COUNTRY GIRL CHARM FLOWER BQT'],
 ['81547301900::BORDEN HEAVY WHIPNG CREAM 16 OZ',
  '81547301900::BORDEN HEAVY WHIPNG CREAM 16 OZ',
  '7273021110::PRAIRIE FARM WHOLE MILK 128 OZ',
  '2100030088::BRKSTN RICOTTA CHEESE 15 OZ

## Preprocessing

In [49]:
# one-hot encode the transactions
te = TransactionEncoder()
te_array = te.fit(transaction_list).transform(transaction_list)
ds_encoded = pd.DataFrame(te_array, columns=te.columns_)

In [50]:
ds_encoded.head()

Unnamed: 0,1022810403::CH LINEN SPRAY DISINFECTANT 6 OZ,1037416201::TFT STRWBRY CHEESECAKE 16 OZ,1037416205::CHEESECAKE VARIETY SAMPLER 16 OZ,1037416270::HERSHEY SMORE CHSCK 16OZ,1037442060::FT NY PLAIN CHSECAKE 40 OZ,1037442063::FATHERS TABLE FUDGE B CHEESECAKE 40 OZ,1037442065::4 VARIETY CHEESCAKE 40 OZ,1063144400474::FLDNG CHAIR CNNMN 34.8H,1066807502::CAMS COFFEE BAG STHRN PCN 12 OZ,1066807519::CAMS COFFEE BAG JAMAICA 10 OZ,...,9908746640::BRAUNSCHWEIGER 1 LB,9908746840::OLD WORLD SUMMER SAUSAGE,9955508003::GM KCUP COLOMBIAN 12 COUNT,9990010025::NSTL BUTTERFINGER FUN SIZE 6 PK,9990010087::NSTL BUTRFINGER BAR 1.9 OZ,9990050589::NSTL CRUNCH FS BAG 10 OZ,9990062546::BABYRUTH SNGL 1.9 OZ,9990069423::100 GRAND BAR 1.5 OZ,9990083720::NSTL BABY RUTH FUNSZ LDB 10.2 OZ,9990090836::NESTLE CRUNCH BARS 1.55 OZ
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


## Create Itemsets

In [51]:
frequent_itemsets = fpgrowth(ds_encoded, min_support=0.002, use_colnames=True)

In [52]:
frequent_itemsets.tail()

Unnamed: 0,support,itemsets
665,0.003328,"(4635::GRAPES RED SEEDLESS LB, 4011::BANANA LB)"
666,0.00208,"(1290000700::DESSERT SHELLS 6 CT, 4246::STRAWB..."
667,0.002496,"(28309600000::SQUASH ZUCCHINI LB, 28328000000:..."
668,0.002288,"(1128400311::JJ PEACH SNACK PIE 4 OZ, 11284003..."
669,0.00208,"(2412601700::LEWIS 1/2 LOAF NUTTY OAT 12 OZ, 4..."


In [53]:
fi = frequent_itemsets.sort_values(by='support', ascending=False)
fi.tail(20)

Unnamed: 0,support,itemsets
583,0.00208,"(4246::STRAWBERRIES 1 LB, 4799::TOMATOES HOT..."
306,0.00208,(7003865399::BST CH SHRD XSHRP CHDR 8 OZ)
600,0.00208,"(4246::STRAWBERRIES 1 LB, 3560081300::KY LEG..."
602,0.00208,"(4011::BANANA LB, 7342000011::DAISY SOUR CREAM..."
260,0.00208,(65272971079::BC HMB HLP CHSY HASHBROWN 5.5 OZ)
219,0.00208,(7273062042::PF ENG TOFFEE 16.5 OZ)
215,0.00208,(2100002522::KR CLBY M/JACK CUBES 6.4 OZ)
607,0.00208,"(89661200040::STAND ANNUAL PLANTS 4 IN, 702::F..."
229,0.00208,(2100061146::KR VELV SPRD SNGL 16 OZ)
227,0.00208,(79590251557::CARD MDAY ANYONE)


## Create rules

In [54]:
rules = association_rules(frequent_itemsets,metric="lift", min_threshold=0.5)
print(rules.sort_values(by='lift', ascending=False).head(10))

                                           antecedents  \
123          (3224756523::MG POTTING MIX SOIL 2 CF EA)   
122                    (29999500000::MG POTTING MIX S)   
222        (1600012479::GM HONEY NUT CHEERIOS 10.8 OZ)   
223  (960000003553::CPN: Buy 2 Save $2 on GM Cereal...   
235            (1128400303::JJ BERRY FRT SNK PIE 4 OZ)   
234              (1128400311::JJ PEACH SNACK PIE 4 OZ)   
232                  (28309600000::SQUASH ZUCCHINI LB)   
233                    (28328000000::SQUASH YELLOW LB)   
173                           (4689::PEPPER YELLOW EA)   
172                           (3121::PEPPER ORANGE EA)   

                                           consequents  antecedent support  \
123                    (29999500000::MG POTTING MIX S)            0.002288   
122          (3224756523::MG POTTING MIX SOIL 2 CF EA)            0.002288   
222  (960000003553::CPN: Buy 2 Save $2 on GM Cereal...            0.004576   
223        (1600012479::GM HONEY NUT CHEERIOS 10.

In [55]:
def format_rule(row):
    antecedent = ' + '.join(sorted(row['antecedents']))
    consequent = ', '.join(sorted(row['consequents']))
    return f"If a customer buys {antecedent}, they also buy {consequent} (Lift: {row['lift']:.2f}, Confidence: {row['confidence']:.0%})"


In [56]:
rules['rule_summary'] = rules.apply(format_rule, axis=1)

In [57]:
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski,rule_summary
0,(2412601435::BUNNY HONEY WHEAT 20 OZ),(4011::BANANA LB),0.015391,0.085067,0.002496,0.162162,1.906297,1.0,0.001187,1.092017,0.482855,0.025478,0.084264,0.095751,If a customer buys 2412601435::BUNNY HONEY WHE...
1,(4011::BANANA LB),(2412601435::BUNNY HONEY WHEAT 20 OZ),0.085067,0.015391,0.002496,0.02934,1.906297,1.0,0.001187,1.01437,0.519626,0.025478,0.014167,0.095751,"If a customer buys 4011::BANANA LB, they also ..."
2,(3283::APPLE WA HONEYCRISP LB),(4011::BANANA LB),0.006864,0.085067,0.002288,0.333333,3.9185,1.0,0.001704,1.3724,0.749948,0.025522,0.27135,0.180114,If a customer buys 3283::APPLE WA HONEYCRISP L...
3,(4011::BANANA LB),(3283::APPLE WA HONEYCRISP LB),0.085067,0.006864,0.002288,0.026895,3.9185,1.0,0.001704,1.020585,0.814049,0.025522,0.02017,0.180114,"If a customer buys 4011::BANANA LB, they also ..."
4,(3283::APPLE WA HONEYCRISP LB),(4246::STRAWBERRIES 1 LB),0.006864,0.048669,0.002496,0.363636,7.471639,1.0,0.002162,1.494949,0.872147,0.047059,0.331081,0.207459,If a customer buys 3283::APPLE WA HONEYCRISP L...


In [169]:
rules.to_csv('../data/association_rules.csv', index=False)

In [58]:
print(rules[['rule_summary', 'lift']].sort_values(by='lift', ascending=False).head(20))

                                          rule_summary        lift
123  If a customer buys 3224756523::MG POTTING MIX ...  437.090909
122  If a customer buys 29999500000::MG POTTING MIX...  437.090909
222  If a customer buys 1600012479::GM HONEY NUT CH...  182.121212
223  If a customer buys 960000003553::CPN: Buy 2 Sa...  182.121212
235  If a customer buys 1128400303::JJ BERRY FRT SN...  135.610256
234  If a customer buys 1128400311::JJ PEACH SNACK ...  135.610256
232  If a customer buys 28309600000::SQUASH ZUCCHIN...   75.125000
233  If a customer buys 28328000000::SQUASH YELLOW ...   75.125000
173  If a customer buys 4689::PEPPER YELLOW EA, the...   57.612200
172  If a customer buys 3121::PEPPER ORANGE EA, the...   57.612200
101  If a customer buys 2840051646::DORITOS NACHO C...   45.695868
100  If a customer buys 2840051631::DORITOS COOL RA...   45.695868
170  If a customer buys 4088::PEPPER  RED EA, they ...   42.737778
171  If a customer buys 4689::PEPPER YELLOW EA, the...   42.73

In [59]:
sorted_rules = rules[['rule_summary', 'lift', 'confidence']].sort_values(by='lift', ascending=False)

In [60]:
sorted_rules

Unnamed: 0,rule_summary,lift,confidence
123,If a customer buys 3224756523::MG POTTING MIX ...,437.090909,1.000000
122,If a customer buys 29999500000::MG POTTING MIX...,437.090909,1.000000
222,If a customer buys 1600012479::GM HONEY NUT CH...,182.121212,0.454545
223,If a customer buys 960000003553::CPN: Buy 2 Sa...,182.121212,0.833333
235,If a customer buys 1128400303::JJ BERRY FRT SN...,135.610256,0.423077
...,...,...,...
18,"If a customer buys 599::SINGLE DONUT, they als...",1.035984,0.050420
89,"If a customer buys 4011::BANANA LB, they also ...",0.984754,0.039120
88,If a customer buys 28517000000::MISC HOT FOODS...,0.984754,0.083770
153,"If a customer buys 4011::BANANA LB, they also ...",0.979625,0.029340


In [61]:
sorted_rules.to_csv('../data/sorted_association_rules.csv', index=False)