[Reference](https://drlee.io/introduction-to-market-basket-analysis-with-real-data-and-a-step-by-step-python-template-for-47b35d174e71)

In [1]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

import matplotlib.pyplot as plt
import mlxtend.frequent_patterns
import mlxtend.preprocessing
import numpy

In [2]:
# Load the dataset
url = "https://github.com/fenago/datasets/raw/main/Online%20Retail%20(5).xlsx"
data = pd.read_excel(url, sheet_name="Online Retail", header=0)

  and should_run_async(code)


In [3]:
# Step 1: Create a new column 'IsCPresent' to flag refunds
data['IsCPresent'] = data['InvoiceNo'].astype(str).apply(lambda x: 1 if 'C' in x else 0)

# Print output for verification
print("After Step 1 - Flagging Refunds:")
print("Data dimension (row count, col count):", data.shape)
print("Count of unique invoice numbers:", data['InvoiceNo'].nunique())

# Step 2: Remove transactions with zero or negative quantity
data = data[data['Quantity'] > 0]

# Print output for verification
print("\nAfter Step 2 - Filtering Non-Positive Quantities:")
print("Data dimension (row count, col count):", data.shape)
print("Count of unique invoice numbers:", data['InvoiceNo'].nunique())

# Step 3: Remove canceled transactions using the 'IsCPresent' flag
data = data[data['IsCPresent'] == 0]

# Print output for verification
print("\nAfter Step 3 - Removing Refunded Transactions:")
print("Data dimension (row count, col count):", data.shape)
print("Count of unique invoice numbers:", data['InvoiceNo'].nunique())

# Step 4: Select only the 'InvoiceNo' and 'Description' columns
data = data[['InvoiceNo', 'Description']]

# Print output for verification
print("\nAfter Step 4 - Column Filtering:")
print("Data dimension (row count, col count):", data.shape)
print("Count of unique invoice numbers:", data['InvoiceNo'].nunique())

# Step 5: Remove missing values
data.dropna(subset=['InvoiceNo', 'Description'], inplace=True)

# Print output for verification
print("\nAfter Step 5 - Dropping Missing Values:")
print("Data dimension (row count, col count):", data.shape)
print("Count of unique invoice numbers:", data['InvoiceNo'].nunique())

  and should_run_async(code)


After Step 1 - Flagging Refunds:
Data dimension (row count, col count): (541909, 9)
Count of unique invoice numbers: 25900

After Step 2 - Filtering Non-Positive Quantities:
Data dimension (row count, col count): (531285, 9)
Count of unique invoice numbers: 20728

After Step 3 - Removing Refunded Transactions:
Data dimension (row count, col count): (531285, 9)
Count of unique invoice numbers: 20728

After Step 4 - Column Filtering:
Data dimension (row count, col count): (531285, 2)
Count of unique invoice numbers: 20728

After Step 5 - Dropping Missing Values:
Data dimension (row count, col count): (530693, 2)
Count of unique invoice numbers: 20136


In [4]:
invoice_item_list = []
for num in list(set(data.InvoiceNo.tolist())):
    # Filter dataset down to one invoice number
    tmp_df = data.loc[data['InvoiceNo'] == num]
    # Extract item descriptions and convert to list
    tmp_items = tmp_df.Description.tolist()
    # Append list to invoice_item_list
    invoice_item_list.append(tmp_items)

print(invoice_item_list[1:5])

  and should_run_async(code)


[['HAND WARMER UNION JACK', 'HAND WARMER RED POLKA DOT'], ['ASSORTED COLOUR BIRD ORNAMENT', "POPPY'S PLAYHOUSE BEDROOM ", "POPPY'S PLAYHOUSE KITCHEN", 'FELTCRAFT PRINCESS CHARLOTTE DOLL', 'IVORY KNITTED MUG COSY ', 'BOX OF 6 ASSORTED COLOUR TEASPOONS', 'BOX OF VINTAGE JIGSAW BLOCKS ', 'BOX OF VINTAGE ALPHABET BLOCKS', 'HOME BUILDING BLOCK WORD', 'LOVE BUILDING BLOCK WORD', 'RECIPE BOX WITH METAL HEART', 'DOORMAT NEW ENGLAND'], ['JAM MAKING SET WITH JARS', 'RED COAT RACK PARIS FASHION', 'YELLOW COAT RACK PARIS FASHION', 'BLUE COAT RACK PARIS FASHION'], ['BATH BUILDING BLOCK WORD']]


In [5]:
# At this point - this is a template... copy and paste...
online_encoder = mlxtend.preprocessing.TransactionEncoder()
online_encoder_array = online_encoder.fit_transform(invoice_item_list)
print(online_encoder_array)

  and should_run_async(code)


[[False False False ... False False False]
 [False False False ... False False False]
 [False False False ... False False False]
 ...
 [False False False ... False False False]
 [False False False ... False False False]
 [False False False ... False False False]]


In [6]:
# Once you have above... simply encode it... copy and paste code!
online_encoder_df = pd.DataFrame(
    online_encoder_array,
    columns=online_encoder.columns_
)

# this is a very big table, so for more
# easy viewing only a subset is printed
online_encoder_df.loc[
    4970:4979,
    online_encoder_df.columns.tolist()[0:8]
]

  and should_run_async(code)


Unnamed: 0,4 PURPLE FLOCK DINNER CANDLES,50'S CHRISTMAS GIFT BAG LARGE,DOLLY GIRL BEAKER,I LOVE LONDON MINI BACKPACK,I LOVE LONDON MINI RUCKSACK,NINE DRAWER OFFICE TIDY,OVAL WALL MIRROR DIAMANTE,RED SPOT GIFT BAG LARGE
4970,False,False,False,False,False,False,False,False
4971,False,False,False,False,False,False,False,False
4972,False,False,False,False,False,False,False,False
4973,False,False,False,False,False,False,False,False
4974,False,False,False,False,False,False,False,False
4975,False,False,False,False,False,False,False,False
4976,False,False,False,False,False,False,False,False
4977,False,False,False,False,False,False,False,False
4978,False,False,False,False,False,False,False,False
4979,False,False,False,False,False,False,False,False


In [7]:
# default minimum support = 0.5
# does not use colnames (item names)

mod = mlxtend.frequent_patterns.apriori(online_encoder_df)
mod

  and should_run_async(code)


Unnamed: 0,support,itemsets


In [8]:
mod_minsupport = mlxtend.frequent_patterns.apriori(
    online_encoder_df,
    min_support=0.05
)
mod_minsupport.loc[0:6]

  and should_run_async(code)


Unnamed: 0,support,itemsets
0,0.052195,(151)
1,0.072259,(229)
2,0.050407,(1570)
3,0.059644,(1672)
4,0.057708,(1793)
5,0.056267,(1794)
6,0.060489,(1825)


In [9]:
# add colnames for easier interpretability

mod_colnames_minsupport = mlxtend.frequent_patterns.apriori(
    online_encoder_df,
    min_support=0.01,
    use_colnames=True
)
mod_colnames_minsupport.loc[0:6]

  and should_run_async(code)


Unnamed: 0,support,itemsets
0,0.013359,( SET 2 TEA TOWELS I LOVE LONDON )
1,0.015793,(10 COLOUR SPACEBOY PEN)
2,0.012465,(12 MESSAGE CARDS WITH ENVELOPES)
3,0.01763,(12 PENCIL SMALL TUBE WOODLAND)
4,0.017978,(12 PENCILS SMALL TUBE RED RETROSPOT)
5,0.01763,(12 PENCILS SMALL TUBE SKULL)
6,0.013309,(12 PENCILS TALL TUBE RED RETROSPOT)


In [10]:
# Apply the Apriori algorithm to find frequent itemsets
mod_colnames_minsupport = apriori(
    online_encoder_df,
    min_support=0.01,  # Minimum support threshold
    use_colnames=True  # Use the actual item names rather than column indices
)

# Sort the itemsets by support in descending order
mod_colnames_minsupport_sorted = mod_colnames_minsupport.sort_values(by='support', ascending=False)

# Display the first few results of the sorted frequent itemsets
print(mod_colnames_minsupport_sorted.head(7))

  and should_run_async(code)


      support                              itemsets
771  0.112237  (WHITE HANGING HEART T-LIGHT HOLDER)
341  0.103894             (JUMBO BAG RED RETROSPOT)
550  0.098778            (REGENCY CAKESTAND 3 TIER)
458  0.083731                       (PARTY BUNTING)
381  0.077672             (LUNCH BAG RED RETROSPOT)
48   0.072259       (ASSORTED COLOUR BIRD ORNAMENT)
619  0.068782   (SET OF 3 CAKE TINS PANTRY DESIGN )


In [11]:
# Length is the number of items in the itemsets
mod_colnames_minsupport['length'] = (
    mod_colnames_minsupport['itemsets'].apply(lambda x: len(x))
)

mod_colnames_minsupport.loc[180:296]

  and should_run_async(code)


Unnamed: 0,support,itemsets,length
180,0.029797,(DOORMAT NEW ENGLAND),1
181,0.034615,(DOORMAT RED RETROSPOT),1
182,0.023838,(DOORMAT SPOTTY HOME SWEET HOME),1
183,0.010181,(DOORMAT TOPIARY),1
184,0.030592,(DOORMAT UNION FLAG),1
...,...,...,...
292,0.018772,(HEART IVORY TRELLIS SMALL),1
293,0.045987,(HEART OF WICKER LARGE),1
294,0.059644,(HEART OF WICKER SMALL),1
295,0.011025,(HEART WOODEN CHRISTMAS DECORATION),1


In [12]:
# Calculate the length of each itemset and add it as a new column
mod_colnames_minsupport['length'] = mod_colnames_minsupport['itemsets'].apply(lambda x: len(x))

# Sort the DataFrame by support in descending order
mod_colnames_minsupport_sorted = mod_colnames_minsupport.sort_values(by='support', ascending=False)

# Display the specific rows of the sorted DataFrame
print(mod_colnames_minsupport_sorted.loc[180:296])

       support                                           itemsets  length
180   0.029797                              (DOORMAT NEW ENGLAND)       1
1507  0.029748  (PINK REGENCY TEACUP AND SAUCER, ROSES REGENCY...       2
338   0.029549                                  (JUMBO BAG PEARS)       1
177   0.029450                                   (DOORMAT HEARTS)       1
686   0.029251                             (SMALL POPCORN HOLDER)       1
...        ...                                                ...     ...
508   0.012068                            (RAIN PONCHO RETROSPOT)       1
1736  0.012068  (JUMBO STORAGE BAG SUKI, JUMBO BAG RED RETROSP...       3
906   0.012068  (CHARLOTTE BAG SUKI DESIGN, CHARLOTTE BAG VINT...       2
404   0.012068                (MINIATURE ANTIQUE ROSE HOOK IVORY)       1
296   0.012018                                (HERB MARKER BASIL)       1

[1026 rows x 3 columns]


  and should_run_async(code)


In [13]:
mod_colnames_minsupport[
    mod_colnames_minsupport['itemsets'] == frozenset(
        {'10 COLOUR SPACEBOY PEN'}
    )
]

  and should_run_async(code)


Unnamed: 0,support,itemsets,length
1,0.015793,(10 COLOUR SPACEBOY PEN),1


In [14]:
## ORDER OF ITEMSETS DIFFERS

mod_colnames_minsupport[
    (mod_colnames_minsupport['length'] == 2) &
    (mod_colnames_minsupport['support'] >= 0.02) &
    (mod_colnames_minsupport['support'] < 0.021)
]

  and should_run_async(code)


Unnamed: 0,support,itemsets,length
836,0.020759,"(ALARM CLOCK BAKELIKE GREEN, ALARM CLOCK BAKEL...",2
887,0.020362,"(CHARLOTTE BAG SUKI DESIGN, CHARLOTTE BAG PINK...",2
923,0.02061,"(CHARLOTTE BAG SUKI DESIGN, STRAWBERRY CHARLOT...",2
1105,0.02056,"(JUMBO BAG BAROQUE BLACK WHITE, JUMBO BAG PIN...",2
1114,0.020908,"(JUMBO SHOPPER VINTAGE RED PAISLEY, JUMBO BAG...",2
1116,0.020957,"(JUMBO BAG BAROQUE BLACK WHITE, JUMBO STORAGE...",2
1129,0.02056,"(JUMBO BAG ALPHABET, JUMBO BAG RED RETROSPOT)",2
1137,0.020163,"(JUMBO BAG PEARS, JUMBO BAG APPLES)",2
1203,0.020709,"(JUMBO SHOPPER VINTAGE RED PAISLEY, JUMBO BAG ...",2
1218,0.02056,"(JUMBO BAG RED RETROSPOT, JUMBO STORAGE BAG SK...",2


In [15]:
## ROW ORDER SLIGHTLY DIFFERENT

rules = mlxtend.frequent_patterns.association_rules(
    mod_colnames_minsupport,
    metric="confidence",
    min_threshold=0.6,
    support_only=False
)

rules.loc[0:6]

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(ALARM CLOCK BAKELIKE CHOCOLATE),(ALARM CLOCK BAKELIKE GREEN),0.021255,0.048669,0.013756,0.647196,13.297902,0.012722,2.696488,0.944884
1,(ALARM CLOCK BAKELIKE CHOCOLATE),(ALARM CLOCK BAKELIKE RED ),0.021255,0.052195,0.014501,0.682243,13.071023,0.013392,2.982798,0.94355
2,(ALARM CLOCK BAKELIKE ORANGE),(ALARM CLOCK BAKELIKE GREEN),0.0221,0.048669,0.013558,0.613483,12.605201,0.012482,2.461292,0.941474
3,(ALARM CLOCK BAKELIKE GREEN),(ALARM CLOCK BAKELIKE RED ),0.048669,0.052195,0.031784,0.653061,12.511932,0.029244,2.731908,0.967146
4,(ALARM CLOCK BAKELIKE RED ),(ALARM CLOCK BAKELIKE GREEN),0.052195,0.048669,0.031784,0.608944,12.511932,0.029244,2.432722,0.970744
5,(ALARM CLOCK BAKELIKE IVORY),(ALARM CLOCK BAKELIKE RED ),0.028308,0.052195,0.018524,0.654386,12.537313,0.017047,2.74238,0.947047
6,(ALARM CLOCK BAKELIKE ORANGE),(ALARM CLOCK BAKELIKE RED ),0.0221,0.052195,0.014998,0.678652,13.002217,0.013845,2.949463,0.943951


In [16]:
## ROW ORDER SLIGHTLY DIFFERENT

rules = mlxtend.frequent_patterns.association_rules(
    mod_colnames_minsupport,
    metric="confidence",
    min_threshold=0.6,
    support_only=False
)

rules.loc[0:6]

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(ALARM CLOCK BAKELIKE CHOCOLATE),(ALARM CLOCK BAKELIKE GREEN),0.021255,0.048669,0.013756,0.647196,13.297902,0.012722,2.696488,0.944884
1,(ALARM CLOCK BAKELIKE CHOCOLATE),(ALARM CLOCK BAKELIKE RED ),0.021255,0.052195,0.014501,0.682243,13.071023,0.013392,2.982798,0.94355
2,(ALARM CLOCK BAKELIKE ORANGE),(ALARM CLOCK BAKELIKE GREEN),0.0221,0.048669,0.013558,0.613483,12.605201,0.012482,2.461292,0.941474
3,(ALARM CLOCK BAKELIKE GREEN),(ALARM CLOCK BAKELIKE RED ),0.048669,0.052195,0.031784,0.653061,12.511932,0.029244,2.731908,0.967146
4,(ALARM CLOCK BAKELIKE RED ),(ALARM CLOCK BAKELIKE GREEN),0.052195,0.048669,0.031784,0.608944,12.511932,0.029244,2.432722,0.970744
5,(ALARM CLOCK BAKELIKE IVORY),(ALARM CLOCK BAKELIKE RED ),0.028308,0.052195,0.018524,0.654386,12.537313,0.017047,2.74238,0.947047
6,(ALARM CLOCK BAKELIKE ORANGE),(ALARM CLOCK BAKELIKE RED ),0.0221,0.052195,0.014998,0.678652,13.002217,0.013845,2.949463,0.943951
