In [1]:
import pandas as pd
import openpyxl
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize

In [6]:
# Load the invoice lines; index_col=0 turns the first spreadsheet column into the index.
df_retail = pd.read_excel('Online Retail.xlsx', index_col=0, engine='openpyxl')
# A row without a description cannot be placed in a basket, so drop it.
df_retail = df_retail.dropna(subset=['Description'])
# Force every description to str so all basket items share one type
# (presumably some cells are parsed as non-string values -- TODO confirm).
df_retail = df_retail.astype({'Description': str})
# Build one basket (list of item descriptions) per InvoiceNo.
df_retail_list = df_retail.groupby(['InvoiceNo'])['Description'].apply(list).to_list()

# Collect the distinct items in a single pass. sum(baskets, []) would copy the
# ever-growing result list once per basket, which is quadratic in the item count.
unique_items = {item for basket in df_retail_list for item in basket}

print('Number of transactions', len(df_retail_list))
print('Number of unique items', len(unique_items))

Number of transactions 24446
Number of unique items 4223


In [7]:
# One-hot encode the baskets in two steps:
#   fit()       scans all transactions to learn the vocabulary of distinct items;
#   transform() uses that vocabulary to emit one row per transaction and one
#               column per distinct item.
encoder = TransactionEncoder()
encoder.fit(df_retail_list)
encoded_array = encoder.transform(df_retail_list)
# Label each column with the item it stands for.
df_itemsets = pd.DataFrame(data=encoded_array, columns=encoder.columns_)

In [8]:
# Mine the itemsets whose support is at least 0.025; use_colnames=True reports
# them by item name instead of by column position.
frequent_items = apriori(
    df_itemsets,
    use_colnames=True,
    min_support=0.025,
)
print(frequent_items)

      support                                           itemsets
0    0.039311                           (6 RIBBONS RUSTIC CHARM)
1    0.025444                  (60 CAKE CASES VINTAGE CHRISTMAS)
2    0.034198                      (60 TEATIME FAIRY CAKE CASES)
3    0.025321                   (72 SWEETHEART FAIRY CAKE CASES)
4    0.040947                       (ALARM CLOCK BAKELIKE GREEN)
..        ...                                                ...
131  0.027939  (JUMBO BAG RED RETROSPOT, JUMBO SHOPPER VINTAG...
132  0.029984  (JUMBO BAG RED RETROSPOT, JUMBO STORAGE BAG SUKI)
133  0.026507  (LUNCH BAG  BLACK SKULL., LUNCH BAG RED RETROS...
134  0.025076  (LUNCH BAG PINK POLKADOT, LUNCH BAG RED RETROS...
135  0.025117  (ROSES REGENCY TEACUP AND SAUCER , PINK REGENC...

[136 rows x 2 columns]


In [11]:
# Derive association rules from the frequent itemsets, keeping only those whose
# confidence reaches the 0.3 threshold.
rules = association_rules(
    frequent_items,
    min_threshold=0.3,
    metric="confidence",
)
# Print just the first seven columns of the rules table.
print(rules.iloc[:, 0:7])

                            antecedents                          consequents  \
0          (ALARM CLOCK BAKELIKE GREEN)          (ALARM CLOCK BAKELIKE RED )   
1           (ALARM CLOCK BAKELIKE RED )         (ALARM CLOCK BAKELIKE GREEN)   
2     (GREEN REGENCY TEACUP AND SAUCER)     (PINK REGENCY TEACUP AND SAUCER)   
3      (PINK REGENCY TEACUP AND SAUCER)    (GREEN REGENCY TEACUP AND SAUCER)   
4     (GREEN REGENCY TEACUP AND SAUCER)   (ROSES REGENCY TEACUP AND SAUCER )   
5    (ROSES REGENCY TEACUP AND SAUCER )    (GREEN REGENCY TEACUP AND SAUCER)   
6             (JUMBO BAG RED RETROSPOT)            (JUMBO BAG PINK POLKADOT)   
7             (JUMBO BAG PINK POLKADOT)            (JUMBO BAG RED RETROSPOT)   
8             (JUMBO BAG RED RETROSPOT)  (JUMBO SHOPPER VINTAGE RED PAISLEY)   
9   (JUMBO SHOPPER VINTAGE RED PAISLEY)            (JUMBO BAG RED RETROSPOT)   
10            (JUMBO BAG RED RETROSPOT)             (JUMBO STORAGE BAG SUKI)   
11             (JUMBO STORAGE BAG SUKI) 