In [2]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules


In [4]:
# Read the data: load the Online Retail workbook into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the cell only
# runs on a machine where this exact file exists.
retail_xlsx = "C:\\Users\\fishk\\Downloads\\online retail.xlsx"
df = pd.read_excel(retail_xlsx)

# There is a little cleanup we need to do. First, some of the descriptions have leading or trailing spaces that need to be removed. We’ll also drop the rows that don’t have invoice numbers and remove the credit transactions (those with invoice numbers containing C).

In [5]:
# Tidy the raw frame before building baskets:
#   1. trim surrounding whitespace from the product descriptions,
#   2. discard rows that have no invoice number,
#   3. treat invoice numbers as text, and
#   4. keep only invoices whose number does not contain "C"
#      (credit transactions).
df = df.assign(Description=df['Description'].str.strip())
df = df.dropna(axis=0, subset=['InvoiceNo'])
df = df.assign(InvoiceNo=df['InvoiceNo'].astype('str'))
is_credit = df['InvoiceNo'].str.contains('C')
df = df[~is_credit]

# Consolidate the items into one transaction per row, with each product one-hot encoded.

In [6]:
# Build an invoice-by-product table for France: one row per InvoiceNo, one
# column per Description, each cell holding the summed Quantity (0 where the
# product does not appear on the invoice).
basket = (
    df.loc[df['Country'].eq("France")]
    .groupby(['InvoiceNo', 'Description'])['Quantity']
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('InvoiceNo')
)

# There are a lot of zeros in the data, but we also need to make sure any positive values are converted to 1 and anything less than or equal to 0 is set to 0. This step completes the one-hot encoding of the data; we then remove the postage column (since that charge is not one we wish to explore):

In [7]:
def encode_units(x):
    """Collapse a summed quantity into a 0/1 purchase flag.

    Parameters
    ----------
    x : int or float
        Quantity of one product on one invoice.

    Returns
    -------
    int
        1 if ``x`` is strictly positive, otherwise 0.

    Notes
    -----
    Every input maps to 0 or 1. Values strictly between 0 and 1, and NaN,
    previously fell through both comparisons and produced ``None``; they now
    map to 1 and 0 respectively.
    """
    # NaN compares False against 0, so it lands in the 0 branch.
    return 1 if x > 0 else 0

# Apply the 0/1 encoding to every cell of the France basket, then discard
# the POSTAGE column, a charge we do not want to analyse.
basket_sets = basket.applymap(encode_units).drop('POSTAGE', axis=1)

# Now we can generate the frequent itemsets that have a support of at least 7%.

In [8]:
# Mine the itemsets whose support in the France baskets is at least 0.07.
frequent_itemsets = apriori(
    basket_sets,
    min_support=0.07,
    use_colnames=True,
)

# Generate the rules with their corresponding support, confidence and lift:

In [9]:
# Derive association rules from the frequent itemsets, thresholding on
# lift (minimum 1), then preview the first rows.
rules = association_rules(
    frequent_itemsets,
    metric="lift",
    min_threshold=1,
)
rules.head()

TypeError: __init__() got an unexpected keyword argument 'max_rows'

                    antecedents                   consequents  \
0  (ALARM CLOCK BAKELIKE GREEN)   (ALARM CLOCK BAKELIKE PINK)   
1   (ALARM CLOCK BAKELIKE PINK)  (ALARM CLOCK BAKELIKE GREEN)   
2  (ALARM CLOCK BAKELIKE GREEN)    (ALARM CLOCK BAKELIKE RED)   
3    (ALARM CLOCK BAKELIKE RED)  (ALARM CLOCK BAKELIKE GREEN)   
4    (ALARM CLOCK BAKELIKE RED)   (ALARM CLOCK BAKELIKE PINK)   

   antecedent support  consequent support   support  confidence      lift  \
0            0.096939            0.102041  0.073980    0.763158  7.478947   
1            0.102041            0.096939  0.073980    0.725000  7.478947   
2            0.096939            0.094388  0.079082    0.815789  8.642959   
3            0.094388            0.096939  0.079082    0.837838  8.642959   
4            0.094388            0.102041  0.073980    0.783784  7.681081   

   leverage  conviction  
0  0.064088    3.791383  
1  0.064088    3.283859  
2  0.069932    4.916181  
3  0.069932    5.568878  
4  0.064348    4

# We can filter the dataframe using standard pandas code

In [10]:
# Show only the rules with lift of at least 6 and confidence of at least 0.8.
rules.loc[rules['lift'].ge(6) & rules['confidence'].ge(0.8)]

TypeError: __init__() got an unexpected keyword argument 'max_rows'

                                          antecedents  \
2                        (ALARM CLOCK BAKELIKE GREEN)   
3                          (ALARM CLOCK BAKELIKE RED)   
17                    (SET/6 RED SPOTTY PAPER PLATES)   
18                    (SET/6 RED SPOTTY PAPER PLATES)   
19                      (SET/6 RED SPOTTY PAPER CUPS)   
20  (SET/6 RED SPOTTY PAPER PLATES, SET/20 RED RET...   
21  (SET/20 RED RETROSPOT PAPER NAPKINS, SET/6 RED...   
22  (SET/6 RED SPOTTY PAPER PLATES, SET/6 RED SPOT...   

                             consequents  antecedent support  \
2             (ALARM CLOCK BAKELIKE RED)            0.096939   
3           (ALARM CLOCK BAKELIKE GREEN)            0.094388   
17  (SET/20 RED RETROSPOT PAPER NAPKINS)            0.127551   
18         (SET/6 RED SPOTTY PAPER CUPS)            0.127551   
19       (SET/6 RED SPOTTY PAPER PLATES)            0.137755   
20         (SET/6 RED SPOTTY PAPER CUPS)            0.102041   
21       (SET/6 RED SPOTTY PAPER PLATE

# You may also want to look at how much opportunity there is to use the popularity of one product to drive sales of another. For instance, we can see that we sell 340 Green Alarm Clocks but only 316 Red Alarm Clocks, so maybe we can drive more Red Alarm Clock sales through recommendations?

In [11]:
# Total quantity of green alarm clocks across the France baskets.
basket.loc[:, 'ALARM CLOCK BAKELIKE GREEN'].sum()

340.0

# Total quantity of red alarm clocks across the France baskets.
basket.loc[:, 'ALARM CLOCK BAKELIKE RED'].sum()

316.0

316.0

# Let’s check out what some popular combinations might be in Germany:

In [12]:
# Repeat the France pipeline for Germany.
# Step 1: invoice-by-product table of summed quantities (0 where absent).
basket2 = (
    df.loc[df['Country'].eq("Germany")]
    .groupby(['InvoiceNo', 'Description'])['Quantity']
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('InvoiceNo')
)

# Step 2: 0/1 encode every cell and discard the POSTAGE column.
basket_sets2 = basket2.applymap(encode_units).drop('POSTAGE', axis=1)

# Step 3: frequent itemsets with support of at least 0.05, then rules
# thresholded on lift (minimum 1).
frequent_itemsets2 = apriori(
    basket_sets2,
    min_support=0.05,
    use_colnames=True,
)
rules2 = association_rules(
    frequent_itemsets2,
    metric="lift",
    min_threshold=1,
)

# Step 4: show the rules with lift of at least 4 and confidence of at least 0.5.
rules2.loc[rules2['lift'].ge(4) & rules2['confidence'].ge(0.5)]

TypeError: __init__() got an unexpected keyword argument 'max_rows'

                        antecedents                         consequents  \
1   (PLASTERS IN TIN CIRCUS PARADE)  (PLASTERS IN TIN WOODLAND ANIMALS)   
7        (PLASTERS IN TIN SPACEBOY)  (PLASTERS IN TIN WOODLAND ANIMALS)   
11    (RED RETROSPOT CHARLOTTE BAG)            (WOODLAND CHARLOTTE BAG)   

    antecedent support  consequent support   support  confidence      lift  \
1             0.115974            0.137856  0.067834    0.584906  4.242887   
7             0.107221            0.137856  0.061269    0.571429  4.145125   
11            0.070022            0.126915  0.059081    0.843750  6.648168   

    leverage  conviction  
1   0.051846    2.076984  
7   0.046488    2.011670  
11  0.050194    5.587746  