In [1]:
import numpy as np 
import pandas as pd 
from mlxtend.frequent_patterns import apriori, association_rules
import copy

## Documentation

API reference for the frequent-pattern mining functions used in this notebook: [mlxtend.frequent_patterns](http://rasbt.github.io/mlxtend/api_subpackages/mlxtend.frequent_patterns/)

In [2]:
# Read the retail workbook into a DataFrame (path is relative to the
# notebook's working directory) and preview the first five rows.
data = pd.read_excel(io='datasets/Online_Retail.xlsx')
data.head(n=5)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [3]:
# Keep an independent snapshot of the freshly loaded frame before the
# cleaning cell starts rewriting `data`.
tmp_data = data.copy(deep=True)

In [4]:
# Clean the raw transactions in a single chained pass:
#   1. strip leading/trailing whitespace from the item descriptions
#   2. drop rows that have no invoice number
#   3. store invoice numbers as strings so they can be pattern-matched
#   4. discard every invoice whose number contains the letter 'C'
#      (the original note describes these as transactions done on credit)
data = (
    data
    .assign(Description=lambda frame: frame['Description'].str.strip())
    .dropna(axis=0, subset=['InvoiceNo'])
    .astype({'InvoiceNo': 'str'})
    .loc[lambda frame: ~frame['InvoiceNo'].str.contains('C')]
)

In [5]:
def build_basket(transactions, country):
    """Pivot one country's transactions into an invoice-by-item quantity table.

    Parameters
    ----------
    transactions : pandas.DataFrame
        Transaction lines; the 'Country', 'InvoiceNo', 'Description' and
        'Quantity' columns are read.
    country : str
        Value of the 'Country' column to keep.

    Returns
    -------
    pandas.DataFrame
        One row per invoice number (the index) and one column per item
        description. Each cell is the summed quantity of that item on that
        invoice, with 0 where the item does not appear on the invoice.
    """
    return (
        transactions.loc[transactions['Country'] == country]
        .groupby(['InvoiceNo', 'Description'])['Quantity']
        .sum()
        # unstack() already leaves 'InvoiceNo' as the row index, so no
        # reset_index()/set_index() round-trip is needed before filling.
        .unstack()
        .fillna(0)
    )


# One basket table per country analysed in the cells below.
basket_France = build_basket(data, "France")
basket_UK = build_basket(data, "United Kingdom")
basket_Por = build_basket(data, "Portugal")
basket_Sweden = build_basket(data, "Sweden")

In [6]:
# Preview the first five invoices of the UK basket table.
basket_UK.head(n=5)

Description,*Boombox Ipod Classic,*USB Office Mirror Ball,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 DAISY PEGS IN WOOD BOX,12 EGG HOUSE PAINTED WOOD,12 HANGING EGGS HAND PAINTED,12 IVORY ROSE PEG PLACE SETTINGS,12 MESSAGE CARDS WITH ENVELOPES,12 PENCIL SMALL TUBE WOODLAND,...,wrongly coded 20713,wrongly coded 23343,wrongly coded-23343,wrongly marked,wrongly marked 23343,wrongly marked carton 22804,wrongly marked. 23343 in box,wrongly sold (22719) barcode,wrongly sold as sets,wrongly sold sets
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536365,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536366,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536367,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536368,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536369,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Defining the hot encoding function to make the data suitable  
# for the concerned libraries 
def hot_encode(x):
    """Collapse a quantity into a 0/1 purchase flag.

    Parameters
    ----------
    x : int or float
        A single cell value; this notebook passes summed invoice quantities.

    Returns
    -------
    int
        1 when ``x >= 1``, otherwise 0.

    Notes
    -----
    Every input now maps to 0 or 1. The previous pair of ``if`` statements
    returned ``None`` for any value that was neither ``<= 0`` nor ``>= 1``
    (a fraction between 0 and 1, or NaN), which would leave non-numeric
    cells in the encoded table. Such values are treated as "not purchased".
    """
    return 1 if x >= 1 else 0

# Encoding the datasets
# Replace each basket table with its element-wise hot_encode() result.
basket_France, basket_UK, basket_Por, basket_Sweden = (
    quantities.applymap(hot_encode)
    for quantities in (basket_France, basket_UK, basket_Por, basket_Sweden)
)

# `basket_encoded` ends up referring to the last table encoded, as before.
basket_encoded = basket_Sweden

In [8]:
# Frequent itemsets for France (minimum support 0.05)
frq_items = apriori(basket_France, min_support=0.05, use_colnames=True)

# Rules with lift >= 1, listed by descending confidence and then
# descending lift
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
44,(JUMBO BAG WOODLAND ANIMALS),(POSTAGE),0.076531,0.765306,0.076531,1.000000,1.306667,0.017961,inf
258,"(RED TOADSTOOL LED NIGHT LIGHT, PLASTERS IN TI...",(POSTAGE),0.051020,0.765306,0.051020,1.000000,1.306667,0.011974,inf
270,"(RED TOADSTOOL LED NIGHT LIGHT, PLASTERS IN TI...",(POSTAGE),0.053571,0.765306,0.053571,1.000000,1.306667,0.012573,inf
301,"(SET/6 RED SPOTTY PAPER CUPS, SET/20 RED RETRO...",(SET/6 RED SPOTTY PAPER PLATES),0.102041,0.127551,0.099490,0.975000,7.644000,0.086474,34.897959
302,"(SET/6 RED SPOTTY PAPER PLATES, SET/20 RED RET...",(SET/6 RED SPOTTY PAPER CUPS),0.102041,0.137755,0.099490,0.975000,7.077778,0.085433,34.489796
...,...,...,...,...,...,...,...,...,...
37,(POSTAGE),(JAM MAKING SET PRINTED),0.765306,0.053571,0.051020,0.066667,1.244444,0.010022,1.014031
26,(POSTAGE),(CIRCUS PARADE CHILDRENS EGG CUP),0.765306,0.056122,0.051020,0.066667,1.187879,0.008070,1.011297
97,(POSTAGE),(PARTY BUNTING),0.765306,0.056122,0.051020,0.066667,1.187879,0.008070,1.011297
227,(POSTAGE),"(LUNCH BAG WOODLAND, LUNCH BAG RED RETROSPOT)",0.765306,0.056122,0.051020,0.066667,1.187879,0.008070,1.011297


In [9]:
# Frequent itemsets for the United Kingdom (minimum support 0.02)
frq_items = apriori(basket_UK, min_support=0.02, use_colnames=True)

# Rules with lift >= 1, listed by descending confidence and then
# descending lift
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
166,"(PINK REGENCY TEACUP AND SAUCER, ROSES REGENCY...",(GREEN REGENCY TEACUP AND SAUCER),0.029249,0.050035,0.026410,0.902930,18.046041,0.024947,9.786434
164,"(GREEN REGENCY TEACUP AND SAUCER, PINK REGENCY...",(ROSES REGENCY TEACUP AND SAUCER),0.030910,0.051267,0.026410,0.854419,16.666089,0.024826,6.516893
27,(PINK REGENCY TEACUP AND SAUCER),(GREEN REGENCY TEACUP AND SAUCER),0.037660,0.050035,0.030910,0.820768,16.403939,0.029026,5.300203
172,"(JUMBO STORAGE BAG SUKI, JUMBO BAG PINK POLKADOT)",(JUMBO BAG RED RETROSPOT),0.027053,0.103820,0.021696,0.801980,7.724749,0.018887,4.525711
146,(PINK REGENCY TEACUP AND SAUCER),(ROSES REGENCY TEACUP AND SAUCER),0.037660,0.051267,0.029249,0.776671,15.149556,0.027319,4.248149
...,...,...,...,...,...,...,...,...,...
179,(JUMBO BAG RED RETROSPOT),"(JUMBO SHOPPER VINTAGE RED PAISLEY, JUMBO STOR...",0.103820,0.027482,0.020571,0.198142,7.209989,0.017718,1.212832
137,(WHITE HANGING HEART T-LIGHT HOLDER),(NATURAL SLATE HEART CHALKBOARD),0.116034,0.065302,0.021964,0.189289,2.898653,0.014387,1.152936
160,(WHITE HANGING HEART T-LIGHT HOLDER),(WOODEN PICTURE FRAME WHITE FINISH),0.116034,0.057642,0.021642,0.186519,3.235826,0.014954,1.158427
144,(WHITE HANGING HEART T-LIGHT HOLDER),(PARTY BUNTING),0.116034,0.085391,0.020250,0.174515,2.043711,0.010341,1.107966


In [10]:
# Frequent itemsets for Portugal (minimum support 0.1)
frq_items = apriori(basket_Por, min_support=0.1, use_colnames=True)

# Rules with lift >= 1, listed by descending confidence and then
# descending lift
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
67,(SCANDINAVIAN PAISLEY PICNIC BAG),(PINK VINTAGE PAISLEY PICNIC BAG),0.120690,0.137931,0.120690,1.000000,7.250000,0.104043,inf
204,"(SCANDINAVIAN PAISLEY PICNIC BAG, LUNCH BAG RE...",(PINK VINTAGE PAISLEY PICNIC BAG),0.103448,0.137931,0.103448,1.000000,7.250000,0.089180,inf
209,"(PLASTERS IN TIN CIRCUS PARADE, PLASTERS IN TI...",(PLASTERS IN TIN WOODLAND ANIMALS),0.103448,0.137931,0.103448,1.000000,7.250000,0.089180,inf
280,"(LUNCH BAG PINK POLKADOT, JUMBO SHOPPER VINTAG...","(JUMBO BAG SCANDINAVIAN BLUE PAISLEY, JUMBO BA...",0.103448,0.137931,0.103448,1.000000,7.250000,0.089180,inf
282,"(LUNCH BAG PINK POLKADOT, JUMBO BAG PINK VINTA...","(JUMBO SHOPPER VINTAGE RED PAISLEY, JUMBO BAG ...",0.103448,0.137931,0.103448,1.000000,7.250000,0.089180,inf
...,...,...,...,...,...,...,...,...,...
78,(RETROSPOT TEA SET CERAMIC 11 PC),(PLASTERS IN TIN VINTAGE PAISLEY),0.241379,0.172414,0.103448,0.428571,2.485714,0.061831,1.448276
62,(RETROSPOT TEA SET CERAMIC 11 PC),(LUNCH BAG RED RETROSPOT),0.241379,0.241379,0.103448,0.428571,1.775510,0.045184,1.327586
63,(LUNCH BAG RED RETROSPOT),(RETROSPOT TEA SET CERAMIC 11 PC),0.241379,0.241379,0.103448,0.428571,1.775510,0.045184,1.327586
1,(POSTAGE),(BAKING SET 9 PIECE RETROSPOT),0.517241,0.206897,0.137931,0.266667,1.288889,0.030916,1.081505


In [11]:
# Frequent itemsets for Sweden (minimum support 0.07)
frq_items = apriori(basket_Sweden, min_support=0.07, use_colnames=True)

# Rules with lift >= 1, listed by descending confidence and then
# descending lift
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
12,(TREASURE TIN BUFFALO BILL),(BUBBLEGUM RING ASSORTED),0.083333,0.083333,0.083333,1.000000,12.000000,0.076389,inf
13,(BUBBLEGUM RING ASSORTED),(TREASURE TIN BUFFALO BILL),0.083333,0.083333,0.083333,1.000000,12.000000,0.076389,inf
14,(TREASURE TIN GYMKHANA DESIGN),(BUBBLEGUM RING ASSORTED),0.083333,0.083333,0.083333,1.000000,12.000000,0.076389,inf
15,(BUBBLEGUM RING ASSORTED),(TREASURE TIN GYMKHANA DESIGN),0.083333,0.083333,0.083333,1.000000,12.000000,0.076389,inf
32,(MAGIC DRAWING SLATE PURDEY),(MAGIC DRAWING SLATE DOLLY GIRL),0.083333,0.083333,0.083333,1.000000,12.000000,0.076389,inf
...,...,...,...,...,...,...,...,...,...
180,(POSTAGE),"(RED TOADSTOOL LED NIGHT LIGHT, RABBIT NIGHT L...",0.611111,0.083333,0.083333,0.136364,1.636364,0.032407,1.061404
187,(POSTAGE),"(WALL TIDY RETROSPOT, RED RETROSPOT CHARLOTTE ...",0.611111,0.083333,0.083333,0.136364,1.636364,0.032407,1.061404
193,(POSTAGE),"(RED TOADSTOOL LED NIGHT LIGHT, WALL TIDY RETR...",0.611111,0.083333,0.083333,0.136364,1.636364,0.032407,1.061404
71,(POSTAGE),(ROUND SNACK BOXES SET OF 4 FRUITS),0.611111,0.111111,0.083333,0.136364,1.227273,0.015432,1.029240


In [12]:
from mlxtend.frequent_patterns import fpgrowth

In [13]:
# Frequent itemsets for France via FP-Growth (minimum support 0.1),
# followed by a five-row preview.
fpg = fpgrowth(
    basket_France,
    min_support=0.1,
    use_colnames=True,
)
fpg.head(n=5)

Unnamed: 0,support,itemsets
0,0.765306,(POSTAGE)
1,0.181122,(RED TOADSTOOL LED NIGHT LIGHT)
2,0.158163,(ROUND SNACK BOXES SET OF4 WOODLAND)
3,0.125,(SPACEBOY LUNCH BOX)
4,0.104592,(MINI PAINT SET VINTAGE)


In [14]:
# Rules from the FP-Growth itemsets. The keyword arguments spell out
# association_rules' documented defaults: keep rules with confidence >= 0.8.
association_rules(fpg, metric="confidence", min_threshold=0.8)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(RED TOADSTOOL LED NIGHT LIGHT),(POSTAGE),0.181122,0.765306,0.158163,0.873239,1.141033,0.019549,1.851474
1,(ROUND SNACK BOXES SET OF4 WOODLAND),(POSTAGE),0.158163,0.765306,0.147959,0.935484,1.222366,0.026916,3.637755
2,(LUNCH BOX WITH CUTLERY RETROSPOT),(POSTAGE),0.142857,0.765306,0.114796,0.803571,1.05,0.005466,1.194805
3,(RED RETROSPOT MINI CASES),(POSTAGE),0.137755,0.765306,0.114796,0.833333,1.088889,0.009371,1.408163
4,(LUNCH BAG WOODLAND),(POSTAGE),0.117347,0.765306,0.102041,0.869565,1.136232,0.012234,1.79932
5,(PLASTERS IN TIN WOODLAND ANIMALS),(POSTAGE),0.170918,0.765306,0.137755,0.80597,1.053134,0.00695,1.209576
6,(PLASTERS IN TIN SPACEBOY),(POSTAGE),0.137755,0.765306,0.114796,0.833333,1.088889,0.009371,1.408163
7,(REGENCY CAKESTAND 3 TIER),(POSTAGE),0.125,0.765306,0.104592,0.836735,1.093333,0.008929,1.4375
8,(STRAWBERRY LUNCH BOX WITH CUTLERY),(POSTAGE),0.122449,0.765306,0.114796,0.9375,1.225,0.021085,3.755102
9,(SET/6 RED SPOTTY PAPER CUPS),(POSTAGE),0.137755,0.765306,0.117347,0.851852,1.113086,0.011922,1.584184
