In [15]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [16]:
# Load the Online Retail transactions from the Excel workbook into a
# DataFrame and preview the first rows
data = pd.read_excel('Online Retail.xlsx')
data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [17]:
# Print the (rows, columns) shape of the frame, then display its column labels
print(data.shape)
data.columns

(541909, 8)


Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country'],
      dtype='object')

In [18]:
# List the distinct values of the Country column, i.e. the regions in which
# transactions were recorded
data['Country'].unique()

array(['United Kingdom', 'France', 'Australia', 'Netherlands', 'Germany',
       'Norway', 'EIRE', 'Switzerland', 'Spain', 'Poland', 'Portugal',
       'Italy', 'Belgium', 'Lithuania', 'Japan', 'Iceland',
       'Channel Islands', 'Denmark', 'Cyprus', 'Sweden', 'Austria',
       'Israel', 'Finland', 'Bahrain', 'Greece', 'Hong Kong', 'Singapore',
       'Lebanon', 'United Arab Emirates', 'Saudi Arabia',
       'Czech Republic', 'Canada', 'Unspecified', 'Brazil', 'USA',
       'European Community', 'Malta', 'RSA'], dtype=object)

In [19]:
# Trim leading/trailing whitespace from the product descriptions,
# then show the frame's shape
data['Description'] = data['Description'].str.strip()
data.shape

(541909, 8)

In [20]:
# Clean the transactions as one non-mutating chain (no `inplace=True`):
#   1. drop the rows without any invoice number,
#   2. strip extra spaces in the description,
#   3. store the invoice numbers as strings.
# `assign` works on a copy, so the result is a fresh frame bound to `data`.
data = (
    data
    .dropna(axis=0, subset=['InvoiceNo'])
    .assign(
        Description=lambda df: df['Description'].str.strip(),
        InvoiceNo=lambda df: df['InvoiceNo'].astype('str'),
    )
)
# NOTE: the filter below (described by the original author as dropping all
# transactions done on credit) is disabled, so invoices whose number
# contains 'C' remain in `data`.
#data = data[~data['InvoiceNo'].str.contains('C')]
print(data.shape)


(541909, 8)


In [21]:
# Display the frame after the cleaning step
data

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680.0,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France


In [22]:
# Transactions done in Sweden: build an invoice-by-product table holding the
# summed quantity of each description per invoice; combinations that do not
# occur are filled with 0
basket_Sweden = (
    data.loc[data['Country'].eq("Sweden")]
    .groupby(['InvoiceNo', 'Description'])['Quantity']
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('InvoiceNo')
)

In [23]:
# Defining the hot encoding function to make the data suitable
# for the concerned libraries
def hot_encode(x):
    """Map a summed quantity to a 0/1 presence flag.

    Parameters
    ----------
    x : int or float
        Quantity value from the basket table.

    Returns
    -------
    int
        1 when ``x`` is at least 1, otherwise 0.

    Notes
    -----
    Every input yields 0 or 1. Values strictly between 0 and 1, and NaN
    (for which every comparison is False), map to 0 rather than falling
    through and returning ``None``.
    """
    return 1 if x >= 1 else 0

# Apply hot_encode to every cell of the Sweden basket and rebind both names
# to the encoded table.
# NOTE(review): `DataFrame.applymap` is deprecated from pandas 2.1 in favour
# of `DataFrame.map`; switch once the minimum supported pandas allows it.
basket_encoded = basket_Sweden = basket_Sweden.applymap(hot_encode)


In [24]:
# Display the encoded invoice-by-product table for Sweden
basket_Sweden

Description,12 PENCIL SMALL TUBE WOODLAND,12 PENCILS SMALL TUBE RED RETROSPOT,12 PENCILS SMALL TUBE SKULL,12 PENCILS TALL TUBE WOODLAND,3 PIECE SPACEBOY COOKIE CUTTER SET,3 RAFFIA RIBBONS 50'S CHRISTMAS,3 RAFFIA RIBBONS VINTAGE CHRISTMAS,3 TIER CAKE TIN RED AND CREAM,3 TRADITIONAl BISCUIT CUTTERS SET,36 DOILIES DOLLY GIRL,...,WOODEN STAR CHRISTMAS SCANDINAVIAN,WOODEN TREE CHRISTMAS SCANDINAVIAN,WOODLAND CHARLOTTE BAG,WOODLAND SMALL RED FELT HEART,WORLD WAR 2 GLIDERS ASSTD DESIGNS,WRAP VINTAGE DOILY,WRAP ALPHABET DESIGN,WRAP DOLLY GIRL,WRAP RED VINTAGE DOILY,ZINC WILLIE WINKIE CANDLE STICK
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
538848,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
539338,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
540040,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
542428,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
542911,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
546161,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
546530,0,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
547645,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
550277,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
551278,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# Mine the frequent itemsets from the encoded Sweden basket, keeping those
# with support of at least 0.05 and labelling them by column name
frq_items = apriori(basket_Sweden, min_support=0.05, use_colnames=True)

# Derive the association rules (lift threshold 1) and order them by
# confidence, then lift, both descending
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=[False, False])
)
print(rules.head())




                         antecedents                     consequents  \
12       (TREASURE TIN BUFFALO BILL)       (BUBBLEGUM RING ASSORTED)   
13         (BUBBLEGUM RING ASSORTED)     (TREASURE TIN BUFFALO BILL)   
14    (TREASURE TIN GYMKHANA DESIGN)       (BUBBLEGUM RING ASSORTED)   
15         (BUBBLEGUM RING ASSORTED)  (TREASURE TIN GYMKHANA DESIGN)   
32  (MAGIC DRAWING SLATE DOLLY GIRL)    (MAGIC DRAWING SLATE PURDEY)   

    antecedent support  consequent support   support  confidence       lift  \
12            0.065217            0.065217  0.065217         1.0  15.333333   
13            0.065217            0.065217  0.065217         1.0  15.333333   
14            0.065217            0.065217  0.065217         1.0  15.333333   
15            0.065217            0.065217  0.065217         1.0  15.333333   
32            0.065217            0.065217  0.065217         1.0  15.333333   

    leverage  conviction  
12  0.060964         inf  
13  0.060964         inf  
14  0.06096

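# In the top rules shown above, the antecedent support, consequent support and rule support are all equal (0.065217): each pair of items only ever appears together, which is why the confidence is 1.0.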