## Apriori Algorithm — Know How to Find Frequent Itemsets

##### Import the packages

In [2]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

##### Read the dataset into a DataFrame

In [3]:
# Load the Online Retail II workbook into a DataFrame and preview its first rows.
df = pd.read_excel('data/online_retail_II.xlsx')
df.head(5)

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom


##### List the column names of the DataFrame

In [4]:
# Display the column labels of the loaded DataFrame.
df.columns

Index(['Invoice', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'Price', 'Customer ID', 'Country'],
      dtype='object')

##### Show the unique countries in the DataFrame

In [5]:
# Distinct values of the Country column, in order of first appearance.
df['Country'].unique()

array(['United Kingdom', 'France', 'USA', 'Belgium', 'Australia', 'EIRE',
       'Germany', 'Portugal', 'Japan', 'Denmark', 'Nigeria',
       'Netherlands', 'Poland', 'Spain', 'Channel Islands', 'Italy',
       'Cyprus', 'Greece', 'Norway', 'Austria', 'Sweden',
       'United Arab Emirates', 'Finland', 'Switzerland', 'Unspecified',
       'Malta', 'Bahrain', 'RSA', 'Bermuda', 'Hong Kong', 'Singapore',
       'Thailand', 'Israel', 'Lithuania', 'West Indies', 'Lebanon',
       'Korea', 'Brazil', 'Canada', 'Iceland'], dtype=object)

##### Count the null values in each column

In [7]:
# Per-column count of missing values in the raw data.
df.isna().sum()

Invoice             0
StockCode           0
Description      2928
Quantity            0
InvoiceDate         0
Price               0
Customer ID    107927
Country             0
dtype: int64

##### The data set also contains entries for items returned to the store. The invoice numbers of these entries begin with the letter "C", indicating that they are credit entries. There are also a few "Adjust bad debt" entries, whose invoice numbers begin with the letter "A". These entries are extraneous to our sales analysis, so we'll clean the data set by excluding them from the DataFrame.

In [13]:
# Number of rows before any cleaning.
df.shape[0]

525461

In [9]:
# Cast Invoice to text so the .str accessor can be used on it, then preview
# the rows whose invoice number contains "C" (the credit / return entries).
df['Invoice'] = df['Invoice'].astype('str')
df.loc[df['Invoice'].str.contains('C')].head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
178,C489449,22087,PAPER BUNTING WHITE LACE,-12,2009-12-01 10:33:00,2.95,16321.0,Australia
179,C489449,85206A,CREAM FELT EASTER EGG BASKET,-6,2009-12-01 10:33:00,1.65,16321.0,Australia
180,C489449,21895,POTTING SHED SOW 'N' GROW SET,-4,2009-12-01 10:33:00,4.25,16321.0,Australia
181,C489449,21896,POTTING SHED TWINE,-6,2009-12-01 10:33:00,2.1,16321.0,Australia
182,C489449,22083,PAPER CHAIN KIT RETRO SPOT,-12,2009-12-01 10:33:00,2.95,16321.0,Australia


In [11]:
# Show every row whose invoice number contains "A" (the bad-debt adjustments).
# Relies on Invoice already having been cast to str in an earlier cell.
df.loc[df['Invoice'].str.contains('A')]

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
179403,A506401,B,Adjust bad debt,1,2010-04-29 13:36:00,-53594.36,,United Kingdom
276274,A516228,B,Adjust bad debt,1,2010-07-19 11:24:00,-44031.79,,United Kingdom
403472,A528059,B,Adjust bad debt,1,2010-10-20 12:04:00,-38925.87,,United Kingdom


##### We'll need to remove the entries with a "C" or an "A" in the invoice number.

In [None]:
# Keep only regular sales: drop every row whose invoice number contains "C"
# (credit / return) or "A" (bad-debt adjustment).
# The explicit .copy() makes the filtered result an independent DataFrame, so
# later in-place operations on `df` do not raise SettingWithCopyWarning.
df = df[~df['Invoice'].str.contains('C|A')].copy()
len(df)

515252

In [17]:
# Re-check the per-column missing-value counts after removing C/A invoices.
df.isna().sum()

Invoice             0
StockCode           0
Description      2928
Quantity            0
InvoiceDate         0
Price               0
Customer ID    107557
Country             0
dtype: int64

##### Drop rows with null values from the DataFrame

In [42]:
# Remove every row that has a null in any column (per the counts above, the
# nulls are in Description and Customer ID).
# Rebinding the result instead of using inplace=True avoids mutating a
# filtered frame in place, which emits SettingWithCopyWarning and offers no
# performance benefit.
df = df.dropna()

In [44]:
# Confirm that no missing values remain in any column.
df.isna().sum()

Invoice        0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
Price          0
Customer ID    0
Country        0
dtype: int64

##### Grouping data for specific countries

In [48]:
# Restrict to invoices from Denmark, Germany and Italy, then total the
# quantity of each item (Description) within each invoice. The result is a
# Series indexed by (Invoice, Description).
subset_df = (
    df.loc[df['Country'].isin(['Denmark', 'Germany', 'Italy'])]
    .groupby(['Invoice', 'Description'])['Quantity']
    .sum()
)
subset_df.head()

Invoice  Description                     
489526   6 RIBBONS RUSTIC CHARM              12
         ANTIQUE SILVER TEA GLASS ETCHED     12
         BIRD DECORATION RED SPOT            24
         BLUE/BROWN DOTS RUFFLED UMBRELLA     3
         EDWARDIAN PARASOL BLACK              6
Name: Quantity, dtype: int64