## Apriori Algorithm — Know How to Find Frequent Itemsets

##### Import the packages

In [1]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

##### Read the dataset into dataframe

In [2]:
df = pd.read_excel('data/online_retail_II.xlsx')
df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom


##### list of column names from dataframe

In [3]:
df.columns

Index(['Invoice', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'Price', 'Customer ID', 'Country'],
      dtype='object')

##### Show unique country from dataframe

In [4]:
df.Country.unique()

array(['United Kingdom', 'France', 'USA', 'Belgium', 'Australia', 'EIRE',
       'Germany', 'Portugal', 'Japan', 'Denmark', 'Nigeria',
       'Netherlands', 'Poland', 'Spain', 'Channel Islands', 'Italy',
       'Cyprus', 'Greece', 'Norway', 'Austria', 'Sweden',
       'United Arab Emirates', 'Finland', 'Switzerland', 'Unspecified',
       'Malta', 'Bahrain', 'RSA', 'Bermuda', 'Hong Kong', 'Singapore',
       'Thailand', 'Israel', 'Lithuania', 'West Indies', 'Lebanon',
       'Korea', 'Brazil', 'Canada', 'Iceland'], dtype=object)

##### Check null value of each col.

In [5]:
df.isnull().sum()

Invoice             0
StockCode           0
Description      2928
Quantity            0
InvoiceDate         0
Price               0
Customer ID    107927
Country             0
dtype: int64

##### Drop null value from dataframe

In [6]:
df.dropna(inplace=True)

In [7]:
df.isnull().sum()

Invoice        0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
Price          0
Customer ID    0
Country        0
dtype: int64

##### You'll notice that the data set also contains entries for items returned to the store. The invoice numbers of these entries will begin with the letter "C" indicating that these are credit entries. You will also see a few entries for "Adjusting Bad Debt." The invoice number for such entries will begin with the letter "A." These entries are extraneous to our sales analysis so we'll clean the data set to exclude these entries from the dataframe.

In [8]:
len(df)

417534

In [9]:
#Identify entries with "C" in the invoice number
df['Invoice'] = df['Invoice'].astype('str')
df[df['Invoice'].str.contains('C')].head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
178,C489449,22087,PAPER BUNTING WHITE LACE,-12,2009-12-01 10:33:00,2.95,16321.0,Australia
179,C489449,85206A,CREAM FELT EASTER EGG BASKET,-6,2009-12-01 10:33:00,1.65,16321.0,Australia
180,C489449,21895,POTTING SHED SOW 'N' GROW SET,-4,2009-12-01 10:33:00,4.25,16321.0,Australia
181,C489449,21896,POTTING SHED TWINE,-6,2009-12-01 10:33:00,2.1,16321.0,Australia
182,C489449,22083,PAPER CHAIN KIT RETRO SPOT,-12,2009-12-01 10:33:00,2.95,16321.0,Australia


In [10]:
#Identify entries with "A" in the invoice number
df[df['Invoice'].str.contains('A')]

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country


##### We'll need to remove the entries with a "C" or an "A" in the invoice number.

In [11]:
df = df[~df['Invoice'].str.contains('C|A')]
len(df)

407695

In [12]:
df.isnull().sum()

Invoice        0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
Price          0
Customer ID    0
Country        0
dtype: int64

##### Grouping data for specific countries

In [13]:
subset_df = df[df['Country'].isin(['Denmark', 'Germany', 'Italy'])]
subset_df = subset_df.groupby(['Invoice', 'Description'])['Quantity'].sum()
subset_df.head()

Invoice  Description                     
489526   6 RIBBONS RUSTIC CHARM              12
         ANTIQUE SILVER TEA GLASS ETCHED     12
         BIRD DECORATION RED SPOT            24
         BLUE/BROWN DOTS RUFFLED UMBRELLA     3
         EDWARDIAN PARASOL BLACK              6
Name: Quantity, dtype: int64

##### Next, we'll pivot the dataframe so that each unique entry in the "Description" becomes a column and each unique "InvoiceNo" becomes a row.

In [14]:
#Unpack the data
subset_df = subset_df.unstack().reset_index()
subset_df.head()

Description,Invoice,3 STRIPEY MICE FELTCRAFT,CHERRY BLOSSOM DECORATIVE FLASK,FLAMINGO LIGHTS,RED/WHITE DOT MINI CASES,SET 2 TEA TOWELS I LOVE LONDON,WHITE CHERRY LIGHTS,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 IVORY ROSE PEG PLACE SETTINGS,...,YELLOW PURPLE DAISY FELT PURSE KIT,YELLOW SHARK HELICOPTER,YULETIDE IMAGES GIFT WRAP SET,ZINC FOLKART SLEIGH BELLS,ZINC HEART LATTICE DOUBLE PLANTER,ZINC HEART LATTICE PLANTER BOWL,ZINC HEART LATTICE TRAY OVAL,ZINC METAL HEART DECORATION,ZINC TOP 2 DOOR WOODEN SHELF,ZINC WILLIE WINKIE CANDLE STICK
0,489526,,,,,,,,,,...,,,,,,,,24.0,,24.0
1,489831,,,,,,,,,,...,,,,,,,,,,
2,490395,,,,,,,,,,...,,,,,,,,,,
3,490563,,,,,,,,,,...,,,,,,,,,,
4,490564,6.0,,,,,,,,,...,,,,,,,,,,


##### replace all the NaN values with zero and make the invoice number the dataframe's index

In [15]:
subset_df = subset_df.fillna(0)
subset_df = subset_df.set_index('Invoice')
subset_df.head()

Description,3 STRIPEY MICE FELTCRAFT,CHERRY BLOSSOM DECORATIVE FLASK,FLAMINGO LIGHTS,RED/WHITE DOT MINI CASES,SET 2 TEA TOWELS I LOVE LONDON,WHITE CHERRY LIGHTS,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 IVORY ROSE PEG PLACE SETTINGS,12 MESSAGE CARDS WITH ENVELOPES,...,YELLOW PURPLE DAISY FELT PURSE KIT,YELLOW SHARK HELICOPTER,YULETIDE IMAGES GIFT WRAP SET,ZINC FOLKART SLEIGH BELLS,ZINC HEART LATTICE DOUBLE PLANTER,ZINC HEART LATTICE PLANTER BOWL,ZINC HEART LATTICE TRAY OVAL,ZINC METAL HEART DECORATION,ZINC TOP 2 DOOR WOODEN SHELF,ZINC WILLIE WINKIE CANDLE STICK
Invoice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
489526,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.0,0.0,24.0
489831,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
490395,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
490563,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
490564,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


##### In the bottom-left corner of the output table, the column count of 1824 includes a column titled "POSTAGE." POSTAGE entries indicate that the items listed in the invoice have been booked to be sent via postal services. Since POSTAGE is a hidden column in the output table, we will print the contents of the column to verify its presence

In [16]:
subset_df.head()['POSTAGE']

Invoice
489526    6.0
489831    0.0
490395    2.0
490563    0.0
490564    7.0
Name: POSTAGE, dtype: float64

In [17]:
subset_df = subset_df.drop("POSTAGE", axis=1)

##### The Apriori algorithm requires this data to be hot encoded, which means all entries with a non-zero value should be true and all entries with a zero value should be false.

In [18]:
#Encode the data
def hot_encode(x):
    if(x<= 0):
        return False
    else:
        return True

subset_df = subset_df.applymap(hot_encode)
subset_df.head()

  subset_df = subset_df.applymap(hot_encode)


Description,3 STRIPEY MICE FELTCRAFT,CHERRY BLOSSOM DECORATIVE FLASK,FLAMINGO LIGHTS,RED/WHITE DOT MINI CASES,SET 2 TEA TOWELS I LOVE LONDON,WHITE CHERRY LIGHTS,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 IVORY ROSE PEG PLACE SETTINGS,12 MESSAGE CARDS WITH ENVELOPES,...,YELLOW PURPLE DAISY FELT PURSE KIT,YELLOW SHARK HELICOPTER,YULETIDE IMAGES GIFT WRAP SET,ZINC FOLKART SLEIGH BELLS,ZINC HEART LATTICE DOUBLE PLANTER,ZINC HEART LATTICE PLANTER BOWL,ZINC HEART LATTICE TRAY OVAL,ZINC METAL HEART DECORATION,ZINC TOP 2 DOOR WOODEN SHELF,ZINC WILLIE WINKIE CANDLE STICK
Invoice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
489526,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,True
489831,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
490395,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
490563,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
490564,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


##### Apply Aprioiri Algorithm

##### You can indicate a required minimum support threshold when applying the Apriori algorithm. This means that any item or item set with "support" less than the specified minimum support will be considered infrequent. You can set a minimum support threshold using the argument min_support, which will filter out the less frequently purchased items. The smaller the minimum support threshold value, the more confidently the algorithm will identify relations. For this tutorial, we'll set the min_support value at "0.05."

In [19]:
#Apply the Apriori algorithm
frq_items = apriori(subset_df, min_support=0.05, use_colnames=True)
frq_items.sort_values(['support'], ascending=[False])

Unnamed: 0,support,itemsets
41,0.261845,(ROUND SNACK BOXES SET OF4 WOODLAND )
55,0.197007,(WOODLAND CHARLOTTE BAG)
35,0.172070,(RED TOADSTOOL LED NIGHT LIGHT)
40,0.169576,(ROUND SNACK BOXES SET OF 4 FRUITS )
70,0.127182,"(ROUND SNACK BOXES SET OF 4 FRUITS , ROUND SNA..."
...,...,...
52,0.052369,(TEA BAG PLATE RED SPOTTY )
59,0.052369,"(STRAWBERRY CERAMIC TRINKET BOX, BIG DOUGHNUT ..."
58,0.052369,(WOODLAND WATER TRANSFER TATTOOS )
66,0.052369,"(WOODLAND CHARLOTTE BAG, RED SPOTTY CHARLOTTE ..."


##### Calucate the lift to quantifies the strength of association between the two items. It assesses whether there is enough evidence of a real relationship between the items or if their co-occurrence is merely coincidental.

##### Let's say we have two items, A and B. The lift of the antecedent 'A' to the consequent 'B' is calculated as Support(A ∪ B) / (Support(A) * Support(B)). This could be described as "the support of A given B divided by the support for A times the support for B". We interpret the values like this:

- **Lift = 1**:  
  - A and B are completely independent.  
  - In retail data, this indicates that buying A has no effect on buying B.

- **Lift > 1**:  
  - A and B are positively correlated.  
  - In retail data, this indicates that customers tend to buy A and B together.

- **Lift < 1**:  
  - A and B are negatively correlated.  
  - In retail data, this indicates that customers tend NOT to buy A and B together.


In [20]:
rules=association_rules(frq_items, metric='lift', min_threshold=1,num_itemsets=len(subset_df))
rules.sort_values(['lift'], ascending=[False])

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(STRAWBERRY CERAMIC TRINKET BOX),(BIG DOUGHNUT FRIDGE MAGNETS),0.082294,0.084788,0.052369,0.636364,7.505348,1.0,0.045392,2.516833,0.944488,0.456522,0.602675,0.627005
1,(BIG DOUGHNUT FRIDGE MAGNETS),(STRAWBERRY CERAMIC TRINKET BOX),0.084788,0.082294,0.052369,0.617647,7.505348,1.0,0.045392,2.400153,0.947061,0.456522,0.58336,0.627005
16,(RED SPOTTY PLATE ),(RED SPOTTY CUP),0.074813,0.114713,0.062344,0.833333,7.264493,1.0,0.053762,5.311721,0.932075,0.490196,0.811737,0.688406
17,(RED SPOTTY CUP),(RED SPOTTY PLATE ),0.114713,0.074813,0.062344,0.543478,7.264493,1.0,0.053762,2.0266,0.974085,0.490196,0.506563,0.688406
12,(RED SPOTTY BOWL),(RED SPOTTY CUP),0.077307,0.114713,0.062344,0.806452,7.030154,1.0,0.053476,4.573982,0.929622,0.480769,0.781372,0.674965
13,(RED SPOTTY CUP),(RED SPOTTY BOWL),0.114713,0.077307,0.062344,0.543478,7.030154,1.0,0.053476,2.021138,0.968901,0.480769,0.505229,0.674965
4,(PINK SPOTTY CUP),(RED SPOTTY CUP),0.067332,0.114713,0.052369,0.777778,6.780193,1.0,0.044645,3.983791,0.914057,0.403846,0.748983,0.61715
5,(RED SPOTTY CUP),(PINK SPOTTY CUP),0.114713,0.067332,0.052369,0.456522,6.780193,1.0,0.044645,1.71611,0.962978,0.403846,0.417287,0.61715
3,(RED SPOTTY CUP),(BLUE SPOTTY CUP),0.114713,0.072319,0.054863,0.478261,6.613193,1.0,0.046567,1.778055,0.958771,0.415094,0.437588,0.618441
2,(BLUE SPOTTY CUP),(RED SPOTTY CUP),0.072319,0.114713,0.054863,0.758621,6.613193,1.0,0.046567,3.667617,0.914956,0.415094,0.727343,0.618441


##### The output displays the association between the different item sets. Here, "antecedents" represent the items likely to be picked up first, and "consequents" represent the items picked up because of that choice.