# Grocery Product Recommendation using Apriori Algorithm

In [1]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import combinations, groupby
from collections import Counter

from apyori import apriori
from mlxtend import frequent_patterns 
from mlxtend.preprocessing import TransactionEncoder

#### [Dataset link from kaggle](https://www.kaggle.com/c/instacart-market-basket-analysis/data)

In [2]:
prior_order_df = pd.read_csv("./instacart-market-basket-analysis/order_products_prior.csv")
prior_order_df.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [3]:
# Over 32 million grocery orders from more than 200,000 Instacart users
# 50K unique items, 3.2M unique orders
prior_order_df.shape

(32434489, 4)

In [4]:
prior_order_df.isnull().sum()

order_id             0
product_id           0
add_to_cart_order    0
reordered            0
dtype: int64

In [5]:
products_df = pd.read_csv("./instacart-market-basket-analysis/products.csv")
products_df.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [6]:
products_df.isnull().sum()

product_id       0
product_name     0
aisle_id         0
department_id    0
dtype: int64

In [7]:
# Making required dataset
prior_order_df = prior_order_df.drop(columns = ["add_to_cart_order", "reordered"])
products_df = products_df.drop(columns = ["aisle_id","department_id"])

orders_df = prior_order_df.merge(products_df, on="product_id")
orders_df.head()

Unnamed: 0,order_id,product_id,product_name
0,2,33120,Organic Egg Whites
1,26,33120,Organic Egg Whites
2,120,33120,Organic Egg Whites
3,327,33120,Organic Egg Whites
4,390,33120,Organic Egg Whites


In [8]:
# Getting Only a subset to reduce computational cost
subset = orders_df[orders_df['order_id']<10000]

In [9]:
subset.shape

(94482, 3)

In [10]:
subset.head(10)

Unnamed: 0,order_id,product_id,product_name
0,2,33120,Organic Egg Whites
1,26,33120,Organic Egg Whites
2,120,33120,Organic Egg Whites
3,327,33120,Organic Egg Whites
4,390,33120,Organic Egg Whites
5,537,33120,Organic Egg Whites
6,582,33120,Organic Egg Whites
7,608,33120,Organic Egg Whites
8,623,33120,Organic Egg Whites
9,689,33120,Organic Egg Whites


In [11]:
subset[subset["order_id"] == 4]

Unnamed: 0,order_id,product_id,product_name
570685,4,46842,Plain Pre-Sliced Bagels
579585,4,26434,Honey/Lemon Cough Drops
579690,4,39758,Chewy 25% Low Sugar Chocolate Chip Granola
582039,4,27761,Oats & Chocolate Chewy Bars
584216,4,10054,Kellogg's Nutri-Grain Apple Cinnamon Cereal
585445,4,21351,Nutri-Grain Soft Baked Strawberry Cereal Break...
587798,4,22598,Kellogg's Nutri-Grain Blueberry Cereal
588556,4,34862,Tiny Twists Pretzels
593016,4,40285,Traditional Snack Mix
595238,4,17616,Goldfish Cheddar Baked Snack Crackers


In [12]:
subset.drop(columns = ["product_id"], inplace=True)
subset.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,order_id,product_name
0,2,Organic Egg Whites
1,26,Organic Egg Whites
2,120,Organic Egg Whites
3,327,Organic Egg Whites
4,390,Organic Egg Whites


In [13]:
subset["product_name"] = subset["product_name"].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [52]:
subset[subset["product_name"] == "Clementines"]

Unnamed: 0,order_id,product_name
1221921,5,Clementines
1221922,61,Clementines
1221923,152,Clementines
1221924,303,Clementines
1221925,315,Clementines
...,...,...
1221998,9187,Clementines
1221999,9251,Clementines
1222000,9273,Clementines
1222001,9696,Clementines


In [63]:
subset[subset["order_id"] == 61]

Unnamed: 0,order_id,product_name
1221922,61,Clementines
3976390,61,Organic Baby Carrots
4077283,61,Frozen Organic Blueberries
5716668,61,Sea Salt Brown Rice Crackers
7688048,61,Premium Grapefruit Large
7690230,61,Dairy Free Unsweetened Coconut Milk
7698306,61,Original Almondmilk
7702860,61,with Crispy Almonds Cereal
7707190,61,Organic Low Fat Milk
7715628,61,Half And Half Cream


In [14]:
subset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 94482 entries, 0 to 30093879
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   order_id      94482 non-null  int64 
 1   product_name  94482 non-null  object
dtypes: int64(1), object(1)
memory usage: 2.2+ MB


In [51]:
trans = subset.groupby(['order_id'], as_index = False).agg({'product_name': ','.join})
trans.head()

Unnamed: 0,order_id,product_name
0,2,"Organic Egg Whites,Michigan Organic Kale,Garli..."
1,3,Total 2% with Strawberry Lowfat Greek Strained...
2,4,"Plain Pre-Sliced Bagels,Honey/Lemon Cough Drop..."
3,5,"Bag of Organic Bananas,Just Crisp, Parmesan,Fr..."
4,6,"Cleanse,Dryer Sheets Geranium Scent,Clean Day ..."


In [16]:
# trans[trans["order_id"] == 2]["product_name"]
trans.iloc[1000,1]

'Lemonade,Limes,Red Hot Original Cayenne Pepper Sauce,Original No Pulp 100% Florida Orange Juice,Celery Sticks,The Original Worcestershire Sauce'

# Implementing Apriori algorithm using Apyori Package

In [17]:
trans_list = []
for i in range(trans.shape[0]):
    trans_list.append(trans["product_name"][i].strip().split(","))
    
trans_list

[['Organic Egg Whites',
  'Michigan Organic Kale',
  'Garlic Powder',
  'Coconut Butter',
  'Natural Sweetener',
  'Carrots',
  'Original Unflavored Gelatine Mix',
  'All Natural No Stir Creamy Almond Butter',
  'Classic Blend Cole Slaw'],
 ['Total 2% with Strawberry Lowfat Greek Strained Yogurt',
  'Unsweetened Almondmilk',
  'Lemons',
  'Organic Baby Spinach',
  'Unsweetened Chocolate Almond Breeze Almond Milk',
  'Organic Ginger Root',
  'Air Chilled Organic Boneless Skinless Chicken Breasts',
  'Organic Ezekiel 49 Bread Cinnamon Raisin'],
 ['Plain Pre-Sliced Bagels',
  'Honey/Lemon Cough Drops',
  'Chewy 25% Low Sugar Chocolate Chip Granola',
  'Oats & Chocolate Chewy Bars',
  "Kellogg's Nutri-Grain Apple Cinnamon Cereal",
  'Nutri-Grain Soft Baked Strawberry Cereal Breakfast Bars',
  "Kellogg's Nutri-Grain Blueberry Cereal",
  'Tiny Twists Pretzels',
  'Traditional Snack Mix',
  'Goldfish Cheddar Baked Snack Crackers',
  'Original Orange Juice',
  'Sugarfree Energy Drink',
  'Ener

In [18]:
len(trans_list)

9428

In [19]:
rules = apriori(trans_list, min_support=0.01, min_confidence=0.01, min_lift=1.8,
                min_length=2)

Results = list(rules)
Results

Total time required to run apriori algorithm using apyori package:  0.0


[RelationRecord(items=frozenset({' Bag', 'Clementines'}), support=0.011985574883326262, ordered_statistics=[OrderedStatistic(items_base=frozenset({' Bag'}), items_add=frozenset({'Clementines'}), confidence=0.8828125, lift=42.902867268041234), OrderedStatistic(items_base=frozenset({'Clementines'}), items_add=frozenset({' Bag'}), confidence=0.5824742268041236, lift=42.902867268041234)]),
 RelationRecord(items=frozenset({'Organic Baby Spinach', 'Bag of Organic Bananas'}), support=0.015803988120492152, ordered_statistics=[OrderedStatistic(items_base=frozenset({'Bag of Organic Bananas'}), items_add=frozenset({'Organic Baby Spinach'}), confidence=0.13104661389621813, lift=1.8010313058506482), OrderedStatistic(items_base=frozenset({'Organic Baby Spinach'}), items_add=frozenset({'Bag of Organic Bananas'}), confidence=0.21720116618075805, lift=1.8010313058506482)]),
 RelationRecord(items=frozenset({'Bag of Organic Bananas', 'Organic Hass Avocado'}), support=0.019304200254560883, ordered_statist

In [20]:
df_results = pd.DataFrame(Results)
df_results.head()

Unnamed: 0,items,support,ordered_statistics
0,"( Bag, Clementines)",0.011986,"[(( Bag), (Clementines), 0.8828125, 42.9028672..."
1,"(Organic Baby Spinach, Bag of Organic Bananas)",0.015804,"[((Bag of Organic Bananas), (Organic Baby Spin..."
2,"(Bag of Organic Bananas, Organic Hass Avocado)",0.019304,"[((Bag of Organic Bananas), (Organic Hass Avoc..."
3,"(Organic Raspberries, Bag of Organic Bananas)",0.01294,"[((Bag of Organic Bananas), (Organic Raspberri..."
4,"(Banana, Honeycrisp Apple)",0.010182,"[((Banana), (Honeycrisp Apple), 0.068181818181..."


In [64]:
df_results["ordered_statistics"][1]

[OrderedStatistic(items_base=frozenset({'Bag of Organic Bananas'}), items_add=frozenset({'Organic Baby Spinach'}), confidence=0.13104661389621813, lift=1.8010313058506482),
 OrderedStatistic(items_base=frozenset({'Organic Baby Spinach'}), items_add=frozenset({'Bag of Organic Bananas'}), confidence=0.21720116618075805, lift=1.8010313058506482)]

In [22]:
support = df_results["support"]

In [23]:
#all four empty list which will contain lhs, rhs, confidance and lift respectively.

first_values = []
second_values = []
third_values = []
fourth_value = []

# loop number of rows time and append 1 by 1 value in a separate list.. first and second element was frozenset which need to be converted in list..
for i in range(df_results.shape[0]):
    single_list = df_results['ordered_statistics'][i][0]
    first_values.append(list(single_list[0]))
    second_values.append(list(single_list[1]))
    third_values.append(single_list[2])
    fourth_value.append(single_list[3])

In [24]:
lhs = pd.DataFrame(first_values)
rhs= pd.DataFrame(second_values)
confidance=pd.DataFrame(third_values,columns=['Confidance'])
lift=pd.DataFrame(fourth_value,columns=['lift'])

In [25]:
lhs.head()

Unnamed: 0,0
0,Bag
1,Bag of Organic Bananas
2,Bag of Organic Bananas
3,Bag of Organic Bananas
4,Banana


In [26]:
rhs.head()

Unnamed: 0,0
0,Clementines
1,Organic Baby Spinach
2,Organic Hass Avocado
3,Organic Raspberries
4,Honeycrisp Apple


In [27]:
df_final = pd.concat([lhs,rhs,support,confidance,lift], axis=1)
df_final

Unnamed: 0,0,0.1,support,Confidance,lift
0,Bag,Clementines,0.011986,0.882812,42.902867
1,Bag of Organic Bananas,Organic Baby Spinach,0.015804,0.131047,1.801031
2,Bag of Organic Bananas,Organic Hass Avocado,0.019304,0.16007,2.469956
3,Bag of Organic Bananas,Organic Raspberries,0.01294,0.1073,2.491684
4,Banana,Honeycrisp Apple,0.010182,0.068182,2.581599
5,Banana,Organic Avocado,0.015592,0.104403,1.878464
6,Banana,Organic Fuji Apple,0.010395,0.069602,2.466956
7,Banana,Strawberries,0.013577,0.090909,1.993235
8,Organic Baby Spinach,Organic Strawberries,0.012516,0.172012,2.168083
9,Organic Hass Avocado,Organic Strawberries,0.011773,0.181669,2.289812


In [28]:
df_final.shape

(11, 5)

In [29]:
df_final.columns = [["antecedents","consequents","support","confidence","lift"]]

In [30]:
df_final

Unnamed: 0,antecedents,consequents,support,confidence,lift
0,Bag,Clementines,0.011986,0.882812,42.902867
1,Bag of Organic Bananas,Organic Baby Spinach,0.015804,0.131047,1.801031
2,Bag of Organic Bananas,Organic Hass Avocado,0.019304,0.16007,2.469956
3,Bag of Organic Bananas,Organic Raspberries,0.01294,0.1073,2.491684
4,Banana,Honeycrisp Apple,0.010182,0.068182,2.581599
5,Banana,Organic Avocado,0.015592,0.104403,1.878464
6,Banana,Organic Fuji Apple,0.010395,0.069602,2.466956
7,Banana,Strawberries,0.013577,0.090909,1.993235
8,Organic Baby Spinach,Organic Strawberries,0.012516,0.172012,2.168083
9,Organic Hass Avocado,Organic Strawberries,0.011773,0.181669,2.289812


# Implementing Apriori algorithm using Mlxtend package

In [31]:
subset.head(10)

Unnamed: 0,order_id,product_name
0,2,Organic Egg Whites
1,26,Organic Egg Whites
2,120,Organic Egg Whites
3,327,Organic Egg Whites
4,390,Organic Egg Whites
5,537,Organic Egg Whites
6,582,Organic Egg Whites
7,608,Organic Egg Whites
8,623,Organic Egg Whites
9,689,Organic Egg Whites


In [32]:
data = subset[subset['order_id']<10000]

In [33]:
te = TransactionEncoder()
te_array = te.fit(trans_list).transform(trans_list)

In [34]:
df_trans = pd.DataFrame(te_array, columns=te.columns_)
df_trans.head()

Unnamed: 0,Unnamed: 1,Apricot & Banana Stage 2 Baby Food,& Baby Wipes,& Cheese Biscuit,& Cheese Croissant,& Cheese Sandwiches,& Cheese Sauce,& Grape Ice Pops,& Peas,& Raisin,...,of Norwich Original English Mustard Powder Double Superfine,smartwater® Electrolyte Enhanced Water,vitaminwater® XXX Acai Blueberry Pomegranate,w/Banana Pulp Free Juice,with Crispy Almonds Cereal,with Olive Oil Mayonnaise,with Olive Oil Mayonnaise Dressing,with Xylitol Cinnamon 18 Sticks Sugar Free Gum,with Xylitol Minty Sweet Twist 18 Sticks Sugar Free Gum,Lightly Seasoned with Rosemary and Roasted Garlic Family Size Herb Chicken Tortellini
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [35]:
freq_itemsets = frequent_patterns.apriori(df_trans,min_support=0.01,use_colnames=True)
freq_itemsets

Unnamed: 0,support,itemsets
0,0.013577,( Bag)
1,0.015062,( Bunch)
2,0.021744,( Organic)
3,0.011986,( Strawberry)
4,0.012304,(100% Raw Coconut Water)
...,...,...
116,0.018562,"(Banana, Organic Strawberries)"
117,0.013577,"(Banana, Strawberries)"
118,0.012516,"(Organic Baby Spinach, Organic Strawberries)"
119,0.011773,"(Organic Strawberries, Organic Hass Avocado)"


In [36]:
rules = frequent_patterns.association_rules(freq_itemsets,metric='lift',min_threshold=1)
rules.sort_values(by='confidence',ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,( Bag),(Clementines),0.013577,0.020577,0.011986,0.882812,42.902867,0.011706,8.357743
1,(Clementines),( Bag),0.020577,0.013577,0.011986,0.582474,42.902867,0.011706,2.362545
11,(Honeycrisp Apple),(Banana),0.026411,0.149342,0.010182,0.385542,2.581599,0.006238,1.384404
17,(Organic Fuji Apple),(Banana),0.028214,0.149342,0.010395,0.368421,2.466956,0.006181,1.346875
6,(Organic Raspberries),(Bag of Organic Bananas),0.043063,0.120598,0.01294,0.300493,2.491684,0.007747,1.257173
5,(Organic Hass Avocado),(Bag of Organic Bananas),0.064807,0.120598,0.019304,0.297872,2.469956,0.011489,1.252481
23,(Strawberries),(Banana),0.045609,0.149342,0.013577,0.297674,1.993235,0.006765,1.211201
13,(Organic Avocado),(Banana),0.055579,0.149342,0.015592,0.280534,1.878464,0.007292,1.182346
29,(Organic Raspberries),(Organic Strawberries),0.043063,0.079338,0.010076,0.23399,2.949277,0.00666,1.201893
21,(Organic Strawberries),(Banana),0.079338,0.149342,0.018562,0.233957,1.566583,0.006713,1.110457


In [37]:
rules[(rules['confidence']>0.16) & (rules['lift']>2)]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,( Bag),(Clementines),0.013577,0.020577,0.011986,0.882812,42.902867,0.011706,8.357743
1,(Clementines),( Bag),0.020577,0.013577,0.011986,0.582474,42.902867,0.011706,2.362545
4,(Bag of Organic Bananas),(Organic Hass Avocado),0.120598,0.064807,0.019304,0.16007,2.469956,0.011489,1.113418
5,(Organic Hass Avocado),(Bag of Organic Bananas),0.064807,0.120598,0.019304,0.297872,2.469956,0.011489,1.252481
6,(Organic Raspberries),(Bag of Organic Bananas),0.043063,0.120598,0.01294,0.300493,2.491684,0.007747,1.257173
11,(Honeycrisp Apple),(Banana),0.026411,0.149342,0.010182,0.385542,2.581599,0.006238,1.384404
17,(Organic Fuji Apple),(Banana),0.028214,0.149342,0.010395,0.368421,2.466956,0.006181,1.346875
24,(Organic Baby Spinach),(Organic Strawberries),0.072762,0.079338,0.012516,0.172012,2.168083,0.006743,1.111926
27,(Organic Hass Avocado),(Organic Strawberries),0.064807,0.079338,0.011773,0.181669,2.289812,0.006632,1.125049
29,(Organic Raspberries),(Organic Strawberries),0.043063,0.079338,0.010076,0.23399,2.949277,0.00666,1.201893


In [40]:
rules.antecedents.values

array([frozenset({' Bag'}), frozenset({'Clementines'}),
       frozenset({'Organic Baby Spinach'}),
       frozenset({'Bag of Organic Bananas'}),
       frozenset({'Bag of Organic Bananas'}),
       frozenset({'Organic Hass Avocado'}),
       frozenset({'Organic Raspberries'}),
       frozenset({'Bag of Organic Bananas'}),
       frozenset({'Organic Strawberries'}),
       frozenset({'Bag of Organic Bananas'}), frozenset({'Banana'}),
       frozenset({'Honeycrisp Apple'}), frozenset({'Banana'}),
       frozenset({'Organic Avocado'}), frozenset({'Banana'}),
       frozenset({'Organic Baby Spinach'}), frozenset({'Banana'}),
       frozenset({'Organic Fuji Apple'}), frozenset({'Banana'}),
       frozenset({'Organic Hass Avocado'}), frozenset({'Banana'}),
       frozenset({'Organic Strawberries'}), frozenset({'Banana'}),
       frozenset({'Strawberries'}), frozenset({'Organic Baby Spinach'}),
       frozenset({'Organic Strawberries'}),
       frozenset({'Organic Strawberries'}),
       fro

In [44]:
rules[rules["antecedents"] == frozenset({'Bag of Organic Bananas'})]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
3,(Bag of Organic Bananas),(Organic Baby Spinach),0.120598,0.072762,0.015804,0.131047,1.801031,0.007029,1.067075
4,(Bag of Organic Bananas),(Organic Hass Avocado),0.120598,0.064807,0.019304,0.16007,2.469956,0.011489,1.113418
7,(Bag of Organic Bananas),(Organic Raspberries),0.120598,0.043063,0.01294,0.1073,2.491684,0.007747,1.071958
9,(Bag of Organic Bananas),(Organic Strawberries),0.120598,0.079338,0.017183,0.14248,1.79586,0.007615,1.073633
