In [None]:
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Directory that holds the Instacart CSV files. File names are appended to it with
# plain string concatenation, so the trailing slash is required.
# NOTE(review): this looks like a Google Drive mount path under Colab — confirm the
# drive is mounted (or point this at a local copy) before running.
root = '/content/drive/MyDrive/instacart-market-basket-analysis/'

In [None]:
# Read the four raw Instacart tables found under `root`.
# The prior/train order-line tables are stacked into one frame in a later cell;
# `products` supplies product names and aisle/department ids for the joins below.
(
    orders,
    order_products_prior,
    order_products_train,
    products,
) = (
    pd.read_csv(root + 'orders.csv'),
    pd.read_csv(root + 'order_products__prior.csv'),
    pd.read_csv(root + 'order_products__train.csv'),
    pd.read_csv(root + 'products.csv'),
)

In [None]:
# Stack the prior and train order lines into a single table of
# (order_id, product_id, add_to_cart_order, reordered) rows.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each input
# keeps its own row labels, so the same labels occur twice in the combined frame
# and label-based lookups become ambiguous.
order_products = pd.concat(
    [order_products_prior, order_products_train],
    ignore_index=True,
)
order_products.shape

(15440124, 4)

In [None]:
# Preview the combined order lines.
# NOTE(review): product_id, add_to_cart_order and reordered are shown as floats in
# the recorded output; pandas typically does that when an integer column contains
# missing values — confirm the source files were read completely.
order_products.head()


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120.0,1.0,1.0
1,2,28985.0,2.0,1.0
2,2,9327.0,3.0,0.0
3,2,45918.0,4.0,1.0
4,2,30035.0,5.0,0.0


In [None]:
# Number of distinct products that occur in the combined order lines.
order_products.product_id.nunique()


49337

In [None]:
# Rank products by how many order lines mention them, keep the 100 most frequent,
# and attach the product metadata (name, aisle, department) for readability.
product_counts = (
    order_products
    .groupby('product_id')['order_id']
    .count()
    .reset_index(name='frequency')
    .sort_values('frequency', ascending=False)
    .head(100)
    .reset_index(drop=True)
    .merge(products, how='left', on='product_id')
)
product_counts.head(10)

Unnamed: 0,product_id,frequency,product_name,aisle_id,department_id
0,24852.0,224191,Banana,24,4
1,13176.0,180061,Bag of Organic Bananas,24,4
2,21137.0,126192,Organic Strawberries,24,4
3,21903.0,114244,Organic Baby Spinach,123,4
4,47209.0,99913,Organic Hass Avocado,24,4
5,47766.0,83664,Organic Avocado,24,4
6,47626.0,74143,Large Lemon,24,4
7,16797.0,68289,Strawberries,24,4
8,26209.0,66933,Limes,24,4
9,27845.0,64718,Organic Whole Milk,84,16


In [None]:
# Product ids of the most frequent products, in the row order of product_counts.
freq_products = product_counts['product_id'].tolist()
# Preview of the list; the slice starts at position 1, so the first id is not shown.
freq_products[1:10]

[13176.0,
 21137.0,
 21903.0,
 47209.0,
 47766.0,
 47626.0,
 16797.0,
 26209.0,
 27845.0]

In [None]:
# Sanity check: how many product ids were kept as "frequent".
len(freq_products)

100

In [None]:
# Restrict the order lines to the frequent products only; this bounds the width of
# the order x product table built further down.
order_products = order_products.loc[order_products['product_id'].isin(freq_products)]
order_products.shape

(3555405, 4)

In [None]:
# Number of distinct orders that still contain at least one frequent product
# (it matches the row count of `basket` in the recorded output further down).
order_products.order_id.nunique()


1113110

In [None]:
# Attach product metadata (name, aisle, department) to every remaining order line.
# A left join keeps every order line even if its product_id has no row in `products`.
order_products = pd.merge(order_products, products, how='left', on='product_id')
order_products.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id
0,2,28985.0,2.0,1.0,Michigan Organic Kale,83,4
1,2,17794.0,6.0,1.0,Carrots,83,4
2,3,24838.0,2.0,1.0,Unsweetened Almondmilk,91,16
3,3,21903.0,4.0,1.0,Organic Baby Spinach,123,4
4,3,46667.0,6.0,1.0,Organic Ginger Root,83,4


In [None]:
# Pivot the order lines into an order x product table: one row per order_id, one
# column per product_name, holding how many lines of that product the order has
# (0.0 where the product does not occur in the order).
#
# size() counts the rows of each (order, product) pair regardless of missing values
# in other columns. Counting the `reordered` column instead would skip rows where
# `reordered` is NaN and record a purchased product as absent from the basket.
# unstack() already leaves order_id as the index, so no reset_index/set_index
# round trip (and the extra copies of this large frame) is needed around fillna.
basket = (
    order_products
    .groupby(['order_id', 'product_name'])
    .size()
    .unstack()
    .fillna(0)
)
basket.head()

product_name,100% Raw Coconut Water,100% Whole Wheat Bread,2% Reduced Fat Milk,Apple Honeycrisp Organic,Asparagus,Bag of Organic Bananas,Banana,Bartlett Pears,Blueberries,Boneless Skinless Chicken Breasts,...,Sparkling Natural Mineral Water,Sparkling Water Grapefruit,Spring Water,Strawberries,Uncured Genoa Salami,Unsalted Butter,Unsweetened Almondmilk,Unsweetened Original Almond Breeze Almond Milk,Whole Milk,Yellow Onions
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Release the large intermediate frames; the remaining visible cells only use `basket`.
# NOTE(review): `orders` was loaded earlier, is not used in the visible cells and is
# not released here — confirm whether it can be deleted as well.
del order_products_prior, order_products_train, order_products
del products, product_counts


In [None]:
def encode_units(x):
    """Map a per-order product quantity to a 0/1 presence flag.

    Args:
        x: Numeric quantity of one product in one order.

    Returns:
        1 if ``x`` is at least 1, otherwise 0. Values strictly between 0 and 1
        and NaN also map to 0, so every input yields an int and the function
        never falls through to an implicit ``None``.
    """
    # A single comparison covers every case: anything that is not >= 1
    # (including NaN, for which all comparisons are False) counts as absent.
    return 1 if x >= 1 else 0

# Binarise the basket: 1 if the order contains the product at least once, else 0.
# This yields the same 0/1 integers as applying encode_units to every cell, but as
# one vectorised comparison. DataFrame.applymap is deprecated in recent pandas and
# calls a Python function once per cell, which is slow on a table of this size.
basket = (basket >= 1).astype('int64')
basket.head()

  basket = basket.applymap(encode_units)


product_name,100% Raw Coconut Water,100% Whole Wheat Bread,2% Reduced Fat Milk,Apple Honeycrisp Organic,Asparagus,Bag of Organic Bananas,Banana,Bartlett Pears,Blueberries,Boneless Skinless Chicken Breasts,...,Sparkling Natural Mineral Water,Sparkling Water Grapefruit,Spring Water,Strawberries,Uncured Genoa Salami,Unsalted Butter,Unsweetened Almondmilk,Unsweetened Original Almond Breeze Almond Milk,Whole Milk,Yellow Onions
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
5,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Total number of cells in the basket table (rows x columns).
basket.size


111311000

In [None]:
# (number of orders, number of product columns)
basket.shape


(1113110, 100)

In [None]:
# Mine frequent itemsets: product combinations present in at least 1% of the baskets
# (min_support=0.01). use_colnames=True reports itemsets by product name rather than
# by column position.
# The indicator table is cast to bool at the call site: mlxtend asks for boolean
# input and warns about worse performance (and possible removal of support) for
# other dtypes. The 0/1 values map to False/True, so the itemsets are unchanged.
frequent_items = apriori(
    basket.astype(bool),
    min_support=0.01,
    use_colnames=True,
    low_memory=True,
)
frequent_items.head()



Unnamed: 0,support,itemsets
0,0.015855,(100% Raw Coconut Water)
1,0.025919,(100% Whole Wheat Bread)
2,0.015821,(2% Reduced Fat Milk)
3,0.035209,(Apple Honeycrisp Organic)
4,0.029382,(Asparagus)


In [None]:
# In the recorded output the first rows are single products and the last rows are
# product pairs, so tail() is a quick look at the multi-item sets.
frequent_items.tail()


Unnamed: 0,support,itemsets
124,0.01041,"(Organic Strawberries, Organic Blueberries)"
125,0.010872,"(Organic Hass Avocado, Organic Raspberries)"
126,0.017243,"(Organic Strawberries, Organic Hass Avocado)"
127,0.014712,"(Organic Strawberries, Organic Raspberries)"
128,0.010222,"(Organic Whole Milk, Organic Strawberries)"


In [None]:
# (number of frequent itemsets found, 2 columns: support and itemsets)
frequent_items.shape


(129, 2)

In [None]:
# Derive association rules from the frequent itemsets, keeping only rules whose lift
# meets the threshold of 1, then list them from the highest lift downwards.
rules = association_rules(
    frequent_items,
    metric='lift',
    min_threshold=1,
)
rules.sort_values(by='lift', ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
35,(Limes),(Large Lemon),0.060132,0.066609,0.012113,0.20144,3.024226,1.0,0.008108,1.168843,0.71216,0.105672,0.144453,0.191646
34,(Large Lemon),(Limes),0.066609,0.060132,0.012113,0.181851,3.024226,1.0,0.008108,1.148775,0.717102,0.105672,0.129507,0.191646
52,(Organic Strawberries),(Organic Raspberries),0.113369,0.058017,0.014712,0.129771,2.236777,1.0,0.008135,1.082454,0.623628,0.093902,0.076173,0.191676
53,(Organic Raspberries),(Organic Strawberries),0.058017,0.113369,0.014712,0.253581,2.236777,1.0,0.008135,1.187846,0.586983,0.093902,0.15814,0.191676
36,(Organic Avocado),(Large Lemon),0.075162,0.066609,0.010747,0.142989,2.14669,1.0,0.005741,1.089123,0.577579,0.082026,0.08183,0.152169
37,(Large Lemon),(Organic Avocado),0.066609,0.075162,0.010747,0.16135,2.14669,1.0,0.005741,1.10277,0.572286,0.082026,0.093193,0.152169
47,(Organic Blueberries),(Organic Strawberries),0.043487,0.113369,0.01041,0.239371,2.111437,1.0,0.005479,1.165655,0.550321,0.071081,0.142114,0.165596
46,(Organic Strawberries),(Organic Blueberries),0.113369,0.043487,0.01041,0.09182,2.111437,1.0,0.005479,1.05322,0.593695,0.071081,0.050531,0.165596
49,(Organic Raspberries),(Organic Hass Avocado),0.058017,0.08976,0.010872,0.187398,2.087766,1.0,0.005665,1.120155,0.553109,0.079415,0.107266,0.154262
48,(Organic Hass Avocado),(Organic Raspberries),0.08976,0.058017,0.010872,0.121125,2.087766,1.0,0.005665,1.071806,0.572398,0.079415,0.066996,0.154262
