Download the groceries dataset. Write a python program to read the dataset and display its 
information. Preprocess the data (drop null values etc.) Convert the categorical values into numeric 
format. Apply the apriori algorithm on the above dataset to generate the frequent itemsets and association 
rules

In [8]:

# importing the libraries
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from csv import reader
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules


In [10]:
# reading the dataset
groceries = []
with open('groceries.csv', 'r') as read_obj:
    csv_reader = reader(read_obj)
    for row in csv_reader:
        groceries.append(row)


In [13]:
items = set(sum(groceries, []))
df = pd.DataFrame(columns=items)
print(df)

Empty DataFrame
Columns: [sparkling wine, cream cheese , frozen potato products, fish, specialty cheese, dog food, tea, frankfurter, onions, skin care, red/blush wine, chicken, pastry, newspapers, soups, flower (seeds), toilet cleaner, honey, detergent, ham, bags, bottled beer, shopping bags, specialty chocolate, soft cheese, prosecco, curd cheese, napkins, specialty fat, fruit/vegetable juice, domestic eggs, tidbits, salt, yogurt, snack products, candles, house keeping products, hair spray, ready soups, other vegetables, pet care, chocolate marshmallow, canned vegetables, frozen fruits, softener, sauces, hard cheese, organic sausage, white wine, sound storage medium, cream, UHT-milk, brandy, packaged fruit/vegetables, sugar, specialty vegetables, curd, potato products, chewing gum, light bulbs, frozen vegetables, canned beer, salty snack, dish cleaner, liver loaf, semi-finished bread, herbs, hamburger meat, tropical fruit, zwieback, flour, mustard, whisky, frozen fish, liquor, citrus 

In [14]:
# fitting the list and converting the transactions to true and false
encoder = TransactionEncoder()
transactions = encoder.fit(groceries).transform(groceries)

In [15]:
# converting the true and false to 1 and 0
transactions = transactions.astype('int')

In [16]:
# converting the transactions array to a datafrmae
df = pd.DataFrame(transactions, columns=encoder.columns_)

In [17]:
# viewing the first few rows of the dataframe
df.head()


Unnamed: 0,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,baby food,bags,baking powder,bathroom cleaner,beef,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [18]:
# How many transactions and items are there in the data set?

df.shape


(9835, 169)

In [19]:
# applying the apriori algorithm
frequent_itemsets = apriori(df, min_support=0.02, use_colnames=True)
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
frequent_itemsets



Unnamed: 0,support,itemsets,length
0,0.033452,(UHT-milk),1
1,0.052466,(beef),1
2,0.033249,(berries),1
3,0.026029,(beverages),1
4,0.080529,(bottled beer),1
...,...,...,...
117,0.032232,"(whipped/sour cream, whole milk)",2
118,0.020742,"(whipped/sour cream, yogurt)",2
119,0.056024,"(yogurt, whole milk)",2
120,0.023183,"(root vegetables, other vegetables, whole milk)",3


In [21]:
# sorting the dataframe
frequent_itemsets = frequent_itemsets.sort_values(by='support', ascending=False)

print(frequent_itemsets)

     support                         itemsets  length
57  0.255516                     (whole milk)       1
39  0.193493               (other vegetables)       1
43  0.183935                     (rolls/buns)       1
49  0.174377                           (soda)       1
58  0.139502                         (yogurt)       1
..       ...                              ...     ...
75  0.020539        (frankfurter, whole milk)       2
60  0.020437       (bottled beer, whole milk)       2
76  0.020437  (whole milk, frozen vegetables)       2
96  0.020437      (tropical fruit, pip fruit)       2
67  0.020031       (butter, other vegetables)       2

[122 rows x 3 columns]


In [22]:
# finding top 5 items with minimum support of 2%
frequent_itemsets[ (frequent_itemsets['length'] == 1) &
                   (frequent_itemsets['support'] >= 0.02) ][0:5]

Unnamed: 0,support,itemsets,length
57,0.255516,(whole milk),1
39,0.193493,(other vegetables),1
43,0.183935,(rolls/buns),1
49,0.174377,(soda),1
58,0.139502,(yogurt),1


In [23]:
# finding itemsets having length 2 and minimum support of 2%
frequent_itemsets[(frequent_itemsets['length'] == 2) & 
                  (frequent_itemsets['support'] >= 0.02)]

Unnamed: 0,support,itemsets,length
91,0.074835,"(other vegetables, whole milk)",2
103,0.056634,"(whole milk, rolls/buns)",2
119,0.056024,"(yogurt, whole milk)",2
106,0.048907,"(root vegetables, whole milk)",2
85,0.047382,"(root vegetables, other vegetables)",2
...,...,...,...
75,0.020539,"(frankfurter, whole milk)",2
60,0.020437,"(bottled beer, whole milk)",2
76,0.020437,"(whole milk, frozen vegetables)",2
96,0.020437,"(tropical fruit, pip fruit)",2


In [24]:
# finding top 10 association rules with minimum support of 2%
rules = association_rules(frequent_itemsets, metric='support', min_threshold=0.02)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(other vegetables),(whole milk),0.193493,0.255516,0.074835,0.386758,1.513634,0.025394,1.214013
1,(whole milk),(other vegetables),0.255516,0.193493,0.074835,0.292877,1.513634,0.025394,1.140548
2,(whole milk),(rolls/buns),0.255516,0.183935,0.056634,0.221647,1.205032,0.009636,1.048452
3,(rolls/buns),(whole milk),0.183935,0.255516,0.056634,0.307905,1.205032,0.009636,1.075696
4,(yogurt),(whole milk),0.139502,0.255516,0.056024,0.401603,1.571735,0.020379,1.244132
...,...,...,...,...,...,...,...,...,...
129,(frozen vegetables),(whole milk),0.048094,0.255516,0.020437,0.424947,1.663094,0.008149,1.294636
130,(tropical fruit),(pip fruit),0.104931,0.075648,0.020437,0.194767,2.574648,0.012499,1.147931
131,(pip fruit),(tropical fruit),0.075648,0.104931,0.020437,0.270161,2.574648,0.012499,1.226392
132,(butter),(other vegetables),0.055414,0.193493,0.020031,0.361468,1.868122,0.009308,1.263065


In [25]:
# finding association rules with minimum support of 2% and having lift more than 1
rules[(rules['support'] >= 0.02) &
      (rules['lift'] > 1.0)]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(other vegetables),(whole milk),0.193493,0.255516,0.074835,0.386758,1.513634,0.025394,1.214013
1,(whole milk),(other vegetables),0.255516,0.193493,0.074835,0.292877,1.513634,0.025394,1.140548
2,(whole milk),(rolls/buns),0.255516,0.183935,0.056634,0.221647,1.205032,0.009636,1.048452
3,(rolls/buns),(whole milk),0.183935,0.255516,0.056634,0.307905,1.205032,0.009636,1.075696
4,(yogurt),(whole milk),0.139502,0.255516,0.056024,0.401603,1.571735,0.020379,1.244132
...,...,...,...,...,...,...,...,...,...
129,(frozen vegetables),(whole milk),0.048094,0.255516,0.020437,0.424947,1.663094,0.008149,1.294636
130,(tropical fruit),(pip fruit),0.104931,0.075648,0.020437,0.194767,2.574648,0.012499,1.147931
131,(pip fruit),(tropical fruit),0.075648,0.104931,0.020437,0.270161,2.574648,0.012499,1.226392
132,(butter),(other vegetables),0.055414,0.193493,0.020031,0.361468,1.868122,0.009308,1.263065
