Create your own transactions dataset and apply the above process on your dataset

In [1]:

# importing the libraries
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from csv import reader
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules


In [2]:
# reading the dataset
groceries = []
with open('groceries.csv', 'r') as read_obj:
    csv_reader = reader(read_obj)
    for row in csv_reader:
        groceries.append(row)

In [3]:
items = set(sum(groceries, []))
df = pd.DataFrame(columns=items)
print(df)

Empty DataFrame
Columns: [cleaner, tidbits, rice, chewing gum, sound storage medium, soda, other vegetables, ham, sparkling wine, organic sausage, dental care, pasta, meat spreads, chocolate marshmallow, cream cheese , syrup, vinegar, salad dressing, frozen vegetables, skin care, softener, preservation products, napkins, baking powder, zwieback, salt, pork, bottled water, long life bakery product, specialty cheese, tea, turkey, white bread, flower (seeds), oil, newspapers, canned fruit, frozen dessert, male cosmetics, nut snack, whisky, hard cheese, mustard, house keeping products, bottled beer, tropical fruit, cling film/bags, red/blush wine, rubbing alcohol, flower soil/fertilizer, cookware, cooking chocolate, liqueur, fish, Instant food products, packaged fruit/vegetables, brandy, sausage, candles, liquor, UHT-milk, butter milk, chicken, artif. sweetener, herbs, flour, curd cheese, decalcifier, brown bread, curd, cocoa drinks, toilet cleaner, frankfurter, fruit/vegetable juice, meat

In [4]:
# fitting the list and converting the transactions to true and false
encoder = TransactionEncoder()
transactions = encoder.fit(groceries).transform(groceries)

In [5]:
# converting the true and false to 1 and 0
transactions = transactions.astype('int')

In [6]:
# converting the transactions array to a datafrmae
df = pd.DataFrame(transactions, columns=encoder.columns_)

In [7]:
# viewing the first few rows of the dataframe
df.head()


Unnamed: 0,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,baby food,bags,baking powder,bathroom cleaner,beef,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [8]:
# How many transactions and items are there in the data set?

df.shape

(9835, 169)

In [9]:
# applying the apriori algorithm
frequent_itemsets = apriori(df, min_support=0.02, use_colnames=True)
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
frequent_itemsets



Unnamed: 0,support,itemsets,length
0,0.033452,(UHT-milk),1
1,0.052466,(beef),1
2,0.033249,(berries),1
3,0.026029,(beverages),1
4,0.080529,(bottled beer),1
...,...,...,...
117,0.032232,"(whipped/sour cream, whole milk)",2
118,0.020742,"(yogurt, whipped/sour cream)",2
119,0.056024,"(yogurt, whole milk)",2
120,0.023183,"(root vegetables, other vegetables, whole milk)",3


In [10]:
# sorting the dataframe
frequent_itemsets = frequent_itemsets.sort_values(by='support', ascending=False)

print(frequent_itemsets)

     support                         itemsets  length
57  0.255516                     (whole milk)       1
39  0.193493               (other vegetables)       1
43  0.183935                     (rolls/buns)       1
49  0.174377                           (soda)       1
58  0.139502                         (yogurt)       1
..       ...                              ...     ...
75  0.020539        (frankfurter, whole milk)       2
60  0.020437       (bottled beer, whole milk)       2
76  0.020437  (frozen vegetables, whole milk)       2
96  0.020437      (pip fruit, tropical fruit)       2
67  0.020031       (butter, other vegetables)       2

[122 rows x 3 columns]


In [11]:
# finding top 5 items with minimum support of 2%
frequent_itemsets[ (frequent_itemsets['length'] == 1) &
                   (frequent_itemsets['support'] >= 0.02) ][0:5]

Unnamed: 0,support,itemsets,length
57,0.255516,(whole milk),1
39,0.193493,(other vegetables),1
43,0.183935,(rolls/buns),1
49,0.174377,(soda),1
58,0.139502,(yogurt),1


In [12]:
# finding itemsets having length 2 and minimum support of 2%
frequent_itemsets[(frequent_itemsets['length'] == 2) & 
                  (frequent_itemsets['support'] >= 0.02)]

Unnamed: 0,support,itemsets,length
91,0.074835,"(other vegetables, whole milk)",2
103,0.056634,"(rolls/buns, whole milk)",2
119,0.056024,"(yogurt, whole milk)",2
106,0.048907,"(root vegetables, whole milk)",2
85,0.047382,"(root vegetables, other vegetables)",2
...,...,...,...
75,0.020539,"(frankfurter, whole milk)",2
60,0.020437,"(bottled beer, whole milk)",2
76,0.020437,"(frozen vegetables, whole milk)",2
96,0.020437,"(pip fruit, tropical fruit)",2


In [13]:
# finding top 10 association rules with minimum support of 2%
rules = association_rules(frequent_itemsets, metric='support', min_threshold=0.02)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(other vegetables),(whole milk),0.193493,0.255516,0.074835,0.386758,1.513634,0.025394,1.214013
1,(whole milk),(other vegetables),0.255516,0.193493,0.074835,0.292877,1.513634,0.025394,1.140548
2,(rolls/buns),(whole milk),0.183935,0.255516,0.056634,0.307905,1.205032,0.009636,1.075696
3,(whole milk),(rolls/buns),0.255516,0.183935,0.056634,0.221647,1.205032,0.009636,1.048452
4,(yogurt),(whole milk),0.139502,0.255516,0.056024,0.401603,1.571735,0.020379,1.244132
...,...,...,...,...,...,...,...,...,...
129,(whole milk),(frozen vegetables),0.255516,0.048094,0.020437,0.079984,1.663094,0.008149,1.034663
130,(pip fruit),(tropical fruit),0.075648,0.104931,0.020437,0.270161,2.574648,0.012499,1.226392
131,(tropical fruit),(pip fruit),0.104931,0.075648,0.020437,0.194767,2.574648,0.012499,1.147931
132,(butter),(other vegetables),0.055414,0.193493,0.020031,0.361468,1.868122,0.009308,1.263065


In [14]:
# finding association rules with minimum support of 2% and having lift more than 1
rules[(rules['support'] >= 0.02) &
      (rules['lift'] > 1.0)]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(other vegetables),(whole milk),0.193493,0.255516,0.074835,0.386758,1.513634,0.025394,1.214013
1,(whole milk),(other vegetables),0.255516,0.193493,0.074835,0.292877,1.513634,0.025394,1.140548
2,(rolls/buns),(whole milk),0.183935,0.255516,0.056634,0.307905,1.205032,0.009636,1.075696
3,(whole milk),(rolls/buns),0.255516,0.183935,0.056634,0.221647,1.205032,0.009636,1.048452
4,(yogurt),(whole milk),0.139502,0.255516,0.056024,0.401603,1.571735,0.020379,1.244132
...,...,...,...,...,...,...,...,...,...
129,(whole milk),(frozen vegetables),0.255516,0.048094,0.020437,0.079984,1.663094,0.008149,1.034663
130,(pip fruit),(tropical fruit),0.075648,0.104931,0.020437,0.270161,2.574648,0.012499,1.226392
131,(tropical fruit),(pip fruit),0.104931,0.075648,0.020437,0.194767,2.574648,0.012499,1.147931
132,(butter),(other vegetables),0.055414,0.193493,0.020031,0.361468,1.868122,0.009308,1.263065
