In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

In [2]:
# Load the grocery purchase records from the CSV file into a DataFrame.
ds = pd.read_csv('Groceries_dataset.csv')
# Preview the first rows as the cell output.
ds.head()

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk


In [3]:
# Print the column dtypes and non-null counts.
ds.info()
# Count missing values per column; this is the cell's displayed result.
ds.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38765 entries, 0 to 38764
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Member_number    38765 non-null  int64 
 1   Date             38765 non-null  object
 2   itemDescription  38765 non-null  object
dtypes: int64(1), object(2)
memory usage: 908.7+ KB


Member_number      0
Date               0
itemDescription    0
dtype: int64

In [6]:
# Change the datatype of 'Date' from object (string) to datetime.
# The file stores dates day-first (e.g. "21-07-2015"), so the format is given
# explicitly; without it an ambiguous value such as "05-01-2015" can be
# interpreted month-first.
ds['Date'] = pd.to_datetime(ds['Date'], format='%d-%m-%Y')
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38765 entries, 0 to 38764
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Member_number    38765 non-null  int64         
 1   Date             38765 non-null  datetime64[ns]
 2   itemDescription  38765 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 908.7+ KB


In [7]:
#removing all duplicate entries
# (drop_duplicates with no arguments compares every column of a row)
ds = ds.drop_duplicates()

In [10]:
# Build one basket per (member, day): every item a customer bought on the same
# date is collected into a single comma-separated string.
f_data = (
    ds.groupby(['Member_number', 'Date'])['itemDescription']
      .agg(','.join)
      .reset_index()
)
f_data.head(10)

Unnamed: 0,Member_number,Date,itemDescription
0,1000,2014-06-24,"whole milk,pastry,salty snack"
1,1000,2015-03-15,"sausage,whole milk,semi-finished bread,yogurt"
2,1000,2015-05-27,"soda,pickled vegetables"
3,1000,2015-07-24,"canned beer,misc. beverages"
4,1000,2015-11-25,"sausage,hygiene articles"
5,1001,2014-07-02,"sausage,whole milk,rolls/buns"
6,1001,2014-12-12,"whole milk,soda"
7,1001,2015-01-20,"frankfurter,soda,whipped/sour cream"
8,1001,2015-02-05,"frankfurter,curd"
9,1001,2015-04-14,"beef,white bread"


In [14]:
# Convert the basket strings into a list of item lists, one per transaction.
# Iterating over the column values directly avoids looking rows up by number,
# which only works while f_data has a default 0..n-1 index.
transaction = [basket.split(',') for basket in f_data['itemDescription']]

transaction[:4]

[['whole milk', 'pastry', 'salty snack'],
 ['sausage', 'whole milk', 'semi-finished bread', 'yogurt'],
 ['soda', 'pickled vegetables'],
 ['canned beer', 'misc. beverages']]

In [17]:
# Use mlxtend's TransactionEncoder class to convert the transactions into a
# binary (boolean) matrix: one row per transaction, one column per item.
te = TransactionEncoder()
te.fit(transaction)
te_ary = te.transform(transaction)
df = pd.DataFrame(data=te_ary, columns=te.columns_)
df.head()

Unnamed: 0,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [23]:
# Mine frequent itemsets from the binary matrix, then derive association rules.
# min_support=0.001 keeps itemsets present in at least 0.1% of transactions.
frequent_items = apriori(df, min_support=0.001, use_colnames=True)
# Rules are filtered on lift; 0.8 is the library's default threshold, written
# out here so the cut-off is visible.
rules = association_rules(
    frequent_items,
    metric="lift",
    min_threshold=0.8,
)
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(UHT-milk),(bottled water),0.021386,0.060683,0.001069,0.05,0.823954,-0.000228,0.988755,-0.179204
1,(bottled water),(UHT-milk),0.060683,0.021386,0.001069,0.017621,0.823954,-0.000228,0.996168,-0.185312
2,(other vegetables),(UHT-milk),0.122101,0.021386,0.002139,0.017515,0.818993,-0.000473,0.99606,-0.201119
3,(UHT-milk),(other vegetables),0.021386,0.122101,0.002139,0.1,0.818993,-0.000473,0.975443,-0.184234
4,(sausage),(UHT-milk),0.060349,0.021386,0.001136,0.018826,0.880298,-0.000154,0.997391,-0.126418


Each row is a rule: the antecedents and consequents columns hold item sets that are purchased together, read as "customers who buy the antecedents also buy the consequents".

To get the most frequent item combinations in the entire dataset, we sort the rules by support, confidence, and lift in descending order.

In [31]:
# sort_values returns a new DataFrame instead of sorting in place, so the
# result must be assigned; otherwise head() would show the unsorted rules.
rules = rules.sort_values(["support", "confidence", "lift"], axis=0, ascending=False)
rules.head(20)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
731,(sausage),"(yogurt, whole milk)",0.060349,0.011161,0.00147,0.024363,2.182917,0.000797,1.013532,0.576701
730,"(yogurt, whole milk)",(sausage),0.011161,0.060349,0.00147,0.131737,2.182917,0.000797,1.082219,0.548014
729,"(sausage, whole milk)",(yogurt),0.008955,0.085879,0.00147,0.164179,1.91176,0.000701,1.093681,0.481231
732,(yogurt),"(sausage, whole milk)",0.085879,0.008955,0.00147,0.017121,1.91176,0.000701,1.008307,0.521727
246,(citrus fruit),(specialty chocolate),0.053131,0.015973,0.001403,0.026415,1.653762,0.000555,1.010726,0.4175
247,(specialty chocolate),(citrus fruit),0.015973,0.053131,0.001403,0.087866,1.653762,0.000555,1.038081,0.401735
728,"(sausage, yogurt)",(whole milk),0.005748,0.157923,0.00147,0.255814,1.619866,0.000563,1.131541,0.384877
733,(whole milk),"(sausage, yogurt)",0.157923,0.005748,0.00147,0.00931,1.619866,0.000563,1.003596,0.45443
331,(tropical fruit),(flour),0.067767,0.009757,0.001069,0.015779,1.617141,0.000408,1.006118,0.409366
330,(flour),(tropical fruit),0.009757,0.067767,0.001069,0.109589,1.617141,0.000408,1.046969,0.385385


The above output gives the sets of most popular items that were bought together.


In [32]:
# Display the rules table as the cell output.
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
731,(sausage),"(yogurt, whole milk)",0.060349,0.011161,0.001470,0.024363,2.182917,0.000797,1.013532,0.576701
730,"(yogurt, whole milk)",(sausage),0.011161,0.060349,0.001470,0.131737,2.182917,0.000797,1.082219,0.548014
729,"(sausage, whole milk)",(yogurt),0.008955,0.085879,0.001470,0.164179,1.911760,0.000701,1.093681,0.481231
732,(yogurt),"(sausage, whole milk)",0.085879,0.008955,0.001470,0.017121,1.911760,0.000701,1.008307,0.521727
246,(citrus fruit),(specialty chocolate),0.053131,0.015973,0.001403,0.026415,1.653762,0.000555,1.010726,0.417500
...,...,...,...,...,...,...,...,...,...,...
572,(pastry),(tropical fruit),0.051728,0.067767,0.002807,0.054264,0.800735,-0.000699,0.985722,-0.207875
245,(rolls/buns),(citrus fruit),0.110005,0.053131,0.004678,0.042527,0.800423,-0.001166,0.988925,-0.218846
244,(citrus fruit),(rolls/buns),0.053131,0.110005,0.004678,0.088050,0.800423,-0.001166,0.975926,-0.208441
703,(other vegetables),"(soda, whole milk)",0.122101,0.011629,0.001136,0.009305,0.800165,-0.000284,0.997654,-0.221473
