# The Business Problem:
At first, there's a dataset from a real grocery store and the goal is to find association rules between the data, that are organized as transactions that came from a market basket. let's go! :)

# importing libraries

In [1]:
import pandas as pd
import numpy as np
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# Loading Data

In [2]:
df = pd.read_table('../input/grocery-store-data/Grocery Store.csv')
df.head()

Unnamed: 0,"citrus fruit,semi-finished bread,margarine,ready soups"
0,"tropical fruit,yogurt,coffee"
1,whole milk
2,"pip fruit,yogurt,cream cheese,meat spreads"
3,"other vegetables,whole milk,condensed milk,lon..."
4,"whole milk,butter,yogurt,rice,abrasive cleaner"


# Processing the data...

In [3]:
lista = []
df.columns = ['column']
for i in df['column']:
    lista.append(i.split(sep=','))

# Creating Encoder for Transactions

In [4]:
te = TransactionEncoder()
te_array = te.fit(lista).transform(lista)
data = pd.DataFrame(te_array, columns = te.columns_)
data.head()

Unnamed: 0,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,baby food,bags,baking powder,bathroom cleaner,beef,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
4,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,False


# Calculating Support of itemset
There's a lot of items on itemset (170) what means that our minimum support can't be a high value, cause it spreads through the high volume of transactions, that which led me to use 1% as a value of this support, which despite being low, can bring us good results.

In [5]:
frequent_itemset = apriori(data,min_support= 0.01, use_colnames= True)
frequent_itemset

Unnamed: 0,support,itemsets
0,0.033455,(UHT-milk)
1,0.017694,(baking powder)
2,0.052471,(beef)
3,0.033252,(berries)
4,0.026032,(beverages)
...,...,...
328,0.011999,"(tropical fruit, whole milk, root vegetables)"
329,0.014541,"(yogurt, whole milk, root vegetables)"
330,0.010474,"(soda, yogurt, whole milk)"
331,0.015152,"(tropical fruit, yogurt, whole milk)"


# Looking for association rules
let's find the 5 strongest association rules on the list and this will be our filter:
1. keep the confidence level as 50%
2. rank by lift metric
3. if the lift is tied with another (0.05 next to another is consider a tie), use the higher conviction value to break the tie


In [6]:
rules = association_rules(frequent_itemset,metric = 'confidence', min_threshold= 0.5)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,"(other vegetables, butter)",(whole milk),0.020033,0.255542,0.011491,0.573604,2.244657,0.006372,1.745931
1,"(citrus fruit, root vegetables)",(other vegetables),0.017694,0.193512,0.010372,0.586207,3.0293,0.006948,1.949012
2,"(yogurt, curd)",(whole milk),0.017287,0.255542,0.010067,0.582353,2.278893,0.00565,1.782505
3,"(other vegetables, domestic eggs)",(whole milk),0.02227,0.255542,0.012304,0.552511,2.162116,0.006613,1.663636
4,"(other vegetables, pip fruit)",(whole milk),0.026134,0.255542,0.013525,0.51751,2.025146,0.006846,1.542949
5,"(rolls/buns, root vegetables)",(other vegetables),0.024303,0.193512,0.012203,0.502092,2.594626,0.0075,1.619753
6,"(tropical fruit, root vegetables)",(other vegetables),0.021049,0.193512,0.012304,0.584541,3.020692,0.008231,1.941197
7,"(yogurt, root vegetables)",(other vegetables),0.025829,0.193512,0.012914,0.5,2.583815,0.007916,1.612975
8,"(other vegetables, whipped/sour cream)",(whole milk),0.028879,0.255542,0.014643,0.507042,1.984184,0.007263,1.510186
9,"(other vegetables, yogurt)",(whole milk),0.043421,0.255542,0.02227,0.512881,2.00703,0.011174,1.528286


In [7]:
rules_ranked = rules.sort_values('lift',ascending=False)
sorted_rules = rules_ranked.head(5)
sorted_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
1,"(citrus fruit, root vegetables)",(other vegetables),0.017694,0.193512,0.010372,0.586207,3.0293,0.006948,1.949012
6,"(tropical fruit, root vegetables)",(other vegetables),0.021049,0.193512,0.012304,0.584541,3.020692,0.008231,1.941197
5,"(rolls/buns, root vegetables)",(other vegetables),0.024303,0.193512,0.012203,0.502092,2.594626,0.0075,1.619753
7,"(yogurt, root vegetables)",(other vegetables),0.025829,0.193512,0.012914,0.5,2.583815,0.007916,1.612975
2,"(yogurt, curd)",(whole milk),0.017287,0.255542,0.010067,0.582353,2.278893,0.00565,1.782505


# Rules analysis

4 of 5 strongest association rules has root vegetables as antecedents and other vegetables as consequents, but it doesn't tell us a lot about these other vegetables, so what actually we have as information is that citrus fruit, tropical fruit and root vegetable are very important for sales as a booster for other vegetables and a good suggestion is to promote offer for these 3 items that bring new sales of other vegetables.