## Generating Frequent Itemsets & Association Rules for the Bread Dataset

In [1]:
#importing libraries
import pandas as pd
import numpy as np

In [2]:
# Read the raw CSV; the column accessed later is the single combined
# "Date,Time,Transaction,Item" string column.
df = pd.read_csv("breadData.csv")
df

Unnamed: 0,"Date,Time,Transaction,Item"
0,"2016-10-30,09:58:11,1,Bread"
1,"2016-10-30,10:05:34,2,Scandinavian"
2,"2016-10-30,10:05:34,2,Scandinavian"
3,"2016-10-30,10:07:57,3,Hot chocolate"
4,"2016-10-30,10:07:57,3,Jam"
...,...
21288,"2017-04-09,14:32:58,9682,Coffee"
21289,"2017-04-09,14:32:58,9682,Tea"
21290,"2017-04-09,14:57:06,9683,Coffee"
21291,"2017-04-09,14:57:06,9683,Pastry"


In this CSV file, all values are packed into one single column, so we have to separate them into different columns. There are two ways:
either we write Python code to change the data format, or we change it in the Excel file itself.
We also have to drop duplicate rows.

In [3]:
# Reference copy of the data that was already converted to one column per field
# (loaded here only to preview the target layout).
example = pd.read_csv("bread.csv")
example.head(10)

Unnamed: 0,Date,Time,Transaction,Item
0,30-10-16,9:58:11,1,Bread
1,30-10-16,10:05:34,2,Scandinavian
2,30-10-16,10:05:34,2,Scandinavian
3,30-10-16,10:07:57,3,Hot chocolate
4,30-10-16,10:07:57,3,Jam
5,30-10-16,10:07:57,3,Cookies
6,30-10-16,10:08:41,4,Muffin
7,30-10-16,10:13:03,5,Coffee
8,30-10-16,10:13:03,5,Pastry
9,30-10-16,10:13:03,5,Bread


However, I'll be using Python code to change the format of the data.

In [4]:
# Drop duplicate rows, i.e. repeated identical "Date,Time,Transaction,Item" strings
df = df.drop_duplicates()

In [5]:
# Split the combined string into four pieces. n=3 caps the number of splits, so
# everything after the third comma stays together in the last piece.
df_new = df['Date,Time,Transaction,Item'].str.split(',',n=3,expand=True)

In [6]:
# NOTE(review): 'ignore' with no category/module filter silences every warning
# from this point on; consider narrowing it to the specific warning being avoided.
import warnings
warnings.filterwarnings('ignore')

In [7]:
# Attach the split pieces as named columns. `assign` returns a new frame, so
# nothing is written into the frame returned by `drop_duplicates()` -- direct
# column assignment on such a frame can trigger pandas' SettingWithCopyWarning.
df = df.assign(
    Date=df_new[0],
    Time=df_new[1],
    Transaction=df_new[2],
    Item=df_new[3],
)

In [8]:
# Preview only the four parsed columns (the original combined column is still in df)
df[['Date', 'Time', 'Transaction', 'Item']].head(10)

Unnamed: 0,Date,Time,Transaction,Item
0,2016-10-30,09:58:11,1,Bread
1,2016-10-30,10:05:34,2,Scandinavian
3,2016-10-30,10:07:57,3,Hot chocolate
4,2016-10-30,10:07:57,3,Jam
5,2016-10-30,10:07:57,3,Cookies
6,2016-10-30,10:08:41,4,Muffin
7,2016-10-30,10:13:03,5,Coffee
8,2016-10-30,10:13:03,5,Pastry
9,2016-10-30,10:13:03,5,Bread
10,2016-10-30,10:16:55,6,Medialuna


We should convert this data into binary format, where each row corresponds to a transaction and each column corresponds to an item. So, let's build a binary matrix.

In [9]:
# Cross-tabulate transactions (rows) against items (columns); each cell is the
# number of rows in df with that (Transaction, Item) pair.
# NOTE(review): Transaction comes from a string split, so the index is sorted as
# text (1, 10, 100, ...) rather than numerically -- confirm this is acceptable.
tr = pd.crosstab(index= df['Transaction'], columns= df['Item'])
tr

Item,Adjustment,Afternoon with the baker,Alfajores,Argentina Night,Art Tray,Bacon,Baguette,Bakewell,Bare Popcorn,Basket,...,The BART,The Nomad,Tiffin,Toast,Truffles,Tshirt,Valentine's card,Vegan Feast,Vegan mincepie,Victorian Sponge
Transaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
100,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


We have one unwanted column named "NONE"; it is a placeholder rather than a real item, so we should remove it.

In [10]:
# Remove the placeholder "NONE" item column. errors='ignore' keeps the cell
# idempotent: re-running it after the column is already gone no longer raises KeyError.
tr = tr.drop(columns=['NONE'], errors='ignore')
tr

Item,Adjustment,Afternoon with the baker,Alfajores,Argentina Night,Art Tray,Bacon,Baguette,Bakewell,Bare Popcorn,Basket,...,The BART,The Nomad,Tiffin,Toast,Truffles,Tshirt,Valentine's card,Vegan Feast,Vegan mincepie,Victorian Sponge
Transaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
100,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Creating APRIORI function to generate frequent itemsets. 
#### Given - minimum threshold support = 0.04

In [11]:
def APRIORI(data, min_support=0.04, max_length=4):
    """Find frequent itemsets in a transaction/item matrix.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per transaction and one column per item. Any value greater
        than zero means "item present", so raw counts are accepted and do not
        inflate the support.
    min_support : float, default 0.04
        Minimum support (inclusive): the fraction of transactions that must
        contain every item of an itemset for it to be reported.
    max_length : int, default 4
        Largest itemset size to evaluate.

    Returns
    -------
    pandas.DataFrame
        Column ``Items`` holds a tuple of column labels (always in the column
        order of ``data``, so results are reproducible between runs) and
        column ``Support`` holds the matching support value.
    """
    import pandas as pd
    from itertools import combinations

    n_transactions = len(data.index)
    support = {}

    if n_transactions:
        # Presence/absence view: a count of 2 must not count as two transactions.
        present = data > 0
        candidate_items = list(data.columns)
    else:
        # No transactions -> no support can be computed; return an empty result.
        candidate_items = []

    for size in range(1, max_length + 1):
        if len(candidate_items) < size:
            break  # not enough surviving items to form an itemset of this size

        frequent_items = set()
        for itemset in combinations(candidate_items, size):
            # A transaction supports the itemset only if it contains every item.
            sup = present[list(itemset)].all(axis=1).sum() / n_transactions
            if sup >= min_support:
                support[itemset] = sup
                frequent_items.update(itemset)

        # Only items that appear in some frequent itemset of this size can be
        # part of a larger frequent itemset. Filtering the previous list keeps
        # the original column order instead of relying on set ordering.
        candidate_items = [item for item in candidate_items if item in frequent_items]

    result = pd.DataFrame(list(support.items()), columns=["Items", "Support"])
    return result

In [12]:
# Mine itemsets of up to 3 items with minimum support 0.04, then show them
# ordered from highest to lowest support.
freq_itemset = APRIORI(tr, 0.04, 3)
freq_itemset.sort_values(by = 'Support', ascending = False)

Unnamed: 0,Items,Support
2,"(Coffee,)",0.475081
4,"(Bread,)",0.32494
7,"(Tea,)",0.141643
1,"(Cake,)",0.103137
11,"(Coffee, Bread)",0.089393
6,"(Pastry,)",0.08551
0,"(Sandwich,)",0.071346
8,"(Medialuna,)",0.061379
3,"(Hot chocolate,)",0.057916
12,"(Coffee, Cake)",0.054349


## Creating ASSOCIATION_RULE function to generate association rules from the frequent itemsets

In [13]:
def ASSOCIATION_RULE(df, min_threshold=0.5):
    """Derive association rules from a table of frequent itemsets.

    Every pair (antecedent, itemset) in ``df`` where the antecedent is a
    proper subset of the itemset yields the candidate rule
    ``antecedent -> itemset - antecedent``. The rule is kept when its
    confidence reaches ``min_threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frequent itemsets with columns ``Items`` (tuple of item labels) and
        ``Support`` (float), e.g. the output of ``APRIORI``.
    min_threshold : float, default 0.5
        Minimum confidence (inclusive).

    Returns
    -------
    pandas.DataFrame
        One row per rule with columns ``antecedents``, ``consequents``,
        ``antecedent support``, ``consequent support``, ``support``,
        ``confidence``, ``Lift``, ``Leverage`` and ``Convection``.
        ``Convection`` holds the conviction metric
        ``(1 - consequent support) / (1 - confidence)``; the column name is
        kept as-is for compatibility. It is ``inf`` when confidence is 1.
    """
    import pandas as pd
    from itertools import permutations

    columns = ["antecedents", "consequents", "antecedent support", "consequent support",
               "support", "confidence", "Lift", "Leverage", "Convection"]

    # Key by frozenset so lookups do not depend on the order of items inside a
    # tuple; remember the first tuple seen for each itemset for display.
    support = {}
    label = {}
    for items, sup in zip(df["Items"], df["Support"]):
        key = frozenset(items)
        support[key] = sup
        label.setdefault(key, tuple(items))

    data = []
    for antecedent, itemset in permutations(support, 2):
        if not antecedent < itemset:
            continue  # antecedent must be a proper subset of the itemset

        # The consequent is everything in the itemset the antecedent lacks.
        consequent = itemset - antecedent
        if consequent not in support:
            continue  # consequent support unknown -> metrics cannot be computed

        antecedent_sup = support[antecedent]
        consequent_sup = support[consequent]
        itemset_sup = support[itemset]
        if antecedent_sup <= 0:
            continue  # confidence is undefined for an antecedent that never occurs

        conf = itemset_sup / antecedent_sup
        if conf < min_threshold:
            continue

        lift = itemset_sup / (antecedent_sup * consequent_sup)
        leverage = itemset_sup - (antecedent_sup * consequent_sup)
        # Confidence of 1 means the rule never fails; conviction is infinite.
        conviction = float("inf") if conf >= 1 else (1 - consequent_sup) / (1 - conf)

        data.append([label[antecedent], label[consequent], antecedent_sup, consequent_sup,
                     itemset_sup, conf, lift, leverage, conviction])

    result = pd.DataFrame(data, columns=columns)
    return result

#### Minimum Confidence = 50%

In [14]:
# Build rules from the frequent itemsets using a confidence threshold of 0.5
rule = ASSOCIATION_RULE(freq_itemset, 0.5)
rule

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,Lift,Leverage,Convection
0,"(Cake,)","(Coffee,)",0.103137,0.475081,0.054349,0.526958,1.109196,0.00535,1.109667
1,"(Pastry,)","(Coffee,)",0.08551,0.475081,0.047214,0.552147,1.162216,0.00659,1.172079


## Finally sorting results by Lift to get highly associated itemsets.

In [15]:
# Rank the rules by lift (highest first) and show at most the top 10
rule.sort_values(by='Lift', ascending= False).head(10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,Lift,Leverage,Convection
1,"(Pastry,)","(Coffee,)",0.08551,0.475081,0.047214,0.552147,1.162216,0.00659,1.172079
0,"(Cake,)","(Coffee,)",0.103137,0.475081,0.054349,0.526958,1.109196,0.00535,1.109667


## Conclusion
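##### With a minimum support of 0.04 and a minimum confidence of 50%, "Pastry" → "Coffee" is the most strongly associated rule with a lift of 1.16, followed by "Cake" → "Coffee" with a lift of 1.11.
##### Coffee is the most frequently bought item, appearing in 47.5% of all transactions.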