In [1]:
import pandas as pd
import xlrd
from mlxtend.preprocessing import TransactionEncoder
import numpy as np
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import plotly.express as px

In [2]:
# Load the transactions from worksheet 'Sheet7' of the Excel workbook
data = pd.read_excel(
    'set.xlsx',
    sheet_name='Sheet7',
    engine='openpyxl',
)

In [3]:
# Preview the first five rows of the loaded transactions
data.head(n=5)

Unnamed: 0,InvoiceNo,Product,Quantity,Age
0,1,Mlijeko,1,46_55
1,1,Voda,1,46_55
2,1,Sokovi,1,46_55
3,1,Kafa,1,46_55
4,1,Čokolade,1,46_55


In [4]:
# Column labels of the loaded frame
data.columns

Index(['InvoiceNo', 'Product', 'Quantity', 'Age'], dtype='object')

In [5]:
# Distinct product names that occur in the transactions
data['Product'].unique()

array(['Mlijeko', 'Voda', 'Sokovi', 'Kafa', 'Čokolade', 'Bombone', 'Keks',
       'Vegeta', 'So', 'Supa', 'Brašno', 'Ulje', 'Tjestenina', 'Začini',
       'Sir', 'Jogurt', 'Piletina', 'Teletina', 'Gljive', 'Jabuke',
       'Kruške', 'Banane', 'Limun', 'Narandža', 'Čips', 'Kolači',
       'Deterdžent', 'Omekšivač', 'Šampon', 'Regenerator', 'Sapun',
       'Sredstvazačišćenje', 'Krompir', 'Luk', 'Kupus', 'Paradajz',
       'Krastavac', 'Kiselasalata', 'Salata', 'Šećer', 'Nescafe',
       'Alkoholnapića', 'Pastazazube', 'Toaletpapir', 'Riža', 'Kukuruz',
       'Kreme', 'Smoki', 'Grisini', 'Kikiriki', 'Kokice', 'Žvake',
       'Mineralnavoda', 'Ostalagaziranapića', 'Puding', 'Šlag',
       'Vrhnjezakuhanje', 'Kockazakafu', 'Puder', 'Maskara', 'Labelo',
       'Četkazakosu', 'Parfem', 'Lakzanokte', 'Lakzakosu', 'Aceton',
       'Energetskapića', 'Lijekovi', 'Biljnipreparati', 'Čajevi', 'Jaja',
       'Hljeb', 'Sjajzausne', 'Karmin', 'Korektor'], dtype=object)

In [6]:
# Distinct age-group labels that occur in the transactions
data['Age'].unique()

array(['46_55', '20_26', '27_35', '36_45'], dtype=object)

In [7]:
# Trim leading/trailing whitespace from the product names
data['Product'] = data.Product.str.strip()

In [8]:
# Drop rows without an invoice number and store the invoice number as text.
# Built as one chained expression that returns a new frame instead of
# mutating `data` in place (no `inplace=True`, no column write on a
# possibly-shared object), so re-running the cell is safe.
data = (
    data
    .dropna(axis=0, subset=['InvoiceNo'])
    .assign(InvoiceNo=lambda frame: frame['InvoiceNo'].astype('str'))
)

In [9]:
# Purchases of the 46-55 age group as an invoice x product matrix:
# one row per InvoiceNo, one column per Product, cells hold the summed
# Quantity (0 where the product is not on the invoice).
basket_46_55 = (
    data.loc[data['Age'] == "46_55"]
    .groupby(['InvoiceNo', 'Product'])['Quantity']
    .sum()
    .unstack()
    .fillna(0)
)

In [10]:
# Purchases of the 20-26 age group as an invoice x product matrix:
# one row per InvoiceNo, one column per Product, cells hold the summed
# Quantity (0 where the product is not on the invoice).
basket_20_26 = (
    data.loc[data['Age'] == "20_26"]
    .groupby(['InvoiceNo', 'Product'])['Quantity']
    .sum()
    .unstack()
    .fillna(0)
)

In [11]:
# Purchases of the 27-35 age group as an invoice x product matrix:
# one row per InvoiceNo, one column per Product, cells hold the summed
# Quantity (0 where the product is not on the invoice).
basket_27_35 = (
    data.loc[data['Age'] == "27_35"]
    .groupby(['InvoiceNo', 'Product'])['Quantity']
    .sum()
    .unstack()
    .fillna(0)
)

In [12]:
# Purchases of the 36-45 age group as an invoice x product matrix:
# one row per InvoiceNo, one column per Product, cells hold the summed
# Quantity (0 where the product is not on the invoice).
basket_36_45 = (
    data.loc[data['Age'] == "36_45"]
    .groupby(['InvoiceNo', 'Product'])['Quantity']
    .sum()
    .unstack()
    .fillna(0)
)

In [13]:
# Defining the hot encoding function to make the data suitable
# for the concerned libraries
def hot_encode(x):
    """Map a summed purchase quantity to a 0/1 presence flag.

    Parameters
    ----------
    x : int or float
        Quantity of one product on one invoice.

    Returns
    -------
    int
        1 when the quantity is strictly positive, otherwise 0.
        Zero, negative values and NaN all map to 0.
    """
    # One total comparison: every input yields 0 or 1. Checking `x <= 0`
    # and `x >= 1` separately leaves values in (0, 1) and NaN unmatched,
    # which makes the function fall through and return None.
    return 1 if x > 0 else 0
 
# Encoding the datasets: replace every quantity matrix by its 0/1 version
basket_46_55 = basket_46_55.applymap(hot_encode)
basket_20_26 = basket_20_26.applymap(hot_encode)
basket_27_35 = basket_27_35.applymap(hot_encode)
basket_36_45 = basket_36_45.applymap(hot_encode)

# `basket_encoded` stays bound to the last encoded frame (36-45)
basket_encoded = basket_36_45

In [14]:
# Frequent itemsets for the 46-55 age group (support of at least 0.6)
frq_items = apriori(basket_46_55, use_colnames=True, min_support=0.6)

# Derive association rules with a lift threshold of 1 and order them by
# confidence, then lift, both descending
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
print(rules.head())

                antecedents             consequents  antecedent support  \
215  (Limun, Kafa, Mlijeko)              (Teletina)              0.6250   
226              (Teletina)  (Limun, Kafa, Mlijeko)              0.6250   
8                    (Ulje)                (Brašno)              0.6875   
9                  (Brašno)                  (Ulje)              0.6875   
54            (Ulje, Hljeb)                (Brašno)              0.6250   

     consequent support  support  confidence      lift  leverage  conviction  
215              0.6250   0.6250         1.0  1.600000  0.234375         inf  
226              0.6250   0.6250         1.0  1.600000  0.234375         inf  
8                0.6875   0.6875         1.0  1.454545  0.214844         inf  
9                0.6875   0.6875         1.0  1.454545  0.214844         inf  
54               0.6875   0.6250         1.0  1.454545  0.195312         inf  


In [15]:
# Frequent itemsets for the 20-26 age group (support of at least 0.6)
frq_items = apriori(basket_20_26, use_colnames=True, min_support=0.6)

# Derive association rules with a lift threshold of 1 and order them by
# confidence, then lift, both descending
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
print(rules.head())

  antecedents consequents  antecedent support  consequent support   support  \
0   (Mlijeko)     (Hljeb)            0.764706            0.764706  0.647059   
1     (Hljeb)   (Mlijeko)            0.764706            0.764706  0.647059   

   confidence      lift  leverage  conviction  
0    0.846154  1.106509  0.062284    1.529412  
1    0.846154  1.106509  0.062284    1.529412  


In [16]:
# Frequent itemsets for the 27-35 age group (support of at least 0.6)
frq_items = apriori(basket_27_35, use_colnames=True, min_support=0.6)

# Derive association rules with a lift threshold of 1 and order them by
# confidence, then lift, both descending
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
print(rules.head())

    antecedents   consequents  antecedent support  consequent support  \
1     (Mlijeko)       (Hljeb)            0.760870            0.782609   
0       (Hljeb)     (Mlijeko)            0.782609            0.760870   
5  (Tjestenina)    (Piletina)            0.739130            0.782609   
4    (Piletina)  (Tjestenina)            0.782609            0.739130   
3     (Mlijeko)    (Piletina)            0.760870            0.782609   

    support  confidence      lift  leverage  conviction  
1  0.673913    0.885714  1.131746  0.078450    1.902174  
0  0.673913    0.861111  1.131746  0.078450    1.721739  
5  0.630435    0.852941  1.089869  0.051985    1.478261  
4  0.630435    0.805556  1.089869  0.051985    1.341615  
3  0.608696    0.800000  1.022222  0.013233    1.086957  


In [17]:
# Frequent itemsets for the 36-45 age group (support of at least 0.6)
frq_items = apriori(basket_36_45, use_colnames=True, min_support=0.6)

# Derive association rules with a lift threshold of 1 and order them by
# confidence, then lift, both descending
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
print(rules.head())

    antecedents   consequents  antecedent support  consequent support  \
3  (Tjestenina)        (Kafa)            0.714286            0.857143   
0       (Hljeb)        (Kafa)            0.714286            0.857143   
2        (Kafa)  (Tjestenina)            0.857143            0.714286   
1        (Kafa)       (Hljeb)            0.857143            0.714286   

    support  confidence      lift  leverage  conviction  
3  0.666667    0.933333  1.088889  0.054422    2.142857  
0  0.619048    0.866667  1.011111  0.006803    1.071429  
2  0.666667    0.777778  1.088889  0.054422    1.285714  
1  0.619048    0.722222  1.011111  0.006803    1.028571  
