In [1]:
import pandas as pd
import xlrd
from mlxtend.preprocessing import TransactionEncoder
import numpy as np
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import plotly.express as px

In [2]:
# Read the transaction log from sheet "Sheet9" of the workbook.
data = pd.read_excel(
    'set.xlsx',
    sheet_name='Sheet9',
    engine='openpyxl',
)

In [3]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,InvoiceNo,Product,Quantity,AnnualIncome
0,1,Mlijeko,1,twelve
1,1,Voda,1,twelve
2,1,Sokovi,1,twelve
3,1,Kafa,1,twelve
4,1,Čokolade,1,twelve


In [4]:
# Show the column labels of the loaded frame.
data.columns

Index(['InvoiceNo', 'Product', 'Quantity', 'AnnualIncome'], dtype='object')

In [5]:
# Distinct product names present in the data.
data['Product'].unique()

array(['Mlijeko', 'Voda', 'Sokovi', 'Kafa', 'Čokolade', 'Bombone', 'Keks',
       'Vegeta', 'So', 'Supa', 'Brašno', 'Ulje', 'Tjestenina', 'Začini',
       'Sir', 'Jogurt', 'Piletina', 'Teletina', 'Gljive', 'Jabuke',
       'Kruške', 'Banane', 'Limun', 'Narandža', 'Čips', 'Kolači',
       'Deterdžent', 'Omekšivač', 'Šampon', 'Regenerator', 'Sapun',
       'Sredstvazačišćenje', 'Krompir', 'Luk', 'Kupus', 'Paradajz',
       'Krastavac', 'Kiselasalata', 'Salata', 'Šećer', 'Nescafe',
       'Alkoholnapića', 'Pastazazube', 'Toaletpapir', 'Riža', 'Kukuruz',
       'Kreme', 'Smoki', 'Grisini', 'Kikiriki', 'Kokice', 'Žvake',
       'Mineralnavoda', 'Ostalagaziranapića', 'Puding', 'Šlag',
       'Vrhnjezakuhanje', 'Kockazakafu', 'Puder', 'Maskara', 'Labelo',
       'Četkazakosu', 'Parfem', 'Lakzanokte', 'Lakzakosu', 'Aceton',
       'Energetskapića', 'Lijekovi', 'Biljnipreparati', 'Čajevi', 'Jaja',
       'Hljeb', 'Sjajzausne', 'Karmin', 'Korektor'], dtype=object)

In [6]:
# Distinct AnnualIncome labels; later cells build one basket per label.
data['AnnualIncome'].unique()

array(['twelve', 'eighteen', 'twentyfour', 'six'], dtype=object)

In [7]:
# Trim leading/trailing whitespace from product names so that the same
# product is not split across two differently-spelled labels.
data = data.assign(Product=data['Product'].str.strip())

In [8]:
# Discard rows that have no invoice number, then store the invoice number
# as a string.
data = (
    data
    .dropna(subset=['InvoiceNo'])
    .assign(InvoiceNo=lambda frame: frame['InvoiceNo'].astype('str'))
)

In [9]:
# Purchases for AnnualIncome == "six": one row per invoice, one column per
# product, cells hold the summed quantity (0 where the product is absent).
basket_six = (
    data.loc[data['AnnualIncome'].eq("six")]
    .groupby(['InvoiceNo', 'Product'])['Quantity']
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('InvoiceNo')
)

In [10]:
# Purchases for AnnualIncome == "twelve": one row per invoice, one column per
# product, cells hold the summed quantity (0 where the product is absent).
basket_twelve = (
    data.loc[data['AnnualIncome'].eq("twelve")]
    .groupby(['InvoiceNo', 'Product'])['Quantity']
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('InvoiceNo')
)

In [11]:
# Purchases for AnnualIncome == "eighteen": one row per invoice, one column
# per product, cells hold the summed quantity (0 where the product is absent).
basket_eighteen = (
    data.loc[data['AnnualIncome'].eq("eighteen")]
    .groupby(['InvoiceNo', 'Product'])['Quantity']
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('InvoiceNo')
)

In [12]:
# Purchases for AnnualIncome == "twentyfour": one row per invoice, one column
# per product, cells hold the summed quantity (0 where the product is absent).
basket_twentyfour = (
    data.loc[data['AnnualIncome'].eq("twentyfour")]
    .groupby(['InvoiceNo', 'Product'])['Quantity']
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('InvoiceNo')
)

In [13]:
# Hot-encoding helper: the frequent-pattern functions used below only need
# to know whether a product appears on an invoice, not how many units.
def hot_encode(x):
    """Map a basket quantity to a 0/1 presence flag.

    Parameters
    ----------
    x : int or float
        Summed quantity of one product on one invoice.

    Returns
    -------
    int
        1 if the quantity is strictly positive, otherwise 0.

    Notes
    -----
    The result is always 0 or 1: fractional quantities between 0 and 1 are
    treated as "present" (1), and NaN compares False against 0 and is
    therefore treated as "absent" (0). No input falls through to an
    implicit ``None``.
    """
    return 1 if x > 0 else 0
 
# Replace every quantity table with its cell-by-cell 0/1 encoding.
basket_six = basket_six.applymap(hot_encode)
basket_twelve = basket_twelve.applymap(hot_encode)
basket_eighteen = basket_eighteen.applymap(hot_encode)
basket_twentyfour = basket_twentyfour.applymap(hot_encode)

# `basket_encoded` ends up as an alias of the last encoded table.
basket_encoded = basket_twentyfour

In [17]:
# Frequent itemsets for the "six" group: keep itemsets with support >= 0.5.
frq_items = apriori(basket_six, use_colnames=True, min_support=0.5)

# Association rules with lift >= 1, ordered by confidence and then lift,
# both descending.
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
print(rules.head())

     antecedents   consequents  antecedent support  consequent support  \
22  (Deterdžent)      (Brašno)                 0.5                 0.5   
23      (Brašno)  (Deterdžent)                 0.5                 0.5   
28      (Brašno)          (So)                 0.5                 0.5   
29          (So)      (Brašno)                 0.5                 0.5   
36      (Vegeta)      (Brašno)                 0.5                 0.5   

    support  confidence  lift  leverage  conviction  
22      0.5         1.0   2.0      0.25         inf  
23      0.5         1.0   2.0      0.25         inf  
28      0.5         1.0   2.0      0.25         inf  
29      0.5         1.0   2.0      0.25         inf  
36      0.5         1.0   2.0      0.25         inf  


In [18]:
# Frequent itemsets for the "twelve" group: keep itemsets with support >= 0.5.
frq_items = apriori(basket_twelve, use_colnames=True, min_support=0.5)

# Association rules with lift >= 1, ordered by confidence and then lift,
# both descending.
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
print(rules.head())

           antecedents       consequents  antecedent support  \
2905     (Sir, Začini)  (Šampon, Brašno)            0.521739   
2908  (Šampon, Brašno)     (Sir, Začini)            0.521739   
3759     (Sir, Začini)  (Gljive, Šampon)            0.521739   
3762  (Gljive, Šampon)     (Sir, Začini)            0.521739   
4389     (Sir, Začini)    (Šampon, Kafa)            0.521739   

      consequent support   support  confidence      lift  leverage  conviction  
2905            0.521739  0.521739         1.0  1.916667  0.249527         inf  
2908            0.521739  0.521739         1.0  1.916667  0.249527         inf  
3759            0.521739  0.521739         1.0  1.916667  0.249527         inf  
3762            0.521739  0.521739         1.0  1.916667  0.249527         inf  
4389            0.521739  0.521739         1.0  1.916667  0.249527         inf  


In [19]:
# Frequent itemsets for the "eighteen" group: keep itemsets with
# support >= 0.5.
frq_items = apriori(basket_eighteen, use_colnames=True, min_support=0.5)

# Association rules with lift >= 1, ordered by confidence and then lift,
# both descending.
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
print(rules.head())

              antecedents consequents  antecedent support  consequent support  \
18    (Tjestenina, Hljeb)   (Mlijeko)                0.52                0.74   
16  (Mlijeko, Tjestenina)     (Hljeb)                0.54                0.74   
12        (Mlijeko, Kafa)     (Hljeb)                0.56                0.74   
13          (Kafa, Hljeb)   (Mlijeko)                0.56                0.74   
0               (Mlijeko)     (Hljeb)                0.74                0.74   

    support  confidence      lift  leverage  conviction  
18     0.50    0.961538  1.299376    0.1152    6.760000  
16     0.50    0.925926  1.251251    0.1004    3.510000  
12     0.50    0.892857  1.206564    0.0856    2.426667  
13     0.50    0.892857  1.206564    0.0856    2.426667  
0      0.66    0.891892  1.205259    0.1124    2.405000  


In [20]:
# Frequent itemsets for the "twentyfour" group: keep itemsets with
# support >= 0.5.
frq_items = apriori(basket_twentyfour, use_colnames=True, min_support=0.5)

# Association rules with lift >= 1, ordered by confidence and then lift,
# both descending.
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
print(rules.head())

               antecedents            consequents  antecedent support  \
175    (Deterdžent, Hljeb)          (Toaletpapir)            0.529412   
178          (Toaletpapir)    (Deterdžent, Hljeb)            0.529412   
180  (Mlijeko, Deterdžent)            (Omekšivač)            0.529412   
185            (Omekšivač)  (Mlijeko, Deterdžent)            0.529412   
216   (Deterdžent, Šampon)            (Omekšivač)            0.529412   

     consequent support   support  confidence      lift  leverage  conviction  
175            0.529412  0.529412         1.0  1.888889  0.249135         inf  
178            0.529412  0.529412         1.0  1.888889  0.249135         inf  
180            0.529412  0.529412         1.0  1.888889  0.249135         inf  
185            0.529412  0.529412         1.0  1.888889  0.249135         inf  
216            0.529412  0.529412         1.0  1.888889  0.249135         inf  
