In [1]:
import pandas as pd
import xlrd
from mlxtend.preprocessing import TransactionEncoder
import numpy as np
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import plotly.express as px

In [2]:
# Load the transaction records from worksheet 'Sheet5' of the workbook
data = pd.read_excel(
    'set.xlsx',
    sheet_name='Sheet5',
    engine='openpyxl',
)

In [3]:
# Preview the first five rows of the loaded table
data.head(n=5)

Unnamed: 0,InvoiceNo,Product,Quantity,Education
0,1,Mlijeko,1,University
1,1,Voda,1,University
2,1,Sokovi,1,University
3,1,Kafa,1,University
4,1,Čokolade,1,University


In [4]:
# Distinct product labels present in the data
data['Product'].unique()

array(['Mlijeko', 'Voda', 'Sokovi', 'Kafa', 'Čokolade', 'Bombone', 'Keks',
       'Vegeta', 'So', 'Supa', 'Brašno', 'Ulje', 'Tjestenina', 'Začini',
       'Sir', 'Jogurt', 'Piletina', 'Teletina', 'Gljive', 'Jabuke',
       'Kruške', 'Banane', 'Limun', 'Narandža', 'Čips', 'Kolači',
       'Deterdžent', 'Omekšivač', 'Šampon', 'Regenerator', 'Sapun',
       'Sredstvazačišćenje', 'Krompir', 'Luk', 'Kupus', 'Paradajz',
       'Krastavac', 'Kiselasalata', 'Salata', 'Šećer', 'Nescafe',
       'Alkoholnapića', 'Pastazazube', 'Toaletpapir', 'Riža', 'Kukuruz',
       'Kreme', 'Smoki', 'Grisini', 'Kikiriki', 'Kokice', 'Žvake',
       'Mineralnavoda', 'Ostalagaziranapića', 'Puding', 'Šlag',
       'Vrhnjezakuhanje', 'Kockazakafu', 'Puder', 'Maskara', 'Labelo',
       'Četkazakosu', 'Parfem', 'Lakzanokte', 'Lakzakosu', 'Aceton',
       'Energetskapića', 'Lijekovi', 'Biljnipreparati', 'Čajevi', 'Jaja',
       'Hljeb', 'Sjajzausne', 'Karmin', 'Korektor'], dtype=object)

In [5]:
# Distinct education levels present in the data
data['Education'].unique()

array(['University', 'HS'], dtype=object)

In [6]:
# Remove leading and trailing whitespace from every product label
data['Product'] = data['Product'].str.strip()

In [7]:
# Discard rows that have no invoice number
data.dropna(subset=['InvoiceNo'], axis=0, inplace=True)
# Store invoice numbers as strings; they are used below as group keys and index labels
data['InvoiceNo'] = data['InvoiceNo'].astype(str)

In [8]:
# Purchases by high-school-level customers: one row per invoice, one column
# per product, each cell holding the summed quantity (0 where not bought)
basket_HS = (
    data.loc[data['Education'].eq("HS")]
    .groupby(['InvoiceNo', 'Product'])['Quantity']
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('InvoiceNo')
)

In [9]:
# Purchases by university-level customers: one row per invoice, one column
# per product, each cell holding the summed quantity (0 where not bought)
basket_University = (
    data.loc[data['Education'].eq("University")]
    .groupby(['InvoiceNo', 'Product'])['Quantity']
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('InvoiceNo')
)

In [10]:
# Hot-encoding helper: turns a purchased quantity into the 0/1 flag
# used as input for the frequent-pattern functions further down.
def hot_encode(x):
    """Return 1 if the quantity ``x`` is positive, otherwise 0.

    Every input maps to 0 or 1. Values strictly between 0 and 1 (and NaN)
    previously matched neither branch and fell through to an implicit
    ``None``; they now encode as 1 and 0 respectively.
    """
    return 1 if x > 0 else 0
 
# Encoding the datasets.
# Series.map is applied column by column instead of DataFrame.applymap,
# which is deprecated in recent pandas releases; the element-wise result
# is the same.
basket_encoded = basket_HS.apply(lambda column: column.map(hot_encode))
basket_HS = basket_encoded

basket_encoded = basket_University.apply(lambda column: column.map(hot_encode))
basket_University = basket_encoded
 



In [12]:
# Frequent itemsets for the HS baskets (minimum support 0.5)
frq_items = apriori(basket_HS, use_colnames=True, min_support=0.5)

# Association rules filtered on lift (threshold 1), ordered by descending
# confidence and then descending lift
rules = association_rules(frq_items, min_threshold=1, metric="lift")
rules = rules.sort_values(by=['confidence', 'lift'], ascending=False)
print(rules.head())

               antecedents consequents  antecedent support  \
73        (Kafa, Piletina)   (Mlijeko)            0.581395   
2                 (Brašno)      (Ulje)            0.558140   
92  (Piletina, Tjestenina)   (Mlijeko)            0.558140   
96         (Mlijeko, Ulje)  (Piletina)            0.534884   
98        (Ulje, Piletina)   (Mlijeko)            0.534884   

    consequent support   support  confidence      lift  leverage  conviction  
73            0.813953  0.558140    0.960000  1.179429  0.084911    4.651163  
2             0.581395  0.534884    0.958333  1.648333  0.210384   10.046512  
92            0.813953  0.534884    0.958333  1.177381  0.080584    4.465116  
96            0.697674  0.511628    0.956522  1.371014  0.138453    6.953488  
98            0.813953  0.511628    0.956522  1.175155  0.076257    4.279070  


In [15]:
# Frequent itemsets for the University baskets (minimum support 0.5)
frq_items = apriori(basket_University, use_colnames=True, min_support=0.5)

# Association rules filtered on lift (threshold 1), ordered by descending
# confidence and then descending lift
rules = association_rules(frq_items, min_threshold=1, metric="lift")
rules = rules.sort_values(by=['confidence', 'lift'], ascending=False)
print(rules.head())

   antecedents consequents  antecedent support  consequent support  support  \
0     (Brašno)      (Ulje)            0.517241            0.551724      0.5   
1       (Ulje)    (Brašno)            0.551724            0.517241      0.5   
17      (Ulje)   (Mlijeko)            0.551724            0.689655      0.5   
18    (Šampon)  (Piletina)            0.568966            0.689655      0.5   
11      (Riža)      (Kafa)            0.586207            0.758621      0.5   

    confidence      lift  leverage  conviction  
0     0.966667  1.752083  0.214625   13.448276  
1     0.906250  1.752083  0.214625    5.149425  
17    0.906250  1.314062  0.119501    3.310345  
18    0.878788  1.274242  0.107610    2.560345  
11    0.852941  1.124332  0.055291    1.641379  


In [None]:
# Purchases by Bachelor-level customers: invoice-by-product quantity matrix.
# NOTE(review): the Education values printed earlier in this notebook are only
# 'University' and 'HS' — confirm that 'Bachelor' / 'Master' rows exist.
basket_Bachelor = (
    data.loc[data['Education'].eq("Bachelor")]
    .groupby(['InvoiceNo', 'Product'])['Quantity']
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('InvoiceNo')
)


# Purchases by Master-level customers: invoice-by-product quantity matrix
basket_Master = (
    data.loc[data['Education'].eq("Master")]
    .groupby(['InvoiceNo', 'Product'])['Quantity']
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('InvoiceNo')
)


# Encode the Bachelor and Master baskets as 0/1 flags.
# Series.map is applied column by column instead of DataFrame.applymap,
# which is deprecated in recent pandas releases; the element-wise result
# is the same.
basket_encoded = basket_Bachelor.apply(lambda column: column.map(hot_encode))
basket_Bachelor = basket_encoded

basket_encoded = basket_Master.apply(lambda column: column.map(hot_encode))
basket_Master = basket_encoded


# Building the model for Master.
# The rules are only mined when there is something to mine: an empty basket
# (no 'Master' invoices) or an empty set of frequent itemsets would otherwise
# make association_rules fail with an error instead of producing a result.
if basket_Master.empty:
    print("No 'Master' invoices found; skipping the Master model.")
else:
    frq_items = apriori(basket_Master, min_support = 0.6, use_colnames = True)

    if frq_items.empty:
        print("No itemsets reach the minimum support for 'Master'; no rules to show.")
    else:
        # Collecting the inferred rules in a dataframe
        rules = association_rules(frq_items, metric ="lift", min_threshold = 1)
        rules = rules.sort_values(['confidence', 'lift'], ascending =[False, False])
        print(rules.head())
