In [1]:
import pandas as pd
import xlrd
from mlxtend.preprocessing import TransactionEncoder
import numpy as np
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import plotly.express as px

In [2]:
# Load worksheet 'Sheet6' of set.xlsx into a DataFrame via the openpyxl reader
data = pd.read_excel(
    'set.xlsx',
    sheet_name='Sheet6',
    engine='openpyxl',
)

In [3]:
# Preview the first five rows of the loaded sheet
data.head(n=5)

Unnamed: 0,InvoiceNo,Product,Quantity,Gender
0,1,Mlijeko,1,Female
1,1,Voda,1,Female
2,1,Sokovi,1,Female
3,1,Kafa,1,Female
4,1,Čokolade,1,Female


In [4]:
# Show the column labels of the loaded frame
data.columns

Index(['InvoiceNo', 'Product', 'Quantity', 'Gender'], dtype='object')

In [5]:
# Distinct product names, in order of first appearance
data['Product'].unique()

array(['Mlijeko', 'Voda', 'Sokovi', 'Kafa', 'Čokolade', 'Bombone', 'Keks',
       'Vegeta', 'So', 'Supa', 'Brašno', 'Ulje', 'Tjestenina', 'Začini',
       'Sir', 'Jogurt', 'Piletina', 'Teletina', 'Gljive', 'Jabuke',
       'Kruške', 'Banane', 'Limun', 'Narandža', 'Čips', 'Kolači',
       'Deterdžent', 'Omekšivač', 'Šampon', 'Regenerator', 'Sapun',
       'Sredstvazačišćenje', 'Krompir', 'Luk', 'Kupus', 'Paradajz',
       'Krastavac', 'Kiselasalata', 'Salata', 'Šećer', 'Nescafe',
       'Alkoholnapića', 'Pastazazube', 'Toaletpapir', 'Riža', 'Kukuruz',
       'Kreme', 'Smoki', 'Grisini', 'Kikiriki', 'Kokice', 'Žvake',
       'Mineralnavoda', 'Ostalagaziranapića', 'Puding', 'Šlag',
       'Vrhnjezakuhanje', 'Kockazakafu', 'Puder', 'Maskara', 'Labelo',
       'Četkazakosu', 'Parfem', 'Lakzanokte', 'Lakzakosu', 'Aceton',
       'Energetskapića', 'Lijekovi', 'Biljnipreparati', 'Čajevi', 'Jaja',
       'Hljeb', 'Sjajzausne', 'Karmin', 'Korektor'], dtype=object)

In [6]:
# Distinct values of the Gender column
data['Gender'].unique()

array(['Female', 'Male'], dtype=object)

In [7]:
# Remove leading/trailing whitespace from product names so that names
# differing only by surrounding spaces are treated as the same product
data['Product'] = data['Product'].str.strip()

In [8]:
# Discard rows that have no invoice number, then store InvoiceNo as text.
# Written as one chained expression rebound to `data` (rather than
# inplace=True followed by a column assignment) so the step produces a new
# frame and cannot trigger chained-assignment warnings.
data = (
    data
    .dropna(axis=0, subset=['InvoiceNo'])
    .assign(InvoiceNo=lambda frame: frame['InvoiceNo'].astype('str'))
)

In [9]:
# Purchases by Female customers: one row per invoice, one column per product,
# each cell holding the summed quantity (0 where the product is not on the invoice)
basket_Female = (
    data[data['Gender'] == "Female"]
    .groupby(['InvoiceNo', 'Product'])['Quantity']
    .sum()
    .unstack()
    .fillna(0)
)

In [10]:
# Purchases by Male customers: one row per invoice, one column per product,
# each cell holding the summed quantity (0 where the product is not on the invoice)
basket_Male = (
    data[data['Gender'] == "Male"]
    .groupby(['InvoiceNo', 'Product'])['Quantity']
    .sum()
    .unstack()
    .fillna(0)
)

In [11]:
# One-hot encoding helper: turns a quantity into a 0/1 presence flag, the
# form expected by the frequent-pattern functions used later on
def hot_encode(x):
    """Map a numeric cell value to a binary presence flag.

    Parameters
    ----------
    x : int or float
        The value to encode (in this notebook, a summed product quantity).

    Returns
    -------
    int
        1 if ``x`` is strictly positive, otherwise 0.
    """
    # A single comparison covers every input: fractional quantities in (0, 1)
    # count as present, and NaN compares False so it maps to 0. No input can
    # fall through and yield None.
    return 1 if x > 0 else 0
 
# Apply hot_encode to every cell of both baskets.
# basket_encoded is bound alongside each result, so it ends up referring to
# the encoded Male basket.
basket_Female = basket_encoded = basket_Female.applymap(hot_encode)
basket_Male = basket_encoded = basket_Male.applymap(hot_encode)
 

In [21]:
# Female baskets: mine itemsets whose support is at least 0.5
frq_items = apriori(basket_Female, min_support=0.5, use_colnames=True)

# Derive association rules filtered on lift (threshold 1) and order them by
# confidence, then lift, both descending
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
print(rules.head())

              antecedents    consequents  antecedent support  \
80     (Piletina, Brašno)         (Ulje)            0.508197   
7                (Brašno)         (Ulje)            0.590164   
56                  (Sir)  (Toaletpapir)            0.524590   
74      (Brašno, Mlijeko)         (Ulje)            0.524590   
86  (Šampon, Toaletpapir)   (Deterdžent)            0.540984   

    consequent support   support  confidence      lift  leverage  conviction  
80            0.639344  0.508197    1.000000  1.564103  0.183284         inf  
7             0.639344  0.573770    0.972222  1.520655  0.196453   12.983607  
56            0.639344  0.508197    0.968750  1.515224  0.172803   11.540984  
74            0.639344  0.508197    0.968750  1.515224  0.172803   11.540984  
86            0.606557  0.508197    0.939394  1.548731  0.180059    6.491803  


In [19]:
# Male baskets: mine itemsets whose support is at least 0.5
frq_items = apriori(basket_Male, min_support=0.5, use_colnames=True)

# Derive association rules filtered on lift (threshold 1) and order them by
# confidence, then lift, both descending
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
print(rules.head())

         antecedents consequents  antecedent support  consequent support  \
14  (Hljeb, Mlijeko)      (Kafa)            0.615385            0.846154   
1            (Hljeb)      (Kafa)            0.717949            0.846154   
7       (Tjestenina)      (Kafa)            0.615385            0.846154   
13   (Kafa, Mlijeko)     (Hljeb)            0.666667            0.717949   
12     (Kafa, Hljeb)   (Mlijeko)            0.666667            0.769231   

     support  confidence      lift  leverage  conviction  
14  0.589744    0.958333  1.132576  0.069034    3.692308  
1   0.666667    0.928571  1.097403  0.059172    2.153846  
7   0.564103    0.916667  1.083333  0.043393    1.846154  
13  0.589744    0.884615  1.232143  0.111111    2.444444  
12  0.589744    0.884615  1.150000  0.076923    2.000000  
