In [1]:
import pandas as pd

In [2]:
# Load the raw purchase records; the path is relative to the notebook's working directory.
df = pd.read_csv('Groceries_dataset.csv')

In [3]:
# Normalise the two text columns and keep only the fields used downstream:
# member ids become strings, item names lose surrounding whitespace.
df = df.assign(
    Member_number=df['Member_number'].astype('str'),
    itemDescription=df['itemDescription'].str.strip(),
)[['Member_number', 'itemDescription', 'Date']]
df.head()

Unnamed: 0,Member_number,itemDescription,Date
0,1808,tropical fruit,21-07-2015
1,2552,whole milk,05-01-2015
2,2300,pip fruit,19-09-2015
3,1187,other vegetables,12-12-2015
4,3037,whole milk,01-02-2015


In [4]:
# One row per (member, date, item) with the number of raw records in that group.
df = (
    df.groupby(['Member_number', 'Date', 'itemDescription'])
    .size()
    .reset_index(name='Count')
)
df

Unnamed: 0,Member_number,Date,itemDescription,Count
0,1000,15-03-2015,sausage,1
1,1000,15-03-2015,semi-finished bread,1
2,1000,15-03-2015,whole milk,1
3,1000,15-03-2015,yogurt,1
4,1000,24-06-2014,pastry,1
...,...,...,...,...
38001,5000,10-02-2015,root vegetables,1
38002,5000,10-02-2015,semi-finished bread,1
38003,5000,10-02-2015,soda,1
38004,5000,16-11-2014,bottled beer,1


In [5]:
# A transaction is everything one member bought on one date; number the groups from 1.
df['Transaction'] = df.groupby(['Member_number', 'Date']).ngroup() + 1

In [6]:
# Transaction x item matrix of purchase counts; absent combinations become 0.
item_count_pivot = (
    df.pivot_table(index='Transaction', columns='itemDescription', values='Count', aggfunc='sum')
    .fillna(0)
    .astype('int32')
)

In [7]:
# Collect each transaction's items into a list ...
basket = (
    df.groupby('Transaction')['itemDescription']
    .apply(list)
    .reset_index()
)
# ... then join them with '|' so str.get_dummies can split them back into
# one 0/1 indicator column per item.
basket_encoded = (
    basket['itemDescription']
    .str.join('|')
    .str.get_dummies(sep='|')
)

In [8]:
# Total number of transactions; the support calculations below divide by this.
t_length = basket_encoded.shape[0]

In [9]:
# Display the encoded basket matrix: one row per transaction, one 0/1 column per item.
basket_encoded

Unnamed: 0,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14958,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
14959,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14960,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14961,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# One-Hot encoding with 0 - out of itemset or 1 in itemset
def encode_units(x):
    """Binarize a quantity into an item-presence flag.

    Parameters
    ----------
    x : number
        Purchased quantity (or any numeric count).

    Returns
    -------
    int
        1 when ``x`` is at least 1, otherwise 0.

    Notes
    -----
    Values strictly between 0 and 1, and NaN, previously matched neither
    branch and produced ``None``; they are now mapped to 0 so the result is
    always an integer flag.
    """
    return 1 if x >= 1 else 0

In [11]:
def create_l1_itemset(transactions, global_itemset):
    """Count, for every single item, how many transactions contain it.

    Parameters
    ----------
    transactions : pandas.DataFrame
        One row per transaction and one column per item; a cell equal to 1
        marks the item as present.
    global_itemset : dict
        Running ``frozenset -> count`` table. It is updated in place with the
        single-item counts and also returned.

    Returns
    -------
    tuple(dict, dict)
        ``(l1_itemsets, global_itemset)`` where ``l1_itemsets`` maps
        ``frozenset([item])`` to its occurrence count. Items that never occur
        get no entry.
    """
    temp_itemsets = {}
    # A column-wise count of cells equal to 1 gives the same totals as visiting
    # every cell through iterrows(), without the Python-level double loop.
    occurrences = transactions.eq(1).sum(axis=0)
    for item, count in occurrences.items():
        count = int(count)
        if count == 0:
            # Items that are never present must not create an entry.
            continue
        frozen_key = frozenset([item])
        temp_itemsets[frozen_key] = temp_itemsets.get(frozen_key, 0) + count
        global_itemset[frozen_key] = global_itemset.get(frozen_key, 0) + count
    return temp_itemsets, global_itemset

In [12]:
def prune(itemset, transactions, min_support):
    """Keep the itemsets that meet ``min_support`` and drop unused item columns.

    Parameters
    ----------
    itemset : dict
        ``frozenset -> occurrence count`` for the current candidate itemsets.
    transactions : pandas.DataFrame
        One row per transaction, one 0/1 column per item. It is not modified.
    min_support : float
        Minimum fraction of transactions an itemset must appear in.

    Returns
    -------
    tuple(dict, pandas.DataFrame)
        The surviving itemsets and a new frame without the columns of items
        that were rejected and appear in no surviving itemset.

    Notes
    -----
    * Support is computed against ``len(transactions)`` rather than a
      notebook-level global, so the function works on any frame.
    * An item that belongs to a rejected itemset keeps its column as long as
      it also belongs to a surviving one. Dropping it would remove frequent
      items from later passes whenever itemsets have more than one member.
    * The caller's frame is left intact; columns are dropped on a copy.
    """
    n_transactions = len(transactions)
    new_itemset = {}
    rejected_items = set()
    for candidate, count in itemset.items():
        # With no transactions nothing can have positive support.
        support = count / n_transactions if n_transactions else 0.0
        if support >= min_support:
            new_itemset[candidate] = count
        else:
            rejected_items.update(candidate)
    # Items still needed by at least one surviving itemset.
    surviving_items = set().union(*new_itemset)
    droppable = [
        column for column in transactions.columns
        if column in rejected_items and column not in surviving_items
    ]
    return new_itemset, transactions.drop(columns=droppable)

In [13]:
from itertools import chain, combinations
def get_union(transactions, k, global_itemset):
    """Count every k-item combination that occurs within a transaction.

    For each row, the items whose cell equals 1 are collected (in column
    order) and every size-``k`` combination of them is tallied.

    Parameters
    ----------
    transactions : pandas.DataFrame
        One row per transaction, one 0/1 column per item.
    k : int
        Size of the combinations to generate.
    global_itemset : dict
        Running ``frozenset -> count`` table, updated in place and returned.

    Returns
    -------
    tuple(dict, dict)
        ``(k_itemsets, global_itemset)`` where ``k_itemsets`` maps each
        observed ``frozenset`` of ``k`` items to its occurrence count.
    """
    new_set = {}
    columns = transactions.keys()
    for _, row in transactions.iterrows():
        present = [column for column in columns if row[column] == 1]
        for combo in combinations(present, k):
            candidate = frozenset(combo)
            new_set[candidate] = new_set.get(candidate, 0) + 1
            global_itemset[candidate] = global_itemset.get(candidate, 0) + 1
    return new_set, global_itemset

In [14]:
def powerset(s):
    """Iterate over the non-empty proper subsets of ``s`` as tuples.

    Subsets are produced by increasing size, from 1 up to ``len(s) - 1``;
    the empty set and ``s`` itself are not included.
    """
    subset_sizes = range(1, len(s))
    return chain.from_iterable(map(lambda size: combinations(s, size), subset_sizes))

In [15]:
def get_association_rules(itemset, min_confidence, min_lift, global_itemset, n_transactions=None):
    """Derive association rules ``lhs -> rhs`` from the given itemsets.

    Every itemset is split into each non-empty proper subset (``lhs``) and
    its complement (``rhs``); a rule is kept when both its confidence and
    its lift reach the requested minimums.

    Parameters
    ----------
    itemset : dict
        Itemsets (``frozenset`` keys) to generate rules from.
    min_confidence : float
        Minimum ``count(itemset) / count(lhs)`` for a rule to be kept.
    min_lift : float
        Minimum ``confidence / support(rhs)`` for a rule to be kept.
    global_itemset : dict
        ``frozenset -> occurrence count`` covering every itemset, ``lhs`` and
        ``rhs`` that can appear; a missing key raises ``KeyError``.
    n_transactions : int, optional
        Total number of transactions used to turn counts into supports.
        When omitted, the notebook-level ``t_length`` is used, which keeps
        existing callers working.

    Returns
    -------
    list of dict
        One dict per rule with keys ``'lhs'``, ``'rhs'``, ``'support'``,
        ``'confidence'`` and ``'lift'``, in generation order (unsorted).
    """
    total = t_length if n_transactions is None else n_transactions
    rules = []
    for item in itemset:
        item_count = global_itemset[item]
        support = item_count / total
        for subset in powerset(item):
            lhs = frozenset(subset)
            rhs = frozenset(item).difference(lhs)
            # lhs | rhs is the itemset itself, so confidence is a plain ratio
            # of counts; dividing both counts by the total first is redundant.
            confidence = item_count / global_itemset[lhs]
            if confidence < min_confidence:
                continue
            lift = confidence / (global_itemset[rhs] / total)
            if lift >= min_lift:
                rules.append({
                    'lhs': lhs,
                    'rhs': rhs,
                    'support': support,
                    'confidence': confidence,
                    'lift': lift,
                })
    return rules

In [16]:
def apriori(transactions, min_support, max_length, min_confidence, min_lift):
    """Mine association rules from a one-hot transaction matrix.

    Single-item counts are built first; then, for each size ``k`` from 2 to
    ``max_length``, the current itemsets are pruned by ``min_support`` and
    the k-item combinations of the remaining columns are counted. Rules are
    generated from the last non-empty level and sorted by descending lift.

    Parameters
    ----------
    transactions : pandas.DataFrame
        One row per transaction, one 0/1 column per item. It is not modified.
    min_support : float
        Minimum fraction of transactions an itemset must appear in.
    max_length : int
        Largest itemset size to explore.
    min_confidence : float
        Minimum confidence for a rule to be reported.
    min_lift : float
        Minimum lift for a rule to be reported.

    Returns
    -------
    list of dict
        Rules with keys ``'lhs'``, ``'rhs'``, ``'support'``, ``'confidence'``
        and ``'lift'``, sorted by ``'lift'`` from highest to lowest.

    Notes
    -----
    Supports are computed with the notebook-level ``t_length``, so it must
    equal the number of rows in ``transactions``.
    """
    # The pruning step may remove columns from the frame it is handed; working
    # on a private copy keeps the caller's DataFrame intact, so the cell can be
    # re-run and the matrix shown earlier in the notebook stays accurate.
    transactions = transactions.copy()
    global_itemset = {}
    itemset, global_itemset = create_l1_itemset(transactions, global_itemset)
    k = 2
    while k <= max_length:
        itemset, transactions = prune(itemset, transactions, min_support)
        unioned, global_itemset = get_union(transactions, k, global_itemset)
        if not unioned:
            # No k-item combination exists; keep the pruned (k-1)-itemsets.
            break
        itemset = unioned
        k += 1
        if k > max_length:
            # Last level reached: apply the support threshold to it as well.
            itemset, transactions = prune(itemset, transactions, min_support)
    rules = get_association_rules(itemset, min_confidence, min_lift, global_itemset)
    return sorted(rules, key=lambda rule: rule['lift'], reverse=True)

In [17]:
import time
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() is wall-clock and can jump if the system clock changes.
start_time = time.perf_counter()
result = apriori(transactions=basket_encoded, min_support=0.01, max_length=2, min_confidence=0.0005, min_lift=0.0009)
end_time = time.perf_counter()
elapsed_time = end_time - start_time
print("Elapsed time:", elapsed_time, "seconds")

Elapsed time: 14.439175844192505 seconds


In [18]:
# Tabulate the rules returned by apriori() with a fixed column order, then display them.
resultsinDataFrame = pd.DataFrame(result, columns = ['lhs', 'rhs', 'support', 'confidence', 'lift'])
resultsinDataFrame

Unnamed: 0,lhs,rhs,support,confidence,lift
0,(whole milk),(yogurt),0.011161,0.070673,0.82294
1,(yogurt),(whole milk),0.011161,0.129961,0.82294
2,(whole milk),(rolls/buns),0.013968,0.088447,0.804028
3,(rolls/buns),(whole milk),0.013968,0.126974,0.804028
4,(other vegetables),(rolls/buns),0.010559,0.086481,0.786154
5,(rolls/buns),(other vegetables),0.010559,0.09599,0.786154
6,(other vegetables),(whole milk),0.014837,0.121511,0.76943
7,(whole milk),(other vegetables),0.014837,0.093948,0.76943
8,(soda),(whole milk),0.011629,0.119752,0.758296
9,(whole milk),(soda),0.011629,0.073635,0.758296
