In [1]:
import pandas as pd
import numpy as np
from itertools import combinations
import copy

In [20]:
def remove_empty_value(dataset):
    """Return a copy of ``dataset`` with the null cells removed from each row.

    Args:
        dataset: Iterable of rows, each row an iterable of cell values.

    Returns:
        A new list with one list per input row, holding (in their original
        order) only the cells for which ``pd.isnull`` is false. Rows that
        contain nothing but nulls come back as empty lists.
    """
    return [
        [cell for cell in row if not pd.isnull(cell)]
        for row in dataset
    ]


def f1(dataset, min_support):
    """Return the frequent 1-itemsets of ``dataset``.

    The support of an item is the number of transactions that contain it, so
    an item repeated inside a single transaction is counted once for that
    transaction.

    Args:
        dataset: Sized collection of transactions, each an iterable of
            hashable items.
        min_support: Minimum support as a fraction of ``len(dataset)``.

    Returns:
        A list of single-item lists ``[item]``, in order of first appearance
        in ``dataset``, for every item contained in at least
        ``min_support * len(dataset)`` transactions.
    """
    itemcount = {}
    for transaction in dataset:
        # dict.fromkeys() removes duplicates while keeping first-seen order,
        # so each transaction contributes at most 1 to an item's count.
        for item in dict.fromkeys(transaction):
            itemcount[item] = itemcount.get(item, 0) + 1

    min_count = min_support * len(dataset)
    return [[item] for item, count in itemcount.items() if count >= min_count]


def merge_candidate(current_set, previous_set, size, flag):
    """Filter candidate itemsets against the itemsets of the previous level.

    Args:
        current_set: List of candidate itemsets (lists of hashable items).
        previous_set: List of itemsets from the previous level.
        size: Length of the candidates; every ``size - 1`` item combination of
            a candidate is looked up in ``previous_set``.
        flag: ``1`` enables the subset check; any other value disables it.

    Returns:
        With ``flag == 1``: a new list holding the candidates from
        ``current_set`` (same objects, same order) whose ``size - 1`` item
        combinations all appear in ``previous_set``. Combinations are taken
        in the candidate's own item order, so both collections must order
        their items the same way.
        Otherwise: a deep copy of ``current_set``.
    """
    if flag != 1:
        return copy.deepcopy(current_set)

    # Build the lookup once: set membership replaces a linear scan of
    # ``previous_set`` for every subset of every candidate.
    known_itemsets = {tuple(itemset) for itemset in previous_set}
    return [
        candidate
        for candidate in current_set
        if all(
            subset in known_itemsets
            for subset in combinations(candidate, size - 1)
        )
    ]

def generate_candidates_f1_fk1(dataset, f1_set, min_support):
    """Run ``apriori`` with ``flag=0`` (the F1 x F(k-1) join).

    Args:
        dataset: Transactions, forwarded unchanged to ``apriori``.
        f1_set: Seed itemsets, forwarded as ``apriori``'s ``candidate_set``.
        min_support: Minimum support fraction, forwarded unchanged.

    Returns:
        The value returned by ``apriori`` as-is: a 2-tuple of an itemset list
        and a candidate count.
    """
    return apriori(dataset, f1_set, min_support, 0)

def generate_candidates_fk1_fk1(dataset, f1_set, min_support):
    """Run ``apriori`` with ``flag=1`` (the F(k-1) x F(k-1) join).

    Args:
        dataset: Transactions, forwarded unchanged to ``apriori``.
        f1_set: Seed itemsets, forwarded as ``apriori``'s ``candidate_set``.
        min_support: Minimum support fraction, forwarded unchanged.

    Returns:
        The value returned by ``apriori`` as-is: a 2-tuple of an itemset list
        and a candidate count.
    """
    return apriori(dataset, f1_set, min_support, 1)

def apriori(dataset, candidate_set, min_support, flag):
    """Grow frequent itemsets one item at a time, starting from ``candidate_set``.

    Every round joins the current frequent itemsets into candidates that are
    one item longer, passes them through ``merge_candidate``, counts how many
    transactions contain each remaining candidate, and keeps the candidates
    whose count is at least ``min_support * len(dataset)``. The loop ends when
    a round keeps nothing.

    Args:
        dataset: Sized collection of transactions, each an iterable of
            hashable items.
        candidate_set: Seed itemsets (sequences of items) to grow from.
        min_support: Minimum support as a fraction of ``len(dataset)``.
        flag: ``1`` joins pairs of current itemsets that agree on everything
            but their last item (F(k-1) x F(k-1)); this compares prefixes, so
            it relies on itemsets being ordered consistently (generated
            candidates are sorted before use). Any other value extends every
            current itemset with the last item of every seed itemset that it
            does not already contain (F1 x F(k-1)). ``flag`` is also forwarded
            to ``merge_candidate``.

    Returns:
        A tuple ``(candidates, count_candidate_gen)``:

        * ``candidates`` -- the frequent itemsets kept by the last round that
          kept any, i.e. only the longest itemsets reached; ``[]`` when no
          round kept one. The seed itemsets themselves are never included.
        * ``count_candidate_gen`` -- how many candidates ``merge_candidate``
          returned in the last round where it returned at least one. This is
          that single round's count, not a running total.
          NOTE(review): confirm whether a total over all rounds was intended.
    """
    candidates = []
    set_size = 1
    # ``candidate_set`` is rebound every round; the F1 x F(k-1) join keeps
    # extending with the original seeds, so keep a private snapshot of them.
    seed_itemsets = copy.deepcopy(candidate_set)
    count_candidate_gen = 0

    # Loop-invariant work: do it once here instead of once per
    # (transaction, candidate) pair in every round.
    min_count = min_support * len(dataset)
    transaction_sets = [set(transaction) for transaction in dataset]

    while candidate_set:
        if flag == 1:
            # combinations(..., 2) yields the same (i, j) pairs with i < j, in
            # the same order, as a nested index loop would.
            joined = (
                [*items1, items2[-1]]
                for items1, items2 in combinations(candidate_set, 2)
                if items1[:-1] == items2[:-1] and items1[-1] != items2[-1]
            )
        else:
            joined = (
                [*items2, items1[-1]]
                for items1 in seed_itemsets
                for items2 in candidate_set
                if items1[-1] not in items2
            )

        # Sort each joined itemset and keep only its first occurrence.
        temp_candidates = []
        unique_set = set()
        for items in joined:
            items = sorted(items)
            key = tuple(items)
            if key not in unique_set:
                unique_set.add(key)
                temp_candidates.append(items)

        set_size += 1
        pruned_temp_candidates = merge_candidate(
            temp_candidates, candidate_set, set_size, flag
        )
        if pruned_temp_candidates:
            count_candidate_gen = len(pruned_temp_candidates)

        # Count, for each candidate, the transactions that contain it.
        keyed_candidates = [
            (tuple(candidate), frozenset(candidate))
            for candidate in pruned_temp_candidates
        ]
        itemcount = {key: 0 for key, _ in keyed_candidates}
        for transaction_items in transaction_sets:
            for key, members in keyed_candidates:
                if members <= transaction_items:
                    itemcount[key] += 1

        new_itemset = [
            list(key) for key, count in itemcount.items() if count >= min_count
        ]
        if new_itemset:
            # The lists were built fresh above and are never mutated, so no
            # defensive copy is needed before handing them back.
            candidates = new_itemset
        candidate_set = new_itemset

    return candidates, count_candidate_gen

def rules_generator(frequent_set, min_confidence, dataset):
    """Derive association rules ``antecedent => consequent`` from itemsets.

    For every itemset with more than one item, each non-empty proper subset is
    tried as the antecedent; the remaining items form the consequent. The
    confidence of a rule is ``support(itemset) / support(antecedent)``, where
    support is the number of transactions containing all the given items.

    Args:
        frequent_set: Iterable of itemsets (sequences of hashable items).
        min_confidence: Rules with a confidence below this value are dropped.
        dataset: Iterable of transactions, each an iterable of hashable items.

    Returns:
        A list of ``(antecedent, consequent, confidence)`` tuples, where
        ``antecedent`` and ``consequent`` are lists that keep the item order
        of the source itemset. Antecedents that occur in no transaction are
        skipped, since their confidence is undefined.
    """
    transaction_sets = [set(transaction) for transaction in dataset]
    support_cache = {}

    def support_count(items):
        # Number of transactions containing every item; memoised because the
        # same antecedent can show up under several itemsets.
        key = frozenset(items)
        if key not in support_cache:
            support_cache[key] = sum(
                1 for transaction_items in transaction_sets
                if key <= transaction_items
            )
        return support_cache[key]

    rules = []
    for itemset in frequent_set:
        if len(itemset) < 2:
            continue
        # Invariant for every antecedent of this itemset: compute it once.
        itemset_support = support_count(itemset)
        for antecedent_size in range(1, len(itemset)):
            for prefix in combinations(itemset, antecedent_size):
                prefix = list(prefix)
                prefix_support = support_count(prefix)
                if prefix_support == 0:
                    # Avoid dividing by zero for items absent from the data.
                    continue
                # A list comprehension (instead of a set difference) keeps the
                # consequent's item order deterministic.
                suffix = [item for item in itemset if item not in prefix]
                confidence = itemset_support / prefix_support
                if confidence >= min_confidence:
                    rules.append((prefix, suffix, confidence))
    return rules

In [39]:
# Load the dataset, remove the 'Item(s)' column and convert the remaining
# cells to a list of rows (one list per transaction).
df = pd.read_csv('groceries.csv').drop(columns='Item(s)').values.tolist()
# Drop the empty (null) cells from every row
dataset = remove_empty_value(df)

# Defining some default values
min_support = 0.04
min_confidence = 0.07

# Frequent 1-itemsets, then both candidate-generation strategies seeded with them
f_1 = f1(dataset, min_support)
candidates_f1_fk1, f1_fk1_candidate_gencount = generate_candidates_f1_fk1(dataset, f_1, min_support)
candidates_fk1_fk1, fk1_fk1_candidate_gencount = generate_candidates_fk1_fk1(dataset, f_1, min_support)

rules_f1_fk1 = rules_generator(candidates_f1_fk1, min_confidence, dataset)
rules_fk1_fk1 = rules_generator(candidates_fk1_fk1, min_confidence, dataset)

print("The number of candidates in f1 & fk-1 = ", f1_fk1_candidate_gencount)
print("The number of candidates in fk-1 & fk-1 = ", fk1_fk1_candidate_gencount)
print("Candidate generation differences between the first and second = ", abs(f1_fk1_candidate_gencount - fk1_fk1_candidate_gencount))
print("\n")
# The thresholds are interpolated inside the f-string; passing `{min_support}`
# as a separate argument built a set and printed it as "{0.04}".
print(f"Frequent Itemsets f1 & fk-1 with a support >= {min_support}")
for itemset in candidates_f1_fk1:
    print(itemset)
print("\n")
print(f"f1 & fk-1 association rules with a confidence >= {min_confidence}")
for antecedent, consequent, confidence in rules_f1_fk1:
    print(f"{antecedent} => {consequent}, Confidence: {confidence:.2f}")
print("\n")
print(f"The frequent itemsets from fk-1 & fk-1 with a support >= {min_support}")
for itemset in candidates_fk1_fk1:
    print(itemset)
print("\n")
print(f"fk-1 & fk-1 association rules with a confidence >= {min_confidence}")
for antecedent, consequent, confidence in rules_fk1_fk1:
    print(f"{antecedent} => {consequent}, Confidence: {confidence:.2f}")

print("\n The potential solution for this was discussed with classmates")

The number of candidates in f1 & fk-1 =  249
The number of candidates in fk-1 & fk-1 =  3
Candidate generation differences between the first and second =  246


Frequent Itemsets f1 & fk-1 with a support >=  {0.04}
['tropical fruit', 'whole milk']
['whole milk', 'yogurt']
['other vegetables', 'yogurt']
['other vegetables', 'whole milk']
['rolls/buns', 'whole milk']
['soda', 'whole milk']
['root vegetables', 'whole milk']
['other vegetables', 'rolls/buns']
['other vegetables', 'root vegetables']


f1 & fk-1 association rules with a confidence >=  {0.07}
['tropical fruit'] => ['whole milk'], Confidence: 0.40
['whole milk'] => ['tropical fruit'], Confidence: 0.17
['whole milk'] => ['yogurt'], Confidence: 0.22
['yogurt'] => ['whole milk'], Confidence: 0.40
['other vegetables'] => ['yogurt'], Confidence: 0.22
['yogurt'] => ['other vegetables'], Confidence: 0.31
['other vegetables'] => ['whole milk'], Confidence: 0.39
['whole milk'] => ['other vegetables'], Confidence: 0.29
['rolls/buns'] =>