In [1]:
from itertools import combinations
import pandas as pd

In [2]:
dataset = pd.read_csv('../data/transactions.csv')

In [3]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26531 entries, 0 to 26530
Data columns (total 30 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       26531 non-null  object
 1   1       23677 non-null  object
 2   2       22143 non-null  object
 3   3       21037 non-null  object
 4   4       20142 non-null  object
 5   5       19240 non-null  object
 6   6       18476 non-null  object
 7   7       17702 non-null  object
 8   8       16946 non-null  object
 9   9       16239 non-null  object
 10  10      15556 non-null  object
 11  11      14879 non-null  object
 12  12      14261 non-null  object
 13  13      13635 non-null  object
 14  14      13001 non-null  object
 15  15      12356 non-null  object
 16  16      11707 non-null  object
 17  17      11164 non-null  object
 18  18      10636 non-null  object
 19  19      10073 non-null  object
 20  20      9530 non-null   object
 21  21      9045 non-null   object
 22  22      8625 non-null 

In [4]:
class Aprio:

    def __init__(self, dataset, min_support, min_confidence):
        self.min_support = min_support
        self.min_confidence = min_confidence
        self.transactions = self.list_from_dataframe(dataset)
        self.item_sets = self.generate_item_sets()
        self.frequent_items = self.get_frequent_itemsets()
        self.items_count = self.get_items_count()
        self.rules = self.generate_rules(self.frequent_items, self.min_confidence)
        
    def get_items_count(self):
        items_count = {}
        for transaction in self.transactions:
            for itemset in self.frequent_items:
                if itemset.issubset(transaction):
                    items_count[itemset] = items_count.get(itemset, 0) + 1
        return items_count

    """
        input: transactions in form of a dataframe
        output: a 2-D list of transactions [[item1, item2...], [item1, item3...]]
    """
    def list_from_dataframe(self, dataset):
        list_ = []
        for index in range(dataset.shape[0]):
            row = dataset.iloc[index].dropna()
            list_.append(list(row.values))
        return list_
    
    """
        needed data: 2-D list of transactions
        output: a list of all items in all transactions 
    """
    def generate_item_sets(self):
        return [frozenset([item]) for transaction in self.transactions for item in transaction]
    

    """
        removes items that does not verify the minimum support condition
    """
    def prune_min_supp(self, candidate_counts):
        return {itemset for itemset, count in candidate_counts.items() if count >= self.min_support}
        

    def prune_subsets(self,candidates, prev_frequent_itemsets):
        pruned_candidates = set()
        for candidate in candidates:
            is_valid = True
            subsets = combinations(candidate, len(candidate) - 1)
            for subset in subsets:
                if frozenset(subset) not in prev_frequent_itemsets:
                    is_valid = False
                    break
            if is_valid:
                pruned_candidates.add(candidate)
        return pruned_candidates


    def generate_next_candidates_set(self, prev_candidates, k):
        candidates = set()
        for itemset1 in prev_candidates:
            for itemset2 in prev_candidates:
                union_set = itemset1.union(itemset2)
                if len(union_set) == k:
                    candidates.add(union_set)
        return candidates


    """
        needed data: 2-D list of transactions
        output: list of union(F_k), frequent items in eatch iteration 
    """
    def get_frequent_itemsets(self):

        itemsets = self.item_sets.copy()
        frequent_itemsets = []
        
        k = 2

        while itemsets:

            candidate_counts = {}
            for transaction in self.transactions:
                for candidate in itemsets:
                    if candidate.issubset(transaction):
                        candidate_counts[candidate] = candidate_counts.get(candidate, 0) + 1
            
            frequent_itemsets_k = self.prune_min_supp(candidate_counts)
            
            candidates_k = self.generate_next_candidates_set(frequent_itemsets_k, k)
            
            candidates_k = self.prune_subsets(candidates_k, frequent_itemsets_k)
            
            frequent_itemsets.extend(frequent_itemsets_k)

            itemsets = candidates_k
            
            k += 1
        
        return frequent_itemsets

    def calculate_confidence(self, itemset, antecedent):
        return self.items_count[itemset] / self.items_count[antecedent]

    def calculate_lift(self, confidence, consequent):
        return confidence / self.calculate_support(consequent)

    def calculate_support(self, itemset):
        return self.items_count[itemset] / len(self.transactions)

    def generate_rules(self, frequent_itemsets, min_confidence):
        rules = []
        for itemset in frequent_itemsets:
            if len(itemset) > 1:
                itemset_list = list(itemset)
                for i in range(1, len(itemset)):
                    antecedent = frozenset(itemset_list[:i])
                    consequent = frozenset(itemset_list[i:])
                    confidence = self.calculate_confidence(itemset, antecedent)
                    lift = self.calculate_lift(confidence, consequent)
                    if confidence >= min_confidence:
                        rules.append({"antecedent": list(antecedent), "consequent": list(consequent), "confidence": confidence, "lift": lift})
        return rules

In [5]:
apr = Aprio(dataset.head(100), 5, 0.5)

In [6]:
apr.rules

[{'antecedent': ['22727'],
  'consequent': ['22726'],
  'confidence': 0.7,
  'lift': 9.999999999999998},
 {'antecedent': ['22780'],
  'consequent': ['22779'],
  'confidence': 1.0,
  'lift': 20.0},
 {'antecedent': ['22727'],
  'consequent': ['22728'],
  'confidence': 0.6,
  'lift': 8.571428571428571},
 {'antecedent': ['22629'],
  'consequent': ['POST'],
  'confidence': 0.75,
  'lift': 1.9736842105263157},
 {'antecedent': ['84992'],
  'consequent': ['84991'],
  'confidence': 1.0,
  'lift': 12.5},
 {'antecedent': ['22326'],
  'consequent': ['22630'],
  'confidence': 0.5384615384615384,
  'lift': 4.1420118343195265},
 {'antecedent': ['22629'],
  'consequent': ['22630'],
  'confidence': 0.8333333333333334,
  'lift': 6.410256410256411},
 {'antecedent': ['22556'],
  'consequent': ['22555'],
  'confidence': 0.8333333333333334,
  'lift': 11.904761904761903},
 {'antecedent': ['22698'],
  'consequent': ['22423'],
  'confidence': 1.0,
  'lift': 5.2631578947368425},
 {'antecedent': ['15056BL'],
  '