In [12]:
from itertools import combinations
import pandas as pd

In [13]:
dataset = pd.read_csv('../data/transactions.csv')

In [14]:
class Aprio:

    def __init__(self, dataset, min_support, min_confidence):
        self.min_support = min_support
        self.min_confidence = min_confidence
        self.transactions = self.list_from_dataframe(dataset)
        self.item_sets = self.generate_item_sets()
        self.frequent_items = self.get_frequent_itemsets()
        self.items_count = self.get_items_count()
        self.rules = self.generate_rules(self.frequent_items, self.min_confidence)
        
    def get_items_count(self):
        print("====> getting items' count")
        items_count = {}
        for transaction in self.transactions:
            for itemset in self.frequent_items:
                if itemset.issubset(transaction):
                    items_count[itemset] = items_count.get(itemset, 0) + 1
        return items_count

    """
        input: transactions in form of a dataframe
        output: a 2-D list of transactions [[item1, item2...], [item1, item3...]]
    """
    def list_from_dataframe(self, dataset):
        print("====> getting list of transactions")
        list_ = []
        for index in range(dataset.shape[0]):
            row = dataset.iloc[index].dropna()
            list_.append(list(row.values))
        return list_
    
    """
        needed data: 2-D list of transactions
        output: a list of all items in all transactions 
    """
    def generate_item_sets(self):
        print("====> generating item sets")
        return [frozenset([item]) for transaction in self.transactions for item in transaction]
    

    """
        removes items that does not verify the minimum support condition
    """
    def prune_min_supp(self, candidate_counts):
        print("====> pruning min support")
        return {itemset for itemset, count in candidate_counts.items() if count  >= self.min_support}
        

    def prune_subsets(self,candidates, prev_frequent_itemsets):
        print("====> pruning")
        pruned_candidates = set()
        for candidate in candidates:
            is_valid = True
            subsets = combinations(candidate, len(candidate) - 1)
            for subset in subsets:
                if frozenset(subset) not in prev_frequent_itemsets:
                    is_valid = False
                    break
            if is_valid:
                pruned_candidates.add(candidate)
        return pruned_candidates


    def generate_next_candidates_set(self, prev_candidates, k):
        print("====> generating next condidates")
        candidates = set()
        for itemset1 in prev_candidates:
            for itemset2 in prev_candidates:
                union_set = itemset1.union(itemset2)
                if len(union_set) == k:
                    candidates.add(union_set)
        return candidates


    """
        needed data: 2-D list of transactions
        output: list of union(F_k), frequent items in eatch iteration 
    """
    def get_frequent_itemsets(self):
        print("====> getting frequent items")
        itemsets = self.item_sets.copy()
        frequent_itemsets = []
        
        k = 2

        while itemsets:

            candidate_counts = {}
            for transaction in self.transactions:
                for candidate in itemsets:
                    if candidate.issubset(transaction):
                        candidate_counts[candidate] = candidate_counts.get(candidate, 0) + 1
            
            frequent_itemsets_k = self.prune_min_supp(candidate_counts)
            
            candidates_k = self.generate_next_candidates_set(frequent_itemsets_k, k)
            
            candidates_k = self.prune_subsets(candidates_k, frequent_itemsets_k)
            
            frequent_itemsets.extend(frequent_itemsets_k)

            itemsets = candidates_k
            
            k += 1
        
        return frequent_itemsets

    def calculate_confidence(self, itemset, antecedent):
        return self.items_count[itemset] / self.items_count[antecedent]

    def calculate_lift(self, confidence, consequent):
        return confidence / self.calculate_support(consequent)

    def calculate_support(self, itemset):
        return self.items_count[itemset] / len(self.transactions)

   
    def generate_rules(self, frequent_itemsets, min_confidence):
        print("====> generating rulesl")
        rules = []
        for itemset in frequent_itemsets:
            if len(itemset) > 1:
                itemset_list = list(itemset)
                for i in range(1, len(itemset)):
                    antecedent = frozenset(itemset_list[:i])
                    consequent = frozenset(itemset_list[i:])
                    confidence = self.calculate_confidence(itemset, antecedent)
                    lift = self.calculate_lift(confidence, consequent)
                    if confidence >= min_confidence:
                        if len(list(antecedent)) > 0 and len(list(consequent)) > 0:
                            rules.append({"antecedent": list(antecedent), "consequent": list(consequent), "confidence": confidence, "lift": lift})
        return rules
    

In [15]:
apr = Aprio(dataset.sample(100), 5, 0.4)   

====> getting list of transactions
====> generating item sets
====> getting frequent items
====> pruning min support
====> generating next condidates
====> pruning
====> pruning min support
====> generating next condidates
====> pruning
====> pruning min support
====> generating next condidates
====> pruning
====> pruning min support
====> generating next condidates
====> pruning
====> getting items' count
====> generating rulesl


In [16]:
len(apr.rules)

33

In [127]:
rules_df = pd.DataFrame(apr.rules)
rules_df

Unnamed: 0,antecedent,consequent,confidence,lift
0,[22630],[22326],0.466667,3.562341
1,[23175],[22699],0.562500,19.396552
2,[22628],[POST],0.444444,1.019368
3,[23309],[POST],0.500000,1.146789
4,[22554],[22551],0.469880,6.436706
...,...,...,...,...
2064,"[23291, 23290, 22630, 23292]","[22629, 23289]",0.714286,119.047619
2065,"[23291, 23290, 23292, 22630, 22629]",[23289],1.000000,55.555556
2066,"[22554, 22326, 22551]","[22556, POST, 22328]",0.400000,36.363636
2067,"[22556, 22554, 22326, 22551]","[POST, 22328]",0.500000,9.615385


In [129]:
import json 
import re

def rules_to_json(rules, path):   

    def filter_fields(dictionary, fields_to_keep):
        return {key: value for key, value in dictionary.items() if key in fields_to_keep}

    filtered_data = [filter_fields(item, ['antecedent', 'consequent', 'lift', 'confidence']) for item in rules]

    json_data = json.dumps(filtered_data, indent=2)
    with open(f'{path}/rules.json', 'w') as json_file:
        json_file.write(json_data)
        json_file.close()
        print("successfully wrote results.")
    

def rules_to_markdown(rules, path):
    rules_file = open(f'{path}/rules.md', 'w')
    line = f"<ol>\n"
    for rule in rules:
        line += f"<li>"
        
        line += f"{rule['antecedent']}"
        
        line += f" ========> "
        
        line += f"{rule['consequent']} "
        line += f"</li>\n"
    line += f"</ol>"
    rules_file.write(line)

    rules_file.close()

def rules_to_xlsx(rules):
    pd.DataFrame(rules).to_csv('../results/results.xlsx')

def map_rules(rules):
    mapping = open("../data/map_stockCode_item.json", "r")
    json_map = json.load(mapping)
    
    cp_rules = rules.copy()
    mapped_rules = []
    for rule in cp_rules:
        ant = []
        for item in rule['antecedent']:
            ant.append(json_map[item])

        cons = []
        for item in rule['consequent']:
            cons.append(json_map[item])

        obj = {'antecedent': ', '.join(ant), 'consequent': ', '.join(cons), 'confidence': rule['confidence'], 'lift': rule['lift']}
        mapped_rules.append(obj)

    return mapped_rules


print("mapping to items...")
rules = map_rules(apr.rules)
print("writing to file...")
rules_to_markdown(rules, "../results")
rules_to_json(rules, "../results")
rules_to_xlsx(rules)
rules

mapping to items...
writing to file...
successfully wrote results.


[{'antecedent': 'DOLLY GIRL LUNCH BOX',
  'consequent': 'ROUND SNACK BOXES SET OF4 WOODLAND',
  'confidence': 0.4666666666666667,
  'lift': 3.5623409669211195},
 {'antecedent': 'REGENCY MILK JUG PINK',
  'consequent': 'ROSES REGENCY TEACUP AND SAUCER',
  'confidence': 0.5625,
  'lift': 19.39655172413793},
 {'antecedent': 'PICNIC BOXES SET OF 3 RETROSPOT',
  'consequent': 'POSTAGE',
  'confidence': 0.4444444444444444,
  'lift': 1.019367991845056},
 {'antecedent': 'SET OF 60 I LOVE LONDON CAKE CASES',
  'consequent': 'POSTAGE',
  'confidence': 0.5,
  'lift': 1.146788990825688},
 {'antecedent': 'PLASTERS IN TIN WOODLAND ANIMALS',
  'consequent': 'PLASTERS IN TIN SPACEBOY',
  'confidence': 0.46987951807228917,
  'lift': 6.43670572701766},
 {'antecedent': 'TRADTIONAL ALPHABET STAMP SET',
  'consequent': 'MINI LIGHTS WOODLAND MUSHROOMS',
  'confidence': 0.5714285714285714,
  'lift': 18.433179723502302},
 {'antecedent': 'PLASTERS IN TIN CIRCUS PARADE',
  'consequent': 'PLASTERS IN TIN WOODLAN

In [133]:
pd.DataFrame(rules).isna().sum()

antecedent    0
consequent    0
confidence    0
lift          0
dtype: int64