In [1]:
import operator

import pandas as pd

# Load the raw sales export. The path is relative, so the CSV must sit in the
# notebook's working directory.
DATA = pd.read_csv('Bakery sales.csv')
# Preview the first rows: one row per (ticket, article) line with its
# quantity and unit price.
DATA.head()

Unnamed: 0.1,Unnamed: 0,date,time,ticket_number,article,Quantity,unit_price
0,0,2021-01-02,08:38,150040.0,BAGUETTE,1.0,"0,90 €"
1,1,2021-01-02,08:38,150040.0,PAIN AU CHOCOLAT,3.0,"1,20 €"
2,4,2021-01-02,09:14,150041.0,PAIN AU CHOCOLAT,2.0,"1,20 €"
3,5,2021-01-02,09:14,150041.0,PAIN,1.0,"1,15 €"
4,8,2021-01-02,09:25,150042.0,TRADITIONAL BAGUETTE,5.0,"1,20 €"


In [2]:
# Distinct article names. An article's position in this array is what the
# following cells use as its integer item id, so this must be computed while
# the 'article' column still holds the original names.
ITEMS = DATA['article'].unique()
ITEMS

array(['BAGUETTE', 'PAIN AU CHOCOLAT', 'PAIN', 'TRADITIONAL BAGUETTE',
       'CROISSANT', 'BANETTE', 'BANETTINE', 'SPECIAL BREAD', 'COUPE',
       'SAND JB EMMENTAL', 'KOUIGN AMANN', 'BOULE 200G', 'BOULE 400G',
       'GAL FRANGIPANE 6P', 'CAMPAGNE', 'MOISSON', 'CAFE OU EAU',
       'BRIOCHE', 'CEREAL BAGUETTE', 'SEIGLE', 'COMPLET',
       'DIVERS PATISSERIE', 'GAL FRANGIPANE 4P', 'COOKIE', 'FICELLE',
       'PAIN AUX RAISINS', 'GAL POMME 6P', 'GAL POMME 4P', 'FINANCIER X5',
       'VIK BREAD', 'DIVERS VIENNOISERIE', 'GACHE', 'SANDWICH COMPLET',
       'PAIN BANETTE', 'GRAND FAR BRETON', 'QUIM BREAD',
       'SPECIAL BREAD KG', 'GD KOUIGN AMANN', 'BOULE POLKA',
       'DEMI BAGUETTE', 'CHAUSSON AUX POMMES', 'BAGUETTE GRAINE',
       'DIVERS CONFISERIE', 'SUCETTE', 'DIVERS BOULANGERIE',
       'BOISSON 33CL', 'PATES', 'FORMULE SANDWICH', 'DIVERS SANDWICHS',
       'CROISSANT AMANDES', 'PAIN CHOCO AMANDES', 'SACHET VIENNOISERIE',
       'NANTAIS', 'CHOCOLAT', 'PAIN S/SEL', 'FONDANT CHOC

In [3]:
# Encode every article name as its position in ITEMS so that a basket can be
# represented as an integer bitmap (bit i set <=> item i is in the basket).
ITEM_IDS = {name: item_id for item_id, name in enumerate(ITEMS)}

# Series.map is used instead of DataFrame.replace(..., inplace=True): replacing
# strings by integers through `replace` relies on pandas' deprecated silent
# downcasting and emits a FutureWarning. Values that are not article names
# (e.g. ids produced by a previous run of this cell) are passed through
# unchanged, so re-running the cell is harmless.
DATA['article'] = DATA['article'].map(lambda article: ITEM_IDS.get(article, article))
DATA.head()

  DATA.replace({'article': dict(zip(ITEMS, range(len(ITEMS))))}, inplace=True)


Unnamed: 0.1,Unnamed: 0,date,time,ticket_number,article,Quantity,unit_price
0,0,2021-01-02,08:38,150040.0,0,1.0,"0,90 €"
1,1,2021-01-02,08:38,150040.0,1,3.0,"1,20 €"
2,4,2021-01-02,09:14,150041.0,1,2.0,"1,20 €"
3,5,2021-01-02,09:14,150041.0,2,1.0,"1,15 €"
4,8,2021-01-02,09:25,150042.0,3,5.0,"1,20 €"


In [10]:
from typing import Iterable

def to_bitmap(items: Iterable[int]) -> int:
    """Pack a collection of item ids into a single integer bitmap.

    Bit ``i`` of the result is set if and only if ``i`` occurs in ``items``;
    duplicates are harmless and an empty collection yields ``0``.

    Args:
        items: Non-negative integer item ids. Any object implementing
            ``__index__`` is accepted (Python ints, NumPy integers, ...).

    Returns:
        A plain Python ``int`` of arbitrary precision.

    Raises:
        TypeError: If an item is not an integer-like object (e.g. a float).
        ValueError: If an item is negative.
    """
    bitmap = 0
    for item in items:
        # Coerce to a Python int first: shifting by a NumPy integer is done in
        # fixed-width arithmetic, which silently loses every bit at index 64
        # and above.
        bitmap |= 1 << operator.index(item)
    return bitmap

def from_bitmap(bitmap: int) -> Iterable[int]:
    """Lazily enumerate the indices of the set bits of ``bitmap``.

    Indices are produced in increasing order, starting from the least
    significant bit. The result is a one-shot generator.
    """
    positions = range(bitmap.bit_length())
    # Shifting the bit of interest down to position 0 and masking it is
    # equivalent to testing it in place with ``bitmap & (1 << position)``.
    return (position for position in positions if (bitmap >> position) & 1)

# Collapse every ticket into one row whose 'items' value is the bitmap of all
# article ids that appear on that ticket.
TRANSACTIONS = (
    DATA
    .groupby('ticket_number')['article']
    .apply(to_bitmap)
    .reset_index(name='items')
)
TRANSACTIONS.head()

Unnamed: 0,ticket_number,items
0,150040.0,3
1,150041.0,6
2,150042.0,8
3,150043.0,17
4,150044.0,32


In [20]:
import itertools

# Number of rows in TRANSACTIONS (one per ticket). Not referenced by the rest
# of this cell; apriori() derives the count from its own argument.
TRANSACTION_COUNT = len(TRANSACTIONS)
# Minimum support, expressed as a fraction of all transactions: a pattern is
# kept when it occurs in at least MINSUP * (number of transactions) of them.
MINSUP = 0.01

def remove_infrequent_patterns(transactions: Iterable[int], patterns: list[int], threshold: float) -> None:
    """Drop, in place, every pattern supported by fewer than ``threshold`` transactions.

    A transaction supports a pattern when all of the pattern's bits are set in
    the transaction bitmap. The relative order of the surviving patterns is
    preserved.

    Args:
        transactions: Transaction bitmaps. Iterated exactly once.
        patterns: Candidate pattern bitmaps; this list is modified in place.
        threshold: Minimum absolute support count (may be fractional).
    """
    counts = [0] * len(patterns)

    for transaction in transactions:
        for i, pattern in enumerate(patterns):
            # Subset test on bitmaps: pattern is contained in transaction.
            if transaction & pattern == pattern:
                counts[i] += 1

    # One slice assignment keeps the caller's list object while avoiding the
    # quadratic cost of deleting elements one by one.
    patterns[:] = [pattern for pattern, count in zip(patterns, counts) if count >= threshold]

def get_candidates(F_k: list[int], k: int) -> set[int]:
    C_k_plus_1 = set()

    for i, p in enumerate(F_k):
        for q in F_k[i + 1:]:
            p_union_q = p | q
            
            cardinal = p_union_q.bit_count()
            if cardinal != k + 1:
                continue

            all_subsets_of_candidate_are_frequent = True
            for subset in itertools.combinations(from_bitmap(p_union_q), k):
                if to_bitmap(subset) not in F_k:
                    all_subsets_of_candidate_are_frequent = False
                    break
            if not all_subsets_of_candidate_are_frequent:
                continue

            C_k_plus_1.add(p_union_q)

    return C_k_plus_1

def apriori(transactions: Iterable[int], item_count: int, minsup: float) -> Iterable[int]:
    """Mine all frequent patterns with the level-wise Apriori algorithm.

    Args:
        transactions: Transaction bitmaps (bit ``i`` set <=> item ``i`` is in
            the transaction). Any iterable is accepted, including one-shot
            iterators.
        item_count: Number of distinct items; item ids are ``0 .. item_count - 1``.
        minsup: Minimum support as a fraction of the number of transactions.

    Returns:
        An iterable over the frequent pattern bitmaps, ordered by increasing
        pattern size.
    """
    # Materialise once: the data is scanned at every level and its length is
    # needed for the threshold, neither of which works on a one-shot iterator.
    transactions = list(transactions)
    if not transactions:
        # Without data the threshold would be 0, every pattern would count as
        # frequent and the search would enumerate all 2**item_count subsets.
        return iter(())

    threshold = minsup * len(transactions)

    # Start with singletons.
    F_1 = [to_bitmap([item]) for item in range(item_count)]
    remove_infrequent_patterns(transactions, F_1, threshold)

    F = [F_1]

    # Level k + 1 is built from level k; a pattern cannot hold more than
    # item_count items, hence the upper bound.
    for k in range(1, item_count):
        F_k_plus_1 = list(get_candidates(F[-1], k))
        remove_infrequent_patterns(transactions, F_k_plus_1, threshold)

        # No frequent pattern of this size means none of any larger size.
        if not F_k_plus_1:
            break

        F.append(F_k_plus_1)

    return (pattern for F_k in F for pattern in F_k)

# Mine the frequent patterns; at this point they are still integer bitmaps.
FREQUENT_PATTERNS = apriori(TRANSACTIONS['items'], len(ITEMS), MINSUP)
# Decode every bitmap back into the list of article names it stands for.
FREQUENT_PATTERNS = [
    [ITEMS[item] for item in from_bitmap(pattern)]
    for pattern in FREQUENT_PATTERNS
]

print(FREQUENT_PATTERNS)

[['BAGUETTE'], ['PAIN AU CHOCOLAT'], ['PAIN'], ['TRADITIONAL BAGUETTE'], ['CROISSANT'], ['BANETTE'], ['BANETTINE'], ['SPECIAL BREAD'], ['COUPE'], ['SAND JB EMMENTAL'], ['BOULE 200G'], ['BOULE 400G'], ['CAMPAGNE'], ['MOISSON'], ['CAFE OU EAU'], ['BRIOCHE'], ['CEREAL BAGUETTE'], ['COMPLET'], ['COOKIE'], ['FICELLE'], ['PAIN AUX RAISINS'], ['VIK BREAD'], ['SANDWICH COMPLET'], ['PAIN BANETTE'], ['GRAND FAR BRETON'], ['CHAUSSON AUX POMMES'], ['BAGUETTE GRAINE'], ['BOISSON 33CL'], ['FORMULE SANDWICH'], ['CROISSANT AMANDES'], ['PAIN CHOCO AMANDES'], ['TARTELETTE'], ['ECLAIR'], ['BAGUETTE', 'TRADITIONAL BAGUETTE'], ['PAIN AU CHOCOLAT', 'TRADITIONAL BAGUETTE'], ['BAGUETTE', 'CROISSANT'], ['PAIN AU CHOCOLAT', 'CROISSANT'], ['TRADITIONAL BAGUETTE', 'CROISSANT'], ['TRADITIONAL BAGUETTE', 'VIK BREAD'], ['BAGUETTE', 'COUPE'], ['COUPE', 'CAMPAGNE'], ['COUPE', 'MOISSON'], ['TRADITIONAL BAGUETTE', 'COUPE'], ['SPECIAL BREAD', 'COUPE'], ['COUPE', 'COMPLET'], ['COUPE', 'VIK BREAD'], ['COUPE', 'BOULE 200G']