In [112]:
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder
from itertools import combinations

#Load dữ liệu
df = pd.read_csv('data.csv', header=None)
display(df)

Unnamed: 0,0,1,2,3,4,5
0,Wine,Chips,Bread,Butter,Milk,Apple
1,Wine,,Bread,Butter,Milk,
2,,,Bread,Butter,Milk,
3,,Chips,,,,Apple
4,Wine,Chips,Bread,Butter,Milk,Apple
5,Wine,Chips,,,Milk,
6,Wine,Chips,Bread,Butter,,Apple
7,Wine,Chips,,,Milk,
8,Wine,,Bread,,,Apple
9,Wine,,Bread,Butter,Milk,


In [113]:
records = []
for i in range(0, df.shape[0]):
    records.append([str(df.values[i, j]) for j in range(0, df.shape[1])])

In [114]:
#chuyển records thành transaction
te = TransactionEncoder()
te_ary = te.fit(records).transform(records)
df1 = pd.DataFrame(te_ary, columns=te.columns_)
display(df1)

Unnamed: 0,Apple,Bread,Butter,Chips,Milk,Wine,nan
0,True,True,True,True,True,True,False
1,False,True,True,False,True,True,True
2,False,True,True,False,True,False,True
3,True,False,False,True,False,False,True
4,True,True,True,True,True,True,False
5,False,False,False,True,True,True,True
6,True,True,True,True,False,True,True
7,False,False,False,True,True,True,True
8,True,True,False,False,False,True,True
9,False,True,True,False,True,True,True


In [115]:
frequent_itemsets = apriori(df1, min_support=0.6, use_colnames=True)
display(frequent_itemsets)

Unnamed: 0,support,itemsets
0,0.681818,(Apple)
1,0.727273,(Bread)
2,0.681818,(Butter)
3,0.636364,(Chips)
4,0.772727,(Milk)
5,0.727273,(Wine)
6,0.818182,(nan)
7,0.636364,"(Milk, Wine)"


In [116]:
# build association rules using support metric
rules = association_rules(frequent_itemsets, metric="support", support_only=True, 
                          min_threshold=0.1)

rules = rules[['antecedents', 'consequents', 'support']]
print(rules)

  antecedents consequents   support
0      (Milk)      (Wine)  0.636364
1      (Wine)      (Milk)  0.636364


In [117]:
def find_frequent_1_itemsets(D):
    total_transactions = len(D)

    # Đếm tần suất xuất hiện từng item
    item_counts = {}
    for transaction in np.array(D):
        for item in transaction:
            item_counts[item] = item_counts.get(item, 0) + 1

    # Tính support (%)
    data = []
    for item, count in item_counts.items():
        support = f"{round(count / total_transactions * 100)}%"
        data.append([item, count, support])

    # Tạo bảng dạng pandas DataFrame
    df2 = pd.DataFrame(data, columns=["Item", "frequency", "support"])
    df2 = df2.sort_values(by="Item").reset_index(drop=True)

    return df2

find_frequent_1_itemsets(df)

Unnamed: 0,Item,frequency,support
0,Apple,15,68%
1,Bread,16,73%
2,Butter,15,68%
3,Chips,14,64%
4,Milk,17,77%
5,Wine,16,73%
6,,39,177%


In [118]:
def find_frequent_2_itemsets(D):
    total_txns = len(D)

    # Đếm tần suất cho 1-item và 2-itemsets
    pair_freq = {}
    item_freq = {}

    for transaction in np.array(D):
        # Remove nan values and ensure all items are strings
        filtered_transaction = [str(item) for item in transaction if not pd.isna(item)]
        for item in filtered_transaction:
            item_freq[item] = item_freq.get(item, 0) + 1
        for pair in combinations(sorted(filtered_transaction), 2):
            pair = frozenset(pair)
            pair_freq[pair] = pair_freq.get(pair, 0) + 1

    # Thiết lập minsup và minconf
    minsup = 0.6
    minconf = 0.8

    # Lọc 2-itemsets theo minsup
    filtered_pairs = {}
    data = []
    for pair, freq in pair_freq.items():
        support = freq / total_txns
        if support >= minsup:
            filtered_pairs[pair] = freq
            items = ', '.join(pair)
            data.append([items, freq, f"{int(support * 100)}%"])

    df_freq = pd.DataFrame(data, columns=['Item', 'frequency', 'support'])

    # Tính confidence từ các 2-itemsets thỏa minsup
    rules = []
    for pair, freq_pair in filtered_pairs.items():
        A, B = list(pair)
        # A → B
        conf_AB = freq_pair / item_freq[A]
        if conf_AB >= minconf:
            rules.append((f"{A} → {B}", round(conf_AB * 100)))
        # B → A
        conf_BA = freq_pair / item_freq[B]
        if conf_BA >= minconf:
            rules.append((f"{B} → {A}", round(conf_BA * 100)))

    df_conf = pd.DataFrame(rules, columns=["Rule", "Confidence (%)"])

    return df_freq, df_conf

find_frequent_2_itemsets(df)

(         Item  frequency support
 0  Milk, Wine         14     63%,
           Rule  Confidence (%)
 0  Milk → Wine              82
 1  Wine → Milk              88)

In [119]:
def find_frequent_3_itemsets(D):
    total_transactions = len(D)
    pair_counts = {}

    # Đếm tần suất xuất hiện các 2-itemsets
    for transaction in np.array(D):
        # Remove nan values before sorting
        filtered_transaction = [item for item in transaction if not pd.isna(item)]
        for pair in combinations(sorted(filtered_transaction), 3):
            pair = frozenset(pair)
            pair_counts[pair] = pair_counts.get(pair, 0) + 1

    # Tạo bảng frequency + support
    data = []
    for pair, count in pair_counts.items():
        items = ", ".join(sorted(pair))
        support = f"{round(count / total_transactions * 100)}%"
        data.append([items, count, support])

    # Chuyển sang DataFrame
    df4 = pd.DataFrame(data, columns=["Item", "frequency", "support"])
    df4 = df4.sort_values(by="Item").reset_index(drop=True)

    return df4

find_frequent_3_itemsets(df)

Unnamed: 0,Item,frequency,support
0,"Apple, Bread, Butter",9,41%
1,"Apple, Bread, Chips",8,36%
2,"Apple, Bread, Milk",9,41%
3,"Apple, Bread, Wine",10,45%
4,"Apple, Butter, Chips",8,36%
5,"Apple, Butter, Milk",9,41%
6,"Apple, Butter, Wine",8,36%
7,"Apple, Chips, Milk",7,32%
8,"Apple, Chips, Wine",6,27%
9,"Apple, Milk, Wine",9,41%


In [120]:
from itertools import combinations

def find_frequent_1_itemsets(transactions, min_sup_count):
    item_count = {}
    for txn in transactions:
        for item in txn:
            item_count[item] = item_count.get(item, 0) + 1
    L1 = {frozenset([item]) for item, count in item_count.items() if count >= min_sup_count}
    return L1, item_count

def has_infrequent_subset(candidate, prev_freq_sets):
    k = len(candidate)
    for subset in combinations(candidate, k - 1):
        if frozenset(subset) not in prev_freq_sets:
            return True
    return False

def apriori_gen(prev_freq_sets):
    candidates = set()
    prev_freq_list = list(prev_freq_sets)
    for i in range(len(prev_freq_list)):
        for j in range(i + 1, len(prev_freq_list)):
            l1 = list(prev_freq_list[i])
            l2 = list(prev_freq_list[j])
            l1.sort()
            l2.sort()
            if l1[:-1] == l2[:-1] and l1[-1] < l2[-1]:
                candidate = frozenset(l1 + [l2[-1]])
                if not has_infrequent_subset(candidate, prev_freq_sets):
                    candidates.add(candidate)
    return candidates

def apriori(transactions, min_sup):
    transactions = list(map(set, transactions))
    total_txns = len(transactions)
    min_sup_count = int(min_sup * total_txns)

    L, itemset_count = [], {}

    L1, count1 = find_frequent_1_itemsets(transactions, min_sup_count)
    L.append(L1)
    for item, count in count1.items():
        itemset_count[frozenset([item])] = count

    k = 2
    while True:
        Ck = apriori_gen(L[k - 2])
        freq_ck = {}
        for txn in transactions:
            for candidate in Ck:
                if candidate.issubset(txn):
                    freq_ck[candidate] = freq_ck.get(candidate, 0) + 1
        Lk = {itemset for itemset, count in freq_ck.items() if count >= min_sup_count}
        if not Lk:
            break
        L.append(Lk)
        itemset_count.update(freq_ck)
        k += 1

    frequent_itemsets = set().union(*L)
    return frequent_itemsets, itemset_count, total_txns

def generate_association_rules(frequent_itemsets, itemset_count, min_conf, total_txns):
    rules = []
    for itemset in frequent_itemsets:
        if len(itemset) >= 2:
            for i in range(1, len(itemset)):
                for lhs in combinations(itemset, i):
                    lhs = frozenset(lhs)
                    rhs = itemset - lhs
                    if lhs in itemset_count:
                        conf = itemset_count[itemset] / itemset_count[lhs]
                        if conf >= min_conf:
                            rules.append((set(lhs), set(rhs), round(conf * 100, 2)))
    return rules

minsup = 0.6
minconf = 0.8

frequent_itemsets, itemset_count, total_txns = apriori(records, minsup)
rules = generate_association_rules(frequent_itemsets, itemset_count, minconf, total_txns)

print("-->Frequent itemsets (minsup 60%):")
for item in frequent_itemsets:
    support = round(itemset_count[item] / total_txns * 100, 2)
    print(f"{set(item)}: {itemset_count[item]} times ({support}%)")

print("\n-->Association Rules (minconf 80%):")
for lhs, rhs, conf in rules:
    print(f"{lhs} -> {rhs} (conf: {conf}%)")


-->Frequent itemsets (minsup 60%):
{'Apple'}: 15 times (68.18%)
{'nan'}: 18 times (81.82%)
{'Butter'}: 15 times (68.18%)
{'Chips'}: 14 times (63.64%)
{'Milk'}: 17 times (77.27%)
{'Milk', 'Butter'}: 13 times (59.09%)
{'Wine'}: 16 times (72.73%)
{'Milk', 'Wine'}: 14 times (63.64%)
{'Bread'}: 16 times (72.73%)

-->Association Rules (minconf 80%):
{'Butter'} -> {'Milk'} (conf: 86.67%)
{'Milk'} -> {'Wine'} (conf: 82.35%)
{'Wine'} -> {'Milk'} (conf: 87.5%)
