In [1]:
import pandas as pd
from mlxtend import frequent_patterns
from mlxtend.preprocessing import TransactionEncoder

import service
from association import apriori

### data_1

In [2]:
# Read the first transaction log; `lines=True` parses the file as one JSON
# record per line. The preview below shows the first five rows.
df = pd.read_json(
    "data/data_1.jsonl",
    lines=True,
)
df.head(5)

Unnamed: 0,Transaction
0,"[a, b, c, d, e]"
1,"[a, c, d, f]"
2,"[a, b, c, d, e, g]"
3,"[c, d, e, f]"
4,"[c, e, f, h]"


In [3]:
# Pull the `Transaction` column out as a plain Python list (one entry per row)
# and remember how many transactions there are for later support -> count
# conversions.
transactions = df["Transaction"].tolist()
num_transactions = len(transactions)

#### Apriori algorithm

In [4]:
# Thresholds shared by the custom implementation and the mlxtend reference:
# minimum support for itemsets, minimum confidence for rules.
min_support, min_confidence = 0.4, 0.75

In [5]:
# Run the project's own Apriori implementation. The result is consumed below
# as a mapping whose items are (itemset, count) pairs.
frequent_itemsets = apriori.apriori_algorithm(
    transactions,
    min_support=min_support,
)

# Print one line per frequent itemset: the items as a set, then its count.
for itemset, count in frequent_itemsets.items():
    print(f"{set(itemset)}: {count}")

{'a'}: 5
{'c'}: 7
{'d'}: 7
{'e'}: 7
{'f'}: 6
{'a', 'c'}: 4
{'c', 'd'}: 5
{'c', 'e'}: 5
{'c', 'f'}: 4
{'d', 'e'}: 6
{'c', 'd', 'e'}: 4


In [6]:
# Derive association rules from the frequent itemsets found above, using the
# project's own implementation. Each rule is unpacked below as an
# (antecedent, consequent, confidence) triple.
rules = apriori.association_rules(
    frequent_itemsets,
    transactions,
    min_confidence=min_confidence,
)

# Print each rule as "antecedent => consequent (confidence)", confidence to
# two decimal places.
for antecedent, consequent, confidence in rules:
    print(f"{set(antecedent)} => {set(consequent)} ({confidence:.2f})")

{'a'} => {'c'} (0.80)
{'d'} => {'e'} (0.86)
{'e'} => {'d'} (0.86)
{'c', 'd'} => {'e'} (0.80)
{'c', 'e'} => {'d'} (0.80)


Compare with the equivalent algorithm from `mlxtend`

In [7]:
# Encode the transaction lists into the tabular form passed to mlxtend below;
# the column labels come from the fitted encoder (`encoder.columns_`).
encoder = TransactionEncoder()
te_ary = encoder.fit(transactions).transform(transactions)
mlxtend_df = pd.DataFrame(data=te_ary, columns=encoder.columns_)

In [8]:
# Reference result: mlxtend's Apriori on the encoded transactions, with the
# same support threshold and with item names (not column indices) in the output.
mlxtend_frequent_itemsets = frequent_patterns.apriori(
    mlxtend_df, min_support=min_support, use_colnames=True
)

# mlxtend reports relative support, so convert it back to an absolute count to
# print in the same format as the custom implementation.
# The columns are read by name rather than by position, so the loop does not
# depend on the column order of the returned frame.
for support, itemset in zip(
    mlxtend_frequent_itemsets["support"], mlxtend_frequent_itemsets["itemsets"]
):
    # Round instead of truncating: support * n is a float product and can land
    # just below the true integer (e.g. 0.29 * 100 == 28.999999999999996),
    # which int() alone would turn into an off-by-one count.
    count = int(round(support * num_transactions))
    print(f"{set(itemset)}: {count}")

{'a'}: 5
{'c'}: 7
{'d'}: 7
{'e'}: 7
{'f'}: 6
{'a', 'c'}: 4
{'c', 'd'}: 5
{'c', 'e'}: 5
{'c', 'f'}: 4
{'d', 'e'}: 6
{'c', 'd', 'e'}: 4


In [9]:
# Reference rules: mlxtend's rule generation over its own frequent itemsets,
# filtered on confidence with the same threshold as the custom implementation.
# Only the three columns needed for the comparison are kept.
mlxtend_rules = frequent_patterns.association_rules(
    mlxtend_frequent_itemsets,
    metric="confidence",
    min_threshold=min_confidence,
).loc[:, ["antecedents", "consequents", "confidence"]]

mlxtend_rules

Unnamed: 0,antecedents,consequents,confidence
0,(a),(c),0.8
1,(d),(e),0.857143
2,(e),(d),0.857143
3,"(c, d)",(e),0.8
4,"(c, e)",(d),0.8


In [10]:
# The custom frequent itemsets must agree with mlxtend's.
assert service.check_frequent_itemsets_equal(
    frequent_itemsets, mlxtend_frequent_itemsets, num_transactions
), "frequent itemsets differ from mlxtend"

# The custom rules must agree with mlxtend's as well. Rules are keyed on
# (antecedent, consequent) so the comparison does not depend on the order in
# which either implementation emits them.
# NOTE(review): assumes `rules` can be iterated again after the print loop
# above (i.e. it is a list/tuple, not a one-shot generator) — confirm.
rules_by_key = {
    (frozenset(antecedent), frozenset(consequent)): confidence
    for antecedent, consequent, confidence in rules
}
mlxtend_rules_by_key = {
    (frozenset(row.antecedents), frozenset(row.consequents)): row.confidence
    for row in mlxtend_rules.itertuples(index=False)
}

assert rules_by_key.keys() == mlxtend_rules_by_key.keys(), (
    f"rule sets differ: {rules_by_key.keys() ^ mlxtend_rules_by_key.keys()}"
)
# Confidences are floats computed along different code paths, so compare with
# a small tolerance rather than with ==.
assert all(
    abs(rules_by_key[key] - mlxtend_rules_by_key[key]) < 1e-9
    for key in rules_by_key
), "rule confidences differ from mlxtend"

### More datasets (data_2)

In [11]:
# Read the second file (one JSON record per line). As the preview shows, each
# row carries a `Dataset` id next to its `Transaction`, so several datasets
# live in this one frame.
df = pd.read_json(
    "data/data_2.jsonl",
    lines=True,
)
df.head(5)

Unnamed: 0,Dataset,Transaction
0,1,"[c, a, d, b]"
1,1,"[b, c, d]"
2,1,"[a, e, f, g, h]"
3,1,"[e, d, c, g, j, b]"
4,1,"[e, c, d, f, b]"


In [12]:
# Thresholds for the data_2 runs: minimum support for itemsets, minimum
# confidence for rules. Set here so this section can be tuned on its own.
min_support, min_confidence = 0.4, 0.75

In [13]:
# Repeat the custom-vs-mlxtend comparison once per dataset in the frame.
for dataset_id, df_v in df.groupby('Dataset'):
    transactions = df_v['Transaction'].to_list()
    num_transactions = len(transactions)

    # Implementation under test.
    frequent_itemsets = apriori.apriori_algorithm(transactions, min_support=min_support)
    rules = apriori.association_rules(frequent_itemsets, transactions, min_confidence=min_confidence)

    # Reference results from mlxtend, using the same thresholds.
    encoder = TransactionEncoder()
    te_ary = encoder.fit_transform(transactions)
    mlxtend_df = pd.DataFrame(te_ary, columns=encoder.columns_)
    mlxtend_frequent_itemsets = frequent_patterns.apriori(
        mlxtend_df, min_support=min_support, use_colnames=True
    )
    mlxtend_rules = frequent_patterns.association_rules(
        mlxtend_frequent_itemsets, metric="confidence", min_threshold=min_confidence
    )[["antecedents", "consequents", "confidence"]]

    assert service.check_frequent_itemsets_equal(
        frequent_itemsets, mlxtend_frequent_itemsets, num_transactions
    ), f"Dataset {dataset_id}: frequent itemsets differ from mlxtend"

    # Compare the rules too, so that "checks passed" below covers them.
    # Keying on (antecedent, consequent) makes the comparison independent of
    # the order in which either implementation emits its rules.
    rules_by_key = {
        (frozenset(antecedent), frozenset(consequent)): confidence
        for antecedent, consequent, confidence in rules
    }
    mlxtend_rules_by_key = {
        (frozenset(row.antecedents), frozenset(row.consequents)): row.confidence
        for row in mlxtend_rules.itertuples(index=False)
    }
    assert rules_by_key.keys() == mlxtend_rules_by_key.keys(), (
        f"Dataset {dataset_id}: rule sets differ: "
        f"{rules_by_key.keys() ^ mlxtend_rules_by_key.keys()}"
    )
    # Confidences are floats computed along different code paths, so compare
    # with a small tolerance rather than with ==.
    assert all(
        abs(rules_by_key[key] - mlxtend_rules_by_key[key]) < 1e-9
        for key in rules_by_key
    ), f"Dataset {dataset_id}: rule confidences differ from mlxtend"

    print(f"Dataset {dataset_id} checks passed.")

Dataset 1 checks passed.
Dataset 2 checks passed.
Dataset 3 checks passed.
