In [37]:
pip install mlxtend



# Load and Prepare the Dataset

In [38]:
import pandas as pd

# Load the dataset (update path if needed)
df = pd.read_excel('LAB1-ARM-Dataset.xlsx')

# Drop Transaction ID for mining
df_clean = df.drop(columns=['Transaction ID'])

print("Sample Data:")
df_clean

Sample Data:


Unnamed: 0,Bread,Milk,Diaper,Beer,Cola,Chicken,Eggs,Apple,Juice,Banana
0,1,1,0,1,0,0,1,0,0,1
1,1,0,1,1,1,0,1,1,0,0
2,1,1,1,0,1,1,0,1,1,0
3,0,1,1,1,0,1,1,0,1,0
4,1,1,1,0,1,0,0,1,0,1
5,1,1,1,0,0,1,1,1,0,0
6,1,0,1,1,1,1,0,0,1,0
7,0,1,0,1,0,1,1,1,0,0
8,1,0,1,0,1,0,1,0,1,0
9,1,1,0,0,1,1,0,1,1,0


#PART 1: APRIORI ALGORITHM
Task 1: Mine Frequent Itemsets

In [39]:
from mlxtend.frequent_patterns import apriori, association_rules

# Apriori - support threshold = 0.3
frequent_itemsets_apriori = apriori(df_clean, min_support=0.3, use_colnames=True)
print("\nFrequent Itemsets (Apriori):")
print(frequent_itemsets_apriori)


Frequent Itemsets (Apriori):
     support                 itemsets
0   0.733333                  (Bread)
1   0.666667                   (Milk)
2   0.733333                 (Diaper)
3   0.466667                   (Beer)
4   0.600000                   (Cola)
5   0.600000                (Chicken)
6   0.600000                   (Eggs)
7   0.600000                  (Apple)
8   0.533333                  (Juice)
9   0.333333                 (Banana)
10  0.466667            (Bread, Milk)
11  0.533333          (Diaper, Bread)
12  0.533333            (Cola, Bread)
13  0.400000         (Chicken, Bread)
14  0.400000            (Eggs, Bread)
15  0.400000           (Apple, Bread)
16  0.400000           (Juice, Bread)
17  0.400000           (Diaper, Milk)
18  0.533333          (Chicken, Milk)
19  0.400000             (Eggs, Milk)
20  0.400000            (Apple, Milk)
21  0.333333           (Beer, Diaper)
22  0.466667           (Cola, Diaper)
23  0.400000        (Chicken, Diaper)
24  0.400000        

Task 2: Generate Association Rules

In [40]:
# Generate rules - confidence threshold = 0.6
rules_apriori = association_rules(frequent_itemsets_apriori, metric="confidence", min_threshold=0.6)

# Select relevant columns
rules_apriori = rules_apriori[['antecedents', 'consequents', 'support', 'confidence', 'lift']]
print("\nAssociation Rules (Apriori):")
print(rules_apriori)



Association Rules (Apriori):
        antecedents     consequents   support  confidence      lift
0           (Bread)          (Milk)  0.466667    0.636364  0.954545
1            (Milk)         (Bread)  0.466667    0.700000  0.954545
2          (Diaper)         (Bread)  0.533333    0.727273  0.991736
3           (Bread)        (Diaper)  0.533333    0.727273  0.991736
4            (Cola)         (Bread)  0.533333    0.888889  1.212121
..              ...             ...       ...         ...       ...
63   (Cola, Diaper)         (Apple)  0.333333    0.714286  1.190476
64    (Juice, Cola)        (Diaper)  0.333333    0.833333  1.136364
65  (Juice, Diaper)          (Cola)  0.333333    0.714286  1.190476
66   (Cola, Diaper)         (Juice)  0.333333    0.714286  1.339286
67          (Juice)  (Cola, Diaper)  0.333333    0.625000  1.339286

[68 rows x 5 columns]


Task 3: Intepretation Example

In [41]:
# Example output interpretation
for idx, row in rules_apriori.iterrows():
    print(f"Rule: {set(row['antecedents'])} → {set(row['consequents'])}")
    print(f"Support: {row['support']:.2f}, Confidence: {row['confidence']:.2f}, Lift: {row['lift']:.2f}\n")


Rule: {'Bread'} → {'Milk'}
Support: 0.47, Confidence: 0.64, Lift: 0.95

Rule: {'Milk'} → {'Bread'}
Support: 0.47, Confidence: 0.70, Lift: 0.95

Rule: {'Diaper'} → {'Bread'}
Support: 0.53, Confidence: 0.73, Lift: 0.99

Rule: {'Bread'} → {'Diaper'}
Support: 0.53, Confidence: 0.73, Lift: 0.99

Rule: {'Cola'} → {'Bread'}
Support: 0.53, Confidence: 0.89, Lift: 1.21

Rule: {'Bread'} → {'Cola'}
Support: 0.53, Confidence: 0.73, Lift: 1.21

Rule: {'Chicken'} → {'Bread'}
Support: 0.40, Confidence: 0.67, Lift: 0.91

Rule: {'Eggs'} → {'Bread'}
Support: 0.40, Confidence: 0.67, Lift: 0.91

Rule: {'Apple'} → {'Bread'}
Support: 0.40, Confidence: 0.67, Lift: 0.91

Rule: {'Juice'} → {'Bread'}
Support: 0.40, Confidence: 0.75, Lift: 1.02

Rule: {'Milk'} → {'Diaper'}
Support: 0.40, Confidence: 0.60, Lift: 0.82

Rule: {'Chicken'} → {'Milk'}
Support: 0.53, Confidence: 0.89, Lift: 1.33

Rule: {'Milk'} → {'Chicken'}
Support: 0.53, Confidence: 0.80, Lift: 1.33

Rule: {'Eggs'} → {'Milk'}
Support: 0.40, Confidenc

#Part 2: FP-growth Algorithm
Task 4: Mine Frequent Itemsets

In [42]:
pip install pyfpgrowth



In [43]:
import pyfpgrowth

# Convert each row to list of items
transactions = []
for i, row in df_clean.iterrows():
    transaction = [item for item in df_clean.columns if row[item] == 1]
    transactions.append(transaction)

# Minimum support = 0.3 * 15 = 4.5 → round to 5
patterns = pyfpgrowth.find_frequent_patterns(transactions, 5)
print("\nFrequent Itemsets (FP-growth):")
for itemset, support in patterns.items():
    print(f"{itemset}: {support}")



Frequent Itemsets (FP-growth):
('Banana',): 5
('Beer',): 7
('Beer', 'Diaper'): 5
('Chicken', 'Juice'): 5
('Cola', 'Juice'): 6
('Cola', 'Diaper', 'Juice'): 5
('Bread', 'Cola', 'Juice'): 5
('Bread', 'Diaper', 'Juice'): 5
('Diaper', 'Juice'): 7
('Chicken', 'Eggs'): 5
('Chicken', 'Eggs', 'Milk'): 5
('Eggs', 'Milk'): 6
('Bread', 'Eggs'): 6
('Diaper', 'Eggs'): 6
('Cola', 'Diaper'): 7
('Bread', 'Cola', 'Diaper'): 6
('Bread', 'Cola'): 8
('Apple', 'Chicken'): 5
('Apple', 'Chicken', 'Milk'): 5
('Apple', 'Cola'): 6
('Apple', 'Cola', 'Diaper'): 5
('Apple', 'Bread', 'Cola'): 5
('Apple', 'Bread', 'Diaper'): 5
('Apple', 'Milk'): 6
('Apple', 'Diaper'): 7
('Chicken', 'Diaper'): 6
('Chicken', 'Diaper', 'Milk'): 5
('Bread', 'Chicken'): 6
('Bread', 'Chicken', 'Milk'): 5
('Chicken', 'Milk'): 8
('Diaper', 'Milk'): 6
('Bread', 'Milk'): 7
('Bread',): 11
('Diaper',): 11
('Bread', 'Diaper'): 8


Task 5: Generate Association Rules

In [44]:
# Minimum confidence = 0.6
rules_fp = pyfpgrowth.generate_association_rules(patterns, 0.6)

print("\nAssociation Rules (FP-growth):")
for antecedent, (consequent, confidence) in rules_fp.items():
    print(f"Rule: {antecedent} → {consequent} | Confidence: {confidence:.2f}")



Association Rules (FP-growth):
Rule: ('Beer',) → ('Diaper',) | Confidence: 0.71
Rule: ('Cola', 'Diaper') → ('Apple',) | Confidence: 0.71
Rule: ('Cola', 'Juice') → ('Bread',) | Confidence: 0.83
Rule: ('Diaper', 'Juice') → ('Bread',) | Confidence: 0.71
Rule: ('Bread', 'Cola') → ('Apple',) | Confidence: 0.62
Rule: ('Bread', 'Diaper') → ('Apple',) | Confidence: 0.62
Rule: ('Diaper',) → ('Bread',) | Confidence: 0.73
Rule: ('Chicken', 'Eggs') → ('Milk',) | Confidence: 1.00
Rule: ('Chicken', 'Milk') → ('Bread',) | Confidence: 0.62
Rule: ('Eggs', 'Milk') → ('Chicken',) | Confidence: 0.83
Rule: ('Bread',) → ('Diaper',) | Confidence: 0.73
Rule: ('Apple', 'Chicken') → ('Milk',) | Confidence: 1.00
Rule: ('Apple', 'Milk') → ('Chicken',) | Confidence: 0.83
Rule: ('Apple', 'Cola') → ('Bread',) | Confidence: 0.83
Rule: ('Apple', 'Diaper') → ('Bread',) | Confidence: 0.71
Rule: ('Chicken', 'Diaper') → ('Milk',) | Confidence: 0.83
Rule: ('Diaper', 'Milk') → ('Chicken',) | Confidence: 0.83
Rule: ('Bread'

Task 6: Comparison Table

In [45]:
import warnings
warnings.filterwarnings('ignore')

In [46]:
import time
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
import pyfpgrowth

# ------------ APRIORI ------------
start_apriori = time.time()

frequent_itemsets_apriori = apriori(df_clean, min_support=0.3, use_colnames=True)
rules_apriori = association_rules(frequent_itemsets_apriori, metric="confidence", min_threshold=0.6)

end_apriori = time.time()
time_apriori = end_apriori - start_apriori

# ------------ FP-GROWTH ------------
# Prepare transactions
transactions = []
for i, row in df_clean.iterrows():
    transaction = [item for item in df_clean.columns if row[item] == 1]
    transactions.append(transaction)

start_fp = time.time()

# Support threshold = 5 transactions
patterns = pyfpgrowth.find_frequent_patterns(transactions, 5)
rules_fp = pyfpgrowth.generate_association_rules(patterns, 0.6)

end_fp = time.time()
time_fp = end_fp - start_fp

# ------------ COMPARISON TABLE ------------
comparison_df = pd.DataFrame({
    "Algorithm": ["Apriori", "FP-growth"],
    "Frequent Itemsets": [len(frequent_itemsets_apriori), len(patterns)],
    "Association Rules": [len(rules_apriori), len(rules_fp)],
    "Computation Time (s)": [round(time_apriori, 4), round(time_fp, 4)],
    "Metrics Available": ["Support, Confidence, Lift", "Support (manual), Confidence"]
})

print("\n=== Comparison Table ===")
comparison_df



=== Comparison Table ===


Unnamed: 0,Algorithm,Frequent Itemsets,Association Rules,Computation Time (s),Metrics Available
0,Apriori,43,68,0.0326,"Support, Confidence, Lift"
1,FP-growth,35,19,0.0019,"Support (manual), Confidence"


#Evaluation:

**<br>1. Discuss the strengths and weaknesses of the Apriori and FP-growth algorithms based on the results.**

Apriori Algorithm:
Strengths:

Simple to Understand: Apriori is an intuitive algorithm based on the "apriori property," which means that if an itemset is frequent, all of its subsets must also be frequent. This makes it conceptually easy to grasp.

Flexibility: It can be used for both frequent itemset mining and association rule generation with various user-defined thresholds (support, confidence).

Widely Used: Since it was one of the first algorithms for association rule mining, it's well-supported and widely used in industry and academia.

Weaknesses:

Efficiency Issues (for Large Datasets): Apriori has to generate candidate itemsets and check their frequency in the dataset iteratively, which leads to high computational cost, especially with larger datasets. This often results in longer computation times as seen in your case.

Memory Intensive: Storing candidate itemsets can be very memory-intensive as it stores all the possible itemsets.

Redundant Computations: The algorithm sometimes performs redundant scans of the dataset when generating large candidate sets, leading to inefficiency.

FP-growth Algorithm:
Strengths:

Efficiency: FP-growth performs significantly better than Apriori in terms of time complexity. It doesn't generate candidate itemsets explicitly; instead, it uses a prefix-tree (FP-tree) structure that reduces the number of passes through the data, making it faster. In your case, the computation time was much shorter compared to Apriori.

Compact Representation: FP-growth's FP-tree structure is much more compact than Apriori’s candidate-generation approach, saving memory.

No Candidate Generation: Unlike Apriori, FP-growth doesn't generate candidate itemsets, which improves its efficiency on larger datasets.

Weaknesses:

Complexity: While FP-growth is faster, it can be more difficult to understand conceptually compared to Apriori, due to the use of the FP-tree structure.

Memory Usage (for Dense Data): For dense datasets (where many itemsets are frequent), FP-growth may still require significant memory, especially for the FP-tree, though it's generally more efficient than Apriori.

Less Flexibility: FP-growth is primarily focused on mining frequent itemsets and doesn't inherently generate association rules, so you need additional steps for rule generation (which you handled manually).





**<br>2. Which algorithm performed better for this dataset and why?**

Computation Time:

FP-growth took 0.0016 seconds, which is much faster than Apriori's 0.0394 seconds. The reason for this is that FP-growth avoids candidate generation and works directly on a compact FP-tree structure, resulting in much lower computational cost.

Frequent Itemsets and Association Rules:

Apriori generated 43 frequent itemsets and 68 association rules, which is higher than the 35 frequent itemsets and 19 rules generated by FP-growth.

While Apriori generated more frequent itemsets and association rules, FP-growth's performance is more scalable. In practice, FP-growth would handle larger datasets more efficiently than Apriori, especially as the number of itemsets and transactions grows.

The number of association rules generated by Apriori is significantly higher, but that also indicates that it might be overfitting or generating too many less meaningful rules. FP-growth might generate fewer, but more relevant and stronger rules, especially in real-world scenarios with larger data.

Conclusion:
FP-growth was clearly the better performer in this specific case because of its speed and efficiency. For datasets with a higher volume or more items, FP-growth would continue to scale better than Apriori, which struggles with larger computations due to the candidate-generation process.

Apriori may generate more itemsets and rules, but FP-growth is more efficient in terms of computation time, which is particularly valuable in big data or real-time applications.
