<a href="https://colab.research.google.com/github/Arbin4/DM_DW_LAB/blob/main/Lab_2_DW_DM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import re
import time
from itertools import combinations
from collections import defaultdict
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth, association_rules

# Mining thresholds used by both the Apriori and the FP-Growth runs below.
min_confidence = 0.7
min_support = 0.15

# Datasets to process: display name -> location of the transaction file.
file_paths = {
    'space.txt': '/content/drive/MyDrive/DM_DW_Datas/space.txt',
    'sports.txt': '/content/drive/MyDrive/DM_DW_Datas/sports.txt',
}

def get_support(itemset, transactions):
    """Return the fraction of transactions that contain every item of *itemset*.

    Args:
        itemset: A ``set``/``frozenset`` of items.
        transactions: A sized collection of transactions, each an iterable of
            hashable items.

    Returns:
        The support as a float between 0 and 1, or 0.0 when there are no
        transactions at all.
    """
    total = len(transactions)
    if total == 0:
        # Nothing can be supported by an empty dataset; avoid dividing by zero.
        return 0.0
    # ``issubset`` accepts any iterable, so no per-transaction set copy is needed.
    matches = sum(1 for tx in transactions if itemset.issubset(tx))
    return matches / total

def apriori(transactions, min_support):
    """Mine all frequent itemsets with a level-wise Apriori search.

    Args:
        transactions: A sized collection of transactions, each an iterable of
            hashable items. Repeated items inside one transaction count once.
        min_support: Minimum fraction of transactions an itemset must occur in
            to be considered frequent.

    Returns:
        A dict mapping each frequent itemset (a ``frozenset``) to the number
        of transactions that contain it. Empty when there are no transactions.
    """
    total_tx = len(transactions)
    if total_tx == 0:
        return {}

    # De-duplicate each transaction once. ``dict.fromkeys`` keeps first-seen
    # order, so single items are still reported in order of first appearance.
    unique_items = [dict.fromkeys(tx) for tx in transactions]
    # Set form of every transaction, built once and reused at every level.
    tx_sets = [frozenset(items) for items in unique_items]

    # Level 1: count in how many transactions each item occurs. Counting
    # occurrences instead would inflate support for repeated items and
    # disagree with the subset-based counting used for larger itemsets.
    item_counts = defaultdict(int)
    for items in unique_items:
        for item in items:
            item_counts[frozenset([item])] += 1

    frequent_itemsets = {
        itemset: count
        for itemset, count in item_counts.items()
        if count / total_tx >= min_support
    }
    all_frequent = dict(frequent_itemsets)
    current_freq = list(frequent_itemsets)
    k = 2

    while current_freq:
        previous_level = set(current_freq)

        # Join step: merge pairs of frequent (k-1)-itemsets into k-itemsets.
        candidates = set()
        for i, left in enumerate(current_freq):
            for right in current_freq[i + 1:]:
                union = left | right
                if len(union) != k:
                    continue
                # Prune step: a k-itemset can only be frequent if every one
                # of its (k-1)-subsets is frequent, so skip the rest early.
                if all(union - {item} in previous_level for item in union):
                    candidates.add(union)

        # Count step: one pass over the data for all surviving candidates.
        candidate_counts = defaultdict(int)
        for tx_set in tx_sets:
            for candidate in candidates:
                if candidate.issubset(tx_set):
                    candidate_counts[candidate] += 1

        current_freq = [
            itemset
            for itemset, count in candidate_counts.items()
            if count / total_tx >= min_support
        ]
        all_frequent.update(
            {itemset: candidate_counts[itemset] for itemset in current_freq}
        )
        k += 1

    return all_frequent

def generate_rules(frequent_itemsets, transactions, min_confidence):
    """Derive association rules from frequent itemsets.

    Every itemset with at least two items is split into each possible
    non-empty antecedent and the remaining consequent; a rule is kept when
    its confidence reaches ``min_confidence``.

    Args:
        frequent_itemsets: Mapping of ``frozenset`` itemset -> number of
            transactions containing it.
        transactions: The sized collection of transactions the counts refer to.
        min_confidence: Minimum confidence a rule must reach to be returned.

    Returns:
        A list of dicts with keys ``antecedents``, ``consequents``,
        ``support``, ``confidence`` and ``lift`` (metrics rounded to two
        decimals). Empty when there are no transactions.
    """
    total_tx = len(transactions)
    if total_tx == 0:
        # No data to measure support against; dividing below would fail.
        return []

    rules = []
    for itemset, count in frequent_itemsets.items():
        if len(itemset) < 2:
            continue
        support_itemset = count / total_tx
        for size in range(1, len(itemset)):
            for antecedent in combinations(itemset, size):
                antecedent = frozenset(antecedent)
                support_ante = get_support(antecedent, transactions)
                confidence = support_itemset / support_ante
                if confidence >= min_confidence:
                    # The consequent's support is only needed for lift, so
                    # the extra pass over the data is made for kept rules only.
                    consequent = itemset - antecedent
                    support_cons = get_support(consequent, transactions)
                    lift = confidence / support_cons
                    rules.append({
                        'antecedents': set(antecedent),
                        'consequents': set(consequent),
                        'support': round(support_itemset, 2),
                        'confidence': round(confidence, 2),
                        'lift': round(lift, 2)
                    })
    return rules

# Run both miners on every configured dataset and print itemsets, rules,
# timings and a short comparison for each one.
for name, path in file_paths.items():
    print(f"\n===== Processing {name} =====")

    # Regex-based line extraction: keep only spans shaped "<digits>,<rest of line>".
    # The context manager guarantees the file handle is closed after reading.
    with open(path, encoding="utf-8", errors="ignore") as source:
        lines = re.findall(r'\d+,[^\n]+', source.read())
    # Drop the leading numeric field (presumably a transaction id) and blank items.
    transactions = [[item.strip() for item in line.split(',')[1:] if item.strip()] for line in lines]

    if not transactions:
        # Nothing to mine; skip instead of running both algorithms on no data.
        print(f"\nNo transactions found in {path}; skipping.")
        continue

    # Apriori timing and result. perf_counter is a monotonic, high-resolution
    # clock, which suits sub-second measurements better than wall-clock time.
    start_apriori = time.perf_counter()
    frequent_itemsets_raw = apriori(transactions, min_support)
    rules = generate_rules(frequent_itemsets_raw, transactions, min_confidence)
    apriori_elapsed = time.perf_counter() - start_apriori

    # Output frequent itemsets
    total_tx = len(transactions)
    frequent_itemsets_df = pd.DataFrame([{
        'itemsets': set(item),
        'support': round(count / total_tx, 2)
    } for item, count in frequent_itemsets_raw.items()])

    rules_df = pd.DataFrame(rules)

    print("\nFrequent Itemsets:\n", frequent_itemsets_df)

    if not rules_df.empty:
        print("\nAssociation Rules:\n", rules_df[['antecedents', 'consequents', 'support', 'confidence', 'lift']])
    else:
        print("\nNo association rules found with confidence ≥", min_confidence)

    # FP-Growth timing and result (the one-hot encoding is part of the timed work).
    start_fp = time.perf_counter()
    te = TransactionEncoder()
    te_ary = te.fit(transactions).transform(transactions)
    df_fp = pd.DataFrame(te_ary, columns=te.columns_)
    fp_itemsets = fpgrowth(df_fp, min_support=min_support, use_colnames=True)
    fp_rules = association_rules(fp_itemsets, metric="confidence", min_threshold=min_confidence)
    fp_elapsed = time.perf_counter() - start_fp

    print("\nFP-Growth Frequent Itemsets:\n", fp_itemsets)
    if not fp_rules.empty:
        print("\nFP-Growth Association Rules:\n", fp_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])
    else:
        print("\nNo association rules found using FP-Growth with confidence ≥", min_confidence)

    print(f"\nExecution Time (Apriori): {round(apriori_elapsed, 4)} seconds")
    print(f"Execution Time (FP-Growth): {round(fp_elapsed, 4)} seconds")

    print("\n=== Comparison Summary ===")
    print(f"Apriori generated {len(rules)} rules")
    print(f"FP-Growth generated {len(fp_rules)} rules")
    if apriori_elapsed > fp_elapsed:
        print("FP-Growth is faster than Apriori.")
    else:
        print("Apriori is faster than FP-Growth.")
    print("Both algorithms generated similar types of association rules, but FP-Growth is generally more efficient for large datasets.")



===== Processing space.txt =====

Frequent Itemsets:
                      itemsets  support
0               {Robotic Arm}     0.34
1              {Food Packets}     0.40
2              {Sleeping Bag}     0.32
3                 {Treadmill}     0.28
4                {Space Suit}     0.32
5                {3D Printer}     0.28
6  {Carbon Dioxide Scrubbers}     0.24

No association rules found with confidence ≥ 0.7

FP-Growth Frequent Itemsets:
    support                    itemsets
0     0.40              (Food Packets)
1     0.34               (Robotic Arm)
2     0.32              (Sleeping Bag)
3     0.28                 (Treadmill)
4     0.32                (Space Suit)
5     0.28                (3D Printer)
6     0.24  (Carbon Dioxide Scrubbers)

No association rules found using FP-Growth with confidence ≥ 0.7

Execution Time (Apriori): 0.0012 seconds
Execution Time (FP-Growth): 0.0667 seconds

=== Comparison Summary ===
Apriori generated 0 rules
FP-Growth generated 0 rules
Apriori