# In [None]:
# Step 1: Data Preparation

# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.frequent_patterns import apriori, fpgrowth, association_rules

# Load dataset (example: Online Retail Dataset)
df = pd.read_excel('datasets/OnlineRetail.xlsx')  # adjust path if needed

# Explore data
print("Data head:\n", df.head())
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called on its own; wrapping it in print() would emit the report *before*
# the header and then print a stray "None".
print("\nData info:")
df.info()
print("\nData description:\n", df.describe())

# Data cleaning
# Drop rows that have no CustomerID or no Description.
df = df.dropna(subset=['CustomerID', 'Description'])
# Keep only rows with a positive quantity (removes canceled transactions or
# negative quantities).
df = df[df['Quantity'] > 0]

# Create transaction basket for mining: one row per InvoiceNo, one column per
# Description, each cell holding the summed Quantity (0 where the pair is absent).
basket = df.groupby(['InvoiceNo', 'Description'])['Quantity'].sum().unstack().fillna(0)
# Binarize to 1/0. A vectorized comparison replaces DataFrame.applymap, which is
# deprecated in recent pandas and calls a Python lambda once per cell; the
# resulting values (integer 1 where quantity > 0, else 0) are unchanged.
basket = (basket > 0).astype(int)

# Visualization: most frequent items
# Count each item once per invoice so the numbers match the "Number of
# Transactions" axis label; a plain value_counts() on Description would count
# line-item rows, inflating items listed more than once on the same invoice.
item_counts = (
    df.drop_duplicates(subset=['InvoiceNo', 'Description'])['Description']
    .value_counts()
    .head(20)
)
plt.figure(figsize=(12,6))
sns.barplot(x=item_counts.values, y=item_counts.index, palette="viridis")
plt.title("Top 20 Most Frequent Items")
plt.xlabel("Number of Transactions")
plt.ylabel("Item")
plt.show()

# Optional: heatmap of item co-occurrence
# basket holds 1/0 flags per (invoice, item), so basket.T.dot(basket) gives, for
# every pair of items, the number of invoices containing both.
co_occurrence = basket.T.dot(basket)
# The full matrix has one row and one column per distinct item, which is
# typically far too many cells and labels to read. Plot only the 20 items that
# appear on the most invoices; co_occurrence itself is left complete.
top_items = basket.sum().nlargest(20).index
plt.figure(figsize=(12,10))
sns.heatmap(co_occurrence.loc[top_items, top_items], cmap="YlGnBu")
plt.title("Item Co-occurrence Heatmap (Top 20 Items)")
plt.show()

# -----------------------------------------
# Step 2: Frequent Itemset Mining Using Apriori

# Apriori frequent itemsets
# mlxtend expects a boolean one-hot frame; non-bool input triggers a deprecation
# warning and is slower, so cast the 1/0 basket to bool for the call only.
frequent_itemsets_apriori = apriori(basket.astype(bool), min_support=0.01, use_colnames=True)
frequent_itemsets_apriori = frequent_itemsets_apriori.sort_values(by='support', ascending=False)
print("\nTop 10 Frequent Itemsets (Apriori):\n", frequent_itemsets_apriori.head(10))

# Visualization: top frequent itemsets
top_itemsets = frequent_itemsets_apriori.head(10)
# Render each itemset as "item A, item B" for the axis; str() on the itemset
# would produce noisy "frozenset({...})" tick labels.
itemset_labels = top_itemsets['itemsets'].apply(
    lambda items: ", ".join(sorted(map(str, items)))
)
plt.figure(figsize=(10,6))
sns.barplot(x='support', y=itemset_labels, data=top_itemsets, palette="coolwarm")
plt.title("Top 10 Frequent Itemsets (Apriori)")
plt.xlabel("Support")
plt.ylabel("Itemsets")
plt.show()

# -----------------------------------------
# Step 3: Frequent Itemset Mining Using FP-Growth

# FP-Growth frequent itemsets
# mlxtend expects a boolean one-hot frame; non-bool input triggers a deprecation
# warning and is slower, so cast the 1/0 basket to bool for the call only.
frequent_itemsets_fp = fpgrowth(basket.astype(bool), min_support=0.01, use_colnames=True)
frequent_itemsets_fp = frequent_itemsets_fp.sort_values(by='support', ascending=False)
print("\nTop 10 Frequent Itemsets (FP-Growth):\n", frequent_itemsets_fp.head(10))

# Visualization: top frequent itemsets
top_itemsets_fp = frequent_itemsets_fp.head(10)
# Render each itemset as "item A, item B" for the axis; str() on the itemset
# would produce noisy "frozenset({...})" tick labels.
itemset_labels_fp = top_itemsets_fp['itemsets'].apply(
    lambda items: ", ".join(sorted(map(str, items)))
)
plt.figure(figsize=(10,6))
sns.barplot(x='support', y=itemset_labels_fp, data=top_itemsets_fp, palette="magma")
plt.title("Top 10 Frequent Itemsets (FP-Growth)")
plt.xlabel("Support")
plt.ylabel("Itemsets")
plt.show()

# -----------------------------------------
# Step 4: Generating and Analyzing Association Rules

# Generate association rules from Apriori
# Rules are derived from the Apriori itemsets using confidence as the metric
# with a minimum threshold of 0.5, then ordered from highest to lowest lift.
rules_apriori = association_rules(
    frequent_itemsets_apriori,
    metric="confidence",
    min_threshold=0.5,
).sort_values(by='lift', ascending=False)
print("\nTop 10 Association Rules (Apriori):\n", rules_apriori.head(10))

# Visualization: Support vs Confidence
# Each point is one rule; both marker size and colour encode its lift.
_, ax_apriori = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=rules_apriori,
    x='support',
    y='confidence',
    hue='lift',
    size='lift',
    palette="cool",
    ax=ax_apriori,
)
ax_apriori.set_title("Association Rules: Support vs Confidence (Apriori)")
ax_apriori.set_xlabel("Support")
ax_apriori.set_ylabel("Confidence")
plt.show()

# Generate association rules from FP-Growth
# Rules are derived from the FP-Growth itemsets using confidence as the metric
# with a minimum threshold of 0.5, then ordered from highest to lowest lift.
rules_fp = association_rules(
    frequent_itemsets_fp,
    metric="confidence",
    min_threshold=0.5,
).sort_values(by='lift', ascending=False)
print("\nTop 10 Association Rules (FP-Growth):\n", rules_fp.head(10))

# Visualization: Support vs Confidence (FP-Growth)
# Each point is one rule; both marker size and colour encode its lift.
_, ax_fp = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=rules_fp,
    x='support',
    y='confidence',
    hue='lift',
    size='lift',
    palette="cool",
    ax=ax_fp,
)
ax_fp.set_title("Association Rules: Support vs Confidence (FP-Growth)")
ax_fp.set_xlabel("Support")
ax_fp.set_ylabel("Confidence")
plt.show()

# -----------------------------------------
# Step 5: Comparative Analysis

# Print basic comparison
# Row counts of the itemset and rule tables produced by each algorithm.
print("\nNumber of frequent itemsets (Apriori):", len(frequent_itemsets_apriori))
print("Number of frequent itemsets (FP-Growth):", len(frequent_itemsets_fp))

print("\nNumber of rules generated (Apriori):", len(rules_apriori))
print("Number of rules generated (FP-Growth):", len(rules_fp))

# Fixed, hand-written remarks; they are not computed from the results above.
print("\nObservations:")
for observation in (
    "- FP-Growth is typically faster than Apriori on larger datasets.",
    "- Both algorithms often produce similar frequent itemsets and rules, but FP-Growth avoids candidate generation, improving efficiency.",
    "- Challenges include cleaning the dataset, setting appropriate support/confidence thresholds, and interpreting results for actionable insights.",
):
    print(observation)