In [0]:
%sql
-- Sanity check: total number of rows in the source table.
SELECT COUNT(*)
FROM retail_transactions_dataset

In [0]:
# ======================================
# Databricks Free Edition - FP-Growth (Fixed Conversion + Rules)
# ======================================

# --- Install required library ---
# Installs mlxtend, which provides the `fpgrowth` and `association_rules`
# functions imported below.
# NOTE(review): the install is unpinned, so every fresh environment may resolve
# a different mlxtend release. Consider pinning a version that has been tested
# with this notebook so that re-runs stay reproducible.
%pip install mlxtend --quiet

# --- Imports ---
import pandas as pd
from mlxtend.frequent_patterns import fpgrowth, association_rules
import mlflow
import databricks.connect as db_connect
import mlflow.tracking._model_registry.utils

# ======================================
# MLflow setup
# ======================================
# Replace MLflow's private helper `_get_registry_uri_from_spark_session` with a
# stub that always returns "databricks-uc".
# NOTE(review): this patches an underscore-prefixed (private) MLflow function,
# so it may silently stop working after an MLflow upgrade. Presumably it is a
# workaround for this workspace — confirm it is still required.
mlflow.tracking._model_registry.utils._get_registry_uri_from_spark_session = lambda: "databricks-uc"
# Must run before the experiment is selected below. On success MLflow is
# expected to emit the INFO log line "Login successful!".
mlflow.login()  # INFO-log: Login successful!

# Select the experiment that the MLflow run started later in this notebook
# logs its parameters, metrics and artifacts to.
# NOTE(review): the path embeds one user's personal workspace folder; anyone
# else running this notebook will need to change it.
mlflow.set_experiment("/Users/amirrezakha@yahoo.com/Retail_ML_Experiments")

# ======================================
# Load and prepare data
# ======================================
# Purpose: pull a bounded sample of (Transaction_ID, Product) rows into pandas
# and pivot it into a boolean transaction x product "basket" matrix, which is
# the one-hot input format expected by mlxtend's fpgrowth.
SOURCE_TABLE = "retail_transactions_dataset"
REQUIRED_COLUMNS = ["Transaction_ID", "Product"]
# Maximum number of rows collected to the driver.
# NOTE(review): limit() cuts by row, not by transaction, so the last sampled
# transaction(s) may be incomplete and the sample is not guaranteed to be the
# same rows on every run.
SAMPLE_ROWS = 10000

# Validate the schema BEFORE selecting. Checking after .select() can never
# fail, because .select() itself raises first when a column is missing.
source_table = spark.table(SOURCE_TABLE)
missing_columns = set(REQUIRED_COLUMNS) - set(source_table.columns)
if missing_columns:
    raise ValueError("Table must contain columns: Transaction_ID, Product")

df_spark = source_table.select(*REQUIRED_COLUMNS)
df_sample = df_spark.limit(SAMPLE_ROWS)
df = df_sample.toPandas()
print(f"✅ Loaded {len(df)} rows from retail_transactions_dataset")

# Fail early with a clear message instead of an obscure error during mining.
if df.empty:
    raise ValueError(f"No rows loaded from {SOURCE_TABLE}; nothing to mine.")

print(f"🧾 Unique transactions: {df['Transaction_ID'].nunique()}")
print(f"🛍️ Unique products: {df['Product'].nunique()}")

# One-hot encode: count each (transaction, product) pair, pivot products into
# columns, then keep only presence/absence. The result is left as bool because
# mlxtend warns about (and is slower on) non-boolean one-hot frames.
basket = df.groupby(['Transaction_ID', 'Product']).size().unstack(fill_value=0)
basket = basket > 0
print(f"✅ Basket matrix shape: {basket.shape}")

# ======================================
# Train FP-Growth & Association Rules
# ======================================
# Purpose: mine frequent itemsets and association rules from `basket`, record
# the run in MLflow (params, metrics, CSV artifacts) and publish the results as
# Delta tables for Power BI.
#
# The thresholds are named once so that the values passed to mlxtend and the
# values logged to MLflow cannot drift apart.
MIN_SUPPORT = 0.001     # passed to fpgrowth(min_support=...)
MIN_CONFIDENCE = 0.05   # passed to association_rules(min_threshold=...)


def _itemset_to_str(items):
    """Render an itemset as a comma-separated string with a stable order.

    Set iteration order is arbitrary, so the items are sorted to make the same
    itemset always serialise to the same text. Each item is passed through
    str() so non-string product labels do not break the join.
    """
    return ",".join(sorted(str(item) for item in items))


with mlflow.start_run(run_name="FPGrowth_mlxtend") as run:
    frequent_itemsets = fpgrowth(basket, min_support=MIN_SUPPORT, use_colnames=True)
    frequent_itemsets["length"] = frequent_itemsets["itemsets"].apply(len)

    print(f"✅ Frequent itemsets found: {len(frequent_itemsets)}")

    # Try generating rules. This must happen while "itemsets" still holds set
    # objects, i.e. before the string conversion below.
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=MIN_CONFIDENCE)
    print(f"📈 Association rules found: {len(rules)}")

    # Convert sets to strings for Spark
    frequent_itemsets["itemsets"] = frequent_itemsets["itemsets"].apply(_itemset_to_str)
    if not rules.empty:
        for col in ["antecedents", "consequents"]:
            rules[col] = rules[col].apply(_itemset_to_str)
    else:
        print("⚠️ No association rules found. Try reducing confidence threshold or expanding dataset.")

    # Log parameters and metrics (the same constants that were used above)
    mlflow.log_param("algorithm", "FPGrowth (mlxtend)")
    mlflow.log_param("min_support", MIN_SUPPORT)
    mlflow.log_param("min_confidence", MIN_CONFIDENCE)
    mlflow.log_metric("num_itemsets", len(frequent_itemsets))
    mlflow.log_metric("num_rules", len(rules))

    # Save results locally, then attach them to the MLflow run
    fi_path = "/tmp/frequent_itemsets.csv"
    rules_path = "/tmp/association_rules.csv"
    frequent_itemsets.to_csv(fi_path, index=False)
    rules.to_csv(rules_path, index=False)
    mlflow.log_artifact(fi_path, artifact_path="output")
    mlflow.log_artifact(rules_path, artifact_path="output")

    # ======================================
    # Convert results to Spark for Power BI
    # ======================================
    fi_spark = spark.createDataFrame(frequent_itemsets)
    fi_spark.write.format("delta").mode("overwrite").saveAsTable("PowerBI_Frequent_Itemsets")

    if not rules.empty:
        rules_spark = spark.createDataFrame(rules)
        rules_spark.write.format("delta").mode("overwrite").saveAsTable("PowerBI_Basket_Analysis")
        print("✅ Saved rules to PowerBI_Basket_Analysis")
    else:
        # No write happens here, so an existing PowerBI_Basket_Analysis table
        # (if any) keeps whatever a previous run stored in it.
        print("⚠️ Skipped saving rules (empty DataFrame).")

    print("✅ Saved frequent itemsets to PowerBI_Frequent_Itemsets")

# ======================================
# Display preview
# ======================================
# Build the list of (heading, Spark DataFrame) previews, then render the first
# 10 rows of each. The rules preview is only added when rules exist, because
# `rules_spark` is only created in that case.
previews = [("📊 Frequent Itemsets:", fi_spark)]
if not rules.empty:
    previews.append(("📈 Association Rules:", rules_spark))

for heading, preview_df in previews:
    print(heading)
    display(preview_df.limit(10))
