In [0]:
# ======================================
# Fabric Notebook 2: Clustering (KMeans with automatic k)
# ======================================

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
import mlflow
import mlflow.spark
from mlflow import MlflowClient
import matplotlib.pyplot as plt

# Setup MLflow: select the experiment and make sure the registered model
# entry exists before any version is attached to it later in the notebook.
mlflow.set_experiment("Retail_ML_Experiments")
client = MlflowClient()
registry_name = "retail_ML_clustering"
try:
    client.create_registered_model(registry_name)
except Exception as exc:
    # Best-effort: a failure here must not stop the notebook, but only report
    # "already exists" when that is really the cause. MLflow exceptions carry
    # the error name in `error_code`; other exception types do not have it.
    if getattr(exc, "error_code", None) == "RESOURCE_ALREADY_EXISTS":
        print(f"ℹ️ Registry {registry_name} already exists")
    else:
        print(f"⚠️ Could not create registry {registry_name}: {exc}")

# Read the customer lifetime-value table.
# NOTE(review): `spark` is not defined in this cell; presumably it is the
# session injected by the notebook runtime.
df = spark.table("Gold_Customer_LTV")

# Pack the two numeric columns into the single vector column that the
# clustering estimator reads as its input.
feature_columns = ["Lifetime_Spend", "Num_Transactions"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
data = assembler.transform(df)

# -------------------------------
# Step 1: Find the best k
# -------------------------------
# Candidate cluster counts: 2 through 10 inclusive.
k_values = list(range(2, 11))
wssse_list = []

for k in k_values:
    # Fixed seed so repeated runs of the sweep are comparable.
    kmeans = KMeans().setK(k).setSeed(42).setFeaturesCol("features")
    model = kmeans.fit(data)
    # trainingCost: sum of squared distances from points to their cluster center.
    wssse = model.summary.trainingCost
    print(f"k={k}, WSSSE={wssse}")
    wssse_list.append(wssse)

# Elbow plot: training cost against the number of clusters, for visual
# inspection of where the curve flattens.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(k_values, wssse_list, marker="o")
ax.set_xlabel("Number of clusters (k)")
ax.set_ylabel("WSSSE")
ax.set_title("Elbow Method for Optimal k")
ax.grid(True)
plt.show()

# Choose best k based on the elbow method (for demo, pick the k with largest drop).
# WSSSE generally keeps shrinking as k grows, so taking the minimum would just
# select the largest k tested; instead compare consecutive costs and keep the k
# whose addition reduced WSSSE the most. Can be refined manually after
# inspecting the plot.
wssse_drops = [prev - curr for prev, curr in zip(wssse_list, wssse_list[1:])]
if wssse_drops:
    # drops[i] is the improvement going from k_values[i] to k_values[i + 1].
    best_k = k_values[wssse_drops.index(max(wssse_drops)) + 1]
else:
    # Only one candidate was evaluated, so there is nothing to compare.
    best_k = k_values[0]
print(f"✅ Selected k={best_k}")

# -------------------------------
# Step 2: Train final model with best k
# -------------------------------
# Fit the final model inside an MLflow run so the parameter and the model
# artifact are attached to the same run that the registry version points at.
with mlflow.start_run(run_name=f"KMeans_k{best_k}") as run:
    kmeans = KMeans(
        k=best_k,
        seed=42,
        featuresCol="features",
        predictionCol="cluster"
    )
    model = kmeans.fit(data)
    # Adds the "cluster" prediction column to every input row.
    clusters = model.transform(data)

    # Log parameters and model
    mlflow.log_param("k", best_k)
    mlflow.spark.log_model(model, "model")

    # Register model. Registration stays best-effort (a failure must not
    # discard the trained clusters), but the reason is reported instead of
    # being hidden, and interpreter-level interrupts are no longer swallowed.
    try:
        mv = client.create_model_version(
            name=registry_name,
            source=f"runs:/{run.info.run_id}/model",
            run_id=run.info.run_id
        )
    except Exception as exc:
        print(f"Registry skipped: {exc}")
    else:
        print(f"Model registered as version {mv.version}")

# Persist the clustered rows as a Delta table for Power BI, replacing any
# previous contents of the table.
segments_writer = clusters.write.format("delta").mode("overwrite")
segments_writer.saveAsTable("PowerBI_Customer_Segments")
print("✅ Saved clustering results to PowerBI_Customer_Segments")

# -------------------------------
# Step 3: Optional 2D visualization
# -------------------------------
# Pull only the plotted columns to the driver as a pandas DataFrame
# (only feasible for small datasets).
plot_df = clusters.select("Lifetime_Spend", "Num_Transactions", "cluster").toPandas()

fig, ax = plt.subplots(figsize=(8, 6))
# One scatter series per cluster id, in order of first appearance, so each
# cluster gets its own color and legend entry.
for cluster_id in plot_df["cluster"].unique():
    members = plot_df.loc[plot_df["cluster"] == cluster_id]
    ax.scatter(
        members["Lifetime_Spend"],
        members["Num_Transactions"],
        label=f"Cluster {cluster_id}",
    )
ax.set_xlabel("Lifetime Spend")
ax.set_ylabel("Number of Transactions")
ax.set_title("Customer Segments")
ax.legend()
plt.show()
