In [0]:
# ======================================
# Databricks Free Edition - KMeans Clustering (sklearn)
# ======================================

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import mlflow
import joblib
import matplotlib.pyplot as plt

# ======================================
# MLflow setup
# ======================================
# Authenticate before selecting the experiment: mlflow.login() is the call that
# connects MLflow to the Databricks tracking server, so set_experiment() must
# run after it (the second cell of this notebook already uses this order).
mlflow.login()

# Single source of truth for the experiment path. It used to be assigned after
# set_experiment() had already been called with a duplicated literal, and was
# never read.
experiment_name = "/Users/amirrezakha@yahoo.com/Retail_ML_Experiments"
mlflow.set_experiment(experiment_name)

# ======================================
# Load data
# ======================================
# Read the Gold_Customer_LTV table through Spark, then collect it into pandas
# so scikit-learn can work on it.
df_spark = spark.table("Gold_Customer_LTV")
df = df_spark.toPandas()

# Columns the clustering is based on
feature_cols = ["Lifetime_Spend", "Num_Transactions"]
X = df.loc[:, feature_cols].values

# Standardize: fit the scaler on X and transform X in one step
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# ======================================
# Step 1: Find best k using Elbow method
# ======================================
# Candidate cluster counts: 2 through 10 inclusive
k_values = list(range(2, 11))
wssse_list = []

for k in k_values:
    # fit() returns the estimator itself, so construction and fitting chain
    km = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_scaled)
    wssse = km.inertia_  # within-cluster sum of squared distances for this k
    wssse_list.append(wssse)
    print(f"k={k}, WSSSE={wssse}")

# Plot elbow: inertia (WSSSE) against the number of clusters
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(k_values, wssse_list, marker='o')
ax.set_xlabel("Number of clusters (k)")
ax.set_ylabel("WSSSE")
ax.set_title("Elbow Method for Optimal k")
ax.grid(True)
plt.show()

# Select best k at the "elbow" of the inertia curve.
#
# Taking min(wssse_list) does not work: inertia keeps falling as k grows, so
# the minimum is (almost) always the largest candidate k. Instead, pick the
# point that lies furthest below the straight line joining the first and last
# points of the normalised curve.
def _elbow_k(ks, inertias):
    """Return the k at the elbow of an inertia-vs-k curve.

    Both axes are rescaled to [0, 1]; the elbow is the candidate whose
    normalised inertia lies furthest below the chord between the first and
    last points. Ties resolve to the smallest such k.

    Args:
        ks: candidate cluster counts, in increasing order.
        inertias: inertia measured for each entry of ``ks`` (same length).

    Returns:
        The element of ``ks`` at the elbow. Falls back to ``ks[0]`` when the
        curve is degenerate (a single candidate, or no overall decrease).

    Raises:
        ValueError: if the inputs are empty or differ in length.
    """
    if not ks or len(ks) != len(inertias):
        raise ValueError("ks and inertias must be non-empty and of equal length")

    k_span = ks[-1] - ks[0]
    total_drop = inertias[0] - inertias[-1]
    if k_span <= 0 or total_drop <= 0:
        # Nothing to bend: avoid dividing by zero and keep the simplest model
        return ks[0]

    # The chord in normalised coordinates is y = 1 - x; the gap is how far the
    # curve sits below it at each candidate k.
    gaps = [
        (1 - (k - ks[0]) / k_span) - (w - inertias[-1]) / total_drop
        for k, w in zip(ks, inertias)
    ]
    return ks[gaps.index(max(gaps))]


best_k = _elbow_k(k_values, wssse_list)
print(f"✅ Selected k={best_k}")

# ======================================
# Step 2: Train final KMeans model
# ======================================
with mlflow.start_run(run_name=f"sklearn_KMeans_k{best_k}") as run:
    km_final = KMeans(n_clusters=best_k, random_state=42, n_init=10)
    km_final.fit(X_scaled)
    clusters = km_final.labels_

    # Log parameters
    mlflow.log_param("k", best_k)
    mlflow.log_param("features", feature_cols)

    # Save model locally and log artifact
    local_model_path = "/tmp/sklearn_kmeans.pkl"
    joblib.dump(km_final, local_model_path)
    mlflow.log_artifact(local_model_path, artifact_path="model")

    # The model was fitted on standardized features, so it can only score new
    # customers if the same fitted scaler is applied first. Log the scaler
    # next to the model so the run is self-contained.
    local_scaler_path = "/tmp/sklearn_kmeans_scaler.pkl"
    joblib.dump(scaler, local_scaler_path)
    mlflow.log_artifact(local_scaler_path, artifact_path="model")

    # Attach clusters to dataframe
    df["cluster"] = clusters

# ======================================
# Step 3: Save clustering results for Power BI
# ======================================
# Hand the labelled pandas frame back to Spark, then overwrite the Delta table
df_spark_out = spark.createDataFrame(df)
(
    df_spark_out.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("PowerBI_Customer_Segments")
)
print("✅ Saved clustering results to PowerBI_Customer_Segments")

# ======================================
# Step 4: 2D scatter plot
# ======================================
plt.figure(figsize=(8,6))
# One distinct colour per possible cluster. k is searched up to 10, and the
# previous 6-entry palette wrapped around, giving e.g. cluster 0 and cluster 6
# the same colour. The first six entries are unchanged; the modulo is kept
# only as a fallback should k ever exceed the palette.
colors = ['red','green','blue','orange','purple','brown',
          'pink','gray','olive','cyan']
for c in range(best_k):
    # Rows assigned to cluster c
    subset = df[df['cluster']==c]
    plt.scatter(subset['Lifetime_Spend'], subset['Num_Transactions'],
                label=f'Cluster {c}', color=colors[c % len(colors)])
plt.xlabel("Lifetime Spend")
plt.ylabel("Number of Transactions")
plt.title("Customer Segments")
plt.legend()
plt.grid(True)
plt.show()


In [0]:
# ======================================
# Databricks Free Edition - KMeans / GMM clustering
# ======================================

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
import mlflow
import joblib
import matplotlib.pyplot as plt

# MLflow workaround
# NOTE(review): `db_connect` is not referenced anywhere in the visible code;
# presumably the import is kept for its side effects — confirm before removing.
import databricks.connect as db_connect
import mlflow.tracking._model_registry.utils
# Monkey-patch a private MLflow helper so that it always returns
# "databricks-uc". Judging by its name, it normally derives the model registry
# URI from the Spark session — TODO confirm. This is a private API and may
# change or disappear in a future MLflow release.
mlflow.tracking._model_registry.utils._get_registry_uri_from_spark_session = lambda: "databricks-uc"
# Authenticate with the tracking server before any experiment or run is used.
mlflow.login()

# MLflow setup: choose the experiment that the runs below are recorded under
mlflow.set_experiment("/Users/amirrezakha@yahoo.com/Retail_ML_Experiments")

# Load data: read the table through Spark, then collect it into pandas
ltv_table = spark.table("Gold_Customer_LTV")
df = ltv_table.toPandas()
features = ["Lifetime_Spend", "Num_Transactions"]

# Scale features: fit the scaler on the feature columns and transform them
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df.loc[:, features])

# ----------------------
# KMeans
# ----------------------
# Define the hyper-parameters once so the values logged to MLflow cannot drift
# from the values actually used to build the model.
kmeans_n_clusters = 3
# n_init is set explicitly: its default changed between scikit-learn releases,
# so omitting it makes results depend on the installed version. 10 matches the
# KMeans fits in the first cell of this notebook.
kmeans_n_init = 10

with mlflow.start_run(run_name="KMeans") as run:
    kmeans = KMeans(n_clusters=kmeans_n_clusters, random_state=42, n_init=kmeans_n_init)
    df['cluster_kmeans'] = kmeans.fit_predict(X_scaled)

    mlflow.log_param("algorithm", "KMeans")
    mlflow.log_param("n_clusters", kmeans_n_clusters)
    mlflow.log_param("n_init", kmeans_n_init)

    # Export the labelled frame and attach it to the run
    local_path = "/tmp/kmeans_clusters.csv"
    df.to_csv(local_path, index=False)
    mlflow.log_artifact(local_path, artifact_path="clusters")

    print("KMeans unique clusters:", df['cluster_kmeans'].unique())

# ----------------------
# GMM
# ----------------------
with mlflow.start_run(run_name="GMM") as run:
    # Fit a 3-component Gaussian mixture and keep each row's component label
    gmm = GaussianMixture(n_components=3, random_state=42)
    gmm_labels = gmm.fit_predict(X_scaled)
    df['cluster_gmm'] = gmm_labels

    # Record the run parameters, in the same order as before
    for param_name, param_value in (("algorithm", "GMM"), ("n_components", 3)):
        mlflow.log_param(param_name, param_value)

    # Export the frame (at this point it carries every column added so far)
    # and attach it to the run
    local_path = "/tmp/gmm_clusters.csv"
    df.to_csv(local_path, index=False)
    mlflow.log_artifact(local_path, artifact_path="clusters")

    print("GMM unique clusters:", df['cluster_gmm'].unique())

# ----------------------
# Visualization (sample 50% to save memory)
# ----------------------
plot_df = df.sample(frac=0.5, random_state=42)
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# One panel per algorithm: (label column, colormap, panel title)
panels = (
    ("cluster_kmeans", "viridis", "KMeans Clusters"),
    ("cluster_gmm", "plasma", "GMM Clusters"),
)
for ax, (label_col, cmap_name, panel_title) in zip(axes, panels):
    ax.scatter(
        plot_df['Lifetime_Spend'],
        plot_df['Num_Transactions'],
        c=plot_df[label_col],
        cmap=cmap_name,
    )
    ax.set_title(panel_title)
    ax.set_xlabel("Lifetime Spend")
    ax.set_ylabel("Num Transactions")

plt.tight_layout()
plt.show()
