In [0]:
from pyspark.sql.functions import *
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.clustering import KMeans


In [0]:
# Read the "customer_features" table and preview it.
# `spark` and `display` are not defined in this notebook; presumably they are
# injected by the notebook runtime (e.g. Databricks) — confirm before running elsewhere.
customer_features = spark.read.table("customer_features")
display(customer_features)

## Assemble Feature Vector

In [0]:
# Input columns, in the order they will appear inside the assembled vector.
feature_columns = ["tx_count", "avg_amount", "total_spend", "unique_merchants"]

# Pack the input columns into a single vector column named "raw_features".
# NOTE(review): handleInvalid is left at its default here; confirm the input
# columns contain no nulls/NaNs or decide how they should be treated.
assembler = VectorAssembler(inputCols=feature_columns, outputCol="raw_features")

# Appends "raw_features" to every row of customer_features.
assembled_df = assembler.transform(customer_features)


In [0]:
# Standardise "raw_features" into "features" with both centring (withMean)
# and scaling (withStd) enabled, so the four inputs are put on a comparable
# scale before clustering.
scaler = (
    StandardScaler()
    .setInputCol("raw_features")
    .setOutputCol("features")
    .setWithMean(True)
    .setWithStd(True)
)

# fit() derives the scaling statistics from assembled_df; transform() then
# appends the scaled "features" column to that same DataFrame.
scaler_model = scaler.fit(assembled_df)
scaled_df = scaler_model.transform(assembled_df)


## Train KMeans Model

In [0]:
# KMeans over the scaled "features" column.
#   k=3     -> three clusters, matching the three segments interpreted at the
#              end of the notebook.
#   seed=42 -> fixed random seed for the estimator.
kmeans = KMeans(featuresCol="features", k=3, seed=42)

# Fit the cluster centres on the scaled customer vectors.
kmeans_model = kmeans.fit(scaled_df)


## Assign Clusters to Customers

In [0]:
# Apply the fitted model to the same scaled data it was trained on; the
# cluster label is read from the "prediction" column below.
clustered_customers = kmeans_model.transform(scaled_df)

# Preview only the customer id, the original input columns and the assigned
# cluster, leaving out the intermediate vector columns.
preview_columns = [
    "customer_id",
    "tx_count",
    "avg_amount",
    "total_spend",
    "unique_merchants",
    "prediction",
]
display(clustered_customers.select(*preview_columns))


## Save Cluster Results to Delta Table

In [0]:
# Persist the clustered DataFrame as the Delta table "customer_clusters",
# replacing any previous contents ("overwrite" mode). The whole DataFrame is
# written, i.e. including the "raw_features" and "features" vector columns.
(
    clustered_customers.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("customer_clusters")
)


## Cluster Interpretation

In [0]:
# Profile each cluster: mean of every input feature plus the cluster size.
# `avg` and `count` come from the wildcard pyspark.sql.functions import.
cluster_summary = (
    clustered_customers
    .groupBy("prediction")
    .agg(
        avg("tx_count").alias("avg_tx_count"),
        avg("avg_amount").alias("avg_tx_amount"),
        avg("total_spend").alias("avg_total_spend"),
        avg("unique_merchants").alias("avg_unique_merchants"),
        count("*").alias("num_customers"),
    )
    # groupBy/agg gives no ordering guarantee; sort by cluster id so the table
    # is stable between runs and lines up with the "Cluster 0/1/2" write-up.
    .orderBy("prediction")
)

display(cluster_summary)


**Explanation of the Clusters**

- Cluster 0: Low-frequency, low-spend customers
- Cluster 1: Moderate spenders with diverse merchants
- Cluster 2: High-frequency, high-value customers (higher risk)
