# Customer Clustering

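This notebook clusters customers: it loads the processed customer feature table, standardizes the features, fits a KMeans model, and saves each customer's cluster assignment.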

## Imports

In [17]:
import os

# ETL and Data Manipulation
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, to_date, expr
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans
from pyspark.sql import functions as F
import numpy as np

# Visualizations
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

## Load Data

In [18]:
# Get (or reuse) a local Spark session; "local[1]" runs Spark in-process with one worker thread.
spark = (
    SparkSession.builder
    .appName("LocalSparkForTesting")
    .master("local[1]")
    .getOrCreate()
)

In [23]:
# Directory that holds the processed datasets
DATA_PATH = os.path.abspath(
    os.path.join('/sparkdata/wholesale-recommender', 'processed')
)

# Read the customer feature table (Parquet) from the processed data directory
customer_features = spark.read.parquet(
    os.path.join(DATA_PATH, "customers_features")
)

## Clustering Pipeline

### Preprocessing Steps

In [24]:
# Use every column except the customer identifier as a model feature
# (adjust the ID column name if needed)
feature_cols = [
    column_name
    for column_name in customer_features.columns
    if column_name != 'Customer ID'
]

# Combine the feature columns into a single vector column
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features_vector",
)

# Standardize the assembled vector: centre on the mean and scale by the standard deviation
scaler = StandardScaler(
    inputCol="features_vector",
    outputCol="scaled_features",
    withMean=True,
    withStd=True,
)

### Clustering Steps

In [25]:
# Number of clusters, chosen via the elbow method during EDA
K = 14

In [26]:
# KMeans estimator: clusters on the standardized features and writes the
# assigned cluster index to the "cluster" column; the seed is fixed at 42
kmeans = KMeans(
    k=K,
    seed=42,
    featuresCol="scaled_features",
    predictionCol="cluster",
)

## Run pipeline

In [27]:
# Chain the stages in order: assemble vector -> standardize -> cluster
pipeline_stages = [assembler, scaler, kmeans]
pipeline = Pipeline(stages=pipeline_stages)

# Fit all stages on the customer features, then apply the fitted model to the
# same data to attach a cluster to every row
model = pipeline.fit(customer_features)
clustered_customers = model.transform(customer_features)

## Save

In [None]:
# Results are written next to the input data, under DATA_PATH
OUTPUT_PATH = os.path.abspath(DATA_PATH)
cluster_output_dir = os.path.join(OUTPUT_PATH, "customer_cluster")

# Keep only the customer identifier and its assigned cluster
clusters_df = clustered_customers.select("Customer ID", "cluster")

# Write as Parquet, replacing any existing output at that location
clusters_df.write.mode("overwrite").parquet(cluster_output_dir)