In [6]:
#!pip install pyspark

In [39]:
from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.sql import SparkSession
import warnings
warnings.filterwarnings('ignore')
from pyspark.sql import *
from pyspark.ml.evaluation import ClusteringEvaluator

In [4]:
# Reuse the active Spark session if one exists, otherwise start one
# under this notebook's application name.
spark = (
    SparkSession.builder
    .appName("KMeansMallCustomerBasic")
    .getOrCreate()
)

In [5]:
# Load the CSV: the first row is the header, fields are comma-separated,
# and column types are inferred from the data.
csv_reader = spark.read.format("csv").options(
    header=True,
    sep=",",
    inferSchema=True,
)
df = csv_reader.load("/content/Mall_Customers.csv")

In [7]:
# Preview: limit(4) already caps the result at four rows, so the pandas
# frame can be displayed directly.
df.limit(4).toPandas()

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77


In [14]:
# List the column names of the loaded frame.
df.schema.names

['CustomerID', 'Gender', 'Age', 'Annual Income (k$)', 'Spending Score (1-100)']

In [24]:
from pyspark.sql.functions import col
# Keep only the two clustering inputs, under identifier-friendly names.
# withColumnRenamed is a no-op when the source column is absent, so this cell
# can be re-run after `df` has already been narrowed; selecting the original
# column names a second time would fail because they no longer exist.
df = (
    df.withColumnRenamed("Annual Income (k$)", "Annual_Income")
    .withColumnRenamed("Spending Score (1-100)", "Spending_Score")
    .select("Annual_Income", "Spending_Score")
)

In [25]:
# Summary statistics (count, mean, stddev, min, max) for each column of df.
summary_stats = df.describe()
summary_stats.show()

+-------+-----------------+------------------+
|summary|    Annual_Income|    Spending_Score|
+-------+-----------------+------------------+
|  count|              200|               200|
|   mean|            60.56|              50.2|
| stddev|26.26472116527124|25.823521668370173|
|    min|               15|                 1|
|    max|              137|                99|
+-------+-----------------+------------------+



In [31]:
# Pack the two numeric inputs into a single vector column named 'features'.
vector_assembler = VectorAssembler(
    inputCols=['Annual_Income', 'Spending_Score'],
    outputCol='features',
)


In [32]:
# Scale the assembled 'features' vector into 'scaled_features'
# (scaler options other than the column names are left at their defaults).
standard_scaler = StandardScaler(
    inputCol='features',
    outputCol='scaled_features',
)

In [46]:
def runKMeans(df, k, seed=142, max_iter=40, tol=1.0e-5):
    """Fit the assemble -> scale -> KMeans pipeline on `df` with `k` clusters.

    The first two pipeline stages are the notebook-level `vector_assembler`
    and `standard_scaler` objects, so those cells must have been run before
    this function is called.

    Parameters
    ----------
    df : DataFrame
        Input frame containing the columns read by `vector_assembler`.
    k : int
        Number of clusters to fit.
    seed : int, optional
        Random seed passed to KMeans. Defaults to 142.
    max_iter : int, optional
        Maximum number of KMeans iterations. Defaults to 40.
    tol : float, optional
        Convergence tolerance passed to KMeans. Defaults to 1.0e-5.

    The three optional defaults are the values that used to be hard-coded,
    so `runKMeans(df, k)` configures KMeans exactly as before.

    Returns
    -------
    The fitted pipeline model. KMeans reads 'scaled_features' and writes its
    assignment to the 'cluster' column.
    """
    kmeans_obj = KMeans(
        k=k,
        seed=seed,
        featuresCol='scaled_features',
        predictionCol='cluster',
        maxIter=max_iter,
        tol=tol,
    )

    pipeline_obj = Pipeline(
        stages=[vector_assembler, standard_scaler, kmeans_obj]
    )

    return pipeline_obj.fit(df)



In [47]:
# The evaluator's configuration does not depend on k, so build it once
# rather than on every iteration of the sweep.
evaluator = ClusteringEvaluator(
    featuresCol='scaled_features',
    predictionCol='cluster',
    metricName='silhouette',
)

# k -> silhouette score, kept so the scores can be compared after the sweep
# instead of only being printed.
silhouette_by_k = {}

for k in range(2, 11):
    pipeline_model = runKMeans(df, k)
    transformed_df = pipeline_model.transform(df)

    score = evaluator.evaluate(transformed_df)
    silhouette_by_k[k] = score

    print(k, score)

# Note: after the loop, `pipeline_model` and `transformed_df` hold the result
# of the last iteration (k = 10), not the best-scoring k.

2 0.452706667734966
3 0.6288672765684975
4 0.6572938259032692
5 0.7408351139612729
6 0.7263562509896634
7 0.7236275795916091
8 0.6884908828073537
9 0.6258921742873865
10 0.6372440905182242


In [37]:
# Preview: limit(5) already caps the result at five rows, so the pandas
# frame can be displayed directly.
transformed_df.limit(5).toPandas()

Unnamed: 0,Annual_Income,Spending_Score,features,scaled_features,cluster
0,15,39,"[15.0, 39.0]","[0.5711082903036444, 1.510251022337088]",4
1,15,81,"[15.0, 81.0]","[0.5711082903036444, 3.1366752002385674]",2
2,16,6,"[16.0, 6.0]","[0.6091821763238874, 0.2323463111287828]",4
3,16,77,"[16.0, 77.0]","[0.6091821763238874, 2.9817776594860455]",2
4,17,40,"[17.0, 40.0]","[0.6472560623441304, 1.5489754075252185]",4
