In [1]:
from pyspark.sql import SparkSession


In [2]:
# Start a Spark session named 'cluster', or reuse the active one if it exists.
spark = (
    SparkSession.builder
    .appName('cluster')
    .getOrCreate()
)

# Read the customer CSV: the first line supplies the column names and the
# column types are inferred from the data rather than left as strings.
df = spark.read.csv(
    'Mall_Customers.csv',
    header=True,
    inferSchema=True,
)
df.printSchema()

root
 |-- CustomerID: integer (nullable = true)
 |-- Genre: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Annual Income (k$): integer (nullable = true)
 |-- Spending Score (1-100): integer (nullable = true)



In [3]:
# List the column names of the loaded DataFrame.
df.columns

['CustomerID', 'Genre', 'Age', 'Annual Income (k$)', 'Spending Score (1-100)']

In [4]:

# Peek at the first record; take(1) returns a one-element list of Row objects.
df.take(1)

[Row(CustomerID=1, Genre='Male', Age=19, Annual Income (k$)=15, Spending Score (1-100)=39)]

In [5]:
from pyspark.ml.feature import VectorAssembler

# Pack the three numeric columns into a single vector column named 'features'.
# 'CustomerID' and 'Genre' are not part of the assembled vector.
assembler = VectorAssembler(
    inputCols=['Age', 'Annual Income (k$)', 'Spending Score (1-100)'],
    outputCol='features',
)

# transform() appends the 'features' column next to the original columns.
final_df = assembler.transform(df)
final_df.printSchema()


root
 |-- CustomerID: integer (nullable = true)
 |-- Genre: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Annual Income (k$): integer (nullable = true)
 |-- Spending Score (1-100): integer (nullable = true)
 |-- features: vector (nullable = true)



In [6]:

from pyspark.ml.feature import StandardScaler

# Standardize the assembled 'features' vectors into a new 'scaledFeatures'
# column (StandardScaler defaults are used; no extra options are set here).
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures')

# This cell reassigns `final_df`, so on a second run the frame would already
# carry 'scaledFeatures' and the scaler would reject the existing output
# column. drop() is a no-op when the column is absent, which makes the cell
# safe to re-run without changing the result of the first run.
unscaled_df = final_df.drop('scaledFeatures')

scaler_model = scaler.fit(unscaled_df)
final_df = scaler_model.transform(unscaled_df)
final_df.printSchema()

root
 |-- CustomerID: integer (nullable = true)
 |-- Genre: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Annual Income (k$): integer (nullable = true)
 |-- Spending Score (1-100): integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- scaledFeatures: vector (nullable = true)



In [7]:

# Show the raw and standardized feature vectors side by side for three rows.
(
    final_df
    .select('features', 'scaledFeatures')
    .show(3)
)

+----------------+--------------------+
|        features|      scaledFeatures|
+----------------+--------------------+
|[19.0,15.0,39.0]|[1.36015391423519...|
|[21.0,15.0,81.0]|[1.50332801047048...|
| [20.0,16.0,6.0]|[1.43174096235284...|
+----------------+--------------------+
only showing top 3 rows



In [8]:
# Inspect the first full record, including the 'features' and
# 'scaledFeatures' vector columns added by the earlier cells.
final_df.take(1)

[Row(CustomerID=1, Genre='Male', Age=19, Annual Income (k$)=15, Spending Score (1-100)=39, features=DenseVector([19.0, 15.0, 39.0]), scaledFeatures=DenseVector([1.3602, 0.5711, 1.5103]))]

In [9]:
from pyspark.ml.clustering import KMeans

# K-means initialisation is random, so pin the seed: cluster assignments and
# costs are then reproducible across kernel restarts.
# NOTE(review): outputs recorded from earlier runs without a fixed seed may
# differ slightly from what this cell now produces.
KMEANS_SEED = 42

# Two candidate models on the standardized features, differing only in k.
kmeans2 = KMeans(featuresCol='scaledFeatures', k=2, seed=KMEANS_SEED)
kmeans3 = KMeans(featuresCol='scaledFeatures', k=3, seed=KMEANS_SEED)

model_k2 = kmeans2.fit(final_df)
model_k3 = kmeans3.fit(final_df)

In [10]:
def _wssse(model, dataset):
    """Return the within-set sum of squared errors (WSSSE) for a fitted model.

    `KMeansModel.computeCost` was deprecated in Spark 2.4 and removed in
    Spark 3.0. Where it still exists it is used unchanged, so the value is
    the same as before. On newer Spark versions the cost recorded in the
    training summary is returned instead.

    Args:
        model: a fitted KMeansModel.
        dataset: DataFrame to evaluate. Only used by the `computeCost` path;
            the fallback reports the cost on the data the model was trained
            on, which is the same DataFrame in this notebook.

    Returns:
        The sum of squared distances of points to their nearest centre.
    """
    if hasattr(model, 'computeCost'):
        return model.computeCost(dataset)
    # NOTE(review): `summary` is expected to exist only on freshly fitted
    # models (not ones loaded from disk) - true for the models used here.
    return model.summary.trainingCost


print('WSSSE_K2:', _wssse(model_k2, final_df))
print('WSSSE_K3:', _wssse(model_k3, final_df))



WSSSE_K2: 387.4392580116556
WSSSE_K3: 335.7398981508935


In [11]:
# Cluster sizes for the k=2 model: assign each row a 'prediction' label,
# then count the rows per label.
(
    model_k2
    .transform(final_df)
    .groupBy('prediction')
    .count()
    .show()
)

+----------+-----+
|prediction|count|
+----------+-----+
|         1|   97|
|         0|  103|
+----------+-----+

