In [1]:
from pyspark.sql import SparkSession
# Obtain the SparkSession for this notebook, named 'cluster'
# (getOrCreate returns an existing session if one is already running)
spark = (
    SparkSession.builder
    .appName('cluster')
    .getOrCreate()
)

In [2]:
from pyspark.ml.clustering import KMeans

In [3]:
# Load the sample k-means dataset (libsvm format) into a DataFrame
data = spark.read.load('sample_kmeans_data.txt', format='libsvm')

In [4]:
# Preview the loaded rows: a label column plus a sparse features vector
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|           (3,[],[])|
|  1.0|(3,[0,1,2],[0.1,0...|
|  2.0|(3,[0,1,2],[0.2,0...|
|  3.0|(3,[0,1,2],[9.0,9...|
|  4.0|(3,[0,1,2],[9.1,9...|
|  5.0|(3,[0,1,2],[9.2,9...|
+-----+--------------------+



In [5]:
# Clustering here uses only the feature vectors, so keep just the
# 'features' column and leave the 'label' column out
finalData = data.select('features')

In [6]:
# Confirm that only the 'features' column remains
finalData.show()

+--------------------+
|            features|
+--------------------+
|           (3,[],[])|
|(3,[0,1,2],[0.1,0...|
|(3,[0,1,2],[0.2,0...|
|(3,[0,1,2],[9.0,9...|
|(3,[0,1,2],[9.1,9...|
|(3,[0,1,2],[9.2,9...|
+--------------------+



In [31]:
# Configure the K-Means estimator for k=2 clusters with a fixed random seed
# (change k to 3 or 4 and re-run the cells below to compare cluster counts)
kmeans = KMeans(k=2, seed=1)

In [32]:
# Fit the estimator on the features-only DataFrame to obtain the trained model
kModel = kmeans.fit(finalData)

In [33]:
# Evaluate the model -> "Within Set Sum of Squared Errors" (WSSSE),
# read from the training summary's trainingCost
wssse = kModel.summary.trainingCost

In [34]:
# WSSSE of the current k=2 model (value shown below).
# NOTE(review): an earlier note here said the lowest significant drop is at
# 4 clusters -- presumably from separate runs with k=2,3,4 that are not shown
# in this notebook; confirm before relying on it.
wssse

0.11999999999994547

In [35]:
# Retrieve the centroid coordinates of the fitted model (one array per cluster)
centers = kModel.clusterCenters()

In [36]:
centers

# The output below is for the current k=2 model.
# For comparison, a run with 4 clusters gave these centers:
#[array([9.05, 9.05, 9.05]),
# array([0.05, 0.05, 0.05]),
# array([0.2, 0.2, 0.2]),
# array([9.2, 9.2, 9.2])]

[array([9.1, 9.1, 9.1]), array([0.1, 0.1, 0.1])]

In [37]:
# Ultimately, what we want to know is:
# which centroid (cluster) does each row belong to?

In [40]:
# Apply the fitted model to the data; the result gains a 'prediction' column
# holding the index of the cluster assigned to each row
results = kModel.transform(finalData)

In [41]:
# Display each feature vector alongside its assigned cluster index
results.show()

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|           (3,[],[])|         1|
|(3,[0,1,2],[0.1,0...|         1|
|(3,[0,1,2],[0.2,0...|         1|
|(3,[0,1,2],[9.0,9...|         0|
|(3,[0,1,2],[9.1,9...|         0|
|(3,[0,1,2],[9.2,9...|         0|
+--------------------+----------+

