# Clustering with K-means

## Basics

In [0]:
from pyspark.sql import SparkSession

# Reuse the active Spark session when one exists; otherwise start a new
# session whose application name is 'cluster'.
spark = (
    SparkSession.builder
    .appName('cluster')
    .getOrCreate()
)

In [0]:
from pyspark.ml.clustering import KMeans

# Read the sample file through the 'libsvm' data source, then keep only the
# 'features' column, which is the sole input used for clustering below.
dataset = (
    spark.read
    .format('libsvm')
    .load('/FileStore/tables/sample_libsvm_data.txt')
)
final_data = dataset.select('features')

In [0]:
# Three clusters; the seed is pinned so the experiment can be replicated.
kmeans = KMeans(k=3, seed=1)

In [0]:
# Fit the configured K-means estimator on the feature vectors; `model` is the
# fitted result used by the following cells for transform() and clusterCenters().
model = kmeans.fit(final_data)

In [0]:
# Add the assigned cluster index (the 'prediction' column) to every row and
# preview the first 20 rows with long cell values truncated.
results = model.transform(final_data)
results.show(n=20, truncate=True)

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|(692,[127,128,129...|         2|
|(692,[158,159,160...|         1|
|(692,[124,125,126...|         1|
|(692,[152,153,154...|         1|
|(692,[151,152,153...|         1|
|(692,[129,130,131...|         2|
|(692,[158,159,160...|         1|
|(692,[99,100,101,...|         1|
|(692,[154,155,156...|         2|
|(692,[127,128,129...|         2|
|(692,[154,155,156...|         1|
|(692,[153,154,155...|         2|
|(692,[151,152,153...|         0|
|(692,[129,130,131...|         1|
|(692,[154,155,156...|         2|
|(692,[150,151,152...|         1|
|(692,[124,125,126...|         0|
|(692,[152,153,154...|         2|
|(692,[97,98,99,12...|         1|
|(692,[124,125,126...|         1|
+--------------------+----------+
only showing top 20 rows



In [0]:
from pyspark.ml.evaluation import ClusteringEvaluator

# ClusteringEvaluator's default metric is the silhouette score, so the result
# is named for what it measures rather than for the distance used to compute it.
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(results)
silhouette    # ranges over [-1, 1]; closer to 1 means better-separated clusters

Out[61]: 0.48938548088261546

In [0]:
# Print the coordinates of each fitted cluster center, one center per print call.
centers = model.clusterCenters()
for centroid in centers:
    print(centroid)

[  0.       0.       0.       0.       0.       0.       0.       0.
   0.       0.       0.       0.       0.       0.       0.       0.
   0.       0.       0.       0.       0.       0.       0.       0.
   0.       0.       0.       0.       0.       0.       0.       0.
   0.       0.       0.       0.       0.       0.       0.       0.
   0.       0.       0.       0.       0.       0.       0.       0.
   0.       0.       0.       0.       0.       0.       0.       0.
   0.       0.       0.       0.       0.       0.       0.       0.
   0.       0.       0.       0.       0.       0.       0.       0.
   0.       0.       0.       0.       0.       0.       0.       0.
   0.       0.       0.       0.       0.       0.       0.       0.
   0.       0.       0.       0.       0.       0.       0.       0.
   0.       0.       0.       0.       0.       0.       0.       0.
   0.       0.       0.       0.       0.       0.       0.       0.
   0.       0.       0.       0.  

## Code along

In [0]:
from pyspark.sql import SparkSession

# getOrCreate() hands back the already-running session when there is one;
# the 'clustering' application name is used when a new session is started.
spark = (
    SparkSession.builder
    .appName('clustering')
    .getOrCreate()
)

In [0]:
# Load the seeds CSV: the first row supplies the column names and the column
# types are inferred from the data. Print the resulting schema as a check.
dataset = spark.read.csv(
    '/FileStore/tables/seeds_dataset.csv',
    header=True,
    inferSchema=True,
)
dataset.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)



In [0]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
# Display the column names; the whole list is passed to VectorAssembler as its
# input columns in the next cell.
dataset.columns

Out[90]: ['area',
 'perimeter',
 'compactness',
 'length_of_kernel',
 'width_of_kernel',
 'asymmetry_coefficient',
 'length_of_groove']

In [0]:
# Pack every column of the dataset into one vector column named 'features',
# the single-column input format expected by the scaler and K-means below.
assembler = VectorAssembler(
    inputCols=dataset.columns,
    outputCol='features',
)
final_data = assembler.transform(dataset)

In [0]:
from pyspark.ml.feature import StandardScaler

# Start from the assembled frame without any earlier scaler output. drop() is a
# no-op when the column is absent, so the first run is unaffected, while a
# re-run of this cell no longer fails with
# "Output column scaledFeatures already exists".
assembled_data = final_data.drop('scaledFeatures')

# Standardize the 'features' vector into 'scaledFeatures' so that no single
# measurement dominates the distance computation because of its units.
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures')
scaler_model = scaler.fit(assembled_data)
final_data = scaler_model.transform(assembled_data)

In [0]:
# Inspect the first row to compare the raw 'features' vector with the
# standardized 'scaledFeatures' vector produced by the scaler.
final_data.head()

Out[93]: Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22, features=DenseVector([15.26, 14.84, 0.871, 5.763, 3.312, 2.221, 5.22]), scaledFeatures=DenseVector([5.2445, 11.3633, 36.8608, 13.0072, 8.7685, 1.4772, 10.621]))

In [0]:
# Cluster on the standardized vectors into three groups; the seed is fixed so
# repeated runs produce the same assignment.
kmeans = KMeans(
    featuresCol='scaledFeatures',
    k=3,
    seed=1,
)
model = kmeans.fit(final_data)
results = model.transform(final_data)

In [0]:
from pyspark.ml.evaluation import ClusteringEvaluator

# Score the clustering on the same standardized vectors the model was fitted on.
# The evaluator's default metric is the silhouette score, so the result is named
# for what it measures rather than for the distance used to compute it.
evaluator = ClusteringEvaluator(featuresCol='scaledFeatures')
silhouette = evaluator.evaluate(results)
silhouette    # ranges over [-1, 1]; closer to 1 means better-separated clusters

Out[99]: 0.5928460631863877

In [0]:
# `results` already holds model.transform(final_data) from the fitting cell, so
# reuse it instead of rebuilding the same transformation just to preview the
# assigned cluster index of the first 20 rows.
results.select('prediction').show()

+----------+
|prediction|
+----------+
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         2|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         1|
+----------+
only showing top 20 rows

