In [1]:
from pyspark.sql import SparkSession
# Start the Spark session for this notebook (or reuse one that is already
# running) under the application name 'cluster'.
spark = (
    SparkSession.builder
    .appName('cluster')
    .getOrCreate()
)

In [2]:
from pyspark.ml.clustering import KMeans

In [3]:
# Load the seeds dataset. The first line of the file is treated as the
# header, and column types are inferred from the values.
data = spark.read.csv(
    'seeds_dataset.csv',
    inferSchema=True,
    header=True,
)

In [4]:
# Check the column names and the types inferred while reading the CSV.
data.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)



In [5]:
# Peek at the first row to sanity-check the parsed values.
data.head(1)

[Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22)]

In [6]:
# Goal: group the seeds into clusters, knowing there are 3 varieties of wheat.
# The dataset has no label column, so this is unsupervised learning.

In [7]:
# Prepare data for ML
from pyspark.ml.feature import VectorAssembler

In [8]:
# List the column names; all of them are used as model features below.
data.columns

['area',
 'perimeter',
 'compactness',
 'length_of_kernel',
 'width_of_kernel',
 'asymmetry_coefficient',
 'length_of_groove']

In [9]:
# Every column is a feature, so all of them go through the assembler,
# which packs them into a single vector column named 'features'.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=data.columns,
)

In [10]:
# Append the assembled 'features' vector column to the original columns.
finalData = assembler.transform(data)

In [11]:
# Confirm that the 'features' vector column was added.
finalData.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)
 |-- features: vector (nullable = true)



In [12]:
# Scale the features so that every one is on the same order of magnitude;
# otherwise the columns with the largest values dominate the distances.
from pyspark.ml.feature import StandardScaler

In [13]:
# Configure the scaler: read 'features', write 'scaledFeatures'.
# withStd / withMean are spelled out at their default values: each feature
# is divided by its standard deviation but is not centered.
scaler = StandardScaler(
    inputCol='features',
    outputCol='scaledFeatures',
    withStd=True,
    withMean=False,
)

In [15]:
# Fit (collect stddev info) and transform accordingly.
# This cell overwrites finalData, so on a re-run finalData already carries a
# 'scaledFeatures' column and StandardScaler refuses to fit/transform when its
# output column already exists. Drop any stale copy first; drop() is a no-op
# when the column is absent, so the first run is unaffected.
unscaledData = finalData.drop('scaledFeatures')
scalerModel = scaler.fit(unscaledData)
finalData = scalerModel.transform(unscaledData)

In [16]:
# Compare the raw 'features' vector with its 'scaledFeatures' counterpart for one row.
finalData.head(1)

[Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22, features=DenseVector([15.26, 14.84, 0.871, 5.763, 3.312, 2.221, 5.22]), scaledFeatures=DenseVector([5.2445, 11.3633, 36.8608, 13.0072, 8.7685, 1.4772, 10.621]))]

In [17]:
# Initiate the model with 3 clusters (one per wheat variety).
# k-means starts from randomly initialised centroids, so pin the seed to keep
# the cluster numbering, centers and cost repeatable across runs.
# NOTE: outputs recorded from an earlier, unseeded run may number the clusters
# differently.
kmeans = KMeans(featuresCol='scaledFeatures', k=3, seed=1)

In [18]:
# Fit k-means on finalData; the estimator reads the 'scaledFeatures' column
# it was configured with and returns the trained model.
kModel = kmeans.fit(finalData)

In [19]:
# Evaluate your model -> "Within Set Sum of Squared Errors" (WSSSE),
# read from the training cost reported by the fitted model's summary.
wssse = kModel.summary.trainingCost
wssse

# NOTE: the model was fit on the scaled features, so this value is expressed
# in scaled units. It is hard to interpret on its own; it is mainly useful for
# comparing runs on the same scaled data (e.g. different values of k).

429.1153810024243

In [20]:
# View centroid coordinates: one array per cluster. The model was fit on
# 'scaledFeatures', so these are in scaled units, not the original ones.
centers = kModel.clusterCenters()
centers

[array([ 4.90993613, 10.92295738, 37.28032496, 12.38401355,  8.5873381 ,
         1.7739463 , 10.35147469]),
 array([ 6.32636687, 12.38115343, 37.39222755, 13.9206997 ,  9.75485787,
         2.41428142, 12.28078861]),
 array([ 4.06818854, 10.13938448, 35.87110297, 11.81191124,  7.52564313,
         3.24585755, 10.40780927])]

In [22]:
# Assign every row to a cluster, then display the resulting 'prediction'
# column (show() prints the first 20 rows by default).
results = kModel.transform(finalData)
(
    results
    .select('prediction')
    .show()
)

+----------+
|prediction|
+----------+
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         1|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         2|
+----------+
only showing top 20 rows

