# Import libraries

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("KMeans Cluster Model").getOrCreate()

In [2]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler, VectorIndexer
from pyspark.ml.feature import StandardScaler
from pyspark.ml.evaluation import ClusteringEvaluator

# Load and verify data

In [3]:
data = spark.read.csv('seeds_dataset.csv',header = True, inferSchema = True)

In [4]:
data.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)



In [5]:
data.head(3)

[Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22),
 Row(area=14.88, perimeter=14.57, compactness=0.8811, length_of_kernel=5.553999999999999, width_of_kernel=3.333, asymmetry_coefficient=1.018, length_of_groove=4.956),
 Row(area=14.29, perimeter=14.09, compactness=0.905, length_of_kernel=5.291, width_of_kernel=3.3369999999999997, asymmetry_coefficient=2.699, length_of_groove=4.825)]

In [6]:
for item in data.head(1)[0]:
    print(item)

15.26
14.84
0.871
5.763
3.312
2.221
5.22


In [7]:
data.describe().show()

+-------+------------------+------------------+--------------------+-------------------+------------------+---------------------+-------------------+
|summary|              area|         perimeter|         compactness|   length_of_kernel|   width_of_kernel|asymmetry_coefficient|   length_of_groove|
+-------+------------------+------------------+--------------------+-------------------+------------------+---------------------+-------------------+
|  count|               210|               210|                 210|                210|               210|                  210|                210|
|   mean|14.847523809523816|14.559285714285718|  0.8709985714285714|  5.628533333333335| 3.258604761904762|   3.7001999999999997|  5.408071428571429|
| stddev|2.9096994306873647|1.3059587265640225|0.023629416583846364|0.44306347772644983|0.3777144449065867|   1.5035589702547392|0.49148049910240543|
|    min|             10.59|             12.41|              0.8081|              4.899|            

In [8]:
data.columns

['area',
 'perimeter',
 'compactness',
 'length_of_kernel',
 'width_of_kernel',
 'asymmetry_coefficient',
 'length_of_groove']

# Data Preprocessing

In [9]:
assembler = VectorAssembler(inputCols =data.columns,outputCol='features')
final_data = assembler.transform(data)

In [10]:
scaler = StandardScaler(inputCol='features',outputCol='scaledfeatures')
final_data = scaler.fit(final_data).transform(final_data)
final_data.head(1)

[Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22, features=DenseVector([15.26, 14.84, 0.871, 5.763, 3.312, 2.221, 5.22]), scaledfeatures=DenseVector([5.2445, 11.3633, 36.8608, 13.0072, 8.7685, 1.4772, 10.621]))]

# Train and Test data

In [11]:
train_data,test_data = final_data.randomSplit([0.7,0.3])

In [12]:
train_data.show(2)

+-----+---------+-----------+----------------+---------------+---------------------+----------------+--------------------+--------------------+
| area|perimeter|compactness|length_of_kernel|width_of_kernel|asymmetry_coefficient|length_of_groove|            features|      scaledfeatures|
+-----+---------+-----------+----------------+---------------+---------------------+----------------+--------------------+--------------------+
|10.74|    12.73|     0.8329|           5.145|          2.642|                4.702|           4.963|[10.74,12.73,0.83...|[3.69110289768413...|
|10.79|    12.93|     0.8107|           5.317|          2.648|    5.462000000000001|           5.194|[10.79,12.93,0.81...|[3.70828680316683...|
+-----+---------+-----------+----------------+---------------+---------------------+----------------+--------------------+--------------------+
only showing top 2 rows



In [13]:
test_data.show(2)

+-----+---------+------------------+----------------+---------------+---------------------+----------------+--------------------+--------------------+
| area|perimeter|       compactness|length_of_kernel|width_of_kernel|asymmetry_coefficient|length_of_groove|            features|      scaledfeatures|
+-----+---------+------------------+----------------+---------------+---------------------+----------------+--------------------+--------------------+
|10.59|    12.41|            0.8648|           4.899|          2.787|                4.975|           4.794|[10.59,12.41,0.86...|[3.63955118123602...|
| 10.8|    12.57|0.8590000000000001|           4.981|          2.821|                4.773|           5.063|[10.8,12.57,0.859...|[3.71172358426337...|
+-----+---------+------------------+----------------+---------------+---------------------+----------------+--------------------+--------------------+
only showing top 2 rows



# Build Model 

In [14]:
classifier = KMeans(k=2,featuresCol='scaledfeatures')
model = classifier.fit(train_data)

In [15]:
predictions = model.transform(test_data)

# Evaluate Model

In [16]:
print(ClusteringEvaluator().evaluate(predictions))

0.7079488489048497


In [17]:
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:    
    print(center)

Cluster Centers: 
[ 4.43876644 10.47752429 36.54781608 12.03818152  7.98931206  2.51620454
 10.33086089]
[ 6.24648602 12.29699282 37.41610089 13.84183042  9.68595154  2.26612593
 12.17784757]


In [18]:
predictions.select('prediction').show()

+----------+
|prediction|
+----------+
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
+----------+
only showing top 20 rows

