In [1]:
!pip install -q findspark
!pip install -q pyspark

import findspark
findspark.init()

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('cluster').getOrCreate()

[K     |████████████████████████████████| 281.4 MB 36 kB/s 
[K     |████████████████████████████████| 198 kB 67.4 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [4]:
# Read the seeds measurements: the first line of the file provides the column
# names and Spark is asked to infer each column's type.
df = spark.read.csv(
    'seeds_dataset.csv',
    inferSchema=True,
    header=True,
)
df.show()

+-----+---------+-----------+------------------+------------------+---------------------+------------------+
| area|perimeter|compactness|  length_of_kernel|   width_of_kernel|asymmetry_coefficient|  length_of_groove|
+-----+---------+-----------+------------------+------------------+---------------------+------------------+
|15.26|    14.84|      0.871|             5.763|             3.312|                2.221|              5.22|
|14.88|    14.57|     0.8811| 5.553999999999999|             3.333|                1.018|             4.956|
|14.29|    14.09|      0.905|             5.291|3.3369999999999997|                2.699|             4.825|
|13.84|    13.94|     0.8955|             5.324|3.3789999999999996|                2.259|             4.805|
|16.14|    14.99|     0.9034|5.6579999999999995|             3.562|                1.355|             5.175|
|14.38|    14.21|     0.8951|             5.386|             3.312|   2.4619999999999997|             4.956|
|14.69|    14.49|  

In [6]:
# Column names taken from the CSV header; all of them are used as clustering inputs.
df.columns

['area',
 'perimeter',
 'compactness',
 'length_of_kernel',
 'width_of_kernel',
 'asymmetry_coefficient',
 'length_of_groove']

# Assemble the feature vectors


In [9]:
from pyspark.ml.feature import VectorAssembler

# Combine every column of `df` into a single vector column named 'features',
# which is the input layout Spark ML estimators expect.
vec_assembler = VectorAssembler(
    inputCols=df.columns,
    outputCol='features',
)
final_data = vec_assembler.transform(df)

In [10]:
# Preview the original columns alongside the assembled 'features' vector.
final_data.show()

+-----+---------+-----------+------------------+------------------+---------------------+------------------+--------------------+
| area|perimeter|compactness|  length_of_kernel|   width_of_kernel|asymmetry_coefficient|  length_of_groove|            features|
+-----+---------+-----------+------------------+------------------+---------------------+------------------+--------------------+
|15.26|    14.84|      0.871|             5.763|             3.312|                2.221|              5.22|[15.26,14.84,0.87...|
|14.88|    14.57|     0.8811| 5.553999999999999|             3.333|                1.018|             4.956|[14.88,14.57,0.88...|
|14.29|    14.09|      0.905|             5.291|3.3369999999999997|                2.699|             4.825|[14.29,14.09,0.90...|
|13.84|    13.94|     0.8955|             5.324|3.3789999999999996|                2.259|             4.805|[13.84,13.94,0.89...|
|16.14|    14.99|     0.9034|5.6579999999999995|             3.562|                1.355| 

# Feature scaling

In [11]:
from pyspark.ml.feature import StandardScaler

# Rescale every feature to unit standard deviation (no mean-centering) so that
# no single measurement dominates the distance computations.
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaledFeatures",
    withStd=True,
    withMean=False,
)

# Fitting computes the per-feature statistics from the assembled data.
scalerModel = scaler.fit(final_data)

In [12]:
# Append the 'scaledFeatures' column. Because `final_data` is reassigned to its
# own transformed version, a second run of this cell would fail with
# "Output column scaledFeatures already exists". Dropping the column first
# (a no-op when it is absent) keeps the cell safe to re-run.
final_data = scalerModel.transform(final_data.drop("scaledFeatures"))

# K-means clustering

In [14]:
from pyspark.ml.clustering import KMeans

# Cluster on the standardized vectors produced by the scaler.
# k=3 presumably matches the number of seed varieties in the dataset -- TODO confirm.
# The explicit seed fixes the centroid initialisation so that cluster ids and
# centers are reproducible across kernel restarts.
kmeans = KMeans(featuresCol='scaledFeatures', k=3, seed=42)
model = kmeans.fit(final_data)

In [17]:
# Centroids of the fitted clusters, one array per cluster, expressed in the
# scaled feature space (the model was trained on 'scaledFeatures').
cluster_centers = model.clusterCenters()

# NOTE: despite its name, `w_set_sse` holds the centroids, not a within-set
# sum of squared errors. The alias is kept so any cell that still refers to
# the old name keeps working.
w_set_sse = cluster_centers
cluster_centers

[array([ 4.90993613, 10.92295738, 37.28032496, 12.38401355,  8.5873381 ,
         1.7739463 , 10.35147469]),
 array([ 4.06818854, 10.13938448, 35.87110297, 11.81191124,  7.52564313,
         3.24585755, 10.40780927]),
 array([ 6.32636687, 12.38115343, 37.39222755, 13.9206997 ,  9.75485787,
         2.41428142, 12.28078861])]

In [19]:
# Assign every row to a cluster and display the raw feature vector next to
# the predicted cluster id.
(
    model.transform(final_data)
    .select("features", "prediction")
    .show()
)

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|[15.26,14.84,0.87...|         0|
|[14.88,14.57,0.88...|         0|
|[14.29,14.09,0.90...|         0|
|[13.84,13.94,0.89...|         0|
|[16.14,14.99,0.90...|         0|
|[14.38,14.21,0.89...|         0|
|[14.69,14.49,0.87...|         0|
|[14.11,14.1,0.891...|         0|
|[16.63,15.46,0.87...|         2|
|[16.44,15.25,0.88...|         0|
|[15.26,14.85,0.86...|         0|
|[14.03,14.16,0.87...|         0|
|[13.89,14.02,0.88...|         0|
|[13.78,14.06,0.87...|         0|
|[13.74,14.05,0.87...|         0|
|[14.59,14.28,0.89...|         0|
|[13.99,13.83,0.91...|         0|
|[15.69,14.75,0.90...|         0|
|[14.7,14.21,0.915...|         0|
|[12.72,13.57,0.86...|         1|
+--------------------+----------+
only showing top 20 rows



In [None]:
# Colab only: attach the user's Google Drive under /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')