# Clustering Review

In [7]:
from pyspark.sql import SparkSession
# Configure a session named "Clustering"; getOrCreate() hands back an
# already-running session when there is one instead of starting a second.
session_builder = SparkSession.builder.appName("Clustering")
spark = session_builder.getOrCreate()

In [16]:
from pyspark.ml.clustering import KMeans

In [12]:
# Load the LIBSVM-formatted sample file; the resulting frame has a `label`
# column and a `features` vector column (see the printed schema).
libsvm_reader = spark.read.format("libsvm")
dataset = libsvm_reader.load("../../Data/sample_kmeans_data.txt")
dataset.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [14]:
# Clustering is unsupervised, so keep only the feature vectors and leave
# the `label` column behind.
feature_column = "features"
data = dataset.select(feature_column)
data.show()

+--------------------+
|            features|
+--------------------+
|           (3,[],[])|
|(3,[0,1,2],[0.1,0...|
|(3,[0,1,2],[0.2,0...|
|(3,[0,1,2],[9.0,9...|
|(3,[0,1,2],[9.1,9...|
|(3,[0,1,2],[9.2,9...|
+--------------------+



In [27]:
# Two clusters, with a fixed seed so the centroid initialization (and hence
# the cluster numbering) is repeatable.
Kmeans = KMeans(k=2, seed=1)

In [28]:
# Fit the configured K-means estimator on the feature-only frame; the result
# is the trained model used for the centers and predictions below.
model = Kmeans.fit(data)

# Get the cluster centers

In [31]:
# clusterCenters() returns one array per cluster (a list of length k);
# the bare name on the last line displays it as the cell output.
centers = model.clusterCenters()
centers

[array([9.1, 9.1, 9.1]), array([0.1, 0.1, 0.1])]

# View the predicted cluster labels

In [33]:
# transform() returns the input frame with an added `prediction` column
# holding the index of the cluster each row was assigned to.
predict = model.transform(data)

In [35]:
# Print the feature vectors next to their assigned cluster index.
predict.show()

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|           (3,[],[])|         1|
|(3,[0,1,2],[0.1,0...|         1|
|(3,[0,1,2],[0.2,0...|         1|
|(3,[0,1,2],[9.0,9...|         0|
|(3,[0,1,2],[9.1,9...|         0|
|(3,[0,1,2],[9.2,9...|         0|
+--------------------+----------+



# Another Example 

In [36]:
# Load the seeds CSV. The first row supplies the column names and Spark
# infers the column types (all doubles, per the printed schema).
# NOTE: this rebinds `dataset`, replacing the frame from the first example.
seeds_csv_path = "../../Data/seeds_dataset.csv"
dataset = spark.read.csv(seeds_csv_path, header=True, inferSchema=True)
dataset.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)



In [55]:
from pyspark.ml.clustering import KMeans

In [56]:
from pyspark.ml.feature import VectorAssembler

In [57]:
# List the column names; every one of them is used as a clustering feature
# by the assembler built in the next cell.
dataset.columns

['area',
 'perimeter',
 'compactness',
 'length_of_kernel',
 'width_of_kernel',
 'asymmetry_coefficient',
 'length_of_groove']

In [58]:
# Pack every column of the frame into a single `features` vector column,
# which is the input layout the Spark ML estimators work on.
input_columns = dataset.columns
assembler = VectorAssembler(inputCols=input_columns, outputCol="features")

In [59]:
# Apply the assembler: the original columns are kept and a `features`
# vector column is appended.
final_data = assembler.transform(dataset)

In [60]:
# Confirm the assembled `features` vector column is present.
final_data.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)
 |-- features: vector (nullable = true)



# Scaling the Data

In [61]:
from pyspark.ml.feature import StandardScaler

In [62]:
# Standardize each feature to unit standard deviation. The two flags are the
# library defaults, spelled out here: values are scaled but NOT mean-centered.
scaler = StandardScaler(
    withMean=False,
    withStd=True,
    inputCol="features",
    outputCol="scaledFeatures",
)

In [63]:
# Fit the scaler on the assembled features and append `scaledFeatures`.
#
# `final_data` is rebound to the scaled frame, so on a second run of this cell
# it would already contain `scaledFeatures` and Spark would reject the
# fit/transform because the output column already exists. Dropping the column
# first makes the cell safe to re-run; drop() is a no-op when the column is
# absent, so the first run behaves exactly as before.
unscaled_data = final_data.drop("scaledFeatures")
scaler_model = scaler.fit(unscaled_data)
final_data = scaler_model.transform(unscaled_data)

In [64]:
# Confirm that `scaledFeatures` now sits alongside the raw `features` column.
final_data.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- scaledFeatures: vector (nullable = true)



In [65]:
# Show the first row before and after scaling for a side-by-side sanity check.
# The scaled row is printed; the raw row is the cell's displayed value.
first_scaled_row = final_data.select("scaledFeatures").head(1)
print(first_scaled_row)
final_data.select("features").head(1)

[Row(scaledFeatures=DenseVector([5.2445, 11.3633, 36.8608, 13.0072, 8.7685, 1.4772, 10.621]))]


[Row(features=DenseVector([15.26, 14.84, 0.871, 5.763, 3.312, 2.221, 5.22]))]

In [68]:
# Three clusters, trained on the standardized vectors rather than the raw ones.
# K-means starts from randomly chosen centroids, so without a seed the cluster
# numbering (and possibly the assignments) can change between runs. A fixed
# seed keeps the predictions reproducible, as in the first example.
Kmeans = KMeans(featuresCol="scaledFeatures", k=3, seed=1)

In [72]:
# Fit on the full frame; the estimator reads only the column named by its
# featuresCol setting (`scaledFeatures`).
# NOTE: this rebinds `model`, replacing the model from the first example.
model = Kmeans.fit(final_data)

In [73]:
# Append a `prediction` column with each row's assigned cluster index.
predict = model.transform(final_data)

In [77]:
# Display only the cluster assignments; show() prints the first 20 rows.
predict.select("prediction").show()

+----------+
|prediction|
+----------+
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         1|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         0|
+----------+
only showing top 20 rows

