In [1]:
import findspark

In [2]:
findspark.init("/home/danial/spark-3.3.2-bin-hadoop3")

In [3]:
from pyspark.sql import SparkSession

In [7]:
spark = SparkSession.builder.appName('clustering').getOrCreate()

In [12]:
path = '/home/danial/Desktop/myspark/Apache-Spark/Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Clustering/sample_kmeans_data.txt'

In [13]:
data = spark.read.format('libsvm').load(path)

23/04/11 14:50:31 WARN LibSVMFileFormat: 'numFeatures' option not specified, determining the number of features by going though the input. If you know the number in advance, please specify it via 'numFeatures' option to avoid the extra scan.


In [14]:
data.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [15]:
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|           (3,[],[])|
|  1.0|(3,[0,1,2],[0.1,0...|
|  2.0|(3,[0,1,2],[0.2,0...|
|  3.0|(3,[0,1,2],[9.0,9...|
|  4.0|(3,[0,1,2],[9.1,9...|
|  5.0|(3,[0,1,2],[9.2,9...|
+-----+--------------------+



In [16]:
# since it is unsupervised we don't need label

final_data = data.select('features')
final_data.show()

+--------------------+
|            features|
+--------------------+
|           (3,[],[])|
|(3,[0,1,2],[0.1,0...|
|(3,[0,1,2],[0.2,0...|
|(3,[0,1,2],[9.0,9...|
|(3,[0,1,2],[9.1,9...|
|(3,[0,1,2],[9.2,9...|
+--------------------+



In [19]:
from pyspark.ml.clustering import KMeans

In [46]:
kmeans = KMeans().setK(2).setSeed(1)

In [47]:
model = kmeans.fit(final_data)

In [48]:
# Within Set Sum of Squared Errors (WSSSE) 
wssse = model.summary.trainingCost
wssse

0.11999999999994547

In [24]:
centers = model.clusterCenters()

In [50]:
centers

[array([9.1, 9.1, 9.1]), array([0.1, 0.1, 0.1])]

In [51]:
results = model.transform(final_data)

In [52]:
results.show()

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|           (3,[],[])|         1|
|(3,[0,1,2],[0.1,0...|         1|
|(3,[0,1,2],[0.2,0...|         1|
|(3,[0,1,2],[9.0,9...|         0|
|(3,[0,1,2],[9.1,9...|         0|
|(3,[0,1,2],[9.2,9...|         0|
+--------------------+----------+



In [56]:
kmeans = KMeans().setK(3).setSeed(1)

In [57]:
model = kmeans.fit(final_data)

In [58]:
wssse = model.summary.trainingCost
wssse

0.07499999999994544

In [32]:
centers = model.clusterCenters()
centers

[array([9.1, 9.1, 9.1]), array([0.05, 0.05, 0.05]), array([0.2, 0.2, 0.2])]

In [59]:
results = model.transform(final_data)
results.show()

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|           (3,[],[])|         1|
|(3,[0,1,2],[0.1,0...|         1|
|(3,[0,1,2],[0.2,0...|         2|
|(3,[0,1,2],[9.0,9...|         0|
|(3,[0,1,2],[9.1,9...|         0|
|(3,[0,1,2],[9.2,9...|         0|
+--------------------+----------+



In [35]:
# Clustering Code Along Example

In [60]:
import findspark

In [61]:
findspark.init("/home/danial/spark-3.3.2-bin-hadoop3/")

In [62]:
from pyspark.sql import SparkSession

In [63]:
spark = SparkSession.builder.appName('cluster').getOrCreate()

23/04/11 16:33:42 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [64]:
path = '/home/danial/Desktop/myspark/Apache-Spark/Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Clustering/seeds_dataset.csv'

In [66]:
data = spark.read.csv(path, header=True, inferSchema=True)

In [68]:
data.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)



In [71]:
data.head(1)

[Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22)]

In [72]:
from pyspark.ml.clustering import KMeans

In [69]:
from pyspark.ml.feature import VectorAssembler

In [70]:
data.columns

['area',
 'perimeter',
 'compactness',
 'length_of_kernel',
 'width_of_kernel',
 'asymmetry_coefficient',
 'length_of_groove']

In [73]:
assembler = VectorAssembler(inputCols=data.columns, outputCol='features')

In [74]:
final_data = assembler.transform(data)

In [76]:
final_data.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)
 |-- features: vector (nullable = true)



In [None]:
# Since a lot of machine learning algorithm object don't mind having a bunch of extra 
# columns (they don't read them) they won't do anything with them, they just look for 
# features column and (in case of supervised also they look for label column) so I don't 
# need to perform the following:

# my_final_data = final_data.select('features')

In [77]:
from pyspark.ml.feature import StandardScaler

In [78]:
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures')

In [79]:
scaler_model = scaler.fit(final_data)

In [80]:
final_data = scaler_model.transform(final_data)

In [81]:
final_data.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- scaledFeatures: vector (nullable = true)



In [82]:
final_data.head(1)

[Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22, features=DenseVector([15.26, 14.84, 0.871, 5.763, 3.312, 2.221, 5.22]), scaledFeatures=DenseVector([5.2445, 11.3633, 36.8608, 13.0072, 8.7685, 1.4772, 10.621]))]

In [83]:
kmeans = KMeans(featuresCol='scaledFeatures', k=3)

In [84]:
model = kmeans.fit(final_data)

In [85]:
print (f" wssse is equal to {model.summary.trainingCost}")

 wssse is equal to 429.07559671507244


In [86]:
centers = model.clusterCenters()

In [88]:
centers

[array([ 4.87257659, 10.88120146, 37.27692543, 12.3410157 ,  8.55443412,
         1.81649011, 10.32998598]),
 array([ 6.31670546, 12.37109759, 37.39491396, 13.91155062,  9.748067  ,
         2.39849968, 12.2661748 ]),
 array([ 4.06105916, 10.13979506, 35.80536984, 11.82133095,  7.50395937,
         3.27184732, 10.42126018])]

In [91]:
model.transform(final_data).select('prediction').show()

+----------+
|prediction|
+----------+
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         1|
|         1|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         2|
+----------+
only showing top 20 rows



In [92]:
# consulting project 