# <span style ='color:brown;font-family: helvetica'> K-means Clustering with pySpark

In [1]:
import findspark

In [2]:
findspark.init('/home/chandan/spark-3.2.4-bin-hadoop3.2')

In [3]:
from pyspark.sql import SparkSession

In [4]:
spark = SparkSession.builder.appName('clustering').getOrCreate()

23/05/21 12:46:56 WARN Utils: Your hostname, chandan-VivoBook-ASUSLaptop-X515MA-X515MA resolves to a loopback address: 127.0.1.1; using 192.168.0.169 instead (on interface wlo1)
23/05/21 12:46:56 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/05/21 12:46:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Documentation Example

In [5]:
from pyspark.ml.clustering import KMeans

In [6]:
dataset = spark.read.format('libsvm').load("sample_kmeans_data.txt")

23/05/21 13:04:26 WARN LibSVMFileFormat: 'numFeatures' option not specified, determining the number of features by going though the input. If you know the number in advance, please specify it via 'numFeatures' option to avoid the extra scan.
                                                                                

In [7]:
dataset.show()

[Stage 1:>                                                          (0 + 1) / 1]

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|           (3,[],[])|
|  1.0|(3,[0,1,2],[0.1,0...|
|  2.0|(3,[0,1,2],[0.2,0...|
|  3.0|(3,[0,1,2],[9.0,9...|
|  4.0|(3,[0,1,2],[9.1,9...|
|  5.0|(3,[0,1,2],[9.2,9...|
+-----+--------------------+



                                                                                

In [8]:
final_data = dataset.select('features')

In [9]:
final_data.show()

+--------------------+
|            features|
+--------------------+
|           (3,[],[])|
|(3,[0,1,2],[0.1,0...|
|(3,[0,1,2],[0.2,0...|
|(3,[0,1,2],[9.0,9...|
|(3,[0,1,2],[9.1,9...|
|(3,[0,1,2],[9.2,9...|
+--------------------+



In [10]:
kmeans  = KMeans().setK(2).setSeed(1)

#### setSeed is the seed value to a random number generator

In [11]:
model = kmeans.fit(final_data)

                                                                                

### We can do model evaluation with within set sum of squares

In [12]:
centers = model.clusterCenters()

In [13]:
centers

[array([9.1, 9.1, 9.1]), array([0.1, 0.1, 0.1])]

In [14]:
results = model.transform(final_data)

In [15]:
results.show()

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|           (3,[],[])|         1|
|(3,[0,1,2],[0.1,0...|         1|
|(3,[0,1,2],[0.2,0...|         1|
|(3,[0,1,2],[9.0,9...|         0|
|(3,[0,1,2],[9.1,9...|         0|
|(3,[0,1,2],[9.2,9...|         0|
+--------------------+----------+



In [16]:
from pyspark.ml.evaluation import ClusteringEvaluator

In [17]:
evaluator = ClusteringEvaluator()

In [18]:
clus_eval = evaluator.evaluate(results)

                                                                                

In [19]:
clus_eval

0.9997530305375207

In [20]:
wsse = model.summary.trainingCost

In [21]:
wsse

0.11999999999994547

### Now We will See Practice Exercise

# Practice Exercise

In [23]:
df = spark.read.csv("seeds_dataset.csv", header= True, inferSchema=True)

In [24]:
df.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)



In [25]:
df.head(1)

[Stage 24:>                                                         (0 + 1) / 1]                                                                                

[Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22)]

In [26]:
from pyspark.ml.clustering import KMeans

In [27]:
from pyspark.ml.feature import VectorAssembler

In [28]:
df.columns

['area',
 'perimeter',
 'compactness',
 'length_of_kernel',
 'width_of_kernel',
 'asymmetry_coefficient',
 'length_of_groove']

In [29]:
assembeler = VectorAssembler(inputCols= df.columns, outputCol='features')

In [30]:
final_data = assembeler.transform(df)

In [31]:
final_data.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)
 |-- features: vector (nullable = true)



### Scaling the data

In [33]:
from pyspark.ml.feature import StandardScaler

In [34]:
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures')

In [35]:
scaler_model = scaler.fit(final_data)

                                                                                

In [36]:
final_data = scaler_model.transform(final_data)

In [37]:
final_data.head(1)

[Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22, features=DenseVector([15.26, 14.84, 0.871, 5.763, 3.312, 2.221, 5.22]), scaledFeatures=DenseVector([5.2445, 11.3633, 36.8608, 13.0072, 8.7685, 1.4772, 10.621]))]

## Train model

In [38]:
kmeans = KMeans(featuresCol='scaledFeatures', k = 3)

In [40]:
model = kmeans.fit(final_data)

                                                                                

In [41]:
wssse = model.summary.trainingCost

In [43]:
wssse

428.6082011872446

### This is not superuseful when you scale the data

In [44]:
centers = model.clusterCenters()

In [45]:
print(centers)

[array([ 6.35645488, 12.40730852, 37.41990178, 13.93860446,  9.7892399 ,
        2.41585013, 12.29286107]), array([ 4.07497225, 10.14410142, 35.89816849, 11.80812742,  7.54416916,
        3.15410901, 10.38031464]), array([ 4.96198582, 10.97871333, 37.30930808, 12.44647267,  8.62880781,
        1.80061978, 10.41913733])]


In [46]:
results = model.transform(final_data)

In [52]:
results.select('prediction').show()

+----------+
|prediction|
+----------+
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         0|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         1|
+----------+
only showing top 20 rows



# Practical Exercise

In [53]:
df = spark.read.csv('hack_data.csv', header= True, inferSchema=True)

In [54]:
df.printSchema()

root
 |-- Session_Connection_Time: double (nullable = true)
 |-- Bytes Transferred: double (nullable = true)
 |-- Kali_Trace_Used: integer (nullable = true)
 |-- Servers_Corrupted: double (nullable = true)
 |-- Pages_Corrupted: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- WPM_Typing_Speed: double (nullable = true)



In [55]:
df.describe().show()

                                                                                

+-------+-----------------------+------------------+------------------+-----------------+------------------+-----------+------------------+
|summary|Session_Connection_Time| Bytes Transferred|   Kali_Trace_Used|Servers_Corrupted|   Pages_Corrupted|   Location|  WPM_Typing_Speed|
+-------+-----------------------+------------------+------------------+-----------------+------------------+-----------+------------------+
|  count|                    334|               334|               334|              334|               334|        334|               334|
|   mean|     30.008982035928145| 607.2452694610777|0.5119760479041916|5.258502994011977|10.838323353293413|       null|57.342395209580864|
| stddev|     14.088200614636158|286.33593163576757|0.5006065264451406| 2.30190693339697|  3.06352633036022|       null| 13.41106336843464|
|    min|                    1.0|              10.0|                 0|              1.0|               6.0|Afghanistan|              40.0|
|    max|           

In [56]:
from pyspark.ml.feature import VectorAssembler

In [57]:
df.columns

['Session_Connection_Time',
 'Bytes Transferred',
 'Kali_Trace_Used',
 'Servers_Corrupted',
 'Pages_Corrupted',
 'Location',
 'WPM_Typing_Speed']

In [58]:
assember = VectorAssembler(inputCols=['Session_Connection_Time',
 'Bytes Transferred',
 'Kali_Trace_Used',
 'Servers_Corrupted',
 'Pages_Corrupted','WPM_Typing_Speed'], outputCol= 'features')

In [59]:
final_data = assember.transform(df)

In [60]:
final_data.printSchema()

root
 |-- Session_Connection_Time: double (nullable = true)
 |-- Bytes Transferred: double (nullable = true)
 |-- Kali_Trace_Used: integer (nullable = true)
 |-- Servers_Corrupted: double (nullable = true)
 |-- Pages_Corrupted: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- WPM_Typing_Speed: double (nullable = true)
 |-- features: vector (nullable = true)



In [63]:
final_data.head(1)

[Row(Session_Connection_Time=8.0, Bytes Transferred=391.09, Kali_Trace_Used=1, Servers_Corrupted=2.96, Pages_Corrupted=7.0, Location='Slovenia', WPM_Typing_Speed=72.37, features=DenseVector([8.0, 391.09, 1.0, 2.96, 7.0, 72.37]))]

#### Scale the data

In [65]:
from pyspark.ml.feature import StandardScaler

In [66]:
scaler = StandardScaler(inputCol='features', outputCol='ScaledFeatures')

In [67]:
scaled_features = scaler.fit(final_data).transform(final_data)

In [68]:
scaled_features.head(1)

[Row(Session_Connection_Time=8.0, Bytes Transferred=391.09, Kali_Trace_Used=1, Servers_Corrupted=2.96, Pages_Corrupted=7.0, Location='Slovenia', WPM_Typing_Speed=72.37, features=DenseVector([8.0, 391.09, 1.0, 2.96, 7.0, 72.37]), ScaledFeatures=DenseVector([0.5679, 1.3658, 1.9976, 1.2859, 2.2849, 5.3963]))]

In [69]:
from pyspark.ml.clustering import KMeans

In [81]:
kmeans = KMeans(featuresCol='ScaledFeatures', k= 2)

In [82]:
model = kmeans.fit(scaled_features)

In [83]:
wssse = model.summary.trainingCost

In [84]:
wssse

601.7707512676691

In [85]:
results = model.transform(scaled_features)

In [86]:
results.select('prediction').show()

+----------+
|prediction|
+----------+
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
+----------+
only showing top 20 rows



In [87]:
from pyspark.ml.evaluation import ClusteringEvaluator

In [88]:
evaluator = ClusteringEvaluator()

In [89]:
clust_eval = evaluator.evaluate(results)

In [90]:
clust_eval

0.6683623593283755

### With 3 k out clust_eval is 0.304, with k =2 its 0.668

In [91]:
results.groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         0|  167|
+----------+-----+



### We can see now that data is getting evenly split between two