In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('hack_data').getOrCreate()

In [3]:
dataset = spark.read.csv('../data/hack_data.csv', header=True, inferSchema=True)

In [4]:
dataset.printSchema()

root
 |-- Session_Connection_Time: double (nullable = true)
 |-- Bytes Transferred: double (nullable = true)
 |-- Kali_Trace_Used: integer (nullable = true)
 |-- Servers_Corrupted: double (nullable = true)
 |-- Pages_Corrupted: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- WPM_Typing_Speed: double (nullable = true)



In [5]:
dataset.head(1)

[Row(Session_Connection_Time=8.0, Bytes Transferred=391.09, Kali_Trace_Used=1, Servers_Corrupted=2.96, Pages_Corrupted=7.0, Location='Slovenia', WPM_Typing_Speed=72.37)]

In [6]:
dataset.columns

['Session_Connection_Time',
 'Bytes Transferred',
 'Kali_Trace_Used',
 'Servers_Corrupted',
 'Pages_Corrupted',
 'Location',
 'WPM_Typing_Speed']

In [7]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [8]:
feat_cols = ['Session_Connection_Time', 'Bytes Transferred', 'Kali_Trace_Used',
             'Servers_Corrupted', 'Pages_Corrupted','WPM_Typing_Speed']

In [9]:
vec_assembler = VectorAssembler(inputCols = feat_cols, outputCol='features')

In [10]:
final_data = vec_assembler.transform(dataset)

In [11]:
from pyspark.ml.feature import StandardScaler

In [12]:
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures', withStd=True, withMean=True)

In [13]:
scalerModel = scaler.fit(final_data)

In [14]:
cluster_final_data = scalerModel.transform(final_data)

In [15]:
from pyspark.ml.clustering import KMeans

kmeans3 = KMeans(featuresCol='scaledFeatures',k=3)
kmeans2 = KMeans(featuresCol='scaledFeatures',k=2)

In [18]:
model_k3 = kmeans3.fit(cluster_final_data)
model_k2 = kmeans2.fit(cluster_final_data)

In [19]:
wssse_k3 = model_k3.computeCost(cluster_final_data)
wssse_k2 = model_k2.computeCost(cluster_final_data)

In [20]:
print("With K=3")
print("Within Set Sum of Squared Errors = " + str(wssse_k3))
print('--'*30)
print("With K=2")
print("Within Set Sum of Squared Errors = " + str(wssse_k2))

With K=3
Within Set Sum of Squared Errors = 434.7550730848762
------------------------------------------------------------
With K=2
Within Set Sum of Squared Errors = 601.7707512676687


In [21]:
for k in range(2,9):
    kmeans = KMeans(featuresCol='scaledFeatures',k=k)
    model = kmeans.fit(cluster_final_data)
    wssse = model.computeCost(cluster_final_data)
    print("With K={}".format(k))
    print("Within Set Sum of Squared Errors = " + str(wssse))
    print('--'*30)

With K=2
Within Set Sum of Squared Errors = 601.7707512676687
------------------------------------------------------------
With K=3
Within Set Sum of Squared Errors = 434.7550730848762
------------------------------------------------------------
With K=4
Within Set Sum of Squared Errors = 267.13361168878964
------------------------------------------------------------
With K=5
Within Set Sum of Squared Errors = 248.00282758265615
------------------------------------------------------------
With K=6
Within Set Sum of Squared Errors = 224.85463506441042
------------------------------------------------------------
With K=7
Within Set Sum of Squared Errors = 205.7238509582768
------------------------------------------------------------
With K=8
Within Set Sum of Squared Errors = 191.79004199978718
------------------------------------------------------------


In [22]:
model_k3.transform(cluster_final_data).groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         2|   88|
|         0|   79|
+----------+-----+



In [23]:
model_k2.transform(cluster_final_data).groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         0|  167|
+----------+-----+

