In [173]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('hack_find').getOrCreate()

In [174]:
from pyspark.ml.clustering import KMeans

# Loads data.
dataset = spark.read.csv("hack_data.csv",header=True,inferSchema=True)

In [175]:
dataset.head()

Row(Session_Connection_Time=8.0, Bytes Transferred=391.09, Kali_Trace_Used=1, Servers_Corrupted=2.96, Pages_Corrupted=7.0, Location='Slovenia', WPM_Typing_Speed=72.37)

In [176]:
dataset.describe().show()

+-------+-----------------------+------------------+------------------+-----------------+------------------+------------------+
|summary|Session_Connection_Time| Bytes Transferred|   Kali_Trace_Used|Servers_Corrupted|   Pages_Corrupted|  WPM_Typing_Speed|
+-------+-----------------------+------------------+------------------+-----------------+------------------+------------------+
|  count|                    334|               334|               334|              334|               334|               334|
|   mean|     30.008982035928145| 607.2452694610777|0.5119760479041916|5.258502994011977|10.838323353293413|57.342395209580864|
| stddev|     14.088200614636158|286.33593163576757|0.5006065264451406| 2.30190693339697|  3.06352633036022| 13.41106336843464|
|    min|                    1.0|              10.0|                 0|              1.0|               6.0|              40.0|
|    max|                   60.0|            1330.5|                 1|             10.0|              1

In [177]:
df.columns

Index(['Session_Connection_Time', 'Bytes Transferred', 'Kali_Trace_Used',
       'Servers_Corrupted', 'Pages_Corrupted', 'Location', 'WPM_Typing_Speed'],
      dtype='object')

In [178]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [179]:
feat_cols = ['Session_Connection_Time', 'Bytes Transferred', 'Kali_Trace_Used',
             'Servers_Corrupted', 'Pages_Corrupted','WPM_Typing_Speed']

In [180]:
vec_assembler = VectorAssembler(inputCols = feat_cols, outputCol='features')

In [181]:
final_data = vec_assembler.transform(dataset)

In [182]:
from pyspark.ml.feature import StandardScaler

In [183]:
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=False)

In [184]:
# Compute summary statistics by fitting the StandardScaler
scalerModel = scaler.fit(final_data)

In [185]:
# Normalize each feature to have unit standard deviation.
cluster_final_data = scalerModel.transform(final_data)

In [187]:
kmeans3 = KMeans(featuresCol='scaledFeatures',k=3)
kmeans2 = KMeans(featuresCol='scaledFeatures',k=2)

In [188]:
model_k3 = kmeans3.fit(cluster_final_data)
model_k2 = kmeans2.fit(cluster_final_data)

In [189]:
wssse_k3 = model_k3.computeCost(cluster_final_data)
wssse_k2 = model_k2.computeCost(cluster_final_data)

In [197]:
print("With K=3")
print("Within Set Sum of Squared Errors = " + str(wssse_k3))
print('--'*30)
print("With K=2")
print("Within Set Sum of Squared Errors = " + str(wssse_k2))

With K=3
Within Set Sum of Squared Errors = 434.1492898715845
------------------------------------------------------------
With K=2
Within Set Sum of Squared Errors = 601.7707512676716


In [204]:
for k in range(2,9):
    kmeans = KMeans(featuresCol='scaledFeatures',k=k)
    model = kmeans.fit(cluster_final_data)
    wssse = model.computeCost(cluster_final_data)
    print("With K={}".format(k))
    print("Within Set Sum of Squared Errors = " + str(wssse))
    print('--'*30)

With K=2
Within Set Sum of Squared Errors = 601.7707512676716
------------------------------------------------------------
With K=3
Within Set Sum of Squared Errors = 434.1492898715845
------------------------------------------------------------
With K=4
Within Set Sum of Squared Errors = 267.1336116887891
------------------------------------------------------------
With K=5
Within Set Sum of Squared Errors = 246.44966476509273
------------------------------------------------------------
With K=6
Within Set Sum of Squared Errors = 227.11523291903796
------------------------------------------------------------
With K=7
Within Set Sum of Squared Errors = 208.95468005311722
------------------------------------------------------------
With K=8
Within Set Sum of Squared Errors = 196.20268561839072
------------------------------------------------------------


In [205]:
model_k3.transform(cluster_final_data).groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|   83|
|         2|   84|
|         0|  167|
+----------+-----+



In [206]:
model_k2.transform(cluster_final_data).groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         0|  167|
+----------+-----+



________