# Import libraries

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("KMeans Cluster Model").getOrCreate()

In [2]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler, VectorIndexer
from pyspark.ml.feature import StandardScaler
from pyspark.ml.evaluation import ClusteringEvaluator

# Load and verify data

In [3]:
data = spark.read.csv('hack_data.csv',header = True, inferSchema = True)

In [4]:
data.printSchema()

root
 |-- Session_Connection_Time: double (nullable = true)
 |-- Bytes Transferred: double (nullable = true)
 |-- Kali_Trace_Used: integer (nullable = true)
 |-- Servers_Corrupted: double (nullable = true)
 |-- Pages_Corrupted: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- WPM_Typing_Speed: double (nullable = true)



In [5]:
data.head(3)

[Row(Session_Connection_Time=8.0, Bytes Transferred=391.09, Kali_Trace_Used=1, Servers_Corrupted=2.96, Pages_Corrupted=7.0, Location='Slovenia', WPM_Typing_Speed=72.37),
 Row(Session_Connection_Time=20.0, Bytes Transferred=720.99, Kali_Trace_Used=0, Servers_Corrupted=3.04, Pages_Corrupted=9.0, Location='British Virgin Islands', WPM_Typing_Speed=69.08),
 Row(Session_Connection_Time=31.0, Bytes Transferred=356.32, Kali_Trace_Used=1, Servers_Corrupted=3.71, Pages_Corrupted=8.0, Location='Tokelau', WPM_Typing_Speed=70.58)]

In [6]:
for item in data.head(1)[0]:
    print(item)

8.0
391.09
1
2.96
7.0
Slovenia
72.37


In [7]:
data.describe().show()

+-------+-----------------------+------------------+------------------+-----------------+------------------+-----------+------------------+
|summary|Session_Connection_Time| Bytes Transferred|   Kali_Trace_Used|Servers_Corrupted|   Pages_Corrupted|   Location|  WPM_Typing_Speed|
+-------+-----------------------+------------------+------------------+-----------------+------------------+-----------+------------------+
|  count|                    334|               334|               334|              334|               334|        334|               334|
|   mean|     30.008982035928145| 607.2452694610777|0.5119760479041916|5.258502994011977|10.838323353293413|       null|57.342395209580864|
| stddev|     14.088200614636158|286.33593163576757|0.5006065264451406| 2.30190693339697|  3.06352633036022|       null| 13.41106336843464|
|    min|                    1.0|              10.0|                 0|              1.0|               6.0|Afghanistan|              40.0|
|    max|           

In [8]:
data.columns

['Session_Connection_Time',
 'Bytes Transferred',
 'Kali_Trace_Used',
 'Servers_Corrupted',
 'Pages_Corrupted',
 'Location',
 'WPM_Typing_Speed']

# Data Preprocessing

In [9]:
assembler = VectorAssembler(inputCols =['Session_Connection_Time',
 'Bytes Transferred',
 'Kali_Trace_Used',
 'Servers_Corrupted',
 'Pages_Corrupted',
 'WPM_Typing_Speed'],outputCol='features')
final_data = assembler.transform(data)

In [10]:
final_data.head(1)

[Row(Session_Connection_Time=8.0, Bytes Transferred=391.09, Kali_Trace_Used=1, Servers_Corrupted=2.96, Pages_Corrupted=7.0, Location='Slovenia', WPM_Typing_Speed=72.37, features=DenseVector([8.0, 391.09, 1.0, 2.96, 7.0, 72.37]))]

In [11]:
final_data.printSchema()

root
 |-- Session_Connection_Time: double (nullable = true)
 |-- Bytes Transferred: double (nullable = true)
 |-- Kali_Trace_Used: integer (nullable = true)
 |-- Servers_Corrupted: double (nullable = true)
 |-- Pages_Corrupted: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- WPM_Typing_Speed: double (nullable = true)
 |-- features: vector (nullable = true)



In [12]:
scaler = StandardScaler(inputCol='features',outputCol='scaledfeatures')
final_data = scaler.fit(final_data).transform(final_data)
final_data.head(1)

[Row(Session_Connection_Time=8.0, Bytes Transferred=391.09, Kali_Trace_Used=1, Servers_Corrupted=2.96, Pages_Corrupted=7.0, Location='Slovenia', WPM_Typing_Speed=72.37, features=DenseVector([8.0, 391.09, 1.0, 2.96, 7.0, 72.37]), scaledfeatures=DenseVector([0.5679, 1.3658, 1.9976, 1.2859, 2.2849, 5.3963]))]

# Train and Test data

In [13]:
train_data,test_data = final_data.randomSplit([0.7,0.3])

In [14]:
train_data.show(2)

+-----------------------+-----------------+---------------+-----------------+---------------+----------------+----------------+--------------------+--------------------+
|Session_Connection_Time|Bytes Transferred|Kali_Trace_Used|Servers_Corrupted|Pages_Corrupted|        Location|WPM_Typing_Speed|            features|      scaledfeatures|
+-----------------------+-----------------+---------------+-----------------+---------------+----------------+----------------+--------------------+--------------------+
|                    1.0|           390.69|              1|             2.79|            9.0|Marshall Islands|           71.57|[1.0,390.69,1.0,2...|[0.07098138558313...|
|                    2.0|           228.08|              1|             2.48|            8.0|         Bolivia|            70.8|[2.0,228.08,1.0,2...|[0.14196277116626...|
+-----------------------+-----------------+---------------+-----------------+---------------+----------------+----------------+--------------------+--

In [15]:
test_data.show(2)

+-----------------------+-----------------+---------------+-----------------+---------------+--------+----------------+--------------------+--------------------+
|Session_Connection_Time|Bytes Transferred|Kali_Trace_Used|Servers_Corrupted|Pages_Corrupted|Location|WPM_Typing_Speed|            features|      scaledfeatures|
+-----------------------+-----------------+---------------+-----------------+---------------+--------+----------------+--------------------+--------------------+
|                    2.0|            343.1|              1|             2.74|            8.0| Albania|           70.64|[2.0,343.1,1.0,2....|[0.14196277116626...|
|                    4.0|           295.79|              0|             2.75|            8.0|Maldives|           73.15|[4.0,295.79,0.0,2...|[0.28392554233252...|
+-----------------------+-----------------+---------------+-----------------+---------------+--------+----------------+--------------------+--------------------+
only showing top 2 rows



# Build and Evaluate Model 

In [16]:
classifier = KMeans(k= 2 ,featuresCol='scaledfeatures')
model = classifier.fit(train_data)
predictions = model.transform(test_data)
print(ClusteringEvaluator().evaluate(predictions))

0.6551601662052332


In [17]:
predictions.groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|   45|
|         0|   50|
+----------+-----+



In [18]:
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:    
    print(center)

Cluster Centers: 
[2.99941855 2.95258268 1.00732507 3.21725082 4.51967842 3.28019777]
[1.29453756 1.34774374 1.04790916 1.35920808 2.54180458 5.26602171]


In [19]:
classifier = KMeans(k= 3 ,featuresCol='scaledfeatures')
model = classifier.fit(train_data)
predictions = model.transform(test_data)
print(ClusteringEvaluator().evaluate(predictions))

0.26655685807090285


In [20]:
predictions.groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|   19|
|         2|   26|
|         0|   50|
+----------+-----+



In [21]:
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:    
    print(center)

Cluster Centers: 
[2.99941855 2.95258268 1.00732507 3.21725082 4.51967842 3.28019777]
[1.27766494 1.41451096 1.99757683 1.36639429 2.52466412 5.29900654]
[1.31315563 1.27406957 0.         1.35127845 2.56071819 5.22962465]


#### There were 2 hackers