# Import libraries

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("KMeans Cluster Model").getOrCreate()

In [23]:
from pyspark.ml.clustering import KMeans, LDA, BisectingKMeans, GaussianMixture, PowerIterationClustering
from pyspark.ml.feature import VectorAssembler, VectorIndexer
from pyspark.ml.feature import StandardScaler
from pyspark.ml.evaluation import ClusteringEvaluator

# Load and verify data

In [3]:
data = spark.read.csv('hack_data.csv',header = True, inferSchema = True)

In [4]:
data.printSchema()

root
 |-- Session_Connection_Time: double (nullable = true)
 |-- Bytes Transferred: double (nullable = true)
 |-- Kali_Trace_Used: integer (nullable = true)
 |-- Servers_Corrupted: double (nullable = true)
 |-- Pages_Corrupted: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- WPM_Typing_Speed: double (nullable = true)



In [5]:
data.head(3)

[Row(Session_Connection_Time=8.0, Bytes Transferred=391.09, Kali_Trace_Used=1, Servers_Corrupted=2.96, Pages_Corrupted=7.0, Location='Slovenia', WPM_Typing_Speed=72.37),
 Row(Session_Connection_Time=20.0, Bytes Transferred=720.99, Kali_Trace_Used=0, Servers_Corrupted=3.04, Pages_Corrupted=9.0, Location='British Virgin Islands', WPM_Typing_Speed=69.08),
 Row(Session_Connection_Time=31.0, Bytes Transferred=356.32, Kali_Trace_Used=1, Servers_Corrupted=3.71, Pages_Corrupted=8.0, Location='Tokelau', WPM_Typing_Speed=70.58)]

In [6]:
for item in data.head(1)[0]:
    print(item)

8.0
391.09
1
2.96
7.0
Slovenia
72.37


In [7]:
data.describe().show()

+-------+-----------------------+------------------+------------------+-----------------+------------------+-----------+------------------+
|summary|Session_Connection_Time| Bytes Transferred|   Kali_Trace_Used|Servers_Corrupted|   Pages_Corrupted|   Location|  WPM_Typing_Speed|
+-------+-----------------------+------------------+------------------+-----------------+------------------+-----------+------------------+
|  count|                    334|               334|               334|              334|               334|        334|               334|
|   mean|     30.008982035928145| 607.2452694610777|0.5119760479041916|5.258502994011977|10.838323353293413|       null|57.342395209580864|
| stddev|     14.088200614636158|286.33593163576757|0.5006065264451406| 2.30190693339697|  3.06352633036022|       null| 13.41106336843464|
|    min|                    1.0|              10.0|                 0|              1.0|               6.0|Afghanistan|              40.0|
|    max|           

In [8]:
data.columns

['Session_Connection_Time',
 'Bytes Transferred',
 'Kali_Trace_Used',
 'Servers_Corrupted',
 'Pages_Corrupted',
 'Location',
 'WPM_Typing_Speed']

# Data Preprocessing

In [9]:
assembler = VectorAssembler(inputCols =['Session_Connection_Time',
 'Bytes Transferred',
 'Kali_Trace_Used',
 'Servers_Corrupted',
 'Pages_Corrupted',
 'WPM_Typing_Speed'],outputCol='features')
final_data = assembler.transform(data)

In [10]:
final_data.head(1)

[Row(Session_Connection_Time=8.0, Bytes Transferred=391.09, Kali_Trace_Used=1, Servers_Corrupted=2.96, Pages_Corrupted=7.0, Location='Slovenia', WPM_Typing_Speed=72.37, features=DenseVector([8.0, 391.09, 1.0, 2.96, 7.0, 72.37]))]

In [11]:
final_data.printSchema()

root
 |-- Session_Connection_Time: double (nullable = true)
 |-- Bytes Transferred: double (nullable = true)
 |-- Kali_Trace_Used: integer (nullable = true)
 |-- Servers_Corrupted: double (nullable = true)
 |-- Pages_Corrupted: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- WPM_Typing_Speed: double (nullable = true)
 |-- features: vector (nullable = true)



In [12]:
scaler = StandardScaler(inputCol='features',outputCol='scaledfeatures')
final_data = scaler.fit(final_data).transform(final_data)
final_data.head(1)

[Row(Session_Connection_Time=8.0, Bytes Transferred=391.09, Kali_Trace_Used=1, Servers_Corrupted=2.96, Pages_Corrupted=7.0, Location='Slovenia', WPM_Typing_Speed=72.37, features=DenseVector([8.0, 391.09, 1.0, 2.96, 7.0, 72.37]), scaledfeatures=DenseVector([0.5679, 1.3658, 1.9976, 1.2859, 2.2849, 5.3963]))]

# Train and Test data

In [13]:
train_data,test_data = final_data.randomSplit([0.7,0.3])

In [14]:
train_data.show(2)

+-----------------------+-----------------+---------------+-----------------+---------------+----------------+----------------+--------------------+--------------------+
|Session_Connection_Time|Bytes Transferred|Kali_Trace_Used|Servers_Corrupted|Pages_Corrupted|        Location|WPM_Typing_Speed|            features|      scaledfeatures|
+-----------------------+-----------------+---------------+-----------------+---------------+----------------+----------------+--------------------+--------------------+
|                    1.0|           390.69|              1|             2.79|            9.0|Marshall Islands|           71.57|[1.0,390.69,1.0,2...|[0.07098138558313...|
|                    2.0|           228.08|              1|             2.48|            8.0|         Bolivia|            70.8|[2.0,228.08,1.0,2...|[0.14196277116626...|
+-----------------------+-----------------+---------------+-----------------+---------------+----------------+----------------+--------------------+--

In [15]:
test_data.show(2)

+-----------------------+-----------------+---------------+-----------------+---------------+-----------+----------------+--------------------+--------------------+
|Session_Connection_Time|Bytes Transferred|Kali_Trace_Used|Servers_Corrupted|Pages_Corrupted|   Location|WPM_Typing_Speed|            features|      scaledfeatures|
+-----------------------+-----------------+---------------+-----------------+---------------+-----------+----------------+--------------------+--------------------+
|                    2.0|            343.1|              1|             2.74|            8.0|    Albania|           70.64|[2.0,343.1,1.0,2....|[0.14196277116626...|
|                    6.0|           370.45|              1|             2.72|            8.0|Philippines|           68.33|[6.0,370.45,1.0,2...|[0.42588831349878...|
+-----------------------+-----------------+---------------+-----------------+---------------+-----------+----------------+--------------------+--------------------+
only showi

# Build and Evaluate Model 

### K-means

In [16]:
classifier = KMeans(k= 2 ,featuresCol='scaledfeatures')
model = classifier.fit(train_data)
predictions = model.transform(test_data)
print(ClusteringEvaluator().evaluate(predictions))

0.640031992469459


In [17]:
predictions.groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|   53|
|         0|   52|
+----------+-----+



In [18]:
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:    
    print(center)

Cluster Centers: 
[1.24927239 1.3135289  0.8858819  1.36419975 2.56595463 5.26858977]
[2.99367107 2.93449086 1.05135623 3.20744749 4.52408357 3.29363445]


In [19]:
classifier = KMeans(k= 3 ,featuresCol='scaledfeatures')
model = classifier.fit(train_data)
predictions = model.transform(test_data)
print(ClusteringEvaluator().evaluate(predictions))

0.3041242586860749


In [20]:
predictions.groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|   25|
|         2|   28|
|         0|   52|
+----------+-----+



In [21]:
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:    
    print(center)

Cluster Centers: 
[1.24927239 1.3135289  0.8858819  1.36419975 2.56595463 5.26858977]
[2.94835644 2.88876049 0.         3.19300485 4.55176256 3.32260227]
[3.03445423 2.9756482  1.99757683 3.22044587 4.49917247 3.26756341]


#### There were 2 hackers

### LDA Model

In [24]:
# Build and Train Model
classifier = LDA(k=10, maxIter=10)
model = classifier.fit(train_data)
ll = model.logLikelihood(train_data)
print("The lower bound on the log likelihood of the entire corpus: " + str(ll))

The lower bound on the log likelihood of the entire corpus: -92543.57194332496


In [25]:
lp = model.logPerplexity(train_data)
print("The upper bound on perplexity: " + str(lp))

The upper bound on perplexity: 0.5683371880795841


In [26]:
# Describe topics.
topics = model.describeTopics(3)
print("The topics described by their top-weighted terms:")
topics.show(truncate=False)

The topics described by their top-weighted terms:
+-----+-----------+----------------------------------------------------------------+
|topic|termIndices|termWeights                                                     |
+-----+-----------+----------------------------------------------------------------+
|0    |[1, 5, 0]  |[0.8778912208582277, 0.061600159303839164, 0.038745638419526586]|
|1    |[3, 0, 5]  |[0.19323920813914042, 0.16992826297847025, 0.1651360782002795]  |
|2    |[5, 3, 4]  |[0.18318189460712053, 0.17863853257783938, 0.17485553307440463] |
|3    |[1, 2, 4]  |[0.18181948937914433, 0.17692876753903303, 0.16654630287108652] |
|4    |[2, 1, 0]  |[0.1885405071506146, 0.1737984959091707, 0.17144227653864472]   |
|5    |[4, 5, 3]  |[0.18304237170145105, 0.17938902163213982, 0.17897332919055967] |
|6    |[2, 1, 4]  |[0.18278283845243729, 0.17589499274117384, 0.16974439122250504] |
|7    |[4, 3, 2]  |[0.19127979639176304, 0.16993391061108637, 0.16782528885448667] |
|8    |[3, 5, 0

In [27]:
# Make predictions
predictions = model.transform(test_data)
predictions.show(3)

+-----------------------+-----------------+---------------+-----------------+---------------+------------+----------------+--------------------+--------------------+--------------------+
|Session_Connection_Time|Bytes Transferred|Kali_Trace_Used|Servers_Corrupted|Pages_Corrupted|    Location|WPM_Typing_Speed|            features|      scaledfeatures|   topicDistribution|
+-----------------------+-----------------+---------------+-----------------+---------------+------------+----------------+--------------------+--------------------+--------------------+
|                    2.0|            343.1|              1|             2.74|            8.0|     Albania|           70.64|[2.0,343.1,1.0,2....|[0.14196277116626...|[0.87936689639428...|
|                    6.0|           370.45|              1|             2.72|            8.0| Philippines|           68.33|[6.0,370.45,1.0,2...|[0.42588831349878...|[0.89496041445426...|
|                    8.0|            64.83|              0|      

### Bisecting k-means

In [28]:
# Build and Train Model
classifier = BisectingKMeans().setK(2).setSeed(1)
model = classifier.fit(train_data)

In [29]:
# Make predictions
predictions = model.transform(test_data)
# Evaluate clustering by computing Silhouette score
print(ClusteringEvaluator().evaluate(predictions))

0.7792054100391903


In [30]:
# Shows the result
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:    
    print(center)

Cluster Centers: 
[ 19.75652174 358.74234783   0.45217391   3.61208696   8.51304348
  67.77443478]
[4.00000000e+01 8.57770702e+02 5.17543860e-01 6.90728070e+00
 1.32017544e+01 4.70793860e+01]


In [31]:
predictions.show(3)

+-----------------------+-----------------+---------------+-----------------+---------------+------------+----------------+--------------------+--------------------+----------+
|Session_Connection_Time|Bytes Transferred|Kali_Trace_Used|Servers_Corrupted|Pages_Corrupted|    Location|WPM_Typing_Speed|            features|      scaledfeatures|prediction|
+-----------------------+-----------------+---------------+-----------------+---------------+------------+----------------+--------------------+--------------------+----------+
|                    2.0|            343.1|              1|             2.74|            8.0|     Albania|           70.64|[2.0,343.1,1.0,2....|[0.14196277116626...|         0|
|                    6.0|           370.45|              1|             2.72|            8.0| Philippines|           68.33|[6.0,370.45,1.0,2...|[0.42588831349878...|         0|
|                    8.0|            64.83|              0|             3.58|            8.0|Saudi Arabia|         

### Gaussian Mixture Model (GMM)

In [32]:
# Build and Train Model
classifier = GaussianMixture().setK(2).setSeed(538009335)
model = classifier.fit(train_data)

In [33]:
# Make predictions
predictions = model.transform(test_data)
# Evaluate clustering by computing Silhouette score
print(ClusteringEvaluator().evaluate(predictions))

0.640031992469459


In [34]:
predictions.show(3)

+-----------------------+-----------------+---------------+-----------------+---------------+------------+----------------+--------------------+--------------------+--------------------+----------+
|Session_Connection_Time|Bytes Transferred|Kali_Trace_Used|Servers_Corrupted|Pages_Corrupted|    Location|WPM_Typing_Speed|            features|      scaledfeatures|         probability|prediction|
+-----------------------+-----------------+---------------+-----------------+---------------+------------+----------------+--------------------+--------------------+--------------------+----------+
|                    2.0|            343.1|              1|             2.74|            8.0|     Albania|           70.64|[2.0,343.1,1.0,2....|[0.14196277116626...|[0.99999999880620...|         0|
|                    6.0|           370.45|              1|             2.72|            8.0| Philippines|           68.33|[6.0,370.45,1.0,2...|[0.42588831349878...|[0.99999999827283...|         0|
|         