In [None]:
!pip install pyspark

In [None]:
from pyspark.sql import SparkSession

In [None]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("num_of_hackers")
    .getOrCreate()
)

In [None]:
# Load the CSV; the first row is the header and column types are inferred.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("../input/sample-hack-data/hack_data.csv")
)

In [None]:
# Preview the first two rows of the loaded frame.
df.show(n=2)

In [None]:
# Summary statistics, rounded to 2 decimals.
# Spark's describe() returns every statistic as a *string*, so calling round()
# on the pandas copy directly leaves the values untouched. Convert to numbers
# first, round, and fall back to the original text wherever a value is not
# numeric (e.g. min/max of a string column).
import pandas as pd

summary_raw = df.describe().toPandas().set_index("summary")
summary_num = summary_raw.apply(pd.to_numeric, errors="coerce").round(2)
summary_num.where(summary_num.notna(), summary_raw)

In [None]:
from pyspark.ml.feature import VectorAssembler

In [None]:
# Show the column names available in the loaded frame.
df.columns

In [None]:
# Columns that are assembled into the feature vector used for clustering.
feat_cols = [
    "Session_Connection_Time",
    "Bytes Transferred",
    "Kali_Trace_Used",
    "Servers_Corrupted",
    "Pages_Corrupted",
    "WPM_Typing_Speed",
]

In [None]:
# Packs the columns listed in feat_cols into one vector column named 'features'.
assembler = VectorAssembler(
    outputCol="features",
    inputCols=feat_cols,
)

In [None]:
# Run the assembler over the loaded frame; the result carries the 'features' vector column.
final_df = assembler.transform(df)

In [None]:
from pyspark.ml.feature import StandardScaler

In [None]:
# Scaler for the assembled 'features' column, writing to 'scaled_feat'.
# Configured with withStd=True and withMean=False, i.e. scale by the standard
# deviation but do not centre on the mean.
scaler = StandardScaler(
    withMean=False,
    withStd=True,
    inputCol="features",
    outputCol="scaled_feat",
)

In [None]:
# Fit the scaler on the assembled frame; the fitted model is applied in the next cell.
scaled_model = scaler.fit(final_df)

In [None]:
# Apply the fitted scaler; the scaled vectors land in the 'scaled_feat' column.
cluster_df = scaled_model.transform(final_df)

In [None]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

In [None]:
# Silhouette evaluator: scores the 'prediction' column against the scaled
# feature vectors using squared Euclidean distance.
# NOTE(review): the name `eval` shadows Python's builtin eval() from here on;
# it is kept because the scoring loop below calls eval.evaluate(...).
eval = ClusteringEvaluator(
    featuresCol="scaled_feat",
    predictionCol="prediction",
    distanceMeasure="squaredEuclidean",
    metricName="silhouette",
)

In [None]:
# Sweep k = 2..10: fit K-Means on the scaled features for each k and record the
# silhouette score of the resulting assignment. The list is reset on every run
# of this cell, so re-running it does not accumulate stale scores.
silhouette_score = []
print("""
Silhouette Scores for K Mean Clustering
=======================================
Model\tScore\t
=====\t=====\t
""")
for k in range(2, 11):
    # A fixed seed pins the centroid initialisation so the scores (and the
    # plot built from them) are reproducible from one run to the next.
    kmeans_algo = KMeans(featuresCol='scaled_feat', k=k, seed=42)
    kmeans_fit = kmeans_algo.fit(cluster_df)
    output = kmeans_fit.transform(cluster_df)
    # `eval` is the ClusteringEvaluator created above, not the builtin eval().
    score = eval.evaluate(output)
    silhouette_score.append(score)
    print(f"K{k}\t{round(score,2)}\t")


In [None]:
import matplotlib.pyplot as plt
# Line plot of the silhouette score for each k tried in the sweep (k = 2..10).
fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(range(2, 11), silhouette_score)
ax.set_xlabel("K")
ax.set_ylabel("Score");  # trailing ';' suppresses the text repr in the cell output