In [1]:
!pip install -q findspark
!pip install -q pyspark

import findspark
findspark.init()

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('cluster').getOrCreate()

[K     |████████████████████████████████| 281.4 MB 35 kB/s 
[K     |████████████████████████████████| 198 kB 54.2 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [2]:
df = spark.read.csv('hack_data.csv',header=True, inferSchema=True)
df.show()

+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+
|Session_Connection_Time|Bytes Transferred|Kali_Trace_Used|Servers_Corrupted|Pages_Corrupted|            Location|WPM_Typing_Speed|
+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+
|                    8.0|           391.09|              1|             2.96|            7.0|            Slovenia|           72.37|
|                   20.0|           720.99|              0|             3.04|            9.0|British Virgin Is...|           69.08|
|                   31.0|           356.32|              1|             3.71|            8.0|             Tokelau|           70.58|
|                    2.0|           228.08|              1|             2.48|            8.0|             Bolivia|            70.8|
|                   20.0|            408.5|              0|             3.57

In [3]:
#droping location column, because it's non relevant and string problem
df = df.drop('Location')
df.show()

+-----------------------+-----------------+---------------+-----------------+---------------+----------------+
|Session_Connection_Time|Bytes Transferred|Kali_Trace_Used|Servers_Corrupted|Pages_Corrupted|WPM_Typing_Speed|
+-----------------------+-----------------+---------------+-----------------+---------------+----------------+
|                    8.0|           391.09|              1|             2.96|            7.0|           72.37|
|                   20.0|           720.99|              0|             3.04|            9.0|           69.08|
|                   31.0|           356.32|              1|             3.71|            8.0|           70.58|
|                    2.0|           228.08|              1|             2.48|            8.0|            70.8|
|                   20.0|            408.5|              0|             3.57|            8.0|           71.28|
|                    1.0|           390.69|              1|             2.79|            9.0|           71.57|
|

In [4]:
from pyspark.ml.feature import VectorAssembler
vec_assembler = VectorAssembler(inputCols = df.columns, outputCol='features')
final_data = vec_assembler.transform(df)

In [5]:
from pyspark.ml.feature import StandardScaler
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=False)
scalerModel = scaler.fit(final_data)
final_data = scalerModel.transform(final_data)

In [6]:
final_data.show()


+-----------------------+-----------------+---------------+-----------------+---------------+----------------+--------------------+--------------------+
|Session_Connection_Time|Bytes Transferred|Kali_Trace_Used|Servers_Corrupted|Pages_Corrupted|WPM_Typing_Speed|            features|      scaledFeatures|
+-----------------------+-----------------+---------------+-----------------+---------------+----------------+--------------------+--------------------+
|                    8.0|           391.09|              1|             2.96|            7.0|           72.37|[8.0,391.09,1.0,2...|[0.56785108466505...|
|                   20.0|           720.99|              0|             3.04|            9.0|           69.08|[20.0,720.99,0.0,...|[1.41962771166263...|
|                   31.0|           356.32|              1|             3.71|            8.0|           70.58|[31.0,356.32,1.0,...|[2.20042295307707...|
|                    2.0|           228.08|              1|             2.48|     

In [26]:
'''#Visualizing the silhouette scores in a plot
#import matplotlib.pyplot as plt
#fig, ax = plt.subplots(1,1, figsize =(8,6))
ax.plot(range(2,4),silhouette)
ax.set_xlabel(‘k’)
ax.set_ylabel(‘cost’)'''

'#Visualizing the silhouette scores in a plot\n#import matplotlib.pyplot as plt\n#fig, ax = plt.subplots(1,1, figsize =(8,6))\nax.plot(range(2,4),silhouette)\nax.set_xlabel(‘k’)\nax.set_ylabel(‘cost’)'

#Strategie
Etudier le dataset avec 2 nombres de cluster différent, en effet la meilleur probabilité de succès de prédiction du modèle K-means entre 2 et 3 clusters déterminera quel nombre de clusters est le plus adapté aux données, et donc le nombre de hackers. Le nombre de clusters qui marche le mieux en terme de prédiction sera le plus proche du "elbow" de la courbe. 

#2 clusters with k-means --> 2 hackers

In [27]:
from pyspark.ml.clustering import KMeans
kmeans = KMeans(featuresCol='scaledFeatures', k=2)
model = kmeans.fit(final_data)

In [28]:
w_set_sse = model.clusterCenters()
w_set_sse

[array([1.26023837, 1.31829808, 0.99280765, 1.36491885, 2.5625043 ,
        5.26676612]),
 array([2.99991988, 2.92319035, 1.05261534, 3.20390443, 4.51321315,
        3.28474   ])]

In [29]:
model.transform(final_data).select("features","prediction").show()

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|[8.0,391.09,1.0,2...|         0|
|[20.0,720.99,0.0,...|         0|
|[31.0,356.32,1.0,...|         0|
|[2.0,228.08,1.0,2...|         0|
|[20.0,408.5,0.0,3...|         0|
|[1.0,390.69,1.0,2...|         0|
|[18.0,342.97,1.0,...|         0|
|[22.0,101.61,1.0,...|         0|
|[15.0,275.53,1.0,...|         0|
|[12.0,424.83,1.0,...|         0|
|[15.0,249.09,1.0,...|         0|
|[32.0,242.48,0.0,...|         0|
|[23.0,514.54,0.0,...|         0|
|[9.0,284.77,0.0,3...|         0|
|[27.0,779.25,1.0,...|         0|
|[12.0,307.31,1.0,...|         0|
|[21.0,355.94,1.0,...|         0|
|[10.0,372.65,0.0,...|         0|
|[20.0,347.23,1.0,...|         0|
|[22.0,456.57,0.0,...|         0|
+--------------------+----------+
only showing top 20 rows



In [30]:
from pyspark.ml.evaluation import ClusteringEvaluator
evaluator = ClusteringEvaluator()
predictions = model.transform(final_data)


La probabilité de prédiction est de 66% pour 2 hackers


In [31]:
predictions.groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         0|  167|
+----------+-----+



# 3 clusters --> 3 hackers

In [32]:
from pyspark.ml.clustering import KMeans
kmeans3 = KMeans(featuresCol='scaledFeatures', k=3)
model2 = kmeans3.fit(final_data)

w_set_sse2 = model2.clusterCenters()
w_set_sse2

[array([1.21780112, 1.37901802, 1.99757683, 1.37198977, 2.55237797,
        5.29152222]),
 array([2.99991988, 2.92319035, 1.05261534, 3.20390443, 4.51321315,
        3.28474   ]),
 array([1.30217042, 1.25830099, 0.        , 1.35793211, 2.57251009,
        5.24230473])]

In [33]:
model2.transform(final_data).select("features","prediction").show()

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|[8.0,391.09,1.0,2...|         0|
|[20.0,720.99,0.0,...|         2|
|[31.0,356.32,1.0,...|         0|
|[2.0,228.08,1.0,2...|         0|
|[20.0,408.5,0.0,3...|         2|
|[1.0,390.69,1.0,2...|         0|
|[18.0,342.97,1.0,...|         0|
|[22.0,101.61,1.0,...|         0|
|[15.0,275.53,1.0,...|         0|
|[12.0,424.83,1.0,...|         0|
|[15.0,249.09,1.0,...|         0|
|[32.0,242.48,0.0,...|         2|
|[23.0,514.54,0.0,...|         2|
|[9.0,284.77,0.0,3...|         2|
|[27.0,779.25,1.0,...|         0|
|[12.0,307.31,1.0,...|         0|
|[21.0,355.94,1.0,...|         0|
|[10.0,372.65,0.0,...|         2|
|[20.0,347.23,1.0,...|         0|
|[22.0,456.57,0.0,...|         2|
+--------------------+----------+
only showing top 20 rows



In [34]:
from pyspark.ml.evaluation import ClusteringEvaluator
evaluator2 = ClusteringEvaluator()
predictions2 = model2.transform(final_data)


In [35]:
predictions.groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         0|  167|
+----------+-----+



La probabilité de succès des prédictions est de 33% avec k=3

#conclusion
il y a 2 hackers car k=2 est la plus proche du "elbow" de la courbe. Le clustering des données en 2 groupes fait plus de sens que en 3 groupes, suggérant seulement 2 hackers.