In [0]:
# --- Input source -----------------------------------------------------------
file_type = "csv"
file_location = "/FileStore/tables/hack_data-10.csv"

# --- Reader settings (applied as CSV options; ignored for other formats) ----
delimiter = ","
first_row_is_header = "true"
infer_schema = "true"

# Build the reader step by step and load the file into a DataFrame.
df = (
    spark.read.format(file_type)
    .option("inferSchema", infer_schema)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
    .load(file_location)
)

# Render the loaded data in the notebook.
display(df)

Session_Connection_Time,Bytes Transferred,Kali_Trace_Used,Servers_Corrupted,Pages_Corrupted,Location,WPM_Typing_Speed
8.0,391.09,1,2.96,7.0,Slovenia,72.37
20.0,720.99,0,3.04,9.0,British Virgin Islands,69.08
31.0,356.32,1,3.71,8.0,Tokelau,70.58
2.0,228.08,1,2.48,8.0,Bolivia,70.8
20.0,408.5,0,3.57,8.0,Iraq,71.28
1.0,390.69,1,2.79,9.0,Marshall Islands,71.57
18.0,342.97,1,5.1,7.0,Georgia,72.32
22.0,101.61,1,3.03,7.0,Timor-Leste,72.03
15.0,275.53,1,3.53,8.0,Palestinian Territory,70.17
12.0,424.83,1,2.53,8.0,Bangladesh,69.99


In [0]:
# List the column names of the loaded DataFrame.
df.columns

Out[2]: ['Session_Connection_Time',
 'Bytes Transferred',
 'Kali_Trace_Used',
 'Servers_Corrupted',
 'Pages_Corrupted',
 'Location',
 'WPM_Typing_Speed']

In [0]:
from pyspark.sql import SparkSession

In [0]:
# Get (or create) the Spark session used by this notebook, labelled 'clust'.
spark = (
    SparkSession.builder
    .appName('clust')
    .getOrCreate()
)

In [0]:
from pyspark.ml.clustering import KMeans

In [0]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler

In [0]:
# Pack every numeric column into a single 'features' vector column.
# 'Location' is not listed in inputCols, so it is not part of the vector.
vc = VectorAssembler(
    inputCols=[
        'Session_Connection_Time',
        'Bytes Transferred',
        'Kali_Trace_Used',
        'Servers_Corrupted',
        'Pages_Corrupted',
        'WPM_Typing_Speed',
    ],
    outputCol='features',
)

In [0]:
# Apply the assembler to the raw data; the result carries the extra
# 'features' vector column alongside the original columns.
ft = vc.transform(df)

In [0]:
# Scaler that reads the assembled 'features' vector and writes 'sc_feat'.
sc = StandardScaler(
    inputCol='features',
    outputCol='sc_feat',
)

In [0]:
# Fit the scaler on the assembled vectors and append the scaled 'sc_feat'
# column to `ft`.
#
# This cell overwrites `ft`, so on a second execution `ft` already contains
# 'sc_feat' and the scaler would fail because its output column exists.
# Dropping 'sc_feat' first (a no-op when the column is absent) makes the
# cell safe to re-run.
ft_unscaled = ft.drop('sc_feat')
ft = sc.fit(ft_unscaled).transform(ft_unscaled)

In [0]:
# Check that both the assembled and the scaled vector columns are present.
ft.columns

Out[16]: ['Session_Connection_Time',
 'Bytes Transferred',
 'Kali_Trace_Used',
 'Servers_Corrupted',
 'Pages_Corrupted',
 'Location',
 'WPM_Typing_Speed',
 'features',
 'sc_feat']

In [0]:
# Print column names and types of the transformed DataFrame.
ft.printSchema()

root
 |-- Session_Connection_Time: double (nullable = true)
 |-- Bytes Transferred: double (nullable = true)
 |-- Kali_Trace_Used: integer (nullable = true)
 |-- Servers_Corrupted: double (nullable = true)
 |-- Pages_Corrupted: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- WPM_Typing_Speed: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- sc_feat: vector (nullable = true)



In [0]:
# Two candidate clusterings over the scaled features: k=2 and k=3.
# KMeans starts from randomly chosen centres, so the seed is pinned to make
# the cluster assignments reproducible across kernel restarts.
k2 = KMeans(featuresCol='sc_feat', k=2, seed=42)
k3 = KMeans(featuresCol='sc_feat', k=3, seed=42)

In [0]:
# Fit the k=2 estimator using only the scaled feature column.
model_1 = k2.fit(ft.select('sc_feat'))

In [0]:
# Apply the fitted k=2 model to the same scaled features; the result is
# used below via its 'prediction' column.
pre = model_1.transform(ft.select('sc_feat'))

In [0]:
# Cluster sizes for the k=2 model.
# Grouping needs only 'prediction', so no prior select is required; ordering
# by cluster id keeps the printed table stable from run to run.
pre.groupBy('prediction').count().orderBy('prediction').show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         0|  167|
+----------+-----+



In [0]:
# Fit the k=3 estimator using only the scaled feature column.
model_2 = k3.fit(ft.select('sc_feat'))

In [0]:
# Apply the fitted k=3 model to the same scaled features; the result is
# used below via its 'prediction' column.
pro = model_2.transform(ft.select('sc_feat'))

In [0]:
# Cluster sizes for the k=3 model.
# Grouping needs only 'prediction', so no prior select is required; ordering
# by cluster id keeps the printed table stable from run to run.
pro.groupBy('prediction').count().orderBy('prediction').show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|   83|
|         2|   84|
|         0|  167|
+----------+-----+

