# Import necessary libraries

In [10]:
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans

In [3]:
# Reuse the active SparkSession if one already exists; otherwise create it.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [4]:
# Location of the server-utilization JSON dataset.
# NOTE(review): absolute path (looks like a Colab default) — confirm it exists
# wherever this notebook is run.
file_path = "/content/utilization.json"

# Read the JSON file into a DataFrame; no schema is supplied, so Spark infers it.
df = spark.read.json(file_path)

In [5]:
# Preview the loaded data: first 20 rows, long values truncated
# (these are the defaults of a bare show()).
df.show(n=20, truncate=True)

+---------------+-------------------+-----------+---------+-------------+
|cpu_utilization|     event_datetime|free_memory|server_id|session_count|
+---------------+-------------------+-----------+---------+-------------+
|           0.57|03/05/2019 08:06:14|       0.51|      100|           47|
|           0.47|03/05/2019 08:11:14|       0.62|      100|           43|
|           0.56|03/05/2019 08:16:14|       0.57|      100|           62|
|           0.57|03/05/2019 08:21:14|       0.56|      100|           50|
|           0.35|03/05/2019 08:26:14|       0.46|      100|           43|
|           0.41|03/05/2019 08:31:14|       0.58|      100|           48|
|           0.57|03/05/2019 08:36:14|       0.35|      100|           58|
|           0.41|03/05/2019 08:41:14|        0.4|      100|           58|
|           0.53|03/05/2019 08:46:14|       0.35|      100|           62|
|           0.51|03/05/2019 08:51:14|        0.6|      100|           45|
|           0.32|03/05/2019 08:56:14| 

In [11]:
#We want to Group based on cpu_utilization, free_memory and Session_Count
#To do this we need a Vector
#It's a storage system used in machine learning to store values. In our case (cpu_utilization, free_memory and Session_Count)


VectorAssembler = VectorAssembler(inputCols = ["cpu_utilization","free_memory", "session_count"], outputCol = "features")

In [15]:
#vectorized cluster df
#We have our utilization dataframe  df
#We want the vectorAssembler to take the  "cpu_utilization","free_memory", "session_count"
#combine them into a single vector and put that vector into a single column called features

vcluster_df = VectorAssembler.transform(df)

In [18]:
# The new "features" column holds [cpu_utilization, free_memory, session_count]
# for each row; the original columns are kept alongside it.
vcluster_df.show(n=20, truncate=True)

+---------------+-------------------+-----------+---------+-------------+----------------+
|cpu_utilization|     event_datetime|free_memory|server_id|session_count|        features|
+---------------+-------------------+-----------+---------+-------------+----------------+
|           0.57|03/05/2019 08:06:14|       0.51|      100|           47|[0.57,0.51,47.0]|
|           0.47|03/05/2019 08:11:14|       0.62|      100|           43|[0.47,0.62,43.0]|
|           0.56|03/05/2019 08:16:14|       0.57|      100|           62|[0.56,0.57,62.0]|
|           0.57|03/05/2019 08:21:14|       0.56|      100|           50|[0.57,0.56,50.0]|
|           0.35|03/05/2019 08:26:14|       0.46|      100|           43|[0.35,0.46,43.0]|
|           0.41|03/05/2019 08:31:14|       0.58|      100|           48|[0.41,0.58,48.0]|
|           0.57|03/05/2019 08:36:14|       0.35|      100|           58|[0.57,0.35,58.0]|
|           0.41|03/05/2019 08:41:14|        0.4|      100|           58| [0.41,0.4,58.0]|

# KMeans Algorithm


In [21]:
# Create the k-means estimator with k = 3, i.e. ask for three clusters.
Kmeans = KMeans(k=3)

In [22]:
# Fix the random seed so repeated runs start from the same initialisation.
# setSeed() updates the estimator in place and returns that same object, so
# `kmeans` is only a second name for `Kmeans` (which the fit cell uses).
Kmeans.setSeed(1)
kmeans = Kmeans

In [26]:
# fit() runs the k-means algorithm on the input DataFrame and returns the
# trained model.
# NOTE(review): the features are not scaled. In the preview above session_count
# is in the tens while the other two columns are between 0 and 1, so it likely
# dominates the distance computation — consider scaling before clustering.
Kmodel = Kmeans.fit(dataset=vcluster_df)

In [31]:
# Each center is a vector whose entries follow the assembled feature order:
# "cpu_utilization", "free_memory", "session_count"
Kmodel.clusterCenters()

[array([ 0.7625981 ,  0.23769654, 91.33678898]),
 array([ 0.47415607,  0.52414244, 48.88811904]),
 array([ 0.69488115,  0.30503342, 71.94338453])]