In [2]:
from pyspark.sql import SparkSession

# start (or reuse) the Spark session that drives this notebook
spark = (
    SparkSession.builder
    .appName("Spark data analyst")
    .getOrCreate()
)

# load the JSON utilization data and expose it to Spark SQL as "utilization"
json_df2_path = './data/utilization.json'
df = spark.read.json(json_df2_path)
df.createOrReplaceTempView("utilization")

# preview the first rows of the dataframe
df.show()

24/11/19 18:31:14 WARN Utils: Your hostname, Doomzies-2.local resolves to a loopback address: 127.0.0.1; using 192.168.68.101 instead (on interface en0)
24/11/19 18:31:14 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/19 18:31:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/11/19 18:31:15 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
                                                                                

+---------------+-------------------+-----------+---------+-------------+
|cpu_utilization|     event_datetime|free_memory|server_id|session_count|
+---------------+-------------------+-----------+---------+-------------+
|            0.8|04/08/2019 23:37:13|       0.15|      133|           68|
|           0.84|04/08/2019 23:42:13|       0.14|      133|           66|
|           0.69|04/08/2019 23:47:13|        0.4|      133|           72|
|           0.75|04/08/2019 23:52:13|       0.18|      133|           77|
|           0.63|04/08/2019 23:57:13|       0.16|      133|           82|
|           0.72|04/09/2019 00:02:13|       0.06|      133|           78|
|            0.9|04/09/2019 00:07:13|       0.37|      133|           68|
|           0.59|04/09/2019 00:12:13|        0.4|      133|           94|
|           0.87|04/09/2019 00:17:13|       0.35|      133|           86|
|           0.83|04/09/2019 00:22:13|       0.34|      133|           92|
|           0.71|04/09/2019 00:27:13| 

24/11/19 18:31:30 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [3]:
# summary statistics (count, mean, stddev, min, max) for every column in df
feature_stats = df.describe()
feature_stats.show()

24/11/19 19:44:07 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

+-------+-------------------+-------------------+-------------------+------------------+------------------+
|summary|    cpu_utilization|     event_datetime|        free_memory|         server_id|     session_count|
+-------+-------------------+-------------------+-------------------+------------------+------------------+
|  count|             500000|             500000|             500000|            500000|            500000|
|   mean| 0.6205177400000123|               NULL|0.37912809999999764|             124.5|          69.59616|
| stddev|0.15875173872912837|               NULL|0.15830931278376217|14.430884120553253|14.850676696352865|
|    min|               0.22|03/05/2019 08:06:14|                0.0|               100|                32|
|    max|                1.0|04/09/2019 01:22:46|               0.78|               149|               105|
+-------+-------------------+-------------------+-------------------+------------------+------------------+



                                                                                

In [4]:
# restrict the summary statistics to the two features of interest
selected_features = ['cpu_utilization', 'free_memory']
df.describe(selected_features).show()



+-------+-------------------+-------------------+
|summary|    cpu_utilization|        free_memory|
+-------+-------------------+-------------------+
|  count|             500000|             500000|
|   mean| 0.6205177400000123|0.37912809999999764|
| stddev|0.15875173872912837|0.15830931278376217|
|    min|               0.22|                0.0|
|    max|                1.0|               0.78|
+-------+-------------------+-------------------+



                                                                                

In [5]:
# correlation coefficient between cpu_utilization and free_memory
cpu_memory_corr = df.stat.corr('cpu_utilization', 'free_memory')
# bare expression so the notebook displays the value
cpu_memory_corr

                                                                                

-0.4704771573080742

In [7]:
# frequent values of server_id and session_count
freq_columns = ['server_id', 'session_count']
frequent_items = df.stat.freqItems(freq_columns)
frequent_items.show()

+--------------------+-----------------------+
| server_id_freqItems|session_count_freqItems|
+--------------------+-----------------------+
|[137, 146, 101, 1...|   [92, 101, 83, 104...|
+--------------------+-----------------------+



In [9]:
# create a 5% sample (without replacement) of the utilization data;
# a fixed seed keeps the sample reproducible across notebook re-runs
df_util_sample = df.sample(withReplacement=False, fraction=0.05, seed=1)

# register the SAMPLE under the sample view name -- the original registered
# the full `df` here, so SQL against "utilization_sample" read every row
df_util_sample.createOrReplaceTempView("utilization_sample")

# number of rows that ended up in the sample
df_util_sample.count()

                                                                                

24870

In [13]:
# GROUP BY server_id: min / max / stddev of cpu_utilization per server
server_cpu_sql = (
    'SELECT server_id, min(cpu_utilization), max(cpu_utilization), stddev(cpu_utilization) '
    '           FROM utilization '
    '           GROUP BY server_id'
)
spark.sql(server_cpu_sql).show()

+---------+--------------------+--------------------+-----------------------+
|server_id|min(cpu_utilization)|max(cpu_utilization)|stddev(cpu_utilization)|
+---------+--------------------+--------------------+-----------------------+
|      136|                0.41|                 0.8|    0.11597405743182258|
|      139|                0.51|                0.91|    0.11519660023052102|
|      138|                0.24|                0.64|     0.1155955860444133|
|      142|                 0.5|                 0.9|    0.11593003726970043|
|      141|                0.37|                0.77|    0.11521504481036793|
|      134|                0.33|                0.73|    0.11566984462380653|
|      140|                0.47|                0.87|    0.11539940805020545|
|      137|                0.54|                0.94|    0.11526245077758812|
|      135|                0.31|                0.71|    0.11512655070081647|
|      133|                0.55|                0.95|    0.11534

In [16]:
# FLOOR function in SQL: map each cpu_utilization reading to an integer bucket
bucket_sql = 'SELECT server_id, FLOOR(cpu_utilization*100/10) AS bucket FROM utilization'
cpu_buckets = spark.sql(bucket_sql)
cpu_buckets.show()

+---------+------+
|server_id|bucket|
+---------+------+
|      133|     8|
|      133|     8|
|      133|     6|
|      133|     7|
|      133|     6|
|      133|     7|
|      133|     9|
|      133|     5|
|      133|     8|
|      133|     8|
|      133|     7|
|      133|     6|
|      133|     8|
|      133|     8|
|      133|     6|
|      133|     9|
|      133|     6|
|      133|     9|
|      133|     8|
|      133|     7|
+---------+------+
only showing top 20 rows



In [21]:
# per-server average cpu_utilization (window over server_id) and each
# entry's difference from that average
sql_window = (
    "SELECT event_datetime, server_id, cpu_utilization,  "
    "         avg(cpu_utilization) OVER (PARTITION BY server_id) AS avg_server_util, "
    "         cpu_utilization - avg(cpu_utilization) OVER (PARTITION BY server_id) AS delta_server_util "
    "         FROM utilization"
)

server_deltas = spark.sql(sql_window)
server_deltas.show()

+-------------------+---------+---------------+------------------+--------------------+
|     event_datetime|server_id|cpu_utilization|   avg_server_util|   delta_server_util|
+-------------------+---------+---------------+------------------+--------------------+
|03/05/2019 08:06:31|      110|           0.68|0.5537749999999892|  0.1262250000000108|
|03/05/2019 08:11:31|      110|           0.58|0.5537749999999892|0.026225000000010712|
|03/05/2019 08:16:31|      110|           0.55|0.5537749999999892|-0.00377499999998...|
|03/05/2019 08:21:31|      110|           0.63|0.5537749999999892| 0.07622500000001076|
|03/05/2019 08:26:31|      110|           0.63|0.5537749999999892| 0.07622500000001076|
|03/05/2019 08:31:31|      110|           0.71|0.5537749999999892| 0.15622500000001072|
|03/05/2019 08:36:31|      110|           0.67|0.5537749999999892| 0.11622500000001079|
|03/05/2019 08:41:31|      110|           0.55|0.5537749999999892|-0.00377499999998...|
|03/05/2019 08:46:31|      110| 

In [28]:
# sliding window: 3-row moving average (previous, current, next reading) of
# cpu_utilization per server, rounded to 2 decimals.
#
# event_datetime is a string in MM/dd/yyyy HH:mm:ss form, so ordering by the
# raw column sorts lexically (month, then day, then year) and is only
# chronological within one calendar year. Ordering by the parsed timestamp
# keeps the window chronological regardless of the date range.
sql_window2 = """
    SELECT event_datetime, server_id, cpu_utilization,
           ROUND(avg(cpu_utilization) OVER (
               PARTITION BY server_id
               ORDER BY to_timestamp(event_datetime, 'MM/dd/yyyy HH:mm:ss')
               ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING), 2) AS avg_server_util
    FROM utilization
"""

spark.sql(sql_window2).show()

+-------------------+---------+---------------+---------------+
|     event_datetime|server_id|cpu_utilization|avg_server_util|
+-------------------+---------+---------------+---------------+
|03/05/2019 08:06:31|      110|           0.68|           0.63|
|03/05/2019 08:11:31|      110|           0.58|            0.6|
|03/05/2019 08:16:31|      110|           0.55|           0.59|
|03/05/2019 08:21:31|      110|           0.63|            0.6|
|03/05/2019 08:26:31|      110|           0.63|           0.66|
|03/05/2019 08:31:31|      110|           0.71|           0.67|
|03/05/2019 08:36:31|      110|           0.67|           0.64|
|03/05/2019 08:41:31|      110|           0.55|           0.53|
|03/05/2019 08:46:31|      110|           0.37|           0.54|
|03/05/2019 08:51:31|      110|            0.7|           0.58|
|03/05/2019 08:56:31|      110|           0.67|           0.64|
|03/05/2019 09:01:31|      110|           0.56|           0.53|
|03/05/2019 09:06:31|      110|         

In [35]:
import numpy as np
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans

# pack the three numeric columns into a single vector column named "features"
cluster_feature_cols = ["cpu_utilization", "free_memory", "session_count"]
assembler = VectorAssembler(outputCol="features", inputCols=cluster_feature_cols)

In [37]:
# apply the assembler to df, appending the "features" vector column
vcluster_df = assembler.transform(dataset=df)

# preview the assembled dataframe
vcluster_df.show()

+---------------+-------------------+-----------+---------+-------------+----------------+
|cpu_utilization|     event_datetime|free_memory|server_id|session_count|        features|
+---------------+-------------------+-----------+---------+-------------+----------------+
|            0.8|04/08/2019 23:37:13|       0.15|      133|           68| [0.8,0.15,68.0]|
|           0.84|04/08/2019 23:42:13|       0.14|      133|           66|[0.84,0.14,66.0]|
|           0.69|04/08/2019 23:47:13|        0.4|      133|           72| [0.69,0.4,72.0]|
|           0.75|04/08/2019 23:52:13|       0.18|      133|           77|[0.75,0.18,77.0]|
|           0.63|04/08/2019 23:57:13|       0.16|      133|           82|[0.63,0.16,82.0]|
|           0.72|04/09/2019 00:02:13|       0.06|      133|           78|[0.72,0.06,78.0]|
|            0.9|04/09/2019 00:07:13|       0.37|      133|           68| [0.9,0.37,68.0]|
|           0.59|04/09/2019 00:12:13|        0.4|      133|           94| [0.59,0.4,94.0]|

In [40]:
# k-means estimator: 3 clusters, fixed seed for repeatable initialisation
k_means = KMeans().setK(3).setSeed(1)

# fit the estimator on the assembled "features" column
kmeans_model = k_means.fit(vcluster_df)

                                                                                

In [41]:
# cluster centers of the fitted model, in the assembler's input column order
# (cpu_utilization, free_memory, session_count)
cluster_centers = kmeans_model.clusterCenters()
cluster_centers

[array([ 0.71174897,  0.28808911, 86.87510507]),
 array([ 0.61918113,  0.38080285, 68.75004716]),
 array([ 0.51439668,  0.48445202, 50.49452021])]

In [43]:
from pyspark.ml.regression import LinearRegression

# linear regression: wrap the single predictor (cpu_utilization) in a
# "features" vector column, as the estimator expects a vector input
predictor_cols = ["cpu_utilization"]
linear_regression_assembler = VectorAssembler(outputCol="features", inputCols=predictor_cols)
df_vutil = linear_regression_assembler.transform(df)

# preview the dataframe with the new "features" column
df_vutil.show()

+---------------+-------------------+-----------+---------+-------------+--------+
|cpu_utilization|     event_datetime|free_memory|server_id|session_count|features|
+---------------+-------------------+-----------+---------+-------------+--------+
|            0.8|04/08/2019 23:37:13|       0.15|      133|           68|   [0.8]|
|           0.84|04/08/2019 23:42:13|       0.14|      133|           66|  [0.84]|
|           0.69|04/08/2019 23:47:13|        0.4|      133|           72|  [0.69]|
|           0.75|04/08/2019 23:52:13|       0.18|      133|           77|  [0.75]|
|           0.63|04/08/2019 23:57:13|       0.16|      133|           82|  [0.63]|
|           0.72|04/09/2019 00:02:13|       0.06|      133|           78|  [0.72]|
|            0.9|04/09/2019 00:07:13|       0.37|      133|           68|   [0.9]|
|           0.59|04/09/2019 00:12:13|        0.4|      133|           94|  [0.59]|
|           0.87|04/09/2019 00:17:13|       0.35|      133|           86|  [0.87]|
|   

In [46]:
# configure the regression: predict session_count from the "features" vector
lr = LinearRegression(labelCol="session_count", featuresCol="features")

# fit on the assembled dataframe
lrModel = lr.fit(df_vutil)

# slope(s) of the fitted line, one per input feature
lrModel.coefficients

24/11/19 20:30:01 WARN Instrumentation: [a6981fe1] regParam is zero, which might cause numerical instability and overfitting.
                                                                                

DenseVector([47.024])

In [47]:
# intercept of the fitted regression line
lr_intercept = lrModel.intercept
lr_intercept

40.41695103556927

In [48]:
# root mean squared error taken from the fitted model's summary
lr_summary = lrModel.summary
lr_summary.rootMeanSquaredError

12.837990225931836