In [269]:
# findspark locates the local Spark installation and puts pyspark on sys.path,
# so init() has to run before anything is imported from `pyspark`.
import findspark
findspark.init()

In [270]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

In [271]:
# Build the Spark entry points.
# SparkContext.getOrCreate() returns the already-running context when there is
# one, so re-executing this cell in a live kernel no longer fails with
# "Cannot run multiple SparkContexts at once" the way a direct
# SparkContext(conf=conf) call does.
conf = SparkConf()
sc = SparkContext.getOrCreate(conf=conf)
# The builder's getOrCreate() likewise reuses an existing session/context.
spark = SparkSession.builder.master('local').getOrCreate()

In [272]:
# Read the raw alert records from the JSON file into a DataFrame.
alerts_df = spark.read.format('json').load('alerts.json')

In [273]:
# Inspect the column names and types of the freshly loaded DataFrame.
alerts_df.printSchema()

root
 |-- alert_id: string (nullable = true)
 |-- entitled_assets: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- event_source_type: string (nullable = true)
 |-- event_time: string (nullable = true)
 |-- instance_id: string (nullable = true)
 |-- serial: string (nullable = true)



In [274]:
# Peek at a single row to sanity-check the loaded values.
alerts_df.show(1)

+-----------+------------------+-------------------+-------------------+--------------------+----------+
|   alert_id|   entitled_assets|  event_source_type|         event_time|         instance_id|    serial|
+-----------+------------------+-------------------+-------------------+--------------------+----------+
|ArbwAO2m4Oa|[qnTrje2, WT87cRS]|healthchecker_alert|2019-06-07 20:50:41|ypJdMVE8XfRgbq8E2...|cyN-1QeXWm|
+-----------+------------------+-------------------+-------------------+--------------------+----------+
only showing top 1 row



In [275]:
# Total number of alert records before any filtering.
alerts_df.count()

10000

In [276]:
# Parse the string column event_time into a real timestamp column so that
# later cells can compare and aggregate on it.
event_ts = f.to_timestamp(f.col("event_time"), "yyyy-MM-dd HH:mm:ss")
alerts_df = alerts_df.withColumn("event_timestamp", event_ts)

# Confirm the new column's type and look at one parsed value.
alerts_df.printSchema()
alerts_df.show(1)

root
 |-- alert_id: string (nullable = true)
 |-- entitled_assets: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- event_source_type: string (nullable = true)
 |-- event_time: string (nullable = true)
 |-- instance_id: string (nullable = true)
 |-- serial: string (nullable = true)
 |-- event_timestamp: timestamp (nullable = true)

+-----------+------------------+-------------------+-------------------+--------------------+----------+-------------------+
|   alert_id|   entitled_assets|  event_source_type|         event_time|         instance_id|    serial|    event_timestamp|
+-----------+------------------+-------------------+-------------------+--------------------+----------+-------------------+
|ArbwAO2m4Oa|[qnTrje2, WT87cRS]|healthchecker_alert|2019-06-07 20:50:41|ypJdMVE8XfRgbq8E2...|cyN-1QeXWm|2019-06-07 20:50:41|
+-----------+------------------+-------------------+-------------------+--------------------+----------+-------------------+
only showing 

In [277]:
# Keep only alerts whose timestamp lies inside the analysis window.
# Column.between() is inclusive on both ends.
window_start = '2019-06-10 00:00:00'
window_end = '2019-06-19 00:00:00'
alerts_df = alerts_df.where(
    f.col('event_timestamp').between(window_start, window_end)
)
print(alerts_df.count())
alerts_df.show(1)

3130
+-----------+--------------------+-------------------+-------------------+--------------------+----------+-------------------+
|   alert_id|     entitled_assets|  event_source_type|         event_time|         instance_id|    serial|    event_timestamp|
+-----------+--------------------+-------------------+-------------------+--------------------+----------+-------------------+
|1AlMxb4jeZ8|[VrE3Mr8, Z0SBQ7L...|healthchecker_alert|2019-06-11 12:46:18|Aygpbf1YPEpvn1CQj...|Ei3-SNCq7e|2019-06-11 12:46:18|
+-----------+--------------------+-------------------+-------------------+--------------------+----------+-------------------+
only showing top 1 row



In [278]:
# Restrict the data set to alerts emitted by the health checker.
is_healthchecker = f.col('event_source_type') == 'healthchecker_alert'
alerts_df = alerts_df.where(is_healthchecker)
print(alerts_df.count())
alerts_df.show(1)

1062
+-----------+--------------------+-------------------+-------------------+--------------------+----------+-------------------+
|   alert_id|     entitled_assets|  event_source_type|         event_time|         instance_id|    serial|    event_timestamp|
+-----------+--------------------+-------------------+-------------------+--------------------+----------+-------------------+
|1AlMxb4jeZ8|[VrE3Mr8, Z0SBQ7L...|healthchecker_alert|2019-06-11 12:46:18|Aygpbf1YPEpvn1CQj...|Ei3-SNCq7e|2019-06-11 12:46:18|
+-----------+--------------------+-------------------+-------------------+--------------------+----------+-------------------+
only showing top 1 row



In [279]:
# Earliest event timestamp seen for each alert_id; the aggregate keeps the
# name event_timestamp.
earliest_event = f.min(f.col('event_timestamp')).alias('event_timestamp')
temp_df = alerts_df.groupBy('alert_id').agg(earliest_event)

In [280]:
# Preview the per-alert earliest timestamps, then count the groups
# (the groupBy yields one row per distinct alert_id).
temp_df.show(5)
temp_df.count()

+-----------+-------------------+
|   alert_id|    event_timestamp|
+-----------+-------------------+
|XGo0nQHCZzR|2019-06-11 05:33:45|
|8x9rwGukOmB|2019-06-10 16:53:04|
|FYFObaGlK4Q|2019-06-10 21:34:06|
|2VsSDcyzFF7|2019-06-10 11:10:49|
|Y7cWwyAi30w|2019-06-11 00:19:51|
+-----------+-------------------+
only showing top 5 rows



100

In [281]:
# Join the (alert_id, earliest timestamp) pairs back onto the full records so
# only the rows matching each alert's first event remain.
first_alerts_df = temp_df.join(
    alerts_df,
    on=['alert_id', 'event_timestamp'],
    how='inner',
)

In [282]:
# Preview the joined rows and count them.
# NOTE(review): the count equals temp_df's only if no alert has several rows
# sharing its earliest timestamp; confirm for new data.
first_alerts_df.show(5)
first_alerts_df.count()

+-----------+-------------------+--------------------+-------------------+-------------------+--------------------+----------+
|   alert_id|    event_timestamp|     entitled_assets|  event_source_type|         event_time|         instance_id|    serial|
+-----------+-------------------+--------------------+-------------------+-------------------+--------------------+----------+
|1AlMxb4jeZ8|2019-06-11 12:46:18|[VrE3Mr8, Z0SBQ7L...|healthchecker_alert|2019-06-11 12:46:18|Aygpbf1YPEpvn1CQj...|Ei3-SNCq7e|
|czQwIHOzm03|2019-06-12 01:10:03|[WbsKFGu, ekG3XXE...|healthchecker_alert|2019-06-12 01:10:03|eoSV6vRsEK0lrfQjd...|x23-uhxUOj|
|XhBrfwWdYgU|2019-06-11 17:09:12|[jzOVqi2, 08eQJhb...|healthchecker_alert|2019-06-11 17:09:12|nS3G3YBEOYApGiTrb...|X8J-aUFAPn|
|MT9HnkOcAUA|2019-06-10 03:38:31|[LVfoG2r, N0cheY7...|healthchecker_alert|2019-06-10 03:38:31|ai4Kzi73j2o6O6gi1...|4Oy-U52NvV|
|8x9rwGukOmB|2019-06-10 16:53:04|[qRSHlOc, 6UvfMaY...|healthchecker_alert|2019-06-10 16:53:04|uoAIgVh2CskdRg6d0

100

In [283]:
# Produce one row per element of entitled_assets, exposed as asset_id.
asset_col = f.explode(f.col('entitled_assets'))
first_alerts_df = first_alerts_df.withColumn('asset_id', asset_col)

In [284]:
# Preview the exploded rows; the count is now one row per (alert, asset) pair.
first_alerts_df.show(5)
first_alerts_df.count()

+-----------+-------------------+--------------------+-------------------+-------------------+--------------------+----------+--------+
|   alert_id|    event_timestamp|     entitled_assets|  event_source_type|         event_time|         instance_id|    serial|asset_id|
+-----------+-------------------+--------------------+-------------------+-------------------+--------------------+----------+--------+
|1AlMxb4jeZ8|2019-06-11 12:46:18|[VrE3Mr8, Z0SBQ7L...|healthchecker_alert|2019-06-11 12:46:18|Aygpbf1YPEpvn1CQj...|Ei3-SNCq7e| VrE3Mr8|
|1AlMxb4jeZ8|2019-06-11 12:46:18|[VrE3Mr8, Z0SBQ7L...|healthchecker_alert|2019-06-11 12:46:18|Aygpbf1YPEpvn1CQj...|Ei3-SNCq7e| Z0SBQ7L|
|1AlMxb4jeZ8|2019-06-11 12:46:18|[VrE3Mr8, Z0SBQ7L...|healthchecker_alert|2019-06-11 12:46:18|Aygpbf1YPEpvn1CQj...|Ei3-SNCq7e| aASolix|
|czQwIHOzm03|2019-06-12 01:10:03|[WbsKFGu, ekG3XXE...|healthchecker_alert|2019-06-12 01:10:03|eoSV6vRsEK0lrfQjd...|x23-uhxUOj| WbsKFGu|
|czQwIHOzm03|2019-06-12 01:10:03|[WbsKFGu, ekG3X

297

In [285]:
# Reduce the frame to the output columns, deriving the calendar date and the
# time of day from event_timestamp.
# date_format() renders each part with an explicit pattern instead of relying
# on an implicit timestamp-to-string cast followed by splitting on the space.
# The new event_time string replaces the original raw event_time column.
first_alerts_df = first_alerts_df.select(
    'alert_id',
    'asset_id',
    f.date_format('event_timestamp', 'yyyy-MM-dd').alias('date'),
    f.date_format('event_timestamp', 'HH:mm:ss').alias('event_time'),
)

In [286]:
# Final check of the four output columns and the row count before writing.
first_alerts_df.show(5)
first_alerts_df.count()

+-----------+--------+----------+----------+
|   alert_id|asset_id|      date|event_time|
+-----------+--------+----------+----------+
|1AlMxb4jeZ8| VrE3Mr8|2019-06-11|  12:46:18|
|1AlMxb4jeZ8| Z0SBQ7L|2019-06-11|  12:46:18|
|1AlMxb4jeZ8| aASolix|2019-06-11|  12:46:18|
|czQwIHOzm03| WbsKFGu|2019-06-12|  01:10:03|
|czQwIHOzm03| ekG3XXE|2019-06-12|  01:10:03|
+-----------+--------+----------+----------+
only showing top 5 rows



297

In [287]:
# Persist the result as CSV with a header row.
# Spark writes a *directory* named alerts.csv holding one part file per
# partition; add .coalesce(1) before .write if a single file is required.
# The built-in csv writer replaces the legacy "com.databricks.spark.csv"
# source name, and overwrite mode lets the notebook be re-run without failing
# because the output path already exists.
first_alerts_df.write.mode("overwrite").option("header", "true").csv("alerts.csv")

In [288]:
# Shut down the SparkContext; `sc` and `spark` are unusable after this cell.
sc.stop()