# Tumbling vs Sliding Windows
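Tumbling windows: fixed-size, non-overlapping — each event belongs to exactly one window.

Sliding windows: fixed-size, overlapping — when the slide interval is shorter than the window length, each event belongs to several windows.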

In [13]:
import findspark
findspark.init()

import pyspark
from delta import *

In [14]:

# Build a SparkSession with the Delta Lake SQL extension and catalog registered,
# then let the delta-spark helper attach the matching Delta jars before starting it.
builder = (
    pyspark.sql.SparkSession.builder
    .appName('Trade Summarizer')
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [15]:
class SlidingAggregate():
    """Streaming job that summarises sensor readings over sliding windows.

    Reads the ``kafka_bz`` table as a stream, parses each JSON payload into a
    timestamped reading, and writes the maximum reading per sensor and window
    to the ``sensor_summary`` Delta table. Relies on the module-level
    ``spark`` session.
    """

    def __init__(self):
        # Root directory that the checkpoint location is resolved against.
        self.BASE_DIR = '..'

    def get_schema(self):
        """Return the schema of the JSON document stored in the ``value`` column."""
        from pyspark.sql.types import StructType, StructField, StringType, DoubleType

        fields = [
            StructField('created_time', StringType(), nullable=True),
            StructField('reading', DoubleType(), nullable=True),
        ]
        return StructType(fields)

    def read_bronze(self):
        """Return a streaming DataFrame over the ``kafka_bz`` table."""
        bronze_reader = spark.readStream
        return bronze_reader.table('kafka_bz')

    def get_sensor_data(self, kafka_df):
        """Turn raw key/value records into ``sensor_id``, ``created_time``, ``reading`` rows.

        Args:
            kafka_df: DataFrame with ``key`` and ``value`` columns, where
                ``value`` holds a JSON document matching ``get_schema()``.

        Returns:
            DataFrame with the key exposed as ``sensor_id``, the JSON fields
            flattened into columns, and ``created_time`` parsed to a timestamp.
        """
        from pyspark.sql.functions import from_json, to_timestamp

        sensor_id = kafka_df['key'].cast('string').alias('sensor_id')
        payload = from_json(kafka_df['value'].cast('string'), self.get_schema()).alias('value')

        # Select the parsed struct first, then expand its fields into top-level columns.
        flattened = kafka_df.select(sensor_id, payload).select('sensor_id', 'value.*')
        return flattened.withColumn('created_time', to_timestamp('created_time', 'yyyy-MM-dd HH:mm:ss'))

    def aggregate_sensor_data(self, sensor_df):
        """Compute the maximum reading per sensor over sliding event-time windows.

        Windows are 15 minutes long and advance every 5 minutes; a 30-minute
        watermark is declared on ``created_time``.

        Args:
            sensor_df: DataFrame with ``sensor_id``, ``created_time`` and
                ``reading`` columns.

        Returns:
            DataFrame with columns ``sensor_id``, ``start``, ``end`` and
            ``max_reading``.
        """
        # Module alias keeps Spark's ``max`` from shadowing the builtin.
        from pyspark.sql import functions as F

        watermarked = sensor_df.withWatermark('created_time', '30 minutes')
        sliding_window = F.window(sensor_df['created_time'], '15 minutes', '5 minutes')

        grouped = watermarked.groupBy(sensor_df['sensor_id'], sliding_window)
        peaks = grouped.agg(F.max('reading').alias('max_reading'))
        return peaks.select('sensor_id', 'window.start', 'window.end', 'max_reading')

    def process(self):
        """Start the bronze -> ``sensor_summary`` streaming query.

        Returns:
            The streaming query handle returned by ``toTable``; the caller is
            responsible for stopping it.
        """
        print('Starting sensor data extracting stream...', end='')

        bronze_df = self.read_bronze()
        sensor_df = self.get_sensor_data(bronze_df)
        summary_df = self.aggregate_sensor_data(sensor_df)

        checkpoint_dir = f'{self.BASE_DIR}/checkpoint/sensor_summary'
        writer = (
            summary_df.writeStream
            .queryName('sensor_summary')
            .format('delta')
            .option('checkpointLocation', checkpoint_dir)
            .outputMode('complete')
        )
        streaming_query = writer.toTable('sensor_summary')

        print(' Done.\n')

        return streaming_query

In [16]:
class SensorSummaryTestSuite():
    """End-to-end check of the ``SlidingAggregate`` stream against expected results.

    Relies on the module-level ``spark`` session and on ``SlidingAggregate``.
    """

    def __init__(self):
        # Root directory that warehouse, checkpoint and data paths are resolved against.
        self.BASE_DIR = '..'

    @staticmethod
    def _reset_dir(path):
        """Leave ``path`` as an existing, empty directory.

        A directory that does not exist yet (first run on a fresh checkout) is
        simply created instead of raising ``FileNotFoundError``.
        """
        import os
        import shutil

        try:
            shutil.rmtree(path)
        except FileNotFoundError:
            # Nothing to remove; other errors (e.g. permissions) still propagate.
            pass
        os.makedirs(path, exist_ok=True)

    @staticmethod
    def _check_max_readings(actual_rows, expected_rows):
        """Compare ``max_reading`` row by row.

        Args:
            actual_rows: Rows produced by the stream; ``max_reading`` is numeric.
            expected_rows: Rows from the expected-results CSV; ``max_reading``
                is text and is converted with ``float``.

        Raises:
            AssertionError: If the row counts differ or any reading differs.
        """
        # Without this check an empty or short result would pass vacuously.
        if len(actual_rows) != len(expected_rows):
            raise AssertionError(
                f'Test failed! Expected {len(expected_rows)} result rows. '
                f'Got {len(actual_rows)} instead.'
            )

        for actual_row, expected_row in zip(actual_rows, expected_rows):
            expected = float(expected_row['max_reading'])
            actual = actual_row['max_reading']
            if expected != actual:
                raise AssertionError(f'Test failed! Expected result is {expected}. Got {actual} instead.')

    def clean_up_for_testing(self):
        """Reset tables, warehouse directories and the checkpoint to a clean state.

        Empties the on-disk locations of ``kafka_bz`` and ``sensor_summary``,
        drops both tables, empties the stream checkpoint, and recreates an
        empty ``kafka_bz`` Delta table.
        """
        print('Starting cleaning...', end='')

        warehouse_dir = f'{self.BASE_DIR}/notebooks/spark-warehouse'
        self._reset_dir(f'{warehouse_dir}/kafka_bz')
        self._reset_dir(f'{warehouse_dir}/sensor_summary')

        spark.sql('DROP TABLE IF EXISTS kafka_bz')
        spark.sql('DROP TABLE IF EXISTS sensor_summary')

        self._reset_dir(f'{self.BASE_DIR}/checkpoint/sensor_summary')

        spark.sql('CREATE TABLE kafka_bz(key String, value String) USING delta')

        print(' Done.')

    def wait_for_microbatch(self, sleep_time=15):
        """Block for ``sleep_time`` seconds so the stream can process pending data."""
        import time

        print(f'\tWaiting for {sleep_time} seconds...', end='')
        time.sleep(sleep_time)

        print(' Done.')

    def assert_sensor_summary(self):
        """Validate ``sensor_summary`` against ``sliding_window_result.csv``.

        Both sides are ordered by window ``start`` and compared on
        ``max_reading``.

        Raises:
            AssertionError: If the row counts differ or any reading differs.
        """
        print('\tStarting Sensor Summary validation...', end='')

        actual_result = spark.table('sensor_summary') \
                             .orderBy('start') \
                             .collect()

        expected_result = spark.read \
                               .format('csv') \
                               .option('header', 'true') \
                               .load(f'{self.BASE_DIR}/data/results/sliding_window_result.csv') \
                               .orderBy('start') \
                               .collect()

        self._check_max_readings(actual_result, expected_result)

        print(' Done.\nAll tests passed.')

    def _insert_test_events(self):
        """Insert the fixed batch of ``SET41`` sensor events into ``kafka_bz``."""
        spark.sql(
            '''
            INSERT INTO kafka_bz VALUES
            ('SET41', '{"created_time": "2019-02-05 09:54:00","reading": 36.2}'),
            ('SET41', '{"created_time": "2019-02-05 09:59:00","reading": 36.5}'),
            ('SET41', '{"created_time": "2019-02-05 10:04:00","reading": 36.8}'),
            ('SET41', '{"created_time": "2019-02-05 10:09:00","reading": 36.2}'),
            ('SET41', '{"created_time": "2019-02-05 10:14:00","reading": 36.5}'),
            ('SET41', '{"created_time": "2019-02-05 10:19:00","reading": 36.3}'),
            ('SET41', '{"created_time": "2019-02-05 10:24:00","reading": 37.7}'),
            ('SET41', '{"created_time": "2019-02-05 10:29:00","reading": 37.2}')
            '''
        )

    def run_stream_tests(self):
        """Run the full scenario: clean up, start the stream, feed events, validate.

        The streaming query is stopped even when validation fails, so a failed
        run does not leave a query running in the session.
        """
        self.clean_up_for_testing()

        # Seconds to give the stream to pick up and process the inserted events.
        sleep_time = 5

        sensor_summary_stream = SlidingAggregate()
        sensor_summary_streaming_query = sensor_summary_stream.process()

        try:
            print('Testing all events...')
            self._insert_test_events()
            self.wait_for_microbatch(sleep_time=sleep_time)
            self.assert_sensor_summary()
        finally:
            sensor_summary_streaming_query.stop()


In [17]:
# Run the end-to-end sliding-window test: clean up, start the stream,
# insert the sample events and validate the resulting summary table.
sensor_summary_tester = SensorSummaryTestSuite()
sensor_summary_tester.run_stream_tests()

Starting cleaning... Done.
Starting sensor data extracting stream... Done.

Testing all events...
	Waiting for 5 seconds... Done.
	Starting Sensor Summary validation... Done.
All tests passed.


In [18]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()