In [1]:
import findspark
findspark.init()

import pyspark
from delta import *

# Session builder for the 'Invoice Reader' app with the Delta Lake SQL extension
# and the Delta catalog registered as the session catalog.
builder = (
    pyspark.sql.SparkSession.builder
    .appName('Invoice Reader')
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# Every class below relies on this notebook-global `spark` session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
class Bronze():
    """Bronze layer: streams raw invoice JSON files into the `invoices_bz` table.

    Relies on the notebook-global `spark` session.
    """

    def __init__(self):
        # Root that the source, archive and checkpoint paths are resolved against.
        self.BASE_DIR = '..'

    def get_schema(self):
        """Return the DDL schema string applied to the incoming invoice JSON."""
        invoice_schema = '''
               InvoiceNumber string, CreatedTime bigint, StoreID string, PosID string, CashierID string,
               CustomerType string, CustomerCardNo string, TotalAmount double, NumberOfItems bigint,
               PaymentMethod string, TaxableAmount double, CGST double, SGST double, CESS double, DeliveryType string,
               DeliveryAddress struct<
                    AddressLine string,
                    City string,
                    ContactNumber string,
                    PinCode string,
                    State string
               >,
               InvoiceLineItems array<struct<
                    ItemCode string,
                    ItemDescription string,
                    ItemPrice double,
                    ItemQty bigint,
                    TotalValue double
               >>
               '''
        return invoice_schema

    def read_invoices(self):
        """Build the streaming reader over the JSON invoice landing directory.

        The reader is configured with `cleanSource=archive`, pointing
        `sourceArchiveDir` at `<BASE_DIR>/data/invoices_archive`.
        """
        landing_dir = f'{self.BASE_DIR}/test_data/invoices'
        archive_dir = f'{self.BASE_DIR}/data/invoices_archive'

        json_reader = (
            spark.readStream
            .format('json')
            .schema(self.get_schema())
            .option('cleanSource', 'archive')
            .option('sourceArchiveDir', archive_dir)
        )
        return json_reader.load(landing_dir)

    def process(self):
        """Start the bronze ingestion stream and return its streaming query handle."""
        print('Starting Bronze data extracting stream...', end='')

        checkpoint_dir = f'{self.BASE_DIR}/checkpoint/invoices_bz'
        bronze_writer = (
            self.read_invoices()
            .writeStream
            .queryName('bronze_ingestion')
            .option('checkpointLocation', checkpoint_dir)
            .outputMode('append')
        )
        bronze_query = bronze_writer.toTable('invoices_bz')

        print(' Done.')

        return bronze_query

In [3]:
class Silver():
    """Silver layer: turns bronze invoices into one row per invoice line item.

    Reads the `invoices_bz` table as a stream and appends the flattened rows to
    the `invoice_line_items` table. Relies on the notebook-global `spark` session.
    """

    def __init__(self):
        # Root that the checkpoint path is resolved against.
        self.BASE_DIR = '..'

    def read_invoices(self):
        """Return a streaming DataFrame over the `invoices_bz` table."""
        bronze_stream = spark.readStream.table('invoices_bz')
        return bronze_stream

    def explode_invoices(self, invoice_df):
        """Select the invoice header columns and explode `InvoiceLineItems`.

        Each element of the `InvoiceLineItems` array becomes its own row, exposed
        as the `LineItem` column.
        """
        header_columns = [
            'InvoiceNumber', 'CreatedTime', 'StoreID', 'PosID',
            'CustomerType', 'PaymentMethod', 'DeliveryType',
            'DeliveryAddress.City', 'DeliveryAddress.State', 'DeliveryAddress.PinCode',
        ]
        return invoice_df.selectExpr(*header_columns, 'explode(InvoiceLineItems) as LineItem')

    def flatten_invoices(self, exploded_df):
        """Promote the fields of `LineItem` to top-level columns and drop `LineItem`."""
        from pyspark.sql.functions import expr

        line_item_fields = ('ItemCode', 'ItemDescription', 'ItemPrice', 'ItemQty', 'TotalValue')

        result_df = exploded_df
        for field_name in line_item_fields:
            # Columns are appended in the order listed above.
            result_df = result_df.withColumn(field_name, expr(f'LineItem.{field_name}'))

        return result_df.drop('LineItem')

    def append_invoices(self, flattened_df):
        """Start the append-mode stream into `invoice_line_items` and return its query."""
        checkpoint_dir = f'{self.BASE_DIR}/checkpoint/invoice_line_items'
        silver_writer = (
            flattened_df.writeStream
            .queryName('silver_processing')
            .format('delta')
            .option('checkpointLocation', checkpoint_dir)
            .outputMode('append')
        )
        return silver_writer.toTable('invoice_line_items')

    def process(self):
        """Wire read -> explode -> flatten -> append and return the streaming query."""
        print('Starting Silver processing stream...', end='')

        line_items_df = self.flatten_invoices(
            self.explode_invoices(self.read_invoices())
        )
        silver_query = self.append_invoices(line_items_df)

        print(' Done.')
        return silver_query

In [4]:
class MedalionApproacheTestSuite():
    """End-to-end test harness for the Bronze -> Silver invoice streaming pipeline.

    Expects the notebook globals `spark`, `Bronze` and `Silver` to be defined.
    All paths are resolved against `self.BASE_DIR`.
    """

    def __init__(self):
        # Root of the project tree (data, test_data, checkpoint, notebooks).
        self.BASE_DIR = '..'

    def _reset_dir(self, path):
        """Leave `path` as an existing, empty directory.

        Works whether or not `path` exists beforehand, so the suite can run on a
        fresh checkout as well as after a previous run.
        """
        import os
        import shutil

        if os.path.isdir(path):
            shutil.rmtree(path)
        os.makedirs(path, exist_ok=True)

    def clean_up_for_testing(self):
        """Drop both pipeline tables and empty every directory the test run writes to."""
        print('Starting cleaning...', end='')

        # The bronze table is named `invoices_bz` (see Bronze.process / Silver.read_invoices).
        spark.sql('DROP TABLE IF EXISTS invoices_bz')
        spark.sql('DROP TABLE IF EXISTS invoice_line_items')

        working_dirs = (
            'notebooks/spark-warehouse/invoices_bz',
            'notebooks/spark-warehouse/invoice_line_items',
            'checkpoint/invoices_bz',
            'checkpoint/invoice_line_items',
            'test_data/invoices',
            # Files archived by an earlier run would otherwise make the archive
            # validation pass without this run having archived anything.
            'data/invoices_archive',
        )
        for relative_path in working_dirs:
            self._reset_dir(f'{self.BASE_DIR}/{relative_path}')

        print(' Done.')

    def get_data(self, file_num):
        """Copy `invoices-<file_num>.json` from `data/invoices` into the streamed landing directory."""
        import shutil

        print('\tGetting data...', end='')

        file_name = f'invoices-{file_num}.json'
        shutil.copyfile(src=f'{self.BASE_DIR}/data/invoices/{file_name}',
                        dst=f'{self.BASE_DIR}/test_data/invoices/{file_name}')

        print(' Done.')

    def assert_result(self, expected_result):
        """Assert that `invoice_line_items` currently holds exactly `expected_result` rows.

        Raises:
            AssertionError: if the row count differs from `expected_result`.
        """
        print('\tStarting validation...', end='')

        actual_result = spark.sql(
            '''
            SELECT COUNT(*)
            FROM invoice_line_items
            '''
        ).collect()[0][0]

        assert expected_result == actual_result, f'Test failed! Expected result is {expected_result}. Got {actual_result} instead.'

        print(' Done.')

    def wait_for_microbatch(self, sleep_time=15):
        """Sleep for `sleep_time` seconds to give the streams time to process new files."""
        import time

        print(f'\tWaiting for {sleep_time} seconds...', end='')
        time.sleep(sleep_time)

        print(' Done.')

    def _validate_archive(self, expected_archive):
        """Assert that every file name in `expected_archive` exists under the archive directory.

        The archive directory is searched recursively and matched on file name only,
        because Spark's `cleanSource=archive` keeps the source file's path underneath
        `sourceArchiveDir` rather than placing files directly inside it.

        Raises:
            AssertionError: naming the first expected file that is missing.
        """
        import os

        print('Validating Archive...', end='')

        archive_dir = f'{self.BASE_DIR}/data/invoices_archive'
        archived_names = {
            file_name
            for _, _, file_names in os.walk(archive_dir)
            for file_name in file_names
        }
        for expected_name in expected_archive:
            assert expected_name in archived_names, f'Archive Validation failed for {expected_name}.'

        print(' Done.')

    def run_stream_tests(self):
        """Run the whole scenario: clean up, start both streams, feed three files, validate.

        After each file is dropped into the landing directory the cumulative row count
        of `invoice_line_items` is checked. Both streaming queries are stopped even when
        a check fails, so no stream is left running in the kernel. Finally the source
        archive is validated.

        Raises:
            AssertionError: on the first failed row-count or archive check.
        """
        sleep_time = 10  # NOTE (from original author): only works if sleep_time >= 5
        self.clean_up_for_testing()

        running_queries = []
        try:
            running_queries.append(Bronze().process())
            running_queries.append(Silver().process())

            # Cumulative row counts expected after files 1, 2 and 3 respectively.
            expected_results = [1253, 2510, 3994]
            for file_num, expected_count in enumerate(expected_results, start=1):

                print(f'Testing file No.{file_num}...')

                self.get_data(file_num)
                self.wait_for_microbatch(sleep_time=sleep_time)

                self.assert_result(expected_count)

                print(f'File No.{file_num} test passed.\n')
        finally:
            for query in running_queries:
                query.stop()

        # Names match what get_data() copies in (`invoices-<n>.json`). Only the first
        # two files are expected; presumably the last file is not archived until a
        # later micro-batch commits -- TODO confirm against Spark's cleanSource docs.
        self._validate_archive(['invoices-1.json', 'invoices-2.json'])


In [5]:
# Run the end-to-end Bronze -> Silver streaming test; a failed check raises AssertionError.
invoice_stream_tester = MedalionApproacheTestSuite()
invoice_stream_tester.run_stream_tests()

Starting cleaning... Done.
Starting Bronze data extracting stream... Done.
Starting Silver processing stream... Done.
Testing file No.1...
	Getting data... Done.
	Waiting for 10 seconds... Done.
	Starting validation... Done.
File No.1 test passed.

Testing file No.2...
	Getting data... Done.
	Waiting for 10 seconds... Done.
	Starting validation... Done.
File No.2 test passed.

Testing file No.3...
	Getting data... Done.
	Waiting for 10 seconds... Done.
	Starting validation... Done.
File No.3 test passed.

Validating Archive... Done.


In [6]:
# Shut down the Spark session created in the first cell.
spark.stop()