In [None]:
%scala
// Root DBFS folder, and the sub-folder that holds the raw text files.
val BASE_DIR = "dbfs:/FileStore"
val DATA_DIR = s"$BASE_DIR/text"

// Load everything under DATA_DIR with the "text" source, using '.' instead of
// a newline as the record separator.
val data =
  spark.read
    .format("text")
    .option("lineSep", ".")
    .load(DATA_DIR)

// Bring every record back to the driver so the cell displays them.
data.collect()

In [None]:
# Refactor the word-count code into a class with small methods so that each step can be tested
class BatchWordCounts():
    """Batch word-count job: read text files, clean the words, count, and save.

    The pipeline is split into one method per stage (read -> clean -> count ->
    write) so that each stage can be called and checked on its own.
    ``get_raw_data`` reads through the global ``spark`` session, which is not
    defined in this class.

    Args:
        data_dir: Directory the text files are loaded from. Defaults to the
            path this class previously hard-coded.
        table_name: Name of the Delta table the counts are written to.
            Defaults to ``'word_counts'``.
    """

    def __init__(self, data_dir='dbfs:/FileStore/test_data/text', table_name='word_counts'):
        self.DATA_DIR = data_dir
        self.TABLE_NAME = table_name


    def get_raw_data(self):
        """Load the text files and return a DataFrame with one ``word`` per row.

        Records are delimited by '.' (the ``lineSep`` option) and every record
        is split on a single space character.
        """
        from pyspark.sql.functions import explode, split

        text_data = spark.read \
                         .format('text') \
                         .option('lineSep', '.') \
                         .load(self.DATA_DIR)

        return text_data.select(explode(split(text_data.value, ' ')).alias('word'))


    def get_quality_data(self, raw_words):
        """Normalise the raw words and drop unusable rows.

        Args:
            raw_words: DataFrame with a ``word`` column.

        Returns:
            DataFrame with a single ``cleaned_words`` column holding the
            trimmed, lower-cased words that are not null and contain at least
            one character in ``a-z``.
        """
        from pyspark.sql.functions import lower, trim

        # NOTE(review): only lower/trim are applied, so punctuation stays
        # attached to a token (e.g. "java,"). Left as-is because the expected
        # totals used by the tests were computed with this behaviour.
        return raw_words.select(lower(trim(raw_words.word)).alias('cleaned_words')) \
                        .where('cleaned_words is not null') \
                        .where("cleaned_words rlike '[a-z]'")

    def get_word_counts(self, quality_words):
        """Count how many times each value of ``cleaned_words`` occurs.

        Args:
            quality_words: DataFrame with a ``cleaned_words`` column.

        Returns:
            The grouped DataFrame produced by ``groupBy('cleaned_words').count()``.
        """
        return quality_words.groupBy('cleaned_words').count()


    def overwrite_word_counts(self, word_counts):
        """Replace the contents of the output Delta table with ``word_counts``.

        Args:
            word_counts: DataFrame to persist; written in ``overwrite`` mode.
        """
        word_counts.write \
                   .format('delta') \
                   .mode('overwrite') \
                   .saveAsTable(self.TABLE_NAME)


    def execute(self):
        """Run the whole pipeline: read, clean, count, and overwrite the table."""
        print('\tExecuting Word Count...', end='')

        raw_words = self.get_raw_data()
        quality_words = self.get_quality_data(raw_words)
        word_counts = self.get_word_counts(quality_words)
        self.overwrite_word_counts(word_counts)

        print(' Done.')
        

In [None]:
class BatchWordCountsTestSuite():
    """End-to-end checks for ``BatchWordCounts``.

    Each test step copies one more text file into the landing directory, runs
    the batch job, and compares the total count of words starting with 's'
    against a known value. Relies on the globals ``spark`` and ``dbutils``,
    which are not defined in this class.

    Args:
        source_dir: Directory that holds the original ``text_data_<n>.txt``
            files. Defaults to ``<BASE_DIR>/text``.
    """

    def __init__(self, source_dir=None):
        self.BASE_DIR = 'dbfs:/FileStore'
        # Landing directory: wiped by clean_up_for_testing and refilled by get_data.
        self.DATA_DIR = 'dbfs:/FileStore/test_data/text'
        # The files must be copied from somewhere OTHER than the landing
        # directory; copying from DATA_DIR itself can never work because the
        # clean-up step has just emptied it.
        # NOTE(review): the default assumes the raw files live in
        # <BASE_DIR>/text, the directory the notebook's first (Scala) cell
        # reads from - confirm, or pass source_dir explicitly.
        self.SOURCE_DIR = source_dir if source_dir is not None else f'{self.BASE_DIR}/text'


    def clean_up_for_testing(self):
        """Drop the output table and reset the checkpoint and landing directories."""
        print('Starting cleaning...', end='')

        spark.sql('DROP TABLE IF EXISTS word_counts')
        dbutils.fs.rm('/user/hive/warehouse/word_counts', recurse=True)
        dbutils.fs.rm(f'{self.BASE_DIR}/checkpoint', recurse=True)
        dbutils.fs.rm(f'{self.DATA_DIR}/', recurse=True)
        dbutils.fs.mkdirs(f'{self.DATA_DIR}')

        print(' Done.')


    def get_data(self, file_num):
        """Copy ``text_data_<file_num>.txt`` from the source to the landing directory.

        Args:
            file_num: Number used to build the file name ``text_data_<file_num>.txt``.
        """
        print('Getting data...', end='')

        dbutils.fs.mkdirs(f'{self.DATA_DIR}/')
        dbutils.fs.cp(f'{self.SOURCE_DIR}/text_data_{file_num}.txt',
                      f'{self.DATA_DIR}/')

        print(' Done.')


    def assert_result(self, expected_result):
        """Assert that the counts of words starting with 's' sum to ``expected_result``.

        Args:
            expected_result: Expected value of ``SUM(count)`` over the matching rows.

        Raises:
            AssertionError: If the value read from ``word_counts`` differs.
        """
        actual_result = spark.sql(
            '''
            SELECT SUM(count)
            FROM word_counts
            WHERE SUBSTR(cleaned_words, 1, 1) == 's'
            '''
        ).collect()[0][0]

        assert expected_result == actual_result, f'Test failed! Expected result is {expected_result}. Got {actual_result} instead.'


    def run_tests(self):
        """Reset the environment, then add the files one by one and check each total."""
        self.clean_up_for_testing()
        word_counter = BatchWordCounts()

        # Files accumulate in the landing directory, so every expected total
        # covers all files copied so far.
        expected_results = [25, 32, 37]
        for file_num, expected in enumerate(expected_results, start=1):

            print(f'Testing file No.{file_num}...')

            self.get_data(file_num)
            word_counter.execute()
            self.assert_result(expected)

            print(f'File No.{file_num} test completed.\n')


In [None]:
# Build the test suite and run the end-to-end checks; run_tests() raises an
# AssertionError at the first file whose 's'-word total does not match.
batch_word_counts_tester = BatchWordCountsTestSuite()
batch_word_counts_tester.run_tests()

Starting cleaning... Done.
Testing file No.1...
Getting data... Done.
	Executing Word Count... Done.
+---------------+-----+
|  cleaned_words|count|
+---------------+-----+
|        ensures|    1|
|         stream|    2|
|fault-tolerance|    1|
|           will|    1|
|            you|    3|
|            can|    2|
|          java,|    1|
|     guarantees|    1|
|         arrive|    1|
|         system|    1|
|       provides|    1|
|            api|    1|
|  aggregations,|    1|
|             in|    2|
|           take|    1|
|           same|    2|
| fault-tolerant|    1|
|      continues|    1|
|      scalable,|    1|
|          fast,|    1|
+---------------+-----+
only showing top 20 rows

File No.1 test completed.

Testing file No.2...
Getting data... Done.
	Executing Word Count... Done.
+---------------+-----+
|  cleaned_words|count|
+---------------+-----+
|        ensures|    1|
|         stream|    2|
|fault-tolerance|    2|
|           will|    2|
|            you|    4|
|   