In [0]:
from delta.tables import DeltaTable

# stageSchemaName    = 'stg'
# stageTableBaseDir  = '/user/hive/warehouse/' + stageSchemaName + '.db/'
# bronzeSchemaName   = 'bz'
# bronzeTableBaseDir = 'abfss://bronze@datalakeselectivaproject.dfs.core.windows.net/'

class populateBronze():
    def __init__(self, stageSchemaName, stageTableBaseDir, bronzeSchemaName, bronzeTableBaseDir):
        self.stageSchemaName    = stageSchemaName
        self.stageTableBaseDir  = stageTableBaseDir + stageSchemaName + '.db/'
        self.bronzeSchemaName   = bronzeSchemaName
        self.bronzeTableBaseDir = bronzeTableBaseDir

    # get list the dirs in bz layer 
    def getBronzeTableList(self):
        tableList = []
        for table in dbutils.fs.ls(self.bronzeTableBaseDir):
            tableList.append(table.name)
        return tableList

    def getStageTableList(self):
        tableList = []
        sub = len(self.stageSchemaName) + 2
        for table in dbutils.fs.ls(self.stageTableBaseDir):
            tableName = table.name[:len(table.name) - sub] + '/'
            tableList.append(tableName)
        return tableList

    # using list, get data from stagetables to a df, return df 
    def getStageTableData(self, table):
        stageTableName = table[:len(table) - 1] + '_' + self.stageSchemaName
        sourceTablePath = self.stageTableBaseDir + stageTableName
        print(f"Loading Stage Table from: {sourceTablePath}")
        return (spark.read
                     .format('delta')
                     .option('header', 'true')
                     .option('inferSchema', 'true')
                     .load(sourceTablePath))
        
    # get df for bronze table from bz layer location 
    def getBronzeTableData(self, table):
        bronzeTableName = table[:len(table) - 1] + '_' + self.bronzeSchemaName
        targetTablePath = self.bronzeTableBaseDir + table
        print(f"Loading Bronze Table from: {targetTablePath}")
        return (DeltaTable.forPath(spark, targetTablePath))

    # perform a delta lake merge to not ingest file with same name and loaded_ts again 
    def insertBronze(self, stageDf, bronzeDf):
        (bronzeDf.alias('target')
            .merge(stageDf.alias('source'), "target.loaded_ts = source.loaded_ts ")
            .whenNotMatchedInsertAll().execute()
        )
    # test the schema, (count num of columns or something )
    def validateSchema(self, stageDf, bronzeDf):
        return (stageDf.schema == bronzeDf.toDF().schema)
    
    # validate the tables, by counting number of rows 
    """def validateTable(self, stageDf, bronzeDf):
        return (stageDf.count() == bronzeDf.toDF().count())"""

    def process(self):
        bzTableList = self.getBronzeTableList()
        stageTableList = self.getStageTableList()
        processTables = [table for table in bzTableList if table in stageTableList]
        for table in processTables:
            print(f"Processing table {table}")
            stage_df      = self.getStageTableData(table)
            dlt_bronze_df = self.getBronzeTableData(table)
            self.insertBronze(stage_df, dlt_bronze_df)
            if (self.validateSchema(stage_df, dlt_bronze_df)):
                print(f"Table {table} populated successfully!")
            else:
                print(f"Table {table} schema validation failed! Please check.")

In [0]:
pB = populateBronze('stg', '/user/hive/warehouse/', 'bz', 'abfss://bronze@datalakeselectivaproject.dfs.core.windows.net/')
pB.process()

Processing table channels/
Loading Stage Table from: /user/hive/warehouse/stg.db/channels_stg
Loading Bronze Table from: abfss://bronze@datalakeselectivaproject.dfs.core.windows.net/channels/


[0;31m---------------------------------------------------------------------------[0m
[0;31mDateTimeException[0m                         Traceback (most recent call last)
File [0;32m<command-5666400133891960>, line 2[0m
[1;32m      1[0m pB [38;5;241m=[39m populateBronze([38;5;124m'[39m[38;5;124mstg[39m[38;5;124m'[39m, [38;5;124m'[39m[38;5;124m/user/hive/warehouse/[39m[38;5;124m'[39m, [38;5;124m'[39m[38;5;124mbz[39m[38;5;124m'[39m, [38;5;124m'[39m[38;5;124mabfss://bronze@datalakeselectivaproject.dfs.core.windows.net/[39m[38;5;124m'[39m)
[0;32m----> 2[0m pB[38;5;241m.[39mprocess()

File [0;32m<command-5666400133891959>, line 70[0m, in [0;36mpopulateBronze.process[0;34m(self)[0m
[1;32m     68[0m stage_df      [38;5;241m=[39m [38;5;28mself[39m[38;5;241m.[39mgetStageTableData(table)
[1;32m     69[0m dlt_bronze_df [38;5;241m=[39m [38;5;28mself[39m[38;5;241m.[39mgetBronzeTableData(table)
[0;32m---> 70[0m [38;5;28mself[39m[38;5;241