* Verify that the stage table directories are accessible

In [0]:
%fs ls /user/hive/warehouse/stg.db/products_stg

path,name,size,modificationTime
dbfs:/user/hive/warehouse/stg.db/products_stg/_delta_log/,_delta_log/,0,1737593259000
dbfs:/user/hive/warehouse/stg.db/products_stg/part-00000-8d867354-0f64-4051-9300-61392d734354-c000.snappy.parquet,part-00000-8d867354-0f64-4051-9300-61392d734354-c000.snappy.parquet,2652,1737593262000
dbfs:/user/hive/warehouse/stg.db/products_stg/part-00001-41e0a0a0-4ef7-49f9-9ed9-bef55a28a9e2-c000.snappy.parquet,part-00001-41e0a0a0-4ef7-49f9-9ed9-bef55a28a9e2-c000.snappy.parquet,2670,1737593262000
dbfs:/user/hive/warehouse/stg.db/products_stg/part-00002-d6182139-8a64-4449-b1f0-a65cae042b1e-c000.snappy.parquet,part-00002-d6182139-8a64-4449-b1f0-a65cae042b1e-c000.snappy.parquet,2642,1737593262000
dbfs:/user/hive/warehouse/stg.db/products_stg/part-00003-d0ed4798-d1ef-460e-83e8-bea4080a610a-c000.snappy.parquet,part-00003-d0ed4798-d1ef-460e-83e8-bea4080a610a-c000.snappy.parquet,2485,1737593262000


(ONE-TIME EXECUTION FILE): Only used to build the Bz (bronze) tables as expected.
1. Confirm that the tables being put in the bz layer are exactly the tables expected.
2. (ONCE) Get the schema from each stage table and store it.
3. Use this extracted schema to create the table inside the bz-layer external location.
That's it.

-- Only the code below works

In [0]:
# This class automatically builds the Bronze layer during testing, using the schemas of the staging-layer tables.
# It DOES NOT POPULATE the Bronze layer tables; it only creates empty Delta tables inside the specified directories that will hold the Delta table data.
# Use the following values to see it execute:
# hive_loc: '/user/hive/warehouse/'
# stage_schema_name = 'stg'
# bz_loc: 'abfss://bronze@datalakeselectivaproject.dfs.core.windows.net'
# bz_schema_name = 'bz'

class buildTableSchema():
    """Create empty bronze-layer Delta tables that mirror the staging-table schemas.

    For every expected staging table the class reads the table's schema, writes an
    empty Delta dataset with that schema to the bronze location, and registers a
    table in the catalog on top of that location. No rows are copied.

    The methods that touch storage rely on the notebook-provided globals ``spark``
    and ``dbutils``.

    NOTE: the empty DataFrame is written with ``mode('overwrite')``, so running this
    against a bronze location that already holds data presumably replaces that data
    -- confirm before re-running.

    Parameters
    ----------
    hive_loc : str
        Base directory that contains the ``<stage_schema_name>.db`` directory.
        A trailing '/' is optional.
    stage_schema_name : str
        Staging schema name; also the suffix of the staging table names (e.g. 'stg').
    bz_loc : str
        Base location of the bronze container. A trailing '/' is optional.
    bz_schema_name : str
        Bronze schema name; also the suffix given to the bronze table names (e.g. 'bz').
    """

    # Catalog in which the bronze tables are registered.
    CATALOG = 'selectiva-project'

    # Column the bronze Delta tables are partitioned by. Every staging table must
    # contain it, otherwise the write in create_table_deltalog fails.
    PARTITION_COLUMN = 'loaded_ts'

    # Number of rows DESCRIBE returns in addition to one row per column.
    # Presumably the partition-information section for a single partition
    # column (two header rows + the partition column) -- TODO confirm.
    DESCRIBE_EXTRA_ROWS = 3

    def __init__(self, hive_loc, stage_schema_name, bz_loc, bz_schema_name):
        self.stage_tables_base_loc = hive_loc
        self.stage_schema_name     = stage_schema_name
        # Tolerate a base location given with or without a trailing '/'.
        base_loc                   = hive_loc if hive_loc.endswith('/') else hive_loc + '/'
        self.stage_location        = base_loc + self.stage_schema_name + '.db'

        self.bz_base_loc           = bz_loc
        self.bz_schema_name        = bz_schema_name

        # pre defined hardcoded table list
        self.dim_table_list = ['channels_stg', 'costs_stg', 'customers_stg', 'products_stg',
                               'promotions_stg', 'supplementary_demographics_stg', 'times_stg']
        self.fact_table_list = ['sales_stg']

    @staticmethod
    def _strip_layer_suffix(table_name, layer_name):
        """Return ``table_name`` without a trailing ``_<layer_name>``; unchanged if absent."""
        suffix = '_' + layer_name
        if layer_name and table_name.endswith(suffix):
            return table_name[:-len(suffix)]
        return table_name

    def _bz_table_dir(self, target_table_name, target_schema_name=None):
        """Return the bronze directory for a target table, e.g. ``<bz_loc>/channels/``.

        The layer suffix is removed from the table name, and a trailing '/' on the
        configured bronze location is ignored so the path never contains '//'.
        """
        layer = self.bz_schema_name if target_schema_name is None else target_schema_name
        base_name = self._strip_layer_suffix(target_table_name, layer)
        return self.bz_base_loc.rstrip('/') + '/' + base_name + '/'

    def get_stage_table_list(self):
        """Return the entry names found under the staging location.

        Returns
        -------
        list of str
            Names in the order ``dbutils.fs.ls`` lists them, with the trailing
            directory '/' removed.
        """
        return [entry.name.split('/')[0] for entry in dbutils.fs.ls(self.stage_location)]

    def check_stage_table_list(self, stg_table_list):
        """Check that the staging directory holds exactly the expected tables.

        Parameters
        ----------
        stg_table_list : list of str
            Table names found in the staging directory.

        Returns
        -------
        bool
            True when the names match the hard-coded dimension + fact lists
            (same count, same set of names); otherwise False, after printing
            what was found and which names are missing or unexpected.
        """
        expected = self.dim_table_list + self.fact_table_list
        if len(expected) == len(stg_table_list) and set(expected) == set(stg_table_list):
            return True

        print(f"Unexpected Number of stage tables: {len(stg_table_list)}\nStage table list: {stg_table_list}")
        missing    = sorted(set(expected) - set(stg_table_list))
        unexpected = sorted(set(stg_table_list) - set(expected))
        if missing:
            print(f"Missing stage tables: {missing}")
        if unexpected:
            print(f"Unexpected stage tables: {unexpected}")
        return False

    def build_table(self, table_list):
        """Build one empty bronze table per staging table.

        For each staging table:
          * derive the target table name (staging suffix replaced by the bronze suffix)
          * extract the schema from the staging table
          * create an empty DataFrame with that schema
          * write it in Delta format to the target table directory
          * create a catalog table on top of that directory
          * verify via DESCRIBE that the created table has the expected column count

        Processing stops at the first table whose column count does not match.

        Parameters
        ----------
        table_list : list of str
            Staging table names to process.
        """
        for table in table_list:
            tgt_table = self._strip_layer_suffix(table, self.stage_schema_name) + '_' + self.bz_schema_name
            print(f"Processing table: {table}, target table: {tgt_table}")

            schema    = self.extract_schema(table)
            e_df      = self.apply_schema_to_df(schema)
            col_count = len(e_df.columns)
            self.create_table_deltalog(tgt_table, e_df)
            self.recreate_table_metadata(tgt_table, self.bz_schema_name)

            # Count once: every count() triggers another DESCRIBE job.
            desc_rows = spark.sql(f"""DESCRIBE `{self.CATALOG}`.{self.bz_schema_name}.{tgt_table}""").count()
            if desc_rows != col_count + self.DESCRIBE_EXTRA_ROWS:
                print(f"Table {tgt_table} created BUT SOMETHING WENT WRONG WITH SCHEMA!!")
                # Report columns, not raw DESCRIBE rows, so both numbers are comparable.
                print(f"column count expected: {col_count} but got {desc_rows - self.DESCRIBE_EXTRA_ROWS}")
                break
            else:
                print(f"Table {tgt_table} created successfully!!")
                print('\n')

    def extract_schema(self, table_name):
        """Return the schema of a staging Delta table.

        Parameters
        ----------
        table_name : str
            Staging table (directory) name under the staging location.

        Returns
        -------
        The ``schema`` of the DataFrame loaded from the table's Delta files.
        """
        dlt_loc = self.stage_location + '/' + table_name
        print(f"\tExtracting the schema from the '{table_name}' table")
        # Delta takes the schema from its transaction log; the CSV-style
        # 'header' / 'inferSchema' options do not apply to this format.
        return spark.read.format('delta').load(dlt_loc).schema

    def apply_schema_to_df(self, schema):
        """Return an empty DataFrame that carries the given schema."""
        print(f"\tApplying schema to Empty dataframe and returning the dataframe")
        return spark.createDataFrame([], schema)

    def create_table_deltalog(self, table, df):
        """Write the empty DataFrame as a Delta dataset to the table's bronze directory.

        Parameters
        ----------
        table : str
            Target (bronze) table name; its layer suffix is dropped to form the directory.
        df : DataFrame
            Empty DataFrame with the extracted schema.

        Raises
        ------
        Exception
            Any error raised by the write is reported and re-raised, so that no
            catalog table is created on top of a location that was never written.
        """
        loc = self._bz_table_dir(table)
        print(f"\tWriting Empty df with Extracted schema to {loc}")
        try:
            df.write.format('delta').mode('overwrite').partitionBy(self.PARTITION_COLUMN).save(loc)
        except Exception:
            print(f"Error creating table {table} at {loc}.")
            raise

    def recreate_table_metadata(self, target_table_name, target_schema_name):
        """Create the external table on top of the Delta data in the bronze location.

        Uses CREATE TABLE IF NOT EXISTS, so an already registered table is left as is.

        Parameters
        ----------
        target_table_name : str
            Bronze table name.
        target_schema_name : str
            Schema in which the table is created; also the suffix removed from the
            table name to find its directory.
        """
        target_dir = self._bz_table_dir(target_table_name, target_schema_name)
        print(f"\tCreating the bronze table on top of delta file location in bronze layer\n\tTarget table:{target_table_name}\n\tTarget table dir:{target_dir}")
        spark.sql(f"""
                  CREATE TABLE IF NOT EXISTS `{self.CATALOG}`.{target_schema_name}.{target_table_name}
                  USING DELTA 
                  LOCATION '{target_dir}'
                    """)

    def show(self):
        """Print the constructor arguments (debugging aid)."""
        print(f"Passed parameters are: {self.stage_tables_base_loc}\n{self.stage_schema_name}\n{self.bz_base_loc}\n{self.bz_schema_name}")

    def process(self):
        """Driver: validate the staging table list, then build the bronze tables."""
        print(f"Creating Empty {self.bz_schema_name} tables with fixed Schemas")
        list_stg_tables = self.get_stage_table_list()
        if self.check_stage_table_list(list_stg_tables):
            print("Stage table number and names as expected!")
            self.build_table(list_stg_tables)   # Call to the main function to build the bronze layer tables
        else:
            print("Something went wrong with the stage table list.")
    
        

In [0]:
# Configure the builder with the staging and bronze locations, then run the one-time build.
bd = buildTableSchema(
    hive_loc='/user/hive/warehouse/',
    stage_schema_name='stg',
    bz_loc='abfss://bronze@datalakeselectivaproject.dfs.core.windows.net',
    bz_schema_name='bz',
)
bd.process()

Creating Empty bz tables with fixed Schemas
Stage table number and names as expected!
Processing table: channels_stg, target table: channels_bz
	Extracting the schema from the 'channels_stg' table
	Applying schema to Empty dataframe and returning the dataframe
	Writing Empty df with Extracted schema to abfss://bronze@datalakeselectivaproject.dfs.core.windows.net/channels/
	Creating the bronze table on top of delta file location in bronze layer
	Target table:channels_bz
	Target table dir:abfss://bronze@datalakeselectivaproject.dfs.core.windows.net/channels/
Table channels_bz created successfully!!


Processing table: costs_stg, target table: costs_bz
	Extracting the schema from the 'costs_stg' table
	Applying schema to Empty dataframe and returning the dataframe
	Writing Empty df with Extracted schema to abfss://bronze@datalakeselectivaproject.dfs.core.windows.net/costs/
	Creating the bronze table on top of delta file location in bronze layer
	Target table:costs_bz
	Target table dir:abf