### Purpose
This notebook will:  
1- Read the file from the landing zone  
2- Add control columns such as processing date, file name, etc.  
3- Move the data to a Delta table 'as is', always appending the new data  
4- Check and maintain the Delta table  

In [None]:
# Notebook parameters passed in by the Azure pipeline.
# Each widget is declared with a default value and then read back right away.
dbutils.widgets.text("_pipeline_run_id", "0478ce36-b895-48a0-8a08-1b10430247ca")
_pipeline_run_id = dbutils.widgets.get("_pipeline_run_id")

dbutils.widgets.text("_filename", "nybabynames.csv")
_filename = dbutils.widgets.get("_filename")

dbutils.widgets.text("_processing_date", "21-05-2024 18:39:52")
_processing_date = dbutils.widgets.get("_processing_date")

# Echo the received values, one per line, for troubleshooting.
print(_processing_date, _pipeline_run_id, _filename, sep="\n")

In [None]:
# Give Databricks access to the Data Lake: fetch the storage account name and
# key from the "dataLakeScope" secret scope, then register the key in the
# Spark configuration under the account-specific property name.
accountName = dbutils.secrets.get(scope="dataLakeScope", key="accountName")
accountKey = dbutils.secrets.get(scope="dataLakeScope", key="accountKey")

sparkProperty = f'fs.azure.account.key.{accountName}.dfs.core.windows.net'
spark.conf.set(sparkProperty, accountKey)

In [None]:
# Name of the bronze Delta table.
table_name = "bronze.new_york_baby_names"

# Storage locations in the Data Lake:
# - bronzeTarget: folder in the "bronze" container holding the Delta table files
# - landingSource: the incoming file in the "landing" container
bronzeTarget = f'abfss://bronze@{accountName}.dfs.core.windows.net/nybabynames'
landingSource = f'abfss://landing@{accountName}.dfs.core.windows.net/{_filename}'

In [None]:
# Read the CSV file from the landing zone of the Data Lake.
# The first line is treated as the header and column types are inferred.
gridDataDf = spark.read.option("inferSchema", "true").csv(path=landingSource, header=True)

# printSchema() has to be called; passing the bound method to display()
# never prints the schema.
gridDataDf.printSchema()



In [None]:
# NOTE(review): the wildcard import is kept as-is because other cells may rely
# on the names it brings into scope; this cell only needs lit, col and
# input_file_name.
from pyspark.sql.functions import *
from datetime import datetime

# Add audit columns to the data frame:
# 1. _processing_date: the processing time received as a notebook parameter,
#    parsed from the 'dd-mm-YYYY HH:MM:SS' format.
# 2. _pipeline_run_id: the pipeline run id received from ADF.
# 3. _input_filename: the landing file name. This is useful for debugging purposes.
# 4. _input_file_modification_date: the file modification time. This helps
#    identify the order of the data when the dataset doesn't have a
#    modification date of its own.
gridDataDf = (
    gridDataDf
    .withColumn("_processing_date", lit(datetime.strptime(_processing_date, '%d-%m-%Y %H:%M:%S')))
    .withColumn("_pipeline_run_id", lit(_pipeline_run_id))
    .withColumn("_input_filename", input_file_name())
    .withColumn("_input_file_modification_date", col("_metadata.file_modification_time"))
)

# printSchema() has to be called; passing the bound method to display()
# never prints the schema.
gridDataDf.printSchema()


In [None]:
from delta.tables import *

# Write the data frame to the bronze location in Delta format.
# When a Delta table already exists there the rows are appended to it;
# otherwise the write uses "overwrite" mode to save the data to bronze.
gridDataDf.write.format("delta").mode(
    "append" if DeltaTable.isDeltaTable(spark, bronzeTarget) else "overwrite"
).save(bronzeTarget)

In [None]:
# Register the schema and the external table over the bronze location,
# but only when they do not exist yet.
# spark.sql is used (rather than a %sql cell) so the table name and location
# can be injected through an f-string.
spark.sql("CREATE SCHEMA IF NOT EXISTS bronze")
spark.sql(
    f"CREATE EXTERNAL TABLE IF NOT EXISTS {table_name} "
    f"USING delta LOCATION '{bronzeTarget}'"
)

In [None]:
%sql
-- Not required by the pipeline: this cell inspects the table information as a learning exercise.

DESCRIBE EXTENDED bronze.new_york_baby_names

-- Things to look for in the output:
-- Location: stored in the storage account
-- Provider (format): Delta

In [None]:
%sql
-- Not required by the pipeline: this cell shows the transaction log of the
-- Delta table (one row per version) as a learning exercise.

SELECT
  version,
  operationMetrics,
  operationMetrics.numOutputRows,
  operationMetrics.numTargetRowsInserted,
  operationMetrics.numTargetRowsUpdated,
  operationMetrics.numTargetRowsDeleted
FROM (DESCRIBE HISTORY bronze.new_york_baby_names)


In [None]:
%sql

-- Check your result for testing. Do not do this in production!
-- SELECT *
-- FROM bronze.new_york_baby_names



In [None]:
# Maintenance for the Delta table
#
# To optimize the performance of the Delta table, we run two operations:
# 1. optimize(): compacts the small files used to store the data.
# 2. vacuum(): removes old versions of the data. It reduces storage overhead,
#    but it limits the versions we can go back to.
#
# Databricks recommends frequently running the OPTIMIZE command to compact small files.
# This operation does not remove the old files. To remove them, run the VACUUM command (https://learn.microsoft.com/en-us/azure/databricks/delta/vacuum).
# https://learn.microsoft.com/en-us/azure/databricks/delta/best-practices#--compact-files
#
# In Azure we could use predictive optimization (https://learn.microsoft.com/en-us/azure/databricks/optimizations/predictive-optimization#what-operations-does-predictive-optimization-run); it has prerequisites, such as a premium plan and managed tables (https://learn.microsoft.com/en-us/azure/databricks/optimizations/predictive-optimization#prerequisites-for-predictive-optimization)

gridDataDelta = DeltaTable.forName(spark, table_name)

# In this example, we optimize and vacuum at most once every 30 days.
# history(30) would return the last 30 *commits*, not the last 30 days, so the
# full history is filtered on the commit timestamp instead.
recentVacuums = gridDataDelta.history().filter(
    "operation = 'VACUUM START' "
    "AND timestamp >= current_timestamp() - INTERVAL 30 DAYS"
)

if recentVacuums.count() == 0:
    # optimize() only returns a builder; executeCompaction() actually runs it.
    gridDataDelta.optimize().executeCompaction()
    gridDataDelta.vacuum()  # default retention = 7 days