In [0]:
# Notebook parameter: the environment name. Later cells use it as the prefix of
# the target catalog (`{env}_catalog`). The widget defaults to an empty string.
dbutils.widgets.text(
    name="env",
    defaultValue="",
    label="Enter the environment in lower case",
)
env = dbutils.widgets.get("env")

In [0]:
%run "./commons"

In [0]:
# One-time reset: uncomment and run the lines below to delete the bronze Delta data
# and the streaming checkpoints, so the streams start fresh and table changes are reflected.
# dbutils.fs.rm(f"{bronze}/raw_customer", True)
# dbutils.fs.rm(f"{bronze}/raw_employee", True)
# dbutils.fs.rm(f"{bronze}/raw_orderdetails", True)
# dbutils.fs.rm(f"{bronze}/raw_orders", True)
# dbutils.fs.rm(f"{bronze}/raw_product", True)
# dbutils.fs.rm(f"{bronze}/raw_region", True)
# dbutils.fs.rm(f"{bronze}/raw_warehouse", True)

# dbutils.fs.rm(f"{checkpoint}/rawCustomerLoad", True)
# dbutils.fs.rm(f"{checkpoint}/rawEmployeeLoad", True)
# dbutils.fs.rm(f"{checkpoint}/rawOrderDetailsLoad", True)
# dbutils.fs.rm(f"{checkpoint}/rawOrdersLoad", True)
# dbutils.fs.rm(f"{checkpoint}/rawProductLoad", True)
# dbutils.fs.rm(f"{checkpoint}/rawRegionLoad", True)
# dbutils.fs.rm(f"{checkpoint}/rawWarehouseLoad", True)

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

## Read raw data

In [0]:
def read_Product_Data():
    """Define the streaming read of raw product CSV files from the landing area.

    The stream uses the ``cloudFiles`` source (Databricks Auto Loader) with a
    header row and an explicit schema. No query is started here; the caller is
    expected to attach a sink to the returned DataFrame.

    ``spark``, ``checkpoint`` and ``landing`` are not defined in this notebook;
    presumably they come from the Databricks runtime and ``%run "./commons"``.

    Returns:
        The streaming DataFrame loaded from ``landing + "/raw_product/"``.
    """
    print("Reading Raw Product Data: ", end='')

    # (column name, Spark type) pairs, in schema order.
    product_columns = [
        ("ProductID", StringType()),
        ("ProductName", StringType()),
        ("CategoryName", StringType()),
        ("ProductDescription", StringType()),
        ("ProductStandardCost", DoubleType()),
        ("ProductListPrice", DoubleType()),
        ("Profit", DoubleType()),
    ]
    product_schema = StructType(
        [StructField(col_name, col_type) for col_name, col_type in product_columns]
    )

    reader = spark.readStream.format("cloudFiles")
    reader = reader.option("cloudFiles.format", "csv")
    reader = reader.option("cloudFiles.schemaLocation", f"{checkpoint}/rawProductLoad/schemaInfer")
    reader = reader.option("header", "true")
    product_stream = reader.schema(product_schema).load(landing + "/raw_product/")

    print("Success!")
    return product_stream

In [0]:
def read_Orders_Data():
    """Define the streaming read of raw orders CSV files from the landing area.

    The stream uses the ``cloudFiles`` source (Databricks Auto Loader) with a
    header row and an explicit schema. ``OrderDate`` is read as a string. No
    query is started here; the caller attaches a sink to the returned DataFrame.

    ``spark``, ``checkpoint`` and ``landing`` are not defined in this notebook;
    presumably they come from the Databricks runtime and ``%run "./commons"``.

    Returns:
        The streaming DataFrame loaded from ``landing + "/raw_orders/"``.
    """
    print("Reading Raw Orders Data: ", end='')

    # (column name, Spark type) pairs, in schema order.
    orders_columns = [
        ("OrderID", IntegerType()),
        ("OrderDate", StringType()),
        ("CustomerID", IntegerType()),
    ]
    orders_schema = StructType(
        [StructField(col_name, col_type) for col_name, col_type in orders_columns]
    )

    reader = spark.readStream.format("cloudFiles")
    reader = reader.option("cloudFiles.format", "csv")
    reader = reader.option("cloudFiles.schemaLocation", f"{checkpoint}/rawOrdersLoad/schemaInfer")
    reader = reader.option("header", "true")
    orders_stream = reader.schema(orders_schema).load(landing + "/raw_orders/")

    print("Success!")
    return orders_stream

In [0]:
def read_Customer_Data():
    """Define the streaming read of raw customer CSV files from the landing area.

    The stream uses the ``cloudFiles`` source (Databricks Auto Loader) with a
    header row and an explicit schema. No query is started here; the caller
    attaches a sink to the returned DataFrame.

    ``spark``, ``checkpoint`` and ``landing`` are not defined in this notebook;
    presumably they come from the Databricks runtime and ``%run "./commons"``.

    Returns:
        The streaming DataFrame loaded from ``landing + "/raw_customer/"``.
    """
    print("Reading Raw Customer Data: ", end='')

    # (column name, Spark type) pairs, in schema order.
    customer_columns = [
        ("CustomerID", IntegerType()),
        ("CustomerName", StringType()),
        ("CustomerEmail", StringType()),
        ("CustomerPhone", StringType()),
        ("CustomerAddress", StringType()),
        ("CustomerCreditLimit", IntegerType()),
    ]
    customer_schema = StructType(
        [StructField(col_name, col_type) for col_name, col_type in customer_columns]
    )

    reader = spark.readStream.format("cloudFiles")
    reader = reader.option("cloudFiles.format", "csv")
    reader = reader.option("cloudFiles.schemaLocation", f"{checkpoint}/rawCustomerLoad/schemaInfer")
    reader = reader.option("header", "true")
    customer_stream = reader.schema(customer_schema).load(landing + "/raw_customer/")

    print("Success!")
    return customer_stream

In [0]:
def read_Warehouse_Data():
    """Define the streaming read of raw warehouse CSV files from the landing area.

    The stream uses the ``cloudFiles`` source (Databricks Auto Loader) with a
    header row and an explicit schema. No query is started here; the caller
    attaches a sink to the returned DataFrame.

    ``spark``, ``checkpoint`` and ``landing`` are not defined in this notebook;
    presumably they come from the Databricks runtime and ``%run "./commons"``.

    Returns:
        The streaming DataFrame loaded from ``landing + "/raw_warehouse/"``.
    """
    print("Reading Raw Warehouse Data: ", end='')

    # (column name, Spark type) pairs, in schema order.
    warehouse_columns = [
        ("WarehouseID", IntegerType()),
        ("WarehouseName", StringType()),
        ("WarehouseAddress", StringType()),
        ("RegionID", IntegerType()),
    ]
    warehouse_schema = StructType(
        [StructField(col_name, col_type) for col_name, col_type in warehouse_columns]
    )

    reader = spark.readStream.format("cloudFiles")
    reader = reader.option("cloudFiles.format", "csv")
    reader = reader.option("cloudFiles.schemaLocation", f"{checkpoint}/rawWarehouseLoad/schemaInfer")
    reader = reader.option("header", "true")
    warehouse_stream = reader.schema(warehouse_schema).load(landing + "/raw_warehouse/")

    print("Success!")
    return warehouse_stream

In [0]:
def read_Employee_Data():
    """Define the streaming read of raw employee CSV files from the landing area.

    The stream uses the ``cloudFiles`` source (Databricks Auto Loader) with a
    header row and an explicit schema. ``EmployeeHireDate`` is read as a string.
    No query is started here; the caller attaches a sink to the returned
    DataFrame.

    ``spark``, ``checkpoint`` and ``landing`` are not defined in this notebook;
    presumably they come from the Databricks runtime and ``%run "./commons"``.

    Returns:
        The streaming DataFrame loaded from ``landing + "/raw_employee/"``.
    """
    print("Reading Raw Employee Data: ", end='')

    # (column name, Spark type) pairs, in schema order.
    employee_columns = [
        ("EmployeeID", IntegerType()),
        ("EmployeeName", StringType()),
        ("EmployeeEmail", StringType()),
        ("EmployeePhone", StringType()),
        ("EmployeeHireDate", StringType()),
        ("EmployeeJobTitle", StringType()),
        ("WarehouseID", IntegerType()),
    ]
    employee_schema = StructType(
        [StructField(col_name, col_type) for col_name, col_type in employee_columns]
    )

    reader = spark.readStream.format("cloudFiles")
    reader = reader.option("cloudFiles.format", "csv")
    reader = reader.option("cloudFiles.schemaLocation", f"{checkpoint}/rawEmployeeLoad/schemaInfer")
    reader = reader.option("header", "true")
    employee_stream = reader.schema(employee_schema).load(landing + "/raw_employee/")

    print("Success!")
    return employee_stream

In [0]:
def read_Region_Data():
    """Define the streaming read of raw region CSV files from the landing area.

    The stream uses the ``cloudFiles`` source (Databricks Auto Loader) with a
    header row and an explicit schema. ``PostalCode`` is read as a string. No
    query is started here; the caller attaches a sink to the returned DataFrame.

    ``spark``, ``checkpoint`` and ``landing`` are not defined in this notebook;
    presumably they come from the Databricks runtime and ``%run "./commons"``.

    Returns:
        The streaming DataFrame loaded from ``landing + "/raw_region/"``.
    """
    print("Reading Raw Region Data: ", end='')

    # (column name, Spark type) pairs, in schema order.
    region_columns = [
        ("RegionID", IntegerType()),
        ("RegionName", StringType()),
        ("CountryName", StringType()),
        ("State", StringType()),
        ("City", StringType()),
        ("PostalCode", StringType()),
    ]
    region_schema = StructType(
        [StructField(col_name, col_type) for col_name, col_type in region_columns]
    )

    reader = spark.readStream.format("cloudFiles")
    reader = reader.option("cloudFiles.format", "csv")
    reader = reader.option("cloudFiles.schemaLocation", f"{checkpoint}/rawRegionLoad/schemaInfer")
    reader = reader.option("header", "true")
    region_stream = reader.schema(region_schema).load(landing + "/raw_region/")

    print("Success!")
    return region_stream


In [0]:
def read_OrderDetails_Data():
    """Define the streaming read of raw order-details CSV files from the landing area.

    The stream uses the ``cloudFiles`` source (Databricks Auto Loader) with a
    header row and an explicit schema. No query is started here; the caller
    attaches a sink to the returned DataFrame.

    ``spark``, ``checkpoint`` and ``landing`` are not defined in this notebook;
    presumably they come from the Databricks runtime and ``%run "./commons"``.

    Returns:
        The streaming DataFrame loaded from ``landing + "/raw_orderdetails/"``.
    """
    print("Reading Raw OrderDetails Data: ", end='')

    # (column name, Spark type) pairs, in schema order.
    order_details_columns = [
        ("OrderDetailsID", IntegerType()),
        ("ProductID", StringType()),
        ("OrderItemQuantity", IntegerType()),
        ("PerUnitPrice", DoubleType()),
        ("OrderStatus", StringType()),
        ("OrderID", IntegerType()),
    ]
    order_details_schema = StructType(
        [StructField(col_name, col_type) for col_name, col_type in order_details_columns]
    )

    reader = spark.readStream.format("cloudFiles")
    reader = reader.option("cloudFiles.format", "csv")
    reader = reader.option("cloudFiles.schemaLocation", f"{checkpoint}/rawOrderDetailsLoad/schemaInfer")
    reader = reader.option("header", "true")
    order_details_stream = reader.schema(order_details_schema).load(landing + "/raw_orderdetails/")

    print("Success!")
    return order_details_stream

In [0]:
def write_to_bronze(streaming_df, environment, table_name, chk_subdir):
    """Append a streaming DataFrame to a bronze Delta table and wait for it to finish.

    Args:
        streaming_df: Streaming DataFrame to write.
        environment: Prefix of the target catalog, which is ``{environment}_catalog``.
        table_name: Table name inside the ``bronze`` schema; also used to build
            the streaming query name (``{table_name}_WriteStream``).
        chk_subdir: Sub-directory under ``checkpoint`` for this stream; the
            checkpoint is stored at ``{checkpoint}/{chk_subdir}/Checkpt``.

    The query is started with ``trigger(availableNow=True)`` and this function
    blocks on ``awaitTermination()`` before reporting success. ``checkpoint`` is
    not defined in this notebook; presumably it comes from ``%run "./commons"``.
    """
    target_table = f"`{environment}_catalog`.`bronze`.`{table_name}`"
    print(f"Writing data to {target_table} ... ", end='')

    writer = streaming_df.writeStream.format("delta")
    writer = writer.option("checkpointLocation", f"{checkpoint}/{chk_subdir}/Checkpt")
    writer = writer.outputMode("append")
    writer = writer.queryName(f"{table_name}_WriteStream")
    writer = writer.trigger(availableNow=True)

    # toTable starts the query; wait for it to terminate before reporting success.
    streaming_query = writer.toTable(target_table)
    streaming_query.awaitTermination()

    print("Success!")
    print("******************************")

In [0]:
# ---------- Read from landing ----------
# Build one streaming DataFrame per raw entity using the reader functions
# defined in the cells above. The df_* names are passed to write_to_bronze
# in a later cell, so they must keep these exact names.
df_product = read_Product_Data()
df_orders = read_Orders_Data()
df_customer = read_Customer_Data()
df_warehouse = read_Warehouse_Data()
df_employee = read_Employee_Data()
df_region = read_Region_Data()
df_orderdetails = read_OrderDetails_Data()

## Load raw data to bronze tables

In [0]:
# ---------- Write to bronze ----------
# One call per raw entity, run one after another; each call blocks until its
# streaming query has terminated. `env` is the value read from the notebook widget.
write_to_bronze(streaming_df=df_product, environment=env, table_name="raw_product", chk_subdir="rawProductLoad")
write_to_bronze(streaming_df=df_orders, environment=env, table_name="raw_orders", chk_subdir="rawOrdersLoad")
write_to_bronze(streaming_df=df_customer, environment=env, table_name="raw_customer", chk_subdir="rawCustomerLoad")
write_to_bronze(streaming_df=df_warehouse, environment=env, table_name="raw_warehouse", chk_subdir="rawWarehouseLoad")
write_to_bronze(streaming_df=df_employee, environment=env, table_name="raw_employee", chk_subdir="rawEmployeeLoad")
write_to_bronze(streaming_df=df_region, environment=env, table_name="raw_region", chk_subdir="rawRegionLoad")
write_to_bronze(streaming_df=df_orderdetails, environment=env, table_name="raw_orderdetails", chk_subdir="rawOrderDetailsLoad")

## Display sample data in bronze tables

In [0]:
# Show up to 10 rows from three of the bronze tables as a quick sanity check.
for _sample_table in ("raw_product", "raw_orders", "raw_orderdetails"):
    display(spark.sql(f"SELECT * FROM `{env}_catalog`.`bronze`.`{_sample_table}` LIMIT 10"))