In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType, TimestampType, FloatType
import pyspark.sql.functions as f

## Brands

In [0]:
#Step 1

# Catalog name used when building fully qualified table names for the writes.
catalog_name = 'e-commerce'

# Explicit schema for the brands data files, built one field at a time as
# add(name, type, nullable). Only brand_code is declared non-nullable.
brand_schema = (
    StructType()
    .add('brand_code', StringType(), False)
    .add('brand_name', StringType(), True)
    .add('category_code', StringType(), True)
)

In [0]:
#Step 2

# Glob matching every brands CSV file in the external volume
raw_data_path = '/Volumes/e-commerce/source_data/raw_data/ecomm-raw-data/brands/*.csv'

# Read the files with the explicit brands schema.
# The separator option key must be spelled 'delimiter'; the earlier 'delimeter'
# spelling is not a CSV reader option, so the setting never reached the reader.
df = (
    spark.read
    .option('header', 'true')
    .option('delimiter', ',')
    .schema(brand_schema)
    .csv(raw_data_path)
)

# Attach ingestion metadata: the originating file path and the load timestamp.
# NOTE(review): the category load in this notebook names its timestamp column
# '_ingested_at'; 'ingested_at' is kept here so the table schema does not change.
df = (
    df
    .withColumn('_source_file', f.col('_metadata.file_path'))
    .withColumn('ingested_at', f.current_timestamp())
)

# Preview the first few rows
display(df.limit(5))

In [0]:
#Step 3

# Persist the brands dataframe as a Delta table in the bronze schema of the catalog.
# 'overwrite' replaces the table contents on every run.
brands_table = f"`{catalog_name}`.bronze.brz_brands"

(
    df.write
    .format('delta')
    .mode('overwrite')
    .option('mergeSchema', 'true')
    .saveAsTable(brands_table)
)

## Category

In [0]:
# Explicit schema for the category files: add(name, type, nullable)
category_schema = (
    StructType()
    .add("category_code", StringType(), False)
    .add("category_name", StringType(), True)
)

# Target location (catalog: e-commerce, schema: bronze, table: brz_category)
catalog_name = "e-commerce"

# Glob matching every category CSV file in the volume
raw_data_path = '/Volumes/e-commerce/source_data/raw_data/ecomm-raw-data/category/*.csv'

# Read with the declared schema, then stamp each row with the load timestamp
# and the path of the file it came from
df_raw = (
    spark.read
    .option("header", "true")
    .option("delimiter", ",")
    .schema(category_schema)
    .csv(raw_data_path)
    .withColumn("_ingested_at", f.current_timestamp())
    .withColumn("_source_file", f.col("_metadata.file_path"))
)

# Write the raw data to the bronze layer, replacing the table contents
(
    df_raw.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable(f"`{catalog_name}`.bronze.brz_category")
)

## Products

In [0]:
# Schema for the columns read from the products CSV files.
# file_name and ingest_timestamp are deliberately not declared here: both are
# produced by the withColumn calls below, so they are not source fields. Leaving
# them out keeps this read schema to source columns only, like the other loads
# in this notebook; the resulting dataframe has the same columns, order and types.
# NOTE(review): weight_grams and length_cm are read as strings while width_cm and
# height_cm are floats - presumably the raw values are not clean numerics;
# confirm before casting downstream.
products_schema = StructType([
    StructField("product_id", StringType(), False),
    StructField("sku", StringType(), True),
    StructField("category_code", StringType(), True),
    StructField("brand_code", StringType(), True),
    StructField("color", StringType(), True),
    StructField("size", StringType(), True),
    StructField("material", StringType(), True),
    StructField("weight_grams", StringType(), True),
    StructField("length_cm", StringType(), True),
    StructField("width_cm", FloatType(), True),
    StructField("height_cm", FloatType(), True),
    StructField("rating_count", IntegerType(), True),
])

# Glob matching every products CSV file in the volume
raw_data_path = '/Volumes/e-commerce/source_data/raw_data/ecomm-raw-data/products/*.csv'

# Read with the declared schema, then append the ingestion metadata columns:
# the originating file path and the load timestamp
df = (
    spark.read
    .option("header", "true")
    .option("delimiter", ",")
    .schema(products_schema)
    .csv(raw_data_path)
    .withColumn("file_name", f.col("_metadata.file_path"))
    .withColumn("ingest_timestamp", f.current_timestamp())
)

# Overwrite the bronze products table. catalog_name is defined in an earlier cell.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable(f"`{catalog_name}`.bronze.brz_products")
)

## Customers

In [0]:
# Schema for the customers files: every column is a string, and only
# customer_id is declared non-nullable.
customers_schema = StructType([
    StructField(column_name, StringType(), is_nullable)
    for column_name, is_nullable in [
        ("customer_id", False),
        ("phone", True),
        ("country_code", True),
        ("country", True),
        ("state", True),
    ]
])

# Glob matching every customers CSV file in the volume
raw_data_path = '/Volumes/e-commerce/source_data/raw_data/ecomm-raw-data/customers/*.csv'

# Step 1: read the CSV files with the declared schema
customers_reader = (
    spark.read
    .option("header", "true")
    .option("delimiter", ",")
    .schema(customers_schema)
)
df = customers_reader.csv(raw_data_path)

# Step 2: attach the originating file path and the load timestamp
df = df.withColumn("file_name", f.col("_metadata.file_path"))
df = df.withColumn("ingest_timestamp", f.current_timestamp())

# Step 3: overwrite the bronze customers table. catalog_name is defined in an earlier cell.
customers_writer = df.write.format("delta").mode("overwrite").option("mergeSchema", "true")
customers_writer.saveAsTable(f"`{catalog_name}`.bronze.brz_customers")


## Date

In [0]:
# Schema for the date files, built as add(name, type, nullable).
# The date column itself is read as a string; the numeric parts are integers.
data_schema = (
    StructType()
    .add("date", StringType(), True)
    .add("year", IntegerType(), True)
    .add("day_name", StringType(), True)
    .add("quarter", IntegerType(), True)
    .add("week_of_year", IntegerType(), True)
)

# Glob matching every date CSV file in the volume
raw_data_path = '/Volumes/e-commerce/source_data/raw_data/ecomm-raw-data/date/*.csv'

# Read with the declared schema
df = (
    spark.read
    .option("header", "true")
    .option("delimiter", ",")
    .schema(data_schema)
    .csv(raw_data_path)
)

# Stamp each row with the load timestamp and the path of its source file
df = (
    df
    .withColumn("_ingested_at", f.current_timestamp())
    .withColumn("_source_file", f.col("_metadata.file_path"))
)

# Overwrite the bronze calendar table. catalog_name is defined in an earlier cell.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable(f"`{catalog_name}`.bronze.brz_calendar")
)