### This notebook:
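- reads the raw CRM and ERP CSV files
- applies an explicitly defined schema to each file (no schema inference)
- writes each file to a Delta table in the `bronze` schema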


### Old SQL -> New Single Notebook
```
ddl_bronze.sql ----------|
                         |---------> notebooks/bronze/01_bronze_ingestion.py
proc_load_bronze.sql ----|
```

Step 1: Setup

In [0]:
# Databricks notebook source
# ---------------------------------------
# Bronze Layer Ingestion
# Loads raw CRM + ERP CSV files into Delta Bronze tables
# ---------------------------------------


# Volume directory that holds the raw CSV files read later in this notebook.
base_path = "/Volumes/datawarehouse/google_drive/project_volume"

# Session setup statements, executed in this order: the catalog is selected
# first, then the schema is created if it does not already exist.
use_catalog_sql = "USE CATALOG datawarehouse"
create_schema_sql = "CREATE SCHEMA IF NOT EXISTS bronze"

spark.sql(use_catalog_sql)
spark.sql(create_schema_sql)


DataFrame[]

Step 2: Define Schema

In [0]:
from pyspark.sql.types import *


def _build_schema(*columns):
    """Return a StructType with one StructField per (name, data_type) pair, in the order given."""
    return StructType([StructField(name, data_type) for name, data_type in columns])


# Columns of cust_info.csv (loaded into bronze.crm_cust_info).
cust_schema = _build_schema(
    ("cst_id", IntegerType()),
    ("cst_key", StringType()),
    ("cst_firstname", StringType()),
    ("cst_lastname", StringType()),
    ("cst_marital_status", StringType()),
    ("cst_gndr", StringType()),
    ("cst_create_date", DateType()),
)

# Columns of prd_info.csv (loaded into bronze.crm_prd_info).
prd_schema = _build_schema(
    ("prd_id", IntegerType()),
    ("prd_key", StringType()),
    ("prd_nm", StringType()),
    ("prd_cost", IntegerType()),
    ("prd_line", StringType()),
    ("prd_start_dt", DateType()),
    ("prd_end_dt", DateType()),
)

# Columns of sales_details.csv (loaded into bronze.crm_sales_details).
# The three *_dt columns are declared as integers, not dates
# (presumably numeric date codes in the raw file -- TODO confirm).
sales_schema = _build_schema(
    ("sls_ord_num", StringType()),
    ("sls_prd_key", StringType()),
    ("sls_cust_id", IntegerType()),
    ("sls_order_dt", IntegerType()),
    ("sls_ship_dt", IntegerType()),
    ("sls_due_dt", IntegerType()),
    ("sls_sales", IntegerType()),
    ("sls_quantity", IntegerType()),
    ("sls_price", IntegerType()),
)

# Columns of PX_CAT_G1V2.csv (loaded into bronze.erp_px_cat_g1v2).
erp_px_cat_schema = _build_schema(
    ("ID", StringType()),
    ("CAT", StringType()),
    ("SUBCAT", StringType()),
    ("MAINTENANCE", StringType()),
)

# Columns of LOC_A101.csv (loaded into bronze.erp_loc_a101).
erp_loc_schema = _build_schema(
    ("CID", StringType()),
    ("CNTRY", StringType()),
)

# Columns of CUST_AZ12.csv (loaded into bronze.erp_cust_az12).
erp_cust_schema = _build_schema(
    ("CID", StringType()),
    ("BDATE", DateType()),
    ("GEN", StringType()),
)



In [0]:
# Ingestion manifest: one entry per source file, giving the CSV file name,
# the target table name, and the schema to apply when reading the file.
tables = [
    {"file": csv_name, "table": table_name, "schema": table_schema}
    for csv_name, table_name, table_schema in (
        ("cust_info.csv", "bronze.crm_cust_info", cust_schema),
        ("prd_info.csv", "bronze.crm_prd_info", prd_schema),
        ("sales_details.csv", "bronze.crm_sales_details", sales_schema),
        ("PX_CAT_G1V2.csv", "bronze.erp_px_cat_g1v2", erp_px_cat_schema),
        ("LOC_A101.csv", "bronze.erp_loc_a101", erp_loc_schema),
        ("CUST_AZ12.csv", "bronze.erp_cust_az12", erp_cust_schema),
    )
]


Step 3: Read and then Write to Delta Tables

In [0]:
# Load every manifest entry: read its CSV file with the declared schema, then
# overwrite the corresponding Delta table.
for table_spec in tables:
    # The first line of each file is treated as a header; the column types come
    # from the entry's explicit schema rather than from inference.
    reader = spark.read.option("header", True).schema(table_spec["schema"])
    df = reader.csv(f"{base_path}/{table_spec['file']}")

    # "overwrite" replaces whatever the target table held before this run.
    target_table = table_spec["table"]
    writer = df.write.format("delta").mode("overwrite")
    writer.saveAsTable(target_table)

    print(f"Loaded {target_table}")

Loaded bronze.crm_cust_info
Loaded bronze.crm_prd_info
Loaded bronze.crm_sales_details
Loaded bronze.erp_px_cat_g1v2
Loaded bronze.erp_loc_a101
Loaded bronze.erp_cust_az12
