In [0]:
from pyspark.sql.functions import col, expr, current_timestamp

In [0]:
# Bronze input: the departments table in the raw schema of employee_catalog.
departments_bronze_df = (
    spark.read
    .table("employee_catalog.raw.departments")
)

In [0]:
# Inspect the raw column layout. At this stage the frame still carries the
# Airbyte / CDC metadata columns (_airbyte_*, _ab_cdc_*) next to the business
# columns dept_no and dept_name.
departments_bronze_df.printSchema()

root
 |-- _airbyte_ab_id: string (nullable = true)
 |-- _airbyte_emitted_at: long (nullable = true)
 |-- _ab_cdc_cursor: string (nullable = true)
 |-- _ab_cdc_deleted_at: string (nullable = true)
 |-- _ab_cdc_log_file: string (nullable = true)
 |-- _ab_cdc_log_pos: integer (nullable = true)
 |-- _ab_cdc_updated_at: timestamp (nullable = true)
 |-- dept_name: string (nullable = true)
 |-- dept_no: string (nullable = true)
 |-- _file_name: string (nullable = true)
 |-- load_date: timestamp (nullable = true)



In [0]:
# Preview a bounded sample of the raw table rather than rendering every row;
# this keeps the notebook output small however large the raw table grows.
display(departments_bronze_df.limit(10))

_airbyte_ab_id,_airbyte_emitted_at,_ab_cdc_cursor,_ab_cdc_deleted_at,_ab_cdc_log_file,_ab_cdc_log_pos,_ab_cdc_updated_at,dept_name,dept_no,_file_name,load_date
a479f4cb-bb6b-4a2e-b777-754ac647fd75,1738956715421,,,mysql-bin.000003,66100231,2025-02-07T19:31:55.421Z,Marketing,d001,dbfs:/mnt/raw/departments/2025_02_07_1738956715753_0.csv,2025-02-10T13:33:09.111Z
f55a7bf7-e8b8-4213-9b41-3178d162b61f,1738956715421,,,mysql-bin.000003,66100231,2025-02-07T19:31:55.421Z,Finance,d002,dbfs:/mnt/raw/departments/2025_02_07_1738956715753_0.csv,2025-02-10T13:33:09.111Z
b1656ee6-0bdc-4611-b33a-3f4244a41951,1738956715421,,,mysql-bin.000003,66100231,2025-02-07T19:31:55.421Z,Human Resources,d003,dbfs:/mnt/raw/departments/2025_02_07_1738956715753_0.csv,2025-02-10T13:33:09.111Z
1234c58d-cc06-44a4-a011-a69622a96cc2,1738956715421,,,mysql-bin.000003,66100231,2025-02-07T19:31:55.421Z,Production,d004,dbfs:/mnt/raw/departments/2025_02_07_1738956715753_0.csv,2025-02-10T13:33:09.111Z
958720fe-e22e-4aa9-ba63-272d9462f987,1738956715421,,,mysql-bin.000003,66100231,2025-02-07T19:31:55.421Z,Development,d005,dbfs:/mnt/raw/departments/2025_02_07_1738956715753_0.csv,2025-02-10T13:33:09.111Z
5ebac472-7654-4e2b-929d-bed5c402f001,1738956715421,,,mysql-bin.000003,66100231,2025-02-07T19:31:55.421Z,Quality Management,d006,dbfs:/mnt/raw/departments/2025_02_07_1738956715753_0.csv,2025-02-10T13:33:09.111Z
6f734bc5-1971-4757-8d62-62ae58039dad,1738956715421,,,mysql-bin.000003,66100231,2025-02-07T19:31:55.421Z,Sales,d007,dbfs:/mnt/raw/departments/2025_02_07_1738956715753_0.csv,2025-02-10T13:33:09.111Z
c9bfb984-9666-4440-a281-337e706197a8,1738956715421,,,mysql-bin.000003,66100231,2025-02-07T19:31:55.421Z,Research,d008,dbfs:/mnt/raw/departments/2025_02_07_1738956715753_0.csv,2025-02-10T13:33:09.111Z
398150c6-f08a-4f79-b40b-b509abde3e62,1738956715421,,,mysql-bin.000003,66100231,2025-02-07T19:31:55.421Z,Customer Service,d009,dbfs:/mnt/raw/departments/2025_02_07_1738956715753_0.csv,2025-02-10T13:33:09.111Z


In [0]:
# Shape the raw CDC feed into the silver departments layout
# (dept_no, dept_name, updated_at, load_date).
#
# The raw table is a change feed, so before projecting the business columns we:
#   1. keep only the most recent change per department key, so the output has
#      one row per dept_no even when a department was updated several times;
#   2. drop keys whose most recent change is a delete (CDC tombstone);
#   3. drop rows whose key is NULL or blank after trimming (TRIM turns a
#      whitespace-only key into '', which an IS NOT NULL check alone lets through).
#
# NOTE: this cell rebinds departments_bronze_df to the transformed frame, which
# no longer has the _ab_cdc_* columns. Re-run the cell that reads the raw table
# before re-running this one.
_latest_change_rank = (
    "ROW_NUMBER() OVER ("
    "PARTITION BY TRIM(dept_no) "
    "ORDER BY _ab_cdc_updated_at DESC, _airbyte_emitted_at DESC)"
)

departments_bronze_df = (
    departments_bronze_df
    # Rank every change of a department, newest first.
    .withColumn("_row_rank", expr(_latest_change_rank))
    .where(col("_row_rank") == 1)
    # A populated _ab_cdc_deleted_at marks a row deleted at the source.
    .where(expr("_ab_cdc_deleted_at IS NULL OR TRIM(_ab_cdc_deleted_at) = ''"))
    .select(
        expr("TRIM(dept_no)").alias("dept_no"),
        expr("TRIM(dept_name)").alias("dept_name"),
        col("_ab_cdc_updated_at").alias("updated_at"),
    )
    # Stamp the rows with the time of this load.
    .withColumn("load_date", current_timestamp())
    .where(col("dept_no").isNotNull() & (col("dept_no") != ""))
)

In [0]:
# Spot-check the transformed rows (at most ten of them).
transformed_preview = departments_bronze_df.limit(10)
display(transformed_preview)

dept_no,dept_name,updated_at,load_date
d001,Marketing,2025-02-07T19:31:55.421Z,2025-02-10T15:57:27.644Z
d002,Finance,2025-02-07T19:31:55.421Z,2025-02-10T15:57:27.644Z
d003,Human Resources,2025-02-07T19:31:55.421Z,2025-02-10T15:57:27.644Z
d004,Production,2025-02-07T19:31:55.421Z,2025-02-10T15:57:27.644Z
d005,Development,2025-02-07T19:31:55.421Z,2025-02-10T15:57:27.644Z
d006,Quality Management,2025-02-07T19:31:55.421Z,2025-02-10T15:57:27.644Z
d007,Sales,2025-02-07T19:31:55.421Z,2025-02-10T15:57:27.644Z
d008,Research,2025-02-07T19:31:55.421Z,2025-02-10T15:57:27.644Z
d009,Customer Service,2025-02-07T19:31:55.421Z,2025-02-10T15:57:27.644Z


In [0]:
# Ensure the target schema exists; IF NOT EXISTS makes this safe to re-run.
# (Plain string literal: the statement has no placeholders, so no f-string is needed.)
spark.sql("CREATE SCHEMA IF NOT EXISTS employee_catalog.silver")

DataFrame[]

In [0]:
# Persist the transformed departments frame to the silver schema.
# "overwrite" mode replaces whatever the table held before.
silver_writer = departments_bronze_df.write.mode("overwrite")
silver_writer.saveAsTable("employee_catalog.silver.departments")

In [0]:
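# DRAFT (commented out, not executed): upsert raw_employees into silver_employees, keyed on emp_no.
# NOTE(review): the MATCHED branch does not update birth_date although the INSERT branch sets it;
# confirm whether that omission is intended.
# MERGE INTO silver_employees AS target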
# USING (
#     SELECT emp_no, birth_date, first_name, last_name, gender, hire_date, _airbyte_emitted_at AS updated_at
#     FROM raw_employees
# ) AS source
# ON target.emp_no = source.emp_no
# WHEN MATCHED THEN 
#     UPDATE SET 
#         target.first_name = source.first_name,
#         target.last_name = source.last_name,
#         target.gender = source.gender,
#         target.hire_date = source.hire_date,
#         target.updated_at = source.updated_at
# WHEN NOT MATCHED THEN 
#     INSERT (emp_no, birth_date, first_name, last_name, gender, hire_date, updated_at)
#     VALUES (source.emp_no, source.birth_date, source.first_name, source.last_name, source.gender, source.hire_date, source.updated_at);


In [0]:
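# DRAFT (commented out, not executed): history-keeping merge for salaries using
# effective_from / effective_to / is_current.
# NOTE(review): a source row that matches the current record takes the MATCHED branch, which only
# closes the old record; the new salary row is never inserted by this statement. A second
# insert step (or a staged union in the USING clause) would be needed.
# MERGE INTO silver_salaries AS target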
# USING (
#     SELECT emp_no, salary, from_date AS effective_from, to_date AS effective_to
#     FROM raw_salaries
# ) AS source
# ON target.emp_no = source.emp_no 
#    AND target.is_current = 'Y'  -- Only update current records
#    AND source.effective_from > target.effective_from  -- Ensure new salary is later
# WHEN MATCHED AND source.salary <> target.salary THEN
#     -- Mark old record as historical
#     UPDATE SET 
#         target.effective_to = source.effective_from - INTERVAL '1 DAY',
#         target.is_current = 'N'
# WHEN NOT MATCHED THEN 
#     -- Insert new record
#     INSERT (emp_no, salary, effective_from, effective_to, is_current)
#     VALUES (source.emp_no, source.salary, source.effective_from, source.effective_to, 
#             CASE WHEN source.effective_to IS NULL THEN 'Y' ELSE 'N' END);


In [0]:
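# DRAFT (commented out, not executed): history-keeping merge for titles using
# effective_from / effective_to / is_current.
# NOTE(review): a source row that matches the current record takes the MATCHED branch, which only
# closes the old record; the new title row is never inserted by this statement. A second
# insert step (or a staged union in the USING clause) would be needed.
# MERGE INTO silver_titles AS target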
# USING (
#     SELECT emp_no, title, from_date AS effective_from, to_date AS effective_to
#     FROM raw_titles
# ) AS source
# ON target.emp_no = source.emp_no
#    AND target.is_current = 'Y'
#    AND source.effective_from > target.effective_from
# WHEN MATCHED AND source.title <> target.title THEN
#     -- Close old record
#     UPDATE SET 
#         target.effective_to = source.effective_from - INTERVAL '1 DAY',
#         target.is_current = 'N'
# WHEN NOT MATCHED THEN 
#     -- Insert new title record
#     INSERT (emp_no, title, effective_from, effective_to, is_current)
#     VALUES (source.emp_no, source.title, source.effective_from, source.effective_to, 
#             CASE WHEN source.effective_to IS NULL THEN 'Y' ELSE 'N' END);
