In [0]:
from pyspark.sql.functions import col, expr, current_timestamp, concat_ws

In [0]:
# Load the raw (bronze) employees table as the starting point for this notebook.
raw_employees_table = "employee_catalog.raw.employees"
employees_bronze_df = spark.read.table(raw_employees_table)

In [0]:
# Print the schema of the raw (bronze) DataFrame so the column names and types
# referenced by the later cells can be checked against the table.
employees_bronze_df.printSchema()

root
 |-- _airbyte_ab_id: string (nullable = true)
 |-- _airbyte_emitted_at: long (nullable = true)
 |-- _ab_cdc_cursor: string (nullable = true)
 |-- _ab_cdc_deleted_at: string (nullable = true)
 |-- _ab_cdc_log_file: string (nullable = true)
 |-- _ab_cdc_log_pos: integer (nullable = true)
 |-- _ab_cdc_updated_at: timestamp (nullable = true)
 |-- birth_date: date (nullable = true)
 |-- emp_no: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- hire_date: date (nullable = true)
 |-- last_name: string (nullable = true)
 |-- _file_name: string (nullable = true)
 |-- load_date: timestamp (nullable = true)



In [0]:
# Preview only a handful of raw rows: the table holds ~300k records, and the
# later preview cell already bounds its output the same way with limit(10).
display(employees_bronze_df.limit(10))

_airbyte_ab_id,_airbyte_emitted_at,_ab_cdc_cursor,_ab_cdc_deleted_at,_ab_cdc_log_file,_ab_cdc_log_pos,_ab_cdc_updated_at,birth_date,emp_no,first_name,gender,hire_date,last_name,_file_name,load_date
f5f20a8a-f84d-4031-9501-666d3330a49b,1738956715421,,,mysql-bin.000003,66100231,2025-02-07T19:31:55.421Z,1964-06-02,10002,Bezalel,F,1985-11-21,Simmel,dbfs:/mnt/raw/employees/2025_02_07_1738956719479_0.csv,2025-02-12T14:09:33.957Z
245a3cfb-87be-4c4a-a3f5-8ed8f346c5d0,1738956715421,,,mysql-bin.000003,66100231,2025-02-07T19:31:55.421Z,1959-12-03,10003,Parto,M,1986-08-28,Bamford,dbfs:/mnt/raw/employees/2025_02_07_1738956719479_0.csv,2025-02-12T14:09:33.957Z
85ab7106-8c45-4dc9-8581-906d5cd52325,1738956715421,,,mysql-bin.000003,66100231,2025-02-07T19:31:55.421Z,1954-05-01,10004,Chirstian,M,1986-12-01,Koblick,dbfs:/mnt/raw/employees/2025_02_07_1738956719479_0.csv,2025-02-12T14:09:33.957Z
79b17167-d683-4b53-abc1-e87199f3d74d,1738956715421,,,mysql-bin.000003,66100231,2025-02-07T19:31:55.421Z,1955-01-21,10005,Kyoichi,M,1989-09-12,Maliniak,dbfs:/mnt/raw/employees/2025_02_07_1738956719479_0.csv,2025-02-12T14:09:33.957Z
d73fb905-6dab-4001-8d62-e147dac9ba40,1738956715421,,,mysql-bin.000003,66100231,2025-02-07T19:31:55.421Z,1953-04-20,10006,Anneke,F,1989-06-02,Preusig,dbfs:/mnt/raw/employees/2025_02_07_1738956719479_0.csv,2025-02-12T14:09:33.957Z
37296ce6-ac42-4ba4-91fc-a85f5c33bd96,1738956715421,,,mysql-bin.000003,66100231,2025-02-07T19:31:55.421Z,1957-05-23,10007,Tzvetan,F,1989-02-10,Zielinski,dbfs:/mnt/raw/employees/2025_02_07_1738956719479_0.csv,2025-02-12T14:09:33.957Z
7c8b1ab2-e31a-4a2f-8cbd-37c0066e3bfe,1738956715421,,,mysql-bin.000003,66100231,2025-02-07T19:31:55.421Z,1958-02-19,10008,Saniya,M,1994-09-15,Kalloufi,dbfs:/mnt/raw/employees/2025_02_07_1738956719479_0.csv,2025-02-12T14:09:33.957Z
5a772d80-cb3f-4b1b-a7bc-4642c308bab4,1738956715421,,,mysql-bin.000003,66100231,2025-02-07T19:31:55.421Z,1952-04-19,10009,Sumant,F,1985-02-18,Peac,dbfs:/mnt/raw/employees/2025_02_07_1738956719479_0.csv,2025-02-12T14:09:33.957Z
3532fb24-4e3a-426c-b4fa-465d09a5dc2f,1738956715421,,,mysql-bin.000003,66100231,2025-02-07T19:31:55.421Z,1963-06-01,10010,Duangkaew,F,1989-08-24,Piveteau,dbfs:/mnt/raw/employees/2025_02_07_1738956719479_0.csv,2025-02-12T14:09:33.957Z
5e924ac7-75de-45bd-8e1b-8677b78d203a,1738956715421,,,mysql-bin.000003,66100231,2025-02-07T19:31:55.421Z,1953-11-07,10011,Mary,F,1990-01-22,Sluis,dbfs:/mnt/raw/employees/2025_02_07_1738956719479_0.csv,2025-02-12T14:09:33.957Z


In [0]:
# Row count of the raw table; left as the cell's last expression so it is echoed.
raw_row_count = employees_bronze_df.count()
raw_row_count

300024

In [0]:
# Build the cleaned employee projection: trim the name columns, derive
# full_name, stamp the load time, and drop rows missing emp_no or gender.
#
# NOTE: this rebinds `employees_bronze_df`. The select below drops
# `_ab_cdc_updated_at`, so re-running this cell without first re-running the
# cell that reads the raw table will fail on the missing column.
employees_bronze_df = (
    employees_bronze_df
    # Reject records without a key or gender before projecting.
    .where(col("emp_no").isNotNull() & col("gender").isNotNull())
    .select(
        # emp_no is an integer in the raw schema; TRIM() would implicitly cast
        # it to a string and change the key's type, so it is passed through.
        col("emp_no"),
        expr("TRIM(first_name)").alias("first_name"),
        expr("TRIM(last_name)").alias("last_name"),
        col("gender"),
        col("hire_date"),
        col("birth_date"),
        col("_ab_cdc_updated_at").alias("updated_at"),
    )
    # concat_ws skips NULL parts, so a missing name does not null out full_name.
    .withColumn("full_name", concat_ws(" ", col("first_name"), col("last_name")))
    .withColumn("load_date", current_timestamp())
)

In [0]:
# Show the first ten rows of the current employees DataFrame.
employees_preview_df = employees_bronze_df.limit(10)
display(employees_preview_df)

emp_no,first_name,last_name,gender,hire_date,birth_date,updated_at,full_name,load_date
10002,Bezalel,Simmel,F,1985-11-21,1964-06-02,2025-02-07T19:31:55.421Z,Bezalel Simmel,2025-02-12T15:13:59.049Z
10003,Parto,Bamford,M,1986-08-28,1959-12-03,2025-02-07T19:31:55.421Z,Parto Bamford,2025-02-12T15:13:59.049Z
10004,Chirstian,Koblick,M,1986-12-01,1954-05-01,2025-02-07T19:31:55.421Z,Chirstian Koblick,2025-02-12T15:13:59.049Z
10005,Kyoichi,Maliniak,M,1989-09-12,1955-01-21,2025-02-07T19:31:55.421Z,Kyoichi Maliniak,2025-02-12T15:13:59.049Z
10006,Anneke,Preusig,F,1989-06-02,1953-04-20,2025-02-07T19:31:55.421Z,Anneke Preusig,2025-02-12T15:13:59.049Z
10007,Tzvetan,Zielinski,F,1989-02-10,1957-05-23,2025-02-07T19:31:55.421Z,Tzvetan Zielinski,2025-02-12T15:13:59.049Z
10008,Saniya,Kalloufi,M,1994-09-15,1958-02-19,2025-02-07T19:31:55.421Z,Saniya Kalloufi,2025-02-12T15:13:59.049Z
10009,Sumant,Peac,F,1985-02-18,1952-04-19,2025-02-07T19:31:55.421Z,Sumant Peac,2025-02-12T15:13:59.049Z
10010,Duangkaew,Piveteau,F,1989-08-24,1963-06-01,2025-02-07T19:31:55.421Z,Duangkaew Piveteau,2025-02-12T15:13:59.049Z
10011,Mary,Sluis,F,1990-01-22,1953-11-07,2025-02-07T19:31:55.421Z,Mary Sluis,2025-02-12T15:13:59.049Z


In [0]:
# Upsert the raw employees into the silver table, keyed on emp_no.
#
# The source subquery applies the same cleansing rules as the DataFrame cell
# above (trimmed names, non-null emp_no and gender, concat_ws for full_name)
# so the rows written to silver match what the notebook previews.
#
# Why the source is shaped this way:
#   * Rows with a NULL emp_no can never satisfy the ON clause, so they would be
#     re-inserted on every run; they are filtered out.
#   * The raw table is a CDC feed and may carry several change rows for one
#     emp_no. Only the most recent row per emp_no is kept, so each target row
#     is matched by at most one source row and inserts are not duplicated.
#   * concat_ws skips NULL parts, whereas concat returns NULL if any part is NULL.
#
# Why the merge clauses are shaped this way:
#   * A matched row whose updated_at is unchanged needs no explicit branch: a
#     row that satisfies no WHEN MATCHED clause is left untouched. The former
#     `DO NOTHING` branch was therefore redundant (and is not an action listed
#     in the Delta MERGE grammar -- NOTE(review): confirm against your runtime).
#   * A changed updated_at means any column may have changed, so gender and
#     hire_date are refreshed along with the other attributes.
#
# NOTE(review): `_ab_cdc_deleted_at` is not consulted, so source-side deletes are
# not propagated to silver -- confirm whether that is intended.
query = """
MERGE INTO employee_catalog.silver.employees AS target
USING (
    SELECT
        emp_no,
        first_name,
        last_name,
        gender,
        hire_date,
        birth_date,
        updated_at,
        concat_ws(' ', first_name, last_name) AS full_name,
        load_date
    FROM (
        SELECT
            emp_no,
            TRIM(first_name) AS first_name,
            TRIM(last_name) AS last_name,
            gender,
            hire_date,
            birth_date,
            _ab_cdc_updated_at AS updated_at,
            load_date,
            ROW_NUMBER() OVER (
                PARTITION BY emp_no
                ORDER BY _ab_cdc_updated_at DESC, _ab_cdc_log_file DESC, _ab_cdc_log_pos DESC
            ) AS change_rank
        FROM employee_catalog.raw.employees
        WHERE emp_no IS NOT NULL
          AND gender IS NOT NULL
    ) AS ranked
    WHERE change_rank = 1
) AS source
ON target.emp_no = source.emp_no
WHEN MATCHED
    AND source.updated_at != target.updated_at
THEN
    UPDATE SET
    target.first_name = source.first_name,
    target.last_name = source.last_name,
    target.gender = source.gender,
    target.hire_date = source.hire_date,
    target.birth_date = source.birth_date,
    target.full_name = source.full_name,
    target.updated_at = source.updated_at
WHEN NOT MATCHED THEN
    INSERT (emp_no, first_name, last_name, gender, hire_date, birth_date, updated_at, full_name, load_date)
    VALUES (source.emp_no, source.first_name, source.last_name, source.gender, source.hire_date, source.birth_date, source.updated_at, source.full_name, source.load_date)
"""
# spark.sql raises on failure, so the message below is only reached on success.
spark.sql(query)

print("Data has been successfully loaded to the silver table")


Data has been successfully loaded to the silver table
