In [0]:
from pyspark.sql.functions import col, expr, current_timestamp, concat_ws, when, max, current_date, date_diff
from pyspark.sql.window import Window

In [0]:
# Load the raw titles table from the catalog into a DataFrame.
RAW_TITLES_TABLE = "employee_catalog.raw.titles"
titles_bronze_df = spark.read.table(RAW_TITLES_TABLE)

In [0]:
# Print the column names, types and nullability of the raw titles DataFrame
# so the CDC metadata columns can be told apart from the business columns.
titles_bronze_df.printSchema()

root
 |-- _airbyte_ab_id: string (nullable = true)
 |-- _airbyte_emitted_at: long (nullable = true)
 |-- _ab_cdc_cursor: string (nullable = true)
 |-- _ab_cdc_deleted_at: string (nullable = true)
 |-- _ab_cdc_log_file: string (nullable = true)
 |-- _ab_cdc_log_pos: integer (nullable = true)
 |-- _ab_cdc_updated_at: timestamp (nullable = true)
 |-- emp_no: integer (nullable = true)
 |-- from_date: date (nullable = true)
 |-- title: string (nullable = true)
 |-- to_date: date (nullable = true)
 |-- _file_name: string (nullable = true)
 |-- load_date: timestamp (nullable = true)



In [0]:
# Preview a handful of raw rows only; the table holds several hundred thousand
# rows, so the preview is bounded the same way as the later previews in this notebook.
display(titles_bronze_df.limit(10))

_airbyte_ab_id,_airbyte_emitted_at,_ab_cdc_cursor,_ab_cdc_deleted_at,_ab_cdc_log_file,_ab_cdc_log_pos,_ab_cdc_updated_at,emp_no,from_date,title,to_date,_file_name,load_date
3275386b-d1ca-42ea-9279-b2711b75907c,1738956715421,,,mysql-bin.000003,66100231,2025-02-07T19:31:55.421Z,10001,1986-06-26,Senior Engineer,9999-01-01,dbfs:/mnt/raw/titles/2025_02_07_1738956718171_0.csv,2025-02-10T20:31:50.765Z
6a4592db-2250-4c56-b40d-92e600a0a0bf,1738956715421,,,mysql-bin.000003,66100231,2025-02-07T19:31:55.421Z,10002,1996-08-03,Staff,9999-01-01,dbfs:/mnt/raw/titles/2025_02_07_1738956718171_0.csv,2025-02-10T20:31:50.765Z
2981607a-d7b1-4427-a998-570888d872e2,1738956715421,,,mysql-bin.000003,66100231,2025-02-07T19:31:55.421Z,10003,1995-12-03,Senior Engineer,9999-01-01,dbfs:/mnt/raw/titles/2025_02_07_1738956718171_0.csv,2025-02-10T20:31:50.765Z
3ee893b9-7b90-481a-9528-451222c3572c,1738956715421,,,mysql-bin.000003,66100231,2025-02-07T19:31:55.421Z,10004,1986-12-01,Engineer,1995-12-01,dbfs:/mnt/raw/titles/2025_02_07_1738956718171_0.csv,2025-02-10T20:31:50.765Z
9753010e-19ae-4752-8997-41ba6e6bafb5,1738956715421,,,mysql-bin.000003,66100231,2025-02-07T19:31:55.421Z,10004,1995-12-01,Senior Engineer,9999-01-01,dbfs:/mnt/raw/titles/2025_02_07_1738956718171_0.csv,2025-02-10T20:31:50.765Z
3792bb29-beaa-4e77-82b9-301eb239546c,1738956715421,,,mysql-bin.000003,66100231,2025-02-07T19:31:55.421Z,10005,1996-09-12,Senior Staff,9999-01-01,dbfs:/mnt/raw/titles/2025_02_07_1738956718171_0.csv,2025-02-10T20:31:50.765Z
e88a0dfd-101a-4f2e-b581-3a7414f2290d,1738956715421,,,mysql-bin.000003,66100231,2025-02-07T19:31:55.421Z,10005,1989-09-12,Staff,1996-09-12,dbfs:/mnt/raw/titles/2025_02_07_1738956718171_0.csv,2025-02-10T20:31:50.765Z
9ab6bef4-0fa8-4edc-8b0c-1d1717df1e20,1738956715421,,,mysql-bin.000003,66100231,2025-02-07T19:31:55.421Z,10006,1990-08-05,Senior Engineer,9999-01-01,dbfs:/mnt/raw/titles/2025_02_07_1738956718171_0.csv,2025-02-10T20:31:50.765Z
18ff4a26-a096-4f9f-9f53-4f0f83c938ae,1738956715421,,,mysql-bin.000003,66100231,2025-02-07T19:31:55.421Z,10007,1996-02-11,Senior Staff,9999-01-01,dbfs:/mnt/raw/titles/2025_02_07_1738956718171_0.csv,2025-02-10T20:31:50.765Z
cfc72a62-0b99-4fc2-88f3-2ab3b8e869ed,1738956715421,,,mysql-bin.000003,66100231,2025-02-07T19:31:55.421Z,10007,1989-02-10,Staff,1996-02-11,dbfs:/mnt/raw/titles/2025_02_07_1738956718171_0.csv,2025-02-10T20:31:50.765Z


In [0]:
# Count the rows in the raw titles DataFrame (triggers a Spark job).
titles_bronze_df.count()

443308

In [0]:
# Keep only the business columns, expose the CDC update timestamp as
# `updated_at`, stamp the load time, and drop rows that lack an employee
# number or a title.
# NOTE(review): emp_no is an integer in the raw schema; TRIM presumably turns it
# into a string column -- confirm that type is what downstream consumers expect.
selected_columns = [
    expr("TRIM(emp_no)").alias("emp_no"),
    col("from_date"),
    col("title"),
    col("to_date"),
    col("_ab_cdc_updated_at").alias("updated_at"),
]
has_required_fields = col("emp_no").isNotNull() & col("title").isNotNull()

titles_bronze_df = (
    titles_bronze_df
    .select(*selected_columns)
    .withColumn("load_date", current_timestamp())
    .where(has_required_fields)
)

In [0]:
# Preview 10 rows after column selection and null filtering.
display(titles_bronze_df.limit(10))

emp_no,from_date,title,to_date,updated_at,load_date
10001,1986-06-26,Senior Engineer,9999-01-01,2025-02-07T19:31:55.421Z,2025-02-11T06:01:10.218Z
10002,1996-08-03,Staff,9999-01-01,2025-02-07T19:31:55.421Z,2025-02-11T06:01:10.218Z
10003,1995-12-03,Senior Engineer,9999-01-01,2025-02-07T19:31:55.421Z,2025-02-11T06:01:10.218Z
10004,1986-12-01,Engineer,1995-12-01,2025-02-07T19:31:55.421Z,2025-02-11T06:01:10.218Z
10004,1995-12-01,Senior Engineer,9999-01-01,2025-02-07T19:31:55.421Z,2025-02-11T06:01:10.218Z
10005,1996-09-12,Senior Staff,9999-01-01,2025-02-07T19:31:55.421Z,2025-02-11T06:01:10.218Z
10005,1989-09-12,Staff,1996-09-12,2025-02-07T19:31:55.421Z,2025-02-11T06:01:10.218Z
10006,1990-08-05,Senior Engineer,9999-01-01,2025-02-07T19:31:55.421Z,2025-02-11T06:01:10.218Z
10007,1996-02-11,Senior Staff,9999-01-01,2025-02-07T19:31:55.421Z,2025-02-11T06:01:10.218Z
10007,1989-02-10,Staff,1996-02-11,2025-02-07T19:31:55.421Z,2025-02-11T06:01:10.218Z


In [0]:
# Derive three columns per title row:
#   is_current_title   - "Y"/"N" flag for the employee's latest title
#   title_tenure_days  - number of days the title was (or has been) held
#   title_tenure_Years - the same tenure expressed in 365-day years, decimal(10,2)

# to_date value this notebook treats as "title still held".
OPEN_ENDED_TO_DATE = "9999-01-01"

# All title rows of one employee form a window partition.
window_spec = Window.partitionBy("emp_no")

# A title has no real end date when to_date is missing or carries the sentinel.
# The original tenure logic only checked the sentinel, so a NULL to_date was
# flagged as current ("Y") yet produced a NULL tenure; both cases are now
# measured up to today.
is_open_ended = col("to_date").isNull() | (col("to_date") == OPEN_ENDED_TO_DATE)

# `max` here is pyspark.sql.functions.max (imported at the top of the notebook),
# not the Python builtin.
# NOTE(review): comparing against the per-employee max(to_date) also flags the
# last title of an employee whose final title has already ended (e.g. a to_date
# in 1998) as "Y" -- confirm that "latest title" rather than "still held" is
# the intended meaning of this flag.
is_latest_title = col("to_date").isNull() | (
    col("to_date") == max("to_date").over(window_spec)
)

titles_bronze_df = (
    titles_bronze_df
    .withColumn("is_current_title", when(is_latest_title, "Y").otherwise("N"))
    .withColumn(
        "title_tenure_days",
        when(is_open_ended, date_diff(current_date(), col("from_date")))
        .otherwise(date_diff(col("to_date"), col("from_date"))),
    )
    .withColumn(
        "title_tenure_Years",
        (col("title_tenure_days") / 365).cast("decimal(10,2)"),
    )
)

In [0]:
# Preview 10 rows including the derived current-title flag and tenure columns.
display(titles_bronze_df.limit(10))

emp_no,from_date,title,to_date,updated_at,load_date,is_current_title,title_tenure_days,title_tenure_Years
100008,1988-03-20,Senior Engineer,1998-04-13,2025-02-07T19:31:55.421Z,2025-02-11T06:01:11.717Z,Y,3676,10.07
100010,1991-10-04,Engineer,1997-10-03,2025-02-07T19:31:55.421Z,2025-02-11T06:01:11.717Z,N,2191,6.0
100010,1997-10-03,Senior Engineer,9999-01-01,2025-02-07T19:31:55.421Z,2025-02-11T06:01:11.717Z,Y,9993,27.38
100014,1990-03-25,Engineer,1999-03-25,2025-02-07T19:31:55.421Z,2025-02-11T06:01:11.717Z,N,3287,9.01
100014,1999-03-25,Senior Engineer,2000-01-30,2025-02-07T19:31:55.421Z,2025-02-11T06:01:11.717Z,Y,311,0.85
100021,1991-10-21,Engineer,2000-10-20,2025-02-07T19:31:55.421Z,2025-02-11T06:01:11.717Z,N,3287,9.01
100021,2000-10-20,Senior Engineer,9999-01-01,2025-02-07T19:31:55.421Z,2025-02-11T06:01:11.717Z,Y,8880,24.33
100022,1990-02-04,Senior Staff,9999-01-01,2025-02-07T19:31:55.421Z,2025-02-11T06:01:11.717Z,Y,12791,35.04
100022,1985-02-04,Staff,1990-02-04,2025-02-07T19:31:55.421Z,2025-02-11T06:01:11.717Z,N,1826,5.0
100025,1995-02-15,Senior Engineer,9999-01-01,2025-02-07T19:31:55.421Z,2025-02-11T06:01:11.717Z,Y,10954,30.01


In [0]:
# Persist the transformed titles as the silver table, replacing its existing
# contents; the mergeSchema option is passed through to the writer.
SILVER_TITLES_TABLE = "employee_catalog.silver.titles"

silver_writer = (
    titles_bronze_df.write
    .option("mergeSchema", True)
    .mode("overwrite")
)
silver_writer.saveAsTable(SILVER_TITLES_TABLE)