# Create Gold Notebook with Kimball Architecture

### CSIS4495-050: Applied Research Project

Group:
- Bruno do Nascimento Beserra
- Jay Clark Bermudez
- Matheus Filipe Figueiredo

Instructor: Dr. Bambang Sarif



In [0]:
from pyspark.sql import functions as f
from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException
from pyspark.sql.types import StructType, StructField, StringType, DateType, IntegerType, DoubleType, BooleanType
import os
from pyspark.sql.window import Window

# Reuse the active Spark session if there is one; otherwise build a new one.
spark = SparkSession.builder.getOrCreate()

In [0]:
# Setup: gold schema location, source table, and table layouts.
gold_path = "workspace.default"
silver_table = "workspace.default.hr_silver_data"

# Silver column name -> gold dimension table holding its distinct values.
dim_mapping = {
    "department": "dim_department_gold",
    "job_title": "dim_job_title_gold",
    "location": "dim_location_gold",
    "status": "dim_status_gold",
    "work_mode": "dim_work_mode_gold",
    "job_level": "dim_job_level_gold",
}

# Dimension table names, in the same order as the mapping above.
dim_tables = list(dim_mapping.values())

fact_table = "fact_table_gold_hr_data"

# Layout shared by every dimension table: integer key plus the label.
dim_schema = StructType([
    StructField("id", IntegerType(), False),
    StructField("name", StringType(), True),
])

# Layout of the fact table; every column is nullable.
# NOTE(review): ingestion_timestamp is declared as a date, not a timestamp;
# confirm this matches the silver column and the incremental-load filter.
fact_schema = (
    StructType()
    .add("employee_id", StringType(), True)
    .add("full_name", StringType(), True)
    .add("department_id", IntegerType(), True)
    .add("job_title_id", IntegerType(), True)
    .add("hire_date", DateType(), True)
    .add("location_id", IntegerType(), True)
    .add("performance_rating", IntegerType(), True)
    .add("experience_years", IntegerType(), True)
    .add("status_id", IntegerType(), True)
    .add("work_mode_id", IntegerType(), True)
    .add("annual_salary", DoubleType(), True)
    .add("job_level_id", IntegerType(), True)
    .add("ingestion_timestamp", DateType(), True)
    .add("data_hash", StringType(), True)
    .add("start_effectivity_date", DateType(), True)
    .add("end_effectivity_date", DateType(), True)
    .add("is_current", BooleanType(), True)
)


In [0]:
# Helper Functions
def table_exists(table_name):
    """Return whether ``table_name`` exists under the gold schema.

    ``table_name`` is the bare table name; it is prefixed with the
    module-level ``gold_path`` before the catalog lookup.
    """
    qualified_name = f"{gold_path}.{table_name}"
    return spark.catalog.tableExists(qualified_name)

def dim_create_update(fact_df, col_name, table_name):
    """Add unseen values of ``col_name`` to a dimension table and return it.

    Distinct non-null values of ``fact_df[col_name]`` that are not yet in
    the dimension's ``name`` column receive new ids. The ids continue from
    the current maximum id (starting at 1 when the dimension is empty) and
    are assigned in ascending ``name`` order.

    Args:
        fact_df: DataFrame containing the column ``col_name``.
        col_name: Column of ``fact_df`` whose values feed the dimension.
        table_name: Bare dimension table name, resolved under ``gold_path``.

    Returns:
        DataFrame with the columns ``id`` and ``name`` covering both the
        existing and the newly added dimension rows.
    """
    full_name = f"{gold_path}.{table_name}"
    already_exists = table_exists(table_name)
    if already_exists:
        dim_df = spark.table(full_name)
    else:
        dim_df = spark.createDataFrame([], "id int, name string")

    fact_vals = (
        fact_df.select(col_name)
        .where(f.col(col_name).isNotNull())
        .distinct()
        .withColumnRenamed(col_name, "name")
    )
    new_vals = fact_vals.join(dim_df, "name", "left_anti")

    has_new_vals = new_vals.count() > 0
    if has_new_vals:
        max_id = dim_df.agg(f.coalesce(f.max("id"), f.lit(0))).collect()[0][0]
        # Un-partitioned window: all new values are numbered together.
        window = Window.orderBy("name")
        new_vals = new_vals.withColumn("id", f.row_number().over(window) + max_id)
        # A stored dimension can carry columns beyond id/name (this notebook
        # adds job_level_order to dim_job_level_gold). A strict unionByName
        # would fail on the column mismatch, so missing columns are filled
        # with nulls for the new rows instead.
        dim_df = dim_df.unionByName(new_vals, allowMissingColumns=True)

    # Rewrite the table only when its content changes or it must be created;
    # an unchanged dimension is left untouched.
    if has_new_vals or not already_exists:
        dim_df.write.format("delta").mode("overwrite").saveAsTable(full_name)
    return dim_df.select("id", "name")


In [0]:
# Create each missing dimension table as an empty Delta table.
for dim in dim_tables:
    full_path = f"{gold_path}.{dim}"
    if table_exists(dim):
        print(f"{full_path} already exists")
        continue
    empty_df = spark.createDataFrame([], schema=dim_schema)
    (
        empty_df.write
        .format("delta")
        .mode("overwrite")
        .saveAsTable(full_path)
    )

In [0]:
# Create the fact table as an empty Delta table if it is missing.
if table_exists(fact_table):
    print(f"Fact table already exists: {gold_path}.{fact_table}")
else:
    empty_fact_df = spark.createDataFrame([], fact_schema)
    (
        empty_fact_df.write
        .format("delta")
        .mode("overwrite")
        .saveAsTable(f"{gold_path}.{fact_table}")
    )
    print(f"Created fact table: {gold_path}.{fact_table}")

In [0]:
# Incremental load: select only silver rows newer than the latest
# ingestion_timestamp already stored in the fact table. When the fact table
# is missing or empty, the whole silver table is selected.
last_ingestion_timestamp = None

if table_exists(fact_table):
    # Read through the fully qualified name, matching table_exists() and the
    # writes, so the lookup does not depend on the session's current schema.
    last_timestamp_df = spark.table(f"{gold_path}.{fact_table}").select(
        f.max("ingestion_timestamp").alias("last_timestamp")
    )
    last_ingestion_timestamp = last_timestamp_df.collect()[0]["last_timestamp"]

if last_ingestion_timestamp is not None:
    # Compare against the value's string form, as the former SQL literal did.
    # NOTE(review): the fact schema stores ingestion_timestamp as a date; if
    # the silver column is a timestamp, rows from the last loaded day may be
    # selected again. Confirm the intended granularity.
    df_fact = spark.table(silver_table).where(
        f.col("ingestion_timestamp") > f.lit(str(last_ingestion_timestamp))
    )
else:
    df_fact = spark.table(silver_table)

In [0]:
# Show how many rows df_fact holds; the result is only displayed, not stored.
df_fact.count()

In [0]:
# For every mapped column: refresh its dimension table, then replace the
# column in df_fact with the matching dimension id, named "<column>_id".
for col, dim_table in dim_mapping.items():
    dim_df = dim_create_update(df_fact, col, dim_table)
    join_condition = df_fact[col] == dim_df["name"]
    df_fact = (
        df_fact.join(dim_df, join_condition, "left")
        .drop(col, "name")
        .withColumnRenamed("id", f"{col}_id")
    )

In [0]:
# Rows without a job level key fall back to the fixed id 5.
# NOTE(review): 5 is a hard-coded id; confirm which dimension row it is
# meant to reference.
job_level_key = f.col("job_level_id")
df_fact = df_fact.withColumn(
    "job_level_id",
    f.when(job_level_key.isNotNull(), job_level_key).otherwise(f.lit(5)),
)

# Append new rows
if df_fact.count() == 0:
    print("No new fact rows to insert")
else:
    (
        df_fact.write
        .format("delta")
        .mode("append")
        .saveAsTable(f"{gold_path}.{fact_table}")
    )

In [0]:
# Print the column names and types of df_fact for inspection.
df_fact.printSchema()

In [0]:
# Order value attached to each job level name.
job_level_order = {
    "Specialist": 1,
    "Analyst": 2,
    "Manager": 3,
    "Principal": 4,
    "Executive": 5,
}

job_level_table = f"{gold_path}.dim_job_level_gold"

# create_map expects a flat key, value, key, value, ... sequence of literals.
mapping_expr = f.create_map(
    [f.lit(item) for pair in job_level_order.items() for item in pair]
)

# Names missing from the mapping get a null job_level_order.
dim_job_level = spark.table(job_level_table).withColumn(
    "job_level_order", mapping_expr[f.col("name")]
)

# Save back to delta table; overwriteSchema lets the added column through.
(
    dim_job_level.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(job_level_table)
)