In [None]:
# Import necessary functions
from pyspark.sql.functions import current_timestamp, lit, col


In [None]:
# 1. Configuration
# Target tables are addressed with the Unity Catalog (UC) three-level
# namespace: catalog.schema.table. 'main' is the default catalog created
# with the workspace.
bronze_catalog = "main"
bronze_schema = "bronze"

# Raw input files, read directly from their GCS object paths.
# For direct paths to work, the cluster's service account must have GCS
# permissions, or a UC External Location covering the bucket must be
# properly configured. For simplicity, the direct paths are assumed to work.
users_gcs_path = "gs://databricks-demo-bucket-123/raw/users.json"
events_gcs_path = "gs://databricks-demo-bucket-123/raw/events.json"

# Alternative (disabled): read the same files through the UC Volume
# 'raw_data_source' instead of the gs:// URIs.
# NOTE(review): this assumes the volume is backed by the bucket's raw/ folder
# (gs://databricks-demo-bucket-123/raw/) -- confirm before enabling.
#users_gcs_path = "/Volumes/main/default/raw_data_source/users.json"
#events_gcs_path = "/Volumes/main/default/raw_data_source/events.json"

# Fully qualified names of the target Bronze tables
users_table_name = ".".join([bronze_catalog, bronze_schema, "users"])
events_table_name = ".".join([bronze_catalog, bronze_schema, "events"])

In [None]:
# 2. Make sure the target schema (database) exists before any table is written
bronze_schema_fqn = f"{bronze_catalog}.{bronze_schema}"
spark.sql("CREATE SCHEMA IF NOT EXISTS " + bronze_schema_fqn)

In [None]:
# 3. Load raw 'users' data
# The file is read in multi-line mode, so a single JSON record may span
# several lines instead of having to sit on one line.
users_df = spark.read.json(users_gcs_path, multiLine=True)

In [None]:
# 4. Load raw 'events' data
# Read in multi-line mode, same as the 'users' file.
events_df = spark.read.json(events_gcs_path, multiLine=True)

In [None]:
# 5. Add ingestion metadata (Bronze layer best practice)
def _with_ingestion_metadata(raw_df, source_path):
    """Return ``raw_df`` with two load-tracking columns added.

    ``_ingestion_timestamp`` is set from ``current_timestamp()`` and
    ``_source_file`` holds ``source_path`` as a literal on every row.
    """
    stamped_df = raw_df.withColumn("_ingestion_timestamp", current_timestamp())
    return stamped_df.withColumn("_source_file", lit(source_path))


users_df_with_metadata = _with_ingestion_metadata(users_df, users_gcs_path)
events_df_with_metadata = _with_ingestion_metadata(events_df, events_gcs_path)

In [None]:
# 6. Write the Bronze tables

# Create the 'raw_data_source' volume if it does not exist yet.
# NOTE(review): nothing in this cell reads from the volume -- confirm it is
# still needed here.
spark.sql(
    """
    CREATE VOLUME IF NOT EXISTS main.default.raw_data_source
    """
)


def _strip_column_names(df):
    """Return ``df`` with leading/trailing whitespace removed from every column name."""
    # NOTE(review): presumably guards against padded JSON keys that the Delta
    # writer would reject as column names -- confirm.
    return df.toDF(*[name.strip() for name in df.columns])


def _overwrite_delta_table(df, table_name):
    """Write ``df`` to ``table_name`` as a Delta table, replacing existing contents."""
    df.write.format("delta").mode("overwrite").saveAsTable(table_name)


users_df_with_metadata = _strip_column_names(users_df_with_metadata)
events_df_with_metadata = _strip_column_names(events_df_with_metadata)

# The target names come from the configuration cell (users_table_name /
# events_table_name) and are deliberately NOT redefined here, so the tables
# always land in the same catalog.schema that the CREATE SCHEMA step prepared.
for bronze_df, table_name in (
    (users_df_with_metadata, users_table_name),
    (events_df_with_metadata, events_table_name),
):
    _overwrite_delta_table(bronze_df, table_name)
    # Plain status text: print it rather than sending a string through display().
    print(f"Successfully wrote to {table_name}")