### Simple checks
A quick batch read with `spark.read` to get an overview of one of the CSV sources (the customers files).

In [0]:
# Batch-read every CSV file in the customers source folder.
# The first row of each file is treated as the header, and Spark scans the
# data to infer the column types.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/Volumes/pysparkdbt/source/source_data/customers/")
)

In [0]:
# Keep the schema of the customers DataFrame (inferred during the batch read)
# in its own variable so it can be inspected.
schema_customers = df.schema
# Bare last expression: the notebook renders the schema as the cell output.
schema_customers

In [0]:
# Visual sanity check of the data held in `df`.
# `df` is expected to be the customers batch DataFrame loaded in the first cell.
display(df)

### Spark Streaming Process
This section uses `spark.readStream` to ingest the source CSV files as a stream.
The entities are kept in a list, and the schema of each entity is inferred from a one-off batch read, so the same code handles every entity without hard-coding its columns.

In [0]:
# Source entities; each name is also the sub-folder that holds its CSV files.
entities = [
    "customers",
    "trips",
    "locations",
    "payments",
    "vehicles",
    "drivers",
]

In [0]:
from pyspark.sql.functions import current_timestamp, col

# List of entities to ingest; each name is a sub-folder of SOURCE_ROOT and
# becomes the name of the target Bronze table.
entities = ["customers", "trips", "locations", "payments", "vehicles", "drivers"]

# Base locations, defined once so the per-entity paths are built in one place.
SOURCE_ROOT = "/Volumes/pysparkdbt/source/source_data"
CHECKPOINT_ROOT = "/Volumes/pysparkdbt/bronze/checkpoint"
BRONZE_SCHEMA = "pysparkdbt.bronze"

queries = []
for entity in entities:
    source_path = f"{SOURCE_ROOT}/{entity}/"

    # Streaming CSV reads need an explicit schema, so read the files once as a
    # static batch and let Spark infer it.
    schema_entity = (
        spark.read.format("csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load(source_path)
        .schema
    )

    # Define the streaming DataFrame using the schema inferred above.
    # A dedicated name is used (not `df`) so the notebook-level `df` created by
    # the earlier exploration cell is not silently overwritten by a streaming
    # DataFrame.
    stream_df = (
        spark.readStream.format("csv")
        .option("header", "true")
        .schema(schema_entity)
        .load(source_path)
        # Ingestion timestamp: when the row was ingested into the lakehouse
        .withColumn("ingestion_timestamp", current_timestamp())
        # Source file path (lineage metadata) taken from the hidden `_metadata`
        # column exposed by file-based sources
        .withColumn("input_file_path", col("_metadata.file_path"))
    )

    # Write the stream into a Delta table in the Bronze layer
    query = (
        stream_df.writeStream.format("delta")
        # Per-entity checkpoint so each stream tracks its own progress and can
        # recover without re-ingesting files it has already processed
        .option("checkpointLocation", f"{CHECKPOINT_ROOT}/{entity}")
        # Process everything that is currently available, then stop.
        # `availableNow=True` replaces the deprecated `once=True` trigger.
        .trigger(availableNow=True)
        # Target table in the Bronze schema for this entity
        .toTable(f"{BRONZE_SCHEMA}.{entity}")
    )
    queries.append(query)

# All streams were started above and run concurrently; block until each one
# has finished before the cell returns.
for query in queries:
    query.awaitTermination()

In [0]:
%sql
-- Reset helper, intentionally left commented out: remove the /* ... */ markers
-- to drop the Bronze table of every ingested entity.
-- NOTE(review): these statements do not touch the streaming checkpoints under
-- /Volumes/pysparkdbt/bronze/checkpoint/; presumably those must be cleared as
-- well before re-ingesting from scratch, otherwise already-processed files are
-- likely to be skipped -- confirm.
/*
DROP TABLE IF EXISTS pysparkdbt.bronze.customers;
DROP TABLE IF EXISTS pysparkdbt.bronze.trips;
DROP TABLE IF EXISTS pysparkdbt.bronze.locations;
DROP TABLE IF EXISTS pysparkdbt.bronze.payments;
DROP TABLE IF EXISTS pysparkdbt.bronze.vehicles;
DROP TABLE IF EXISTS pysparkdbt.bronze.drivers;
*/