### Simple checks
Use a plain batch read (`spark.read`) to get an overview of one of the CSV sources (customers).

In [0]:
# Batch-read the customers CSV folder for a quick look: the first row is
# treated as the header and column types are inferred from the data.
df = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load("/Volumes/pysparkdbt/source/source_data/customers/")
)

In [0]:
# Keep the schema of the customers DataFrame loaded above (types were inferred
# by the batch read) and show it as this cell's output.
schema_customers = df.schema
schema_customers

In [0]:
# Render the customers DataFrame in the notebook output.
# NOTE(review): `display` is not defined in this notebook; presumably it is the
# notebook environment's built-in renderer — confirm.
display(df)

### Spark Streaming Process
This section uses `readStream` to ingest the data as a stream.
The entities are listed once and each entity's schema is obtained from a batch read, so the same code handles every entity.

In [0]:
# Source entities to ingest; each name is used as a folder, checkpoint and table name.
entities = [
    "customers",
    "trips",
    "locations",
    "payments",
    "vehicles",
    "drivers",
]

In [0]:
# First approach: infer each entity's schema with a batch read, then reuse that
# schema for a streaming read of the same folder and write the stream to a
# bronze Delta table.
inferred_queries = []
for entity in entities:
    source_path = f"/Volumes/pysparkdbt/source/source_data/{entity}/"

    # Streaming CSV reads need an explicit schema, so take it from a batch read.
    df_batch = (spark.read.format("csv")
                .option("header", "true")
                .option("inferSchema", "true")
                .load(source_path))
    schema_entity = df_batch.schema

    # Use a dedicated name so the customers preview frame `df` from the earlier
    # cell is not overwritten by a streaming DataFrame.
    df_stream = (spark.readStream.format("csv")
                 .option("header", "true")
                 .schema(schema_entity)
                 .load(source_path))

    # The checkpoint is what makes the load incremental across runs.
    # availableNow processes the data available at start and then stops, which
    # keeps compute low; it replaces the deprecated trigger(once=True). For a
    # continuously running job use e.g. processingTime="10 seconds" instead.
    query = (df_stream.writeStream.format("delta")
             .option("checkpointLocation", f"/Volumes/pysparkdbt/bronze/checkpoint/{entity}")
             .trigger(availableNow=True)
             .toTable(f"pysparkdbt.bronze.{entity}"))
    inferred_queries.append(query)

# Block until every stream has finished; otherwise the cell returns while
# ingestion is still running and any failure goes unnoticed.
for query in inferred_queries:
    query.awaitTermination()

#### Schema definitions through structs

Added an ingestion timestamp and the source file path to every row.
A fixed StructType per entity removes the variability of inferSchema, so schema problems are noticed early.


In [0]:
%sql
-- Reset helper, currently disabled: the statements below are commented out, so
-- running this cell drops nothing. Uncommenting them drops the six bronze tables.
-- NOTE(review): the streaming checkpoints under /Volumes/pysparkdbt/bronze/checkpoint/
-- are not touched here; presumably they must be cleared as well before a full
-- re-ingestion — confirm.
/*
DROP TABLE IF EXISTS pysparkdbt.bronze.customers;
DROP TABLE IF EXISTS pysparkdbt.bronze.trips;
DROP TABLE IF EXISTS pysparkdbt.bronze.locations;
DROP TABLE IF EXISTS pysparkdbt.bronze.payments;
DROP TABLE IF EXISTS pysparkdbt.bronze.vehicles;
DROP TABLE IF EXISTS pysparkdbt.bronze.drivers;
*/


In [0]:
# Imports
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType, DoubleType, TimestampType, BooleanType
from pyspark.sql.functions import current_timestamp, input_file_name


In [0]:
# Column layout of each entity as (column name, Spark type) pairs.
_entity_columns = {
    "customers": [
        ("customer_id", LongType()),
        ("name", StringType()),
        ("email", StringType()),
        ("phone", StringType()),
        ("created_at", TimestampType()),
    ],
    "trips": [
        ("trip_id", LongType()),
        ("customer_id", LongType()),
        ("driver_id", LongType()),
        ("vehicle_id", LongType()),
        ("pickup_location_id", LongType()),
        ("dropoff_location_id", LongType()),
        ("start_time", TimestampType()),
        ("end_time", TimestampType()),
        ("distance_km", DoubleType()),
        ("status", StringType()),
    ],
    "locations": [
        ("location_id", LongType()),
        ("latitude", DoubleType()),
        ("longitude", DoubleType()),
        ("city", StringType()),
        ("country", StringType()),
    ],
    "payments": [
        ("payment_id", LongType()),
        ("trip_id", LongType()),
        ("customer_id", LongType()),
        ("amount", DoubleType()),
        ("currency", StringType()),
        ("method", StringType()),
        ("paid_at", TimestampType()),
    ],
    "vehicles": [
        ("vehicle_id", LongType()),
        ("driver_id", LongType()),
        ("plate_number", StringType()),
        ("make", StringType()),
        ("model", StringType()),
        ("year", IntegerType()),
        ("capacity", IntegerType()),
    ],
    "drivers": [
        ("driver_id", LongType()),
        ("name", StringType()),
        ("license_number", StringType()),
        ("hired_at", TimestampType()),
        ("active", BooleanType()),
    ],
}

# One explicit StructType per entity, keyed by entity name; every column is
# declared nullable.
schemas = {
    entity_name: StructType(
        [StructField(column_name, column_type, True) for column_name, column_type in columns]
    )
    for entity_name, columns in _entity_columns.items()
}

In [0]:
from pyspark.sql.functions import current_timestamp, col

entities = ["customers", "trips", "locations", "payments", "vehicles", "drivers"]

# Start one bronze ingestion stream per entity using the explicit schemas,
# collect the query handles, then wait for all of them to finish.
queries = []
for entity in entities:
    source = f"/Volumes/pysparkdbt/source/source_data/{entity}/"
    check = f"/Volumes/pysparkdbt/bronze/checkpoint/{entity}"
    schema_entity = schemas[entity]  # fixed StructType; raises KeyError if an entity has no schema

    df = (spark.readStream
          .format("csv")
          .option("header", "true")
          .schema(schema_entity)
          .load(source)
          # Audit columns: when the row was ingested and which file it came from.
          .withColumn("ingestion_timestamp", current_timestamp())
          .withColumn("input_file_path", col("_metadata.file_path"))
         )

    # availableNow processes the data available at start and then stops; it
    # replaces the deprecated trigger(once=True).
    q = (df.writeStream
         .format("delta")
         .option("checkpointLocation", check)
         .trigger(availableNow=True)
         .toTable(f"pysparkdbt.bronze.{entity}"))
    queries.append(q)

# All streams were started above and run concurrently; block until each one has
# finished so the cell only completes once ingestion is done.
for q in queries:
    q.awaitTermination()
