In [0]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import *

In [0]:
def process_silver_layer_taxi_data(source_table, target_table):
    """Filter a bronze taxi table and overwrite the matching silver Delta table.

    Reads five columns from ``source_table``, keeps only the rows that pass
    the validity filter below, derives ``year`` and ``month`` from
    ``pickup_datetime``, and overwrites ``target_table`` as a Delta table
    partitioned by those two columns.

    Relies on a ``spark`` session and on ``col``/``year``/``month`` being
    available in the notebook's global scope; none are defined in this block.

    Args:
        source_table: Name of the table to read via ``spark.table``.
        target_table: Name of the table to overwrite via ``saveAsTable``.

    Returns:
        None. The only effect is the table write.
    """
    df_source = spark.table(source_table).select(
        "vendor_id",
        "pickup_datetime",
        "dropoff_datetime",
        "passenger_count",
        "total_amount",
    )

    # Half-open window [2023-01-01, 2023-06-01). The previous
    # between("2023-01-01", "2023-05-31") used an inclusive upper bound that is
    # compared as 2023-05-31 00:00:00 when the column holds timestamps
    # (presumably the case here; the bronze schema is not visible), which
    # dropped nearly every trip on May 31. An exclusive bound on the next day
    # keeps the whole of May 31 and is equally correct for plain date columns.
    window_start = "2023-01-01"
    window_end_exclusive = "2023-06-01"

    df_cleaned = (
        df_source.filter(
            col("vendor_id").isNotNull()
            & (col("pickup_datetime") >= window_start)
            & (col("pickup_datetime") < window_end_exclusive)
            & (col("dropoff_datetime") >= window_start)
            & (col("dropoff_datetime") < window_end_exclusive)
            # NOTE(review): `> 1` also removes single-passenger trips; confirm
            # this is intended rather than `> 0`. Behavior left unchanged.
            & (col("passenger_count") > 1)
            & (col("total_amount") > 0)
        )
        # Partition columns are derived from the pickup time only.
        .withColumn("year", year("pickup_datetime"))
        .withColumn("month", month("pickup_datetime"))
    )

    # Full overwrite of the target; overwriteSchema lets the write replace the
    # existing table schema instead of failing on a mismatch.
    (
        df_cleaned.write.format("delta")
        .mode("overwrite")
        .partitionBy("year", "month")
        .option("overwriteSchema", "true")
        .saveAsTable(target_table)
    )

In [0]:
# Taxi flavours to promote from the bronze layer to the silver layer.
taxi_types = ["yellow", "green"]

for t in taxi_types:
    source_table = f"etl.bronze.bronze__nyc_taxi_{t}"
    target_table = f"etl.silver.silver__nyc_taxi_{t}"
    try:
        process_silver_layer_taxi_data(source_table, target_table)
    except Exception as e:
        # Fail fast: the first failing table aborts the run. The message names
        # the tables involved, and `from e` records the original error as the
        # explicit cause so its traceback stays attached.
        raise Exception(
            f"Error processing silver layer ({source_table} -> {target_table}): {e}"
        ) from e