In [1]:
from utils.spark_session import get_spark_session
from utils.hadoop_setup import complete_hadoop_setup

# Prepare the local Hadoop environment before any Spark work. Per the output
# captured under this cell, the helper reports HADOOP_HOME, a PATH addition,
# and the presence of winutils.exe / hadoop.dll.
complete_hadoop_setup()

# Session handle that every later cell reads through the name `spark`.
# NOTE(review): the captured output shows the setup banner twice, so
# get_spark_session() presumably runs the same Hadoop setup again - confirm.
spark = get_spark_session()

✔ HADOOP_HOME set to:, os.environ['HADOOP_HOME']
✔ Added to PATH: C:\hadoop\bin

 ✔ winutils.exe: True
 ✔ hadoop.dll: True

🎉 Setup complete!
✔ HADOOP_HOME set to:, os.environ['HADOOP_HOME']
✔ Added to PATH: C:\hadoop\bin

 ✔ winutils.exe: True
 ✔ hadoop.dll: True

🎉 Setup complete!


In [2]:
import os
from pathlib import Path

# Root of the local project checkout.
# Set the NYC_TAXI_PROJECT_ROOT environment variable to run the notebook from
# a different machine or directory. When it is unset or empty, the original
# author's path is used, so existing runs behave exactly as before.
DEFAULT_PROJECT_ROOT = r"C:\Users\chira\Desktop\data_engineering\PySpark\nyc-taxi-analytics-platform"
PROJECT_ROOT = Path(os.environ.get("NYC_TAXI_PROJECT_ROOT") or DEFAULT_PROJECT_ROOT)

# Locations of the green-taxi tables under <root>/data/<layer>/nyc_taxi/green.
# Stored as plain strings because later cells pass them straight to the Spark
# reader/writer `.load()` / `.save()` calls.
BRONZE_GREEN_PATH = str(PROJECT_ROOT / "data" / "bronze" / "nyc_taxi" / "green")
SILVER_GREEN_PATH = str(PROJECT_ROOT / "data" / "silver" / "nyc_taxi" / "green")

In [3]:
# Open the bronze green-taxi table (stored in Delta format) as a DataFrame.
bronze_reader = spark.read.format("delta")
df_bronze = bronze_reader.load(BRONZE_GREEN_PATH)

# Sanity check: report how many rows were ingested, then list the columns and
# types that the later cells select from.
bronze_row_count = df_bronze.count()
print(f"Bronze row count: {bronze_row_count:,}")
df_bronze.printSchema()

Bronze row count: 48,893
root
 |-- VendorID: integer (nullable = true)
 |-- lpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- lpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- ehail_fee: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- trip_type: long (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- cbd_congestion_fee: double (nullable = true)
 |-- month: in

In [4]:
from pyspark.sql.functions import col

# Bronze column name -> silver column name, for the columns that get renamed.
renamed_columns = {
    "VendorID": "vendor_id",
    "lpep_pickup_datetime": "pickup_ts",
    "lpep_dropoff_datetime": "dropoff_ts",
}

# Columns carried into silver under their existing names: the trip measures
# followed by the underscore-prefixed ingestion columns.
passthrough_columns = [
    "passenger_count",
    "trip_distance",
    "fare_amount",
    "total_amount",
    "_ingestion_date",
    "_ingestion_timestamp",
    "_batch_id",
]

# Renamed columns first, then the pass-through ones; every other bronze
# column is dropped by this projection.
silver_projection = [
    col(source_name).alias(target_name)
    for source_name, target_name in renamed_columns.items()
]
silver_projection += [col(column_name) for column_name in passthrough_columns]

df_selected = df_bronze.select(*silver_projection)

In [6]:
from pyspark.sql.functions import expr

# Each data-quality rule gets a name so the combined predicate reads as a list.
has_pickup_ts = col("pickup_ts").isNotNull()
has_dropoff_ts = col("dropoff_ts").isNotNull()
positive_distance = col("trip_distance") > 0
non_negative_fare = col("fare_amount") >= 0
total_covers_fare = col("total_amount") >= col("fare_amount")

# A trip is kept only when every rule holds.
df_valid = df_selected.where(
    has_pickup_ts
    & has_dropoff_ts
    & positive_distance
    & non_negative_fare
    & total_covers_fare
)

In [9]:
# Row counts after and before the validity filter; the difference is the
# number of trips the filter rejected.
valid_count = df_valid.count()
total_count = df_selected.count()

print(
    f"Total records: {total_count:,}",
    f"Valid records: {valid_count:,}",
    f"Dropped records: {total_count - valid_count:,}",
    sep="\n",
)

Total records: 48,893
Valid records: 47,364
Dropped records: 1,529


In [10]:
from pyspark.sql.functions import year, month

# Calendar year and month of the pickup timestamp; the silver write cell uses
# these two columns as its partition keys.
pickup_year = year("pickup_ts")
pickup_month = month("pickup_ts")

df_silver = df_valid.withColumn("year", pickup_year).withColumn("month", pickup_month)

In [11]:
# Persist the validated trips as the silver Delta table, partitioned by the
# derived pickup year and month. mode("overwrite") replaces the data already
# stored at SILVER_GREEN_PATH rather than appending to it.
(
    df_silver
    .write
    .format("delta")
    .mode("overwrite")
    .partitionBy("year", "month")
    .save(SILVER_GREEN_PATH)
)

# Status message; the leading check mark was previously stored as mojibake.
print("✔ Silver layer write completed")

✔ Silver layer write completed


In [12]:
# Read the silver table back and show how many rows landed under each
# (year, month) partition key, as a check on the write.
silver_partition_counts = (
    spark.read.format("delta")
    .load(SILVER_GREEN_PATH)
    .groupBy("year", "month")
    .count()
)
silver_partition_counts.show()

+----+-----+-----+
|year|month|count|
+----+-----+-----+
|2025|    9|47333|
|2025|    8|   17|
|2025|   10|   14|
+----+-----+-----+

