In [46]:
from utils.spark_session import get_spark_session
from utils.hadoop_setup import complete_hadoop_setup

# Run the project's Hadoop setup helper before any Spark session is requested
complete_hadoop_setup()


# Obtain the Spark session from the project helper; later cells use it as `spark`
spark = get_spark_session()

✔ HADOOP_HOME set to:, os.environ['HADOOP_HOME']
✔ Added to PATH: C:\hadoop\bin

 ✔ winutils.exe: True
 ✔ hadoop.dll: True

🎉 Setup complete!


In [56]:
import os
from datetime import datetime
from functools import reduce
from pathlib import Path

from pyspark.sql.functions import (
    current_date,
    current_timestamp,
    input_file_name,
    lit
)

# Project configuration
# The project root defaults to the original local checkout, but can be
# redirected without editing the notebook by setting the
# NYC_TAXI_PROJECT_ROOT environment variable (an empty value is ignored).
_DEFAULT_PROJECT_ROOT = r"C:\Users\chira\Desktop\data_engineering\PySpark\nyc-taxi-analytics-platform"
PROJECT_ROOT = Path(os.environ.get("NYC_TAXI_PROJECT_ROOT") or _DEFAULT_PROJECT_ROOT)

# Base paths (all derived from PROJECT_ROOT so an override moves everything)
LANDING_BASE_PATH = PROJECT_ROOT / "data" / "landing" / "nyc_taxi"
BRONZE_BASE_PATH = PROJECT_ROOT / "data" / "bronze" / "nyc_taxi"

# Process only green taxi
TAXI_TYPE = "green"

# Batch ID for auditability: local wall-clock time at the moment this cell runs,
# formatted as batch_YYYYMMDD_HHMMSS
BATCH_ID = f"batch_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

# Location of the Delta table that records which source files were ingested.
# Kept as a plain string because it is handed directly to Spark's load()/save().
PROCESSED_FILES_PATH = str(PROJECT_ROOT / "data" / "bronze" / "_processed_files")


In [57]:
# Initialize processed files tracker if needed
#
# Produces `processed_files`: a set of source-file paths that were already
# ingested, loaded from the Delta table at PROCESSED_FILES_PATH (or an empty
# set when the table is created here for the first time).

from pyspark.sql.types import StructType, StructField, StringType, TimestampType
processed_schema = StructType([
    StructField("file_path", StringType(), False),
    StructField("processed_at", TimestampType(), False)
])

processed_files_path_obj = Path(PROCESSED_FILES_PATH)

# Decide between "load" and "create" by looking for the Delta transaction log
# instead of catching every exception from the read. The previous catch-all
# deleted the tracker directory on ANY failure (including transient ones),
# which silently erased the ingestion history and caused re-ingestion.
if (processed_files_path_obj / "_delta_log").is_dir():
    # The table exists: any error raised here is a real problem and must surface.
    processed_files_df = spark.read.format("delta").load(PROCESSED_FILES_PATH)
    # Only the path column is needed on the driver.
    processed_files = {
        row.file_path
        for row in processed_files_df.select("file_path").collect()
    }
    print(f"‚û° Loaded {len(processed_files)} previously processed files")
else:
    print("‚û° Creating new _processed_files tracker table...")
    # A directory without a transaction log is not a Delta table; clear the
    # leftovers so the initial write below starts from an empty location.
    if processed_files_path_obj.exists():
        import shutil
        shutil.rmtree(processed_files_path_obj)
        print(" ‚úî Cleaned up existing directory")

    # Create new Delta table
    spark.createDataFrame([], processed_schema) \
        .write.format("delta") \
        .save(PROCESSED_FILES_PATH)

    processed_files = set()
    print(" ‚úî Tracker table created (empty)")

➡ Loaded 1 previously processed files


In [58]:
def ingest_green_to_bronze(year: str = "2025"):
    """Ingest green taxi data from landing to bronze layer.

    Reads the non-empty parquet files found under
    ``LANDING_BASE_PATH / TAXI_TYPE / year``, drops rows whose source file is
    already listed in the module-level ``processed_files`` set, stamps the
    remaining rows with ingestion metadata (``_ingestion_date``,
    ``_ingestion_timestamp``, ``_batch_id``) and appends them to the Delta
    table at ``BRONZE_BASE_PATH / TAXI_TYPE``. The newly ingested file paths
    are then appended to the tracker table at ``PROCESSED_FILES_PATH`` and
    added to ``processed_files`` so that a second call in the same kernel does
    not ingest the same files again.

    Args:
        year: Name of the year directory under the landing path to ingest.
            Defaults to "2025", the only year the original pipeline handled.

    Returns:
        None. The function returns early (after printing the reason) when the
        landing directory, the year directory, or usable parquet files are
        missing, or when every file has already been processed.

    Raises:
        Exception: Whatever Spark raises while reading the landing files is
            re-raised after a short message is printed, so the caller reports
            a failure instead of a success.
    """

    taxi_type = TAXI_TYPE
    landing_path = LANDING_BASE_PATH / taxi_type
    bronze_path = BRONZE_BASE_PATH / taxi_type

    print(f"\n{'='*60}")
    print(f"Processing {taxi_type.upper()} taxi data")
    print(f"{'='*60}")

    if not landing_path.exists():
        print(f"‚ùå No landing data for {taxi_type}")
        return

    # Only process the requested year
    year_path = landing_path / str(year)

    if not year_path.exists():
        print(f"‚ùå No {year} directory found for {taxi_type}")
        return

    # Keep only parquet files that actually contain bytes (skip 0-byte files)
    valid_files = [f for f in year_path.rglob("*.parquet") if f.stat().st_size > 0]

    if not valid_files:
        print(f"‚ö† No valid parquet files found in {year} directory")
        return

    print(f"‚û° Reading {taxi_type} data for year {year}...")
    print(f"‚û° Found {len(valid_files)} valid parquet file(s)")

    # Display file details
    for pf in valid_files:
        file_size_mb = pf.stat().st_size / (1024 * 1024)
        print(f"    -{pf.parent.name}/{pf.name}: {file_size_mb:.2f} MB")

    # Spark is given forward-slash paths
    year_path_str = str(year_path).replace('\\', '/')
    valid_file_strs = [str(f).replace('\\', '/') for f in valid_files]

    try:
        # Read exactly the files vetted above. Reading the whole directory
        # would pull the 0-byte files back in and make the filter pointless.
        # basePath keeps partition columns derived from the sub-directories.
        df = spark.read \
            .option("basePath", year_path_str) \
            .option("mergeSchema", "true") \
            .parquet(*valid_file_strs)

        row_count = df.count()
    except Exception as e:
        print(f" ‚ùå Could not read {year} data: {str(e)[:200]}")
        # Propagate: swallowing the error here made the caller print SUCCESS.
        raise

    print(f" ‚úî Loaded {row_count:,} total rows from {year}")

    # Filter already processed files
    print(f" ‚û° Checking for already processed files...")
    df = df.withColumn("_source_file", input_file_name())
    df_new = df.filter(~df["_source_file"].isin(processed_files))

    new_count = df_new.count()
    if new_count == 0:
        print(f" ‚úî No new files to ingest for {taxi_type}")
        return

    print(f" ‚û° Found {new_count:,} new records to process...")

    # Add metadata columns
    df_bronze = (
        df_new
        .withColumn("_ingestion_date", current_date())
        .withColumn("_ingestion_timestamp", current_timestamp())
        .withColumn("_batch_id", lit(BATCH_ID))
    )

    # Ensure bronze path exists
    bronze_path.mkdir(parents=True, exist_ok=True)

    # Write to bronze Delta Lake
    bronze_path_str = str(bronze_path).replace('\\', '/')
    print(f" ‚û° Writing to bronze layer: {bronze_path_str}")

    df_bronze.write.format("delta") \
        .mode("append") \
        .save(bronze_path_str)

    # Update processed files tracker
    new_files_df = df_new.select("_source_file") \
        .distinct() \
        .withColumnRenamed("_source_file", "file_path") \
        .withColumn("processed_at", current_timestamp())

    # Bring the distinct paths to the driver: they give the tracked-file count
    # and are needed to refresh the in-memory snapshot below.
    new_file_paths = [row.file_path for row in new_files_df.select("file_path").collect()]
    files_to_track = len(new_file_paths)

    new_files_df.write.format("delta") \
        .mode("append") \
        .save(PROCESSED_FILES_PATH)

    # Keep the module-level set in sync with the tracker table; otherwise a
    # second call in the same kernel would append the same rows to bronze again.
    processed_files.update(new_file_paths)

    print(f"\n{'='*60}")
    print(f" ‚úî Bronze ingestion completed for {taxi_type}")
    print(f"{'='*60}")
    print(f"   Total records written: {new_count:,}")
    print(f"   Files tracked: {files_to_track}")
    print(f"   Batch ID: {BATCH_ID}")

In [59]:
# Run the green-taxi bronze ingestion and report how it went

hash_rule = '#' * 60  # 60-character banner line framing the run log

print(f"\n{hash_rule}")
print("Starting GREEN Taxi Bronze Layer Ingestion")
print(f"Batch: {BATCH_ID}")
print(hash_rule)

try:
    ingest_green_to_bronze()
    print("\n ‚úî SUCCESS: Green taxi data ingestion completed!")
except Exception as e:
    # Report the failure with a full traceback; the closing banner below
    # is still printed.
    import traceback
    print("\n ‚ùå ERROR: Green taxi ingestion failed")
    print(f"ERROR: {e!s}")
    traceback.print_exc()

print(f"\n{hash_rule}")
print("Process Complete")
print(hash_rule)


############################################################
Starting GREEN Taxi Bronze Layer Ingestion
Batch: batch_20260209_200002
############################################################

Processing GREEN taxi data
➡ Reading green data for year 2025...
➡ Found 1 valid parquet file(s)
    -month=09/green_tripdata_2025-09.parquet: 1.15 MB
 ✔ Loaded 48,893 total rows from 2025
 ➡ Checking for already processed files...
 ✔ No new files to ingest for green

 ✔ SUCCESS: Green taxi data ingestion completed!

############################################################
Process Complete
############################################################
