# NYC Taxi ETL - Incremental Ingestion

In [1]:
import json
from pathlib import Path
from datetime import datetime
from pyspark.sql import SparkSession

In [2]:
# Directory scanned for incoming parquet files.
INBOX_PATH = Path("/home/jovyan/work/data/inbox")
# Directory holding ingestion state that must survive kernel restarts.
STATE_DIR = Path("/home/jovyan/work/state")
# JSON file recording which inbox files have already been processed.
MANIFEST_PATH = STATE_DIR / "manifest.json"

# Create the state directory (and any missing parents) up front so the
# manifest can be written later; a no-op when it already exists.
STATE_DIR.mkdir(parents=True, exist_ok=True)

In [3]:
# Start (or reuse) a local Spark session that uses every available core.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("NYC_Taxi_Incremental_Ingestion")
    .getOrCreate()
)

# Keep Spark's console logging down to warnings and errors.
spark.sparkContext.setLogLevel("WARN")
print(f"Spark session initialized: {spark.version}")

Spark session initialized: 4.1.0


In [4]:
# Manifest management functions
def load_manifest():
    """Load the manifest tracking processed files.

    Reads ``MANIFEST_PATH`` and returns its content as a dict that always
    contains the keys ``"processed_files"`` (list) and ``"last_run"``.

    Returns:
        dict: The stored manifest, with missing keys filled in. A fresh, empty
        manifest is returned when the file does not exist, cannot be parsed as
        JSON, or parses to something other than a JSON object.

    Raises:
        Exception: Any non-parse error raised while reading the file (for
        example a permission error) is reported and re-raised.
    """
    def _empty_manifest():
        # Build a new dict on every call so callers never share list state.
        return {"processed_files": [], "last_run": None}

    if not MANIFEST_PATH.exists():
        return _empty_manifest()

    try:
        with open(MANIFEST_PATH, 'r') as f:
            manifest = json.load(f)
    except json.JSONDecodeError as e:
        print(f"Warning: Manifest file corrupted, creating new manifest. Error: {e}")
        return _empty_manifest()
    except Exception as e:
        print(f"Error reading manifest: {e}")
        raise

    # Valid JSON that is not an object (e.g. `[]` or `null`) is just as
    # unusable as unparseable text, so treat it as corruption instead of
    # crashing with a TypeError on the key handling below.
    if not isinstance(manifest, dict):
        print(
            "Warning: Manifest file corrupted, creating new manifest. "
            f"Error: expected a JSON object, got {type(manifest).__name__}"
        )
        return _empty_manifest()

    # Back-fill keys that a hand-edited or older manifest may lack.
    manifest.setdefault("processed_files", [])
    manifest.setdefault("last_run", None)
    return manifest

def save_manifest(manifest):
    """Save the manifest to disk.

    Stamps ``manifest["last_run"]`` with the current local time (ISO-8601) and
    writes the manifest as indented JSON to ``MANIFEST_PATH``.

    The JSON is first written to a sibling temporary file and then renamed
    over ``MANIFEST_PATH``. Writing in place would truncate the existing
    manifest before serialization finishes, so a failure midway would leave a
    corrupt file and cause every inbox file to be re-ingested on the next run.

    Args:
        manifest (dict): Manifest to persist; mutated to update ``"last_run"``.

    Raises:
        Exception: Any error raised while serializing or writing is reported
        and re-raised; the previously saved manifest is left untouched.
    """
    # Same directory as the target so the final rename stays on one filesystem.
    tmp_path = MANIFEST_PATH.with_name(MANIFEST_PATH.name + ".tmp")
    try:
        manifest["last_run"] = datetime.now().isoformat()
        with open(tmp_path, 'w') as f:
            json.dump(manifest, indent=2, fp=f)
        # Swap the completed file into place in a single step.
        tmp_path.replace(MANIFEST_PATH)
        print(f"Manifest saved: {len(manifest['processed_files'])} files tracked")
    except Exception as e:
        # Best-effort cleanup of the partial temp file; never mask the real error.
        try:
            tmp_path.unlink(missing_ok=True)
        except OSError:
            pass
        print(f"Error saving manifest: {e}")
        raise

def add_to_manifest(manifest, file_info):
    """Add a processed file to the manifest.

    If the manifest already holds an entry with the same ``"filename"``, that
    entry is replaced in place instead of appending a duplicate. This keeps
    the manifest at one entry per file even when the processing cell is
    re-run without re-detecting new files first.

    Args:
        manifest (dict): Manifest whose ``"processed_files"`` list is updated
            in place.
        file_info (dict): Metadata for the processed file; matched against
            existing entries by its ``"filename"`` key.
    """
    entries = manifest["processed_files"]
    filename = file_info.get("filename") if isinstance(file_info, dict) else None

    if filename is not None:
        for position, existing in enumerate(entries):
            if isinstance(existing, dict) and existing.get("filename") == filename:
                # Same file recorded again: keep its position, refresh its metadata.
                entries[position] = file_info
                return

    entries.append(file_info)

# Visible confirmation that this cell ran and the manifest helpers above are defined.
print("Manifest functions defined")

Manifest functions defined


In [5]:
# Detect new files in the inbox
def get_new_files(inbox_path, manifest):
    """Return metadata for inbox parquet files not yet listed in the manifest.

    Args:
        inbox_path: Directory to scan (str or Path).
        manifest (dict): Manifest whose ``"processed_files"`` entries each
            carry a ``"filename"`` key.

    Returns:
        list[dict]: One dict per unprocessed file, sorted by path, with the
        keys ``"filename"``, ``"path"`` and ``"size_bytes"``. Files whose name
        contains ``"zone_lookup"`` are always excluded.

    Raises:
        FileNotFoundError: If ``inbox_path`` does not exist.
    """
    inbox_dir = Path(inbox_path)
    if not inbox_dir.exists():
        raise FileNotFoundError(f"Inbox path does not exist: {inbox_dir}")

    already_processed = set()
    for entry in manifest["processed_files"]:
        already_processed.add(entry["filename"])

    candidates = list(inbox_dir.glob("*.parquet"))
    candidates.sort()
    print(f"Parquet files found in inbox: {len(candidates)}")

    # Skip the zone lookup table and anything the manifest already tracks.
    return [
        {
            "filename": candidate.name,
            "path": str(candidate),
            "size_bytes": candidate.stat().st_size,
        }
        for candidate in candidates
        if "zone_lookup" not in candidate.name
        and candidate.name not in already_processed
    ]

# Load the persisted state, then work out which inbox files still need ingesting.
manifest = load_manifest()
new_files = get_new_files(INBOX_PATH, manifest)

# Summarise what was found; the per-file loop prints nothing when the list is empty.
print("Manifest loaded")
print(f"  - Previously processed: {len(manifest['processed_files'])} files")
print(f"  - New files found: {len(new_files)} files")
for pending in new_files:
    print(f"    → {pending['filename']} ({pending['size_bytes']:,} bytes)")

Parquet files found in inbox: 3
Manifest loaded
  - Previously processed: 0 files
  - New files found: 2 files
    → yellow_tripdata_2025-01.parquet (59,158,238 bytes)
    → yellow_tripdata_2025-02.parquet (60,343,086 bytes)


In [6]:
# Process new files and update manifest
#
# `new_files` was computed by the detection cell and goes stale as soon as this
# cell has run once. Re-filter it against the in-memory manifest so that
# re-running this cell on its own neither re-reads the files nor records them
# a second time.
tracked_filenames = {entry.get("filename") for entry in manifest["processed_files"]}
pending_files = [info for info in new_files if info["filename"] not in tracked_filenames]

if not pending_files:
    print("No new files to process.")
else:
    print(f"Processing {len(pending_files)} new file(s)...")

    # DataFrames of the files ingested in this run, kept for downstream transformations.
    all_dataframes = []

    for file_info in pending_files:
        print(f"Processing: {file_info['filename']}")

        df = spark.read.parquet(file_info['path'])
        # count() triggers a Spark job that scans the file to obtain the row count.
        row_count = df.count()

        print(f"  Rows: {row_count:,}")
        print(f"  Size: {file_info['size_bytes']:,} bytes")

        all_dataframes.append(df)

        file_metadata = {
            "filename": file_info['filename'],
            "size_bytes": file_info['size_bytes'],
            "row_count": row_count,
            "processed_at": datetime.now().isoformat()
        }
        add_to_manifest(manifest, file_metadata)

        print("Added to manifest")

    # here you can add additional processing logic for example
    # you can use `all_dataframes` list for the transformation if you want

    # Persist only after every file succeeded, so a failure mid-loop leaves the
    # on-disk manifest unchanged and the whole batch is retried on the next run.
    save_manifest(manifest)

    print("Processing complete!")


Processing 2 new file(s)...
Processing: yellow_tripdata_2025-01.parquet
  Rows: 3,475,226
  Size: 59,158,238 bytes
Added to manifest
Processing: yellow_tripdata_2025-02.parquet
  Rows: 3,577,543
  Size: 60,343,086 bytes
Added to manifest
Manifest saved: 2 files tracked
Processing complete!
