In [6]:
import os
import requests
from pathlib import Path
from datetime import datetime

In [7]:
# Global Configuration

# NYC Taxi public data base URL
BASE_URL = "https://d37ci6vzurychx.cloudfront.net/trip-data"


# Local landing zone base path
LANDING_BASE_PATH = Path("../data/landing/nyc_taxi")

# Taxi types we ingest
TAXI_TYPES = ["green"]

# Data range (adjust as needed)
YEAR = 2025
MONTHS = ["09"]


In [8]:
# Path Builder

def build_landing_path(taxi_type: str, year: int, month: str) -> Path:
    """
    Build landing zone folder path:
    data/landing/nyc_taxi/<taxi_type>/year=YYYY/month=MM
    """
    return LANDING_BASE_PATH / taxi_type / f"{year}" / f"month={month}"

In [9]:
# Download Helper

def download_file(url: str, destination_path: Path):
    """
    Download a file from URL to local destination path.
    """
    response = requests.get(url, stream=True)
    response.raise_for_status()

    destination_path.parent.mkdir(parents=True, exist_ok=True)

    with open(destination_path, "wb") as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)

In [10]:
# Landing Zone Ingestion Logic (Incremental)

for taxi_type in TAXI_TYPES:
    for month in MONTHS:
        file_name = f"{taxi_type}_tripdata_{YEAR}-{month}.parquet"
        file_url = f"{BASE_URL}/{file_name}"

        landing_path = build_landing_path(taxi_type, YEAR, month)
        destination_file = landing_path / file_name

        if destination_file.exists():
            print(f"SKIPPED (already exists): {destination_file}")
            continue

        print(f"DOWNLOADING: {file_url}")
        download_file(file_url, destination_file)
        print(f"SAVED TO: {destination_file}")

DOWNLOADING: https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2025-09.parquet
SAVED TO: ..\data\landing\nyc_taxi\green\2025\month=09\green_tripdata_2025-09.parquet
