# ETL/ELT Pipeline - DB-11 Parking Intelligence Database

This notebook provides a comprehensive ETL/ELT pipeline for the Parking Intelligence Database (db-11).

## Pipeline Overview
1. **Extract**: Load data from Data.gov, Census Bureau, BTS TranStats, FHWA, and city open data portals
2. **Transform**: Clean, validate, and transform parking intelligence data
3. **Load**: Load transformed data into PostgreSQL
4. **Validate**: Verify data quality and completeness
5. **Monitor**: Track pipeline performance and errors

## Data Sources
- **Data.gov CKAN API**: Parking facility datasets from various cities
- **Census Bureau API**: Demographics and population data for metropolitan areas
- **BTS TranStats**: Airport passenger volumes and statistics
- **FHWA**: Traffic volume data and highway statistics
- **City Open Data Portals**: Real-time parking utilization and pricing data

## Section 1: Setup and Configuration

In [None]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import json
import logging
from typing import Dict, List, Optional
import warnings
warnings.filterwarnings('ignore')

# API and HTTP requests
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Geospatial libraries
try:
    import geopandas as gpd
    from shapely.geometry import Point, Polygon
    GEOSPATIAL_AVAILABLE = True
except ImportError:
    GEOSPATIAL_AVAILABLE = False
    print("Warning: geopandas/shapely not available")

# Database connections
try:
    from sqlalchemy import create_engine, text
    SQLALCHEMY_AVAILABLE = True
except ImportError:
    SQLALCHEMY_AVAILABLE = False
    print("Warning: sqlalchemy not available")

try:
    import psycopg2
    from psycopg2.extras import RealDictCursor
    PG_AVAILABLE = True
except ImportError:
    PG_AVAILABLE = False
    print("Warning: psycopg2 not available")

try:
    from databricks import sql
    DATABRICKS_AVAILABLE = True
except ImportError:
    DATABRICKS_AVAILABLE = False
    print("Warning: databricks-sql-connector not available")

# Logging: INFO-level records formatted as "time - logger name - level - message".
# The notebook logs through a logger named after the running module.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger(__name__)

# pandas display: no column limit, at most 50 printed rows, no fixed width
for _option, _value in (
    ('display.max_columns', None),
    ('display.max_rows', 50),
    ('display.width', None),
):
    pd.set_option(_option, _value)

print("✓ Imports successful")

In [None]:
# Configuration
import os  # cell-local: only needed to read optional credentials from the environment

DB_NAME = "db-11"
# The notebook is expected to run one level below the database root directory.
DB_PATH = Path.cwd().parent

# Database connection strings (configure as needed)
# PostgreSQL, e.g. "postgresql://user:password@localhost:5432/db_11_validation"
POSTGRES_CONNECTION_STRING = None

# Databricks - each value is read from its environment variable; an unset or
# empty variable leaves the value as None.
DATABRICKS_CONFIG = {
    'server_hostname': os.environ.get("DATABRICKS_SERVER_HOSTNAME") or None,
    'http_path': os.environ.get("DATABRICKS_HTTP_PATH") or None,
    'access_token': os.environ.get("DATABRICKS_TOKEN") or None,
}

# Source data paths
DATA_DIR = DB_PATH / "data"
SCHEMA_FILE = DATA_DIR / "schema.sql"
DATA_FILE = DATA_DIR / "data.sql"
RESEARCH_DIR = DB_PATH / "research"

# API Configuration
# Data.gov CKAN API - optional API key for higher rate limits
DATA_GOV_API_KEY = os.environ.get("DATA_GOV_API_KEY") or None
DATA_GOV_CKAN_BASE_URL = "https://catalog.data.gov/api/3/action"

# Census Bureau API - optional API key recommended
CENSUS_API_KEY = os.environ.get("CENSUS_API_KEY") or None
CENSUS_BASE_URL = "https://api.census.gov/data"

# Target data size range, in GB
TARGET_DATA_SIZE_GB_MIN = 1.0
TARGET_DATA_SIZE_GB_MAX = 30.0

# Geographic coverage: 400+ cities
TARGET_CITIES_COUNT = 400

print(f"Database: {DB_NAME}")
print(f"Data directory: {DATA_DIR}")
print(f"Schema file exists: {SCHEMA_FILE.exists()}")
# Only the MIN/MAX bounds are defined; report the range rather than a single value.
print(f"Target data size: {TARGET_DATA_SIZE_GB_MIN}-{TARGET_DATA_SIZE_GB_MAX} GB")
print(f"Target cities: {TARGET_CITIES_COUNT}+")

## Section 2: Extract Phase

### 2.1 Data.gov CKAN API Integration - Parking Datasets

In [None]:
def create_session_with_retry() -> requests.Session:
    """Return a ``requests.Session`` whose HTTP and HTTPS adapters retry failed calls.

    The retry policy allows 3 retries with a backoff factor of 1 and is
    triggered by response statuses 429, 500, 502, 503 and 504.
    """
    retrying_adapter = HTTPAdapter(
        max_retries=Retry(
            total=3,
            backoff_factor=1,
            status_forcelist=[429, 500, 502, 503, 504],
        )
    )

    http_session = requests.Session()
    # One adapter instance serves both URL schemes.
    for scheme in ("http://", "https://"):
        http_session.mount(scheme, retrying_adapter)
    return http_session

def search_data_gov_datasets(query: str = "parking", limit: int = 20, api_key: Optional[str] = None) -> Optional[Dict]:
    """
    Search for datasets in Data.gov via the CKAN ``package_search`` action.

    API Documentation: https://catalog.data.gov/api/3/action/package_search

    Args:
        query: Free-text search term sent as the ``q`` parameter.
        limit: Maximum number of datasets requested (``rows`` parameter).
        api_key: Optional key; when given it is sent in the ``X-API-Key`` header.

    Returns:
        The full decoded JSON response when it reports ``success``; ``None``
        on a transport/HTTP error, a non-JSON body, or ``success`` being false.
    """
    url = f"{DATA_GOV_CKAN_BASE_URL}/package_search"
    headers = {"X-API-Key": api_key} if api_key else {}
    params = {
        "q": query,
        "rows": limit,
        "start": 0,
    }

    try:
        # The context manager closes the session's pooled connections on every path.
        with create_session_with_retry() as session:
            response = session.get(url, headers=headers, params=params, timeout=30)
            response.raise_for_status()
            data = response.json()
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching Data.gov data: {e}")
        return None
    except ValueError as e:
        # Body was not valid JSON (e.g. an HTML error page served with status 200).
        logger.error(f"Error decoding Data.gov response: {e}")
        return None

    if not isinstance(data, dict) or not data.get('success'):
        logger.warning("Data.gov API returned success=False")
        return None

    results = data.get('result', {}).get('results', [])
    # Report the actual query instead of assuming it was "parking".
    logger.info(f"Found {len(results)} datasets on Data.gov for query {query!r}")
    return data

# Run the Data.gov parking search and summarise what came back
parking_datasets = search_data_gov_datasets(query="parking", limit=50, api_key=DATA_GOV_API_KEY)
if not parking_datasets:
    print("⚠ Data.gov dataset search failed or returned no results")
else:
    results = parking_datasets.get('result', {}).get('results', [])
    print(f"✓ Found {len(results)} parking datasets on Data.gov")
    # Show one title as a quick sanity check of the payload
    if results:
        print(f"Sample dataset: {results[0].get('title', 'N/A')}")

### 2.2 Census Bureau API Integration - Metropolitan Areas and Cities

In [None]:
def fetch_census_msa_data(year: int = 2023, api_key: Optional[str] = None) -> Optional[List]:
    """
    Fetch metropolitan statistical area (MSA) population and demographic data.

    API Documentation: https://www.census.gov/data/developers/data-sets.html

    Args:
        year: ACS 5-year vintage to query (used in the URL path).
        api_key: Optional Census API key; omitted from the request when falsy.

    Returns:
        The decoded JSON payload, or ``None`` on a transport/HTTP error or a
        body that is not valid JSON. The caller in this notebook indexes the
        payload as a list of rows (presumably a header row followed by one
        row per area - confirm against the Census API documentation).
    """
    # ACS 5-year estimates for MSA data
    url = f"{CENSUS_BASE_URL}/{year}/acs/acs5"

    params = {
        "get": "B01001_001E,B19013_001E",  # Total population, Median household income
        "for": "metropolitan statistical area/micropolitan statistical area:*",
    }
    if api_key:
        params["key"] = api_key

    try:
        # The context manager closes the session's pooled connections on every path.
        with create_session_with_retry() as session:
            response = session.get(url, params=params, timeout=30)
            response.raise_for_status()
            data = response.json()
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching Census data: {e}")
        return None
    except ValueError as e:
        # Empty or non-JSON body (e.g. an HTML error page): treat as a failed fetch.
        logger.error(f"Error decoding Census response for year {year}: {e}")
        return None

    logger.info(f"Fetched Census MSA data for year {year}")
    return data

# Fetch Census MSA data
# ACS 5-year estimates for a given year are published late in the following
# year, so last year's vintage does not exist for most of the calendar year.
# Request the vintage from two years back, which has always been released.
current_year = datetime.now().year
census_msa_data = fetch_census_msa_data(year=current_year - 2, api_key=CENSUS_API_KEY)
if census_msa_data:
    print(f"✓ Fetched Census MSA data")
    # Row 0 is skipped here; row 1 is shown as the first sample record.
    if len(census_msa_data) > 1:
        print(f"Sample: {census_msa_data[1]}")
else:
    print("⚠ Census MSA data fetch failed or skipped")

### 2.3 BTS TranStats - Airport Data Extraction

In [None]:
def fetch_bts_airport_data() -> Optional[pd.DataFrame]:
    """
    Placeholder for airport passenger data extraction from BTS TranStats.

    No request is made: the function logs and prints a notice pointing at the
    TranStats airports page (which needs web scraping or a CSV download) and
    always returns ``None``.
    """
    transtats_page = "https://www.transtats.bts.gov/airports.asp"
    airport_frame = None  # nothing is downloaded yet, so there is never a frame to return

    try:
        logger.info("BTS TranStats data extraction - requires web scraping or CSV download")
        print("⚠ BTS TranStats extraction requires web scraping or manual CSV download")
        print(f"URL: {transtats_page}")
    except Exception as exc:
        logger.error(f"Error fetching BTS data: {exc}")

    return airport_frame

# Attempt BTS data extraction. The function is currently a placeholder that
# always returns None, so `bts_airport_data` is None after this cell.
bts_airport_data = fetch_bts_airport_data()

### 2.4 City Open Data Portals - Parking Facility Data

In [None]:
def _socrata_rows_to_frame(payload) -> pd.DataFrame:
    """Turn a decoded Socrata ``rows.json`` payload into a DataFrame.

    ``rows.json`` wraps the records in an envelope: row values are under
    ``data`` and column descriptors under ``meta.view.columns``. Any other
    payload shape (e.g. a plain list of records) is passed to pandas as-is.
    """
    if isinstance(payload, dict) and "data" in payload:
        rows = payload["data"]
        columns = ((payload.get("meta") or {}).get("view") or {}).get("columns") or []
        names = [col.get("fieldName") or col.get("name") for col in columns]
        # Only label the columns when the descriptors line up with the row width.
        if names and (not rows or len(names) == len(rows[0])):
            return pd.DataFrame(rows, columns=names)
        return pd.DataFrame(rows)
    return pd.DataFrame(payload)


def fetch_city_parking_data(city: str, dataset_id: str) -> Optional[pd.DataFrame]:
    """
    Fetch parking facility data from a configured city open data portal.

    Only Socrata portals (Seattle, San Francisco, Austin) are configured here.

    Args:
        city: Portal key, matched case-insensitively (e.g. ``"seattle"``).
        dataset_id: Socrata dataset identifier to download. When falsy, the
            default dataset configured for the city is used instead.

    Returns:
        A DataFrame of the dataset's rows, or ``None`` if the city is not
        configured, the request fails, or the payload cannot be parsed.
    """
    city_portals = {
        "seattle": {
            "base_url": "https://data.seattle.gov/api/views",
            "api_type": "Socrata",
            "dataset_id": "3k2p-39jp"  # Public Garages and Parking Lots
        },
        "san_francisco": {
            "base_url": "https://data.sfgov.org/api/views",
            "api_type": "Socrata",
            "dataset_id": "wj8u-xu2y"  # Parking Meters
        },
        "austin": {
            "base_url": "https://data.austintexas.gov/api/views",
            "api_type": "Socrata",
            "dataset_id": "7d8e-dm7r"  # Off-Street Parking
        }
    }

    portal_config = city_portals.get(city.lower())
    if portal_config is None:
        logger.warning(f"City {city} not configured")
        return None

    # Honour the caller's dataset id; fall back to the portal default when none is given.
    resolved_id = dataset_id or portal_config["dataset_id"]

    # Socrata API endpoint
    url = f"{portal_config['base_url']}/{resolved_id}/rows.json"

    try:
        # The context manager closes the session's pooled connections on every path.
        with create_session_with_retry() as session:
            response = session.get(url, timeout=30)
            response.raise_for_status()
            payload = response.json()
        df = _socrata_rows_to_frame(payload)
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching {city} parking data: {e}")
        return None
    except (ValueError, TypeError, AttributeError) as e:
        # Non-JSON body or an envelope pandas cannot tabulate.
        logger.error(f"Error parsing {city} parking data: {e}")
        return None

    logger.info(f"Fetched {len(df)} parking facilities from {city}")
    return df

# Example: pull the Seattle garages-and-lots dataset and report the outcome
seattle_parking = fetch_city_parking_data("seattle", "3k2p-39jp")
if seattle_parking is None:
    print("⚠ Seattle parking data fetch failed or skipped")
else:
    print(f"✓ Fetched {len(seattle_parking)} parking facilities from Seattle")

## Section 3: Transform Phase

### 3.1 Data Cleaning and Normalization

In [None]:
def clean_parking_facility_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean and normalize parking facility data.

    Steps, in order:
      1. When both ``latitude`` and ``longitude`` columns exist, coerce them to
         numbers (unparseable values become NaN) so that coordinates delivered
         as strings do not break the range check.
      2. Drop exact duplicate rows.
      3. Keep only rows with latitude in [-90, 90] and longitude in
         [-180, 180]; rows with missing coordinates are dropped as well.

    Args:
        df: Raw facility records; may be ``None`` or empty.

    Returns:
        A new cleaned DataFrame (the input is not modified). An empty
        DataFrame is returned for ``None`` or empty input.
    """
    if df is None or df.empty:
        return pd.DataFrame()

    has_coordinates = 'latitude' in df.columns and 'longitude' in df.columns

    if has_coordinates:
        # assign() builds a new frame, leaving the caller's data untouched.
        df = df.assign(
            latitude=pd.to_numeric(df['latitude'], errors='coerce'),
            longitude=pd.to_numeric(df['longitude'], errors='coerce'),
        )

    # Remove duplicates
    df = df.drop_duplicates()

    # Standardize column names (example - adjust based on actual data)
    # This is a placeholder - actual cleaning depends on source data format

    # Validate coordinates
    if has_coordinates:
        # between() is inclusive on both ends and False for NaN.
        df = df[
            df['latitude'].between(-90, 90) &
            df['longitude'].between(-180, 180)
        ]

    logger.info(f"Cleaned parking facility data: {len(df)} records")
    return df

# Example: Clean Seattle parking data.
# `cleaned_seattle` is only defined when the Seattle fetch succeeded, which is
# why later cells that use it also check `seattle_parking is not None`.
if seattle_parking is not None:
    cleaned_seattle = clean_parking_facility_data(seattle_parking)
    print(f"✓ Cleaned {len(cleaned_seattle)} parking facilities")

## Section 4: Load Phase

### 4.1 Load to PostgreSQL

In [None]:
def load_to_postgresql(df: pd.DataFrame, table_name: str, connection_string: str) -> bool:
    """
    Append a DataFrame to a PostgreSQL table.

    Args:
        df: Records to load.
        table_name: Target table; rows are appended (``if_exists='append'``).
        connection_string: SQLAlchemy connection URL. A falsy value skips the load.

    Returns:
        ``True`` when the rows were written; ``False`` when the load was
        skipped (no connection string, SQLAlchemy missing) or failed. Failures
        are logged rather than raised.
    """
    if not connection_string:
        logger.warning("PostgreSQL connection not configured")
        return False
    if not SQLALCHEMY_AVAILABLE:
        # Distinguish a missing dependency from a missing connection string.
        logger.warning("SQLAlchemy is not installed; PostgreSQL load skipped")
        return False

    engine = None
    try:
        engine = create_engine(connection_string)
        df.to_sql(table_name, engine, if_exists='append', index=False)
        logger.info(f"Loaded {len(df)} records to PostgreSQL table {table_name}")
        return True
    except Exception as e:
        # Best-effort load: report the problem and let the pipeline continue.
        logger.error(f"Error loading to PostgreSQL: {e}")
        return False
    finally:
        # A new engine is built per call, so release its connection pool each time.
        if engine is not None:
            engine.dispose()

# Example: Load to PostgreSQL (requires connection string)
# Report the real reason for skipping, and surface the load result instead of ignoring it.
if not POSTGRES_CONNECTION_STRING:
    print("⚠ PostgreSQL loading skipped (connection not configured)")
elif seattle_parking is None:
    print("⚠ PostgreSQL loading skipped (no parking data extracted)")
elif load_to_postgresql(cleaned_seattle, "parking_facilities", POSTGRES_CONNECTION_STRING):
    print(f"✓ Loaded {len(cleaned_seattle)} records into parking_facilities")
else:
    print("⚠ PostgreSQL loading failed (see log for details)")

## Section 5: Validate Phase

### 5.1 Data Quality Checks

In [None]:
def validate_data_quality(df: pd.DataFrame, table_name: str) -> Dict:
    """
    Compute basic data quality metrics for a DataFrame.

    Args:
        df: Data to inspect; may be ``None`` or empty.
        table_name: Label stored in the result and used in the log line.

    Returns:
        For ``None``/empty input: ``status`` = ``"empty"``, ``records`` = 0,
        plus ``table_name`` and ``total_records`` = 0 so callers can read
        ``total_records`` in every case.
        Otherwise: ``table_name``, ``total_records``, ``null_percentage``
        (per-column percentage of nulls), ``duplicate_count`` (fully
        duplicated rows, as a plain ``int``) and ``data_types`` (per-column dtype).
    """
    if df is None or df.empty:
        return {
            "status": "empty",
            "records": 0,
            "table_name": table_name,
            "total_records": 0,
        }

    total_records = len(df)
    metrics = {
        "table_name": table_name,
        "total_records": total_records,
        "null_percentage": (df.isnull().sum() / total_records * 100).to_dict(),
        # Cast the numpy scalar to a built-in int.
        "duplicate_count": int(df.duplicated().sum()),
        "data_types": df.dtypes.to_dict()
    }

    logger.info(f"Data quality metrics for {table_name}: {metrics}")
    return metrics

# Example: Validate Seattle parking data
if seattle_parking is not None:
    quality_metrics = validate_data_quality(cleaned_seattle, "parking_facilities")
    # An empty frame may yield a result without 'total_records'; fall back to
    # its 'records' count (or 0) instead of raising KeyError.
    validated_records = quality_metrics.get('total_records', quality_metrics.get('records', 0))
    print(f"✓ Data quality validation complete: {validated_records} records")

## Section 6: Monitor Phase

### 6.1 Pipeline Execution Tracking

In [None]:
# Track pipeline execution: which sources returned data and how many records were pulled
pipeline_metadata = {
    "execution_timestamp": datetime.now().isoformat(),
    "database": DB_NAME,
    "sources_extracted": {
        "data_gov": parking_datasets is not None,
        "census": census_msa_data is not None,
        "bts": bts_airport_data is not None,
        "city_portals": seattle_parking is not None
    },
    "records_extracted": {
        "parking_facilities": len(seattle_parking) if seattle_parking is not None else 0
    },
    "data_quality_scores": {}
}

# Save pipeline metadata
metadata_file = RESEARCH_DIR / "pipeline_execution_log.json"
# The research directory may not exist yet; open() would fail without it.
metadata_file.parent.mkdir(parents=True, exist_ok=True)
with open(metadata_file, 'w', encoding='utf-8') as f:
    json.dump(pipeline_metadata, f, indent=2)

print(f"✓ Pipeline execution metadata saved to {metadata_file}")
print("Pipeline Summary:")
print(json.dumps(pipeline_metadata, indent=2))