# ETL/ELT Pipeline - DB-1

This notebook provides a comprehensive ETL/ELT pipeline for database db-1.

## Pipeline Overview
1. **Extract**: Load data from source systems
2. **Transform**: Clean, validate, and transform data
3. **Load**: Load transformed data into target database
4. **Validate**: Verify data quality and completeness
5. **Monitor**: Track pipeline performance and errors

## Section 1: Setup and Configuration

In [None]:
import sys
import os
from pathlib import Path
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import json
import logging
from typing import Dict, List, Optional
import warnings
warnings.filterwarnings('ignore')

# API and HTTP requests
try:
    import requests
    REQUESTS_AVAILABLE = True
except ImportError:
    REQUESTS_AVAILABLE = False
    print("Warning: requests library not available. Install with: pip install requests")

# Database connections
try:
    from sqlalchemy import create_engine, text
    SQLALCHEMY_AVAILABLE = True
except ImportError:
    SQLALCHEMY_AVAILABLE = False
    print("Warning: sqlalchemy not available")

# Set up logging
# Configure the root logger: INFO and above, each record prefixed with
# timestamp, logger name and level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Logger shared by every helper function defined in this notebook.
logger = logging.getLogger(__name__)

# Set display options
# No cap on displayed columns, at most 50 rows; width=None leaves the
# display width for pandas to determine.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)
pd.set_option('display.width', None)

# NOTE: printed unconditionally, including when the optional requests /
# sqlalchemy imports above only emitted a warning.
print("✓ Imports successful")

In [None]:
# Configuration
DB_NAME = "db-1"
# The parent of the current working directory is used as the database root;
# data/ and research/ below are resolved relative to it.
DB_PATH = Path.cwd().parent

# Database connection strings (configure as needed)
# PostgreSQL
POSTGRES_CONNECTION_STRING = None  # "postgresql://user:password@localhost:5432/dbname"

# Databricks
DATABRICKS_CONNECTION_STRING = None  # Configure Databricks connection

# Snowflake
SNOWFLAKE_CONNECTION_STRING = None  # Configure Snowflake connection

# Source data paths
DATA_DIR = DB_PATH / "data"
SCHEMA_FILE = DATA_DIR / "schema.sql"
DATA_FILE = DATA_DIR / "data.sql"

# API Configuration
# Data.gov CKAN API (no key required for metadata access)
DATA_GOV_CKAN_API_BASE = "https://catalog.data.gov/api/3/action"

# api.data.gov (requires API key - get from https://api.data.gov/signup/)
# Set via environment variable: export API_DATA_GOV_KEY="your_key_here"
# Stays None when the environment variable is not set.
API_DATA_GOV_KEY = os.getenv("API_DATA_GOV_KEY", None)

# National Weather Service API (no key required)
NWS_API_BASE = "https://api.weather.gov"

# GeoPlatform.gov API (no key required)
GEOPLATFORM_API_BASE = "https://geoapi.geoplatform.gov"
GEOPLATFORM_STAC_BASE = "https://stac.geoplatform.gov"
GEOPLATFORM_WEB_BASE = "https://www.geoplatform.gov"

# Research directory for resource documentation
RESEARCH_DIR = DB_PATH / "research"
RESOURCES_FILE = RESEARCH_DIR / "data_resources.json"
SOURCE_METADATA_FILE = RESEARCH_DIR / "source_metadata.json"

# Echo the resolved configuration so path problems are visible immediately.
print(f"Database: {DB_NAME}")
print(f"Data directory: {DATA_DIR}")
print(f"Schema file exists: {SCHEMA_FILE.exists()}")
print(f"Data file exists: {DATA_FILE.exists()}")
print(f"Research directory: {RESEARCH_DIR}")
print(f"✓ API endpoints configured")

## Section 2: Extract - Data Loading

In [None]:
def load_schema_file(schema_path: Path) -> Optional[str]:
    """Load database schema from SQL file.

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the platform's locale encoding.

    Args:
        schema_path: Location of the schema SQL file.

    Returns:
        The file contents as a string, or None when the file is missing or
        cannot be read (the reason is logged).
    """
    try:
        # Open directly instead of checking exists() first: avoids the window
        # in which the file can disappear between the check and the open.
        with open(schema_path, 'r', encoding='utf-8') as f:
            return f.read()
    except FileNotFoundError:
        logger.warning(f"Schema file not found: {schema_path}")
        return None
    except Exception as e:
        # Best-effort loader: any other failure (permissions, decoding, ...)
        # is logged and reported to the caller as None.
        logger.error(f"Error loading schema: {e}")
        return None

def load_data_file(data_path: Path) -> Optional[str]:
    """Load data from SQL file.

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the platform's locale encoding.

    Args:
        data_path: Location of the data SQL file.

    Returns:
        The file contents as a string, or None when the file is missing or
        cannot be read (the reason is logged).
    """
    try:
        # Open directly instead of checking exists() first: avoids the window
        # in which the file can disappear between the check and the open.
        with open(data_path, 'r', encoding='utf-8') as f:
            return f.read()
    except FileNotFoundError:
        logger.warning(f"Data file not found: {data_path}")
        return None
    except Exception as e:
        # Best-effort loader: any other failure (permissions, decoding, ...)
        # is logged and reported to the caller as None.
        logger.error(f"Error loading data: {e}")
        return None

# Load schema and data
# Both loaders return None on failure, so each result is checked before it is
# reported.
schema_sql = load_schema_file(SCHEMA_FILE)
data_sql = load_data_file(DATA_FILE)

# An empty file yields "" (falsy) and is therefore not reported as loaded.
if schema_sql:
    print(f"✓ Schema loaded ({len(schema_sql)} characters)")
if data_sql:
    print(f"✓ Data loaded ({len(data_sql)} characters)")

In [None]:
def extract_from_csv(csv_path: Path) -> Optional[pd.DataFrame]:
    """Read a CSV file into a DataFrame.

    Args:
        csv_path: Location of the CSV file.

    Returns:
        The parsed DataFrame, or None when the path does not exist or the
        file cannot be read (read failures are logged at error level).
    """
    try:
        # A missing file is not treated as an error: return None quietly.
        if not csv_path.exists():
            return None

        frame = pd.read_csv(csv_path)
        logger.info(f"Loaded {len(frame)} rows from {csv_path.name}")
        return frame
    except Exception as err:
        logger.error(f"Error loading CSV {csv_path}: {err}")
        return None

def extract_from_json(json_path: Path) -> Optional[Dict]:
    """Extract data from JSON file.

    The file is opened as UTF-8 explicitly rather than with the platform's
    locale encoding, so non-ASCII content parses the same on every system.

    Args:
        json_path: Location of the JSON file.

    Returns:
        The parsed JSON document (whatever top-level value the file holds),
        or None when the path does not exist or the file cannot be read or
        parsed (failures are logged at error level).
    """
    try:
        # A missing file is not treated as an error: return None quietly.
        if not json_path.exists():
            return None

        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        logger.info(f"Loaded JSON from {json_path.name}")
        return data
    except Exception as e:
        logger.error(f"Error loading JSON {json_path}: {e}")
        return None

# Find and load data files
# Only files directly inside DATA_DIR are matched (the glob is not recursive).
csv_files = list(DATA_DIR.glob("*.csv"))
json_files = list(DATA_DIR.glob("*.json"))

# Maps file stem -> loaded object (DataFrame for CSV, parsed JSON for .json).
extracted_data = {}

for csv_file in csv_files:
    df = extract_from_csv(csv_file)
    # Files that failed to load (None) are skipped.
    if df is not None:
        extracted_data[csv_file.stem] = df

# JSON files are processed after the CSVs, so a JSON file that shares its
# stem with a CSV replaces that CSV's entry.
for json_file in json_files:
    data = extract_from_json(json_file)
    if data is not None:
        extracted_data[json_file.stem] = data

print(f"✓ Extracted {len(extracted_data)} data sources")

## Section 2.1: Extract - Data.gov API Integration

In [None]:
def search_datagov_datasets(query: str = "", limit: int = 10, organization: str = None) -> Optional[Dict]:
    """
    Query the Data.gov CKAN ``package_search`` action.

    Results are requested sorted by most recently modified metadata first.

    Args:
        query: Search query string
        limit: Maximum number of results to return
        organization: Filter by organization (e.g., 'usgs-gov', 'noaa-gov')

    Returns:
        The ``result`` object of the API response, or None when requests is
        unavailable, the HTTP call fails, or the API reports success=False
    """
    if not REQUESTS_AVAILABLE:
        logger.warning("requests library not available for API calls")
        return None

    try:
        search_params = {
            'q': query,
            'rows': limit,
            'sort': 'metadata_modified desc',
        }
        # The organization filter is only sent when one was supplied.
        if organization:
            search_params['fq'] = f'organization:{organization}'

        response = requests.get(
            f"{DATA_GOV_CKAN_API_BASE}/package_search",
            params=search_params,
            timeout=30,
        )
        response.raise_for_status()

        payload = response.json()
        if not payload.get('success'):
            logger.error(f"API returned success=False: {payload.get('error', {})}")
            return None

        results = payload.get('result', {})
        logger.info(f"Found {results.get('count', 0)} datasets matching query: {query}")
        return results

    except requests.exceptions.RequestException as err:
        logger.error(f"Error querying Data.gov API: {err}")
        return None
    except Exception as err:
        logger.error(f"Unexpected error: {err}")
        return None

def get_datagov_dataset_details(package_id: str) -> Optional[Dict]:
    """
    Fetch the record of one Data.gov dataset via the CKAN ``package_show`` action.

    Args:
        package_id: Dataset package ID from Data.gov

    Returns:
        The ``result`` object of the API response, or None when requests is
        unavailable, the call fails, or the API does not report success
    """
    if not REQUESTS_AVAILABLE:
        return None

    try:
        response = requests.get(
            f"{DATA_GOV_CKAN_API_BASE}/package_show",
            params={'id': package_id},
            timeout=30,
        )
        response.raise_for_status()

        payload = response.json()
        return payload.get('result') if payload.get('success') else None
    except Exception as err:
        logger.error(f"Error fetching dataset details for {package_id}: {err}")
        return None

def download_datagov_resource(resource_url: str, output_path: Path = None) -> Optional[pd.DataFrame]:
    """
    Download and load a Data.gov resource (CSV, JSON, etc.).

    The resource type is chosen from the URL suffix first and the response
    ``content-type`` header second; CSV is checked before JSON.

    Args:
        resource_url: URL to the resource file
        output_path: Optional path to save the file locally

    Returns:
        DataFrame if CSV, the parsed JSON document if JSON, None otherwise
        (requests unavailable, unsupported type, or any download/parse error).
        Note that the return annotation only names the DataFrame case.
    """
    # Function-scope import: io.StringIO is the public class; the previous
    # pd.io.common.StringIO was merely a re-export inside a pandas-internal
    # module and is not part of the pandas API.
    from io import StringIO

    if not REQUESTS_AVAILABLE:
        return None

    try:
        response = requests.get(resource_url, timeout=60)
        response.raise_for_status()

        # Determine file type from URL or content
        content_type = response.headers.get('content-type', '')

        if resource_url.endswith('.csv') or 'csv' in content_type:
            df = pd.read_csv(StringIO(response.text))
            if output_path:
                df.to_csv(output_path, index=False)
            logger.info(f"Downloaded CSV resource: {len(df)} rows")
            return df

        if resource_url.endswith('.json') or 'json' in content_type:
            data = response.json()
            if output_path:
                with open(output_path, 'w') as f:
                    json.dump(data, f, indent=2)
            logger.info("Downloaded JSON resource")
            return data

        logger.warning(f"Unsupported resource type: {resource_url}")
        return None

    except Exception as e:
        # Best-effort download: network, parsing and file errors are all
        # logged and reported to the caller as None.
        logger.error(f"Error downloading resource {resource_url}: {e}")
        return None

# Example: Search for weather-related datasets
print("Searching Data.gov for weather datasets...")
weather_datasets = search_datagov_datasets(query="weather", limit=5, organization="noaa-gov")

if weather_datasets:
    print(f"✓ Found {weather_datasets.get('count', 0)} weather datasets")
    if weather_datasets.get('results'):
        print("\nSample datasets:")
        # Show at most the first three hits.
        for dataset in weather_datasets['results'][:3]:
            print(f"  - {dataset.get('title', 'N/A')} (ID: {dataset.get('id', 'N/A')})")
else:
    # Reached for any falsy result: requests missing, a failed request, or an
    # empty result dictionary - not only the case named in the message.
    print("⚠ Could not search Data.gov (requests library may not be available)")

## Section 2.2: Extract - National Weather Service API Integration

In [None]:
def get_nws_points(latitude: float, longitude: float) -> Optional[Dict]:
    """
    Look up the NWS ``/points`` record for a coordinate pair.

    Args:
        latitude: Latitude in decimal degrees (WGS84)
        longitude: Longitude in decimal degrees (WGS84)

    Returns:
        The parsed JSON response, or None when requests is unavailable or
        the call fails
    """
    if not REQUESTS_AVAILABLE:
        return None

    try:
        # Coordinates are sent with at most 4 decimal places (the original
        # author's note: required by the NWS API).
        lat, lon = round(latitude, 4), round(longitude, 4)

        response = requests.get(
            f"{NWS_API_BASE}/points/{lat},{lon}",
            headers={'User-Agent': 'ETL-Pipeline/1.0 (Research Project)'},
            timeout=30,
        )
        response.raise_for_status()

        points = response.json()
        logger.info(f"Retrieved NWS points data for ({lat}, {lon})")
        return points

    except requests.exceptions.RequestException as err:
        logger.error(f"Error querying NWS API: {err}")
        return None
    except Exception as err:
        logger.error(f"Unexpected error: {err}")
        return None

def get_nws_forecast(forecast_url: str) -> Optional[Dict]:
    """
    Download the forecast document behind an NWS forecast URL.

    Args:
        forecast_url: Forecast URL from points endpoint

    Returns:
        The parsed JSON response, or None when requests is unavailable or
        the call fails
    """
    if not REQUESTS_AVAILABLE:
        return None

    try:
        response = requests.get(
            forecast_url,
            headers={'User-Agent': 'ETL-Pipeline/1.0 (Research Project)'},
            timeout=30,
        )
        response.raise_for_status()

        forecast = response.json()
        logger.info("Retrieved NWS forecast data")
        return forecast

    except Exception as err:
        logger.error(f"Error fetching forecast: {err}")
        return None

def get_nws_gridpoint_forecast(grid_id: str, grid_x: int, grid_y: int) -> Optional[Dict]:
    """
    Fetch the ``/forecast`` resource of an NWS gridpoint.

    Args:
        grid_id: Grid ID (e.g., 'OKX')
        grid_x: Grid X coordinate
        grid_y: Grid Y coordinate

    Returns:
        The parsed JSON response, or None when requests is unavailable or
        the call fails
    """
    if not REQUESTS_AVAILABLE:
        return None

    try:
        response = requests.get(
            f"{NWS_API_BASE}/gridpoints/{grid_id}/{grid_x},{grid_y}/forecast",
            headers={'User-Agent': 'ETL-Pipeline/1.0 (Research Project)'},
            timeout=30,
        )
        response.raise_for_status()

        forecast = response.json()
        logger.info(f"Retrieved NWS gridpoint forecast for {grid_id}/{grid_x},{grid_y}")
        return forecast

    except Exception as err:
        logger.error(f"Error fetching gridpoint forecast: {err}")
        return None

def get_nws_alerts(area: str = None, severity: str = None, urgency: str = None) -> Optional[Dict]:
    """
    Fetch currently active alerts from the NWS ``/alerts/active`` endpoint.

    Only the filters that were actually supplied are sent as query parameters.

    Args:
        area: State code (e.g., 'NY', 'CA') or area code
        severity: Filter by severity (extreme, severe, moderate, minor, unknown)
        urgency: Filter by urgency (immediate, expected, future, past, unknown)

    Returns:
        The parsed JSON response, or None when requests is unavailable or
        the call fails
    """
    if not REQUESTS_AVAILABLE:
        return None

    try:
        candidate_filters = (('area', area), ('severity', severity), ('urgency', urgency))
        # Drop filters that are None or empty.
        active_filters = {name: value for name, value in candidate_filters if value}

        response = requests.get(
            f"{NWS_API_BASE}/alerts/active",
            params=active_filters,
            headers={'User-Agent': 'ETL-Pipeline/1.0 (Research Project)'},
            timeout=30,
        )
        response.raise_for_status()

        alerts = response.json()
        logger.info(f"Retrieved NWS alerts (area: {area or 'all'})")
        return alerts

    except Exception as err:
        logger.error(f"Error fetching alerts: {err}")
        return None

def nws_forecast_to_dataframe(forecast_data: Dict) -> Optional[pd.DataFrame]:
    """
    Build a DataFrame from the ``properties.periods`` list of an NWS forecast.

    Args:
        forecast_data: Forecast data dictionary from NWS API

    Returns:
        DataFrame with one row per forecast period, or None when the input
        is empty, has no 'properties' key, has no periods, or conversion fails
    """
    # Guard clauses: nothing to convert without a 'properties' section.
    if not forecast_data:
        return None
    if 'properties' not in forecast_data:
        return None

    try:
        period_rows = forecast_data['properties'].get('periods', [])
        if period_rows:
            frame = pd.DataFrame(period_rows)
            logger.info(f"Converted {len(frame)} forecast periods to DataFrame")
            return frame
        return None

    except Exception as err:
        logger.error(f"Error converting forecast to DataFrame: {err}")
        return None

# Example: Get forecast for New York City (latitude: 40.7128, longitude: -74.0060)
print("Fetching NWS data for New York City...")
nyc_points = get_nws_points(40.7128, -74.0060)

if nyc_points:
    print("✓ Retrieved NWS points data")
    properties = nyc_points.get('properties', {})
    print(f"  Forecast Office: {properties.get('forecastOffice', 'N/A')}")
    print(f"  Grid ID: {properties.get('gridId', 'N/A')}")
    print(f"  Grid X: {properties.get('gridX', 'N/A')}")
    print(f"  Grid Y: {properties.get('gridY', 'N/A')}")

    # Get forecast
    # The points record supplies the forecast URL; each later step runs only
    # if the previous one produced a result.
    forecast_url = properties.get('forecast')
    if forecast_url:
        forecast_data = get_nws_forecast(forecast_url)
        if forecast_data:
            forecast_df = nws_forecast_to_dataframe(forecast_data)
            if forecast_df is not None:
                # Register the forecast alongside the locally extracted sources.
                extracted_data['nws_nyc_forecast'] = forecast_df
                print(f"✓ Added forecast data: {len(forecast_df)} periods")
else:
    # Reached for any falsy result (requests missing or a failed request),
    # not only the case named in the message.
    print("⚠ Could not fetch NWS data (requests library may not be available)")

## Section 2.3: Resource Collation and Documentation

## Section 2.3: Extract - GeoPlatform.gov API Integration

In [None]:
def search_geoplatform_datasets(query: str = "", limit: int = 10, ngda_only: bool = False) -> Optional[Dict]:
    """
    Query the GeoPlatform GeoAPI ``/items/search`` endpoint.

    Args:
        query: Search query string
        limit: Maximum number of results to return
        ngda_only: Filter to National Geospatial Data Assets (NGDA) only

    Returns:
        The parsed JSON response, or None when requests is unavailable or
        the call fails
    """
    if not REQUESTS_AVAILABLE:
        logger.warning("requests library not available for API calls")
        return None

    try:
        search_params = {'q': query, 'limit': limit}
        # Restrict the search to the 'ngda' collection on request.
        if ngda_only:
            search_params['collections'] = 'ngda'

        response = requests.get(
            f"{GEOPLATFORM_API_BASE}/items/search",
            params=search_params,
            headers={'Accept': 'application/json'},
            timeout=30,
        )
        response.raise_for_status()

        results = response.json()
        match_count = len(results.get('features', []))
        logger.info(f"Found {match_count} GeoPlatform datasets matching query: {query}")
        return results

    except requests.exceptions.RequestException as err:
        logger.error(f"Error querying GeoPlatform API: {err}")
        return None
    except Exception as err:
        logger.error(f"Unexpected error: {err}")
        return None

def get_geoplatform_stac_collections() -> Optional[Dict]:
    """
    List the collections exposed by the GeoPlatform STAC endpoint.

    Returns:
        The parsed JSON response of ``/collections``, or None when requests
        is unavailable or the call fails
    """
    if not REQUESTS_AVAILABLE:
        return None

    try:
        response = requests.get(
            f"{GEOPLATFORM_STAC_BASE}/collections",
            headers={'Accept': 'application/json'},
            timeout=30,
        )
        response.raise_for_status()

        catalog = response.json()
        collection_count = len(catalog.get('collections', []))
        logger.info(f"Retrieved {collection_count} STAC collections")
        return catalog

    except Exception as err:
        logger.error(f"Error fetching STAC collections: {err}")
        return None

def get_geoplatform_stac_collection(collection_id: str) -> Optional[Dict]:
    """
    Fetch a single collection from the GeoPlatform STAC endpoint.

    Args:
        collection_id: Collection identifier

    Returns:
        The parsed JSON response of ``/collections/{collection_id}``, or
        None when requests is unavailable or the call fails
    """
    if not REQUESTS_AVAILABLE:
        return None

    try:
        response = requests.get(
            f"{GEOPLATFORM_STAC_BASE}/collections/{collection_id}",
            headers={'Accept': 'application/json'},
            timeout=30,
        )
        response.raise_for_status()

        collection = response.json()
        logger.info(f"Retrieved STAC collection: {collection_id}")
        return collection

    except Exception as err:
        logger.error(f"Error fetching STAC collection {collection_id}: {err}")
        return None

def search_geoplatform_stac(query: Dict = None, collections: List[str] = None, limit: int = 10) -> Optional[Dict]:
    """
    POST a search request to the GeoPlatform STAC ``/search`` endpoint.

    ``limit`` and the comma-joined ``collections`` travel as URL query
    parameters; the contents of ``query`` form the JSON request body.

    Args:
        query: STAC query parameters (bbox, datetime, etc.)
        collections: List of collection IDs to search
        limit: Maximum number of results

    Returns:
        The parsed JSON response, or None when requests is unavailable or
        the call fails
    """
    if not REQUESTS_AVAILABLE:
        return None

    try:
        url_params = {'limit': limit}
        if collections:
            url_params['collections'] = ','.join(collections)

        # Copy the caller's query so the request body is an independent dict.
        body = dict(query) if query else {}

        response = requests.post(
            f"{GEOPLATFORM_STAC_BASE}/search",
            json=body,
            params=url_params,
            headers={
                'Accept': 'application/json',
                'Content-Type': 'application/json',
            },
            timeout=30,
        )
        response.raise_for_status()

        results = response.json()
        logger.info(f"Found {len(results.get('features', []))} STAC items")
        return results

    except Exception as err:
        logger.error(f"Error searching STAC catalog: {err}")
        return None

def download_geoplatform_resource(resource_url: str, output_path: Path = None) -> Optional[pd.DataFrame]:
    """
    Download a GeoPlatform resource (GeoJSON, GeoPackage, Shapefile, etc.).

    URLs ending in ``.geojson`` or ``.json`` are parsed as JSON. A GeoJSON
    FeatureCollection is flattened into a DataFrame of feature properties
    (geometry is not included); any other JSON document is returned as
    parsed. Every other URL is treated as an opaque file and only saved.

    Args:
        resource_url: URL to the resource file
        output_path: Optional path to save the file locally. For a
            FeatureCollection the properties table is written next to it
            with a ``.csv`` suffix.

    Returns:
        DataFrame if GeoJSON with features, the parsed document for other
        JSON, the saved file path (str) for other formats, None otherwise
        (requests unavailable, empty FeatureCollection, missing output_path
        for a non-JSON resource, or any download/parse error).
        Note that the return annotation only names the DataFrame case.
    """
    if not REQUESTS_AVAILABLE:
        return None

    try:
        response = requests.get(resource_url, timeout=120)
        response.raise_for_status()

        # Determine file type from URL
        if resource_url.endswith(('.geojson', '.json')):
            data = response.json()

            if isinstance(data, dict) and 'features' in data:
                # GeoJSON FeatureCollection
                features = data.get('features', [])
                if not features:
                    # Previously this case returned None without any trace.
                    logger.warning(f"GeoJSON resource contains no features: {resource_url}")
                    return None

                # GeoJSON allows "properties": null; substitute an empty
                # mapping so a single such feature cannot fail the whole
                # conversion.
                rows = [(feature.get('properties') or {}) for feature in features]
                df = pd.json_normalize(rows)
                if output_path:
                    df.to_csv(output_path.with_suffix('.csv'), index=False)
                logger.info(f"Downloaded GeoJSON resource: {len(df)} features")
                return df

            # Regular JSON
            if output_path:
                with open(output_path, 'w') as f:
                    json.dump(data, f, indent=2)
            return data

        # Save other formats (GeoPackage, Shapefile, etc.) as files
        if output_path:
            with open(output_path, 'wb') as f:
                f.write(response.content)
            logger.info(f"Downloaded resource to {output_path}")
            return str(output_path)

        logger.warning("Output path required for non-JSON resources")
        return None

    except Exception as e:
        # Best-effort download: network, parsing and file errors are all
        # logged and reported to the caller as None.
        logger.error(f"Error downloading resource {resource_url}: {e}")
        return None

# Example: Search for NGDA datasets
print("Searching GeoPlatform.gov for NGDA datasets...")
ngda_datasets = search_geoplatform_datasets(query="boundaries", limit=5, ngda_only=True)

if ngda_datasets:
    print(f"✓ Found {len(ngda_datasets.get('features', []))} NGDA datasets")
    if ngda_datasets.get('features'):
        print("\nSample datasets:")
        # Show at most the first three hits.
        for feature in ngda_datasets['features'][:3]:
            props = feature.get('properties', {})
            print(f"  - {props.get('title', 'N/A')} (ID: {feature.get('id', 'N/A')})")
else:
    # Reached for any falsy result (requests missing, a failed request, or an
    # empty response), not only the case named in the message.
    print("⚠ Could not search GeoPlatform (requests library may not be available)")

# Example: Get STAC collections
print("\nFetching GeoPlatform STAC collections...")
stac_collections = get_geoplatform_stac_collections()

# Nothing is printed when the STAC call returned no result.
if stac_collections:
    print(f"✓ Retrieved {len(stac_collections.get('collections', []))} STAC collections")
    if stac_collections.get('collections'):
        print("\nSample collections:")
        for collection in stac_collections['collections'][:3]:
            print(f"  - {collection.get('title', 'N/A')} (ID: {collection.get('id', 'N/A')})")

## Section 2.4: Resource Collation and Documentation

In [None]:
def collate_data_resources() -> Dict:
    """
    Collate and document all data resources used in this ETL pipeline.

    The summary combines static descriptions of each API with the results
    that earlier notebook cells produced. Those optional results
    (``weather_datasets``, ``nyc_points``, ``ngda_datasets``,
    ``stac_collections``) are module-level variables and are looked up via
    ``globals()``; a name that was never assigned is skipped.

    Returns:
        Dictionary containing resource metadata
    """
    # Must be globals(), not locals(): inside a function, locals() only
    # contains this function's own variables, so checks against it never
    # find the notebook-level results and the sections below stay empty.
    notebook_vars = globals()

    resources = {
        'database': DB_NAME,
        'collation_timestamp': datetime.now().isoformat(),
        'data_sources': {
            'local_files': {
                'description': 'Local CSV, JSON, and SQL files',
                'location': str(DATA_DIR),
                'files': []
            },
            'data_gov': {
                'description': 'Data.gov CKAN API - Federal open data portal',
                'api_base': DATA_GOV_CKAN_API_BASE,
                'documentation': 'https://data.gov/developers/apis/',
                'datasets': []
            },
            'national_weather_service': {
                'description': 'National Weather Service API - Weather forecasts and alerts',
                'api_base': NWS_API_BASE,
                'documentation': 'https://weather-gov.github.io/api/',
                'endpoints': {
                    'points': '/points/{lat},{lon}',
                    'forecast': '/gridpoints/{gridId}/{gridX},{gridY}/forecast',
                    'alerts': '/alerts/active'
                },
                'data_collected': []
            },
            'geoplatform_gov': {
                'description': 'GeoPlatform.gov - Federal geospatial data platform (FAIR principles)',
                'api_base': GEOPLATFORM_API_BASE,
                'stac_base': GEOPLATFORM_STAC_BASE,
                'web_base': GEOPLATFORM_WEB_BASE,
                'documentation': 'https://www.geoplatform.gov/',
                'stac_docs': 'https://stac.geoplatform.gov',
                'api_docs': 'https://geoapi.geoplatform.gov',
                'endpoints': {
                    'search': '/items/search',
                    'stac_collections': '/collections',
                    'stac_search': '/search'
                },
                'data_formats': ['GeoJSON', 'GeoPackage', 'Shapefile', 'WMS', 'WFS'],
                'ngda_themes': 18,
                'datasets': []
            }
        },
        'extraction_summary': {
            'total_sources': len(extracted_data),
            'source_names': list(extracted_data.keys())
        }
    }

    sources = resources['data_sources']

    # Document local files (CSV first, then JSON, then SQL)
    local_files = sources['local_files']['files']
    for pattern, file_type in (("*.csv", 'CSV'), ("*.json", 'JSON'), ("*.sql", 'SQL')):
        for path in DATA_DIR.glob(pattern):
            local_files.append({
                'name': path.name,
                'type': file_type,
                'path': str(path)
            })

    # Document Data.gov datasets if searched
    weather = notebook_vars.get('weather_datasets')
    if weather:
        for dataset in (weather.get('results') or [])[:10]:  # Limit to 10
            # "organization" may be present but null; treat it as missing.
            organization = dataset.get('organization') or {}
            sources['data_gov']['datasets'].append({
                'id': dataset.get('id'),
                'title': dataset.get('title'),
                'organization': organization.get('name', 'N/A'),
                'modified': dataset.get('metadata_modified'),
                'resources': len(dataset.get('resources') or [])
            })

    # Document NWS data collected
    nws_collected = sources['national_weather_service']['data_collected']
    points = notebook_vars.get('nyc_points')
    if points:
        nws_collected.append({
            'type': 'points',
            'location': 'New York City (40.7128, -74.0060)',
            'timestamp': datetime.now().isoformat(),
            'forecast_office': (points.get('properties') or {}).get('forecastOffice')
        })

    if 'nws_nyc_forecast' in extracted_data:
        nws_collected.append({
            'type': 'forecast',
            'location': 'New York City',
            'periods': len(extracted_data['nws_nyc_forecast']),
            'timestamp': datetime.now().isoformat()
        })

    # Document GeoPlatform datasets if searched
    geoplatform_datasets = sources['geoplatform_gov']['datasets']
    ngda = notebook_vars.get('ngda_datasets')
    if ngda:
        for feature in (ngda.get('features') or [])[:10]:  # Limit to 10
            props = feature.get('properties') or {}
            geoplatform_datasets.append({
                'id': feature.get('id'),
                'title': props.get('title'),
                # Truncate long descriptions; a null description becomes ''.
                'description': (props.get('description') or '')[:200],
                'ngda_theme': props.get('ngdaTheme'),
                'modified': props.get('updated'),
                'bbox': feature.get('bbox'),
                # Keep only the feature's own ("self") link.
                'links': [
                    link.get('href')
                    for link in (feature.get('links') or [])
                    if link.get('rel') == 'self'
                ]
            })

    stac = notebook_vars.get('stac_collections')
    if stac:
        for collection in (stac.get('collections') or [])[:10]:  # Limit to 10
            geoplatform_datasets.append({
                'id': collection.get('id'),
                'title': collection.get('title'),
                'description': (collection.get('description') or '')[:200],
                'type': 'stac_collection',
                'extent': collection.get('extent'),
                'links': [link.get('href') for link in (collection.get('links') or [])]
            })

    return resources

# Collate resources
data_resources = collate_data_resources()

# Save resources documentation
# Create the research directory if needed; an existing one is left untouched.
RESOURCES_FILE.parent.mkdir(parents=True, exist_ok=True)
with open(RESOURCES_FILE, 'w') as f:
    # default=str stringifies any value json cannot serialize natively.
    json.dump(data_resources, f, indent=2, default=str)

print(f"✓ Resources collated and saved to {RESOURCES_FILE}")
# Per-source counts taken from the collated dictionary.
print(f"\nResource Summary:")
print(f"  Local files: {len(data_resources['data_sources']['local_files']['files'])}")
print(f"  Data.gov datasets: {len(data_resources['data_sources']['data_gov']['datasets'])}")
print(f"  NWS data collections: {len(data_resources['data_sources']['national_weather_service']['data_collected'])}")
print(f"  GeoPlatform datasets: {len(data_resources['data_sources']['geoplatform_gov']['datasets'])}")
print(f"  Total extracted sources: {data_resources['extraction_summary']['total_sources']}")

## Section 2.5: Source Metadata Tracking

In [None]:
def create_source_metadata() -> Dict:
    """
    Build the reference catalogue describing where this pipeline's data comes from.

    The catalogue has a static part (one entry per upstream provider: Data.gov,
    the National Weather Service and GeoPlatform.gov) and a dynamic part: one
    ``extraction_history`` entry for every item currently held in the
    module-level ``extracted_data`` mapping.

    Returns:
        Dictionary with the keys ``database``, ``metadata_version``,
        ``created_timestamp``, ``last_updated``, ``sources``,
        ``extraction_history`` and ``data_lineage``.
    """
    # --- Provider 1: Data.gov (CKAN metadata API + api.data.gov gateway) ---
    data_gov_source = {
        'name': 'Data.gov',
        'type': 'federal_open_data_portal',
        'url': 'https://data.gov',
        'api_endpoints': {
            'ckan_api': {
                'base_url': DATA_GOV_CKAN_API_BASE,
                'description': 'CKAN API for dataset metadata search',
                'authentication': 'none_required',
                'rate_limit': 'none_specified',
                'documentation': 'https://data.gov/developers/apis/',
                'endpoints': {
                    'package_search': '/package_search',
                    'package_show': '/package_show'
                }
            },
            'api_data_gov': {
                'base_url': 'https://api.data.gov',
                'description': 'API management service for federal agency datasets',
                'authentication': 'api_key_required',
                'api_key_signup': 'https://api.data.gov/signup/',
                'rate_limit': '1000_requests_per_hour',
                'documentation': 'https://api.data.gov/docs/developer-manual/',
                'demo_key': 'DEMO_KEY (limited: 30 req/hour, 50/day)'
            }
        },
        'data_formats': ['CSV', 'JSON', 'XML', 'RDF', 'API'],
        'datasets_accessed': []
    }

    # --- Provider 2: National Weather Service REST API ---
    nws_source = {
        'name': 'National Weather Service',
        'type': 'weather_api',
        'url': 'https://www.weather.gov',
        'api_endpoints': {
            'base_url': NWS_API_BASE,
            'description': 'Public REST API for NWS weather data',
            'authentication': 'none_required',
            'user_agent_required': True,
            'rate_limit': 'none_official_be_respectful',
            'documentation': 'https://weather-gov.github.io/api/',
            'openapi_spec': 'https://api.weather.gov/openapi.json',
            'endpoints': {
                'points': {
                    'path': '/points/{lat},{lon}',
                    'description': 'Get forecast office and gridpoint info',
                    'coordinate_system': 'WGS84 (EPSG:4326)',
                    'precision': '4_decimal_places_maximum'
                },
                'forecast': {
                    'path': '/gridpoints/{gridId}/{gridX},{gridY}/forecast',
                    'description': 'Raw numerical forecast data on 2.5km grid'
                },
                'alerts': {
                    'path': '/alerts/active',
                    'description': 'Active weather alerts',
                    'parameters': ['area', 'severity', 'urgency']
                }
            }
        },
        'data_formats': ['GeoJSON (RFC 7946)', 'CAP XML'],
        'datasets_accessed': []
    }

    # --- Provider 3: GeoPlatform.gov (REST API + STAC catalogue) ---
    geoplatform_source = {
        'name': 'GeoPlatform.gov',
        'type': 'geospatial_data_platform',
        'url': GEOPLATFORM_WEB_BASE,
        'description': 'Federal geospatial data platform following FAIR principles',
        'authority': 'Geospatial Data Act of 2018',
        'api_endpoints': {
            'geoapi': {
                'base_url': GEOPLATFORM_API_BASE,
                'description': 'GeoPlatform REST API',
                'authentication': 'none_required',
                'documentation': 'https://geoapi.geoplatform.gov',
                # NOTE(review): 'description' sits inside 'endpoints' here, unlike
                # the other providers whose 'endpoints' map names to paths only.
                # Kept as-is so the emitted JSON is unchanged.
                'endpoints': {
                    'items_search': '/items/search',
                    'description': 'Search for geospatial datasets'
                }
            },
            'stac': {
                'base_url': GEOPLATFORM_STAC_BASE,
                'description': 'SpatioTemporal Asset Catalog (STAC) interface',
                'authentication': 'none_required',
                'documentation': 'https://stac.geoplatform.gov',
                'endpoints': {
                    'collections': '/collections',
                    'collection': '/collections/{collectionId}',
                    'search': '/search'
                }
            }
        },
        'data_formats': ['GeoJSON', 'GeoPackage', 'Shapefile', 'WMS', 'WFS', 'Map Vector', 'XYZ Raster'],
        'catalogs': {
            'artifact_catalog': 'Automatically converted data files (GeoPackage, Shapefiles, GeoJSON)',
            'tile_service_catalog': 'Map tiles in Map Vector and XYZ Raster formats',
            'map_services_catalog': 'WMS/WFS services with download options'
        },
        'ngda_themes': 18,
        'ngda_description': 'National Geospatial Data Assets across 18 data themes',
        'datasets_accessed': []
    }

    # Assemble the top-level document; history and lineage start out empty.
    source_metadata = {
        'database': DB_NAME,
        'metadata_version': '1.0',
        'created_timestamp': datetime.now().isoformat(),
        'last_updated': datetime.now().isoformat(),
        'sources': {
            'data_gov': data_gov_source,
            'national_weather_service': nws_source,
            'geoplatform_gov': geoplatform_source
        },
        'extraction_history': [],
        'data_lineage': {}
    }

    # Nothing extracted in this session: the static catalogue is the whole answer.
    if not extracted_data:
        return source_metadata

    history = source_metadata['extraction_history']
    for label, payload in extracted_data.items():
        stamp = datetime.now().isoformat()

        # Attribute the item to a provider by keyword match on its name; the
        # first matching rule wins, and 'unknown' is the fallback.
        lowered = label.lower()
        if 'nws' in lowered:
            system = 'national_weather_service'
        elif 'geoplatform' in lowered or 'ngda' in lowered:
            system = 'geoplatform_gov'
        elif any(keyword in lowered for keyword in ['data', 'gov', 'federal']):
            system = 'data_gov'
        else:
            system = 'unknown'

        history.append({
            'source_name': label,
            'extraction_timestamp': stamp,
            'data_type': type(payload).__name__,
            # Row counts are only reported for DataFrames.
            'row_count': len(payload) if isinstance(payload, pd.DataFrame) else 'N/A',
            'source_system': system
        })

    return source_metadata

# Build the source-metadata document for this session.
source_metadata = create_source_metadata()

# Write it out as JSON, creating the target directory if it does not exist yet.
SOURCE_METADATA_FILE.parent.mkdir(parents=True, exist_ok=True)
with open(SOURCE_METADATA_FILE, 'w') as f:
    json.dump(source_metadata, f, indent=2, default=str)

print(f"✓ Source metadata saved to {SOURCE_METADATA_FILE}")

# Headline counts, followed by one line per documented provider.
print(
    "\nSource Metadata Summary:",
    f"  Total sources documented: {len(source_metadata['sources'])}",
    f"  Extraction history entries: {len(source_metadata['extraction_history'])}",
    "\nDocumented Sources:",
    sep="\n",
)
for source_info in source_metadata['sources'].values():
    print(f"  - {source_info['name']}: {source_info['type']}")

## Section 3: Transform - Data Cleaning and Transformation

In [None]:
def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a de-duplicated version of *df* with missing values imputed.

    Exact duplicate rows are dropped, gaps in numeric columns are filled with
    the column median, and gaps in object (text) columns are filled with the
    column's most frequent value ('' when the column has no mode). A ``None``
    or empty input is handed back untouched.
    """
    if df is None or df.empty:
        return df

    # Drop exact duplicate rows and remember how many went away.
    rows_before = len(df)
    df = df.drop_duplicates()
    duplicates_removed = rows_before - len(df)

    missing_before = df.isnull().sum().sum()

    # Numeric gaps -> column median.
    for col in df.select_dtypes(include=[np.number]).columns:
        median_value = df[col].median()
        df[col] = df[col].fillna(median_value)

    # Text gaps -> most frequent value, or '' when mode() comes back empty.
    for col in df.select_dtypes(include=['object']).columns:
        modes = df[col].mode()
        fill_value = modes[0] if len(modes) > 0 else ''
        df[col] = df[col].fillna(fill_value)

    missing_after = df.isnull().sum().sum()

    logger.info(f"Cleaned data: removed {duplicates_removed} duplicates, filled {missing_before - missing_after} missing values")
    return df

# Run every extracted DataFrame through clean_dataframe(); anything that is
# not a DataFrame is carried over unchanged under the same key.
cleaned_data = {
    name: clean_dataframe(data) if isinstance(data, pd.DataFrame) else data
    for name, data in extracted_data.items()
}

print(f"✓ Cleaned {len(cleaned_data)} data sources")

In [None]:
def validate_dataframe(df: pd.DataFrame, required_columns: Optional[List[str]] = None) -> Dict:
    """
    Validate DataFrame structure and summarise basic data-quality counters.

    Args:
        df: Frame to inspect; ``None`` and empty frames are reported as invalid.
        required_columns: Column names that must be present. ``None`` or an
            empty list skips the check.

    Returns:
        Dictionary with the keys ``valid``, ``row_count``, ``column_count``,
        ``missing_values`` (per-column null counts), ``duplicate_rows`` and
        ``errors``. Every key is present for every input, including ``None``
        and empty frames, so callers can read the counters unconditionally.
        All counters are plain Python ints so the result serialises to JSON
        numbers.
    """
    if df is None or df.empty:
        # Same shape as the normal result so consumers that read
        # 'row_count' / 'column_count' do not hit a KeyError.
        return {
            'valid': False,
            'row_count': 0 if df is None else len(df),
            'column_count': 0 if df is None else len(df.columns),
            'missing_values': {},
            'duplicate_rows': 0,
            'errors': ['DataFrame is empty or None']
        }

    validation_results = {
        'valid': True,
        'row_count': len(df),
        'column_count': len(df.columns),
        # int(): pandas hands back numpy integers, which json cannot encode
        # natively (json.dump(..., default=str) would write them as strings).
        'missing_values': {col: int(count) for col, count in df.isnull().sum().items()},
        'duplicate_rows': int(df.duplicated().sum()),
        'errors': []
    }

    # Check required columns
    if required_columns:
        missing_cols = [col for col in required_columns if col not in df.columns]
        if missing_cols:
            validation_results['valid'] = False
            validation_results['errors'].append(f"Missing required columns: {missing_cols}")

    return validation_results

# Validate every cleaned DataFrame; non-DataFrame entries are skipped.
validation_results = {}
for name, data in cleaned_data.items():
    if isinstance(data, pd.DataFrame):
        validation_results[name] = validate_dataframe(data)

# Display validation results
for name, results in validation_results.items():
    status = "✓" if results['valid'] else "✗"
    # .get(): the result for an empty DataFrame may carry only 'valid' and
    # 'errors', so indexing the counters directly would raise KeyError.
    row_count = results.get('row_count', 0)
    column_count = results.get('column_count', 0)
    print(f"{status} {name}: {row_count} rows, {column_count} columns")
    for error in results.get('errors', []):
        print(f"  Error: {error}")

## Section 4: Load - Data Loading to Target Database

In [None]:
def load_to_postgresql(df: pd.DataFrame, table_name: str, connection_string: str, if_exists: str = 'replace') -> bool:
    """
    Load DataFrame to PostgreSQL table.

    Args:
        df: Frame to write.
        table_name: Name of the target table.
        connection_string: SQLAlchemy URL of the target database; ``None``
            means the target is not configured.
        if_exists: Passed through to ``DataFrame.to_sql`` (default 'replace').

    Returns:
        True when the write succeeded. False when SQLAlchemy is unavailable,
        no connection string was given, or the write raised; the error is
        logged rather than propagated.
    """
    if not SQLALCHEMY_AVAILABLE or connection_string is None:
        logger.warning("PostgreSQL connection not available")
        return False

    engine = None
    try:
        engine = create_engine(connection_string)
        df.to_sql(table_name, engine, if_exists=if_exists, index=False)
        logger.info(f"Loaded {len(df)} rows to PostgreSQL table {table_name}")
        return True
    except Exception as e:
        # Best-effort load: report the failure to the caller via the return value.
        logger.error(f"Error loading to PostgreSQL: {e}")
        return False
    finally:
        # A fresh engine is created on every call; release its connection
        # pool so repeated loads do not accumulate open connections.
        if engine is not None:
            engine.dispose()

def load_to_databricks(df: pd.DataFrame, table_name: str, connection_string: str, if_exists: str = 'replace') -> bool:
    """
    Load DataFrame to Databricks table.

    Args:
        df: Frame to write.
        table_name: Name of the target table.
        connection_string: SQLAlchemy URL of the target; ``None`` means the
            target is not configured.
        if_exists: Passed through to ``DataFrame.to_sql``. Defaults to
            'replace', which is what this function always used before the
            parameter existed; mirrors ``load_to_postgresql``.

    Returns:
        True when the write succeeded. False when SQLAlchemy is unavailable,
        no connection string was given, or the write raised; the error is
        logged rather than propagated.
    """
    if not SQLALCHEMY_AVAILABLE or connection_string is None:
        logger.warning("Databricks connection not available")
        return False

    engine = None
    try:
        engine = create_engine(connection_string)
        df.to_sql(table_name, engine, if_exists=if_exists, index=False)
        logger.info(f"Loaded {len(df)} rows to Databricks table {table_name}")
        return True
    except Exception as e:
        # Best-effort load: report the failure to the caller via the return value.
        logger.error(f"Error loading to Databricks: {e}")
        return False
    finally:
        # A fresh engine is created on every call; release its connection
        # pool so repeated loads do not accumulate open connections.
        if engine is not None:
            engine.dispose()

# Load data to target databases
load_results = {}

# The Databricks target is configured through DATABRICKS_CONNECTION_STRING.
# This cell previously read only SNOWFLAKE_CONNECTION_STRING, so the Databricks
# setting was never honoured; the legacy name is kept as a fallback so existing
# configurations keep working.
databricks_connection_string = DATABRICKS_CONNECTION_STRING or SNOWFLAKE_CONNECTION_STRING

for name, data in cleaned_data.items():
    # Only non-empty DataFrames are loaded; everything else is skipped.
    if isinstance(data, pd.DataFrame) and not data.empty:
        table_name = name.lower().replace(' ', '_')

        # PostgreSQL
        if POSTGRES_CONNECTION_STRING:
            load_results[f"{name}_postgres"] = load_to_postgresql(
                data, table_name, POSTGRES_CONNECTION_STRING
            )

        # Databricks
        if databricks_connection_string:
            load_results[f"{name}_databricks"] = load_to_databricks(
                data, table_name, databricks_connection_string
            )

# Each entry is True/False, so the sum is the number of successful loads.
print(f"✓ Loaded {sum(load_results.values())} datasets to target databases")

## Section 5: Validate - Data Quality Checks

In [None]:
def generate_data_quality_report(df: pd.DataFrame, table_name: str) -> Dict:
    """
    Generate comprehensive data quality report.

    Args:
        df: Frame to profile; ``None`` and empty frames yield a zeroed report.
        table_name: Label stored under the ``table`` key.

    Returns:
        Dictionary with ``table``, ``row_count``, ``column_count``,
        ``missing_values`` (total null cells), ``missing_percentage``,
        ``duplicate_rows``, ``data_types``, ``numeric_stats`` (``describe()``
        output for numeric columns) and ``timestamp``. Reports for ``None`` or
        empty frames additionally carry ``status: 'empty'`` and zeroed
        counters, so callers can read the same keys for every report.
    """
    if df is None or df.empty:
        # Keep the marker keys ('table', 'status') and add the counters that
        # report consumers read, so an empty dataset does not raise KeyError.
        return {
            'table': table_name,
            'status': 'empty',
            'row_count': 0 if df is None else len(df),
            'column_count': 0 if df is None else len(df.columns),
            'missing_values': 0,
            'missing_percentage': 0.0,
            'duplicate_rows': 0,
            'data_types': {},
            'numeric_stats': {},
            'timestamp': datetime.now().isoformat()
        }

    # Count null cells once and reuse the figure for the percentage.
    total_missing = int(df.isnull().sum().sum())
    # Non-zero here: an empty frame (no rows or no columns) returned above.
    total_cells = len(df) * len(df.columns)

    report = {
        'table': table_name,
        'row_count': len(df),
        'column_count': len(df.columns),
        'missing_values': total_missing,
        'missing_percentage': float((total_missing / total_cells) * 100),
        'duplicate_rows': int(df.duplicated().sum()),
        'data_types': df.dtypes.astype(str).to_dict(),
        'numeric_stats': {},
        'timestamp': datetime.now().isoformat()
    }

    # Add statistics for numeric columns
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        report['numeric_stats'] = df[numeric_cols].describe().to_dict()

    return report

# Generate quality reports
quality_reports = {}
for name, data in cleaned_data.items():
    if isinstance(data, pd.DataFrame):
        quality_reports[name] = generate_data_quality_report(data, name)

# Display quality reports
for name, report in quality_reports.items():
    print(f"\n=== {name} ===")
    # .get(): the report for an empty DataFrame may carry only 'table' and
    # 'status', so indexing the counters directly would raise KeyError.
    print(f"Rows: {report.get('row_count', 0)}")
    print(f"Columns: {report.get('column_count', 0)}")
    print(f"Missing values: {report.get('missing_values', 0)} ({report.get('missing_percentage', 0.0):.2f}%)")
    print(f"Duplicate rows: {report.get('duplicate_rows', 0)}")

## Section 6: Monitor - Pipeline Monitoring and Logging

In [None]:
# Snapshot of this run: what was extracted, cleaned, validated and loaded.
pipeline_metadata = dict(
    database=DB_NAME,
    execution_timestamp=datetime.now().isoformat(),
    data_sources=list(extracted_data.keys()),
    extracted_count=len(extracted_data),
    cleaned_count=len(cleaned_data),
    validation_results=validation_results,
    load_results=load_results,
    quality_reports=quality_reports,
    status='completed',
)

# Persist the snapshot as JSON under <DB_PATH>/metadata/.
metadata_file = DB_PATH / "metadata" / "pipeline_metadata.json"
metadata_file.parent.mkdir(parents=True, exist_ok=True)

# default=str stringifies any value the json module cannot encode natively.
with open(metadata_file, 'w') as f:
    json.dump(pipeline_metadata, f, indent=2, default=str)

print(f"✓ Pipeline metadata saved to {metadata_file}")

# Console summary: a banner followed by one "label: value" line per figure.
_banner = "=" * 80
print("\n" + _banner)
print("PIPELINE EXECUTION SUMMARY")
print(_banner)
for _label, _value in (
    ("Database", pipeline_metadata['database']),
    ("Execution time", pipeline_metadata['execution_timestamp']),
    ("Data sources extracted", pipeline_metadata['extracted_count']),
    ("Datasets cleaned", pipeline_metadata['cleaned_count']),
    # load_results values are True/False, so the sum counts successes.
    ("Successful loads", sum(pipeline_metadata['load_results'].values())),
    ("Status", pipeline_metadata['status']),
):
    print(f"{_label}: {_value}")