# ETL/ELT Pipeline - db-16

Flood Risk Assessment Database ETL pipeline for FEMA, NOAA, USGS, and NASA data sources.

## Setup and Configuration

In [None]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import json
import logging
import requests

# Pipeline identity: numeric id and the matching database name.
DB_NUM, DB_NAME = 16, 'db16'

# Filesystem layout. Path('.').parent evaluates to Path('.'), so BASE_DIR is
# the (relative) current working directory and DATA_DIR is './data'.
BASE_DIR = Path('.').parent
DATA_DIR = BASE_DIR.joinpath('data')

# Root logging at INFO; every stage of the pipeline logs through `logger`.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('etl_pipeline')

# Startup banner so the cell output shows which pipeline/config is active.
print(f'ETL Pipeline for db-{DB_NUM}')
print(f'Base directory: {BASE_DIR}')

## Extract Phase

### FEMA National Flood Hazard Layer

In [None]:
# FEMA Flood Map API
# Documentation: https://www.fema.gov/flood-maps/tools-resources/flood-map-products/national-flood-hazard-layer
FEMA_API_BASE = 'https://hazards.fema.gov/gis/nfhl/rest/services'

def extract_fema_flood_zones(state_fips, max_records=1000):
    """Query the FEMA NFHL map service for features whose DFIRM_ID starts with a prefix.

    Args:
        state_fips: Prefix matched against ``DFIRM_ID`` (used as ``LIKE '<prefix>%'``).
        max_records: Value sent as ``resultRecordCount`` to cap the response size.

    Returns:
        The list under the response's ``features`` key, or ``[]`` when the
        request fails, the status is not 200, the service reports an error,
        or the key is absent. Failures are logged, never raised.
    """
    url = f'{FEMA_API_BASE}/public/NFHL/MapServer/28/query'
    # Double any single quote so the value cannot terminate the SQL-style
    # string literal inside the `where` clause.
    fips_literal = str(state_fips).replace("'", "''")
    params = {
        'where': f"DFIRM_ID LIKE '{fips_literal}%'",
        'outFields': '*',
        'f': 'json',
        'resultRecordCount': max_records
    }
    try:
        response = requests.get(url, params=params, timeout=30)
        if response.status_code != 200:
            # Previously this path returned [] silently; make it visible.
            logger.warning(f'FEMA: HTTP {response.status_code} for {state_fips}')
            return []
        data = response.json()
        # ArcGIS REST services can report a failed query as HTTP 200 with an
        # `error` object in the body; don't log that as "0 features".
        if 'error' in data:
            logger.error(f"FEMA: service error for {state_fips}: {data['error']}")
            return []
        features = data.get('features', [])
        logger.info(f'FEMA: Extracted {len(features)} features for {state_fips}')
        return features
    except Exception as e:
        # Best-effort extraction: network/JSON problems are logged, not raised.
        logger.error(f'FEMA extraction error: {e}')
    return []

print('FEMA extraction function defined')

### NOAA Sea Level Rise

In [None]:
# NOAA CO-OPS API
# Documentation: https://api.tidesandcurrents.noaa.gov/api/prod/
NOAA_API_BASE = 'https://api.tidesandcurrents.noaa.gov/dpapi/prod/webapi'

def extract_noaa_sea_level_rise(station_id):
    """Fetch the NOAA ``sltrends`` (sea level trends) product for a station.

    Args:
        station_id: Station identifier sent as the ``station`` query parameter.

    Returns:
        The decoded JSON body on HTTP 200, otherwise ``None``. Failures
        (non-200 status, network or JSON errors) are logged, never raised.
    """
    url = f'{NOAA_API_BASE}/product/sltrends.json'
    params = {'station': station_id}
    try:
        response = requests.get(url, params=params, timeout=30)
        if response.status_code != 200:
            # Previously this path returned None silently; make it visible.
            logger.warning(f'NOAA: HTTP {response.status_code} for station {station_id}')
            return None
        data = response.json()
        logger.info(f'NOAA: Extracted SLR data for station {station_id}')
        return data
    except Exception as e:
        # Best-effort extraction: network/JSON problems are logged, not raised.
        logger.error(f'NOAA extraction error: {e}')
    return None

print('NOAA extraction function defined')

### USGS Streamflow Data

In [None]:
# USGS Water Services API
# Documentation: https://waterservices.usgs.gov/
USGS_API_BASE = 'https://waterservices.usgs.gov/nwis'

def extract_usgs_streamflow(state_code, period='P30D'):
    """Fetch USGS instantaneous-values time series for stream sites in a state.

    Args:
        state_code: Value sent as the ``stateCd`` query parameter.
        period: Look-back window sent as ``period`` (default ``'P30D'``).

    Returns:
        The list at ``value.timeSeries`` in the JSON response, or ``[]`` when
        the request fails, the status is not 200, or the key is absent.
        Failures are logged, never raised.
    """
    url = f'{USGS_API_BASE}/iv/'
    params = {
        'format': 'json',
        'stateCd': state_code,
        'parameterCd': '00060,00065',  # Discharge and gage height
        'period': period,
        'siteType': 'ST',  # Stream sites
    }
    try:
        response = requests.get(url, params=params, timeout=60)
        if response.status_code != 200:
            # Previously this path returned [] silently; make it visible.
            logger.warning(f'USGS: HTTP {response.status_code} for {state_code}')
            return []
        data = response.json()
        ts = data.get('value', {}).get('timeSeries', [])
        logger.info(f'USGS: Extracted {len(ts)} time series for {state_code}')
        return ts
    except Exception as e:
        # Best-effort extraction: network/JSON problems are logged, not raised.
        logger.error(f'USGS extraction error: {e}')
    return []

print('USGS extraction function defined')

### NASA Flood Models

In [None]:
# NASA Global Flood Monitoring System
# Documentation: https://flood.umd.edu/
NASA_GFMS_BASE = 'https://flood.umd.edu'

def extract_nasa_flood_data(date_str):
    """Placeholder for NASA GFMS extraction.

    Only logs the requested date; no download is performed and
    ``NASA_GFMS_BASE`` is not used yet.

    Args:
        date_str: Date label included in the log message.

    Returns:
        Always ``None``.
    """
    # GFMS publishes binary rasters (.bin / .tif); a real implementation
    # would download those files here.
    message = f'NASA: Extracting flood data for {date_str}'
    logger.info(message)
    return None

print('NASA extraction function defined')

## Transform Phase

In [None]:
# Transform functions for each data source

def transform_fema_data(features):
    """Flatten FEMA feature dicts into a DataFrame with a fixed column set.

    Args:
        features: Iterable of dicts, each optionally holding an
            ``attributes`` dict (as returned by ``extract_fema_flood_zones``).
            ``None`` is treated as empty.

    Returns:
        A DataFrame with columns ``zone_code``, ``zone_description``,
        ``base_flood_elevation`` and ``state_code`` (first two characters of
        ``DFIRM_ID``). The columns are present even when there are no rows.
    """
    columns = ['zone_code', 'zone_description', 'base_flood_elevation', 'state_code']
    records = []
    for feature in features or []:
        # `attributes` may be missing or explicitly null in the payload.
        attrs = feature.get('attributes') or {}
        records.append({
            'zone_code': attrs.get('FLD_ZONE', ''),
            'zone_description': attrs.get('ZONE_SUBTY', ''),
            'base_flood_elevation': attrs.get('STATIC_BFE'),
            # A null DFIRM_ID would make `None[:2]` raise; fall back to ''.
            'state_code': (attrs.get('DFIRM_ID') or '')[:2],
        })
    # Passing `columns` keeps the schema stable for empty input.
    return pd.DataFrame(records, columns=columns)

print('Transform functions defined')

## Load Phase

In [None]:
# Load functions (requires psycopg2 and database connection)

def load_to_postgresql(df, table_name, conn_params):
    """Bulk-insert a DataFrame into a PostgreSQL table, skipping conflicts.

    Args:
        df: DataFrame whose column names are used as the target column list.
        table_name: Target table. Interpolated into the SQL text unquoted,
            as are the column names, so pass only trusted identifiers.
        conn_params: Keyword arguments forwarded to ``psycopg2.connect``.

    Returns:
        None. Errors (including a missing ``psycopg2``) are logged, not
        raised. The connection is always closed; closing without a commit
        discards the pending transaction.
    """
    conn = None
    try:
        import psycopg2
        from psycopg2.extras import execute_values
        # An empty frame would render `INSERT INTO t () VALUES ...`, which is
        # invalid SQL; there is nothing to load, so skip the round trip.
        if df.empty:
            logger.info(f'No rows to load into {table_name}; skipping')
            return
        cols = ', '.join(df.columns)
        # itertuples keeps each column's dtype; iterrows builds one Series per
        # row and upcasts mixed int/float rows to float.
        vals = list(df.itertuples(index=False, name=None))
        sql = f'INSERT INTO {table_name} ({cols}) VALUES %s ON CONFLICT DO NOTHING'
        conn = psycopg2.connect(**conn_params)
        with conn.cursor() as cur:
            execute_values(cur, sql, vals)
        conn.commit()
        # Counts rows submitted; ON CONFLICT DO NOTHING may skip some of them.
        logger.info(f'Loaded {len(vals)} rows into {table_name}')
    except Exception as e:
        logger.error(f'Load error: {e}')
    finally:
        # Release the connection on every path, including failures.
        if conn is not None:
            conn.close()

print('Load functions defined')

## Pipeline Execution

In [None]:
# Run the full ETL pipeline
# This cell does not execute any stage; it prints a short usage guide.
print(
    'ETL Pipeline ready',
    'To run extraction, call the extract_* functions with appropriate parameters',
    'To transform, call transform_* functions',
    'To load, call load_to_postgresql with connection parameters',
    sep='\n',
)