# Fetch Satellite Images from NAIP API

This notebook downloads 4-channel NAIP satellite imagery (RGB + NIR) for each house location using the USGS NAIP ImageServer API. Images are saved as TIFF files in `naip_images/train_224/` and `naip_images/test_224/` directories. The script uses parallel processing to efficiently download images for all house IDs in the training and test datasets.


In [None]:
import pandas as pd
import requests
import os
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from pyproj import Transformer
from tqdm import tqdm
import rasterio 

# ================= CONFIGURATION =================
# Input files: tabular datasets that supply the 'id', 'lat' and 'long' columns
# read by process_dataset.
TRAIN_CSV_PATH = 'train_tabular.csv'        
TEST_CSV_PATH = 'test_tabular.csv'                

# Output directories: one TIFF per house id is written under each of these.
BASE_OUTPUT_DIR = "naip_images"
TRAIN_IMG_DIR = os.path.join(BASE_OUTPUT_DIR, "train_224")
TEST_IMG_DIR = os.path.join(BASE_OUTPUT_DIR, "test_224")

# ================= IMAGE PARAMETERS =================
# 1. Image dimensions: requested width and height, in pixels.
IMAGE_SIZE = 224          

# 2. Ground window: side length of the square bounding box requested around
#    each point, expressed in units of the projected CRS (EPSG:3857).
#    Each pixel therefore spans VIEW_WIDTH_METERS / IMAGE_SIZE ~= 0.85 units.
#    NOTE(review): earlier comments described this as "zoom 19, approx
#    0.3 m/pixel"; that figure does not match 190 / 224 -- confirm which
#    scale is actually intended.
VIEW_WIDTH_METERS = 190   # Context window intended to capture the house,
                          # its neighbours and the street.

# 3. Pixel type requested from the image service.
PIXEL_TYPE = "U8"         # Unsigned 8-bit per band.

# 4. Lat/long usage: the coordinates from the CSV are used only to compute
#    the centre point of the bounding box in projected units.
# =========================================================

# API parameters: export endpoint and the number of concurrent download threads.
NAIP_URL = "https://imagery.nationalmap.gov/arcgis/rest/services/USGSNAIPImagery/ImageServer/exportImage"
MAX_WORKERS = 20   
# =================================================

def setup_environment():
    """Create the train and test image output directories if they are absent.

    Raises:
        OSError: If a directory cannot be created (for example when the path
            already exists as a regular file).
    """
    for directory in (TRAIN_IMG_DIR, TEST_IMG_DIR):
        # exist_ok=True replaces the separate exists() check, which was
        # redundant and could race with another process creating the folder.
        os.makedirs(directory, exist_ok=True)

# Coordinate transformer: WGS84 geographic (EPSG:4326) -> Web Mercator (EPSG:3857).
# always_xy=True fixes the argument order to (x, y), i.e. (longitude, latitude),
# which is the order get_bbox_meters passes its values in.
transformer = Transformer.from_crs("epsg:4326", "epsg:3857", always_xy=True)

def get_bbox_meters(lat, lon, width_meters):
    """Build the bounding-box string for a square window centred on a point.

    The point is projected with the module-level ``transformer`` and a square
    of side ``width_meters`` (in the projected CRS's units) is laid around it.

    Args:
        lat: Latitude of the centre point.
        lon: Longitude of the centre point.
        width_meters: Side length of the square, in projected units.

    Returns:
        The string ``"xmin,ymin,xmax,ymax"`` in projected coordinates.

    NOTE(review): EPSG:3857 units are not true ground metres away from the
    equator, so the ground area covered varies with latitude -- confirm this
    is acceptable for the dataset.
    """
    center_x, center_y = transformer.transform(lon, lat)
    radius = width_meters / 2

    west, south = center_x - radius, center_y - radius
    east, north = center_x + radius, center_y + radius

    return f"{west},{south},{east},{north}"

def download_image(row, save_folder):
    """Download one 4-band (RGB + NIR) NAIP tile and save it as a TIFF.

    Args:
        row: Mapping with the keys 'id', 'lat' and 'long'.
        save_folder: Directory that receives ``<id>.tif``.

    Returns:
        A status string: "Success", "Skipped" (the target file already
        exists), or a message starting with "Error" or "Exception" that
        describes why nothing was saved. Failures are reported through the
        return value rather than raised, so one bad row cannot abort a batch.
    """
    # IDs loaded by pandas may arrive as floats (e.g. 123.0); normalise them
    # to a clean integer string and fall back to the raw value otherwise.
    try:
        house_id = str(int(row['id']))
    except (ValueError, TypeError, OverflowError):
        house_id = str(row['id'])

    lat = row['lat']
    lon = row['long']

    filename = os.path.join(save_folder, f"{house_id}.tif")

    if os.path.exists(filename):
        return "Skipped"

    # A missing coordinate would only produce a meaningless bbox and a wasted
    # request, so report it without contacting the server.
    if pd.isna(lat) or pd.isna(lon):
        return "Error: Missing coordinates"

    # Download into a sidecar file and rename once complete: an interrupted
    # write must never leave a truncated .tif that later runs would skip.
    tmp_path = filename + ".part"

    try:
        bbox = get_bbox_meters(lat, lon, VIEW_WIDTH_METERS)

        params = {
            "bbox": bbox,
            "bboxSR": "3857",
            "size": f"{IMAGE_SIZE},{IMAGE_SIZE}",
            "imageSR": "3857",
            "format": "tiff",
            "pixelType": PIXEL_TYPE,
            "bandIds": "0,1,2,3",  # <--- CRITICAL: RGB + NIR
            "f": "image",
        }

        response = requests.get(NAIP_URL, params=params, timeout=20)

        if response.status_code != 200:
            return f"Error: Status {response.status_code}"

        # The service can answer 200 with a non-image error payload; only
        # accept bodies that are declared as images.
        content_type = response.headers.get('Content-Type', '')
        if 'image' not in content_type and 'tiff' not in content_type:
            return "Error: JSON Response"

        if not response.content:
            return "Error: Empty response"

        with open(tmp_path, 'wb') as f:
            f.write(response.content)
        os.replace(tmp_path, filename)
        return "Success"

    except Exception as e:
        # Deliberately broad: this is the worker boundary and every failure
        # is reported to the caller as a status string.
        try:
            os.remove(tmp_path)
        except OSError:
            pass  # No partial file was left behind; nothing to clean up.
        return f"Exception: {str(e)}"

def process_dataset(csv_path, save_folder, dataset_name):
    """Download one image per unique house id listed in a CSV file.

    Args:
        csv_path: Path to a CSV containing 'id', 'lat' and 'long' columns
            (header names are matched after stripping and lower-casing).
        save_folder: Directory the images are written to.
        dataset_name: Label used in the progress and summary messages.

    Returns:
        None. Progress, a summary of outcome counts and a short sample of
        failures are printed. The function returns early, after printing a
        message, when the CSV is missing or lacks a required column.
    """
    if not os.path.exists(csv_path):
        print(f"Skipping {dataset_name}: File '{csv_path}' not found.")
        return

    print(f"\n--- Processing {dataset_name} Data ---")
    df = pd.read_csv(csv_path)

    # Normalise headers so 'ID', ' Lat ' and similar variants are accepted.
    df.columns = df.columns.str.strip().str.lower()

    # Validate every column that is read below, not just 'id'; otherwise a
    # missing 'lat' or 'long' surfaces later as an opaque KeyError.
    required = ['id', 'lat', 'long']
    missing = [col for col in required if col not in df.columns]
    if missing:
        print(f"CRITICAL ERROR: required column(s) {missing} not found in {csv_path}")
        return

    # Each id maps to a single output file, so repeated ids are dropped.
    initial = len(df)
    df = df.drop_duplicates(subset=['id'])
    print(f"Cleaned {initial - len(df)} duplicates. Fetching {len(df)} images...")

    records = df[required].to_dict('records')
    results = {"Success": 0, "Skipped": 0, "Error": 0}
    failures = []

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = {executor.submit(download_image, row, save_folder): row['id'] for row in records}

        for future in tqdm(as_completed(futures), total=len(records), unit="img"):
            # One failing worker must not abort a long-running batch.
            try:
                res = future.result()
            except Exception as e:
                res = f"Exception: {e}"

            # Match on the status prefix: a substring test would misclassify
            # an error message that merely contains one of these words.
            if res.startswith("Success"):
                results["Success"] += 1
            elif res.startswith("Skipped"):
                results["Skipped"] += 1
            else:
                results["Error"] += 1
                failures.append((futures[future], res))

    print(f"{dataset_name} Done! Stats: {results}")

    # Show a few concrete failures so that errors can actually be diagnosed.
    if failures:
        shown = failures[:5]
        print(f"Showing {len(shown)} of {len(failures)} failure(s):")
        for failed_id, reason in shown:
            print(f" - {failed_id}: {reason}")

def verify_dataset(folder):
    """Spot-check one randomly chosen TIFF in a folder.

    Opens a single ``.tif`` file and reports whether it has 4 bands and is
    IMAGE_SIZE x IMAGE_SIZE pixels. This is a sample check, not a full scan.

    Args:
        folder: Directory containing the downloaded ``.tif`` files.

    Returns:
        None. All findings are printed.
    """
    import random

    print(f"\nVerifying folder: {folder}")
    # isdir (rather than exists) also guards listdir against a path that
    # turns out to be a regular file.
    if not os.path.isdir(folder):
        print("Folder not found.")
        return

    files = [f for f in os.listdir(folder) if f.endswith('.tif')]
    if not files:
        print("No images found.")
        return

    fname = random.choice(files)
    path = os.path.join(folder, fname)
    try:
        with rasterio.open(path) as src:
            print(f"Verification ID: {fname}")
            print(f" - Channels: {src.count} (Should be 4)")
            # Compare against the configured size so the check and its
            # message stay consistent with what was requested.
            print(f" - Size: {src.width}x{src.height} (Should be {IMAGE_SIZE}x{IMAGE_SIZE})")

            size_ok = src.width == IMAGE_SIZE and src.height == IMAGE_SIZE
            if src.count == 4 and size_ok:
                print(" - SUCCESS: 4-Channel NIR + High Res Data Confirmed.")
            else:
                print(" - WARNING: Dimension or Channel mismatch!")
    except Exception as e:
        # Deliberately broad: an unreadable file is reported, not raised.
        print(f"Verification failed: {e}")

def main():
    """Run the pipeline: create folders, download both datasets, spot-check.

    The train and test CSVs are taken from TRAIN_CSV_PATH and TEST_CSV_PATH.
    A dataset whose CSV is missing is skipped by process_dataset.
    """
    setup_environment()

    # Use the configured path directly. The previous fallback re-assigned the
    # same file name, so it had no effect and bypassed TRAIN_CSV_PATH.
    process_dataset(TRAIN_CSV_PATH, TRAIN_IMG_DIR, "TRAIN")
    process_dataset(TEST_CSV_PATH, TEST_IMG_DIR, "TEST")

    # Spot-check both output folders, not only the training one.
    verify_dataset(TRAIN_IMG_DIR)
    verify_dataset(TEST_IMG_DIR)


if __name__ == "__main__":
    main()


--- Processing TRAIN Data ---
Cleaned 99 duplicates. Fetching 16110 images...


100%|██████████| 16110/16110 [1:03:30<00:00,  4.23img/s]  


TRAIN Done! Stats: {'Success': 16050, 'Skipped': 0, 'Error': 60}

--- Processing TEST Data ---
Cleaned 8 duplicates. Fetching 5396 images...


100%|██████████| 5396/5396 [12:13<00:00,  7.36img/s]

TEST Done! Stats: {'Success': 5396, 'Skipped': 0, 'Error': 0}

Verifying folder: naip_images\train_640
Verification ID: 5152920070.tif
 - Channels: 4 (Should be 4)
 - Size: 224x224 (Should be 640x640)



