# California Wildfire Data Ingestion Pipeline

This notebook orchestrates the download of three data sources for 2020 California wildfire analysis:
1. **NASA FIRMS** - Fire detection data
2. **NOAA CDO** - Weather data (temperature, precipitation, wind)
3. **USGS 3DEP** - Digital Elevation Model (terrain data)

**Author:** Data Pipeline  
**Date:** 2024-11-24  
**Purpose:** Interactive data ingestion with validation and visualization

## Setup and Configuration

In [None]:
# Standard library imports
import os
import sys
from pathlib import Path
from datetime import date, timedelta
import time

# Data manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML, clear_output

# Progress bars
from tqdm.notebook import tqdm

# Import our download modules
from config import (
    NASA_FIRMS_API_KEY, NOAA_API_KEY, 
    FIRMS_DATA_DIR, NOAA_DATA_DIR, USGS_DATA_DIR,
    CA_BBOX_W, CA_BBOX_S, CA_BBOX_E, CA_BBOX_N
)

# Configure plotting
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
%matplotlib inline

print("‚úì Environment setup complete")
print(f"  - FIRMS data: {FIRMS_DATA_DIR}")
print(f"  - NOAA data: {NOAA_DATA_DIR}")
print(f"  - USGS data: {USGS_DATA_DIR}")

## Data Ingestion Status Dashboard

In [None]:
def check_data_status():
    """Summarise which ingestion outputs are already on disk.

    Globs the FIRMS, NOAA and USGS data directories, displays a status table
    (file count, total size in MB and a completeness marker per source) and
    hands back the file lists that were found.

    Returns:
        dict: keys ``'firms'``, ``'noaa'`` and ``'usgs'`` mapping to lists of
        ``Path`` objects. The ``'usgs'`` list only contains ``.tif`` files
        larger than 1 MB.
    """

    def _total_mb(paths):
        # Combined on-disk size of the given files, in MB.
        return sum(p.stat().st_size for p in paths) / (1024**2)

    firms_files = list(Path(FIRMS_DATA_DIR).glob('*.csv'))
    noaa_files = list(Path(NOAA_DATA_DIR).glob('*.csv'))

    # Only DEM tiles above 1 MB are counted; anything smaller is ignored.
    usgs_valid = []
    for tile in list(Path(USGS_DATA_DIR).glob('*.tif')):
        if tile.stat().st_size > 1024*1024:
            usgs_valid.append(tile)

    n_firms = len(firms_files)
    n_noaa = len(noaa_files)
    n_usgs = len(usgs_valid)

    # Completeness markers: 37 FIRMS chunks, 12 NOAA months, 12 DEM tiles.
    if n_firms >= 37:
        firms_state = '‚úÖ Complete'
    else:
        firms_state = f'‚ö†Ô∏è {n_firms}/37 files'

    if n_noaa >= 12:
        noaa_state = '‚úÖ Complete'
    else:
        noaa_state = f'‚è≥ {n_noaa}/12 months'

    if n_usgs >= 12:
        usgs_state = '‚úÖ Complete'
    else:
        usgs_state = f'‚ö†Ô∏è {n_usgs}/12 tiles'

    # One row per data source; sizes are pre-formatted to one decimal place.
    rows = [
        ('NASA FIRMS', n_firms, f'{_total_mb(firms_files):.1f}', firms_state),
        ('NOAA Weather', n_noaa, f'{_total_mb(noaa_files):.1f}', noaa_state),
        ('USGS DEM', n_usgs, f'{_total_mb(usgs_valid):.1f}', usgs_state),
    ]
    status = pd.DataFrame(
        rows, columns=['Data Source', 'Files', 'Size (MB)', 'Status']
    )

    display(HTML('<h3>üìä Data Ingestion Status</h3>'))
    display(status)

    return {'firms': firms_files, 'noaa': noaa_files, 'usgs': usgs_valid}

# Run status check
data_files = check_data_status()

---
## 1. NASA FIRMS - Fire Detection Data

Downloads fire detection data from MODIS satellites for California in 2020.  
**Expected**: 37 files (10-day chunks)

In [None]:
from download_nasa_firms import download_chunk, window_starts, SOURCE, WINDOW_DAYS

def download_firms_interactive():
    """Download FIRMS data for 2020 chunk by chunk, then count the rows on disk.

    Iterates over the dates produced by ``window_starts(start, end,
    WINDOW_DAYS)`` (presumably the first day of each window - confirm in
    ``download_nasa_firms``) and calls ``download_chunk`` for each one. A
    failing chunk is recorded and reported but does not stop the loop.

    After the loop, every ``*.csv`` file in ``FIRMS_DATA_DIR`` is read once to
    count rows, so files left over from earlier runs are included in the
    totals.

    Returns:
        list[Path]: all ``*.csv`` files currently in ``FIRMS_DATA_DIR``.
    """

    start = date(2020, 1, 1)
    end = date(2020, 12, 31)

    chunks = list(window_starts(start, end, WINDOW_DAYS))

    print(f"üì° Downloading NASA FIRMS data: {len(chunks)} chunks")
    print(f"   Source: {SOURCE}")
    print(f"   Window: {WINDOW_DAYS} days per chunk\n")

    failed = []

    for d in tqdm(chunks, desc="Downloading chunks"):
        # The last window is clipped so it never runs past `end`.
        remaining = (end - d).days + 1
        days = min(WINDOW_DAYS, max(1, remaining))

        try:
            download_chunk(d, days)
            time.sleep(0.25)  # Rate limiting
        except Exception as e:
            # Best effort: remember the failure and continue with the next chunk.
            failed.append((d, str(e)))
            tqdm.write(f"  ‚úó Failed: {d} - {e}")

    # Validation: each file is parsed exactly once to count its rows
    # (the sum over an empty directory is 0).
    files = list(Path(FIRMS_DATA_DIR).glob('*.csv'))
    total_rows = sum(len(pd.read_csv(f)) for f in files)

    print(f"\n‚úÖ Download complete!")
    print(f"   Files: {len(files)}")
    print(f"   Total fire detections: {total_rows:,}")
    if failed:
        print(f"   Failed chunks: {len(failed)}")

    return files

# Uncomment to run download
# firms_files = download_firms_interactive()

### Validate and Visualize FIRMS Data

In [None]:
def analyze_firms_data():
    """Load every FIRMS CSV, print a summary and plot daily detection counts.

    Files are loaded in sorted path order so the row order of the returned
    frame is reproducible across runs and machines (``Path.glob`` itself
    yields entries in filesystem order).

    Returns:
        pandas.DataFrame | None: all ``*.csv`` files in ``FIRMS_DATA_DIR``
        concatenated, with ``acq_date`` converted to datetime; ``None`` when
        the directory contains no CSV files.
    """

    files = sorted(Path(FIRMS_DATA_DIR).glob('*.csv'))

    if not files:
        print("‚ö†Ô∏è No FIRMS data found. Run download first.")
        return None

    print(f"üìÅ Loading {len(files)} FIRMS files...")

    # Load all data
    dfs = [pd.read_csv(f) for f in tqdm(files, desc="Loading files")]

    firms_df = pd.concat(dfs, ignore_index=True)
    firms_df['acq_date'] = pd.to_datetime(firms_df['acq_date'])

    print(f"\nüìä FIRMS Data Summary:")
    print(f"   Total detections: {len(firms_df):,}")
    print(f"   Date range: {firms_df['acq_date'].min()} to {firms_df['acq_date'].max()}")
    print(f"   Columns: {firms_df.columns.tolist()}")

    # Plot fire activity over time (one point per acquisition date)
    fig, ax = plt.subplots(figsize=(14, 5))
    daily_fires = firms_df.groupby('acq_date').size()
    daily_fires.plot(ax=ax, color='orangered', linewidth=1.5)
    ax.set_title('Daily Fire Detections in California - 2020', fontsize=14, fontweight='bold')
    ax.set_xlabel('Date')
    ax.set_ylabel('Number of Detections')
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    # Peak fire days
    print(f"\nüî• Top 5 fire days:")
    display(daily_fires.nlargest(5))

    return firms_df

# Run analysis
firms_df = analyze_firms_data()

---
## 2. NOAA Weather Data

Downloads weather data (temperature, precipitation, wind) from NOAA CDO API.  
**Expected**: 12 files (monthly, 2020)

In [None]:
from download_noaa_weather import download_full_period, DATATYPES

def download_noaa_interactive():
    """Download NOAA weather data for 2020 one calendar month at a time.

    Calls ``download_full_period(start, end)`` for each month, pausing between
    requests. A failing month is recorded and reported but does not stop the
    loop. After each successful call the function looks for
    ``noaa_weather_CA_<YYYY-MM-DD>.csv`` in ``NOAA_DATA_DIR`` and reports its
    size, or warns if it is not there.

    NOTE(review): the expected file name is an assumption about what
    ``download_full_period`` writes - confirm against ``download_noaa_weather``.
    For that reason a missing file only produces a warning; the month is still
    counted as successful because the call itself did not raise.

    Returns:
        list[Path]: all ``*.csv`` files currently in ``NOAA_DATA_DIR``, or an
        empty list when ``NOAA_API_KEY`` is not configured.
    """

    if not NOAA_API_KEY:
        print("‚ùå Missing NOAA_API_KEY in config.py")
        return []

    print(f"üå¶Ô∏è Downloading NOAA Weather Data")
    print(f"   Variables: {', '.join(DATATYPES)}")
    print(f"   Location: California (FIPS:06)")
    print(f"   Period: 2020 (12 months)\n")

    year = 2020
    success_count = 0
    failed_months = []

    for month in tqdm(range(1, 13), desc="Downloading months"):
        start = date(year, month, 1)
        # Last day of the month = day before the first of the next month;
        # December is special-cased to avoid rolling into the next year.
        if month == 12:
            end = date(year, 12, 31)
        else:
            end = date(year, month + 1, 1) - timedelta(days=1)

        month_name = start.strftime('%B %Y')

        try:
            tqdm.write(f"  Downloading {month_name}...")
            download_full_period(start, end)
            success_count += 1

            # Verify file was created; say so explicitly when it was not,
            # instead of passing silently.
            expected_file = Path(NOAA_DATA_DIR) / f"noaa_weather_CA_{start.strftime('%Y-%m-%d')}.csv"
            if expected_file.exists():
                size_kb = expected_file.stat().st_size / 1024
                tqdm.write(f"    ‚úì {month_name}: {size_kb:.1f} KB")
            else:
                tqdm.write(f"    ! {month_name}: expected output not found ({expected_file.name})")

            if month < 12:
                time.sleep(3.0)  # Rate limiting between months

        except Exception as e:
            # Best effort: record the failure, back off, move to the next month.
            tqdm.write(f"    ‚úó {month_name} failed: {e}")
            failed_months.append((month, str(e)))
            time.sleep(5.0)

    print(f"\n‚úÖ Download complete!")
    print(f"   Successful: {success_count}/12 months")
    if failed_months:
        print(f"   Failed: {failed_months}")

    return list(Path(NOAA_DATA_DIR).glob('*.csv'))

# Uncomment to run download
# noaa_files = download_noaa_interactive()

### Validate and Visualize NOAA Data

In [None]:
def analyze_noaa_data():
    """Read every NOAA CSV, print a summary and chart the record counts.

    The files in ``NOAA_DATA_DIR`` are loaded in sorted path order and
    concatenated. Two panels are drawn: records per ``datatype`` value and
    records per day.

    Returns:
        pandas.DataFrame | None: the concatenated data with ``date`` converted
        to datetime, or ``None`` when the directory holds no CSV files.
    """

    csv_paths = sorted(Path(NOAA_DATA_DIR).glob('*.csv'))

    if len(csv_paths) == 0:
        print("‚ö†Ô∏è No NOAA data found. Run download first.")
        return None

    print(f"üìÅ Loading {len(csv_paths)} NOAA weather files...")

    # Read each file in turn, then stitch them into a single frame.
    frames = []
    for csv_path in tqdm(csv_paths, desc="Loading"):
        frames.append(pd.read_csv(csv_path))
    noaa_df = pd.concat(frames, ignore_index=True)
    noaa_df['date'] = pd.to_datetime(noaa_df['date'])

    first_day = noaa_df['date'].min()
    last_day = noaa_df['date'].max()
    seen_types = noaa_df['datatype'].unique().tolist()
    station_count = noaa_df['station'].nunique()

    print(f"\nüìä NOAA Weather Data Summary:")
    print(f"   Total records: {len(noaa_df):,}")
    print(f"   Date range: {first_day} to {last_day}")
    print(f"   Data types: {seen_types}")
    print(f"   Unique stations: {station_count}")

    fig, (ax_types, ax_daily) = plt.subplots(1, 2, figsize=(14, 5))

    # Left panel: how many records each weather variable contributes.
    per_type = noaa_df['datatype'].value_counts()
    per_type.plot(kind='bar', ax=ax_types, color='steelblue')
    ax_types.set_title('Records per Weather Variable')
    ax_types.set_xlabel('Data Type')
    ax_types.set_ylabel('Count')
    ax_types.tick_params(axis='x', rotation=45)

    # Right panel: number of records per calendar day.
    per_day = noaa_df.groupby('date').size()
    per_day.plot(ax=ax_daily, color='green', linewidth=1)
    ax_daily.set_title('Daily Weather Observations')
    ax_daily.set_xlabel('Date')
    ax_daily.set_ylabel('Number of Records')
    ax_daily.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    return noaa_df

# Run analysis
noaa_df = analyze_noaa_data()

---
## 3. USGS Digital Elevation Model (DEM)

Downloads terrain elevation data for California.  
**Expected**: 12 GeoTIFF tiles (4x3 grid covering California)

In [None]:
from download_usgs_dem import download_dem_tile, RESOLUTION

def download_usgs_interactive():
    """Download the USGS DEM tiles that cover the California bounding box.

    The box ``CA_BBOX_W/S/E/N`` is split into a 4-column by 3-row grid and
    ``download_dem_tile`` is called once per cell, writing
    ``california_dem_r<row>_c<col>.tif``. A tile counts as successful only if
    the file exists in ``USGS_DATA_DIR`` afterwards and is larger than 1 MB;
    exceptions and undersized files are reported and the loop continues.

    Returns:
        list[Path]: every ``*.tif`` file currently in ``USGS_DATA_DIR``
        (including any that failed the size check).
    """

    # Grid layout; the tile count shown in the messages is derived from it.
    cols, rows = 4, 3

    print(f"üóª Downloading USGS DEM Data")
    print(f"   Resolution: {RESOLUTION}m (auto-adjusted if needed)")
    print(f"   Coverage: California ({cols}x{rows} grid = {cols * rows} tiles)\n")

    # Generate tile grid
    lon_step = (CA_BBOX_E - CA_BBOX_W) / cols
    lat_step = (CA_BBOX_N - CA_BBOX_S) / rows

    tiles = []
    for row in range(rows):
        for col in range(cols):
            min_lon = CA_BBOX_W + (col * lon_step)
            max_lon = CA_BBOX_W + ((col + 1) * lon_step)
            min_lat = CA_BBOX_S + (row * lat_step)
            max_lat = CA_BBOX_S + ((row + 1) * lat_step)

            filename = f"california_dem_r{row}_c{col}.tif"
            tiles.append((filename, (min_lon, min_lat, max_lon, max_lat)))

    successful = 0
    failed = []

    for filename, bbox in tqdm(tiles, desc="Downloading tiles"):
        try:
            download_dem_tile(bbox, resolution=RESOLUTION, output_name=filename)

            # Check if file is valid (> 1MB); stat the file only once.
            filepath = Path(USGS_DATA_DIR) / filename
            size_bytes = filepath.stat().st_size if filepath.exists() else 0
            if size_bytes > 1024*1024:
                size_mb = size_bytes / (1024**2)
                tqdm.write(f"  ‚úì {filename}: {size_mb:.1f} MB")
                successful += 1
            else:
                tqdm.write(f"  ‚úó {filename}: Invalid/too small")
                failed.append(filename)

        except Exception as e:
            # Best effort: record the failure and continue with the next tile.
            tqdm.write(f"  ‚úó {filename}: {e}")
            failed.append(filename)

    print(f"\n‚úÖ Download complete!")
    print(f"   Successful: {successful}/{len(tiles)} tiles")
    if failed:
        print(f"   Failed: {len(failed)} tiles")
        # Long failure lists are truncated to the first five names.
        if len(failed) > 5:
            print(f"   Failed list: {failed[:5]}...")
        else:
            print(f"   Failed list: {failed}")

    return list(Path(USGS_DATA_DIR).glob('*.tif'))

# Uncomment to run download
# usgs_files = download_usgs_interactive()

### Validate USGS DEM Data

In [None]:
def analyze_usgs_data():
    """List the valid DEM tiles on disk and draw the 3x4 grid coverage map.

    A tile is valid when it is a ``.tif`` in ``USGS_DATA_DIR`` larger than
    1 MB. Grid positions are parsed from names ending in ``_r<row>_c<col>``.
    Valid files that do not follow that pattern, or whose indices fall outside
    the 3x4 grid, are left out of the table and the map instead of raising.

    Returns:
        list[Path] | None: all valid ``.tif`` files (including ones that are
        not grid tiles), or ``None`` when there are none.
    """
    import re  # local import: only this function needs it

    grid_rows, grid_cols = 3, 4
    # Anchored at the end of the stem so only real tile names match
    # (e.g. "california_dem_r2_c3"); supports multi-digit indices.
    tile_pattern = re.compile(r'_r(\d+)_c(\d+)$')

    files = list(Path(USGS_DATA_DIR).glob('*.tif'))
    valid_files = [f for f in files if f.stat().st_size > 1024*1024]  # > 1MB

    if not valid_files:
        print("‚ö†Ô∏è No valid USGS DEM data found. Run download first.")
        return None

    print(f"üìÅ Found {len(valid_files)} valid DEM tiles (out of {len(files)} total)\n")

    # Create coverage visualization
    tile_info = []
    for f in valid_files:
        match = tile_pattern.search(f.stem)
        if match is None:
            continue  # not a grid tile (e.g. a merged mosaic)

        row, col = int(match.group(1)), int(match.group(2))
        if row >= grid_rows or col >= grid_cols:
            continue  # outside the expected grid; would overflow the map

        size_mb = f.stat().st_size / (1024**2)
        tile_info.append({'File': f.name, 'Row': row, 'Col': col, 'Size (MB)': f'{size_mb:.1f}'})

    if tile_info:
        df = pd.DataFrame(tile_info).sort_values(['Row', 'Col'])
        display(HTML('<h4>Downloaded DEM Tiles:</h4>'))
        display(df)

        # Visualize grid coverage: 1 = tile present, 0 = missing
        fig, ax = plt.subplots(figsize=(8, 6))
        coverage = np.zeros((grid_rows, grid_cols))
        for tile in tile_info:
            coverage[tile['Row'], tile['Col']] = 1

        im = ax.imshow(coverage, cmap='RdYlGn', vmin=0, vmax=1, aspect='auto')
        ax.set_xticks(range(grid_cols))
        ax.set_yticks(range(grid_rows))
        ax.set_xlabel('Column')
        ax.set_ylabel('Row')
        ax.set_title('California DEM Tile Coverage (Green = Downloaded)')

        # Add text annotations
        for i in range(grid_rows):
            for j in range(grid_cols):
                status = '‚úì' if coverage[i, j] else '‚úó'
                ax.text(j, i, status, ha='center', va='center',
                       color='white' if coverage[i, j] else 'red', fontsize=20)

        plt.tight_layout()
        plt.show()

        # Count filled grid cells, not files, so stray .tif files cannot
        # inflate the percentage.
        covered = int(coverage.sum())
        total = grid_rows * grid_cols
        print(f"\nüìä Coverage: {covered}/{total} tiles ({covered/total*100:.1f}%)")

    return valid_files

# Run analysis
usgs_files = analyze_usgs_data()

---
## Final Summary and Data Export

In [None]:
def generate_final_report():
    """Print the end-of-run ingestion report and return the same facts as a dict.

    Re-scans the three data directories (USGS tiles must exceed 1 MB to
    count), totals file sizes in MB and prints a formatted text report with a
    completeness marker per source.

    Returns:
        dict: ``'firms'``, ``'noaa'`` and ``'usgs'`` entries, each with
        ``files`` (count), ``size_mb`` (float) and ``complete`` (bool), plus
        ``'total_size_mb'``.
    """

    def _size_mb(paths):
        # Sum of the on-disk sizes of ``paths`` in MB.
        return sum(p.stat().st_size for p in paths) / (1024**2)

    # Collect the files of each source.
    firms_files = list(Path(FIRMS_DATA_DIR).glob('*.csv'))
    noaa_files = list(Path(NOAA_DATA_DIR).glob('*.csv'))
    usgs_files = []
    for tif in Path(USGS_DATA_DIR).glob('*.tif'):
        if tif.stat().st_size > 1024*1024:
            usgs_files.append(tif)

    firms_size = _size_mb(firms_files)
    noaa_size = _size_mb(noaa_files)
    usgs_size = _size_mb(usgs_files)
    total_size = firms_size + noaa_size + usgs_size

    # Completeness thresholds: 37 FIRMS chunks, 12 NOAA months, 12 DEM tiles.
    firms_done = len(firms_files) >= 37
    noaa_done = len(noaa_files) >= 12
    usgs_done = len(usgs_files) >= 12

    summary = {
        'firms': {'files': len(firms_files), 'size_mb': firms_size, 'complete': firms_done},
        'noaa': {'files': len(noaa_files), 'size_mb': noaa_size, 'complete': noaa_done},
        'usgs': {'files': len(usgs_files), 'size_mb': usgs_size, 'complete': usgs_done},
        'total_size_mb': total_size
    }

    # Report template (kept verbatim; only the interpolated expressions differ).
    report = f"""
    ‚ïî‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó
    ‚ïë   California Wildfire Data Ingestion Report - 2020       ‚ïë
    ‚ïö‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïù
    
    üì° NASA FIRMS (Fire Detection)
       Files: {len(firms_files)}/37
       Size:  {firms_size:.1f} MB
       Status: {'‚úÖ Complete' if firms_done else '‚ö†Ô∏è Incomplete'}
    
    üå¶Ô∏è NOAA Weather Data
       Files: {len(noaa_files)}/12
       Size:  {noaa_size:.1f} MB
       Status: {'‚úÖ Complete' if noaa_done else '‚ö†Ô∏è Incomplete'}
    
    üóª USGS Digital Elevation Model
       Files: {len(usgs_files)}/12
       Size:  {usgs_size:.1f} MB
       Status: {'‚úÖ Complete' if usgs_done else '‚ö†Ô∏è Partial'}
    
    üì¶ Total Dataset Size: {total_size:.1f} MB
    
    ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
    Next Steps:
      1. Data cleaning and preprocessing
      2. Spatial alignment and temporal synchronization
      3. Feature engineering for ML models
      4. Exploratory data analysis and visualization
    """

    print(report)

    return summary

# Generate report
summary = generate_final_report()

---
## Quick Re-download of Failed Data

If any data source is incomplete, uncomment and run the matching cell below to retry:

In [None]:
# Quick status check: re-scan the data directories and redisplay the status
# table before deciding which source (if any) needs a retry.
check_data_status()

In [None]:
# Re-download FIRMS if needed
# firms_files = download_firms_interactive()

In [None]:
# Re-download NOAA if needed
# noaa_files = download_noaa_interactive()

In [None]:
# Re-download USGS if needed
# usgs_files = download_usgs_interactive()