# California Wildfire Data Preprocessing Pipeline

This notebook performs comprehensive data preprocessing:
1. **Data Validation** - Check schema compliance and data quality
2. **Data Cleaning** - Remove outliers, handle missing values, deduplicate
3. **CRS Alignment** - Reproject all data to California Albers (EPSG:3310)
4. **Spatial Integration** - Create unified spatial grid
5. **Summary Statistics** - Generate comprehensive data summaries

**Target CRS:** EPSG:3310 (NAD83 / California Albers)  
**Reason:** Equal-area projection ideal for California-wide spatial analysis

## Setup

In [None]:
import sys
import json
from pathlib import Path

# Data libraries
import pandas as pd
import geopandas as gpd
import numpy as np
import rasterio
from rasterio.plot import show

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML

# Import our preprocessing modules
from validate_and_clean import DataValidator, DataCleaner
from align_crs import CRSAligner, SpatialJoiner
from summaries import DataSummarizer

# Add data_sources to path for config
sys.path.append('../data_sources')
from config import FIRMS_DATA_DIR, NOAA_DATA_DIR, USGS_DATA_DIR

# Configure display
pd.set_option('display.max_columns', None)
plt.style.use('seaborn-v0_8-darkgrid')
%matplotlib inline

print("‚úì Environment setup complete")
print(f"\nData Directories:")
print(f"  FIRMS: {FIRMS_DATA_DIR}")
print(f"  NOAA:  {NOAA_DATA_DIR}")
print(f"  USGS:  {USGS_DATA_DIR}")

## Load Schema

In [None]:
# Load schema. JSON text is UTF-8, so pin the encoding explicitly instead of
# relying on the platform default (which is not UTF-8 on every system).
with open('schema.json', 'r', encoding='utf-8') as f:
    schema = json.load(f)

# Display target CRS info. The keys read here ('epsg', 'name', 'units',
# 'reason') must all be present under schema['target_crs'].
target_crs = schema['target_crs']

display(HTML(f"""
<h3>Target Coordinate Reference System</h3>
<table style='width:100%'>
    <tr><td><b>EPSG</b></td><td>{target_crs['epsg']}</td></tr>
    <tr><td><b>Name</b></td><td>{target_crs['name']}</td></tr>
    <tr><td><b>Units</b></td><td>{target_crs['units']}</td></tr>
    <tr><td><b>Reason</b></td><td>{target_crs['reason']}</td></tr>
</table>
"""))

# Source CRS declared in the schema for each input dataset.
print("\nSource CRS:")
print(f"  FIRMS: {schema['firms']['crs']}")
print(f"  NOAA:  {schema['noaa']['crs']}")
print(f"  USGS:  {schema['usgs']['crs']}")

---
## Step 1: Data Validation and Cleaning

In [None]:
# Stage banner: 70-character rule, stage title, rule, then one blank line.
print("=" * 70, "   STEP 1: DATA VALIDATION AND CLEANING", "=" * 70 + "\n", sep="\n")

# Cleaner object shared by the FIRMS, NOAA and USGS cells of this step.
cleaner = DataCleaner()

# Destination folder for cleaned outputs; created together with any missing
# parents, and left untouched when it already exists.
cleaned_dir = Path("..") / "data" / "cleaned"
cleaned_dir.mkdir(exist_ok=True, parents=True)

### 1.1 Clean FIRMS Data

In [None]:
# Clean the FIRMS data and preview the result.
# Best-effort by design: any failure is reported and swallowed so the rest of
# the notebook can still run; `firms_df` is not (re)assigned in that case.
try:
    firms_df = cleaner.clean_firms_data(
        input_dir=FIRMS_DATA_DIR,
        # The alignment step later reads this same path as its input.
        output_path=cleaned_dir / "firms_cleaned.parquet"
    )

    # Display sample
    display(HTML("<h4>Cleaned FIRMS Data Sample:</h4>"))
    display(firms_df.head())

    # Quick stats (require the acq_date, confidence, brightness and frp columns)
    print("\nQuick Stats:")
    print(f"  Date range: {firms_df['acq_date'].min()} to {firms_df['acq_date'].max()}")
    print(f"  Confidence levels: {firms_df['confidence'].value_counts().to_dict()}")
    print(f"  Average brightness: {firms_df['brightness'].mean():.1f}K")
    print(f"  Total FRP: {firms_df['frp'].sum():.0f} MW")

except Exception as e:
    # Also reached when the preview/stats above fail (e.g. a missing column),
    # not only when the cleaning call itself raises.
    print(f"❌ FIRMS cleaning failed: {e}")

### 1.2 Clean NOAA Data

In [None]:
# Clean the NOAA data and preview the result.
# Best-effort by design: any failure is reported and swallowed so the rest of
# the notebook can still run; `noaa_df` is not (re)assigned in that case.
try:
    noaa_df = cleaner.clean_noaa_data(
        input_dir=NOAA_DATA_DIR,
        # The alignment step later reads this same path as its input.
        output_path=cleaned_dir / "noaa_cleaned.parquet"
    )

    # Display sample
    display(HTML("<h4>Cleaned NOAA Data Sample:</h4>"))
    display(noaa_df.head())

    # Quick stats (require the date and station columns)
    print("\nQuick Stats:")
    print(f"  Date range: {noaa_df['date'].min()} to {noaa_df['date'].max()}")
    print(f"  Unique stations: {noaa_df['station'].nunique()}")

    # Data completeness: share of non-null rows per weather variable.
    # Variables absent from the frame are skipped rather than raising.
    weather_vars = ['TMAX', 'TMIN', 'PRCP', 'AWND', 'WSF2', 'WSF5']
    completeness = {var: f"{noaa_df[var].notna().mean():.1%}"
                    for var in weather_vars if var in noaa_df.columns}
    print(f"  Variable completeness: {completeness}")

except Exception as e:
    # Also reached when the preview/stats above fail, not only the cleaning call.
    print(f"❌ NOAA cleaning failed: {e}")

### 1.3 Validate USGS DEM Tiles

In [None]:
# Validate the USGS DEM tiles and list the ones that passed.
# Best-effort by design: a failure is reported, not raised.
try:
    valid_tiles = cleaner.validate_usgs_tiles(input_dir=USGS_DATA_DIR)

    print(f"\n✅ Found {len(valid_tiles)} valid DEM tiles")
    if valid_tiles:
        print("\nValid tiles:")
        for tile in valid_tiles:
            # Show only the file name; Path() accepts both str and Path entries.
            print(f"  • {Path(tile).name}")

except Exception as e:
    print(f"❌ USGS validation failed: {e}")

---
## Step 2: CRS Alignment

Reproject all datasets to **EPSG:3310** (NAD83 / California Albers)

In [None]:
# Stage banner: 70-character rule, stage title, rule, then one blank line.
print("=" * 70, "   STEP 2: CRS ALIGNMENT", "=" * 70 + "\n", sep="\n")

# Aligner object shared by the FIRMS, NOAA and USGS alignment cells.
aligner = CRSAligner()

# Destination folder for aligned outputs; created together with any missing
# parents, and left untouched when it already exists.
aligned_dir = Path("..") / "data" / "aligned"
aligned_dir.mkdir(exist_ok=True, parents=True)

### 2.1 Align FIRMS Data

In [None]:
# Align the cleaned FIRMS data and plot it before/after projection.
# Best-effort by design: a failure (including a plotting failure) is reported,
# not raised.
try:
    firms_aligned = aligner.align_firms(
        input_path=str(cleaned_dir / "firms_cleaned.parquet"),
        output_path=str(aligned_dir / "firms_aligned.parquet")
    )

    # Visualize spatial distribution: the result is expected to expose both the
    # geographic (longitude/latitude) and the projected (x/y) coordinates.
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    # Original coordinates; tiny, highly transparent markers keep dense
    # clusters readable.
    axes[0].scatter(firms_aligned.longitude, firms_aligned.latitude,
                    c='red', alpha=0.1, s=1)
    axes[0].set_title('Original Coordinates (EPSG:4326)')
    axes[0].set_xlabel('Longitude')
    axes[0].set_ylabel('Latitude')
    axes[0].grid(True, alpha=0.3)

    # Projected coordinates
    axes[1].scatter(firms_aligned.x, firms_aligned.y,
                    c='blue', alpha=0.1, s=1)
    axes[1].set_title('Projected Coordinates (EPSG:3310)')
    axes[1].set_xlabel('X (meters)')
    axes[1].set_ylabel('Y (meters)')
    axes[1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

except Exception as e:
    print(f"❌ FIRMS alignment failed: {e}")

### 2.2 Prepare NOAA Data

In [None]:
# Run the NOAA alignment step on the cleaned NOAA parquet file.
# Best-effort by design: a failure is reported, not raised, and `noaa_aligned`
# is not (re)assigned in that case.
try:
    noaa_aligned = aligner.align_noaa(
        input_path=str(cleaned_dir / "noaa_cleaned.parquet"),
        output_path=str(aligned_dir / "noaa_aligned.parquet")
    )

except Exception as e:
    print(f"❌ NOAA alignment failed: {e}")

### 2.3 Reproject USGS DEM Tiles

In [None]:
# Reproject the USGS DEM tiles and preview one of the outputs.
# Best-effort by design: a failure (including a plotting failure) is reported,
# not raised.
try:
    usgs_aligned_dir = aligned_dir / "usgs"
    success_count = aligner.align_all_usgs_tiles(
        input_dir=USGS_DATA_DIR,
        output_dir=str(usgs_aligned_dir)
    )

    # Visualize one tile if available. Directory listing order is arbitrary,
    # so sort to make the previewed tile the same on every run and machine.
    aligned_tiles = sorted(usgs_aligned_dir.glob('*.tif'))
    if aligned_tiles:
        sample_tile = aligned_tiles[0]
        print(f"\nVisualizing sample tile: {sample_tile.name}")

        # The context manager closes the raster even if plotting raises.
        with rasterio.open(sample_tile) as src:
            fig, ax = plt.subplots(figsize=(10, 8))
            show(src, ax=ax, cmap='terrain', title=f'DEM: {sample_tile.name}')
            plt.tight_layout()
            plt.show()

except Exception as e:
    print(f"❌ USGS alignment failed: {e}")

---
## Step 3: Generate Summaries

In [None]:
# Stage banner: 70-character rule, stage title, rule, then one blank line.
print("=" * 70, "   STEP 3: GENERATE DATA SUMMARIES", "=" * 70 + "\n", sep="\n")

# The summarizer is pointed at the cleaned-output directory; the USGS tile
# directory is passed separately. `summaries` is looked up by source key
# ('firms', 'noaa', 'usgs') in the following cell.
summarizer = DataSummarizer(cleaned_data_dir=str(cleaned_dir))
summaries = summarizer.generate_all_summaries(usgs_dir=USGS_DATA_DIR)

### View Summary Details

In [None]:
# Render each available summary under its own heading, in a fixed order.
# Sources missing from `summaries` are skipped silently.
for source_key, heading_html in (
    ('firms', "<h3>FIRMS Summary</h3>"),
    ('noaa', "<h3>NOAA Summary</h3>"),
    ('usgs', "<h3>USGS Tiles Summary</h3>"),
):
    if source_key not in summaries:
        continue
    display(HTML(heading_html))
    display(summaries[source_key])

---
## Step 4: Create Spatial Grid (Optional)

Create a unified 10 km × 10 km spatial grid (EPSG:3310) for aggregating all data sources.

In [None]:
# Edge length of one square grid cell, in metres. The output file name and the
# plot title are derived from this single value so they cannot drift apart.
GRID_CELL_SIZE_M = 10_000
grid_label = f"{GRID_CELL_SIZE_M / 1000:g}km"  # "10km" for the default size

# Build, save and plot the spatial grid.
# Best-effort by design (this step is optional): a failure is reported, not raised.
try:
    joiner = SpatialJoiner(aligned_data_dir=str(aligned_dir))

    # Create the grid at the configured resolution
    grid = joiner.create_spatial_grid(cell_size=GRID_CELL_SIZE_M)

    # Save grid
    grid_path = aligned_dir / f"california_grid_{grid_label}.parquet"
    grid.to_parquet(grid_path)
    print(f"\n💾 Saved spatial grid: {grid_path}")

    # Visualize grid: outlines only, so the cell layout is visible.
    fig, ax = plt.subplots(figsize=(12, 10))
    grid.plot(ax=ax, facecolor='none', edgecolor='gray', linewidth=0.5)
    ax.set_title(f'California {grid_label} Spatial Grid (EPSG:3310)', fontsize=14)
    ax.set_xlabel('X (meters)')
    ax.set_ylabel('Y (meters)')
    plt.tight_layout()
    plt.show()

except Exception as e:
    print(f"⚠️ Grid creation failed: {e}")

---
## Final Report

In [None]:
# Check what was created. Every earlier step swallows its own errors, so this
# report is the place where a run that produced nothing becomes visible.
print("="*70)
print("   PREPROCESSING PIPELINE COMPLETE")
print("="*70 + "\n")

# Cleaned data. Directory listing order is arbitrary, so sort for a stable report.
cleaned_files = sorted(cleaned_dir.glob('*.parquet'))
print("\n📁 Cleaned Data:")
for f in cleaned_files:
    size_mb = f.stat().st_size / (1024**2)  # bytes -> MiB
    print(f"  ✓ {f.name} ({size_mb:.1f} MB)")
if not cleaned_files:
    print("  (no files found)")

# Aligned data
aligned_files = sorted(aligned_dir.glob('*.parquet'))
usgs_tile_dir = aligned_dir / 'usgs'
aligned_tifs = sorted(usgs_tile_dir.glob('*.tif')) if usgs_tile_dir.exists() else []

print("\n📁 Aligned Data (EPSG:3310):")
for f in aligned_files:
    size_mb = f.stat().st_size / (1024**2)  # bytes -> MiB
    print(f"  ✓ {f.name} ({size_mb:.1f} MB)")
if aligned_tifs:
    print(f"  ✓ {len(aligned_tifs)} USGS DEM tiles (reprojected)")
if not (aligned_files or aligned_tifs):
    print("  (no files found)")

print("\n" + "="*70)
if cleaned_files or aligned_files or aligned_tifs:
    print("Ready for analysis and visualization!")
else:
    # Do not claim readiness when no step produced any output.
    print("No output files found - review the errors reported by the steps above.")
print("="*70)

---
## Next Steps

1. **Exploratory Data Analysis**: Analyze temporal and spatial patterns
2. **Feature Engineering**: Create ML features from cleaned data
3. **Spatial Analysis**: Join fire data with weather and terrain
4. **Visualization**: Create maps and dashboards
5. **Modeling**: Build wildfire prediction models