# LANDFIRE Existing Vegetation Type (EVT) Data Exploration

**TODO:** The data download step is currently broken and needs to be fixed. This notebook explores the LANDFIRE Existing Vegetation Type (EVT) data and demonstrates how to integrate it into our datacube framework.

In [None]:
import glob
import os
import sys

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xarray as xr
from matplotlib.colors import ListedColormap

# Add the scripts directory to the path so we can import our modules
sys.path.append('../scripts')
from landfire_processor import LandfireProcessor, AggregationMethod
from datacube_builder import DatacubeBuilder, InterpolationMethod

## Download and Process LANDFIRE EVT Data

First, let's download and process LANDFIRE Existing Vegetation Type (EVT) data for a specific area. We'll use a small region in South Dakota as an example.

In [None]:
# Resolve the bounding box and data directory (optionally from the project's
# `config` module), create the LANDFIRE processor, and make sure a raw EVT
# archive is available -- either an existing zip or a fresh download.

# Fallback settings used when `config` is missing or lacks the attribute.
default_bbox = "-103.50 43.00 -102.00 44.00"  # area of interest (South Dakota region)
default_data_dir = "./data/landfire"

# Make the parent directory importable so `import config` can succeed.
# `__file__` is undefined inside a notebook kernel (it only exists when this
# code is exported and run as a script), so fall back to the working directory
# -- presumably the notebook's own folder, matching the relative '../scripts'
# path used for the project imports.
try:
    current_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
if parent_dir not in sys.path:  # avoid duplicate entries when the cell is re-run
    sys.path.append(parent_dir)

try:
    import config
except ImportError:
    bbox = default_bbox
    data_dir = default_data_dir
    print("Configuration module not found. Using default values.")
else:
    # Use config values if available
    if hasattr(config, 'BLACK_HILLS_BBOX_STRING'):
        bbox = config.BLACK_HILLS_BBOX_STRING
        print(f"Using bounding box from config: {bbox}")
    else:
        bbox = default_bbox
        print(f"Using default bounding box: {bbox}")

    # Use LANDFIRE config if available
    if hasattr(config, 'LANDFIRE_CONFIG'):
        data_dir = config.LANDFIRE_CONFIG.get("data_dir", default_data_dir)
    else:
        data_dir = default_data_dir

# Initialize the LANDFIRE processor
processor = LandfireProcessor(bbox=bbox, data_dir=data_dir)

# Make sure the data directory exists
if not os.path.exists(data_dir):
    os.makedirs(data_dir, exist_ok=True)
    print(f"Created LANDFIRE data directory: {data_dir}")

# Reuse a previously downloaded archive when present
expected_zip = os.path.join(data_dir, "landfire_220EVT.zip")
if os.path.exists(expected_zip):
    print(f"Using existing downloaded data: {expected_zip}")
    processor.raw_data_path = expected_zip
else:
    # Try to download the EVT product identified by the "220EVT" code
    try:
        try:
            # Probe for the optional `landfire` package before downloading
            import landfire  # noqa: F401
            data_path = processor.download_data("220EVT")
            print(f"Downloaded data to: {data_path}")
        except ImportError as import_error:
            raise RuntimeError(
                "The landfire package is not installed. Cannot download LANDFIRE data.\n"
                "Please install it with: pip install landfire\n"
                "Or manually download LANDFIRE EVT data and place it at: " + expected_zip
            ) from import_error
    except Exception as e:
        print(f"Error downloading data: {e}")

        # Fall back to any other archive in the data directory. The list is
        # sorted because glob returns matches in arbitrary order, and the
        # choice should not vary between runs.
        all_zips = sorted(glob.glob(os.path.join(data_dir, "*.zip")))
        if all_zips:
            alternative = all_zips[0]
            print(f"Using alternative LANDFIRE data file: {alternative}")
            processor.raw_data_path = alternative
        else:
            # Chain the download failure so its traceback is not lost
            raise RuntimeError(
                "No LANDFIRE data available. This notebook requires LANDFIRE EVT data to run."
            ) from e

## Extract and Load the Data

Now, let's extract the data from the ZIP file and load it into an xarray Dataset.

In [None]:
# Obtain `evt_dataset`: reuse a dataset the processor already holds, otherwise
# extract the archive and build the dataset from the GeoTIFF. If the
# processor's own loader fails, fall back to reading the TIFF with rasterio.
if getattr(processor, 'processed_dataset', None) is not None:
    # A processed dataset is already available
    evt_dataset = processor.processed_dataset
    print("Using already prepared dataset")
elif getattr(processor, 'raw_dataset', None) is not None:
    # A raw dataset is already loaded
    evt_dataset = processor.raw_dataset
    print("Using already loaded raw dataset")
else:
    try:
        # Extract the archive if the processor knows where it is
        if getattr(processor, 'raw_data_path', None):
            try:
                extracted_dir = processor.extract_data()
                print(f"Extracted data to: {extracted_dir}")
            except Exception as e:
                print(f"Error extracting data: {e}")
                print("Checking if data is already extracted...")

                # `tiff_path` may be missing or None; guard before touching
                # the filesystem (os.path.exists(None) raises TypeError).
                known_tiff = getattr(processor, 'tiff_path', None)
                if known_tiff and os.path.exists(known_tiff):
                    print(f"Using already extracted data at: {processor.tiff_path}")
                else:
                    # Look for any TIFF below the data directory; sorted so the
                    # pick is deterministic (glob order is arbitrary).
                    tiff_files = sorted(
                        glob.glob(os.path.join(data_dir, "**/*.tif"), recursive=True)
                    )
                    if tiff_files:
                        processor.tiff_path = tiff_files[0]
                        print(f"Found TIFF file: {processor.tiff_path}")
                    else:
                        raise RuntimeError(
                            "No TIFF files found. Cannot proceed without extracted data."
                        ) from e

        # Create an xarray Dataset from the GeoTIFF
        try:
            evt_dataset = processor.create_dataset()
            print(f"Dataset created with dimensions: {evt_dataset.dims}")
            print(f"Variables: {list(evt_dataset.data_vars)}")
        except Exception as e:
            print(f"Error creating dataset: {e}")

            # The fallback needs a TIFF on disk
            fallback_tiff = getattr(processor, 'tiff_path', None)
            if not fallback_tiff or not os.path.exists(fallback_tiff):
                raise RuntimeError(
                    "Failed to create dataset from LANDFIRE data and no TIFF file available."
                ) from e

            print("Attempting to create dataset with alternative approach...")
            try:
                # Imported here because rasterio is only needed on this fallback path
                import rasterio

                # Read the first band of the TIFF
                with rasterio.open(fallback_tiff) as src:
                    data = src.read(1)

                # NOTE: the coordinates are plain pixel indices, so this
                # simplified dataset carries no georeferencing.
                da = xr.DataArray(
                    data=data,
                    dims=["y", "x"],
                    coords={
                        "y": np.arange(data.shape[0]),
                        "x": np.arange(data.shape[1])
                    },
                    name="evt"
                )

                evt_dataset = xr.Dataset({"evt": da})
                print("Created simplified dataset from TIFF file")
            except Exception as nested_e:
                print(f"Alternative dataset creation also failed: {nested_e}")
                raise RuntimeError("Failed to create dataset from LANDFIRE data.") from nested_e
    except Exception as e:
        print(f"Error processing LANDFIRE data: {e}")
        # Chain the original error so the root cause stays in the traceback
        raise RuntimeError(
            "Unable to process LANDFIRE data. This notebook requires valid LANDFIRE data to run."
        ) from e

## Visualize the Raw EVT Data

Let's create a visualization of the raw EVT data before processing. EVT values are categorical, representing different vegetation types.

In [None]:
# Build an arbitrary but reproducible categorical palette for the EVT codes.
# (A real application would use the actual EVT classification colors.)
np.random.seed(42)  # fixed seed keeps the palette stable between runs
n_colors = 50  # tune to the number of distinct EVT values
colors = np.random.rand(n_colors, 3)
cmap = ListedColormap(colors)

# Draw the raw EVT raster on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(12, 8))
evt_dataset["evt"].plot(ax=ax, cmap=cmap)
ax.set_title("LANDFIRE Existing Vegetation Type (EVT)")
fig.tight_layout()
plt.show()

# Summarize which EVT codes occur in the raster
unique_values = np.unique(evt_dataset["evt"].values)
n_unique = len(unique_values)
first_values = unique_values[:10]
print(f"Number of unique EVT values: {n_unique}")
print(f"Example EVT values: {first_values}...")

## Reproject to Latitude/Longitude Coordinates

Next, let's reproject the dataset to standard latitude/longitude coordinates for easier integration with other datasets.

In [None]:
# Ask the processor for a latitude/longitude version of the dataset
reprojected = processor.reproject_to_latlon()

# Report the coordinate names and the geographic extent
coord_names = list(reprojected.coords)
print(f"Reprojected dataset coordinates: {coord_names}")
for label, axis in (("Latitude", reprojected.lat), ("Longitude", reprojected.lon)):
    print(f"{label} range: {float(axis.min())} to {float(axis.max())}")

# Show the reprojected raster with the same palette as the raw plot
fig, ax = plt.subplots(figsize=(12, 8))
reprojected["evt"].plot(ax=ax, cmap=cmap)
ax.set_title("LANDFIRE EVT (Reprojected to Lat/Lon)")
fig.tight_layout()
plt.show()

## Apply Spatial Bucketing

Now, let's aggregate the data into spatial buckets to match our datacube framework. For categorical data like EVT, we'll use the MODE aggregation method (most common value in each bucket).

In [None]:
# Bucket edge lengths in degrees
lat_bucket_size = 0.05  # about 5 km
lon_bucket_size = 0.05  # about 5 km

# Aggregate into buckets, keeping the most common EVT value per bucket (MODE)
bucket_options = {
    "lat_bucket_size": lat_bucket_size,
    "lon_bucket_size": lon_bucket_size,
    "agg_method": AggregationMethod.MODE,
}
bucketed = processor.bucket_spatial(**bucket_options)

print(f"Bucketed dataset dimensions: {bucketed.dims}")
print(f"Original dimensions: {processor.processed_dataset.dims}")

# Ratio of source grid cells to bucket cells
source_sizes = processor.processed_dataset.sizes
cells_before = source_sizes['lat'] * source_sizes['lon']
cells_after = len(bucketed.lat_bins) * len(bucketed.lon_bins)
print(f"Reduction factor: {cells_before / cells_after:.2f}x")

# Show the bucketed raster
fig, ax = plt.subplots(figsize=(12, 8))
bucketed["evt"].plot(ax=ax, cmap=cmap)
ax.set_title(f"LANDFIRE EVT (Bucketed {lat_bucket_size}° x {lon_bucket_size}°)")
fig.tight_layout()
plt.show()

## Integrate with Datacube Framework

Now, let's demonstrate how to integrate the LANDFIRE EVT data with our datacube framework by combining it with another dataset.

In [None]:
# Load a CMIP/MACA NetCDF file into `temp_dataset`, crop it to the extent of
# the reprojected LANDFIRE data, and plot its first time step.
# Raises FileNotFoundError when no file exists and RuntimeError when the file
# cannot be used (missing dimensions, no spatial overlap, read errors).
cmip_file = "./data/macav2metdata_huss_CCSM4_r6i1p1_rcp45_2021_2025_CONUS_monthly.nc"

# Try alternative files if the main one doesn't exist. Sorted so the pick is
# deterministic (glob returns matches in arbitrary order).
if not os.path.exists(cmip_file):
    alt_files = sorted(glob.glob("./data/macav2metdata_*_*_*_*_*_*.nc"))
    if alt_files:
        cmip_file = alt_files[0]
        print(f"Using alternative CMIP file: {cmip_file}")

if not os.path.exists(cmip_file):
    # No CMIP data available
    raise FileNotFoundError(
        "No CMIP data files found for combined analysis. Please provide at least one CMIP data file."
    )

try:
    temp_dataset = xr.open_dataset(cmip_file)
    # The first data variable is used for all plots below
    var_name = list(temp_dataset.data_vars)[0]

    # The dataset must expose lat/lon/time dimensions
    if not all(dim in temp_dataset.dims for dim in ['lat', 'lon', 'time']):
        print("CMIP dataset does not have required dimensions (lat, lon, time)")
        raise ValueError("Missing dimensions")

    print(f"Using actual CMIP data from: {cmip_file}")

    # Subset the data to the same region as our LANDFIRE data
    if 'lat' in reprojected.dims and 'lon' in reprojected.dims:
        lat_min = float(reprojected.lat.min())
        lat_max = float(reprojected.lat.max())
        lon_min = float(reprojected.lon.min())
        lon_max = float(reprojected.lon.max())

        # Reconcile longitude conventions: when the CMIP file uses 0..360
        # degrees east while the LANDFIRE extent uses negative (western)
        # longitudes, the overlap test below could never match. Wrap the CMIP
        # longitudes into -180..180 and re-sort so both datasets agree.
        # NOTE(review): MACA files are expected to use 0..360 -- confirm.
        if lon_min < 0 and float(temp_dataset.lon.max()) > 180:
            wrapped_lon = ((temp_dataset.lon + 180) % 360) - 180
            temp_dataset = temp_dataset.assign_coords(lon=wrapped_lon).sortby("lon")

        # Indices of CMIP grid points that fall inside the LANDFIRE extent
        lat_indices = np.where((temp_dataset.lat >= lat_min) & (temp_dataset.lat <= lat_max))[0]
        lon_indices = np.where((temp_dataset.lon >= lon_min) & (temp_dataset.lon <= lon_max))[0]

        if len(lat_indices) == 0 or len(lon_indices) == 0:
            print("No overlapping region found between LANDFIRE and CMIP data.")
            raise ValueError("Region mismatch")

        # Crop to the span between the first and last matching index
        temp_dataset = temp_dataset.isel(
            lat=slice(lat_indices[0], lat_indices[-1]+1),
            lon=slice(lon_indices[0], lon_indices[-1]+1)
        )

    # Plot the first time step of the selected variable (cropped when the
    # LANDFIRE extent was available, otherwise the full grid)
    plt.figure(figsize=(10, 6))
    temp_dataset[var_name].isel(time=0).plot(cmap='viridis')
    plt.title(f"CMIP Data: {var_name} (First Time Point)")
    plt.tight_layout()
    plt.show()

except Exception as e:
    print(f"Error loading or processing CMIP data: {e}")
    # Chain the original error so the root cause stays in the traceback
    raise RuntimeError(
        "Unable to use CMIP data for demonstration. This notebook requires valid CMIP data files."
    ) from e

## Combine Datasets with DatacubeBuilder

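Now, let's use our DatacubeBuilder to combine the EVT and climate datasets. The climate dataset is registered under the name "temperature", although the default file loaded above holds the `huss` variable (specific humidity), so check which variable was actually loaded before interpreting the results below as temperatures.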

In [None]:
# Create the builder and register both inputs under their dataset names
builder = DatacubeBuilder()
builder.add_dataset("evt", bucketed)
builder.add_dataset("temperature", temp_dataset)

# Target grid and resampling settings for the unified datacube
cube_options = {
    "lat_resolution": 0.1,
    "lon_resolution": 0.1,
    "time_resolution": '1MS',  # Monthly
    # Nearest neighbor is best for categorical data
    "interpolation_method": InterpolationMethod.NEAREST,
}
unified_cube = builder.build_datacube(**cube_options)

cube_variables = list(unified_cube.data_vars)
print(f"Unified datacube dimensions: {unified_cube.dims}")
print(f"Variables: {cube_variables}")

## Analyze Relationships between Vegetation Type and Temperature

Now that we have vegetation and temperature data in a unified datacube, we can start to analyze relationships between them.

In [None]:
# Side-by-side panels: vegetation type on the left, temperature on the right
fig, axes = plt.subplots(1, 2, figsize=(18, 6))
evt_ax, temp_ax = axes

# Left panel: EVT at the first time step
unified_cube["evt_evt"].isel(time=0).plot(ax=evt_ax, cmap=cmap)
evt_ax.set_title("Existing Vegetation Type")

# Right panel: temperature at the same time step, labeled with its timestamp
unified_cube["temperature_temperature"].isel(time=0).plot(ax=temp_ax, cmap='viridis')
first_timestamp = unified_cube.time.values[0]
temp_ax.set_title(f"Temperature ({first_timestamp})")

fig.tight_layout()
plt.show()

## Calculate Statistics by Vegetation Type

Let's compute temperature statistics for each vegetation type.

In [None]:
# Per-vegetation-type temperature statistics for a single time step.
# Produces `evt_flat` / `temp_flat` (NaN-free, aligned 1-D arrays) and
# `stats_df` (one row per EVT value, sorted by mean temperature, descending).
time_idx = 0
evt_flat = unified_cube.evt_evt.isel(time=time_idx).values.flatten()
temp_flat = unified_cube.temperature_temperature.isel(time=time_idx).values.flatten()

# Keep only the cells where both variables are defined
mask = ~(np.isnan(evt_flat) | np.isnan(temp_flat))
evt_flat = evt_flat[mask]
temp_flat = temp_flat[mask]

# Calculate statistics by vegetation type. Every value comes from
# np.unique(evt_flat), so each group has at least one temperature sample.
evt_types = np.unique(evt_flat)
stats = []

for evt_val in evt_types:
    temps = temp_flat[evt_flat == evt_val]
    stats.append({
        "EVT": int(evt_val),
        "Count": len(temps),
        "Min Temp": temps.min(),
        "Max Temp": temps.max(),
        "Mean Temp": temps.mean(),
        "Std Temp": temps.std()  # population standard deviation (numpy default, ddof=0)
    })

# Naming the columns explicitly keeps the frame well-formed even when no valid
# cells exist; otherwise the sort below would fail with a KeyError.
stat_columns = ["EVT", "Count", "Min Temp", "Max Temp", "Mean Temp", "Std Temp"]
stats_df = pd.DataFrame(stats, columns=stat_columns)
stats_df = stats_df.sort_values("Mean Temp", ascending=False)

if stats_df.empty:
    print("No grid cells with both EVT and temperature values at this time step.")

# Display the top 10 vegetation types by mean temperature
stats_df.head(10)

## Visualize Temperature Distribution by Vegetation Type

Let's create a visualization of temperature distributions for different vegetation types.

In [None]:
# The five vegetation types covering the most grid cells
by_count = stats_df.sort_values("Count", ascending=False)
top_types = by_count.head(5)["EVT"].values

# One temperature sample and one tick label per selected vegetation type
box_data = [temp_flat[evt_flat == evt_val] for evt_val in top_types]
labels = [f"EVT {evt_val}" for evt_val in top_types]

# Box plot comparing the temperature distributions
plt.figure(figsize=(14, 8))
plt.boxplot(box_data, labels=labels)
plt.title("Temperature Distribution by Vegetation Type")
plt.xlabel("Vegetation Type")
plt.ylabel("Temperature (°C)")
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

## Time Series Analysis for a Specific Location

Let's extract and analyze the temperature time series for a specific location and vegetation type.

In [None]:
# Pick the most common vegetation type and plot the temperature time series
# at the first grid cell where it occurs.
by_count = stats_df.sort_values("Count", ascending=False)
common_evt = by_count.iloc[0]["EVT"]

# Locate every cell of that type in the first time step
evt_data = unified_cube.evt_evt.isel(time=0)
coords = np.where(evt_data.values == common_evt)

if len(coords[0]) == 0:
    print(f"No locations found with vegetation type {common_evt}")
else:
    # Array indices of the first match (first axis treated as lat, second as lon)
    lat_idx = coords[0][0]
    lon_idx = coords[1][0]

    # Translate the indices into coordinate values
    lat_val = float(unified_cube.lat[lat_idx])
    lon_val = float(unified_cube.lon[lon_idx])

    evt_at_location = int(evt_data.values[lat_idx, lon_idx])
    print(f"Selected location: Lat {lat_val:.4f}, Lon {lon_val:.4f}")
    print(f"Vegetation type: {evt_at_location}")

    # Temperature through time at the grid point nearest to that location
    temp_series = unified_cube.temperature_temperature.sel(
        lat=lat_val, lon=lon_val, method="nearest"
    )

    # Line plot of the series
    plt.figure(figsize=(12, 6))
    temp_series.plot(marker='o')
    plt.title(f"Temperature Time Series for Vegetation Type {int(common_evt)}")
    plt.xlabel("Date")
    plt.ylabel("Temperature (°C)")
    plt.grid(True)
    plt.tight_layout()
    plt.show()

## Save the Combined Datacube

Finally, let's save our combined datacube for future use.

In [None]:
# Write the datacube to NetCDF, creating the target folder first if needed
output_path = "./data/processed/evt_temperature_datacube.nc"
output_dir = os.path.dirname(output_path)
os.makedirs(output_dir, exist_ok=True)

builder.save_datacube(output_path)
print(f"Saved combined datacube to {output_path}")

## Conclusion

In this notebook, we've demonstrated how to:

1. Download and process LANDFIRE Existing Vegetation Type (EVT) data
2. Convert the data to an xarray Dataset format
3. Reproject the data to latitude/longitude coordinates
4. Apply spatial bucketing using mode (most common value) aggregation for categorical data
5. Combine the EVT data with temperature data in a unified datacube
6. Analyze relationships between vegetation types and temperature
7. Save the combined datacube for future use

This approach can be extended to include additional datasets and to analyze more complex relationships between vegetation types and climate variables.