##### Wildfire Prediction Model Prototype
##### Purpose: Test modeling approaches before implementing in production code

In [1]:
import os
import logging
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

import sys
sys.path.append('..')  # Add parent directory to path
from src.data_ingestion.weather_data_service import WeatherDataService


In [2]:
# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')

In [3]:
# Load data
def load_data():
    """Load NASA FIRMS data from the data directory."""
    try:
        data_path = os.path.join('..', 'data', 'nasa_firms_data.json')
        df = pd.read_csv(data_path)
        logging.info(f"Loaded data with shape: {df.shape}")
        return df
    except FileNotFoundError as e:
        logging.error(f"Data file not found: {e}")
        raise
    except Exception as e:
        logging.error(f"Error loading data: {e}")
        raise

In [4]:
# Preprocessing function
def preprocess_data(df):
    """Clean and prepare data for modeling."""
    # Convert date to datetime
    df['acq_date'] = pd.to_datetime(df['acq_date'])
       
    # Check for and handle missing values
    if df.isnull().sum().any():
       df = df.dropna()  # or use imputation
       
    # Filter for US region (if needed)
    df = df[(df['latitude'] >= 24) & (df['latitude'] <= 50) & 
           (df['longitude'] >= -125) & (df['longitude'] <= -66)]
    return df

In [5]:
# Feature engineering function
def engineer_features(df):
    """Create features for wildfire prediction."""
    # Create grid cells (e.g., 0.1° x 0.1°)
    df['lat_grid'] = np.floor(df['latitude'] * 10) / 10
    df['lon_grid'] = np.floor(df['longitude'] * 10) / 10

    # Aggregate by grid cell and date
    grid_counts = df.groupby(['lat_grid', 'lon_grid', 'acq_date']).size().reset_index(name='fire_count')
       
    # Get unique date range from data
    start_date = df['acq_date'].min().strftime('%Y-%m-%d')
    end_date = df['acq_date'].max().strftime('%Y-%m-%d')
       
    # Initialize weather service
    weather_service = WeatherDataService()

    # Define region bounds for the data (use USA bounds to match your bulk data)
    region_bounds = {
        'min_lat': 24.0, 'max_lat': 50.0,
        'min_lon': -125.0, 'max_lon': -66.0
    }
       
    # Fetch weather data from bulk file
    weather_service.fetch_and_store_weather_data(start_date, end_date, region_bounds)
       
    # Get weather data for our grid cells
    df_with_weather = weather_service.get_weather_for_locations(grid_counts)
       
    # Rename weather columns to match our expected feature names
    if df_with_weather is not None:
        df_with_weather = df_with_weather.rename(columns={
            'temperature_max': 'temperature',
            'wind_speed_max': 'wind_speed',
            'humidity_max': 'humidity'
        })
        return df_with_weather
    else:
        # If we still couldn't get weather data, raise an error
        raise ValueError("Could not obtain weather data. Please check the bulk_weather_data.csv file.")

In [6]:
def train_model(X_train, y_train):
    """Train a wildfire prediction model."""
    # Handle class imbalance
    # (Most grid cells will not have fires)
       
    # Initialize and train model
    model = RandomForestClassifier(n_estimators=100, class_weight='balanced')
    model.fit(X_train, y_train)
    return model

In [9]:
# Main workflow
try:
    # Load data
    df = load_data()
    
    # Preprocess
    df_clean = preprocess_data(df)

    df_clean = df_clean.copy()
    
    # Engineer features with better error handling
    try:
        # Initialize weather service
        weather_service = WeatherDataService()
        
        # Create grid cells
        df_clean['lat_grid'] = np.floor(df_clean['latitude'] * 10) / 10
        df_clean['lon_grid'] = np.floor(df_clean['longitude'] * 10) / 10
        
        # Aggregate by grid cell and date
        grid_counts = df_clean.groupby(['lat_grid', 'lon_grid', 'acq_date']).size().reset_index(name='fire_count')
        
        # Get date range
        start_date = df_clean['acq_date'].min().strftime('%Y-%m-%d')
        end_date = df_clean['acq_date'].max().strftime('%Y-%m-%d')
        
        # Define region bounds
        region_bounds = {
            'min_lat': 24.0, 'max_lat': 50.0,
            'min_lon': -125.0, 'max_lon': -66.0
        }
        
        # MANUAL FIX: Create a proper weather dataframe directly from the CSV
        import os
        bulk_weather_file = os.path.join('..', 'data', 'bulk_weather_data.csv')
        weather_df = pd.read_csv(bulk_weather_file)
        
        # Add coordinates for USA data
        weather_df['latitude'] = 39.8  # Center of continental US
        weather_df['longitude'] = -98.5
        
        # Convert datetime to proper date format
        weather_df['date'] = pd.to_datetime(weather_df['datetime']).dt.date
        
        # Save to parquet for the service to use
        weather_parquet = os.path.join('..', 'data', 'weather_data.parquet')
        weather_df.to_parquet(weather_parquet, index=False)
        
        # Now get weather for locations
        df_features = weather_service.get_weather_for_locations(grid_counts)
        
        logging.info(f"Available columns in weather data: {df_features.columns.tolist()}")
        
        # Ensure we have the right column names
        if 'tempmax' in df_features.columns and 'temperature' not in df_features.columns:
            df_features['temperature'] = df_features['tempmax']
        if 'windspeed' in df_features.columns and 'wind_speed' not in df_features.columns:
            df_features['wind_speed'] = df_features['windspeed']
        if 'precip' in df_features.columns and 'precipitation' not in df_features.columns:
            df_features['precipitation'] = df_features['precip']
        if 'humidity' not in df_features.columns and 'humidity_max' in df_features.columns:
            df_features['humidity'] = df_features['humidity_max']
        
        # Define features and target
        feature_cols = ['temperature', 'precipitation', 'wind_speed', 'humidity']
        
        # Check if all required columns exist
        missing_cols = [col for col in feature_cols if col not in df_features.columns]
        if missing_cols:
            logging.warning(f"Missing columns: {missing_cols}. Adding default values.")
            for col in missing_cols:
                if col == 'temperature':
                    df_features[col] = 25.0  # Default temperature
                elif col == 'precipitation':
                    df_features[col] = 0.0   # Default precipitation
                elif col == 'wind_speed':
                    df_features[col] = 10.0  # Default wind speed
                elif col == 'humidity':
                    df_features[col] = 50.0  # Default humidity
        
        # Define features and target
        X = df_features[feature_cols]
        y = df_features['fire_count'] > 0  # Binary classification: fire or no fire
        
        # Split data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
        
        # Scale features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        
        # Train model
        model = train_model(X_train_scaled, y_train)
        
        # Evaluate
        y_pred = model.predict(X_test_scaled)
        print(classification_report(y_test, y_pred))
        
        # Feature importance
        feature_importance = pd.DataFrame({
            'Feature': feature_cols,
            'Importance': model.feature_importances_
        }).sort_values('Importance', ascending=False)
        
        print("\nFeature Importance:")
        print(feature_importance)
        
    except Exception as e:
        logging.error(f"Error in feature engineering or modeling: {e}")
        import traceback
        traceback.print_exc()
        
except Exception as e:
    logging.error(f"Error in model prototype: {e}")
    import traceback
    traceback.print_exc()

2025-05-13 09:56:45,834 INFO Loaded data with shape: (998, 13)
2025-05-13 09:56:45,839 INFO Weather data will be saved to: /workspace/data
2025-05-13 09:56:45,866 INFO Available columns in weather data: ['lat_grid', 'lon_grid', 'acq_date', 'fire_count', 'date', 'name', 'datetime', 'tempmax', 'tempmin', 'temp', 'feelslikemax', 'feelslikemin', 'feelslike', 'dew', 'humidity', 'precip', 'precipprob', 'precipcover', 'preciptype', 'snow', 'snowdepth', 'windgust', 'windspeed', 'winddir', 'sealevelpressure', 'cloudcover', 'visibility', 'solarradiation', 'solarenergy', 'uvindex', 'severerisk', 'sunrise', 'sunset', 'moonphase', 'conditions', 'description', 'icon', 'stations', 'latitude', 'longitude']


              precision    recall  f1-score   support

        True       1.00      1.00      1.00        55

    accuracy                           1.00        55
   macro avg       1.00      1.00      1.00        55
weighted avg       1.00      1.00      1.00        55


Feature Importance:
         Feature  Importance
0    temperature         0.0
1  precipitation         0.0
2     wind_speed         0.0
3       humidity         0.0


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
