In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the merged records from ../data/merged_firms_weather.json.
# The encoding is pinned to UTF-8 (the encoding JSON is exchanged in) instead of
# relying on the platform default, which is not UTF-8 on every OS.
merged_data_path = os.path.join('..', 'data', 'merged_firms_weather.json')
with open(merged_data_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
# Flatten every merged item into one row for easier manipulation.
# Keys are prefixed with the name of the section they came from
# ("event_", "weather_", "vegetation_").
records = []
for item in data:
    # The event section is mandatory: a missing 'event' key raises KeyError.
    record = {f"event_{key}": value for key, value in item['event'].items()}

    # Weather and vegetation are optional and handled identically: a section
    # that is absent, None or empty simply contributes no columns.
    for section in ('weather', 'vegetation'):
        section_values = item.get(section)
        if section_values:
            record.update(
                (f"{section}_{key}", value) for key, value in section_values.items()
            )

    records.append(record)

# Rows lacking a given key get NaN in that column.
df = pd.DataFrame(records)

# Feature Engineering

In [None]:
# 1. Calculate NDVI anomaly
# This measures how much an event's NDVI deviates from a reference NDVI for the
# same area. For demonstration the reference is simulated from this dataset
# itself - an actual implementation would use historical data.

# Number of decimal places used to snap coordinates onto a coarse grid.
# Grouping on exact coordinates makes every distinct point its own group; when
# points are (near-)unique each group mean equals the row's own NDVI and the
# anomaly collapses to zero. One decimal of latitude is roughly an 11 km cell;
# tune this to the resolution of the NDVI source.
NDVI_GRID_DECIMALS = 1

if {'vegetation_ndvi', 'event_latitude', 'event_longitude'}.issubset(df.columns):
    # Coerce to numeric first so rounding also works if the coordinates were
    # stored as strings in the JSON; unparseable values become NaN.
    lat_cell = pd.to_numeric(df['event_latitude'], errors='coerce').round(NDVI_GRID_DECIMALS)
    lon_cell = pd.to_numeric(df['event_longitude'], errors='coerce').round(NDVI_GRID_DECIMALS)

    # Simulated historical average NDVI (would come from historical data):
    # transform('mean') computes the mean NDVI per grid cell and broadcasts it
    # back to every row of that cell in one vectorized step, so the column is
    # always created (no per-row loop that might never assign it).
    df['ndvi_historical_avg'] = (
        df['vegetation_ndvi'].groupby([lat_cell, lon_cell]).transform('mean')
    )

    # Calculate anomaly
    df['ndvi_anomaly'] = df['vegetation_ndvi'] - df['ndvi_historical_avg']


In [None]:
# 2. Calculate Normalized Difference Water Index (NDWI)
# For demonstration - in real implementation, would come from satellite data
# NDWI = (NIR - SWIR) / (NIR + SWIR)
# We'll simulate NDWI based on NDVI and precipitation for this example
if 'vegetation_ndvi' in df.columns and 'weather_precip' in df.columns:
    # This is a simplified approximation - real NDWI would come from satellite data.
    # np.log1p(x) already evaluates log(1 + x), so precipitation is passed in
    # unchanged; adding a further 1 would yield log(2 + x) and give even
    # zero-precipitation rows a moisture term of about 0.69 instead of 0.
    df['ndwi_approx'] = 0.5 * df['vegetation_ndvi'] + 0.5 * np.log1p(df['weather_precip'])


In [None]:
# 3. Calculate Vegetation Health Index (VHI)
# VHI blends a vegetation signal (NDVI) with a temperature signal
vhi_inputs = ('vegetation_ndvi', 'weather_temp')
if all(column in df.columns for column in vhi_inputs):
    scaler = StandardScaler()
    # Only proceed when each input has at least one non-missing value
    if all(df[column].notna().any() for column in vhi_inputs):
        # Each fit_transform call refits the scaler, so a single instance is
        # reused for both columns; the (n, 1) results are flattened to 1-D
        ndvi_scaled = scaler.fit_transform(df[['vegetation_ndvi']]).ravel()
        temp_scaled = scaler.fit_transform(df[['weather_temp']]).ravel()

        # Temperature enters with a negative sign: higher temps typically
        # mean lower vegetation health
        # VHI = 0.5 * (NDVI factor) + 0.5 * (Temperature factor)
        df['vhi'] = 0.5 * ndvi_scaled - 0.5 * temp_scaled


In [None]:
# 4. Time-based features tied to the seasonal vegetation cycle
if 'event_date' in df.columns:
    # Parse once, then derive everything from the parsed series
    event_dates = pd.to_datetime(df['event_date'])
    df['event_date'] = event_dates
    df['day_of_year'] = event_dates.dt.dayofyear

    # Absolute distance in days from day 180, used here as an approximation of
    # peak greenness (summer in the Northern Hemisphere)
    df['days_from_peak_greenness'] = (df['day_of_year'] - 180).abs()


# Visualize relationships


In [None]:
# NDVI distribution across fire events
# Guarded on column presence: vegetation data is optional per record, so the
# 'vegetation_ndvi' column does not exist when no record carried it and the
# plot call would otherwise fail.
if 'vegetation_ndvi' in df.columns:
    plt.figure(figsize=(10, 6))
    sns.boxplot(x='vegetation_ndvi', data=df)
    plt.title('NDVI Distribution in Fire Events')
    plt.tight_layout()
    plt.show()


In [None]:
# Scatter of NDVI against temperature (drawn only when both columns exist)
if {'vegetation_ndvi', 'weather_temp'} <= set(df.columns):
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(data=df, x='weather_temp', y='vegetation_ndvi', ax=ax)
    ax.set_title('NDVI vs Temperature')
    fig.tight_layout()
    plt.show()


In [None]:
# Scatter of the NDVI anomaly against temperature (drawn only when both columns exist)
if {'ndvi_anomaly', 'weather_temp'} <= set(df.columns):
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(data=df, x='weather_temp', y='ndvi_anomaly', ax=ax)
    ax.set_title('NDVI Anomaly vs Temperature')
    # Red reference line at zero anomaly
    ax.axhline(y=0, color='r', linestyle='-')
    fig.tight_layout()
    plt.show()


In [None]:
# Write the feature table to CSV (without the index) and report where it went
enhanced_features_path = os.path.join('..', 'data', 'enhanced_features.csv')
df.to_csv(enhanced_features_path, index=False)
print(f"Enhanced features saved to {enhanced_features_path}")