In [1]:
# Import Libraries
import os
import sys
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import geopandas as gpd
from pymongo import MongoClient
from dotenv import load_dotenv
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
# Make the sibling 'scripts' directory importable so the project helper modules can be loaded
scripts_dir = os.path.join('..', 'scripts')
sys.path.append(os.path.abspath(scripts_dir))

In [3]:
import extract_to_mongodb as etm
import db_utils as dbu

In [None]:
# print(dir(dbu))

In [4]:
# Load variables from a local .env file (if present) into the process environment.
# load_dotenv() does not override variables that are already set, so calling it
# here is safe even if another module has loaded the same file earlier.
load_dotenv()

# Name of the MongoDB collection that is passed to the data loader below.
collection_name = os.getenv('COLLECTION_NAME_CLEANED')
# Value of NATURALEARTH_SHAPEFILE_PATH; None when the variable is unset.
naturalearth_lowres = os.getenv('NATURALEARTH_SHAPEFILE_PATH')

# Fail early with a clear message instead of handing None to the loader.
if not collection_name:
    raise ValueError("Environment variable COLLECTION_NAME_CLEANED is not set")

In [5]:
# Echo the resolved collection name as a quick check that the environment variable was read
print(f"Collection Name: {collection_name}")

Collection Name: wildfire_cleaned_data


Load the Data

In [7]:
# Load the cleaned data
# Reads the collection named by `collection_name` through the project helper in db_utils.
# NOTE(review): the saved output of this cell shows "An error occurred: name
# 'reverse_geocode_nominatim' is not defined", so the helper may not have returned
# usable data in that run — confirm what it returns before relying on geo_wfp.
geo_wfp = dbu.load_data_from_mongodb(collection_name)

INFO:pymongo.client:You appear to be connected to a CosmosDB cluster. For more information regarding feature compatibility and support please visit https://www.mongodb.com/supportability/cosmosdb


An error occurred: name 'reverse_geocode_nominatim' is not defined


In [None]:
# Preview the first five rows of the loaded data
geo_wfp.head(5)

In [None]:
# Print a heading, then show summary statistics; the describe() result is the cell's displayed value
print("\nDescribe the GeoDataFrame:")
geo_wfp.describe()

In [None]:
# Report how many null entries each column contains
print("\nMissing values in the GeoDataFrame:")
null_counts = geo_wfp.isnull().sum()
print(null_counts)

In [None]:
# Remove the 'geometry' column in place, then count fully duplicated rows
# (the count is the cell's displayed value).
# NOTE(review): the in-place drop raises KeyError if this cell is executed a second
# time, because 'geometry' no longer exists after the first run.
geo_wfp.drop(columns=['geometry'], inplace=True)
geo_wfp.duplicated().sum()

Feature Engineering

In [None]:
import geopandas as gpd
import requests

In [None]:
# Preview the first five rows before any engineered features are added
geo_wfp.head(5)

In [None]:
# Ensuring 'rep_date' is in datetime format is essential to generating time-based features
# (the `.dt` accessor used in the next cell only works on datetime-like columns).
# With default arguments pd.to_datetime raises on values it cannot parse.
geo_wfp['rep_date'] = pd.to_datetime(geo_wfp['rep_date'])

In [None]:
# Derive calendar features from the report timestamp
report_dt = geo_wfp['rep_date'].dt
for part in ('year', 'month', 'day', 'dayofweek'):
    # Each name is both the `.dt` attribute read and the column written
    geo_wfp[part] = getattr(report_dt, part)
# Week number taken from the ISO calendar (isocalendar() returns year/week/day columns)
geo_wfp['weekofyear'] = report_dt.isocalendar().week

In [None]:
# Drop unnecessary columns
# 'rep_date' is removed here after its calendar parts were extracted in the previous cell;
# the remaining names are presumably the database id and acquisition metadata — TODO confirm.
# NOTE(review): after this cell no full timestamp remains in the frame.
columns_to_drop = ['_id', 'rep_date', 'source', 'sensor', 'satellite']
geo_wfp.drop(columns=columns_to_drop, inplace=True)

In [None]:
# One-hot encode the categorical columns and replace them with their indicator columns
categorical_cols = ['fuel', 'ecozone']
# drop='first' omits the first category of each column; sparse_output=False returns a dense array
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_features = encoder.fit_transform(geo_wfp[categorical_cols])
encoded_df = pd.DataFrame(
    encoded_features,
    columns=encoder.get_feature_names_out(categorical_cols),
)
# Both frames get a fresh 0..n-1 index so the column-wise concat lines up row by row
geo_wfp = pd.concat(
    [geo_wfp.reset_index(drop=True), encoded_df.reset_index(drop=True)],
    axis=1,
).drop(columns=categorical_cols)

In [None]:
# Lag features are previous values of the target variable that help in capturing temporal
# dependencies and will be used as predictors in time series forecasting.
#
# shift() works purely on row order, so the rows must be in chronological order first.
# 'rep_date' has already been dropped, so order by the calendar parts derived from it
# (year, month, day folded into one sortable key). The sort is stable: rows sharing a
# date keep their existing relative order, and data that is already chronological is
# left unchanged.
# NOTE(review): a lag here is "previous row", not "previous day" — several records can
# share one date.
date_key = (geo_wfp['year'] * 10000 + geo_wfp['month'] * 100 + geo_wfp['day']).to_numpy()
chronological = np.argsort(date_key, kind='stable')
geo_wfp = geo_wfp.iloc[chronological].reset_index(drop=True)

for lag in (1, 2, 3):
    geo_wfp[f'cfb_lag{lag}'] = geo_wfp['cfb'].shift(lag)

In [None]:
# Rolling mean and standard deviation of 'cfb' over trailing windows, to capture trend and
# variability. The window is a number of rows (7 and 30), so the first window-1 rows are NaN.
for window in (7, 30):
    cfb_window = geo_wfp['cfb'].rolling(window=window)
    geo_wfp[f'cfb_roll_mean_{window}'] = cfb_window.mean()
    geo_wfp[f'cfb_roll_std_{window}'] = cfb_window.std()

In [None]:
# 'temp_rh_interaction' is the element-wise product of temperature and relative humidity,
# capturing their combined effect, which may be more informative than either column alone.
geo_wfp['temp_rh_interaction'] = geo_wfp['temp'].mul(geo_wfp['rh'])

In [None]:
# Standardize the selected numeric columns so that they share a common scale
features_to_scale = [
    'temp', 'rh', 'ws',
    'cfb_lag1', 'cfb_lag2', 'cfb_lag3',
    'cfb_roll_mean_7', 'cfb_roll_std_7',
    'cfb_roll_mean_30', 'cfb_roll_std_30',
    'temp_rh_interaction',
]

scaler = StandardScaler()
# Fit on the selected columns, then overwrite them with their standardized values
scaler.fit(geo_wfp[features_to_scale])
geo_wfp[features_to_scale] = scaler.transform(geo_wfp[features_to_scale])

In [None]:
# Encode latitude and longitude (degrees) as sine/cosine pairs
for axis in ('lat', 'lon'):
    angle_rad = np.radians(geo_wfp[axis])
    geo_wfp[f'{axis}_sin'] = np.sin(angle_rad)
    geo_wfp[f'{axis}_cos'] = np.cos(angle_rad)

In [None]:
# Drop original longitude and latitude columns
# From here on the coordinates are only available through the *_sin / *_cos columns
# created in the previous cell.
geo_wfp.drop(columns=['lat', 'lon'], inplace=True)

In [None]:
# Convert to GeoDataFrame
# The raw 'lat'/'lon' columns were dropped above, so recover the coordinates in degrees
# from their sine/cosine encodings. The geometry must be built from real longitude and
# latitude values: the cosine components alone are not coordinates and would place every
# point within one degree of (0, 0) once the frame is tagged as EPSG:4326.
lon_deg = np.degrees(np.arctan2(geo_wfp['lon_sin'], geo_wfp['lon_cos']))
lat_deg = np.degrees(np.arctan2(geo_wfp['lat_sin'], geo_wfp['lat_cos']))
geo_wfp = gpd.GeoDataFrame(geo_wfp, geometry=gpd.points_from_xy(lon_deg, lat_deg))
# Declare (not reproject) the coordinates as EPSG:4326 longitude/latitude
geo_wfp.set_crs(epsg=4326, inplace=True)

In [None]:
# Preview the first rows (pandas default: five) after feature engineering
geo_wfp.head()

In [None]:
# Print column dtypes and non-null counts for the engineered frame
geo_wfp.info()

In [None]:
# List the column labels, including the one-hot and engineered columns added above
geo_wfp.columns

## Exploratory Data Analysis

In [None]:
# Columns shown in the correlation heatmap: weather inputs, engineered cfb features, and 'cfb' itself
weather_cols = ['temp', 'rh', 'ws']
lag_cols = [f'cfb_lag{k}' for k in (1, 2, 3)]
rolling_cols = [f'cfb_roll_{stat}_{win}' for win in (7, 30) for stat in ('mean', 'std')]
relevant_columns = weather_cols + lag_cols + rolling_cols + ['temp_rh_interaction', 'cfb']

In [None]:
# Annotated heatmap of the pairwise correlations between the selected columns
fig, ax = plt.subplots(figsize=(15, 10))
corr_matrix = geo_wfp[relevant_columns].corr()
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

Temperature (temp):
Correlation with CFB: Positive correlation of 0.25, indicating that as temperature increases, CFB tends to increase.
Correlation with RH: Negative correlation of -0.35, suggesting that higher temperatures are often associated with lower relative humidity.

Relative Humidity (rh):
Correlation with CFB: Negative correlation of -0.32, indicating that lower humidity levels are associated with higher CFB values.
Correlation with Temperature: As mentioned, there is a negative correlation with temperature.

Wind Speed (ws):
Correlation with CFB: Weak positive correlation of 0.06, indicating minimal direct impact on CFB.

Lag Features (cfb_lag1, cfb_lag2, cfb_lag3):
Correlation with CFB: Positive correlations (0.37, 0.32, 0.30), indicating that previous CFB values are good predictors of current CFB values.
Inter-Correlation: Strong inter-correlation among lag features, especially between adjacent lags.

Rolling Mean Features (cfb_roll_mean_7, cfb_roll_mean_30):
Correlation with CFB: Strong positive correlations (0.63 and 0.52 respectively). These features effectively capture the trend and are important for forecasting.

Rolling Standard Deviation Features (cfb_roll_std_7, cfb_roll_std_30):
Correlation with CFB: Moderate correlations (0.26 and 0.23), suggesting that variability over these periods has some predictive power.

Interaction Feature (temp_rh_interaction):
Correlation with CFB: Weak negative correlation (-0.12), indicating that the interaction between temperature and humidity has a small inverse relationship with CFB.
Correlation with Individual Features: Moderate positive correlation with temperature (0.43) and strong positive correlation with relative humidity (0.63), as expected because the feature is the product of the two.

In [None]:
# One horizontal boxplot per selected feature, stacked vertically in a single figure
columns_to_visualize = ['ws', 'pcp', 'dmc', 'dc', 'ros', 'hfi', 'cfl', 'tfc0']

fig, axes = plt.subplots(nrows=len(columns_to_visualize), ncols=1, figsize=(12, 8))
for ax, column in zip(axes, columns_to_visualize):
    sns.boxplot(x=geo_wfp[column], color='skyblue', ax=ax)
    ax.set_title(f'Boxplot of {column}')
plt.tight_layout()
plt.show()

Wind Speed (ws): Directly influences fire spread.
Precipitation (pcp): Affects fuel moisture and ignition likelihood.
Duff Moisture Code (dmc): Indicates moisture content in organic materials.
Drought Code (dc): Long-term indicator of dry conditions.
Rate of Spread (ros): Critical for understanding fire dynamics.
Head Fire Intensity (hfi): Measures energy release from the fire.
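Crown Fuel Load (cfl): Amount of canopy fuel available to burn, which relates to the impact on forest structure. (Crown Fraction Burned is the separate `cfb` column used as the target.)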
Total Fuel Consumption (tfc0): Provides insights into fire intensity.

In [None]:
# Create a 'year_month' column for grouping
# The month is zero-padded so the labels read 'YYYY-MM'. Unpadded labels sort
# lexicographically ('2020-10' before '2020-2'), which would scramble the order of any
# groupby or plot keyed on this column.
geo_wfp['year_month'] = geo_wfp['year'].astype(str) + '-' + geo_wfp['month'].astype(str).str.zfill(2)

In [None]:
# Average 'cfb' within each 'year_month' group (result is indexed by the group label)
monthly_avg_cfb = geo_wfp['cfb'].groupby(geo_wfp['year_month']).mean()

In [None]:
# Generate all month labels ('YYYY-MM') from January 2020 through December 2023.
# A monthly period range is used instead of date_range(freq='M'): 'M' as a month-end
# alias for date_range is deprecated in recent pandas, while monthly periods yield the
# same 48 labels on old and new versions alike.
all_months = pd.period_range(start='2020-01', end='2023-12', freq='M').strftime('%Y-%m')

In [None]:
# Plot the time series of the monthly mean CFB
plt.figure(figsize=(15, 5))
monthly_avg_cfb.plot()
plt.title('Time Series of CFB by Month')
plt.xlabel('Date')
plt.ylabel('CFB')
# Label every tick with the month that is actually plotted at that position. A fixed
# calendar list would mislabel the points whenever the data does not contain exactly
# those months in exactly that order.
month_labels = monthly_avg_cfb.index.astype(str)
plt.xticks(ticks=range(len(month_labels)), labels=month_labels, rotation=90)
plt.show()

CFB peaks during certain months, particularly in the summer (June and July). This indicates a seasonal trend in which wildfires are more severe in the summer months.

In [None]:
# Line chart of the mean CFB per year, with one tick per year present in the data
fig, ax = plt.subplots(figsize=(15, 5))
annual_avg_cfb = geo_wfp.groupby('year')['cfb'].mean().reset_index()
years = annual_avg_cfb['year']
ax.plot(years, annual_avg_cfb['cfb'], marker='o')
ax.set_title('Time Series of CFB by Year')
ax.set_xlabel('Year')
ax.set_ylabel('CFB')
ax.grid(True)
ax.set_xticks(years)
plt.show()

In [None]:
# Scatter-matrix of the boxplot features plus 'cfb', to eyeball pairwise relationships
pairplot_vars = [*columns_to_visualize, 'cfb']
sns.pairplot(geo_wfp, vars=pairplot_vars)
plt.show()

Temperature (temp) vs. cfb:

A positive relationship where higher temperatures generally correlate with higher cfb values.
This confirms the importance of temperature in influencing wildfire severity.
Relative Humidity (rh) vs. cfb:

A negative relationship where lower humidity levels correlate with higher cfb values.
Highlights the role of dry conditions in exacerbating wildfires.
Wind Speed (ws) vs. cfb:

The relationship is less clear, with cfb values spread across different wind speeds.
Wind speed might have an indirect or situational impact rather than a direct one.
Drought Code (dc) vs. cfb:

Higher dc values (indicating severe drought conditions) show a spread across higher cfb values.
Suggests that drought conditions contribute significantly to wildfire severity.
Rate of Spread (ros) vs. cfb:

Positive correlation where higher ros values (faster spreading fires) are associated with higher cfb values.
Indicates that fast-spreading fires are likely more severe.
Historical Fire Data (Lag Features) vs. cfb:

Positive correlations with previous cfb values, particularly cfb_lag1, cfb_lag2, and cfb_lag3.
Shows the importance of historical fire activity in predicting current fire severity.
Rolling Statistics:

cfb_roll_mean_7 and cfb_roll_mean_30 show strong positive correlations with cfb.
Indicates that smoothed trends over time are valuable predictors of current fire severity.

In [None]:
# Number of records per (year, month), plotted against the first day of each month
monthly_incidents = (
    geo_wfp.groupby(['year', 'month'])
    .size()
    .reset_index(name='count')
)
# pd.to_datetime assembles a date from year/month/day columns; day is fixed to 1
first_of_month = monthly_incidents[['year', 'month']].assign(day=1)
monthly_incidents['date'] = pd.to_datetime(first_of_month)

fig, ax = plt.subplots(figsize=(15, 5))
sns.lineplot(data=monthly_incidents, x='date', y='count', marker='o', ax=ax)
ax.set_title('Wildfire Incidents by Month')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Incidents')
ax.grid(True)
plt.show()

In [None]:
# Count the records that fall in each year
wildfire_incidents_by_year = geo_wfp.groupby('year').size()

# Bar chart of the yearly counts
plt.figure(figsize=(12, 6))
wildfire_incidents_by_year.plot.bar(color='firebrick', alpha=0.7)
plt.xticks(rotation=45)
plt.title('Wildfire Incidents by Year')
plt.xlabel('Year')
plt.ylabel('Number of Incidents')
plt.show()

Based on these insights, the features below will help capture the temporal dependencies and environmental conditions influencing wildfire severity:

1. Lag Features: cfb_lag1, cfb_lag2, cfb_lag3

2. Rolling Statistics: cfb_roll_mean_7, cfb_roll_mean_30

3. Weather Variables: temp, rh, ws

4. Interaction Terms: temp_rh_interaction

5. Time-Based Features: year, month, day, dayofweek, weekofyear

6. Encoded Categorical Variables: Encoded fuel and ecozone (agency is not one-hot encoded in the code above; it is listed under "Features to Drop")


# Features to Keep
Target Variable: cfb (to predict the wildfire severity)

Time-based Features:
year
month
day

Weather Conditions:
temp (Temperature)
rh (Relative Humidity)
ws (Wind Speed)
pcp (Precipitation)

Fire Weather Index System Components:
ffmc (Fine Fuel Moisture Code)
dmc (Duff Moisture Code)
dc (Drought Code)
isi (Initial Spread Index)
bui (Build-Up Index)
fwi (Fire Weather Index)

Topography and Vegetation:
elev (Elevation)
sfc (Surface Fuel Consumption)
tfc (Total Fuel Consumption)
sfc0 (Surface Fuel Consumption at Initial Spread)
cfl (Crown Fuel Load)
tfc0 (Total Fuel Consumption at Initial Spread)

Latitude and Longitude (Cyclical Encoding):
lat_sin, lat_cos, lon_sin, lon_cos

# Features to Drop

agency

Categorical Variables After Encoding:
Original fuel, ecozone columns after encoding

Geospatial Data:
Original lat, lon

Lag Features and Rolling Statistics:
cfb_lag1, cfb_lag2, cfb_lag3
cfb_roll_mean_7, cfb_roll_std_7, cfb_roll_mean_30, cfb_roll_std_30

temp_rh_interaction

dayofweek
weekofyear

In [None]:
# Candidate columns to remove: the one-hot indicator columns for fuel and ecozone, the
# geometry, the original categorical columns, and the lag / rolling / interaction /
# calendar helper features listed under "Features to Drop".
fuel_codes = [
    'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'D1', 'D2',
    'M1_25', 'M1_35', 'M1_50', 'M1_65', 'M1_75',
    'M2_25', 'M2_35', 'M2_50', 'M2_65', 'M2_75',
    'O1a', 'O1b', 'S1', 'S2', 'bog', 'farm', 'low_veg',
    'non_fuel', 'unknown', 'urban', 'water',
]
ecozone_codes = [
    '1', '10', '11', '12', '13', '14', '15', '2', '3', '4',
    '5', '5a', '5b', '6', '6a', '6b', '7', '8', '9',
]
columns_to_drop_final = (
    [f'fuel_{code}' for code in fuel_codes]
    + [f'ecozone_{code}' for code in ecozone_codes]
    + ['geometry', 'agency', 'fuel', 'ecozone']
    + ['cfb_lag1', 'cfb_lag2', 'cfb_lag3']
    + ['cfb_roll_mean_7', 'cfb_roll_std_7', 'cfb_roll_mean_30', 'cfb_roll_std_30']
    + ['temp_rh_interaction', 'dayofweek', 'weekofyear']
)

# Keep only the names that are actually present, so the drop never raises on a missing label
present_columns = set(geo_wfp.columns)
columns_to_drop_final = [col for col in columns_to_drop_final if col in present_columns]

# Remove them from the frame in place
geo_wfp.drop(columns=columns_to_drop_final, inplace=True)

In [None]:
# Print the final columns of the DataFrame
# (the labels that remain after the drop in the previous cell)
print("Final columns of the DataFrame:")
print(geo_wfp.columns)

In [None]:
# Print dtypes and non-null counts of the final feature set
geo_wfp.info()

In [None]:
from geopy.geocoders import Nominatim
import math

In [None]:
# Analysis only; this export can be removed.
# Writes the engineered frame to 'engineered_wildfire_data.csv' in the current working
# directory, without the index column.
geo_wfp.to_csv('engineered_wildfire_data.csv', index=False)

In [None]:
# Save the csv featured engineering data to mongodb
# dbu.insert_data_to_mongodb('engineered_wildfire_data.csv', os.getenv('COLLECTION_NAME_FEATUREENGINEERED'))


In [None]:
# Save the dataframe -  featured engineering data to mongodb
# dbu.insert_df_only_to_mongodb(geo_wfp, os.getenv('COLLECTION_NAME_FEATUREENGINEERED'))