
### Predicting Burnt Area During Forest Fires
##### This notebook analyzes and predicts the area burnt by forest fires in Portugal between 2009 and 2018, based on the meteorological and other related conditions recorded for each incident.
    

In [38]:

# Install the third-party packages this notebook imports into the active kernel's
# environment. Versions are not pinned, so a fresh install may pull newer releases.
%pip install pandas kagglehub matplotlib seaborn scikit-learn numpy
    

Note: you may need to restart the kernel to use updated packages.


In [39]:

import os
import pandas as pd
import kagglehub
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np



## Download Dataset
Downloads the latest version of the dataset using KaggleHub.
    

In [40]:

# Download the "sumitm004/forest-fire-area" dataset through KaggleHub and keep the
# returned location; the load cell reads forestfires.csv from this directory.
dataset_path = kagglehub.dataset_download("sumitm004/forest-fire-area")
# Echo where the files ended up.
# NOTE(review): the printed value is an absolute local path (it includes the user's
# home directory) — consider clearing this output before sharing the notebook.
print(f"Dataset downloaded to: {dataset_path}")


Dataset downloaded to: C:\Users\anish\.cache\kagglehub\datasets\sumitm004\forest-fire-area\versions\6



## Load Dataset
Loads the dataset and preprocesses categorical variables.
    

In [41]:

# Read the raw CSV from the downloaded dataset directory.
csv_path = os.path.join(dataset_path, "forestfires.csv")
raw_df = pd.read_csv(csv_path)

# One-hot encode the two categorical columns. drop_first=True omits the first
# level of each variable, leaving k-1 indicator columns per variable.
df = pd.get_dummies(raw_df, columns=['month', 'day'], drop_first=True)
df.head()


Unnamed: 0,X,Y,FFMC,DMC,DC,ISI,temp,RH,wind,rain,...,month_may,month_nov,month_oct,month_sep,day_mon,day_sat,day_sun,day_thu,day_tue,day_wed
0,7,5,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,...,False,False,False,False,False,False,False,False,False,False
1,7,4,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,...,False,False,True,False,False,False,False,False,True,False
2,7,4,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,...,False,False,True,False,False,True,False,False,False,False
3,8,6,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,...,False,False,False,False,False,False,False,False,False,False
4,8,6,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,...,False,False,False,False,False,False,True,False,False,False



## Preprocess Data
Standardizes numerical features using StandardScaler.
    

In [42]:

# Columns to standardize (zero mean, unit variance); the one-hot month/day
# indicators and the `area` target are left as they are.
numerical_features = ['X', 'Y', 'FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH', 'wind', 'rain']

# Fit first, then transform, and write the scaled values back over the originals.
# NOTE(review): the scaler is fitted on the full frame, before the train/test split
# in the modelling cell, so test rows contribute to the fitted mean and std.
scaler = StandardScaler().fit(df[numerical_features])
df[numerical_features] = scaler.transform(df[numerical_features])
df.head()


Unnamed: 0,X,Y,FFMC,DMC,DC,ISI,temp,RH,wind,rain,...,month_may,month_nov,month_oct,month_sep,day_mon,day_sat,day_sun,day_thu,day_tue,day_wed
0,1.0083,0.5699,-0.806,-1.3233,-1.8305,-0.8609,-1.8426,0.4117,1.4986,-0.0733,...,False,False,False,False,False,False,False,False,False,False
1,1.0083,-0.244,-0.0081,-1.1795,0.4889,-0.5097,-0.1533,-0.6925,-1.7418,-0.0733,...,False,False,True,False,False,False,False,False,True,False
2,1.0083,-0.244,-0.0081,-1.0498,0.5607,-0.5097,-0.7394,-0.6925,-1.5183,-0.0733,...,False,False,True,False,False,True,False,False,False,False
3,1.4409,1.3837,0.1914,-1.2124,-1.8983,-0.0048,-1.8254,3.2335,-0.0098,0.6032,...,False,False,False,False,False,False,False,False,False,False
4,1.4409,1.3837,-0.2438,-0.931,-1.7986,0.127,-1.291,3.3562,-1.2389,-0.0733,...,False,False,False,False,False,False,True,False,False,False


## Feature Engineering
Creates new features: the log-transformed burnt area (`log_area`) and a temperature × wind interaction term (`fire_potential`).
    

In [45]:

# Derived columns:
#   log_area       - log(1 + area); used as the regression target in the modelling cell.
#   fire_potential - element-wise product of the temp and wind columns.
# NOTE(review): temp and wind were standardized in the preprocessing cell, so
# fire_potential is a product of z-scores, not of raw measurements (two
# below-average values give a positive result) — confirm this is intended.
df = df.assign(
    log_area=np.log1p(df['area']),
    fire_potential=df['temp'] * df['wind'],
)
df.head()


Unnamed: 0,X,Y,FFMC,DMC,DC,ISI,temp,RH,wind,rain,...,month_oct,month_sep,day_mon,day_sat,day_sun,day_thu,day_tue,day_wed,log_area,fire_potential
0,1.0083,0.5699,-0.806,-1.3233,-1.8305,-0.8609,-1.8426,0.4117,1.4986,-0.0733,...,False,False,False,False,False,False,False,False,0.0,-2.7614
1,1.0083,-0.244,-0.0081,-1.1795,0.4889,-0.5097,-0.1533,-0.6925,-1.7418,-0.0733,...,True,False,False,False,False,False,True,False,0.0,0.267
2,1.0083,-0.244,-0.0081,-1.0498,0.5607,-0.5097,-0.7394,-0.6925,-1.5183,-0.0733,...,True,False,False,True,False,False,False,False,0.0,1.1226
3,1.4409,1.3837,0.1914,-1.2124,-1.8983,-0.0048,-1.8254,3.2335,-0.0098,0.6032,...,False,False,False,False,False,False,False,False,0.0,0.018
4,1.4409,1.3837,-0.2438,-0.931,-1.7986,0.127,-1.291,3.3562,-1.2389,-0.0733,...,False,False,False,False,True,False,False,False,0.0,1.5995


## Exploratory Data Analysis
Generates a correlation heatmap and a burnt-area distribution plot.
    

In [None]:

# Pairwise correlations between every column of the frame (features, indicators
# and both versions of the target), drawn as an annotated heatmap.
corr_matrix = df.corr()

fig, ax = plt.subplots(figsize=(14, 10))  # large canvas so the cell annotations stay legible
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    annot_kws={"size": 8},
    linewidths=0.5,
    ax=ax,
)
plt.setp(ax.get_yticklabels(), fontsize=10)  # keep the row labels compact
ax.set_title("Feature Correlation Heatmap", fontsize=12, fontweight='bold')
plt.show()

# Distribution of the raw burnt area, cropped at the 99th percentile so that a
# handful of very large fires do not compress the rest of the plot.
area_cap = df['area'].quantile(0.99)

fig, ax = plt.subplots(figsize=(10, 5))
# Place the 100 bins inside the displayed range. Binning over the full data range
# and cropping the axis afterwards leaves only a fraction of the bins visible.
sns.histplot(df['area'], bins=100, binrange=(0, area_cap), kde=True, ax=ax)
ax.set_title("Burnt Area Distribution")
ax.set_xlabel("Burnt Area (ha)")
ax.set_ylabel("Frequency")
ax.set_xlim(0, area_cap)
plt.show()



## Train and Evaluate Model
Trains a Random Forest model with hyperparameter tuning and evaluates performance.
    

In [None]:

# Predictors: every column except the raw target and its log-transformed version.
features = df.drop(columns=['area', 'log_area'])
# The model is trained on log1p(area) rather than on the raw area.
target = df['log_area']
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Hyperparameter grid: 3 * 3 * 3 * 3 = 81 candidate settings, each scored with
# 5-fold cross-validation on the training split.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(rf, param_grid, cv=5, n_jobs=-1, scoring='r2')
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

# Hold-out evaluation. These three metrics are on the log1p(area) scale,
# because that is the scale of the training target.
y_pred = best_model.predict(X_test)
print(f"Best Parameters: {grid_search.best_params_}")
print(f"MAE: {mean_absolute_error(y_test, y_pred):.4f}")
print(f"MSE: {mean_squared_error(y_test, y_pred):.4f}")
print(f"R² Score: {r2_score(y_test, y_pred):.4f}")

# Undo the log1p transform (np.expm1 is its inverse) so the error can also be
# read in the original units of the `area` column.
y_test_area = np.expm1(y_test)
y_pred_area = np.expm1(y_pred)
print(f"MAE (original area scale): {mean_absolute_error(y_test_area, y_pred_area):.4f}")
print(f"RMSE (original area scale): {np.sqrt(mean_squared_error(y_test_area, y_pred_area)):.4f}")

# Importances of the tuned forest, paired with the feature column names.
feature_importances = pd.DataFrame({'Feature': features.columns, 'Importance': best_model.feature_importances_})
print("Top Features:")
# Leave the table as the cell's last expression so it renders as a rich table
# instead of plain printed text.
feature_importances.sort_values(by='Importance', ascending=False).head(10)
