# 🔥 AI-Based Forest Fire & Smoke Detection

---

## Project Overview

This notebook implements an **end-to-end machine learning pipeline** for forest fire and smoke detection from aerial imagery. The pipeline covers:

1. **Data Loading & EDA** - Statistical analysis and data understanding
2. **Feature Engineering** - Domain-specific feature creation
3. **Feature Selection** - Multiple selection methods
4. **Model Development** - Multi-model comparison
5. **Hyperparameter Tuning** - Optimization with cross-validation
6. **Model Interpretability** - SHAP-based explanations
7. **Spatial Analysis** - Risk heatmap visualization

**Author:** Avirup Roy  
**Date:** January 2026  
**Version:** 1.0.0

---

## 1. Environment Setup & Imports

Import all required libraries and set reproducibility seeds.

In [None]:
# =============================================================================
# IMPORTS & CONFIGURATION
# =============================================================================

# Standard Library
import random
import sys
import json
import warnings
warnings.filterwarnings('ignore')

# Data Manipulation
import pandas as pd
import numpy as np
from scipy import stats
from scipy.stats import ttest_ind, mannwhitneyu, shapiro

# Machine Learning
import sklearn
from sklearn.model_selection import (
    train_test_split, StratifiedKFold, cross_val_score,
    cross_validate, GridSearchCV, RandomizedSearchCV
)
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier,
    AdaBoostClassifier, IsolationForest
)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Evaluation Metrics
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_auc_score,
    roc_curve, precision_recall_curve, matthews_corrcoef,
    average_precision_score, cohen_kappa_score,
    balanced_accuracy_score, log_loss
)
from sklearn.inspection import permutation_importance

# Feature Selection
from sklearn.feature_selection import (
    SelectKBest, f_classif, mutual_info_classif,
    RFE, SelectFromModel, VarianceThreshold, SequentialFeatureSelector
)
from sklearn.decomposition import PCA

# Model Interpretability
import shap

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Model Persistence
import joblib

# Spatial Analysis
from scipy.spatial import distance
from sklearn.cluster import DBSCAN, KMeans

# =============================================================================
# REPRODUCIBILITY CONFIGURATION
# =============================================================================
# One seed constant, applied to both the NumPy and the stdlib generators.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# Notebook-wide defaults for figures and DataFrame rendering.
plt.style.use('seaborn-v0_8-whitegrid')
pd.set_option('display.max_columns', 50)

# Report the interpreter and core library versions next to the seed.
env_report = [
    "=" * 50,
    "ENVIRONMENT INFORMATION",
    "=" * 50,
    f"Python version: {sys.version.split()[0]}",
    f"NumPy version: {np.__version__}",
    f"Pandas version: {pd.__version__}",
    f"Scikit-learn version: {sklearn.__version__}",
    f"Random Seed: {SEED}",
    "=" * 50,
]
print("\n".join(env_report))

## 2. Data Loading & Initial EDA

Load the dataset and perform initial exploratory analysis.

In [None]:
# =============================================================================
# DATA LOADING
# =============================================================================

DATA_PATH = 'Forest Fire Smoke Dataset.xlsx'

# Only the read itself sits inside the try block; the original error is
# chained so the traceback still shows where the lookup failed.
try:
    df = pd.read_excel(DATA_PATH)
except FileNotFoundError as exc:
    raise FileNotFoundError(f"Dataset not found at {DATA_PATH}") from exc
else:
    print(f"✅ Dataset loaded successfully: {DATA_PATH}")

# Dataset Overview
print("\n" + "=" * 50)
print("DATASET OVERVIEW")
print("=" * 50)
print(f"Shape: {df.shape[0]} rows × {df.shape[1]} columns")
print(f"\nColumn Names:\n{df.columns.tolist()}")
print(f"\nData Types:\n{df.dtypes}")
# Count missing values once and show only the columns that have any.
missing_counts = df.isnull().sum()
print(f"\nMissing Values:\n{missing_counts[missing_counts > 0]}")

# Target Distribution
print("\n" + "=" * 50)
print("TARGET DISTRIBUTION")
print("=" * 50)
target_dist = df['fire_label'].value_counts(normalize=True) * 100
# .get() reports 0% for a class that is absent instead of raising KeyError.
print(f"Class 0 (No Fire): {target_dist.get(0, 0.0):.2f}%")
print(f"Class 1 (Fire):    {target_dist.get(1, 0.0):.2f}%")

# Display first rows
df.head()

In [None]:
# =============================================================================
# DESCRIPTIVE STATISTICS
# =============================================================================

# Banner first; the transposed summary (one row per column) is left as the
# final expression of the cell.
print("DESCRIPTIVE STATISTICS", "=" * 50, sep="\n")
df.describe().transpose()

## 3. Statistical Analysis

Perform normality tests and statistical significance tests for features.

In [None]:
# =============================================================================
# STATISTICAL ANALYSIS
# =============================================================================

# Every column except the target label is analysed as a feature.
continuous_features = df.drop(columns=['fire_label']).columns.tolist()

def test_normality(data, feature_name):
    """Test one feature for normality using the Shapiro-Wilk test.

    Missing values are dropped first, then at most 5000 observations are
    sampled (seeded with ``SEED``) and passed to ``scipy.stats.shapiro``.

    Args:
        data: pandas Series holding the feature values.
        feature_name: Label stored under ``'feature'`` in the result.

    Returns:
        dict with keys ``'feature'``, ``'p_value'`` and ``'is_normal'``.
        ``is_normal`` is True when ``p_value > 0.05``. When fewer than three
        non-missing values remain the test is not run: ``p_value`` is NaN
        and ``is_normal`` is False.
    """
    # NaNs would otherwise propagate into the statistic and p-value.
    clean = data.dropna()
    if len(clean) < 3:
        return {'feature': feature_name, 'p_value': float('nan'), 'is_normal': False}

    sample = clean.sample(min(5000, len(clean)), random_state=SEED)
    _, p_value = shapiro(sample)
    # bool() keeps the flag a plain Python bool rather than a NumPy scalar.
    return {'feature': feature_name, 'p_value': p_value, 'is_normal': bool(p_value > 0.05)}

# Shapiro-Wilk normality check, limited to the first five features.
print("NORMALITY TESTS (Shapiro-Wilk)")
print("=" * 50)
normality_results = []
for col in continuous_features[:5]:
    normality_results.append(test_normality(df[col], col))
print(pd.DataFrame(normality_results))

# Compare the fire and no-fire groups feature by feature.
print("\nFEATURE SIGNIFICANCE TESTS")
print("=" * 50)
fire_rows = df[df['fire_label'] == 1]
no_fire_rows = df[df['fire_label'] == 0]

results_sig = []
for col in continuous_features:
    fire_samples = fire_rows[col]
    no_fire_samples = no_fire_rows[col]

    p_t = ttest_ind(fire_samples, no_fire_samples)[1]
    p_mw = mannwhitneyu(fire_samples, no_fire_samples, alternative='two-sided')[1]

    # The significance flag uses the unrounded Mann-Whitney p-value.
    row = {
        'Feature': col,
        'T-Test p': round(p_t, 6),
        'Mann-Whitney p': round(p_mw, 6),
        'Significant': p_mw < 0.05,
    }
    results_sig.append(row)

sig_df = pd.DataFrame(results_sig)
print(f"\nSignificant features (p < 0.05): {sig_df['Significant'].sum()} / {len(sig_df)}")
sig_df

## 4. Outlier Detection

In [None]:
# =============================================================================
# OUTLIER DETECTION
# =============================================================================

def detect_outliers_iqr(df, column, factor=1.5):
    """Count the outliers of one column using the IQR fence rule.

    Args:
        df: DataFrame containing ``column``.
        column: Name of the numeric column to inspect.
        factor: Multiple of the interquartile range placed beyond the
            quartiles to form the fences. Defaults to 1.5.

    Returns:
        Tuple ``(count, lower_bound, upper_bound)`` where ``count`` is the
        number of rows strictly below ``lower_bound`` or strictly above
        ``upper_bound``.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = factor * (q3 - q1)
    lower_bound = q1 - spread
    upper_bound = q3 + spread
    # Summing the boolean mask avoids building the filtered DataFrame.
    outside = (values < lower_bound) | (values > upper_bound)
    return int(outside.sum()), lower_bound, upper_bound

# IQR outlier counts and fences for the first five features only.
print("OUTLIER DETECTION (IQR Method)", "=" * 50, sep="\n")

outlier_summary = []
for feature in continuous_features[:5]:
    n_outliers, fence_low, fence_high = detect_outliers_iqr(df, feature)
    summary_row = {
        'Feature': feature,
        'Outliers': n_outliers,
        'Lower': round(fence_low, 2),
        'Upper': round(fence_high, 2),
    }
    outlier_summary.append(summary_row)

pd.DataFrame(outlier_summary)

## 5. Feature Engineering

Create domain-specific features for fire detection.

In [None]:
# =============================================================================
# FEATURE ENGINEERING
# =============================================================================

print("FEATURE ENGINEERING")
print("=" * 50)
original_features = df.shape[1]

# 1. Spectral Ratios (the small epsilon keeps the denominator non-zero)
df['red_green_ratio'] = df['mean_red'] / (df['mean_green'] + 1e-10)
df['red_blue_ratio_calc'] = df['mean_red'] / (df['mean_blue'] + 1e-10)
print("✅ Created spectral ratio features")

# 2. Colour-channel aggregations: per-row mean and standard deviation
#    across the three channel means
color_cols = ['mean_red', 'mean_green', 'mean_blue']
df['color_mean'] = df[color_cols].mean(axis=1)
df['color_std'] = df[color_cols].std(axis=1)
print("✅ Created texture aggregation features")

# 3. Log Transformation (only when the source column is present)
if 'intensity_std' in df.columns:
    df['log_intensity_std'] = np.log1p(df['intensity_std'])
    print("✅ Created log-transformed features")

# 4. Spatial Coordinates (for demonstration)
# Each coordinate column is checked on its own, and a dedicated generator
# seeded with SEED makes the simulated values independent of how much of the
# global NumPy random state earlier cells have consumed.
missing_coords = [name for name in ('x_coord', 'y_coord') if name not in df.columns]
if missing_coords:
    coord_rng = np.random.default_rng(SEED)
    for coord_name in missing_coords:
        df[coord_name] = coord_rng.integers(0, 1000, size=len(df))
    print("✅ Added simulated spatial coordinates")

print(f"\nFeatures: {original_features} → {df.shape[1]} (+{df.shape[1] - original_features} new)")

## 6. Feature Selection

In [None]:
# =============================================================================
# FEATURE SELECTION
# =============================================================================

# x_coord / y_coord are only needed to place tiles on the risk heatmap, and
# the feature-engineering step fills them with random integers when the
# dataset has none. They are excluded so random noise is never used as a
# model feature.
non_feature_cols = ['fire_label', 'x_coord', 'y_coord']
X = df.drop(columns=non_feature_cols, errors='ignore')
y = df['fire_label']

print("FEATURE SELECTION")
print("=" * 50)

# NOTE(review): the selectors below are fitted on the full dataset, before
# any train/test split. In the code shown their output is only printed;
# splitting first would be needed if the selections were used for modelling.

# 1. Variance Threshold: drop features whose variance is below 0.01
selector_vt = VarianceThreshold(threshold=0.01)
X_vt = selector_vt.fit_transform(X)
print(f"After VarianceThreshold: {X_vt.shape[1]} features")

# 2. SelectKBest: top features by ANOVA F-score, capped at the feature count
k_best = min(10, X.shape[1])
selector_kb = SelectKBest(score_func=f_classif, k=k_best)
X_kb = selector_kb.fit_transform(X, y)
selected_features_kb = X.columns[selector_kb.get_support()].tolist()
print(f"\nTop {k_best} features via SelectKBest:")
print(selected_features_kb)

# 3. RFE: recursive elimination, one feature per step, ranked by a
#    random forest
rf_estimator = RandomForestClassifier(n_estimators=50, random_state=SEED, n_jobs=-1)
n_rfe = min(8, X.shape[1])
selector_rfe = RFE(estimator=rf_estimator, n_features_to_select=n_rfe, step=1)
selector_rfe.fit(X, y)
selected_features_rfe = X.columns[selector_rfe.support_].tolist()
print(f"\nTop {n_rfe} features via RFE:")
print(selected_features_rfe)

## 7. Model Development & Evaluation

In [None]:
# =============================================================================
# MODEL DEVELOPMENT
# =============================================================================

print("MODEL DEVELOPMENT", "=" * 50, sep="\n")

# Stratified 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=SEED
)
print(f"Train set: {X_train.shape[0]} samples")
print(f"Test set:  {X_test.shape[0]} samples")

# Standardise features; the scaler is fitted on the training portion only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Candidate classifiers, all sharing the same seed.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=SEED),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=SEED, n_jobs=-1),
    'XGBoost': XGBClassifier(n_estimators=100, random_state=SEED, eval_metric='logloss', verbosity=0),
    'LightGBM': LGBMClassifier(n_estimators=100, random_state=SEED, verbose=-1)
}

# Metrics computed from hard predictions; ROC-AUC is added separately below
# because it needs the positive-class probability instead.
label_metrics = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1-Score': f1_score,
}

results = []
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    y_prob = model.predict_proba(X_test_scaled)[:, 1]

    scores = {'Model': name}
    for metric_name, metric_fn in label_metrics.items():
        scores[metric_name] = round(metric_fn(y_test, y_pred), 4)
    scores['ROC-AUC'] = round(roc_auc_score(y_test, y_prob), 4)
    results.append(scores)

# Rank the models by F1, best first.
results_df = pd.DataFrame(results).sort_values(by='F1-Score', ascending=False)
print("\nMODEL COMPARISON RESULTS")
print("=" * 50)
results_df

## 8. Hyperparameter Tuning

In [None]:
# =============================================================================
# HYPERPARAMETER TUNING
# =============================================================================

print("HYPERPARAMETER TUNING (XGBoost)", "=" * 50, sep="\n")

# Candidate values the randomized search samples from.
param_dist = dict(
    n_estimators=[100, 200],
    max_depth=[3, 6, 9],
    learning_rate=[0.01, 0.1, 0.2],
)

# Five sampled configurations, each scored by F1 with 3-fold cross-validation.
xgb_base = XGBClassifier(random_state=SEED, eval_metric='logloss', verbosity=0)
random_search = RandomizedSearchCV(
    xgb_base,
    param_distributions=param_dist,
    n_iter=5,
    cv=3,
    scoring='f1',
    random_state=SEED,
    n_jobs=-1,
)
random_search.fit(X_train_scaled, y_train)
best_xgb = random_search.best_estimator_

print(f"Best Parameters: {random_search.best_params_}")
print(f"Best F1-Score (CV): {random_search.best_score_:.4f}")

## 9. Model Interpretability (SHAP)

In [None]:
# =============================================================================
# MODEL INTERPRETABILITY
# =============================================================================

print("SHAP ANALYSIS")
print("=" * 50)

# Explain the random forest that was fitted in the model-comparison cell.
rf_model = models['Random Forest']
explainer = shap.TreeExplainer(rf_model)
shap_values = explainer.shap_values(X_test_scaled)

# SHAP releases differ in what shap_values() returns for a binary classifier:
# a list with one (samples, features) array per class, or a single
# (samples, features, classes) array. Indexing [1] on the latter would pick
# the second sample, so the positive (fire) class is selected explicitly.
if isinstance(shap_values, list):
    fire_shap_values = shap_values[1]
elif np.ndim(shap_values) == 3:
    fire_shap_values = shap_values[:, :, 1]
else:
    fire_shap_values = shap_values

print("Generating SHAP Summary Plot...")
plt.figure(figsize=(12, 8))
shap.summary_plot(fire_shap_values, X_test, feature_names=X.columns.tolist(), show=False)
plt.tight_layout()
plt.show()
print("✅ SHAP analysis complete")

## 10. Spatial Analysis & Risk Heatmap

In [None]:
# =============================================================================
# SPATIAL ANALYSIS
# =============================================================================

# Section banner for the spatial risk analysis output.
print("SPATIAL RISK ANALYSIS", "=" * 50, sep="\n")

def aggregate_predictions_by_grid(df, probabilities, grid_size=100):
    """Summarise per-row risk scores over square grid cells.

    Args:
        df: DataFrame with ``x_coord`` and ``y_coord`` columns. It is copied,
            so the caller's frame is not modified.
        probabilities: Risk score for each row of ``df``.
        grid_size: Edge length of a cell; coordinates are floor-divided by it.

    Returns:
        DataFrame with one row per occupied cell and the columns ``grid_x``,
        ``grid_y``, ``avg_risk``, ``max_risk`` and ``tile_count``.
    """
    tiles = df.copy()
    tiles['risk_score'] = probabilities
    for axis in ('x', 'y'):
        tiles[f'grid_{axis}'] = (tiles[f'{axis}_coord'] // grid_size).astype(int)

    # Named aggregation yields the flat output column names directly.
    per_cell = tiles.groupby(['grid_x', 'grid_y'])['risk_score']
    return per_cell.agg(avg_risk='mean', max_risk='max', tile_count='count').reset_index()

# NOTE(review): the random forest is chosen by name here, not taken from the
# ranking in results_df — confirm this is the model intended for export.
best_model = models['Random Forest']
# Score every row of X (not only the test split), using the fitted scaler.
all_probs = best_model.predict_proba(scaler.transform(X))[:, 1]
grid_data = aggregate_predictions_by_grid(df, all_probs)

# Heatmap of the mean risk per grid cell (rows = grid_y, columns = grid_x)
plt.figure(figsize=(12, 10))
pivot_table = grid_data.pivot(index='grid_y', columns='grid_x', values='avg_risk')
sns.heatmap(pivot_table, cmap='YlOrRd', cbar_kws={'label': 'Fire Risk Score'})
plt.title('Forest Fire Risk Heatmap', fontsize=14, fontweight='bold')
plt.xlabel('Grid X')
plt.ylabel('Grid Y')
plt.tight_layout()
plt.show()
print("✅ Risk heatmap generated")

## 11. Model Export & Reproducibility

In [None]:
# =============================================================================
# MODEL EXPORT
# =============================================================================

print("MODEL EXPORT")
print("=" * 50)

# Save the model together with the scaler it was trained with
joblib.dump(best_model, 'fire_detection_model.pkl')
joblib.dump(scaler, 'feature_scaler.pkl')
print("✅ Model saved: fire_detection_model.pkl")
print("✅ Scaler saved: feature_scaler.pkl")

# Save configuration
# model_type and scaler are read from the saved objects so the JSON cannot
# drift from what was actually written to disk.
model_config = {
    'model_type': type(best_model).__name__,
    'features_used': X.columns.tolist(),
    'random_state': SEED,
    'scaler': type(scaler).__name__,
    'train_samples': len(X_train),
    'test_samples': len(X_test)
}

# Explicit encoding keeps the file identical across platforms.
with open('model_config.json', 'w', encoding='utf-8') as f:
    json.dump(model_config, f, indent=4)
print("✅ Configuration saved: model_config.json")

print("\n" + "=" * 50)
print("PIPELINE COMPLETE!")
print("=" * 50)