# 🔥 AI-Based Forest Fire & Smoke Detection
**Author:** Avirup Roy | **Version:** 3.0 | **Date:** February 2026

---
## Pipeline Overview
1. Data Loading & EDA
2. Statistical Analysis & Multicollinearity (VIF)
3. Feature Engineering & Selection
4. **Dual Approach:** Anomaly Detection (No SMOTE) vs Classifiers (With SMOTE)
5. Cross-Validation & Hyperparameter Tuning
6. Model Comparison & Export

In [None]:
# === IMPORTS ===
import warnings; warnings.filterwarnings('ignore')
import random, sys, json
import pandas as pd
import numpy as np
from scipy import stats
from scipy.stats import ttest_ind, mannwhitneyu, multivariate_normal
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier,
    ExtraTreesClassifier, IsolationForest)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, OneClassSVM
from sklearn.covariance import EllipticEnvelope
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    average_precision_score, confusion_matrix, matthews_corrcoef, roc_curve, precision_recall_curve)
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, RFE, VarianceThreshold

# Optional third-party dependencies. Each import is best-effort: when a
# package is unavailable the sentinel (None / *_OK flag) is set instead and
# later cells check it before use.
# `except Exception` replaces the bare `except:` so that import-time failures
# of any kind are still tolerated, while KeyboardInterrupt and SystemExit are
# no longer swallowed.
try:
    from xgboost import XGBClassifier
except Exception:
    XGBClassifier = None

try:
    from lightgbm import LGBMClassifier
except Exception:
    LGBMClassifier = None

try:
    from imblearn.over_sampling import SMOTE
except Exception:
    SMOTE_OK = False
else:
    SMOTE_OK = True

try:
    from statsmodels.stats.outliers_influence import variance_inflation_factor
except Exception:
    VIF_OK = False
else:
    VIF_OK = True

# Seed both the stdlib and NumPy global RNGs; SEED is also passed as
# random_state to the estimators further down.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
print(f"✅ Environment Ready | SMOTE: {SMOTE_OK} | VIF: {VIF_OK}")

## 1. Data Loading & EDA

In [None]:
# Load the dataset and report its size and class balance.
df = pd.read_excel('Forest Fire Smoke Dataset.xlsx')
print(f"Dataset: {df.shape[0]} rows × {df.shape[1]} cols")
class_counts = df['fire_label'].value_counts()
# Ratio of class-0 rows to class-1 rows.
imbalance_ratio = class_counts[0] / class_counts[1]
print(f"Class 0: {class_counts[0]} | Class 1: {class_counts[1]} | Ratio: {imbalance_ratio:.2f}:1")

# Descriptive Stats
# describe() summarises numeric columns only by default, so skew/kurtosis are
# restricted to numeric columns as well; without numeric_only=True recent
# pandas versions raise on any non-numeric column.
desc = df.describe().T
desc['skew'] = df.skew(numeric_only=True)
desc['kurt'] = df.kurtosis(numeric_only=True)
desc

In [None]:
# Distributions
# One histogram (with KDE) per feature, coloured by fire_label. Only the first
# ten non-label columns are drawn, matching the 2 x 5 grid of axes.
plot_cols = df.columns.drop('fire_label')[:10]
fig, axes = plt.subplots(2, 5, figsize=(16, 6))
for ax, col in zip(axes.flatten(), plot_cols):
    sns.histplot(data=df, x=col, hue='fire_label', kde=True, ax=ax, palette='coolwarm')
    ax.set_title(col, fontsize=9)
plt.tight_layout()
plt.show()

## 2. Correlation & Multicollinearity

In [None]:
# Correlation heatmap. The correlation matrix is computed once and reused for
# both the plotted values and the mask (it was previously computed twice).
corr = df.corr()
# True entries are hidden: the upper triangle, diagonal included.
upper_mask = np.triu(np.ones_like(corr, dtype=bool))
plt.figure(figsize=(10, 8))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='RdBu_r', center=0, mask=upper_mask)
plt.title('Correlation Matrix')
plt.tight_layout()
plt.show()

# VIF
# Variance inflation factor per feature, computed on standardised features
# and printed from highest to lowest. Skipped when statsmodels is missing.
if VIF_OK:
    vif_features = df.columns.drop('fire_label')
    X_vif = StandardScaler().fit_transform(df.drop('fire_label', axis=1))
    vif = pd.DataFrame({
        'Feature': vif_features,
        'VIF': [variance_inflation_factor(X_vif, i) for i in range(X_vif.shape[1])],
    })
    print(vif.sort_values('VIF', ascending=False).to_string(index=False))

## 3. Statistical Tests & Feature Engineering

In [None]:
# Significance Tests
# For every feature: Mann-Whitney U p-value between the fire_label == 1 and
# fire_label == 0 groups, plus Cohen's d using the pooled standard deviation.
fire_mask = df['fire_label'] == 1
clear_mask = df['fire_label'] == 0

sig_results = []
for col in df.columns.drop('fire_label'):
    fire = df.loc[fire_mask, col]
    no_fire = df.loc[clear_mask, col]
    _, p = mannwhitneyu(fire, no_fire)

    n_fire, n_clear = len(fire), len(no_fire)
    pooled_var = ((n_fire - 1) * fire.std() ** 2 + (n_clear - 1) * no_fire.std() ** 2) / (n_fire + n_clear - 2)
    d = (fire.mean() - no_fire.mean()) / np.sqrt(pooled_var)

    sig_results.append({
        'Feature': col,
        'p-value': round(p, 6),
        'Cohen_d': round(d, 3),
        'Sig': p < 0.05,
    })

# Rank by absolute effect size, largest first.
sig_df = pd.DataFrame(sig_results).sort_values('Cohen_d', key=abs, ascending=False)
print(f"Significant features: {sig_df['Sig'].sum()}/{len(sig_df)}")
sig_df.head(10)

In [None]:
# Feature Engineering
# Adds derived colour and log-scaled features to df in place and reports how
# many columns were added.
orig = df.shape[1]
rgb = df[['mean_red', 'mean_green', 'mean_blue']]

# Channel ratios; the 1e-10 term avoids division by zero.
df['red_green_ratio'] = df['mean_red'] / (df['mean_green'] + 1e-10)
df['red_blue_calc'] = df['mean_red'] / (df['mean_blue'] + 1e-10)

# Row-wise mean and standard deviation across the three colour channels.
df['color_mean'] = rgb.mean(axis=1)
df['color_std'] = rgb.std(axis=1)

# log1p transforms for columns that may be absent from the dataset. Both are
# clipped at 0 first so a stray negative value cannot produce NaN/-inf
# (log_hotpixel previously lacked the clip its sibling already had).
if 'intensity_std' in df.columns:
    df['log_intensity'] = np.log1p(df['intensity_std'].clip(lower=0))
if 'hot_pixel_fraction' in df.columns:
    df['log_hotpixel'] = np.log1p(df['hot_pixel_fraction'].clip(lower=0))

# NOTE(review): x_coord / y_coord are filled with random integers in
# [0, 1000), not real coordinates, yet they are fed to the models as features
# and listed in the exported config. Confirm whether they should be replaced
# by real data or dropped.
df['x_coord'] = np.random.randint(0, 1000, len(df))
df['y_coord'] = np.random.randint(0, 1000, len(df))
print(f"Features: {orig} → {df.shape[1]} (+{df.shape[1]-orig})")

## 4. Feature Selection & Data Prep

In [None]:
# Split the frame into predictors (X) and target (y).
y = df['fire_label']
X = df.drop(columns='fire_label')

# SelectKBest
# ANOVA F-scores per feature. In this cell the fitted selector is only used
# to print a ranking; X itself is not reduced.
n_best = min(12, X.shape[1])
kb = SelectKBest(f_classif, k=n_best).fit(X, y)
f_scores = pd.DataFrame({'Feature': X.columns, 'Score': kb.scores_})
print("Top features (F-score):")
print(f_scores.sort_values('Score', ascending=False).head(10).to_string(index=False))

# Train-Test Split
# Stratified 80/20 split; the scaler is fitted on the training part only and
# then applied to the test part.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=SEED
)
scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train)
X_test_sc = scaler.transform(X_test)
print(f"\nTrain: {len(y_train)} | Test: {len(y_test)}")

## 5A. Approach A: Anomaly Detection (No SMOTE)

In [None]:
# Approach A: fit outlier detectors on the class-0 training rows only and
# treat anything they flag as an outlier (-1) in the test set as class 1.
banner = "=" * 50
print(banner)
print("APPROACH A: ANOMALY DETECTION")
print(banner)

X_normal = X_train_sc[y_train == 0]
# Share of class-1 rows in the training labels, reused as the expected
# outlier fraction for every detector.
contam = y_train.mean()

anomaly_models = {
    'Elliptic Envelope': EllipticEnvelope(contamination=contam, random_state=SEED),
    'Isolation Forest': IsolationForest(contamination=contam, random_state=SEED, n_jobs=-1),
    'One-Class SVM': OneClassSVM(nu=contam, kernel='rbf'),
}

# Column label -> metric function, in the order the columns should appear.
anomaly_metrics = {
    'Acc': accuracy_score,
    'Prec': precision_score,
    'Recall': recall_score,
    'F1': f1_score,
    'MCC': matthews_corrcoef,
}

anomaly_results = []
for name, model in anomaly_models.items():
    model.fit(X_normal)
    # Detectors return -1 for outliers; map that to the positive label.
    pred = (model.predict(X_test_sc) == -1).astype(int)
    row = {'Model': name}
    for label, metric in anomaly_metrics.items():
        row[label] = round(metric(y_test, pred), 4)
    anomaly_results.append(row)

anomaly_df = pd.DataFrame(anomaly_results).sort_values('F1', ascending=False)
anomaly_df

## 5B. Approach B: Classifiers (With SMOTE)

In [None]:
# Approach B: oversample the training set with SMOTE (when available), train
# a set of supervised classifiers on it and score each on the untouched test
# set.
print("=" * 50)
print("APPROACH B: SMOTE + CLASSIFIERS")
print("=" * 50)

if SMOTE_OK:
    X_sm, y_sm = SMOTE(random_state=SEED).fit_resample(X_train_sc, y_train)
    print(f"SMOTE: {np.bincount(y_train)} → {np.bincount(y_sm)}")
else:
    # Fall back to the original (imbalanced) training data.
    X_sm, y_sm = X_train_sc, y_train

# n_estimators is always passed by keyword. Positionally, the first argument
# is not n_estimators for several of these estimators (AdaBoost: estimator,
# LightGBM: boosting_type) and GradientBoosting / XGBoost accept keyword
# arguments only, so the previous `Classifier(100, ...)` calls were wrong.
models = {
    'Logistic': LogisticRegression(max_iter=1000, class_weight='balanced', random_state=SEED),
    'Random Forest': RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=SEED, n_jobs=-1),
    'Gradient Boost': GradientBoostingClassifier(n_estimators=100, random_state=SEED),
    'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=SEED),
    'ExtraTrees': ExtraTreesClassifier(n_estimators=100, class_weight='balanced', random_state=SEED, n_jobs=-1),
    'SVM': SVC(probability=True, class_weight='balanced', random_state=SEED),
}
if XGBClassifier is not None:
    # Negative/positive ratio of the data the model is actually trained on.
    # After SMOTE this is 1, which avoids re-weighting an already balanced
    # set with the raw dataset ratio.
    train_counts = np.bincount(y_sm)
    pos_weight = train_counts[0] / train_counts[1]
    models['XGBoost'] = XGBClassifier(n_estimators=100, scale_pos_weight=pos_weight, random_state=SEED, verbosity=0)
if LGBMClassifier is not None:
    models['LightGBM'] = LGBMClassifier(n_estimators=100, class_weight='balanced', random_state=SEED, verbose=-1)

# `trained` keeps the fitted estimators for later plotting; `results` holds
# one metrics row per model.
results, trained = [], {}
for name, model in models.items():
    model.fit(X_sm, y_sm)
    trained[name] = model
    pred = model.predict(X_test_sc)
    # Probability of class 1, when the estimator can provide one.
    prob = model.predict_proba(X_test_sc)[:, 1] if hasattr(model, 'predict_proba') else None
    results.append({
        'Model': name,
        'Acc': round(accuracy_score(y_test, pred), 4),
        'Prec': round(precision_score(y_test, pred), 4),
        'Recall': round(recall_score(y_test, pred), 4),
        'F1': round(f1_score(y_test, pred), 4),
        'ROC-AUC': round(roc_auc_score(y_test, prob), 4) if prob is not None else None,
        'PR-AUC': round(average_precision_score(y_test, prob), 4) if prob is not None else None,
    })

results_df = pd.DataFrame(results).sort_values('F1', ascending=False)
results_df

## 6. Hyperparameter Tuning

In [None]:
# Tune Random Forest
# Randomised search (8 draws, 3-fold CV, F1 scoring) over a small grid.
# NOTE(review): the search is fitted on X_sm / y_sm, which may already contain
# SMOTE-synthesised rows; if so, the cross-validated F1 is likely optimistic.
# Confirm whether resampling should happen inside each fold instead.
rf_param_space = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
}
rf_base = RandomForestClassifier(class_weight='balanced', random_state=SEED, n_jobs=-1)
rf_search = RandomizedSearchCV(
    rf_base,
    rf_param_space,
    n_iter=8,
    cv=3,
    scoring='f1',
    random_state=SEED,
    n_jobs=-1,
)
rf_search.fit(X_sm, y_sm)
print(f"Best RF: {rf_search.best_params_} | F1: {rf_search.best_score_:.4f}")

# Evaluate
# Score the refitted best estimator on the held-out test set.
best_rf = rf_search.best_estimator_
y_pred = best_rf.predict(X_test_sc)
y_prob = best_rf.predict_proba(X_test_sc)[:, 1]
print(f"\nTuned RF on Test: F1={f1_score(y_test, y_pred):.4f} | ROC-AUC={roc_auc_score(y_test, y_prob):.4f}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")

## 7. ROC & PR Curves

In [None]:
# ROC (left) and precision-recall (right) curves on the test set for the
# first five trained models that expose predict_proba.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
roc_ax, pr_ax = axes

for name, m in list(trained.items())[:5]:
    if not hasattr(m, 'predict_proba'):
        continue
    prob = m.predict_proba(X_test_sc)[:, 1]

    fpr, tpr, _ = roc_curve(y_test, prob)
    roc_ax.plot(fpr, tpr, label=f"{name} ({roc_auc_score(y_test, prob):.3f})")

    prec, rec, _ = precision_recall_curve(y_test, prob)
    pr_ax.plot(rec, prec, label=f"{name} ({average_precision_score(y_test, prob):.3f})")

# Dashed diagonal on the ROC panel.
roc_ax.plot([0,1], [0,1], 'k--')
roc_ax.set_title('ROC Curves')
roc_ax.legend(fontsize=8)

pr_ax.set_title('PR Curves')
pr_ax.legend(fontsize=8)

plt.tight_layout()
plt.show()

## 8. Feature Importance & Comparison

In [None]:
# Feature importances of the tuned random forest, top 12 plotted.
fi = pd.DataFrame({'Feature': X.columns, 'Importance': best_rf.feature_importances_}).sort_values('Importance', ascending=False)
plt.figure(figsize=(8, 6))
sns.barplot(data=fi.head(12), x='Importance', y='Feature', palette='viridis')
plt.title('Feature Importance')
plt.tight_layout()
plt.show()

# Compare approaches
# Both result tables are already sorted by F1 (descending), so row 0 is the
# best model of each approach. The classifier wins only on a strictly higher
# F1; ties go to the anomaly detector.
print("\n" + "=" * 50)
best_a, best_b = anomaly_df.iloc[0], results_df.iloc[0]
print(f"Best Anomaly: {best_a['Model']} (F1={best_a['F1']})")
print(f"Best Classifier: {best_b['Model']} (F1={best_b['F1']})")
final = best_b['Model'] if best_b['F1'] > best_a['F1'] else best_a['Model']
# The trophy emoji was previously mis-encoded in this message.
print(f"\n🏆 WINNER: {final}")

## 9. Export

In [None]:
# Persist the tuned random forest, the fitted scaler and a JSON description
# of the model (feature order and chosen hyper-parameters).
joblib.dump(best_rf, 'fire_detection_model.pkl')
joblib.dump(scaler, 'feature_scaler.pkl')

model_config = {
    'model': 'RandomForest',
    'features': X.columns.tolist(),
    'params': rf_search.best_params_,
}
# The context manager guarantees the file is flushed and closed; the previous
# inline open() left the handle dangling.
with open('model_config.json', 'w', encoding='utf-8') as config_file:
    json.dump(model_config, config_file, indent=2)

print("✅ Exported: fire_detection_model.pkl, feature_scaler.pkl, model_config.json")
print("\n" + "=" * 50 + "\n🎉 PIPELINE COMPLETE!\n" + "=" * 50)