In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pylab as plt
import seaborn as sns

from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score,mean_absolute_error
from sklearn import set_config
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,f1_score

# !pip install optuna
import optuna


set_config(transform_output="pandas")
plt.style.use('fivethirtyeight')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('preprocessed_fire_data_clean.csv')

In [3]:
df.shape

(200000, 22)

In [4]:
def remove_high_corr(df, threshold=0.85):

    numeric_df = df.select_dtypes(include=[np.number])
    
    if len(numeric_df.columns) == 0:
        print("No numeric columns found")
        return df
    
    corr_matrix = numeric_df.corr().abs()
    
    upper_triangle = np.triu(np.ones_like(corr_matrix, dtype=bool), k=1)

    high_corr_pairs = []
    for i in range(len(corr_matrix.columns)):
        for j in range(i+1, len(corr_matrix.columns)):
            if corr_matrix.iloc[i, j] >= threshold:
                high_corr_pairs.append((corr_matrix.columns[i], corr_matrix.columns[j]))
    
    columns_to_remove = set()
    for col1, col2 in high_corr_pairs:
        columns_to_remove.add(col2)  
    
    df_clean = df.drop(columns=columns_to_remove)
    
    print(f"Removed {len(columns_to_remove)} columns: {list(columns_to_remove)}")
    return df_clean

In [5]:
df_clean = remove_high_corr(df, threshold=0.85)

Removed 4 columns: ['is_afternoon', 'week_of_year', 'day_of_year', 'acq_hour']


In [6]:
# Separate features and target
X = df_clean.drop(['severity_class','instrument'], axis=1)
y = df_clean['severity_class']

In [7]:
train_mask = df_clean['year'].isin([2020, 2021, 2022]) # Training on 2020,2021,2022
test_mask = df_clean['year'].isin([2023, 2024]) # Testing on 2023,2024

In [8]:
X_train = X[train_mask]
X_test = X[test_mask]
y_train = y[train_mask]
y_test = y[test_mask]

In [9]:
categorical_cols=[cols for cols in X_train.columns if X_train[cols].dtypes in ['object']]
categorical_cols

['satellite', 'daynight', 'season', 'region']

In [10]:
ohe_encode=['satellite','daynight','season','region']

In [11]:
numerical_cols=[cols for cols in X_train.columns if X_train[cols].dtypes in ['int64','float64']]
numerical_cols

['latitude',
 'longitude',
 'bright_ti4',
 'scan',
 'track',
 'confidence',
 'version',
 'bright_ti5',
 'year',
 'month',
 'is_fire_season',
 'is_daytime']

In [12]:
preprocessor=ColumnTransformer(transformers=[
    ("ohe",OneHotEncoder(drop='first',sparse_output=False),ohe_encode),
    ("ordinal",OrdinalEncoder(categories=[['Summer', 'Autumn', 'Winter', 'Spring']]),["season"]),
    ("scaler",StandardScaler(),numerical_cols)
],remainder='passthrough')

In [13]:
pipeline=Pipeline(steps=[
    ("preprocessor",preprocessor)
]
)

In [14]:
X_train_trans=pipeline.fit_transform(X_train)
X_test_trans=pipeline.transform(X_test)

In [15]:
X_train_trans

Unnamed: 0,ohe__daynight_N,ohe__season_Spring,ohe__season_Summer,ohe__season_Winter,ohe__region_North,ohe__region_South,ordinal__season,scaler__latitude,scaler__longitude,scaler__bright_ti4,scaler__scan,scaler__track,scaler__confidence,scaler__version,scaler__bright_ti5,scaler__year,scaler__month,scaler__is_fire_season,scaler__is_daytime
4,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.919787,-0.006960,0.343207,-0.474795,-0.302197,0.155708,0.0,0.507840,-1.201649,0.026119,-0.672395,0.665605
8,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.841074,-0.347221,-0.094947,0.653599,-0.559361,0.155708,0.0,-0.003207,0.022163,-0.573244,-0.672395,0.665605
9,0.0,1.0,0.0,0.0,0.0,0.0,3.0,-0.612836,-0.311499,1.653494,1.217795,1.755118,1.845041,0.0,0.990545,1.245975,1.224844,1.487221,0.665605
11,1.0,0.0,0.0,1.0,1.0,0.0,2.0,1.049544,-0.300408,-1.619350,-0.249116,-0.216476,0.155708,0.0,-0.859675,0.022163,0.026119,-0.672395,-1.502393
14,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.710556,-0.602925,1.653494,-0.587634,0.983625,1.845041,0.0,0.698265,0.022163,0.026119,-0.672395,0.665605
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199994,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.903169,-0.179374,-1.584360,-0.700474,-0.387919,0.155708,0.0,-0.777305,-1.201649,-0.573244,-0.672395,-1.502393
199995,0.0,0.0,1.0,0.0,0.0,1.0,0.0,-1.800497,1.055155,1.653494,-0.813313,0.897903,1.845041,0.0,0.411299,0.022163,-1.472288,1.487221,0.665605
199996,0.0,1.0,0.0,0.0,1.0,0.0,3.0,0.107810,0.770635,0.533301,-0.813313,0.897903,0.155708,0.0,0.618552,0.022163,1.224844,1.487221,0.665605
199997,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.504468,0.180015,1.653494,2.797546,2.269447,1.845041,0.0,-0.507168,0.022163,1.524525,1.487221,0.665605


In [17]:
#### Stacking Classifier:
# Best parameter for lgbm:
best_param_lgbm={
    'n_estimators': 200, 
    'learning_rate': 0.10203428371924908, 
    'max_depth': 19
}

# Best parameter for random forest:
best_param_rf={
    'n_estimators': 180, 
    'max_depth': 20
}
best_rf=RandomForestClassifier(**best_param_rf)
best_lgbm=LGBMClassifier(**best_param_lgbm)

In [18]:
def objective(trial):
        model_name = trial.suggest_categorical("model", ["LR", "KNN", "DT"])

        if model_name == "LR":
            fit_intercept = trial.suggest_categorical("fit_intercept", [True, False])
            base_model = LogisticRegression(fit_intercept=fit_intercept)

        elif model_name == "KNN":
            n_neighbors = trial.suggest_int("n_neighbors", 3, 11)
            weights = trial.suggest_categorical("weights", ["uniform", "distance"])
            p = trial.suggest_int("p", 1, 2)
            base_model = KNeighborsClassifier(
                n_neighbors=n_neighbors,
                weights=weights,
                p=p
            )

        elif model_name == "DT":
            max_depth = trial.suggest_int("max_depth", 2, 10)
            min_samples_leaf = trial.suggest_int("min_samples_leaf", 2, 10)
            base_model = DecisionTreeClassifier(
                max_depth=max_depth,
                min_samples_leaf=min_samples_leaf,
                random_state=42
            )

        stacked_regressor=StackingClassifier(
            estimators=[("rf",best_rf),("lgbm",best_lgbm)],
            final_estimator=base_model,n_jobs=-1
        )


        stacked_regressor.fit(X_train_trans,y_train)
        y_pred=stacked_regressor.predict(X_test_trans)
        error=f1_score(y_test,y_pred,average='weighted')

        return error

In [19]:
study=optuna.create_study(direction="maximize")
study.optimize(objective,n_trials=15,n_jobs=-1)


[I 2025-10-29 22:44:24,207] A new study created in memory with name: no-name-9461d725-7161-4a1e-beff-7ba996775ae6
[I 2025-10-29 22:47:15,380] Trial 0 finished with value: 0.7836947502706428 and parameters: {'model': 'KNN', 'n_neighbors': 9, 'weights': 'uniform', 'p': 2}. Best is trial 0 with value: 0.7836947502706428.
[I 2025-10-29 22:48:11,021] Trial 3 finished with value: 0.7985773203148858 and parameters: {'model': 'LR', 'fit_intercept': True}. Best is trial 3 with value: 0.7985773203148858.
[I 2025-10-29 22:48:17,463] Trial 6 finished with value: 0.7972729565362814 and parameters: {'model': 'DT', 'max_depth': 5, 'min_samples_leaf': 7}. Best is trial 3 with value: 0.7985773203148858.
[I 2025-10-29 22:48:52,475] Trial 4 finished with value: 0.7990614288411196 and parameters: {'model': 'LR', 'fit_intercept': True}. Best is trial 4 with value: 0.7990614288411196.
[I 2025-10-29 22:49:04,689] Trial 5 finished with value: 0.7972729565362814 and parameters: {'model': 'DT', 'max_depth': 4, 

In [20]:

study.best_value

0.7990645412165412

In [21]:

best_trial = study.best_trial
print("Best trial parameters:", best_trial.params)
print("Best trial accuracy:", best_trial.value)

Best trial parameters: {'model': 'LR', 'fit_intercept': False}
Best trial accuracy: 0.7990645412165412
