In [None]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pylab as plt
import seaborn as sns

from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import set_config
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score,f1_score

# !pip install optuna
import optuna


set_config(transform_output="pandas")
plt.style.use('fivethirtyeight')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
df = pd.read_csv('preprocessed_fire_data_clean.csv')

In [4]:
df.shape

(200000, 22)

In [5]:
def remove_high_corr(df, threshold=0.85):

    numeric_df = df.select_dtypes(include=[np.number])
    
    if len(numeric_df.columns) == 0:
        print("No numeric columns found")
        return df
    
    corr_matrix = numeric_df.corr().abs()
    
    upper_triangle = np.triu(np.ones_like(corr_matrix, dtype=bool), k=1)

    high_corr_pairs = []
    for i in range(len(corr_matrix.columns)):
        for j in range(i+1, len(corr_matrix.columns)):
            if corr_matrix.iloc[i, j] >= threshold:
                high_corr_pairs.append((corr_matrix.columns[i], corr_matrix.columns[j]))
    
    columns_to_remove = set()
    for col1, col2 in high_corr_pairs:
        columns_to_remove.add(col2)  
    
    df_clean = df.drop(columns=columns_to_remove)
    
    print(f"Removed {len(columns_to_remove)} columns: {list(columns_to_remove)}")
    return df_clean

In [6]:
df_clean = remove_high_corr(df, threshold=0.85)

Removed 4 columns: ['week_of_year', 'day_of_year', 'is_afternoon', 'acq_hour']


In [7]:
# Separate features and target
X = df_clean.drop(['severity_class','instrument'], axis=1)
y = df_clean['severity_class']

In [8]:
train_mask = df_clean['year'].isin([2020, 2021, 2022]) # Training on 2020,2021,2022
test_mask = df_clean['year'].isin([2023, 2024]) # Testing on 2023,2024

In [9]:
X_train = X[train_mask]
X_test = X[test_mask]
y_train = y[train_mask]
y_test = y[test_mask]

In [10]:
categorical_cols=[cols for cols in X_train.columns if X_train[cols].dtypes in ['object']]
categorical_cols

['satellite', 'daynight', 'season', 'region']

In [11]:
ohe_encode=['satellite','daynight','season','region']

In [12]:
numerical_cols=[cols for cols in X_train.columns if X_train[cols].dtypes in ['int64','float64']]
numerical_cols

['latitude',
 'longitude',
 'bright_ti4',
 'scan',
 'track',
 'confidence',
 'version',
 'bright_ti5',
 'year',
 'month',
 'is_fire_season',
 'is_daytime']

In [13]:
preprocessor=ColumnTransformer(transformers=[
    ("ohe",OneHotEncoder(drop='first',sparse_output=False),ohe_encode),
    ("ordinal",OrdinalEncoder(categories=[['Summer', 'Autumn', 'Winter', 'Spring']]),["season"]),
    ("scaler",StandardScaler(),numerical_cols)
],remainder='passthrough')

In [14]:
pipeline=Pipeline(steps=[
    ("preprocessor",preprocessor)
]
)

In [15]:
X_train_trans=pipeline.fit_transform(X_train)
X_test_trans=pipeline.transform(X_test)

In [16]:
X_train_trans

Unnamed: 0,ohe__daynight_N,ohe__season_Spring,ohe__season_Summer,ohe__season_Winter,ohe__region_North,ohe__region_South,ordinal__season,scaler__latitude,scaler__longitude,scaler__bright_ti4,scaler__scan,scaler__track,scaler__confidence,scaler__version,scaler__bright_ti5,scaler__year,scaler__month,scaler__is_fire_season,scaler__is_daytime
4,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.919787,-0.006960,0.343207,-0.474795,-0.302197,0.155708,0.0,0.507840,-1.201649,0.026119,-0.672395,0.665605
8,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.841074,-0.347221,-0.094947,0.653599,-0.559361,0.155708,0.0,-0.003207,0.022163,-0.573244,-0.672395,0.665605
9,0.0,1.0,0.0,0.0,0.0,0.0,3.0,-0.612836,-0.311499,1.653494,1.217795,1.755118,1.845041,0.0,0.990545,1.245975,1.224844,1.487221,0.665605
11,1.0,0.0,0.0,1.0,1.0,0.0,2.0,1.049544,-0.300408,-1.619350,-0.249116,-0.216476,0.155708,0.0,-0.859675,0.022163,0.026119,-0.672395,-1.502393
14,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.710556,-0.602925,1.653494,-0.587634,0.983625,1.845041,0.0,0.698265,0.022163,0.026119,-0.672395,0.665605
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199994,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.903169,-0.179374,-1.584360,-0.700474,-0.387919,0.155708,0.0,-0.777305,-1.201649,-0.573244,-0.672395,-1.502393
199995,0.0,0.0,1.0,0.0,0.0,1.0,0.0,-1.800497,1.055155,1.653494,-0.813313,0.897903,1.845041,0.0,0.411299,0.022163,-1.472288,1.487221,0.665605
199996,0.0,1.0,0.0,0.0,1.0,0.0,3.0,0.107810,0.770635,0.533301,-0.813313,0.897903,0.155708,0.0,0.618552,0.022163,1.224844,1.487221,0.665605
199997,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.504468,0.180015,1.653494,2.797546,2.269447,1.845041,0.0,-0.507168,0.022163,1.524525,1.487221,0.665605


In [17]:
def objective(trial):
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 10, 100),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'max_depth': trial.suggest_int('max_depth', 5, 40),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 100),
        'lambda_l1': trial.suggest_float('lambda_l1', 0.0, 5.0),
        'lambda_l2': trial.suggest_float('lambda_l2', 0.0, 5.0),
        'verbosity': -1,
        'n_jobs': -1
      }


    model=LGBMClassifier(**params)
    model.fit(X_train_trans,y_train)
    y_pred=model.predict(X_test_trans)
    error=f1_score(y_test,y_pred,average='weighted')
    scores = cross_val_score(model,
                      X_train_trans,
                      y_train,
                      scoring="f1",
                      cv=5,n_jobs=-1).mean()

    return error

In [18]:
study=optuna.create_study(direction='maximize')
study.optimize(objective,n_trials=30,n_jobs=-1)


[I 2025-10-29 22:26:10,769] A new study created in memory with name: no-name-e2c2b675-4c8b-4918-9104-f1681bbb2d61
[I 2025-10-29 22:26:39,415] Trial 3 finished with value: 0.6043369950268936 and parameters: {'learning_rate': 0.0020597505024506756, 'num_leaves': 43, 'feature_fraction': 0.575674677696008, 'bagging_fraction': 0.5780768205630983, 'bagging_freq': 1, 'max_depth': 15, 'min_data_in_leaf': 52, 'lambda_l1': 0.8289107138377866, 'lambda_l2': 4.408769568073678}. Best is trial 3 with value: 0.6043369950268936.
[I 2025-10-29 22:26:48,635] Trial 4 finished with value: 0.6043369950268936 and parameters: {'learning_rate': 0.0015132677330869004, 'num_leaves': 47, 'feature_fraction': 0.51527402252827, 'bagging_fraction': 0.9575232119500499, 'bagging_freq': 5, 'max_depth': 19, 'min_data_in_leaf': 74, 'lambda_l1': 4.5176853186541965, 'lambda_l2': 4.517531617625338}. Best is trial 3 with value: 0.6043369950268936.
[I 2025-10-29 22:26:49,042] Trial 1 finished with value: 0.7942902137032729 and

In [19]:
study.best_value

0.7993517362302607

In [20]:
best_trial = study.best_trial
print("Best trial parameters:", best_trial.params)
print("Best trial accuracy:", best_trial.value)

Best trial parameters: {'learning_rate': 0.0634930867697205, 'num_leaves': 29, 'feature_fraction': 0.876805103274048, 'bagging_fraction': 0.7192117889138065, 'bagging_freq': 3, 'max_depth': 11, 'min_data_in_leaf': 69, 'lambda_l1': 2.202802606944329, 'lambda_l2': 2.3602174623340955}
Best trial accuracy: 0.7993517362302607


In [21]:
lgm_params={
    
}

In [22]:

lgm=LGBMClassifier(**lgm_params)

In [23]:
lgm.fit(X_train_trans, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004648 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1157
[LightGBM] [Info] Number of data points in the train set: 92491, number of used features: 18
[LightGBM] [Info] Start training from score -0.304828
[LightGBM] [Info] Start training from score -1.456596
[LightGBM] [Info] Start training from score -3.515874


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.1
,n_estimators,100
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [24]:
y_train_pred = lgm.predict(X_train_trans)
y_test_pred = lgm.predict(X_test_trans)

In [25]:
print(f"The train accuracy score is:{round(accuracy_score(y_train,y_train_pred),4)}")
print(f"The test accuracy score is:{round(accuracy_score(y_test,y_test_pred),4)}")

The train accuracy score is:0.8347
The test accuracy score is:0.8079


In [26]:
print(f"The train f1 score is:{round(f1_score(y_train,y_train_pred, average='weighted'),4)}")
print(f"The test f1 score is:{round(f1_score(y_test,y_test_pred, average='weighted'),4)}")

The train f1 score is:0.8263
The test f1 score is:0.7994
