In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pylab as plt
import seaborn as sns

from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import set_config
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,f1_score
from sklearn.utils.class_weight import compute_class_weight



# !pip install optuna
import optuna


set_config(transform_output="pandas")
plt.style.use('fivethirtyeight')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('preprocessed_fire_data_clean.csv')

In [3]:
df.shape

(200000, 22)

In [4]:
def remove_high_corr(df, threshold=0.85):

    numeric_df = df.select_dtypes(include=[np.number])
    
    if len(numeric_df.columns) == 0:
        print("No numeric columns found")
        return df
    
    corr_matrix = numeric_df.corr().abs()
    
    upper_triangle = np.triu(np.ones_like(corr_matrix, dtype=bool), k=1)

    high_corr_pairs = []
    for i in range(len(corr_matrix.columns)):
        for j in range(i+1, len(corr_matrix.columns)):
            if corr_matrix.iloc[i, j] >= threshold:
                high_corr_pairs.append((corr_matrix.columns[i], corr_matrix.columns[j]))
    
    columns_to_remove = set()
    for col1, col2 in high_corr_pairs:
        columns_to_remove.add(col2)  
    
    df_clean = df.drop(columns=columns_to_remove)
    
    print(f"Removed {len(columns_to_remove)} columns: {list(columns_to_remove)}")
    return df_clean

In [5]:
df_clean = remove_high_corr(df, threshold=0.85)

Removed 4 columns: ['acq_hour', 'is_afternoon', 'week_of_year', 'day_of_year']


In [7]:
df_clean.columns

Index(['latitude', 'longitude', 'bright_ti4', 'scan', 'track', 'satellite',
       'instrument', 'confidence', 'version', 'bright_ti5', 'daynight', 'year',
       'month', 'season', 'is_fire_season', 'is_daytime', 'region',
       'severity_class'],
      dtype='object')

In [None]:
# Separate features and target
X = df_clean.drop(['severity_class','instrument','version'], axis=1)
y = df_clean['severity_class']

In [9]:
train_mask = df_clean['year'].isin([2020, 2021, 2022]) # Training on 2020,2021,2022
test_mask = df_clean['year'].isin([2023, 2024]) # Testing on 2023,2024

In [10]:
X_train = X[train_mask]
X_test = X[test_mask]
y_train = y[train_mask]
y_test = y[test_mask]

In [11]:
categorical_cols=[cols for cols in X_train.columns if X_train[cols].dtypes in ['object']]
categorical_cols

['satellite', 'daynight', 'season', 'region']

In [12]:
ohe_encode=['satellite','daynight','season','region']

In [13]:
numerical_cols=[cols for cols in X_train.columns if X_train[cols].dtypes in ['int64','float64']]
numerical_cols

['latitude',
 'longitude',
 'bright_ti4',
 'scan',
 'track',
 'confidence',
 'bright_ti5',
 'month',
 'is_fire_season',
 'is_daytime']

In [14]:
preprocessor=ColumnTransformer(transformers=[
    ("ohe",OneHotEncoder(drop='first',sparse_output=False),ohe_encode),
    ("ordinal",OrdinalEncoder(categories=[['Summer', 'Autumn', 'Winter', 'Spring']]),["season"]),
    ("scaler",StandardScaler(),numerical_cols)
],remainder='passthrough')

In [15]:
pipeline=Pipeline(steps=[
    ("preprocessor",preprocessor)
]
)

In [16]:
X_train_trans=pipeline.fit_transform(X_train)
X_test_trans=pipeline.transform(X_test)

In [17]:
X_train_trans

Unnamed: 0,ohe__daynight_N,ohe__season_Spring,ohe__season_Summer,ohe__season_Winter,ohe__region_North,ohe__region_South,ordinal__season,scaler__latitude,scaler__longitude,scaler__bright_ti4,scaler__scan,scaler__track,scaler__confidence,scaler__bright_ti5,scaler__month,scaler__is_fire_season,scaler__is_daytime
4,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.919787,-0.006960,0.343207,-0.474795,-0.302197,0.155708,0.507840,0.026119,-0.672395,0.665605
8,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.841074,-0.347221,-0.094947,0.653599,-0.559361,0.155708,-0.003207,-0.573244,-0.672395,0.665605
9,0.0,1.0,0.0,0.0,0.0,0.0,3.0,-0.612836,-0.311499,1.653494,1.217795,1.755118,1.845041,0.990545,1.224844,1.487221,0.665605
11,1.0,0.0,0.0,1.0,1.0,0.0,2.0,1.049544,-0.300408,-1.619350,-0.249116,-0.216476,0.155708,-0.859675,0.026119,-0.672395,-1.502393
14,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.710556,-0.602925,1.653494,-0.587634,0.983625,1.845041,0.698265,0.026119,-0.672395,0.665605
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199994,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.903169,-0.179374,-1.584360,-0.700474,-0.387919,0.155708,-0.777305,-0.573244,-0.672395,-1.502393
199995,0.0,0.0,1.0,0.0,0.0,1.0,0.0,-1.800497,1.055155,1.653494,-0.813313,0.897903,1.845041,0.411299,-1.472288,1.487221,0.665605
199996,0.0,1.0,0.0,0.0,1.0,0.0,3.0,0.107810,0.770635,0.533301,-0.813313,0.897903,0.155708,0.618552,1.224844,1.487221,0.665605
199997,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.504468,0.180015,1.653494,2.797546,2.269447,1.845041,-0.507168,1.524525,1.487221,0.665605


In [18]:
#### Stacking Classifier:
# Best parameter for lgbm:
best_param_lgbm={
    'learning_rate': 0.0634930867697205, 
    'num_leaves': 29, 
    'feature_fraction': 0.876805103274048, 
    'bagging_fraction': 0.7192117889138065, 
    'bagging_freq': 3, 
    'max_depth': 11, 
    'min_data_in_leaf': 69, 
    'lambda_l1': 2.202802606944329, 
    'lambda_l2': 2.3602174623340955
}

# Best parameter for random forest:
best_param_rf={
    'n_estimators': 180, 
    'max_depth': 20
}

# Best parameter for Logistc Regression:
best_param_lr={
    'fit_intercept': False
}

best_rf=RandomForestClassifier(**best_param_rf)
best_lgbm=LGBMClassifier(**best_param_lgbm)
lr=LogisticRegression(**best_param_lr)

In [19]:
clf=StackingClassifier(
    estimators=[("rf",best_rf),("lgbm",best_lgbm)],
    final_estimator=lr,
    cv=5,
    passthrough=True,
    n_jobs=-1)
clf.fit(X_train_trans,y_train)

0,1,2
,estimators,"[('rf', ...), ('lgbm', ...)]"
,final_estimator,LogisticRegre...tercept=False)
,cv,5
,stack_method,'auto'
,n_jobs,-1
,passthrough,True
,verbose,0

0,1,2
,n_estimators,180
,criterion,'gini'
,max_depth,20
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True

0,1,2
,boosting_type,'gbdt'
,num_leaves,29
,max_depth,11
,learning_rate,0.0634930867697205
,n_estimators,100
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,False
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [20]:
y_pred_train=clf.predict(X_train_trans)
y_pred_test=clf.predict(X_test_trans)









In [21]:
print(f"The train accuracy score is:{round(accuracy_score(y_train,y_pred_train),4)}")
print(f"The test accuracy score is:{round(accuracy_score(y_test,y_pred_test),4)}")

The train accuracy score is:0.8774
The test accuracy score is:0.8057


In [22]:
print(f"The train f1 score is:{round(f1_score(y_train,y_pred_train, average='weighted'),4)}")
print(f"The test f1 score is:{round(f1_score(y_test,y_pred_test, average='weighted'),4)}")

The train f1 score is:0.8712
The test f1 score is:0.7968
