In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,f1_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the preprocessed fire-detection CSV (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='preprocessed_fire_data_clean.csv')

In [3]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(5)

Unnamed: 0,latitude,longitude,scan,track,satellite,instrument,confidence,version,daynight,year,month,day_of_year,week_of_year,season,is_fire_season,is_daytime,acq_hour,is_afternoon,region,severity_class
0,-12.07912,134.5526,0.45,0.47,N,VIIRS,75,2,D,2023,6,180,26,Winter,0,1,5,0,North,1
1,-15.7543,129.01073,0.37,0.58,N,VIIRS,75,2,D,2021,11,320,46,Spring,1,1,4,0,North,0
2,-21.43215,117.2464,0.5,0.41,N,VIIRS,75,2,D,2024,6,158,23,Winter,0,1,6,0,Central,2
3,-17.89678,130.74759,0.49,0.65,N,VIIRS,75,2,D,2024,4,111,16,Autumn,0,1,4,0,North,0
4,-24.16475,151.12544,0.37,0.58,N,VIIRS,75,2,D,2023,10,294,42,Spring,0,1,4,0,Central,0


### Removing multicollinear features:

In [4]:
def remove_high_corr(df, threshold=0.85):
    """Drop numeric columns that are highly correlated with an earlier, kept column.

    Columns are visited in their original order. A column is removed when its
    absolute Pearson correlation with a previously *kept* numeric column is at
    least ``threshold``. Columns that have already been removed are not used as
    a reference, so a chain such as A~B, B~C (with A and C uncorrelated) only
    removes B instead of removing both B and C.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; non-numeric columns are never removed.
    threshold : float, default 0.85
        Absolute correlation at or above which the later column of a pair is
        removed.

    Returns
    -------
    pandas.DataFrame
        A new frame without the removed columns. If ``df`` has no numeric
        columns, ``df`` itself is returned unchanged.
    """
    numeric_df = df.select_dtypes(include=[np.number])

    if len(numeric_df.columns) == 0:
        print("No numeric columns found")
        return df

    numeric_cols = list(numeric_df.columns)
    # Work on a plain array so lookups are positional (robust to repeated labels).
    corr_values = numeric_df.corr().abs().to_numpy()

    dropped_positions = set()
    for i in range(len(numeric_cols)):
        # A column that is already going away must not knock out further columns.
        if i in dropped_positions:
            continue
        for j in range(i + 1, len(numeric_cols)):
            # NaN correlations (e.g. constant columns) compare False and are kept.
            if j not in dropped_positions and corr_values[i, j] >= threshold:
                dropped_positions.add(j)

    # Sorted positions give a deterministic, column-ordered report.
    columns_to_remove = [numeric_cols[j] for j in sorted(dropped_positions)]

    df_clean = df.drop(columns=columns_to_remove)

    print(f"Removed {len(columns_to_remove)} columns: {list(columns_to_remove)}")
    return df_clean

In [5]:
# Prune correlated predictors only. The target is held out of the correlation
# check so that a strongly predictive feature can never cause the label column
# itself to be removed (which would break the feature/target split below).
feature_frame = df.drop(columns=['severity_class'])
pruned_features = remove_high_corr(feature_frame, threshold=0.85)
dropped_features = [col for col in feature_frame.columns if col not in pruned_features.columns]
df_clean = df.drop(columns=dropped_features)

Removed 4 columns: ['is_afternoon', 'day_of_year', 'acq_hour', 'week_of_year']


In [6]:
# Preview the first five rows after correlated columns were removed.
df_clean.head(5)

Unnamed: 0,latitude,longitude,scan,track,satellite,instrument,confidence,version,daynight,year,month,season,is_fire_season,is_daytime,region,severity_class
0,-12.07912,134.5526,0.45,0.47,N,VIIRS,75,2,D,2023,6,Winter,0,1,North,1
1,-15.7543,129.01073,0.37,0.58,N,VIIRS,75,2,D,2021,11,Spring,1,1,North,0
2,-21.43215,117.2464,0.5,0.41,N,VIIRS,75,2,D,2024,6,Winter,0,1,Central,2
3,-17.89678,130.74759,0.49,0.65,N,VIIRS,75,2,D,2024,4,Autumn,0,1,North,0
4,-24.16475,151.12544,0.37,0.58,N,VIIRS,75,2,D,2023,10,Spring,0,1,Central,0


In [7]:
# Split the frame into the label vector and the predictor matrix
y = df_clean['severity_class']
X = df_clean.drop(columns='severity_class')

In [8]:
# Collect the object/category-typed predictors that need dummy encoding
categorical_cols = list(X.select_dtypes(include=['object', 'category']).columns)
print(f"Categorical columns to encode: {categorical_cols}")

# Expand each categorical column into indicator columns; the first level of
# every column is dropped
X_encoded = pd.get_dummies(data=X, columns=categorical_cols, drop_first=True)

Categorical columns to encode: ['satellite', 'instrument', 'daynight', 'season', 'region']


In [9]:
# Hold-out split keyed on the `year` column
year_values = df_clean['year']
train_mask = year_values.isin([2020, 2021, 2022])  # rows used for fitting
test_mask = year_values.isin([2023, 2024])  # rows used for evaluation

In [10]:
# Apply the boolean year masks to the encoded predictors and to the labels
X_train, y_train = X_encoded.loc[train_mask], y.loc[train_mask]
X_test, y_test = X_encoded.loc[test_mask], y.loc[test_mask]

In [11]:
# Report how many rows ended up in each split
for split_label, split_frame in (("Training set", X_train), ("Test set", X_test)):
    print(f"{split_label}: {len(split_frame):,}")

Training set: 92,786
Test set: 107,214


In [12]:

print("Scaling features")

# Fit the scaler on the training rows only, then apply it to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("Features scaled")

Scaling features
Features scaled


In [13]:

# Registry of fitted models: name -> (estimator, train matrix, test matrix).
# Each entry carries the exact matrices the estimator was fitted/evaluated on.
models = {}

# Model 1: Logistic Regression (Baseline)
print("Logistic Regression")
# `multi_class` is deliberately not passed: the parameter is deprecated in
# recent scikit-learn releases, and with the default lbfgs solver a multinomial
# loss is already what is used for targets with more than two classes.
lr_model = LogisticRegression(
    max_iter=1000,
    random_state=10,
    class_weight='balanced'  # reweight classes inversely to their frequency
)
# Fitted on the standardized features produced by the scaler
lr_model.fit(X_train_scaled, y_train)
models['Logistic Regression'] = (lr_model, X_train_scaled, X_test_scaled)
print("Training complete")

Logistic Regression
Training complete


In [14]:
# Model 2: Random Forest
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    min_samples_split=50,
    min_samples_leaf=20,
    # Fixed seed (same value as the logistic regression cell) so the bootstrap
    # samples and feature sub-sampling, and therefore the reported metrics,
    # are reproducible across kernel restarts.
    random_state=10,
)
# Trained on the unscaled features (X_train rather than X_train_scaled)
rf_model.fit(X_train, y_train)
models['Random Forest'] = (rf_model, X_train, X_test)
print("Training complete")



Training complete


In [15]:
print("MODEL EVALUATION")

# One summary row (dict) per evaluated model
results_summary = []

for model_name, (model, X_train_data, X_test_data) in models.items():
    print(f"\n{model_name}")
    print("=" * 50)

    # Predict on the matrices stored alongside the model in `models`
    y_pred_train, y_pred_test = model.predict(X_train_data), model.predict(X_test_data)

    # Training-split metrics
    accuracy_train = accuracy_score(y_train, y_pred_train)
    f1_train = f1_score(y_train, y_pred_train, average='weighted')

    # Test-split metrics
    accuracy_test = accuracy_score(y_test, y_pred_test)
    f1_test = f1_score(y_test, y_pred_test, average='weighted')

    print(f"Training -  Accuracy: {accuracy_train:.4f}, F1: {f1_train:.4f}")
    print(f"Testing  -  Accuracy: {accuracy_test:.4f}, F1: {f1_test:.4f}")

    # Keep the numbers for later tabulation
    results_summary.append(dict(
        Model=model_name,
        Train_Accuracy=accuracy_train,
        Test_Accuracy=accuracy_test,
        Train_F1=f1_train,
        Test_F1=f1_test,
    ))
    


MODEL EVALUATION

Logistic Regression
Training -  Accuracy: 0.5908, F1: 0.6459
Testing  -  Accuracy: 0.5554, F1: 0.6185

Random Forest
Training -  Accuracy: 0.7984, F1: 0.7676
Testing  -  Accuracy: 0.7599, F1: 0.7223
