In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,f1_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('preprocessed_fire_data_clean.csv')

In [3]:
df.head()

Unnamed: 0,latitude,longitude,bright_ti4,scan,track,satellite,instrument,confidence,version,bright_ti5,...,month,day_of_year,week_of_year,season,is_fire_season,is_daytime,acq_hour,is_afternoon,region,severity_class
0,-20.78661,132.38303,349.53,0.38,0.36,N,VIIRS,50,2,320.51,...,10,287,41,Spring,0,1,4,0,Central,1
1,-17.00355,133.44322,343.15,0.4,0.44,N,VIIRS,75,2,295.42,...,12,345,50,Summer,1,1,5,0,North,0
2,-21.43215,117.2464,340.71,0.5,0.41,N,VIIRS,75,2,303.82,...,6,158,23,Winter,0,1,6,0,Central,2
3,-13.0757,133.61002,336.26,0.48,0.4,N,VIIRS,75,2,289.84,...,5,130,19,Autumn,0,1,4,0,North,0
4,-12.99031,135.68378,341.91,0.42,0.45,N,VIIRS,75,2,305.88,...,7,212,31,Winter,0,1,4,0,North,0


### Removing multicollinear features:

In [4]:
def remove_high_corr(df, threshold=0.85):

    numeric_df = df.select_dtypes(include=[np.number])
    
    if len(numeric_df.columns) == 0:
        print("No numeric columns found")
        return df
    
    corr_matrix = numeric_df.corr().abs()
    
    upper_triangle = np.triu(np.ones_like(corr_matrix, dtype=bool), k=1)

    high_corr_pairs = []
    for i in range(len(corr_matrix.columns)):
        for j in range(i+1, len(corr_matrix.columns)):
            if corr_matrix.iloc[i, j] >= threshold:
                high_corr_pairs.append((corr_matrix.columns[i], corr_matrix.columns[j]))
    
    columns_to_remove = set()
    for col1, col2 in high_corr_pairs:
        columns_to_remove.add(col2)  
    
    df_clean = df.drop(columns=columns_to_remove)
    
    print(f"Removed {len(columns_to_remove)} columns: {list(columns_to_remove)}")
    return df_clean

In [5]:
df_clean = remove_high_corr(df, threshold=0.85)

Removed 4 columns: ['week_of_year', 'is_afternoon', 'acq_hour', 'day_of_year']


In [6]:
# Test model consistency across multiple random samples
def validate_sample_consistency(model, X, y, n_samples=5, sample_size=0.3, random_state=42):
    
    f1_scores = []
    
    print(f"Testing model consistency across {n_samples} samples")
    
    for i in range(n_samples):
        X_sample, _, y_sample, _ = train_test_split(
            X, y, 
            train_size=sample_size, 
            random_state=random_state + i,
            stratify=y
        )
        
        # Make predictions and calculate F1
        y_pred_sample = model.predict(X_sample)
        f1 = f1_score(y_sample, y_pred_sample, average='weighted')
        f1_scores.append(f1)
        
        print(f"Sample {i+1}: F1 = {f1:.4f}")
    
    # Calculate consistency metrics
    f1_mean = np.mean(f1_scores)
    f1_std = np.std(f1_scores)
    f1_cv = (f1_std / f1_mean) * 100  # Coefficient of variation
    
    print(f"Consistency Summary:")
    print(f"Mean F1: {f1_mean:.4f}")
    print(f"Std Dev: {f1_std:.4f}")
    print(f"Coefficient of Variation: {f1_cv:.2f}%")
    print(f"Consistency: {'GOOD' if f1_cv < 5 else 'VARIABLE' if f1_cv < 10 else 'UNSTABLE'}")
    
    return f1_scores


In [7]:
df_clean.head()

Unnamed: 0,latitude,longitude,bright_ti4,scan,track,satellite,instrument,confidence,version,bright_ti5,daynight,year,month,season,is_fire_season,is_daytime,region,severity_class
0,-20.78661,132.38303,349.53,0.38,0.36,N,VIIRS,50,2,320.51,D,2023,10,Spring,0,1,Central,1
1,-17.00355,133.44322,343.15,0.4,0.44,N,VIIRS,75,2,295.42,D,2023,12,Summer,1,1,North,0
2,-21.43215,117.2464,340.71,0.5,0.41,N,VIIRS,75,2,303.82,D,2024,6,Winter,0,1,Central,2
3,-13.0757,133.61002,336.26,0.48,0.4,N,VIIRS,75,2,289.84,D,2024,5,Autumn,0,1,North,0
4,-12.99031,135.68378,341.91,0.42,0.45,N,VIIRS,75,2,305.88,D,2020,7,Winter,0,1,North,0


In [8]:
# Separate features and target
X = df_clean.drop('severity_class', axis=1)
y = df_clean['severity_class']

In [9]:
# Identify categorical columns for encoding
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
print(f"Categorical columns to encode: {categorical_cols}")

# One-hot encode categorical variables
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

Categorical columns to encode: ['satellite', 'instrument', 'daynight', 'season', 'region']


In [10]:
train_mask = df_clean['year'].isin([2020, 2021, 2022]) # Training on 2020,2021,2022
test_mask = df_clean['year'].isin([2023, 2024]) # Testing on 2023,2024

In [11]:
X_train = X_encoded[train_mask]
X_test = X_encoded[test_mask]
y_train = y[train_mask]
y_test = y[test_mask]

In [12]:
print(f"Training set: {len(X_train):,}")
print(f"Test set: {len(X_test):,}")

Training set: 92,491
Test set: 107,509


In [13]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print(f"Features scaled")

Features scaled


In [14]:

models = {}

# Model 1: Logistic Regression (Baseline)
lr_model = LogisticRegression(
    max_iter=1000,
    random_state=10,
    multi_class='multinomial',
    class_weight='balanced' 
)
lr_model.fit(X_train_scaled, y_train)
models['Logistic Regression'] = (lr_model, X_train_scaled, X_test_scaled)
print("Training complete")

Training complete


In [15]:
# Model 2: Random Forest
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=20
)
rf_model.fit(X_train, y_train)
models['Random Forest'] = (rf_model, X_train, X_test)
print("Training complete")



Training complete


In [16]:
print("MODEL EVALUATION")

results_summary = []

for model_name, (model, X_train_data, X_test_data) in models.items():
    print(f"\n{model_name}")
    print("="*50)
    
    # Predictions for both training and testing
    y_pred_train = model.predict(X_train_data)
    y_pred_test = model.predict(X_test_data)
    
    # Metrics for training set
    accuracy_train = accuracy_score(y_train, y_pred_train)
    f1_train = f1_score(y_train, y_pred_train, average='weighted')
    
    # Metrics for testing set
    accuracy_test = accuracy_score(y_test, y_pred_test)
    f1_test = f1_score(y_test, y_pred_test, average='weighted')
    
    print(f"Training -  Accuracy: {accuracy_train:.4f}, F1: {f1_train:.4f}")
    print(f"Testing  -  Accuracy: {accuracy_test:.4f}, F1: {f1_test:.4f}")
    
    # Store results
    results_summary.append({
        'Model': model_name,
        'Train_Accuracy': accuracy_train,
        'Test_Accuracy': accuracy_test,
        'Train_F1': f1_train,
        'Test_F1': f1_test
    })
    


MODEL EVALUATION

Logistic Regression
Training -  Accuracy: 0.6909, F1: 0.7295
Testing  -  Accuracy: 0.6502, F1: 0.6981

Random Forest
Training -  Accuracy: 0.9658, F1: 0.9651
Testing  -  Accuracy: 0.8040, F1: 0.7943


In [17]:
feature_names = X_encoded.columns.tolist()

In [18]:
feature_importance = pd.DataFrame({
    'feature': feature_names,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

print(f"\n  Top 5 Important Features:")
for idx, row in feature_importance.tail(10).iterrows():
    print(f"    {row['feature']:<25} {row['importance']:.4f}")


  Top 5 Important Features:
    month                     0.0307
    daynight_N                0.0304
    year                      0.0212
    is_fire_season            0.0065
    region_North              0.0055
    season_Spring             0.0054
    season_Summer             0.0049
    season_Winter             0.0048
    region_South              0.0046
    version                   0.0000


In [19]:
# ADD THE CONSISTENCY CHECK RIGHT HERE:
print("\n" + "="*70)
print("MODEL EVALUATION WITH CONSISTENCY CHECK")
print("="*70)

for model_name, (model, X_train_data, X_test_data) in models.items():
    print(f"\n{model_name}")
    print("="*50)
    
    # Your existing evaluation (optional - you can keep or remove this duplicate)
    y_pred_test = model.predict(X_test_data)
    accuracy_test = accuracy_score(y_test, y_pred_test)
    f1_test = f1_score(y_test, y_pred_test, average='weighted')
    print(f"Overall Test - Accuracy: {accuracy_test:.4f}, F1: {f1_test:.4f}")
    
    # NEW: Sample consistency check
    f1_scores = validate_sample_consistency(model, X_test_data, y_test, n_samples=5)


MODEL EVALUATION WITH CONSISTENCY CHECK

Logistic Regression
Overall Test - Accuracy: 0.6502, F1: 0.6981
Testing model consistency across 5 samples
Sample 1: F1 = 0.6969
Sample 2: F1 = 0.6976
Sample 3: F1 = 0.6969
Sample 4: F1 = 0.6981
Sample 5: F1 = 0.6975
Consistency Summary:
Mean F1: 0.6974
Std Dev: 0.0005
Coefficient of Variation: 0.07%
Consistency: GOOD

Random Forest
Overall Test - Accuracy: 0.8040, F1: 0.7943
Testing model consistency across 5 samples
Sample 1: F1 = 0.7956
Sample 2: F1 = 0.7952
Sample 3: F1 = 0.7945
Sample 4: F1 = 0.7949
Sample 5: F1 = 0.7959
Consistency Summary:
Mean F1: 0.7952
Std Dev: 0.0005
Coefficient of Variation: 0.06%
Consistency: GOOD
