This is the code for random forest training by Raymond Xu.

 In this edition, the longitude and latitude of the original wildfire dataset is used as inputs.

 Here the temporal sequence is not considered in the data splitting.


In [1]:
''' This is Research Project titled ML Algorithms for Alberta Forest Occurence Prediction.
    This is the 8th Engineering Research Project, and is hereby delcared as

                            Project Christopher

    Version 2.0 - Random Forest Classifier
    Data Source: European Space Agency - ERA5
                 Government of Alberta - Historical Wildfire registry and Fire Weather Indices
                 Natural Resources Canada - Vegetation Classification of Canada

    AI Diederik - Hongik Ingan, For the Benefits of All Mankind
'''

import math
import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder,LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif,chi2
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
set_config(transform_output = "pandas")

In [2]:
# Load wildfire and non-wildfire datasets
main_df = pd.read_csv("downsampled_df.csv",index_col=0)

# Remove the first unnamed column
wf_df=main_df[main_df['fire']==1]
nwf_df=main_df[main_df['fire']==0]

# Define Split ratio, aka percentage of the combined data goes to training
split=[0.6,0.2,0.2]
wf_sort = wf_df.sort_values(by='date')
nwf_sort = nwf_df.sort_values(by='date')


for i in [0,1,2]:
    wf_memory=0
    nfw_memory=0
    wf_selected=[]
    nwf_selected=[]
    wf_size = int(split[i] * len(wf_df))
    nwf_size = int(split[i] * len(nwf_df))
    if i==0:
        wf_selected=wf_sort[:wf_size]
        nwf_selected=nwf_sort[:nwf_size]
        wf_memory=wf_size
        nwf_memory=nwf_size
        train_data = pd.concat([wf_selected, nwf_selected])
    if i==1:
        wf_selected=wf_sort[wf_memory:wf_memory+wf_size]
        nwf_selected=nwf_sort[nwf_memory:nwf_memory+nwf_size]
        wf_memory=wf_size
        nwf_memory=nwf_size
        test_data = pd.concat([wf_selected, nwf_selected])
    if i==2:
        wf_selected=wf_sort[wf_memory:]
        nwf_selected=nwf_sort[nwf_memory:]
        val_data = pd.concat([wf_selected, nwf_selected])

X_train = train_data.drop(columns={'fire','date'})
X_test = test_data.drop(columns={'fire','date'})
X_val = val_data.drop(columns={'fire','date'})
y_train = train_data['fire']
y_test = test_data['fire']
y_val = val_data['fire']


Set up the pipeline

In [7]:
# Manual separate numerical and categorical columns

# Select variables
#pass_features = [ 'leaf_area_index_high_vegetation']
#categorical_features = []
#numeric_features = ['fire_count_past_3Days', 'global_noon_LST_2m_temperature', 'FFMC', 'DMC', 'FWI', 'BUI', 'global_noon_LST_relative_humidity', '24hr_max_temperature']

# Almost all variables
pass_features = [ 'leaf_area_index_high_vegetation','slope_of_sub_gridscale_orography']
categorical_features = ['type_of_high_vegetation','type_of_low_vegetation']
numeric_features = ['high_vegetation_cover',
       'low_vegetation_cover', 
       '24hr_accumulated_precipitation', 
       '24hr_max_temperature',
       'global_noon_LST_2m_temperature', 
       'global_noon_LST_relative_humidity',
       'global_noon_LST_wind_speed', 
       'BUI', 
       'DC', 
       'DMC', 
       'FFMC', 
       'FWI',
       'fire_count_past_3Days', 
       'fire_count_past_30Days',
       '24hr_max_temperature_1dayLag', 
       '24hr_max_temperature_2dayLag',
       'global_noon_LST_2m_temperature_1dayLag',
       'global_noon_LST_2m_temperature_2dayLag']

# Define numeric and categorical transformer below
scale=ColumnTransformer([('scale_transformer',StandardScaler(),numeric_features)],verbose_feature_names_out=False).set_output(transform='pandas')

cate=ColumnTransformer([('categorical_transformer',OneHotEncoder(sparse_output=False),categorical_features)],verbose_feature_names_out=False).set_output(transform='pandas')

pss=ColumnTransformer([('Pass_transformer','passthrough',pass_features)],verbose_feature_names_out=False).set_output(transform='pandas')

Data_pipeline = Pipeline(steps=[
    ('Feature Union',FeatureUnion([('numeric', scale),('categorical',cate),('pass',pss)])),
    ]
    )

In [8]:
X_train_processed=Data_pipeline.fit_transform(X_train)
X_test_processed=Data_pipeline.transform(X_test)

In [9]:
X_train.keys()

Index(['latitude', 'longitude', 'high_vegetation_cover',
       'leaf_area_index_high_vegetation', 'leaf_area_index_low_vegetation',
       'low_vegetation_cover', 'slope_of_sub_gridscale_orography',
       'type_of_high_vegetation', 'type_of_low_vegetation',
       '24hr_accumulated_precipitation', '24hr_max_temperature',
       'global_noon_LST_2m_temperature', 'global_noon_LST_relative_humidity',
       'global_noon_LST_wind_speed', 'BUI', 'DC', 'DMC', 'FFMC', 'FWI',
       'fire_count_past_3Days', 'fire_count_past_7Days',
       'fire_count_past_10Days', 'fire_count_past_30Days',
       '24hr_max_temperature_1dayLag', '24hr_max_temperature_2dayLag',
       'global_noon_LST_2m_temperature_1dayLag',
       'global_noon_LST_2m_temperature_2dayLag'],
      dtype='object')

In [10]:
# nested cross-validation
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, make_scorer


#F1 scorer
f1 = make_scorer(f1_score , average='macro')

p_grid = {"C": [1, 10, 100]} #"gamma": [0.01, 0.1]


sv_classifier = SVC(kernel="rbf",random_state=42)

grid_search = GridSearchCV(sv_classifier, p_grid, cv=5, scoring=f1,n_jobs=-1,verbose=1)
grid_search.fit(X_train_processed, y_train)


# Best parameters and best score
print("Best parameters:", grid_search.best_params_)
print(f"F1 Score: {100*grid_search.best_score_:.2f}%")

best_classifier = grid_search.best_estimator_

# For the feature space:
# pass_features = [ 'leaf_area_index_high_vegetation']
# categorical_features = []
# numeric_features = ['fire_count_past_3Days', 'global_noon_LST_2m_temperature', 'FFMC', 'DMC', 'FWI', 'BUI', 'global_noon_LST_relative_humidity', '24hr_max_temperature']
# Best params are:
# Fitting 5 folds for each of 3 candidates, totalling 15 fits
# Best parameters: {'C': 10}
# F1 Score: 74.12%

# For a (nearly) full feature space:
# Fitting 5 folds for each of 3 candidates, totalling 15 fits
# Best parameters: {'C': 10}
# F1 Score: 74.60%

Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best parameters: {'C': 10}
F1 Score: 74.60%


In [11]:
# nested cross-validation
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, make_scorer

#F1 scorer
f1 = make_scorer(f1_score , average='macro')

p_grid = {"C": [8, 10],
          "gamma": ['scale', 0.01, 0.1]
}

sv_classifier = SVC(kernel="rbf",random_state=42)

grid_search = GridSearchCV(sv_classifier, p_grid, cv=4, scoring=f1,n_jobs=-1,verbose=1)
grid_search.fit(X_train_processed, y_train)


# Best parameters and best score
print("Best parameters:", grid_search.best_params_)
print(f"F1 Score: {100*grid_search.best_score_:.2f}%")

best_classifier = grid_search.best_estimator_

# For the nearly full parameter space
# Fitting 4 folds for each of 6 candidates, totalling 24 fits
# Best parameters: {'C': 8, 'gamma': 'scale'}
# F1 Score: 74.56%

Fitting 4 folds for each of 6 candidates, totalling 24 fits
Best parameters: {'C': 8, 'gamma': 'scale'}
F1 Score: 74.56%


In [13]:
import joblib
joblib.dump(best_classifier, 'svc_model_cv_V1.joblib')

['svc_model_cv_V1.joblib']

In [15]:
# Predict on the validation set
y_test_pred = best_classifier.predict(X_test_processed)

# Evaluate the accuracy of the model on the validation set
accuracy = accuracy_score(y_test, y_test_pred)
print("Validation set accuracy:", accuracy)
print('f1 score: ',f1_score(y_test, y_test_pred))

Validation set accuracy: 0.8264626717665392
f1 score:  0.6021435531016563
