This is the code for random forest training by Raymond Xu.

 In this edition, the longitude and latitude of the original wildfire dataset is used as inputs.

 Here the temporal sequence is not considered in the data splitting.
 

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os

# Load wildfire and non-wildfire datasets
wf_df = pd.read_csv("G:/Shared drives/MECE 788 - Forest Fire Prediction/04_Preprocessing/Cleanup_and_point_selection/wf_original lon and lat.csv",index_col=0)
nwf_df = pd.read_csv("G:/Shared drives/MECE 788 - Forest Fire Prediction/04_Preprocessing/Cleanup_and_point_selection/downsampled_df.csv",index_col=0)

# Add a column to indicate whether the data point represents wildfire or not
wf_df['label'] = 1
nwf_df['label'] = 0

wf_df=wf_df.drop(columns={'latitude_ERA5','longitude_ERA5'})

nwf_df=nwf_df.rename({
    'latitude_ERA5':'latitude',
    'longitude_ERA5':'longitude'
})

# Combine the datasets
combined_data = pd.concat([wf_df, nwf_df], ignore_index=True)
combined_data = combined_data.drop(columns={'date'})

# Shuffle the combined dataset
combined_data = combined_data.sample(frac=1, random_state=42).reset_index()
combined_data = combined_data.iloc[: , 2:]

# Divide the dataset into training, testing, and validation sets
train_ratio = 0.7
test_ratio = 0.20
val_ratio = 0.10

train_size = int(train_ratio * len(combined_data))
test_size = int(test_ratio * len(combined_data))

train_data = combined_data[:train_size]
test_data = combined_data[train_size:train_size+test_size]
val_data = combined_data[train_size+test_size:]

# Select equal numbers of wildfire and non-wildfire data points for the training set
num_wildfires = train_data['label'].sum()
num_non_wildfires = train_size - num_wildfires

wildfire_train = train_data[train_data['label'] == 1].sample(n=num_wildfires, replace=False)
non_wildfire_train = train_data[train_data['label'] == 0].sample(n=num_non_wildfires, replace=False)
wildfire_test = test_data[test_data['label'] == 1]
non_wildfire_test = test_data[test_data['label'] == 0]
print("Training fire set size:", len(wildfire_train))
print("Training non-fire set size:", len(non_wildfire_train))
print("Training fire set size:", len(wildfire_test))
print("Training non-fire set size:", len(non_wildfire_test))

train_set = pd.concat([wildfire_train, non_wildfire_train])

# Remove selected data points from the training set
train_data = train_data.drop(train_set.index)

# Output sizes of training, testing, and validation sets
print('-------------------------------------')
print("Training set size:", len(train_set))
print("Testing set size:", len(test_data))
print("Validation set size:", len(val_data))


Training fire set size: 13965
Training non-fire set size: 39369
Training fire set size: 4013
Training non-fire set size: 11225
-------------------------------------
Training set size: 53334
Testing set size: 15238
Validation set size: 7620


In [16]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
Index: 53334 entries, 24639 to 29848
Data columns (total 28 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   latitude                                53334 non-null  float64
 1   longitude                               53334 non-null  float64
 2   high_vegetation_cover                   53334 non-null  float64
 3   leaf_area_index_high_vegetation         53334 non-null  float64
 4   leaf_area_index_low_vegetation          53334 non-null  float64
 5   low_vegetation_cover                    53334 non-null  float64
 6   slope_of_sub_gridscale_orography        53334 non-null  float64
 7   type_of_high_vegetation                 53334 non-null  float64
 8   type_of_low_vegetation                  53334 non-null  float64
 9   24hr_accumulated_precipitation          53334 non-null  float64
 10  24hr_max_temperature                    53334 non-null  flo

In [4]:
# Divide the dataset into training, testing, and validation sets
# train_ratio = 0.7
# test_ratio = 0.20
# val_ratio = 0.10

#train_size = int(train_ratio * len(wf_df))
#test_size = int(test_ratio * len(wf_df))

#train_wf = wf_df[:train_size]
#test_wf = wf_df[train_size:train_size+test_size]
#val_wf = wf_df[train_size+test_size:]
#train_nwf = nwf_df[:train_size]
#test_nwf = nwf_df[train_size:train_size+test_size]
#val_nwf = nwf_df[train_size+test_size:]

# Select equal numbers of wildfire and non-wildfire data points for the training set
#print("Training set wildfire size:", len(train_wf))
#print("Training set non-wildfire size:", len(train_nwf))

#train_set = pd.concat([train_wf, train_nwf])

# Remove selected data points from the training set
#train_data = train_data.drop(train_set.index)

# Output sizes of training, testing, and validation sets
#print("Training set size:", len(train_set))
#print("Testing set size:", len(test_data))
#print("Validation set size:", len(val_data))

In [20]:
# Separate features and labels
X_train = train_set.drop(columns=['label'])
y_train = train_set['label']
X_test = test_data.drop(columns=['label'])
y_test = test_data['label']
X_val = val_data.drop(columns=['label'])
y_val = val_data['label']
# Scale the features using means
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_val_scaled = scaler.transform(X_val)

# SelectKB
from sklearn.feature_selection import SelectKBest, f_classif,chi2
SelectKB_train = SelectKBest(f_classif,k=10)
SelectKB_train.fit(pd.DataFrame(X_train_scaled,columns=X_train.columns),y_train)
feature_names=SelectKB_train.get_feature_names_out()
print(feature_names)

# PCA Analysis
from sklearn.decomposition import PCA
pca = PCA()
X_pca = pca.fit_transform(X_train_scaled)
total_explained_variance = pca.explained_variance_ratio_.cumsum()
n_over_90 = len(total_explained_variance[total_explained_variance >= .90])
n_to_reach_90 = X_train_scaled.shape[1] - n_over_90 + 1
print("Number of features in the original dataset: {}".format(X_train_scaled.shape[1]))
print("Number of features: {}\tTotal Variance Explained: {}".format(n_to_reach_90, total_explained_variance[n_to_reach_90-1]))


['leaf_area_index_high_vegetation' '24hr_max_temperature'
 'global_noon_LST_2m_temperature' 'BUI' 'FFMC' 'fire'
 '24hr_max_temperature_1dayLag' '24hr_max_temperature_2dayLag'
 'global_noon_LST_2m_temperature_1dayLag'
 'global_noon_LST_2m_temperature_2dayLag']
Number of features in the original dataset: 27
Number of features: 14	Total Variance Explained: 0.9065804145280709


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

# Initialize and train the random forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

array(['x3', 'x10', 'x11', 'x14', 'x17', 'x19', 'x20', 'x21', 'x22',
       'x23'], dtype=object)

In [7]:
# Best parameters and best score
print("Best parameters:", grid_search.best_params_)
print(f"MAPE: {100*grid_search.best_score_:.2f}%")

NameError: name 'grid_search' is not defined

In [None]:

# Predict on the validation set
y_val_pred = rf_classifier.predict(X_val_scaled)

# Evaluate the accuracy of the model on the validation set
accuracy = accuracy_score(y_val, y_val_pred)
print("Validation set accuracy:", accuracy)