In [None]:
''' 
    Version 2.0 - Support Vector Classifier
    Data Sources: European Space Agency - ERA5
                  Government of Alberta - Historical Wildfire Registry and Fire Weather Indices
                  Natural Resources Canada - Vegetation Classification of Canada
'''

In [1]:
# General imports
import sys
import os
import pandas as pd

# Project root, given relative to the current working directory.
PROJECT_ROOT = '../'

# Put the project root on the import path so the project's own packages
# (scripts, models) can be imported below.
sys.path.append(PROJECT_ROOT)

# Directory used for every SVM artifact this notebook saves or loads.
MODEL_PATH = os.path.join(
    PROJECT_ROOT,
    'models',
    'SVM',
)

# Custom functions
from scripts.data_utils import load_downsampled_df, load_full_df, get_train_validation_df, test_train_validation_split, extract_day_of_year
from models.SVM.functions import SVM_preprocess_steps, SVM_predict

# Other imports appear where they are needed: joblib, numpy, and parts of sklearn.

**Model Creation**

In [2]:
## Test, Train, Validation Splits ##

# The downsampled frame is only needed to derive the two partitions below,
# so its name is released as soon as they exist.
main_df = load_downsampled_df(PROJECT_ROOT)
validation_df, test_train_df = get_train_validation_df(main_df)
del main_df

# Feature / label sets for each of the three splits.
(
    X_train, X_test, X_validation,
    y_train, y_test, y_validation,
) = test_train_validation_split(validation_df, test_train_df)

In [3]:
## Pipeline ##
from sklearn import set_config
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# Ask scikit-learn transformers to return pandas DataFrames.
set_config(transform_output="pandas")

# Preprocessing steps provided by the project's SVM helper module.
date_transformer, feature_union = SVM_preprocess_steps()

# Classifier: RBF-kernel SVC with fixed hyperparameters and a fixed seed.
SVM_clf = SVC(kernel="rbf", C=10, gamma=0.1, random_state=42)

# Chain preprocessing and classifier. The step names matter: they are used
# later to pull the steps back out of the pipeline and to address the
# classifier's parameters via the `classifier__` prefix.
SVM_pipeline = Pipeline(steps=[
    ("day_of_year", date_transformer),
    ("feature_union", feature_union),
    ("classifier", SVM_clf),
])

In [4]:
## Train and save model ##
from joblib import dump

# Fit the preprocessing steps and the classifier together on the training split.
SVM_pipeline.fit(X_train, y_train)

# Persist the complete fitted pipeline. dump() is left as the last expression
# so the cell displays the list of file names it returns.
full_model_file = os.path.join(MODEL_PATH, 'SVM_full_model.joblib')
dump(SVM_pipeline, full_model_file)

['../models\\SVM\\SVM_full_model.joblib']

In [5]:
## Save model and preprocessing pipeline separately for ensemble models ##

# Rebuild a preprocessing-only pipeline that reuses the step objects held by
# SVM_pipeline, under the same step names.
preprocessing_pipeline = Pipeline([
    (step_name, SVM_pipeline.named_steps[step_name])
    for step_name in ('day_of_year', 'feature_union')
])

preprocessing_file = os.path.join(MODEL_PATH, 'SVM_preprocessing_pipeline.joblib')
classifier_file = os.path.join(MODEL_PATH, 'SVM_model.joblib')

# Two artifacts: the preprocessing pipeline and the bare classifier step.
dump(preprocessing_pipeline, preprocessing_file)
dump(SVM_pipeline.named_steps['classifier'], classifier_file)

['../models\\SVM\\SVM_model.joblib']

**Evaluation**

In [2]:
from scripts.Visualization_functions import generate_visualizations, print_metrics

In [3]:
# Get the validation dataset
# main_df stays in scope here: it is passed to generate_visualizations later.
main_df = load_full_df(PROJECT_ROOT)
validation_df, test_train_df = get_train_validation_df(main_df)

# Only the test and validation parts are kept; the training features and
# labels returned by the split are discarded into `_`.
(
    _, X_test, X_validation,
    _, y_test, y_validation,
) = test_train_validation_split(validation_df, test_train_df)

In [6]:
# Load the pipeline
from joblib import load

# NOTE: joblib.load unpickles the file, so only load artifacts that come from
# a trusted source.
full_model_file = os.path.join(MODEL_PATH, 'SVM_full_model.joblib')
SVM_pipeline = load(full_model_file)

In [5]:
# Generate CM and incorrect prediction heatmaps
# Predict on the validation features with the full pipeline.
y_validation_pred = SVM_pipeline.predict(X_validation)

# Destination passed to the visualization helper.
visuals_dir = os.path.join(MODEL_PATH, 'SVM_Visuals')
generate_visualizations(
    X_validation,
    y_validation_pred,
    y_validation,
    main_df,
    visuals_dir,
)

In [6]:
# Print summary metrics for the validation predictions; the recorded output
# lists accuracy, F1, precision and recall.
print_metrics(y_validation,y_validation_pred)

Validation set accuracy: 0.9876643141081438
f1 score:  0.1484883275928052
Precision: 0.10424502955400322
Recall: 0.2579787234042553


In [8]:
# Save validation set predictions
import numpy as np

# Store the predicted labels as a .npy file next to the model artifacts.
predictions_file = os.path.join(MODEL_PATH, 'SVM_y_val_pred.npy')
np.save(predictions_file, y_validation_pred)

**Hyperparameter Testing and Model Tuning**

In [15]:
## Grid Search for Current Model ##
from sklearn.metrics import accuracy_score, f1_score, make_scorer
from sklearn.model_selection import GridSearchCV

# Macro-averaged F1 scorer: each class contributes equally, whatever its size.
# NOTE(review): print_metrics may average F1 differently, so the score printed
# here is not necessarily comparable with the validation F1 — confirm.
f1 = make_scorer(f1_score, average='macro')

# Parameters. The `classifier__` prefix routes each value to the pipeline's
# 'classifier' step.
p_grid = {
    "classifier__C": [8, 10, 15],
    "classifier__gamma": ['scale', 0.01, 0.1],
}

# Search over the whole pipeline (not a bare SVC), so the preprocessing steps
# are re-fitted on the training part of every CV fold.
grid_search = GridSearchCV(
    SVM_pipeline,
    p_grid,
    cv=4,
    scoring=f1,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Best parameters and best score
print("Best parameters:", grid_search.best_params_)
print(f"F1 Score: {100*grid_search.best_score_:.2f}%")

# Despite the name, this is the full pipeline refitted with the best
# parameters, because the estimator handed to GridSearchCV is the pipeline.
best_classifier = grid_search.best_estimator_

# Results noted from an earlier run. The keys lack the `classifier__` prefix,
# so they do not come from the pipeline search above (presumably an earlier
# search over a bare SVC — verify):
#   Fitting 4 folds for each of 9 candidates, totalling 36 fits
#   Best parameters: {'C': 10, 'gamma': 0.1}
#   F1 Score: 74.14%

Fitting 4 folds for each of 9 candidates, totalling 36 fits
Best parameters: {'classifier__C': 15, 'classifier__gamma': 0.1}
F1 Score: 74.25%
