In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

Basic SVM Framework: Uses radial basis function (could also try sigmoid but that requires a lot more tuning)

- Redid train and test splits by cluster proportion
- Tuned for c and gamma values

In [4]:
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd

# Load data
data = pd.read_csv('modeling_data.csv')
X = data[['Total.SqFt', 'Category_encoded', 'Year.Built', 'kWh.sqft', 'lat', 'long', 'cluster']].copy()
X['Building.Age'] = 2024 - X['Year.Built']
X.drop(columns=['Year.Built'], inplace=True)
X = X.to_numpy()
y = data['BestTreeSpecies_encoded'].to_numpy()

# Data splitting
X_train, X_, y_train, y_ = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_, y_, test_size=0.5, stratify=y_, random_state=42)

# Use stratified K-fold due to imbalance
K = 3
skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=42)

# SVM parameter grid
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [0.001, 0.01, 0.1, 1]
}

# Initialize SVM with RBF kernel
svm_rbf = SVC(kernel='rbf', decision_function_shape='ovr', random_state=42)

# Grid search
grid_search_svm = GridSearchCV(svm_rbf, param_grid, cv=skf, scoring='accuracy')
grid_search_svm.fit(X_train, y_train)
print(f'Best params for SVM (RBF): {grid_search_svm.best_params_}')

# Best model
best_svm_rbf = grid_search_svm.best_estimator_

# Evaluate on training, validation, and test sets
accuracy_train = accuracy_score(y_train, best_svm_rbf.predict(X_train))
accuracy_val = accuracy_score(y_val, best_svm_rbf.predict(X_val))
accuracy_test = accuracy_score(y_test, best_svm_rbf.predict(X_test))

print(f"Training Accuracy for SVM (RBF): {accuracy_train * 100:.2f}%")
print(f"Validation Accuracy for SVM (RBF): {accuracy_val * 100:.2f}%")
print(f"Test Accuracy for SVM (RBF): {accuracy_test * 100:.2f}%")


Best params for SVM (RBF): {'C': 1, 'gamma': 0.001}
Training Accuracy for SVM (RBF): 90.82%
Validation Accuracy for SVM (RBF): 75.12%
Test Accuracy for SVM (RBF): 74.63%
