In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

get data (replace this chunk later with a better preprocessing data format)

In [30]:
from collections import Counter
from sklearn.preprocessing import LabelEncoder

# read in data
data = pd.read_csv('business_trees_1km_with_weather_clusters.csv')

# process to get most common tree species associated with business
data['NearbyTreeSpecies'] = data['NearbyTreeSpecies'].astype(str)

# Function to find the most common species
def get_most_common_species(nearby_species):
    if ', ' not in nearby_species:
        return nearby_species  # If there's only one species, return it
    species_list = nearby_species.split(', ')  # Split the string into a list of species
    species_counts = Counter(species_list)  # Count occurrences of each species
    most_common_species, _ = species_counts.most_common(1)[0]  # Get the most common species
    return most_common_species

# Apply the function to create a new column for the predicted species
data['BestTreeSpecies'] = data['NearbyTreeSpecies'].apply(get_most_common_species)

# Encode categorical features, including the target if it's categorical
encoder = LabelEncoder()
data['Category_encoded'] = encoder.fit_transform(data['Category'])
data['BestTreeSpecies_encoded'] = encoder.fit_transform(data['BestTreeSpecies'])

data = data.apply(pd.to_numeric, errors='coerce')
data = data.dropna(subset=['Total.SqFt', 'Category_encoded', 'Year.Built', 'kWh.sqft', 'lat', 'long', 'BestTreeSpecies_encoded'])

data.head()

Unnamed: 0.1,Unnamed: 0,Reported.Year,Property.ID..,Commercial.Property..Property.Street.Address,Building.Name,Total.SqFt,Category,Year.Built,kWh.sqft,Percentile.Rank,...,GeoLocation,Submitted.Name.of.Organization,GeoLocation_clean,lat,long,NearbyTreeSpecies,cluster,BestTreeSpecies,Category_encoded,BestTreeSpecies_encoded
0,1,2017.0,187746.0,,,7419.0,,1940.0,0.0,,...,,,,30.251707,-97.697313,,2,,27,20
1,2,2017.0,230740.0,,,49024.0,,1983.0,15.58,39.58,...,,,,30.328069,-97.672401,,2,,10,18
2,3,2017.0,729528.0,,,101953.0,,2009.0,32.063,76.26,...,,,,30.236494,-97.721013,,1,,24,18
3,5,2017.0,526381.0,,,146142.0,,1999.0,8.049,4.79,...,,,,30.373732,-97.719236,,2,,16,18
4,6,2017.0,200001.0,,,15350.0,,1998.0,100.129,,...,,,,30.279196,-97.740475,,2,,18,18


Basic SVM Framework: Uses radial basis function (could also try sigmoid but that requires a lot more tuning)

- Redo train and test splits by cluster proportion
- Tune for c and gamma values

In [34]:
# Define features and target
features = data[['Total.SqFt', 'Category_encoded', 'Year.Built', 'kWh.sqft', 'lat', 'long']]
target = data['BestTreeSpecies_encoded']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=42)

# Initialize the SVM classifier with One-vs-All strategy
svm_ova = SVC(decision_function_shape='ovr', kernel='rbf')
svm_ova.fit(X_train, y_train)

# Predict and evaluate the One-vs-All model
y_pred_ova = svm_ova.predict(X_test)
print("One-vs-All Accuracy:", accuracy_score(y_test, y_pred_ova))

One-vs-All Accuracy: 0.6799276672694394


In [35]:
# Initialize lists to store accuracy scores
accuracies = []
num_runs = 50  # Number of times to run the model

# Run multiple times to evaluate convergence
for i in range(num_runs):
     # Split data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=i)

    svm_ova = SVC(decision_function_shape='ovr', kernel='rbf')
    svm_ova.fit(X_train, y_train)

    # Predict and evaluate the One-vs-All model
    y_pred_ova = svm_ova.predict(X_test)

    accuracies.append(accuracy_score(y_test, y_pred_ova))

# Calculate average accuracies
avg_accuracy = np.mean(accuracies)

print(f"Average SVM Accuracy over {num_runs} runs: {avg_accuracy * 100:.2f}%")


Average SVM Accuracy over 50 runs: 66.08%
