In [51]:
# libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

Decision Tree Era

In [57]:
from collections import Counter

# read data (if necessary) and make sure list is string
data = pd.read_csv('business_trees_1km_with_weather_clusters.csv')
data['NearbyTreeSpecies'] = data['NearbyTreeSpecies'].astype(str)

# Function to find the most common species
def get_most_common_species(nearby_species):
    if ', ' not in nearby_species:
        return nearby_species  # If there's only one species, return it
    species_list = nearby_species.split(', ')  # Split the string into a list of species
    species_counts = Counter(species_list)  # Count occurrences of each species
    most_common_species, _ = species_counts.most_common(1)[0]  # Get the most common species
    return most_common_species

# Apply the function to create a new column for the predicted species
data['BestTreeSpecies'] = data['NearbyTreeSpecies'].apply(get_most_common_species)

data.head()

Unnamed: 0.1,Unnamed: 0,Reported.Year,Property.ID..,Commercial.Property..Property.Street.Address,Building.Name,Total.SqFt,Category,Year.Built,kWh.sqft,Percentile.Rank,...,Total.Green.House.Gas.Emissions..MtCO2e.,Portfolio.Manager.Organization,GeoLocation,Submitted.Name.of.Organization,GeoLocation_clean,lat,long,NearbyTreeSpecies,cluster,BestTreeSpecies
0,1,2017.0,187746,"5126 E 5 ST, AUSTIN TX,","5126 E 5 ST, AUSTIN TX,",7419.0,,1940,0.0,,...,,,"5126 E 5 ST\r\nAUSTIN, TX\r\n(30.251707, -97.6...","Pantur, Inc.","30.251707, -97.697313",30.251707,-97.697313,"Hackberry, Hackberry, Oak, Oak, Oak, Mesquite,...",2,Pecan
1,2,2017.0,230740,"2500 RIDGEPOINT DR, AUSTIN TX, 78728","2500 RIDGEPOINT DR, AUSTIN TX, 78728",49024.0,INDUSTRIAL (W/sqft > 3),1983,15.58,39.58,...,,,"2500 RIDGEPOINT DR\r\nAUSTIN, TX 78728\r\n(30....",Equitable Commercial Realty,"30.328069, -97.672401",30.328069,-97.672401,Oak,2,Oak
2,3,2017.0,729528,"1700 S PLEASANT VALLEY RD, AUSTIN TX, 78741","1700 S PLEASANT VALLEY RD, AUSTIN TX, 78741",101953.0,STORE\GROCERY (W/sqft > 5),2009,32.063,76.26,...,,,"1700 S PLEASANT VALLEY RD\r\nAUSTIN, TX 78741\...","Mimco, Inc.","30.236494, -97.721013",30.236494,-97.721013,"Oak, Mulberry, Chinaberry, Chinaberry, Chinabe...",1,Oak
3,5,2017.0,526381,2011 W RUNDBERG LN Austin TX 78758,2011 W RUNDBERG LN Austin TX 78758,146142.0,OFFICE LG > 35000,1999,8.049,4.79,...,,,"2011 W RUNDBERG LN\r\nAustin, TX 78758\r\n(30....",AISD,"30.373732, -97.719236",30.373732,-97.719236,"Oak, Oak, Pecan, Pecan, Sycamore, Pear, Pear, ...",2,Oak
4,6,2017.0,200001,"202 W 17 ST, AUSTIN TX, 78701","202 W 17 ST, AUSTIN TX, 78701",15350.0,OFFICE MED 10-35,1998,100.129,,...,,,"202 W 17 ST\r\nAUSTIN, TX 78701\r\n(30.279196,...",Charter Communications,"30.279196, -97.740475",30.279196,-97.740475,"Loquat, Pecan, Crape Myrtle, Crape Myrtle, Oak...",2,Oak


encode for categories

In [58]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder


# Encode categorical features, including the target if it's categorical
encoder = LabelEncoder()
data['Category_encoded'] = encoder.fit_transform(data['Category'])
data['BestTreeSpecies_encoded'] = encoder.fit_transform(data['BestTreeSpecies'])
data.head()

Unnamed: 0.1,Unnamed: 0,Reported.Year,Property.ID..,Commercial.Property..Property.Street.Address,Building.Name,Total.SqFt,Category,Year.Built,kWh.sqft,Percentile.Rank,...,GeoLocation,Submitted.Name.of.Organization,GeoLocation_clean,lat,long,NearbyTreeSpecies,cluster,BestTreeSpecies,Category_encoded,BestTreeSpecies_encoded
0,1,2017.0,187746,"5126 E 5 ST, AUSTIN TX,","5126 E 5 ST, AUSTIN TX,",7419.0,,1940,0.0,,...,"5126 E 5 ST\r\nAUSTIN, TX\r\n(30.251707, -97.6...","Pantur, Inc.","30.251707, -97.697313",30.251707,-97.697313,"Hackberry, Hackberry, Oak, Oak, Oak, Mesquite,...",2,Pecan,27,20
1,2,2017.0,230740,"2500 RIDGEPOINT DR, AUSTIN TX, 78728","2500 RIDGEPOINT DR, AUSTIN TX, 78728",49024.0,INDUSTRIAL (W/sqft > 3),1983,15.58,39.58,...,"2500 RIDGEPOINT DR\r\nAUSTIN, TX 78728\r\n(30....",Equitable Commercial Realty,"30.328069, -97.672401",30.328069,-97.672401,Oak,2,Oak,10,18
2,3,2017.0,729528,"1700 S PLEASANT VALLEY RD, AUSTIN TX, 78741","1700 S PLEASANT VALLEY RD, AUSTIN TX, 78741",101953.0,STORE\GROCERY (W/sqft > 5),2009,32.063,76.26,...,"1700 S PLEASANT VALLEY RD\r\nAUSTIN, TX 78741\...","Mimco, Inc.","30.236494, -97.721013",30.236494,-97.721013,"Oak, Mulberry, Chinaberry, Chinaberry, Chinabe...",1,Oak,24,18
3,5,2017.0,526381,2011 W RUNDBERG LN Austin TX 78758,2011 W RUNDBERG LN Austin TX 78758,146142.0,OFFICE LG > 35000,1999,8.049,4.79,...,"2011 W RUNDBERG LN\r\nAustin, TX 78758\r\n(30....",AISD,"30.373732, -97.719236",30.373732,-97.719236,"Oak, Oak, Pecan, Pecan, Sycamore, Pear, Pear, ...",2,Oak,16,18
4,6,2017.0,200001,"202 W 17 ST, AUSTIN TX, 78701","202 W 17 ST, AUSTIN TX, 78701",15350.0,OFFICE MED 10-35,1998,100.129,,...,"202 W 17 ST\r\nAUSTIN, TX 78701\r\n(30.279196,...",Charter Communications,"30.279196, -97.740475",30.279196,-97.740475,"Loquat, Pecan, Crape Myrtle, Crape Myrtle, Oak...",2,Oak,18,18


fix na's (right now it's just removing but maybe later imputing would be better)

In [59]:
data.head()

# get number and proportion of nas in the features and targets
print(data[['Total.SqFt', 'Category_encoded', 'Year.Built', 'kWh.sqft', 'lat', 'long', 'BestTreeSpecies_encoded', 'cluster']].isna().sum())
print(data[['Total.SqFt', 'Category_encoded', 'Year.Built', 'kWh.sqft', 'lat', 'long', 'BestTreeSpecies_encoded', 'cluster']].isna().mean())

# maybe a future improvement is imputing kWH.sqft but for now i'm just dropping the rows with nas
data = data.dropna(subset=['Total.SqFt', 'Category_encoded', 'Year.Built', 'kWh.sqft', 'lat', 'long', 'BestTreeSpecies_encoded', 'cluster'])
print(data[['Total.SqFt', 'Category_encoded', 'Year.Built', 'kWh.sqft', 'lat', 'long', 'BestTreeSpecies_encoded', 'cluster']].isna().sum())
print(data[['Total.SqFt', 'Category_encoded', 'Year.Built', 'kWh.sqft', 'lat', 'long', 'BestTreeSpecies_encoded', 'cluster']].isna().mean())

Total.SqFt                   6
Category_encoded             0
Year.Built                  64
kWh.sqft                   827
lat                          0
long                         0
BestTreeSpecies_encoded      0
cluster                      0
dtype: int64
Total.SqFt                 0.002137
Category_encoded           0.000000
Year.Built                 0.022792
kWh.sqft                   0.294516
lat                        0.000000
long                       0.000000
BestTreeSpecies_encoded    0.000000
cluster                    0.000000
dtype: float64
Total.SqFt                 0
Category_encoded           0
Year.Built                 0
kWh.sqft                   0
lat                        0
long                       0
BestTreeSpecies_encoded    0
cluster                    0
dtype: int64
Total.SqFt                 0.0
Category_encoded           0.0
Year.Built                 0.0
kWh.sqft                   0.0
lat                        0.0
long                       0.0
BestT

In [60]:
# Define features and target
features = data[['Total.SqFt', 'Category_encoded', 'Year.Built', 'kWh.sqft', 'lat', 'long', 'cluster']]
target = data['BestTreeSpecies_encoded']

# convert to numeric for modeling purposes (will have to change the year back later)
features = features.apply(pd.to_numeric, errors='coerce')

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=42)

# Initialize and train the decision tree (gini)
clf_gini = DecisionTreeClassifier(criterion='gini')  # Use 'entropy' if you'd like to try that
clf_gini.fit(X_train, y_train)

# Initialize and train the decision tree (entropy)
clf_ent = DecisionTreeClassifier(criterion='entropy')  # Use 'entropy' if you'd like to try that
clf_ent.fit(X_train, y_train)

# Evaluate accuracy
accuracy_gini = clf_gini.score(X_test, y_test)
print(f"Decision Tree (gini) Accuracy: {accuracy_gini * 100:.2f}%")

accuracy_ent = clf_ent.score(X_test, y_test)
print(f"Decision Tree (entropy) Accuracy: {accuracy_ent * 100:.2f}%")


Decision Tree (gini) Accuracy: 80.76%
Decision Tree (entropy) Accuracy: 82.30%


In [61]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Assuming 'data' is already defined and cleaned

# Define features and target
features = data[['Total.SqFt', 'Category_encoded', 'Year.Built', 'kWh.sqft', 'lat', 'long']]
target = data['BestTreeSpecies_encoded']

# Convert to numeric for modeling purposes
features = features.apply(pd.to_numeric, errors='coerce')

# Initialize lists to store accuracy scores
gini_accuracies = []
entropy_accuracies = []
num_runs = 50  # Number of times to run the model

# Run multiple times to evaluate convergence
for i in range(num_runs):
    # Split data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=i)

    # Initialize and train the decision tree (Gini)
    clf_gini = DecisionTreeClassifier(criterion='gini')
    clf_gini.fit(X_train, y_train)

    # Evaluate accuracy for Gini
    accuracy_gini = clf_gini.score(X_test, y_test)
    gini_accuracies.append(accuracy_gini)

    # Initialize and train the decision tree (Entropy)
    clf_ent = DecisionTreeClassifier(criterion='entropy')
    clf_ent.fit(X_train, y_train)

    # Evaluate accuracy for Entropy
    accuracy_ent = clf_ent.score(X_test, y_test)
    entropy_accuracies.append(accuracy_ent)

# Calculate average accuracies
avg_gini_accuracy = np.mean(gini_accuracies)
avg_entropy_accuracy = np.mean(entropy_accuracies)

# Display results
print(f"Average Decision Tree (gini) Accuracy over {num_runs} runs: {avg_gini_accuracy * 100:.2f}%")
print(f"Average Decision Tree (entropy) Accuracy over {num_runs} runs: {avg_entropy_accuracy * 100:.2f}%")

Average Decision Tree (gini) Accuracy over 50 runs: 77.82%
Average Decision Tree (entropy) Accuracy over 50 runs: 80.47%


# NN

In [62]:
def find_non_integer_values(column):
    # Use a try-except block to catch conversion errors
    non_integers = []
    for value in column:
        try:
            # Attempt to convert to integer
            int(value)
        except (ValueError, TypeError):
            # If conversion fails, add to the list
            non_integers.append(value)
    return non_integers

# Finding non-integer values in 'Year.Built'
non_integer_values = find_non_integer_values(data['Year.Built'])
print(set(non_integer_values))

{'.', '1995-2001'}


In [63]:
data['Year.Built'] = data['Year.Built'].replace('.', np.nan)
data['Year.Built'] = data['Year.Built'].replace('1995-2001', np.nan)
data = data.dropna(subset=['Year.Built'])
data['Year.Built'] = data['Year.Built'].astype(int)

features = data[['Total.SqFt', 'Category_encoded', 'Year.Built', 'kWh.sqft', 'lat', 'long', 'cluster']]
target = data['BestTreeSpecies_encoded']

In [64]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score

# Creating the pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Step 1: Standardize features
    ('mlp', MLPClassifier(hidden_layer_sizes=(128, 64, 32, 16, 8), activation='relu', solver='adam', max_iter=500, random_state=42))  # Step 2: MLP Classifier
])

# Performing cross-validation
cv_scores = cross_val_score(pipeline, features, target, cv=5, scoring='accuracy')  # Use 5-fold cross-validation

# Output the cross-validation scores
print(f'Cross-Validation Accuracy Scores: {cv_scores}')
print(f'Mean Cross-Validation Accuracy: {cv_scores.mean()}')
print(f'Standard Deviation of Cross-Validation Accuracy: {cv_scores.std()}')



Cross-Validation Accuracy Scores: [0.66937669 0.65582656 0.6875     0.7173913  0.68478261]
Mean Cross-Validation Accuracy: 0.6829754330151998
Standard Deviation of Cross-Validation Accuracy: 0.02064062450235686
