In [3]:
# Install the packages listed in requirements.txt into the kernel's environment;
# --quiet suppresses pip's normal install output.
%pip install -r requirements.txt --quiet


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [1]:
# libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Load the two raw inputs from CSV: the tree inventory export and the
# business table (which supplies the 'lat' / 'long' columns used further down).
tree_df = pd.read_csv(filepath_or_buffer='Tree_Inventory_20240929.csv')
business_data = pd.read_csv(filepath_or_buffer='business_coordinates.csv')

Clean up the tree data for processing: derive latitude and longitude from the WKT geometry (adding a correctly spelled LONGITUDE column) and drop the unnecessary or redundant columns:

In [4]:
# Process tree data csv
# Work on a copy: a plain `tree_data = tree_df` only aliases the raw frame, so
# the column assignments below would silently modify tree_df as well.
tree_data = tree_df.copy()

# get proper latitude and longitude data
from shapely.wkt import loads

# Parse every WKT string once, then read the coordinates off the parsed points
# (x is stored as the longitude, y as the latitude).
tree_points = tree_data['GEOMETRY'].apply(loads)
tree_data['LATITUDE'] = tree_points.apply(lambda point: point.y)
tree_data['LONGITUDE'] = tree_points.apply(lambda point: point.x)

# drop unnecessary columns (the misspelled 'LONGTITUDE' is superseded by LONGITUDE)
tree_data = tree_data.drop(columns=['LONGTITUDE', 'New Georeferenced Column'])

# Cast to str so species can be joined into one comma-separated string later.
# Note: a missing species becomes the literal string 'nan' after this cast.
tree_data['SPECIES'] = tree_data['SPECIES'].astype(str)

tree_data.head()

Unnamed: 0,GEOMETRY,SPECIES,DIAMETER,LATITUDE,LONGITUDE
0,POINT (-97.73398904092146 30.25239671647407),Live Oak,20.0,30.252397,-97.733989
1,POINT (-97.77949770772025 30.20704479399738),Ashe Juniper,8.0,30.207045,-97.779498
2,POINT (-97.73408544155564 30.252389392695715),Live Oak,31.0,30.252389,-97.734085
3,POINT (-97.73392463841283 30.252384583450418),Live Oak,21.0,30.252385,-97.733925
4,POINT (-97.73385944263053 30.252313660255723),Live Oak,20.0,30.252314,-97.733859


Get the trees within a specified radius (in kilometers) of each business. This brute-force version compares every business with every tree, so it is slow.

In [17]:
# Set radius for businesses to find local trees
from geopy.distance import geodesic
total_businesses = len(business_data)

radius = 1 # 1 km or other distance

# Pull the tree coordinates and species out of the frame once. Calling
# tree_data.iterrows() inside the business loop would rebuild a pandas Series
# for every tree on every pass, which is pure overhead.
tree_records = list(zip(tree_data['LATITUDE'], tree_data['LONGITUDE'], tree_data['SPECIES']))

# for each business, use the coordinates to get nearby trees within the specified radius
# `position` counts rows from 1 so the progress message does not rely on the
# index labels being the integers 0..n-1.
for position, (idx, business_row) in enumerate(business_data.iterrows(), start=1):

    # get location of business
    business_location = (business_row['lat'], business_row['long'])

    # species of every tree whose geodesic distance to the business is within the radius
    nearby_trees = [
        species
        for tree_lat, tree_lon, species in tree_records
        if geodesic(business_location, (tree_lat, tree_lon)).km <= radius
    ]

    # Store species of nearby trees as one comma-separated string
    business_data.at[idx, 'NearbyTreeSpecies'] = ', '.join(nearby_trees)

    # Print progress
    print(f"Processed business {position} of {total_businesses}")


Processed business 1 of 2808


KeyboardInterrupt: 

A faster way to find nearby trees: convert the coordinates to radians and query a BallTree built with the haversine metric.

In [5]:
from sklearn.neighbors import BallTree
import numpy as np

# Search radius: `km` kilometres divided by the Earth radius (6371 km), giving
# the radius in the radian units used with the haversine metric below.
km = 1
radius = km / 6371.0

# Coordinates as (lat, lon) pairs converted from degrees to radians.
tree_coords = np.radians(tree_data[['LATITUDE', 'LONGITUDE']].values)
business_coords = np.radians(business_data[['lat', 'long']].values)

# Spatial index over the tree positions.
tree_ball_tree = BallTree(tree_coords, metric='haversine')


def find_nearby_trees_for_business(business_location):
    """Return the species of every tree within `radius` of one business.

    Parameters
    ----------
    business_location : one row of `business_coords`, i.e. (lat, lon) in radians.

    Returns
    -------
    str
        The matching species joined with ', ' (an empty string when no tree
        falls inside the radius).
    """
    # query_radius takes a batch of points; wrap the single point and unwrap
    # the single result.
    matches = tree_ball_tree.query_radius([business_location], r=radius)
    species_in_range = tree_data['SPECIES'].iloc[matches[0]]
    return ', '.join(species_in_range.tolist())


# One radius query per business, in row order.
business_data['NearbyTreeSpecies'] = list(map(find_nearby_trees_for_business, business_coords))

# Optional: Print progress
print("Completed processing for all businesses.")


Completed processing for all businesses.


In [7]:
from sklearn.neighbors import BallTree
import numpy as np

# Search radius: 1 km divided by the Earth radius (6371 km), i.e. the radius in
# the radian units used with the haversine metric.
radius = 1.0 / 6371.0  # Earth radius in km for haversine

# Prepare data for BallTree in radians
business_coords = np.radians(business_data[['lat', 'long']].values)
tree_coords = np.radians(tree_data[['LATITUDE', 'LONGITUDE']].values)

# Create BallTree for the tree locations
tree_ball_tree = BallTree(tree_coords, metric='haversine')

def find_nearby_trees_for_business(business_location):
    """Return the species of the trees near one business as a ', '-joined string.

    Parameters
    ----------
    business_location : one row of `business_coords`, i.e. (lat, lon) in radians.

    Returns
    -------
    str
        Species of all trees within `radius`, joined with ', '. When no tree
        lies within the radius, the species of the single nearest tree is
        returned instead.
    """
    # Query the tree BallTree for points within the radius
    indices = tree_ball_tree.query_radius([business_location], r=radius)[0]

    # If no nearby trees are found within the radius, fall back to the closest tree
    if len(indices) == 0:
        # query() returns (distances, indices); take the index of the one neighbour
        nearest_index = tree_ball_tree.query([business_location], k=1)[1][0][0]
        return str(tree_data.iloc[nearest_index]['SPECIES'])

    # Get species of nearby trees found within the radius
    nearby_trees = tree_data.iloc[indices]['SPECIES'].dropna().astype(str)
    # SPECIES was cast with astype(str) when the tree data was cleaned, so a
    # missing value arrives here as the literal string 'nan' and dropna() alone
    # does not catch it. Filter that placeholder together with blank strings.
    return ', '.join(
        species for species in nearby_trees
        if species.strip() and species.strip() != 'nan'
    )

# Apply the function to each business
business_data['NearbyTreeSpecies'] = [find_nearby_trees_for_business(loc) for loc in business_coords]
# Drop the redundant index column; errors='ignore' keeps the cell re-runnable
# once the column is already gone.
business_data = business_data.drop(columns=['Unnamed: 0'], errors='ignore')
print(f'Finished processing {len(business_data)} businesses.')
business_data.head()

Finished processing 2808 businesses.


Unnamed: 0,Reported.Year,Property.ID..,Commercial.Property..Property.Street.Address,Building.Name,Total.SqFt,Category,Year.Built,kWh.sqft,Percentile.Rank,Received.Date,Site.EUI..kBTU.sqft.,Portfolio.Manager.SqFt,Total.Green.House.Gas.Emissions..MtCO2e.,Portfolio.Manager.Organization,GeoLocation,Submitted.Name.of.Organization,GeoLocation_clean,lat,long,NearbyTreeSpecies
0,2017.0,187746,"5126 E 5 ST, AUSTIN TX,","5126 E 5 ST, AUSTIN TX,",7419.0,,1940,0.0,,,,,,,"5126 E 5 ST\nAUSTIN, TX\n(30.251707, -97.697313)","Pantur, Inc.","30.251707, -97.697313",30.251707,-97.697313,"Hackberry, Oak, Live (Southern), Hackberry, Oa..."
1,2017.0,230740,"2500 RIDGEPOINT DR, AUSTIN TX, 78728","2500 RIDGEPOINT DR, AUSTIN TX, 78728",49024.0,INDUSTRIAL (W/sqft > 3),1983,15.58,39.58,,,,,,"2500 RIDGEPOINT DR\nAUSTIN, TX 78728\n(30.3280...",Equitable Commercial Realty,"30.328069, -97.672401",30.328069,-97.672401,"Oak, Live (Southern)"
2,2017.0,729528,"1700 S PLEASANT VALLEY RD, AUSTIN TX, 78741","1700 S PLEASANT VALLEY RD, AUSTIN TX, 78741",101953.0,STORE\GROCERY (W/sqft > 5),2009,32.063,76.26,,,,,,"1700 S PLEASANT VALLEY RD\nAUSTIN, TX 78741\n(...","Mimco, Inc.","30.236494, -97.721013",30.236494,-97.721013,"Oak, Texas Live (Escarpment), Crapemyrtle, Cra..."
3,2017.0,526381,2011 W RUNDBERG LN Austin TX 78758,2011 W RUNDBERG LN Austin TX 78758,146142.0,OFFICE LG > 35000,1999,8.049,4.79,,,,,,"2011 W RUNDBERG LN\nAustin, TX 78758\n(30.3737...",AISD,"30.373732, -97.719236",30.373732,-97.719236,"Oak, Texas Red, Oak, Post, Pecan, Sycamore, Me..."
4,2017.0,200001,"202 W 17 ST, AUSTIN TX, 78701","202 W 17 ST, AUSTIN TX, 78701",15350.0,OFFICE MED 10-35,1998,100.129,,,,,,,"202 W 17 ST\nAUSTIN, TX 78701\n(30.279196, -97...",Charter Communications,"30.279196, -97.740475",30.279196,-97.740475,"Chinese Tallow, Escarpment Live Oak, Pecan, Va..."


Save as csv if necessary

In [27]:
# save as csv
#business_data.to_csv('business_trees_1km.csv', index=False)

Decision Tree Era

In [21]:
from collections import Counter

# Load the saved business/tree/cluster table and tidy it in a single chain:
#   * drop the redundant 'Unnamed: 0' index column
#   * cast NearbyTreeSpecies to str so every row can be split as text
#     (a missing value becomes the literal string 'nan')
data = (
    pd.read_csv('business_trees_1km_with_weather_clusters.csv')
    .drop(columns=['Unnamed: 0'])
    .assign(NearbyTreeSpecies=lambda frame: frame['NearbyTreeSpecies'].astype(str))
)

def get_most_common_species(nearby_species):
    """Return the most frequent entry of a ', '-separated species string.

    Parameters
    ----------
    nearby_species : str
        Species names joined with ', '.

    Returns
    -------
    str
        The entry that occurs most often; ties go to the entry that appears
        first. A string without ', ' is returned unchanged.

    Notes
    -----
    The string is split on every ', ', so a species name that itself contains
    ', ' is counted as several separate entries.
    """
    entries = nearby_species.split(', ')
    # A single entry means there was no separator: nothing to count.
    if len(entries) == 1:
        return nearby_species
    tally = Counter(entries)
    # max() over the keys keeps the first-seen entry among equal counts.
    return max(tally, key=tally.get)

# Derive the BestTreeSpecies column: the most frequent entry of each row's
# NearbyTreeSpecies string.
data['BestTreeSpecies'] = data['NearbyTreeSpecies'].map(get_most_common_species)

data.head()

Unnamed: 0,Reported.Year,Property.ID..,Commercial.Property..Property.Street.Address,Building.Name,Total.SqFt,Category,Year.Built,kWh.sqft,Percentile.Rank,Received.Date,...,Total.Green.House.Gas.Emissions..MtCO2e.,Portfolio.Manager.Organization,GeoLocation,Submitted.Name.of.Organization,GeoLocation_clean,lat,long,NearbyTreeSpecies,cluster,BestTreeSpecies
0,2017.0,187746,"5126 E 5 ST, AUSTIN TX,","5126 E 5 ST, AUSTIN TX,",7419.0,,1940,0.0,,,...,,,"5126 E 5 ST\nAUSTIN, TX\n(30.251707, -97.697313)","Pantur, Inc.","30.251707, -97.697313",30.251707,-97.697313,"Hackberry, Oak, Live (Southern), Hackberry, Oa...",2,Pecan
1,2017.0,230740,"2500 RIDGEPOINT DR, AUSTIN TX, 78728","2500 RIDGEPOINT DR, AUSTIN TX, 78728",49024.0,INDUSTRIAL (W/sqft > 3),1983,15.58,39.58,,...,,,"2500 RIDGEPOINT DR\nAUSTIN, TX 78728\n(30.3280...",Equitable Commercial Realty,"30.328069, -97.672401",30.328069,-97.672401,,2,
2,2017.0,729528,"1700 S PLEASANT VALLEY RD, AUSTIN TX, 78741","1700 S PLEASANT VALLEY RD, AUSTIN TX, 78741",101953.0,STORE\GROCERY (W/sqft > 5),2009,32.063,76.26,,...,,,"1700 S PLEASANT VALLEY RD\nAUSTIN, TX 78741\n(...","Mimco, Inc.","30.236494, -97.721013",30.236494,-97.721013,"Oak, Texas Live (Escarpment), Crapemyrtle, Cra...",1,Oak
3,2017.0,526381,2011 W RUNDBERG LN Austin TX 78758,2011 W RUNDBERG LN Austin TX 78758,146142.0,OFFICE LG > 35000,1999,8.049,4.79,,...,,,"2011 W RUNDBERG LN\nAustin, TX 78758\n(30.3737...",AISD,"30.373732, -97.719236",30.373732,-97.719236,"Oak, Texas Red, Oak, Post, Pecan, Sycamore, Me...",2,Southern Live Oak
4,2017.0,200001,"202 W 17 ST, AUSTIN TX, 78701","202 W 17 ST, AUSTIN TX, 78701",15350.0,OFFICE MED 10-35,1998,100.129,,,...,,,"202 W 17 ST\nAUSTIN, TX 78701\n(30.279196, -97...",Charter Communications,"30.279196, -97.740475",30.279196,-97.740475,"Chinese Tallow, Escarpment Live Oak, Pecan, Va...",2,Pecan


In [37]:
# List the distinct values of the BestTreeSpecies column.
data['BestTreeSpecies'].unique()

array(['Pecan', 'nan', 'Oak', 'Southern Live Oak', 'Cedar Elm',
       'Hackberry', 'Stump', 'Ash', 'Crapemyrtle', 'Bur Oak', 'Elm',
       'Cedar', 'Live Oak', 'Huisache', 'Juniper', 'Sugarberry',
       'tbd shade', 'Arizona Ash', 'Unknown', 'Yaupon', 'Buckeye',
       'Crape Myrtle (including hybrids)', 'Red Oak', 'Cypress',
       'Quercus buckley', 'Carolina Buckthorn', 'Plum', 'Post Oak',
       'Bradford Pear', 'Texas', 'Ashe Juniper', 'Chinese Pistache',
       'tbd - shade', 'Monterey Oak', 'Sycamore spp.', 'Texas Ash',
       'Privet', 'Bean', 'Sycamore', 'Ashei juniperus', 'Bur oak',
       'Goldenrain Tree', 'Mexican White Oak', 'Crape Myrtle',
       'Mulberry spp.', 'Mulberry', 'American', 'Mexican',
       'Chinquapin Oak'], dtype=object)

Encode the categorical columns (building category and best tree species) as integer labels

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder


# Feature engineering
# Use one LabelEncoder per column. Refitting a single encoder on the second
# column overwrites its classes_, so the integer codes of the first column
# could no longer be mapped back to their original labels.
category_encoder = LabelEncoder()
species_encoder = LabelEncoder()

data['Category_encoded'] = category_encoder.fit_transform(data['Category'])
data['BestTreeSpecies_encoded'] = species_encoder.fit_transform(data['BestTreeSpecies'])

# `encoder` used to end up fitted on BestTreeSpecies; keep the name bound to
# the species encoder for any code that still refers to it.
encoder = species_encoder
data.head()

Unnamed: 0,Reported.Year,Property.ID..,Commercial.Property..Property.Street.Address,Building.Name,Total.SqFt,Category,Year.Built,kWh.sqft,Percentile.Rank,Received.Date,...,GeoLocation,Submitted.Name.of.Organization,GeoLocation_clean,lat,long,NearbyTreeSpecies,cluster,BestTreeSpecies,Category_encoded,BestTreeSpecies_encoded
0,2017.0,187746,"5126 E 5 ST, AUSTIN TX,","5126 E 5 ST, AUSTIN TX,",7419.0,,1940,0.0,,,...,"5126 E 5 ST\nAUSTIN, TX\n(30.251707, -97.697313)","Pantur, Inc.","30.251707, -97.697313",30.251707,-97.697313,"Hackberry, Oak, Live (Southern), Hackberry, Oa...",2,Pecan,27,31
1,2017.0,230740,"2500 RIDGEPOINT DR, AUSTIN TX, 78728","2500 RIDGEPOINT DR, AUSTIN TX, 78728",49024.0,INDUSTRIAL (W/sqft > 3),1983,15.58,39.58,,...,"2500 RIDGEPOINT DR\nAUSTIN, TX 78728\n(30.3280...",Equitable Commercial Realty,"30.328069, -97.672401",30.328069,-97.672401,,2,,10,46
2,2017.0,729528,"1700 S PLEASANT VALLEY RD, AUSTIN TX, 78741","1700 S PLEASANT VALLEY RD, AUSTIN TX, 78741",101953.0,STORE\GROCERY (W/sqft > 5),2009,32.063,76.26,,...,"1700 S PLEASANT VALLEY RD\nAUSTIN, TX 78741\n(...","Mimco, Inc.","30.236494, -97.721013",30.236494,-97.721013,"Oak, Texas Live (Escarpment), Crapemyrtle, Cra...",1,Oak,24,30
3,2017.0,526381,2011 W RUNDBERG LN Austin TX 78758,2011 W RUNDBERG LN Austin TX 78758,146142.0,OFFICE LG > 35000,1999,8.049,4.79,,...,"2011 W RUNDBERG LN\nAustin, TX 78758\n(30.3737...",AISD,"30.373732, -97.719236",30.373732,-97.719236,"Oak, Texas Red, Oak, Post, Pecan, Sycamore, Me...",2,Southern Live Oak,16,37
4,2017.0,200001,"202 W 17 ST, AUSTIN TX, 78701","202 W 17 ST, AUSTIN TX, 78701",15350.0,OFFICE MED 10-35,1998,100.129,,,...,"202 W 17 ST\nAUSTIN, TX 78701\n(30.279196, -97...",Charter Communications,"30.279196, -97.740475",30.279196,-97.740475,"Chinese Tallow, Escarpment Live Oak, Pecan, Va...",2,Pecan,18,31


Impute NA

In [23]:
# Impute missing values in the model's feature columns.
features = ['Total.SqFt', 'Category_encoded', 'Year.Built', 'kWh.sqft', 'lat', 'long', 'cluster']
n = data.shape[0]

# Total number of missing cells across the feature columns.
# Note: the ratio printed below divides that cell count by the number of rows.
na_count_before = data[features].isna().values.sum()
print(f'NA Value count before imputation: {na_count_before} | Proportion of total: {na_count_before / n:.2f}')

## Continuous features get filled with their median
for column in ('Total.SqFt', 'kWh.sqft'):
    data[column] = data[column].fillna(data[column].median())

# Features treated as categorical get filled with their most frequent value
# NOTE(review): Category_encoded is produced by a LabelEncoder, so it presumably
# has no NaN left to fill - confirm.
for column in ('Year.Built', 'Category_encoded'):
    data[column] = data[column].fillna(data[column].mode()[0])

na_count_after = data[features].isna().values.sum()
print(f'NA value count after imputation: {na_count_after}')

NA Value count before imputation: 897 | Proportion of total: 0.32
NA value count after imputation: 0


In [25]:
# Define features and target
X = data[['Total.SqFt', 'Category_encoded', 'Year.Built', 'kWh.sqft', 'lat', 'long', 'cluster']]
y = data['BestTreeSpecies_encoded']

# convert to numeric for modeling purposes (will have to change the year back later)
# errors='coerce' turns any value that cannot be parsed as a number into NaN.
X = X.apply(pd.to_numeric, errors='coerce').to_numpy()
y = y.to_numpy()

# Split data into train, val, test sets: 80% train, then the remaining 20% is
# halved into test and validation. The validation half is not used in this cell.
X_train, X_, y_train, y_ = train_test_split(X, y, test_size=0.2, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_, y_, test_size=0.5, random_state=42)

# A decision tree permutes the candidate features at every split, so without a
# fixed random_state two fits on the same data can produce different trees.
# Seed both classifiers so the reported accuracies are reproducible.

# Initialize and train the decision tree (gini)
clf_gini = DecisionTreeClassifier(criterion='gini', random_state=42)
clf_gini.fit(X_train, y_train)

# Initialize and train the decision tree (entropy)
clf_ent = DecisionTreeClassifier(criterion='entropy', random_state=42)
clf_ent.fit(X_train, y_train)

# Evaluate accuracy on the held-out test split
accuracy_gini = clf_gini.score(X_test, y_test)
print(f"Decision Tree (gini) Accuracy: {accuracy_gini * 100:.2f}%")

accuracy_ent = clf_ent.score(X_test, y_test)
print(f"Decision Tree (entropy) Accuracy: {accuracy_ent * 100:.2f}%")


Decision Tree (gini) Accuracy: 77.22%
Decision Tree (entropy) Accuracy: 80.43%


In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Assuming 'data' is already defined and cleaned

# Define features and target
# NOTE(review): 'cluster' is not part of this feature set, unlike the
# single-split cell earlier in the notebook - confirm this is intentional.
features = data[['Total.SqFt', 'Category_encoded', 'Year.Built', 'kWh.sqft', 'lat', 'long']]
target = data['BestTreeSpecies_encoded']

# Convert to numeric for modeling purposes (unparseable values become NaN)
features = features.apply(pd.to_numeric, errors='coerce')

# Initialize lists to store accuracy scores
gini_accuracies = []
entropy_accuracies = []
num_runs = 50  # Number of times to run the model

# Repeat the train/evaluate cycle on different random splits and average the
# scores, so the comparison does not hinge on one particular split.
for i in range(num_runs):
    # Split data into training and test sets (70/30); the run number seeds the split
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=i)

    # Seed the classifiers as well: a decision tree permutes the features at
    # each split, so an unseeded fit can differ between executions and the
    # averages below would not be reproducible.

    # Initialize and train the decision tree (Gini)
    clf_gini = DecisionTreeClassifier(criterion='gini', random_state=i)
    clf_gini.fit(X_train, y_train)

    # Evaluate accuracy for Gini
    accuracy_gini = clf_gini.score(X_test, y_test)
    gini_accuracies.append(accuracy_gini)

    # Initialize and train the decision tree (Entropy)
    clf_ent = DecisionTreeClassifier(criterion='entropy', random_state=i)
    clf_ent.fit(X_train, y_train)

    # Evaluate accuracy for Entropy
    accuracy_ent = clf_ent.score(X_test, y_test)
    entropy_accuracies.append(accuracy_ent)

# Calculate average accuracies
avg_gini_accuracy = np.mean(gini_accuracies)
avg_entropy_accuracy = np.mean(entropy_accuracies)

# Display results
print(f"Average Decision Tree (gini) Accuracy over {num_runs} runs: {avg_gini_accuracy * 100:.2f}%")
print(f"Average Decision Tree (entropy) Accuracy over {num_runs} runs: {avg_entropy_accuracy * 100:.2f}%")

Average Decision Tree (gini) Accuracy over 50 runs: 72.37%
Average Decision Tree (entropy) Accuracy over 50 runs: 72.08%
