In [2]:
# libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [10]:
# Load the two input tables from CSV: the tree inventory export and the
# business coordinates.
tree_df = pd.read_csv('Tree_Inventory_20240929.csv')
business_data = pd.read_csv('business_coordinates.csv')

Clean up the tree data for processing: parse the WKT `GEOMETRY` column into numeric `LATITUDE` and `LONGITUDE` columns, then drop the misspelled `LONGTITUDE` column and other unnecessary or redundant columns:

In [11]:
# Process tree data csv
from shapely.wkt import loads

# Work on a copy so the raw frame loaded from CSV is never modified; a plain
# `tree_data = tree_df` would only alias it, and the column assignments below
# would silently add columns to `tree_df` as well.
tree_data = tree_df.copy()

# Parse each WKT string into a shapely geometry once, then read the coordinates
# off it. Assumes the geometries are points stored as (longitude latitude), so
# x is the longitude and y is the latitude -- TODO confirm against the data.
points = tree_data['GEOMETRY'].apply(loads)
tree_data['LATITUDE'] = points.apply(lambda point: point.y)
tree_data['LONGITUDE'] = points.apply(lambda point: point.x)

# Drop unnecessary columns ('LONGTITUDE' is presumably the source file's
# misspelled longitude column, superseded by 'LONGITUDE' above).
tree_data = tree_data.drop(columns=['LONGTITUDE', 'New Georeferenced Column'])

# Species must be strings so they can be joined into one text field later.
tree_data['SPECIES'] = tree_data['SPECIES'].astype(str)

tree_data.head()

Get the species of every tree within a specified radius (in kilometers) of each business. This brute-force version computes a geodesic distance for every business–tree pair, so it is slow.

In [None]:
# Set radius for businesses to find local trees
from geopy.distance import geodesic
total_businesses = len(business_data)

radius = 1 # 1 km or other distance

# Pull the tree coordinates and species out of the DataFrame once. Iterating
# the tree rows with iterrows() again for every business repeats the same
# expensive row construction and dominates the run time.
tree_locations = list(zip(tree_data['LATITUDE'], tree_data['LONGITUDE']))
tree_species_names = tree_data['SPECIES'].tolist()

# One comma-separated species string per business, in row order
nearby_species_per_business = []

# for each business, use the coordinates to get nearby trees within the specified radius
business_positions = zip(business_data['lat'], business_data['long'])
for position, business_location in enumerate(business_positions, start=1):

    # keep the species of every tree whose geodesic distance is within the radius
    nearby_trees = [
        species
        for tree_location, species in zip(tree_locations, tree_species_names)
        if geodesic(business_location, tree_location).km <= radius
    ]
    nearby_species_per_business.append(', '.join(nearby_trees))

    # Print progress; counting by position keeps the message correct even when
    # the DataFrame index is not the default 0..n-1 range.
    print(f"Processed business {position} of {total_businesses}")

# Store species of nearby trees. Assigning the whole column at once is
# positional, so it is also safe when index labels are duplicated.
business_data['NearbyTreeSpecies'] = nearby_species_per_business


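Faster alternative: convert the coordinates to radians and use a `BallTree` with the haversine metric to find the trees within the radius of each business. Haversine distances assume a spherical Earth, so results can differ slightly from the geodesic version for trees very close to the radius boundary.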

In [23]:
from sklearn.neighbors import BallTree
import numpy as np

# The haversine metric measures distance as an angle on a unit sphere, so the
# search distance in kilometers is divided by Earth's radius to get radians.
EARTH_RADIUS_KM = 6371.0
SEARCH_RADIUS_KM = 1.0
radius = SEARCH_RADIUS_KM / EARTH_RADIUS_KM  # angular search radius in radians

# Prepare (latitude, longitude) pairs in radians for the haversine metric
business_coords = np.radians(business_data[['lat', 'long']].values)
tree_coords = np.radians(tree_data[['LATITUDE', 'LONGITUDE']].values)

# Create BallTree for the tree locations
tree_ball_tree = BallTree(tree_coords, metric='haversine')

# Function to find nearby tree species for each business
def find_nearby_trees_for_business(business_location, radius_rad=None):
    """Return the species of all trees within the search radius of one business.

    Parameters
    ----------
    business_location : sequence of two floats
        (latitude, longitude) of the business, in radians.
    radius_rad : float, optional
        Angular search radius in radians. When omitted, the module-level
        ``radius`` is used.

    Returns
    -------
    str
        The matching species joined with ', ' (an empty string when no tree
        lies within the radius).
    """
    search_radius = radius if radius_rad is None else radius_rad
    # query_radius takes a 2-D array of query points and returns one index
    # array per point; a single point is passed, so take the first result.
    indices = tree_ball_tree.query_radius([business_location], r=search_radius)[0]
    # The indices are row positions in tree_coords, hence positional .iloc.
    nearby_trees = tree_data['SPECIES'].iloc[indices].tolist()
    return ', '.join(nearby_trees)

# Apply the function to each business
business_data['NearbyTreeSpecies'] = [find_nearby_trees_for_business(loc) for loc in business_coords]

# Optional: Print progress
print("Completed processing for all businesses.")


Completed processing for all businesses.


Save the result as a CSV file if necessary.

In [25]:
# Optional: uncomment the line below to save the business table (with nearby tree species) as a CSV file
# business_data.to_csv('business_trees.csv', index=False)