In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json, ast
from google.colab import files

# Apply seaborn's default theme to every matplotlib figure drawn in this notebook.
sns.set()

#### Clean Tree Data

The dataset being cleaned is the "City of Pittsburgh Trees" dataset from the Western Pennsylvania Regional Data Center: https://data.wprdc.org/dataset/city-trees. It contains 45,709 entries and 58 features. The data include trees cared for and managed by the City of Pittsburgh Department of Public Works Forestry Division. In this data, the benefits of the trees are quantified as numerical values, calculated using the National Tree Benefit Calculator Web Service. All of the features recorded for each tree data point are listed below; the more interesting ones are annotated.

**_id	| id |type |comments**

1	id	 text

2	address_number	text

3	street	text

4	common_name	text

5	scientific_name	text

6	height	float

7	width	float

8	growth_space_length	float

9	growth_space_width	float

10	growth_space_type	text: *the type of environment the tree is planted in, e.g. well, pit, unrestricted, etc.*

11	stems	int

12	overhead_utilities	text: *whether there are overhead utilities and whether the tree conflicts with them.*

13	land_use	text: *residential, commercial, park, etc.*

14	condition	text: *condition of the tree. Good, fair, poor, etc.*

15	stormwater_benefits_dollar_value	float: *trees can control stormwater runoff by acting as mini-reservoirs. This value represents the annual benefit of stormwater runoff control in dollar terms.*

16	stormwater_benefits_runoff_elim	float: *number of gallons of stormwater the tree can intercept annually.*

17	property_value_benefits_dollarvalue	float

18	property_value_benefits_leaf_surface_area	float

19	energy_benefits_electricity_dollar_value	float

20	energy_benefits_gas_dollar_value	float

21	air_quality_benfits_o3dep_dollar_value	float: *dep means deposition, i.e. the tree absorbing or intercepting the pollutant. o3 is ozone.*

22	air_quality_benfits_o3dep_lbs	float

23	air_quality_benfits_vocavd_dollar_value	float: *voc means volatile organic compounds. avd means avoided. This is the tree lessening the need for creation of these pollutants in the first place by reducing energy production needs.*

24	air_quality_benfits_vocavd_lbs	float

25	air_quality_benfits_no2dep_dollar_value	float: *no2 is nitrogen dioxide.*

26	air_quality_benfits_no2dep_lbs	float

27	air_quality_benfits_no2avd_dollar_value	float

28	air_quality_benfits_no2avd_lbs	float

29	air_quality_benfits_so2dep_dollar_value	float: *so2 is sulfur dioxide.*

30	air_quality_benfits_so2dep_lbs	float

31	air_quality_benfits_so2avd_dollar_value	float

32	air_quality_benfits_so2avd_lbs	float

33	air_quality_benfits_pm10depdollar_value	float: *pm10 are inhalable particles with diameters that are generally 10 micrometers and smaller.*

34	air_quality_benfits_pm10dep_lbs	float

35	air_quality_benfits_pm10avd_dollar_value	float

36	air_quality_benfits_pm10avd_lbs	float

37	air_quality_benfits_total_dollar_value	float

38	air_quality_benfits_total_lbs	float

39	co2_benefits_dollar_value	float

40	co2_benefits_sequestered_lbs	float

41	co2_benefits_sequestered_value	float

42	co2_benefits_avoided_lbs	float

43	co2_benefits_avoided_value	float

44	co2_benefits_decomp_lbs	float: *CO2 released when tree decomposes. A negative number to indicate emission.*

45	co2_benefits_maint_lbs	float: *CO2 released for tree maintenance. A negative number to indicate emission.*

46	co2_benefits_totalco2_lbs	float: *net CO2 benefits*

47	overall_benefits_dollar_value	float

48	neighborhood	text

49	council_district	text

50	ward	text

51	tract	text

52	public_works_division	text

53	pli_division	text

54	police_zone	text

55	fire_zone	text

56	latitude	float

57	longitude	float

58	diameter_base_height	float

In [32]:
# Load the raw tree inventory. The file is decoded as ISO-8859-1 (Latin-1), and
# low_memory=False makes pandas read it in a single pass instead of in chunks.
df_trees = pd.read_csv(
    "tree.csv",
    encoding="ISO-8859-1",
    low_memory=False,
)

In [33]:
# Number of rows in the raw dataset.
df_trees.shape[0]

45709

In [34]:
# Preview the first three rows of the raw data.
df_trees.head(n=3)

Unnamed: 0,id,address_number,street,common_name,scientific_name,height,width,growth_space_length,growth_space_width,growth_space_type,...,neighborhood,council_district,ward,tract,public_works_division,pli_division,police_zone,fire_zone,latitude,longitude
0,754166088,7428,MONTICELLO ST,Stump,Stump,0.0,0.0,10.0,2.0,Well or Pit,...,Homewood North,9.0,13.0,42003130000.0,2.0,13.0,5.0,3-17,40.458169,-79.889724
1,1946899269,220,BALVER AVE,Linden: Littleleaf,Tilia cordata,0.0,0.0,99.0,99.0,Open or Unrestricted,...,Oakwood,2.0,28.0,42003560000.0,5.0,28.0,6.0,1-19,40.429269,-80.067868
2,1431517397,2822,SIDNEY ST,Maple: Red,Acer rubrum,22.0,6.0,6.0,3.0,Well or Pit,...,South Side Flats,3.0,16.0,42003160000.0,3.0,16.0,3.0,4-24,40.426797,-79.965035


In [35]:
# A row is only usable when all of the fields below are present:
#   * common_name / scientific_name: the basic identity of the tree. Only a
#     handful of rows lack these (13 according to the original analysis).
#   * latitude / longitude / neighborhood: location information (around 300
#     rows lack it). The analysis is done at neighborhood granularity.
required_columns = [
    "common_name",
    "scientific_name",
    "latitude",
    "longitude",
    "neighborhood",
]
df_trees = df_trees.dropna(subset=required_columns)

# Entries labelled "Non-sufficient space" carry no tree information; drop them too.
lacks_space = df_trees["common_name"] == "Non-sufficient space"
df_trees = df_trees[~lacks_space]
print(len(df_trees))

45345


In [36]:
# Every column that holds a numerical benefit value.
values = [
    'stormwater_benefits_dollar_value',
    'stormwater_benefits_runoff_elim',
    'property_value_benefits_dollarvalue',
    'property_value_benefits_leaf_surface_area',
    'energy_benefits_electricity_dollar_value',
    'energy_benefits_gas_dollar_value',
    'air_quality_benfits_o3dep_dollar_value',
    'air_quality_benfits_o3dep_lbs',
    'air_quality_benfits_vocavd_dollar_value',
    'air_quality_benfits_vocavd_lbs',
    'air_quality_benfits_no2dep_dollar_value',
    'air_quality_benfits_no2dep_lbs',
    'air_quality_benfits_no2avd_dollar_value',
    'air_quality_benfits_no2avd_lbs',
    'air_quality_benfits_so2dep_dollar_value',
    'air_quality_benfits_so2dep_lbs',
    'air_quality_benfits_so2avd_dollar_value',
    'air_quality_benfits_so2avd_lbs',
    'air_quality_benfits_pm10depdollar_value',
    'air_quality_benfits_pm10dep_lbs',
    'air_quality_benfits_pm10avd_dollar_value',
    'air_quality_benfits_pm10avd_lbs',
    'air_quality_benfits_total_dollar_value',
    'air_quality_benfits_total_lbs',
    'co2_benefits_dollar_value',
    'co2_benefits_sequestered_lbs',
    'co2_benefits_sequestered_value',
    'co2_benefits_avoided_lbs',
    'co2_benefits_avoided_value',
    'co2_benefits_decomp_lbs',
    'co2_benefits_maint_lbs',
    'co2_benefits_totalco2_lbs',
    'overall_benefits_dollar_value',
]

# Tree stumps are assumed to provide no benefit.
cond = (df_trees["common_name"] == "Stump") | (df_trees["scientific_name"] == "Stump")

# Vacant planting sites are assumed to provide no benefit either.
cond2 = df_trees["common_name"].isin([
    'Vacant Site Small',
    'Vacant Site Medium',
    'Vacant Site Not Suitable',
    'Vacant Site Large',
])

# For those rows, a missing benefit value really means "none": NaN -> 0.0.
# Values that are already present are left untouched.
no_benefit_rows = cond | cond2
for benefit_column in values:
    current = df_trees.loc[no_benefit_rows, benefit_column]
    df_trees.loc[no_benefit_rows, benefit_column] = current.fillna(0.0)

In [37]:
# Some trees lack a height, width or benefit value; "lacking" here means the
# value is either NaN or 0.0. Rather than dropping such rows, the value is
# replaced with the mean of that column over all trees of the same type.
#
# These columns are treated as independent of the neighborhood: the same type
# of tree is not expected to differ significantly in height, or in the air
# quality benefit it provides, from one neighborhood to the next.
#
# Note: the mean is taken over every row of that tree type, so existing 0.0
# entries count towards it (only NaN is skipped by `.mean()`).
independent_cols_to_replace = [
    'height',
    'width',
    'growth_space_length',
    'growth_space_width',
    'diameter_base_height',
    'stems',
    'air_quality_benfits_o3dep_dollar_value',
    'energy_benefits_electricity_dollar_value',
    'energy_benefits_gas_dollar_value',
    'air_quality_benfits_o3dep_lbs',
    'air_quality_benfits_vocavd_dollar_value',
    'air_quality_benfits_vocavd_lbs',
    'air_quality_benfits_no2dep_dollar_value',
    'air_quality_benfits_no2dep_lbs',
    'air_quality_benfits_no2avd_dollar_value',
    'air_quality_benfits_no2avd_lbs',
    'air_quality_benfits_so2dep_dollar_value',
    'air_quality_benfits_so2dep_lbs',
    'air_quality_benfits_so2avd_dollar_value',
    'air_quality_benfits_so2avd_lbs',
    'air_quality_benfits_pm10depdollar_value',
    'air_quality_benfits_pm10dep_lbs',
    'air_quality_benfits_pm10avd_dollar_value',
    'air_quality_benfits_pm10avd_lbs',
    'air_quality_benfits_total_dollar_value',
    'air_quality_benfits_total_lbs',
    'co2_benefits_dollar_value',
    'co2_benefits_sequestered_lbs',
    'co2_benefits_sequestered_value',
    'co2_benefits_avoided_lbs',
    'co2_benefits_avoided_value',
    'co2_benefits_decomp_lbs',
    'co2_benefits_maint_lbs',
    'co2_benefits_totalco2_lbs',
]

tree_names = df_trees["common_name"].unique()
for column in independent_cols_to_replace:
    for name in tree_names:
        same_type = df_trees["common_name"] == name
        type_mean = df_trees.loc[same_type, column].mean()
        # Rows of this type whose value is missing (NaN) or recorded as 0.0.
        is_missing = df_trees[column].isna() | df_trees[column].eq(0.0)
        df_trees.loc[same_type & is_missing, column] = type_mean


In [38]:
# Other attributes may well depend on the neighborhood. Property value benefits,
# for instance, should be driven by local property values, and stormwater
# benefits can vary with the sewage conditions of each neighborhood.
dependent_cols_to_replace = [
    'stormwater_benefits_dollar_value',
    'stormwater_benefits_runoff_elim',
    'property_value_benefits_dollarvalue',
    'property_value_benefits_leaf_surface_area',
    'overall_benefits_dollar_value',
]

# Nested lookup: neighborhood -> tree name -> column -> mean value.
# A (neighborhood, tree name) pair with no rows, or with only NaN values,
# maps to NaN.
neighborhoods = df_trees["neighborhood"].unique()
tree_name_neighbor_hood_value_average = {}

for hood in neighborhoods:
    trees_in_hood = df_trees[df_trees["neighborhood"] == hood]
    means_by_type = {}
    for name in tree_names:
        trees_of_type = trees_in_hood[trees_in_hood["common_name"] == name]
        means_by_type[name] = {
            column: trees_of_type[column].mean()
            for column in dependent_cols_to_replace
        }
    tree_name_neighbor_hood_value_average[hood] = means_by_type

In [None]:
# For every (neighborhood, tree type) group, replace missing values -- NaN or
# 0.0 -- in the neighborhood-dependent columns with the mean of that group.
#
# Two fixes compared with the previous version of this cell:
#   * The mask used to be written `df[val].isna() | df[val] == 0.0`. Because
#     `|` binds tighter than `==`, Python parsed that as
#     `(df[val].isna() | df[val]) == 0.0`, which is not the intended
#     "NaN or zero" test. The comparison is now parenthesised.
#   * The group means are obtained with groupby/transform, which returns one
#     mean per row aligned with df_trees, so no Python loop over every
#     neighborhood and tree name is needed.
for val in dependent_cols_to_replace:
    group_mean = df_trees.groupby(["neighborhood", "common_name"])[val].transform("mean")
    mask = df_trees[val].isna() | (df_trees[val] == 0.0)
    # Never overwrite a value with NaN: groups without any usable value are
    # left as they are and handled by the type-level fallback.
    mask &= group_mean.notna()
    df_trees.loc[mask, val] = group_mean[mask]

In [None]:
# List the columns that still contain at least one missing value.
has_missing = df_trees.isnull().any()
print(df_trees.columns[has_missing])

In [None]:
# Some rows can still contain NaN in the neighborhood-dependent columns because
# no tree of that type in that neighborhood has a usable value. For those rows,
# fall back to the mean over ALL trees of the same type. If no tree of that
# type has a value anywhere in the dataset, the NaN is kept.
#
# Fix: the previous loop called `df_trees[val].fillna(value=mean, inplace=True)`,
# which fills every NaN in the column -- whatever the tree type -- with the mean
# of a single tree type. The fill value is now computed per row from the row's
# own tree type, and the result is assigned back explicitly instead of relying
# on in-place mutation of a column selection.
for val in dependent_cols_to_replace:
    type_mean = df_trees.groupby("common_name")[val].transform("mean")
    df_trees[val] = df_trees[val].fillna(type_mean)

print(df_trees.columns[df_trees.isnull().any()])
# Some rows will still contain NaN in some columns at this point; those can be
# handled specifically when the columns in question are used.
# Rows such as "Vacant Site" are expected to contain NaN because there is no
# tree there yet.

# Around 5200 entries are not trees at all but a stump or a vacant spot of some
# size. These rows are kept because they are still interesting.
non_tree_names = [
    "Stump",
    'Vacant Site Small',
    'Vacant Site Medium',
    'Vacant Site Not Suitable',
    'Vacant Site Large',
]
temp = df_trees[df_trees["common_name"].isin(non_tree_names)]
print(len(temp))

In [None]:
# Report the final number of rows, then write the cleaned data to disk
# (without the DataFrame index).
print(df_trees.shape[0])
df_trees.to_csv(
    'cleaned_tree_data_5.csv',
    index=False,
)

#### Combine Neighborhood Features with Tree Information

In [None]:
# Aggregate tree information over neighborhood

# Load the cleaned tree data once (the file used to be read twice in a row).
tree_data = pd.read_csv("cleaned_tree_data_5.csv", low_memory=False)

# Stumps and vacant sites are not trees, so they are excluded from the
# per-neighborhood totals and counts.
non_tree_names = ['Stump', 'Vacant Site Large', 'Vacant Site Medium', 'Vacant Site Not Suitable', 'Vacant Site Small']
tree_data = tree_data[~tree_data['common_name'].isin(non_tree_names)]
tree_data = tree_data.drop(labels= ['address_number', 'street', 'common_name'], axis = 1)

# Total benefit (in dollars) per neighborhood; NaN values are skipped by sum().
tree_benefits_by_neighborhood = tree_data[['neighborhood', 'stormwater_benefits_dollar_value', 'air_quality_benfits_total_dollar_value', 'co2_benefits_dollar_value', 'overall_benefits_dollar_value']].groupby('neighborhood').sum()
# Number of trees per neighborhood (count of non-null ids).
tree_count_by_neighborhood = tree_data[['neighborhood', 'id']].groupby('neighborhood').count().rename(columns={"id": "tree_count"})

# Both frames are indexed by neighborhood, so the join lines them up row by row.
tree_info = tree_benefits_by_neighborhood.join(tree_count_by_neighborhood)

tree_info.head(3)

In [None]:
# 2015 education data
# LINK: https://data.wprdc.org/dataset/pittsburgh-american-community-survey-2015-miscellaneous-data/resource/12535b2e-6180-4cdf-b7d8-ec5294259e49

# load data
education_data = pd.read_csv("educational-attainment-for-the-population-25-years-and-over.csv")

# Keep the relevant columns. The .copy() makes the selection an independent
# frame, so the columns added below are not written to a slice of the raw data.
raw_columns = ['Estimate; Total:', 'Estimate; Total: - Regular high school diploma', 'Estimate; Total: - Bachelor\'s degree', 'Estimate; Total: - Master\'s degree']
education_data = education_data[['Neighborhood'] + raw_columns].copy()

# Cumulative shares of the population: someone with a master's degree also has a
# bachelor's degree and a high school diploma, and so on.
total = education_data['Estimate; Total:']
education_data["per_master"] = education_data['Estimate; Total: - Master\'s degree']/total
education_data["per_bachelor"] = education_data["per_master"] + education_data['Estimate; Total: - Bachelor\'s degree']/total
education_data["per_diploma"] = education_data["per_bachelor"] + education_data['Estimate; Total: - Regular high school diploma']/total

# Only the derived shares are needed from here on.
education_data = education_data.drop(labels = raw_columns, axis = 1)
# NOTE(review): sum() adds the shares of rows sharing a neighborhood name; this
# presumably assumes one row per neighborhood -- confirm against the source file.
education_data = education_data.rename(columns={"Neighborhood": "neighborhood"}).groupby('neighborhood').sum()

# Fix: this line previously referenced the undefined name `education_dat`,
# which raised a NameError.
education_data.head(3)

In [None]:
# 2010 area data
# LINK: https://data.wprdc.org/dataset/neighborhoods-with-snap-data/resource/bce22c26-9d3e-4e3f-8405-a35c4b7765b6

# Raw SNAP column name -> name used in this notebook. The order of the keys is
# the order in which the columns are kept.
snap_column_names = {
    'Neighborhood_2010_HOOD': "neighborhood",
    "Neighborhood_2010_AREA": "neighborhood_area",
    "Pop__2010": "population_2010",
    "Est__Pop__Under_Poverty__2010_": "population_under_poverty_2010",
    'SNAP_All_csv_Residential': "per_residential_area",
    'SNAP_All_csv_Mixed_Use___Indust': "per_industrial_area",
    'SNAP_All_csv_Mixed_Use___Commer': "per_commercial_area",
    'SNAP_All_csv_Median_Home__Value': "median_home_value",
    'SNAP_All_csv_Landslide_Prone___': "landslide_prone",
}
area_data = pd.read_csv("Neighborhoods_with_SNAP_Data.csv")
area_data = area_data[list(snap_column_names)].rename(columns=snap_column_names)

# These columns are read as strings. Strip every character that is neither a
# digit nor a dot, then convert what is left to float.
numeric_data = ['per_residential_area', 'per_commercial_area', 'per_industrial_area', 'median_home_value', 'landslide_prone']
for column in numeric_data:
    digits_only = area_data[column].str.replace(r'[^\d\.]+', '', regex = True)
    area_data[column] = digits_only.astype(float)

# Population-relative figures: share of people under the poverty line, and
# people per unit of neighborhood area.
area_data = area_data.assign(
    per_population_under_poverty_2010=area_data['population_under_poverty_2010'] / area_data['population_2010'],
    population_density=area_data['population_2010'] / area_data['neighborhood_area'],
)

# Index by neighborhood; rows sharing a neighborhood name are summed.
area_data = area_data.groupby('neighborhood').sum()

area_data.head(3)

In [None]:
# Join the three neighborhood-indexed tables: trees, then education, then area.
complete_data = tree_info.join(education_data)
complete_data = complete_data.join(area_data)

# Express the tree benefits and the tree count per 10000 units of neighborhood area.
columns_to_normalize = [
    'stormwater_benefits_dollar_value',
    'air_quality_benfits_total_dollar_value',
    'co2_benefits_dollar_value',
    'overall_benefits_dollar_value',
    'tree_count',
]
for column in columns_to_normalize:
    scaled = complete_data[column] * 10000
    complete_data['area_norm_' + column] = scaled / complete_data['neighborhood_area']

In [None]:
# The raw dollar totals are no longer needed once the area-normalised versions
# exist. tree_count is deliberately not in this list, so it is kept.
raw_benefit_columns = [
    'stormwater_benefits_dollar_value',
    'air_quality_benfits_total_dollar_value',
    'co2_benefits_dollar_value',
    'overall_benefits_dollar_value',
]
complete_data = complete_data.drop(columns=raw_benefit_columns)

#### Processed File Description

In [None]:
# Print every column name as a table row stub, one per line.
table_rows = ['| ' + column_name + ' | ' for column_name in complete_data.columns]
print("\n".join(table_rows))

In [None]:
# Write the processed table to CSV. The index (the neighborhood name) is kept
# as the first column.
complete_data.to_csv(
    "neighborhood_features_data.csv",
    index=True,
)